├── .gitignore ├── AUTHORS ├── README.org ├── aws_env.example ├── data └── .gitkeep ├── deploy.sh ├── docs ├── audition │ ├── eis │ │ ├── distance_from_best_precision@10_pct.png │ │ ├── metric_over_time_precision@10_pct.png │ │ ├── precision@10_pct_next_time.png │ │ ├── regret_distance_from_best_rules_precision@10_pct.png │ │ └── regret_over_time_precision@10_pct.png │ └── inspections │ │ ├── distance_from_best_precision@10_pct.png │ │ ├── metric_over_time_precision@10_pct.png │ │ ├── precision@10_pct_next_time.png │ │ ├── regret_distance_from_best_rules_precision@10_pct.png │ │ └── regret_over_time_precision@10_pct.png ├── css │ ├── htmlize.css │ ├── org-default.css │ ├── org.css │ ├── readtheorg.css │ └── rtd-full.css ├── eis_postmodeling_config.yaml ├── images │ ├── AWS_Batch_Architecture.png │ ├── AWS_Batch_Architecture.svg │ ├── data_road.png │ ├── eis.png │ ├── eis_jaccard_on_lists_over_time.png │ ├── eis_mg_prec_over_time.png │ ├── eis_mg_recall_over_time.png │ ├── eis_model_group_64_feature_group_importances.png │ ├── eis_model_group_64_feature_importances.png │ ├── eis_model_group_64_rayid_curve.png │ ├── facilities_inspected_over_time.png │ ├── facilities_with_failed_inspections_severe_violations_over_time.png │ ├── facilities_with_inspections_failed_over_time.png │ ├── failed_inspections_over_time.png │ ├── failed_inspections_severe_violations_over_time.png │ ├── inspection_jaccard_on_lists_over_time.png │ ├── inspection_mg_prec_over_time.png │ ├── inspection_mg_recall_over_time.png │ ├── inspection_model_group_11_feature_group_importances.png │ ├── inspection_model_group_11_feature_importances.png │ ├── inspection_model_group_11_rayid_curve.png │ ├── inspections.png │ ├── inspections_dt.png │ ├── inspections_over_time.png │ ├── model_7_tree_0.png │ ├── outcomes-eis.png │ ├── outcomes-inspections.png │ ├── rolling-origin.png │ ├── sanjose-2.png │ ├── simple_test_skeleton.png │ ├── timechop.png │ ├── timechop_1.png │ ├── timechop_1.svg │ ├── 
timechop_10.png │ ├── timechop_10.svg │ ├── timechop_2.png │ ├── timechop_2.svg │ ├── timechop_3.png │ ├── timechop_3.svg │ ├── timechop_4.png │ ├── timechop_4.svg │ ├── timechop_5.png │ ├── timechop_5.svg │ ├── timechop_6.png │ ├── timechop_6.svg │ ├── timechop_7.png │ ├── timechop_7.svg │ ├── timechop_8.png │ ├── timechop_8.svg │ ├── timechop_9.png │ ├── timechop_9.svg │ ├── timechop_example.png │ ├── timechop_inspections_test.png │ ├── timechop_withoutblocks.png │ └── timechop_withoutrows.png ├── index.html ├── index.md ├── js │ ├── jquery.stickytableheaders.min.js │ └── readtheorg.js ├── sql │ ├── create_cleaned_inspections_table.sql │ ├── create_semantic_tables.sql │ └── create_violations_table.sql └── triage │ ├── experiments │ ├── eis_01.yaml │ ├── inspections-training.yaml │ ├── inspections_baseline.yaml │ ├── inspections_dt.yaml │ ├── inspections_label_failed_01.yaml │ └── simple_test_skeleton.yaml │ └── images │ ├── distance_from_best_precision@10_pct.png │ ├── eis_01.png │ ├── inspections_baseline.png │ ├── inspections_dt.png │ ├── inspections_label_failed_01.png │ ├── metric_over_time_precision@10_pct.png │ ├── precision@10_pct_next_time.png │ ├── regret_distance_from_best_rules_precision@10_pct.png │ ├── regret_over_time_precision@10_pct.png │ └── simple_test_skeleton.png ├── infrastructure ├── aws_batch │ ├── credentials.filter.example │ ├── triage-job-definition.json.example │ └── triage-overrides.json.example ├── bastion │ ├── Dockerfile │ └── requirements.txt ├── docker-compose.yml ├── env_example ├── food_db │ ├── Dockerfile │ ├── activate_postgis.sql │ ├── create_extensions.sql │ ├── create_inspections_table.sql │ └── nuke_triage.sql ├── triage │ ├── Dockerfile │ ├── __init__.py │ ├── requirements.txt │ ├── setup.py │ ├── triage_experiment.py │ └── utils.py └── web │ ├── Dockerfile │ └── default.conf ├── org ├── 00_instructions.org ├── 01_intro.org ├── 02_infrastructure.org ├── 03_data_preparation.org ├── 04_triage_intro.org ├── 
05_inspections.org ├── 06_eis.org ├── 07_quick_setup.org ├── 08_postmodeling.org ├── 09_aws_batch.org ├── 100_whats_next.org ├── audition ├── css │ ├── htmlize.css │ ├── org-default.css │ ├── org.css │ ├── readtheorg.css │ └── rtd-full.css ├── docker-kernel-connection.json ├── images │ ├── AWS_Batch_Architecture.png │ ├── AWS_Batch_Architecture.svg │ ├── data_road.png │ ├── eis.png │ ├── eis_jaccard_on_lists_over_time.png │ ├── eis_mg_prec_over_time.png │ ├── eis_mg_recall_over_time.png │ ├── eis_model_group_64_feature_group_importances.png │ ├── eis_model_group_64_feature_importances.png │ ├── eis_model_group_64_rayid_curve.png │ ├── facilities_inspected_over_time.png │ ├── facilities_with_failed_inspections_severe_violations_over_time.png │ ├── facilities_with_inspections_failed_over_time.png │ ├── failed_inspections_over_time.png │ ├── failed_inspections_severe_violations_over_time.png │ ├── inspection_jaccard_on_lists_over_time.png │ ├── inspection_mg_prec_over_time.png │ ├── inspection_mg_recall_over_time.png │ ├── inspection_model_group_11_feature_group_importances.png │ ├── inspection_model_group_11_feature_importances.png │ ├── inspection_model_group_11_rayid_curve.png │ ├── inspections.png │ ├── inspections_dt.png │ ├── inspections_over_time.png │ ├── model_7_tree_0.png │ ├── outcomes-eis.png │ ├── outcomes-inspections.png │ ├── rolling-origin.png │ ├── sanjose-2.png │ ├── simple_test_skeleton.png │ ├── timechop.png │ ├── timechop_1.png │ ├── timechop_1.svg │ ├── timechop_10.png │ ├── timechop_10.svg │ ├── timechop_2.png │ ├── timechop_2.svg │ ├── timechop_3.png │ ├── timechop_3.svg │ ├── timechop_4.png │ ├── timechop_4.svg │ ├── timechop_5.png │ ├── timechop_5.svg │ ├── timechop_6.png │ ├── timechop_6.svg │ ├── timechop_7.png │ ├── timechop_7.svg │ ├── timechop_8.png │ ├── timechop_8.svg │ ├── timechop_9.png │ ├── timechop_9.svg │ ├── timechop_example.png │ ├── timechop_inspections_test.png │ ├── timechop_withoutblocks.png │ └── timechop_withoutrows.png 
├── index.org ├── js │ ├── jquery.stickytableheaders.min.js │ ├── readtheorg.js │ └── stickytableheaders-license.txt ├── publish.el ├── ref.bib ├── sql │ ├── create_cleaned_inspections_table.sql │ ├── create_semantic_tables.sql │ └── create_violations_table.sql ├── triage │ ├── experiments │ └── images └── tutorial.setup ├── requirements-dev.txt ├── requirements.txt ├── scratch.org ├── triage ├── .gitkeep ├── audition │ ├── eis │ │ ├── distance_from_best_precision@10_pct.png │ │ ├── metric_over_time_precision@10_pct.png │ │ ├── precision@10_pct_next_time.png │ │ ├── regret_distance_from_best_rules_precision@10_pct.png │ │ ├── regret_over_time_precision@10_pct.png │ │ └── results_model_group_ids.json │ └── inspections │ │ ├── distance_from_best_precision@10_pct.png │ │ ├── metric_over_time_precision@10_pct.png │ │ ├── precision@10_pct_next_time.png │ │ ├── regret_distance_from_best_rules_precision@10_pct.png │ │ ├── regret_over_time_precision@10_pct.png │ │ └── results_model_group_ids.json ├── eis_audition_config.yaml ├── eis_crosstabs_config.yaml ├── eis_postmodeling_config.yaml ├── experiments │ ├── eis_01.yaml │ ├── inspections-training.yaml │ ├── inspections_baseline.yaml │ ├── inspections_dt.yaml │ ├── inspections_label_failed_01.yaml │ └── simple_test_skeleton.yaml ├── inspection_audition_config.yaml ├── inspection_postmodeling_config.yaml ├── output │ ├── .gitkeep │ └── images │ │ ├── .gitkeep │ │ ├── eis.svg │ │ ├── inspections.svg │ │ ├── inspections_dt.svg │ │ ├── inspections_test.svg │ │ ├── model_7_tree_0.svg │ │ └── simple_test_skeleton.svg ├── selection_rules │ ├── .gitkeep │ └── rules.yaml └── session.key └── tutorial.sh /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | 
eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env* 80 | 81 | # triage database config 82 | triage/database.yaml 83 | 84 | # virtualenv 85 | venv/ 86 | ENV/ 87 | 88 | # Spyder project settings 89 | .spyderproject 90 | 91 | # Rope project settings 92 | .ropeproject 93 | 94 | 95 | # Porquerías de MacOS 96 | .DS_Store 97 | 98 | # Documentación de Sphinx 99 | docs/_build 100 | 101 | # Los datos no se suben a git 102 | **/data/ 103 | 104 | # Bases de datos 105 | *.db 106 | 107 | # Logs de instalación 108 | pip-log.txt 109 | pip-delete-this-directory.txt 110 | 111 | # Pruebas unitarias / Coverage 112 | htmlcov/ 113 | .tox/ 114 | .coverage 115 | .coverage.* 116 | .cache 117 | nosetests.xml 118 | coverage.xml 119 | *,cover 120 | 121 | # Documentación de Mkdocs 122 | site/ 123 | 124 | # Archivos de datos 125 | *.xlsx 126 | *.dat 127 | *.csv 128 | *.tsv 129 | *.psv 130 | *.sqlite 131 | *.doc 132 | *.docx 133 | *.odt 134 | *.ods 135 | *.xls* 136 | *.pdf 137 | *.ppt* 138 | *.sqlite 
139 | *.pkl 140 | 141 | # De la construcción de imágenes y contenedores 142 | **/.built* 143 | **/*_built* 144 | **/.data_built* 145 | **/.infrastructure_built* 146 | **/.network_built* 147 | **/.running* 148 | 149 | # Basura de Emacs 150 | **/.#* 151 | 152 | # VIM 153 | *.swp 154 | 155 | # From the makefiles 156 | **/*.built 157 | **/*.pushed 158 | 159 | **/*_SUCCESS 160 | 161 | **/*development* 162 | **/*staging* 163 | 164 | **/triage-generated/* 165 | **/matrices/* 166 | **/trained_models/* 167 | triage/images/* 168 | 169 | **/profiling_stats/* 170 | 171 | **/ltximg/* 172 | .aws_env 173 | infrastructure/**/*.json 174 | infrastructure/**/*.filter 175 | -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | Adolfo De Unanue 2 | Joseph Walsh 3 | Hans Koening 4 | Arthi Ramachandran 5 | Iván Higuera 6 | Kit Rodolfa 7 | -------------------------------------------------------------------------------- /README.org: -------------------------------------------------------------------------------- 1 | #+TITLE: Dirty Duck: A Guided Tour of Triage 2 | #+AUTHOR: Center for Data Science and Public Policy 3 | #+EMAIL: adolfo@uchicago.edu 4 | #+STARTUP: showeverything 5 | #+STARTUP: nohideblocks 6 | #+STARTUP: Indent 7 | 8 | 9 | 10 | * THIS REPOSITORY HAS BEEN ARCHIVED. DIRTYDUCK IS NOW PART OF TRIAGE. PLEASE GO [[https://github.com/dssg/triage][HERE]] TO CONTINUE WITH THE SOURCE CODE OR [[https://dssg.github.io/triage/dirtyduck/docs/][HERE]] TO CONTINUE WITH THE TUTORIAL 11 | 12 | 13 | 14 | * Welcome! 15 | 16 | This tutorial will show you how to use =triage=, a data science 17 | modeling tool developed at the [[http://dsapp.uchicago.edu][Center for Data Science and Public 18 | Policy]] (DSaPP) at the University of Chicago. 
19 | 20 | =triage= helps build models for three [[https://dssg.uchicago.edu/data-science-for-social-good-conference-2017/training-workshop-data-science-for-social-good-problem-templates/][common applied problems]]: (a) Early 21 | warning systems (*EWS* or *EIS*), (b) /resource prioritization/ (a.k.a. "an 22 | inspections problem") and (c) interaction level predictions (a.k.a. 23 | "appointment level"). These problems 24 | are difficult to model because their conceptualization 25 | and implementation are prone to error, thanks to their multi-dimensional, 26 | multi-entity, time-series structure. 27 | 28 | The latest version of this tutorial is published at [[https://dssg.github.io/dirtyduck/]] 29 | 30 | *NOTE* This tutorial is in sync with the latest version of =triage=. At 31 | this moment that is [[https://github.com/dssg/triage/releases/tag/v3.3.0][v3.3.0 (Arepa)]]. 32 | 33 | * What you need for this tutorial 34 | 35 | Install [[http://www.docker.com][Docker CE]] and [[https://docs.docker.com/compose/][Docker Compose]]. That's it. 36 | Follow the links for installation instructions. 37 | 38 | Note that if you are using =GNU/Linux= you should add your user to the 39 | =docker= group following the instructions at this [[https://docs.docker.com/install/linux/linux-postinstall/][link]]. 40 | 41 | At the moment only operating systems with *nix-type command lines are 42 | supported, such as =GNU/Linux= and =MacOS=. Recent versions of 43 | =Windows= may also work. 44 | 45 | * How to use this tutorial 46 | 47 | First, clone this repository on your laptop 48 | 49 | #+BEGIN_EXAMPLE 50 | git clone https://github.com/dssg/dirtyduck.git 51 | #+END_EXAMPLE 52 | 53 | Second, run 54 | 55 | #+BEGIN_EXAMPLE 56 | ./tutorial.sh start 57 | #+END_EXAMPLE 58 | 59 | This will take several minutes the first time you do it. 60 | 61 | * How you can help 62 | 63 | Help is always welcome! You can report errors, improve 64 | the tutorial, or propose improvements to 65 | =triage=. 
These three cases are discussed below. 66 | 67 | ** How to report errors 68 | 69 | There are almost surely errors. Please open an [[https://github.com/dssg/dirtyduck/issues][issue]] and 70 | we will try to issue a fix as soon as possible. 71 | 72 | ** How to improve the tutorial 73 | 74 | This tutorial was created following the practices of [[https://www-cs-faculty.stanford.edu/~knuth/lp.html][Literate 75 | Programming]] using [[https://orgmode.org/][org-mode]][fn:1] in [[https://www.gnu.org/software/emacs/][GNU Emacs]][fn:2]. That means the tutorial is a /live/ document that mixes code and text. 76 | 77 | The steps to help are: 78 | 79 | - clone the repository 80 | - edit the source =org= files in the =org= folder 81 | - From your terminal run 82 | 83 | #+BEGIN_SRC shell 84 | emacs --batch -l org/publish.el org/index.org --eval '(org-publish "dirtyduck" t)' 85 | #+END_SRC 86 | 87 | #+RESULTS: 88 | 89 | - create a *pull request*. 90 | 91 | 92 | ** How to help develop =triage= 93 | 94 | Go to the [[https://github.com/dssg/triage][triage]] repository and follow the instructions there. 95 | 96 | * Footnotes 97 | 98 | [fn:2] But it is supported in =vim= if you install a plugin. 99 | 100 | [fn:1] It's similar to =markdown= so you won't have any problem. 
#!/usr/bin/env bash
#
# deploy.sh -- build/push the triage Docker images, register the AWS Batch
# job definition, sync experiment configs with S3 and submit triage
# experiments to AWS Batch.
#
# Configuration is read from ./.aws_env (see aws_env.example), which must
# define: PROJECT_NAME, TRIAGE_VERSION, ENV, AWS_REGISTRY, AWS_JOB_QUEUE,
# POSTGRES_DB and S3_BUCKET. AWS_REGION is optional (defaults to us-west-2).
# All paths are relative, so run the script from the repository root.

# Fail early with a clear message: `source` of a missing file would otherwise
# only print a warning (set -e is not active yet) and the script would die
# later with an "unbound variable" error.
if [[ ! -r .aws_env ]]; then
    echo "ERROR: .aws_env not found in $(pwd); copy aws_env.example to .aws_env and fill in your values" >&2
    exit 1
fi

source .aws_env

# Exit the script as soon as something fails (-e) or if a variable is not defined (-u)
set -e -u

# Print the configuration loaded from .aws_env and the versions of the local
# Python tooling.
function info () {
    echo "##############################################"
    echo "# #"
    echo "# Project: ${PROJECT_NAME} #"
    echo "# Triage ver. ${TRIAGE_VERSION} #"
    echo "# #"
    echo "##############################################"
    echo "Environment: ${ENV}"
    echo "ECR Registry: ${AWS_REGISTRY}"
    echo "BATCH JOB QUEUE: ${AWS_JOB_QUEUE}"
    echo "DB: ${POSTGRES_DB}"
    echo "S3 Bucket: ${S3_BUCKET}"

    # Versions are informational only: report a missing tool instead of
    # letting `set -e` abort the whole script (e.g. when pyenv is not installed).
    local tool
    for tool in python pyenv pip
    do
        if command -v "${tool}" > /dev/null 2>&1
        then
            "${tool}" --version
        else
            echo "${tool}: not found"
        fi
    done
}

# Upload the local experiment configs (triage/experiments/) to the bucket.
function sync_to_s3 () {

    echo "##############################################"
    echo "# #"
    echo "# Uploading changes to s3://${S3_BUCKET} "
    echo "# #"
    echo "##############################################"

    aws s3 sync triage/experiments/ "s3://${S3_BUCKET}/experiments"
}

# Download the experiment configs from the bucket into triage/experiments/.
function sync_from_s3 () {

    echo "##############################################"
    echo "# #"
    echo "# Getting changes from s3://${S3_BUCKET} "
    echo "# #"
    echo "##############################################"

    aws s3 sync "s3://${S3_BUCKET}/experiments/" triage/experiments/
}

# Register (or re-register) the triage job definition in AWS Batch from
# infrastructure/aws_batch/triage-job-definition.json.
function update_jobs () {
    echo "Updating the job definition of the following tasks: ${PROJECT_NAME}"

    echo "+----------------------------------------+"
    echo "| |"
    echo "| Updating ${PROJECT_NAME} job definition"
    echo "| |"
    echo "+----------------------------------------+"

    aws batch register-job-definition --cli-input-json file://infrastructure/aws_batch/triage-job-definition.json
}

# Authenticate the local docker client against the ECR registry.
# `aws ecr get-login` does not exist in AWS CLI v2, so prefer
# `get-login-password` and fall back to the legacy command only when the
# installed CLI does not provide it.
function _ecr_login () {
    local region="${AWS_REGION:-us-west-2}"
    local password

    if password=$(aws ecr get-login-password --region "${region}" 2> /dev/null)
    then
        printf '%s' "${password}" | docker login --username AWS --password-stdin "${AWS_REGISTRY}"
    else
        eval "$(aws ecr get-login --no-include-email --region "${region}")"
    fi
}

# Build the image from a build context, tag it with TRIAGE_VERSION and
# `latest` under AWS_REGISTRY, and push both tags.
#   $1  local image name (e.g. dsapp/triage-cli)
#   $2  docker build context directory
function _build_tag_push () {
    local image=$1
    local context=$2
    local remote="${AWS_REGISTRY}/${image}"

    docker build --no-cache --tag "${image}" "${context}"
    docker tag "${image}" "${remote}:${TRIAGE_VERSION}"
    docker tag "${image}" "${remote}:latest"

    _ecr_login

    docker push "${remote}:${TRIAGE_VERSION}"
    docker push "${remote}:latest"
}

# Build and push the project images: one image per task, built from
# infrastructure/<task> and named dsapp/<PROJECT_NAME>/<task>.
function update_images () {
    echo "Updating images related to this project"

    local tasks=triage
    local task

    echo "Updating the image of the following tasks: ${tasks}"

    # ${tasks} is intentionally unquoted: it is a space-separated list.
    for task in ${tasks}
    do
        echo "+----------------------------------------+"
        echo "| |"
        echo "| Updating ${task} image"
        echo "| |"
        echo "+----------------------------------------+"
        _build_tag_push "dsapp/${PROJECT_NAME}/${task}" "infrastructure/${task}"
    done

}

# Build and push the project-independent dsapp/triage-cli image (built from
# infrastructure/triage).
function update_triage_cli_image () {
    local tasks=triage-cli
    local task

    echo "Updating the image of the following tasks: ${tasks}"

    for task in ${tasks}
    do
        echo "+----------------------------------------+"
        echo "| |"
        echo "| Updating ${task} image"
        echo "| |"
        echo "+----------------------------------------+"
        _build_tag_push "dsapp/${task}" infrastructure/triage
    done

}

# Submit a triage experiment to AWS Batch (job definition
# triage-cli-experiment).
#   $1     job name
#   $2     path to a JSON file with the container overrides
#   $3     job parameters, passed verbatim to `aws batch submit-job --parameters`
#   $4...  optional extra arguments appended to the container command
function run_experiment () {
    if [[ $# -lt 3 ]]
    then
        echo "run_experiment: expected <job_name> <overrides_file> <parameters> [command arguments...]" >&2
        return 1
    fi

    local job_name=$1
    echo "Running job ${job_name}"

    local environment_overrides=$2
    echo "Using environment_overrides: ${environment_overrides}"

    local parameters=$3
    echo "Using parameters: ${parameters}"

    # Keep the extra arguments as an array so each one reaches the container
    # command as a single element.
    local -a command_overrides=("${@:4}")

    # Declared separately from the assignments below: `local x=$(cmd)` would
    # hide a failing command from `set -e`.
    local session creds overrides cmd

    # Retrieve temporary session credentials for current user
    session=$(aws sts get-session-token --duration-seconds 129600) # 36 h

    # Restructure these to mirror pipeline overrides
    creds=$(jq -f infrastructure/aws_batch/credentials.filter <<< "${session}")

    # Merge these AWS session credentials into *all* pipeline overrides
    overrides=$(
        jq --arg creds "${creds}" \
           '.environment += ($creds|fromjson|.environment)' \
           < "${environment_overrides}"
    )

    if [[ ${#command_overrides[@]} -gt 0 ]]
    then

        echo "Adding ${command_overrides[*]} to the command"

        for cmd in "${command_overrides[@]}"
        do
            overrides=$(jq --arg cmds "${cmd}" '.command |= .+ [$cmds]' <<< "${overrides}")
        done

    fi

    aws batch submit-job --job-queue "${AWS_JOB_QUEUE}" \
        --job-name "${job_name}" \
        --job-definition triage-cli-experiment \
        --container-overrides "${overrides}" \
        --parameters "${parameters}"
}

# Entry point for -r: run an experiment using the default overrides file.
#   $1     job name
#   $2     job parameters
#   $3...  optional extra command arguments
function run() {
    if [[ $# -lt 2 ]]
    then
        echo "Usage: ${0} -r <job-name> <parameters> [extra command arguments...]" >&2
        return 1
    fi

    run_experiment "$1" infrastructure/aws_batch/triage-overrides.json "${@:2}"
}


# Print the usage message.
function help_menu () {
cat << EOF
Usage: ${0} (-h | -i | -b | -t | -u | -r | --sync-{to,from}-s3 )
OPTIONS:
   -h|--help                   Show this message
   -i|--info                   Show information about the environment
   -b|--update-images          Build the ${PROJECT_NAME}'s triage image and push it to the AWS ECR
   -t|--update-triage-image    Build the triage-cli image and push it to the AWS ECR
   -u|--update-jobs            Update the ${PROJECT_NAME}'s triage job definition in AWS Batch
   -r|--run-experiment         Run experiments on ${PROJECT_NAME} data
   --sync-to-s3                Uploads the experiment files to ${S3_BUCKET}
   --sync-from-s3              Gets the experiment files from ${S3_BUCKET}
EXAMPLES:
   Build and push the images to your AWS ECR:
        $ ./deploy.sh -b
   Update the job's definitions:
        $ ./deploy.sh -u
   Sync your experiment config files:
        $ ./deploy.sh --sync-to-s3
   Run triage experiments (job name first, then the job parameters):
        $ ./deploy.sh -r my-experiment experiment_file=s3://${S3_BUCKET}/experiments/test.yaml,output_path=s3://${S3_BUCKET}/triage,replace=--replace

EOF
}

if [[ $# -eq 0 ]] ; then
    help_menu
    exit 0
fi

# Deal with command line flags.
# Only the first argument selects the action; any remaining arguments are
# consumed by -r. No `shift` is needed because nothing reads the positional
# parameters afterwards -- the previous per-branch `shift` followed by a
# second `shift` after `esac` made every single-flag invocation exit with
# status 1 under `set -e`.
case "${1}" in
    -b|--update-images)
        update_images
        ;;
    -t|--update-triage-image)
        update_triage_cli_image
        ;;
    -u|--update-jobs)
        update_jobs
        ;;
    -r|--run-experiment)
        # Quoted so that arguments containing spaces are forwarded intact.
        run "${@:2}"
        ;;
    -a|--all)
        # The script defines no `all` function, so this flag used to die
        # with "all: command not found"; fail with an explicit message.
        echo "${1} is not implemented; run -b, -t and -u individually" >&2
        exit 1
        ;;
    -i|--info)
        info
        ;;
    --sync-from-s3)
        sync_from_s3
        ;;
    --sync-to-s3)
        sync_to_s3
        ;;
    -h|--help)
        help_menu
        ;;
    *)
        echo "${1} is not a valid flag, try running: ${0} --help" >&2
        exit 1
        ;;
esac
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/audition/eis/regret_distance_from_best_rules_precision@10_pct.png -------------------------------------------------------------------------------- /docs/audition/eis/regret_over_time_precision@10_pct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/audition/eis/regret_over_time_precision@10_pct.png -------------------------------------------------------------------------------- /docs/audition/inspections/distance_from_best_precision@10_pct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/audition/inspections/distance_from_best_precision@10_pct.png -------------------------------------------------------------------------------- /docs/audition/inspections/metric_over_time_precision@10_pct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/audition/inspections/metric_over_time_precision@10_pct.png -------------------------------------------------------------------------------- /docs/audition/inspections/precision@10_pct_next_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/audition/inspections/precision@10_pct_next_time.png -------------------------------------------------------------------------------- /docs/audition/inspections/regret_distance_from_best_rules_precision@10_pct.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/audition/inspections/regret_distance_from_best_rules_precision@10_pct.png -------------------------------------------------------------------------------- /docs/audition/inspections/regret_over_time_precision@10_pct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/audition/inspections/regret_over_time_precision@10_pct.png -------------------------------------------------------------------------------- /docs/css/org-default.css: -------------------------------------------------------------------------------- 1 | .org-bold{font-weight:700}.org-bold-italic{font-weight:700;font-style:italic}.org-buffer-menu-buffer{font-weight:700}.org-builtin{color:#483d8b}.org-button{color:#3a5fcd;text-decoration:underline}.org-calendar-month-header{color:#00f}.org-calendar-today{text-decoration:underline}.org-calendar-weekday-header{color:#008b8b}.org-calendar-weekend-header{color:#b22222}.org-comint-highlight-input{font-weight:700}.org-comint-highlight-prompt{color:#0000cd}.org-comment,.org-comment-delimiter{color:#b22222}.org-constant{color:#008b8b}.org-diary{color:red}.org-doc{color:#8b2252}.org-error{color:red;font-weight:700}.org-escape-glyph{color:brown}.org-file-name-shadow{color:#7f7f7f}.org-fringe{background-color:#f2f2f2}.org-function-name{color:#00f}.org-glyphless-char{font-size:60%}.org-header-line{color:#333;background-color:#e5e5e5}.org-help-argument-name{font-style:italic}.org-highlight{background-color:#b4eeb4}.org-holiday{background-color:pink}.org-info-header-node{color:brown;font-weight:700;font-style:italic}.org-info-header-xref{color:#3a5fcd;text-decoration:underline}.org-info-index-match{background-color:#ff0}.org-info-menu-header{font-weight:700}.org-info-menu-star{color:red}.org-info-node{color:brown;font-weight:700;font-style:italic}.org
-info-title-1{font-size:172%;font-weight:700}.org-info-title-2{font-size:144%;font-weight:700}.org-info-title-3{font-size:120%;font-weight:700}.org-info-title-4{font-weight:700}.org-info-xref{color:#3a5fcd;text-decoration:underline}.org-italic{font-style:italic}.org-keyword{color:#a020f0}.org-lazy-highlight{background-color:#afeeee}.org-link{color:#3a5fcd;text-decoration:underline}.org-link-visited{color:#8b008b;text-decoration:underline}.org-makefile-makepp-perl{background-color:#bfefff}.org-makefile-space{background-color:#ff69b4}.org-makefile-targets{color:#00f}.org-match{background-color:#ff0}.org-next-error{background-color:gtk_selection_bg_color}.org-nobreak-space{color:brown;text-decoration:underline}.org-org-agenda-calendar-event,.org-org-agenda-calendar-sexp{color:#000;background-color:#fff}.org-org-agenda-clocking{background-color:#ff0}.org-org-agenda-column-dateline{background-color:#e5e5e5}.org-org-agenda-current-time{color:#b8860b}.org-org-agenda-date{color:#00f}.org-org-agenda-date-today{color:#00f;font-weight:700;font-style:italic}.org-org-agenda-date-weekend{color:#00f;font-weight:700}.org-org-agenda-diary{color:#000;background-color:#fff}.org-org-agenda-dimmed-todo{color:#7f7f7f}.org-org-agenda-done{color:#228b22}.org-org-agenda-filter-category,.org-org-agenda-filter-effort,.org-org-agenda-filter-regexp,.org-org-agenda-filter-tags{color:#000;background-color:#bfbfbf}.org-org-agenda-restriction-lock{background-color:#eee}.org-org-agenda-structure{color:#00f}.org-org-archived,.org-org-block{color:#7f7f7f}.org-org-block-begin-line,.org-org-block-end-line{color:#b22222}.org-org-checkbox{font-weight:700}.org-org-checkbox-statistics-done{color:#228b22;font-weight:700}.org-org-checkbox-statistics-todo{color:red;font-weight:700}.org-org-clock-overlay{color:#000;background-color:#d3d3d3}.org-org-code{color:#7f7f7f}.org-org-column,.org-org-column-title{background-color:#e5e5e5}.org-org-column-title{font-weight:700;text-decoration:underline}.org-org-date{color
:#a020f0;text-decoration:underline}.org-org-date-selected{color:red}.org-org-default{color:#000;background-color:#fff}.org-org-document-info{color:#191970}.org-org-document-info-keyword{color:#7f7f7f}.org-org-document-title{color:#191970;font-weight:700}.org-org-done{color:#228b22;font-weight:700}.org-org-drawer{color:#00f}.org-org-ellipsis{color:#b8860b;text-decoration:underline}.org-org-footnote{color:#a020f0;text-decoration:underline}.org-org-formula{color:#b22222}.org-org-headline-done{color:#bc8f8f}.org-org-hide{color:#fff}.org-org-latex-and-related{color:#8b4513}.org-org-level-1{color:#00f}.org-org-level-2{color:sienna}.org-org-level-3{color:#a020f0}.org-org-level-4{color:#b22222}.org-org-level-5{color:#228b22}.org-org-level-6{color:#008b8b}.org-org-level-7{color:#483d8b}.org-org-level-8{color:#8b2252}.org-org-link{color:#3a5fcd;text-decoration:underline}.org-org-list-dt{font-weight:700}.org-org-macro{color:#8b4513}.org-org-meta-line{color:#b22222}.org-org-mode-line-clock{color:#000;background-color:#bfbfbf}.org-org-mode-line-clock-overrun{color:#000;background-color:red}.org-org-priority{color:#a020f0}.org-org-quote{color:#7f7f7f}.org-org-scheduled{color:#006400}.org-org-scheduled-previously{color:#b22222}.org-org-scheduled-today{color:#006400}.org-org-sexp-date,.org-org-special-keyword{color:#a020f0}.org-org-table{color:#00f}.org-org-tag,.org-org-tag-group{font-weight:700}.org-org-target{text-decoration:underline}.org-org-time-grid{color:#b8860b}.org-org-todo{color:red;font-weight:700}.org-org-upcoming-deadline{color:#b22222}.org-org-verbatim,.org-org-verse{color:#7f7f7f}.org-org-warning{color:red;font-weight:700}.org-outline-1{color:#00f}.org-outline-2{color:sienna}.org-outline-3{color:#a020f0}.org-outline-4{color:#b22222}.org-outline-5{color:#228b22}.org-outline-6{color:#008b8b}.org-outline-7{color:#483d8b}.org-outline-8{color:#8b2252}.org-preprocessor{color:#483d8b}.org-regexp-grouping-backslash,.org-regexp-grouping-construct{font-weight:700}.org-region{b
ackground-color:gtk_selection_bg_color}.org-secondary-selection{background-color:#ff0}.org-shadow{color:#7f7f7f}.org-show-paren-match{background-color:#40e0d0}.org-show-paren-mismatch{color:#fff;background-color:#a020f0}.org-string{color:#8b2252}.org-success{color:#228b22;font-weight:700}.org-table-cell{color:#e5e5e5;background-color:#00f}.org-tooltip{color:#000;background-color:#ffffe0}.org-trailing-whitespace{background-color:red}.org-type{color:#228b22}.org-underline{text-decoration:underline}.org-variable-name{color:sienna}.org-warning{color:#ff8c00;font-weight:700}.org-warning-1{color:red;font-weight:700}.title{margin-bottom:.2em}.subtitle,.title{text-align:center}.subtitle{font-size:medium;font-weight:700;margin-top:0}.todo{color:red}.done,.todo{font-family:monospace}.done{color:green}.priority{color:orange}.priority,.tag{font-family:monospace}.tag{background-color:#eee;font-size:80%;font-weight:400;padding:2px}.timestamp{color:#bebebe}.timestamp-kwd{color:#5f9ea0}.org-right{margin-left:auto;margin-right:0;text-align:right}.org-left{margin-left:0;margin-right:auto;text-align:left}.org-center{margin-left:auto;margin-right:auto;text-align:center}.underline{text-decoration:underline}#postamble p,#preamble p{font-size:90%;margin:.2em}p.verse{margin-left:3%}pre{border:1px solid #ccc;box-shadow:3px 3px 3px #eee;font-family:monospace;margin:1.2em;overflow:auto;padding:8pt}pre.src{overflow:visible;padding-top:1.2em;position:relative}pre.src:before{background-color:#fff;border:1px solid #000;display:none;padding:3px;position:absolute;right:10px;top:-10px}pre.src:hover:before{display:inline}pre.src-bash:before,pre.src-sh:before{content:"sh"}pre.src-emacs-lisp:before{content:"Emacs 
Lisp"}pre.src-R:before{content:"R"}pre.src-perl:before{content:"Perl"}pre.src-java:before{content:"Java"}pre.src-sql:before{content:"SQL"}table{border-collapse:collapse}caption.t-above{caption-side:top}caption.t-bottom{caption-side:bottom}td,th{vertical-align:top}th.org-center,th.org-left,th.org-right{text-align:center}td.org-right{text-align:right}td.org-left{text-align:left}td.org-center{text-align:center}dt{font-weight:700}.footpara{display:inline}.footdef{margin-bottom:1em}.figure{padding:1em}.figure p{text-align:center}.inlinetask{background:#ffc;border:2px solid gray;margin:10px;padding:10px}#org-div-home-and-up{font-size:70%;text-align:right;white-space:nowrap}textarea{overflow-x:auto}.linenr{font-size:smaller}.code-highlighted{background-color:#ff0}.org-info-js_info-navigation{border-style:none}#org-info-js_console-label{font-size:10px;font-weight:700;white-space:nowrap}.org-info-js_search-highlight{background-color:#ff0;color:#000;font-weight:700} 2 | -------------------------------------------------------------------------------- /docs/eis_postmodeling_config.yaml: -------------------------------------------------------------------------------- 1 | # Postmodeling Configuration File 2 | 3 | project_path: '/triage' # Project path defined in triage with matrices and models 4 | model_group_id: # List of model_group_id's [optional if an audition_output_path is given] 5 | - 40 6 | - 7 7 | - 156 8 | 9 | thresholds: # Thresholds for defining positive predictions 10 | rank_abs: [50, 100, 250] 11 | rank_pct: [5, 10, 25] 12 | 13 | baseline_query: | # SQL query for defining a baseline for comparison in plots. 
It needs a metric and parameter 14 | SELECT g.model_group_id, 15 | m.model_id, 16 | EXTRACT('YEAR' FROM m.evaluation_end_time) AS as_of_date_year, 17 | m.metric, 18 | m.parameter, 19 | m.value, 20 | m.num_labeled_examples, 21 | m.num_labeled_above_threshold, 22 | m.num_positive_labels 23 | FROM test_results.evaluations m 24 | LEFT JOIN model_metadata.models g 25 | USING(model_id) 26 | WHERE g.model_group_id = 1 27 | AND metric = 'precision@' 28 | AND parameter = '10_pct' 29 | 30 | max_depth_error_tree: 5 # For error trees, how deep should the decision trees go? 31 | n_features_plots: 10 # Number of features for importances 32 | figsize: [12, 12] # Default size for plots 33 | fontsize: 20 # Default fontsize for plots 34 | -------------------------------------------------------------------------------- /docs/images/AWS_Batch_Architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/AWS_Batch_Architecture.png -------------------------------------------------------------------------------- /docs/images/data_road.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/data_road.png -------------------------------------------------------------------------------- /docs/images/eis.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/eis.png -------------------------------------------------------------------------------- /docs/images/eis_jaccard_on_lists_over_time.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/eis_jaccard_on_lists_over_time.png -------------------------------------------------------------------------------- /docs/images/eis_mg_prec_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/eis_mg_prec_over_time.png -------------------------------------------------------------------------------- /docs/images/eis_mg_recall_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/eis_mg_recall_over_time.png -------------------------------------------------------------------------------- /docs/images/eis_model_group_64_feature_group_importances.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/eis_model_group_64_feature_group_importances.png -------------------------------------------------------------------------------- /docs/images/eis_model_group_64_feature_importances.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/eis_model_group_64_feature_importances.png -------------------------------------------------------------------------------- /docs/images/eis_model_group_64_rayid_curve.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/eis_model_group_64_rayid_curve.png -------------------------------------------------------------------------------- 
/docs/images/facilities_inspected_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/facilities_inspected_over_time.png -------------------------------------------------------------------------------- /docs/images/facilities_with_failed_inspections_severe_violations_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/facilities_with_failed_inspections_severe_violations_over_time.png -------------------------------------------------------------------------------- /docs/images/facilities_with_inspections_failed_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/facilities_with_inspections_failed_over_time.png -------------------------------------------------------------------------------- /docs/images/failed_inspections_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/failed_inspections_over_time.png -------------------------------------------------------------------------------- /docs/images/failed_inspections_severe_violations_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/failed_inspections_severe_violations_over_time.png -------------------------------------------------------------------------------- /docs/images/inspection_jaccard_on_lists_over_time.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/inspection_jaccard_on_lists_over_time.png -------------------------------------------------------------------------------- /docs/images/inspection_mg_prec_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/inspection_mg_prec_over_time.png -------------------------------------------------------------------------------- /docs/images/inspection_mg_recall_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/inspection_mg_recall_over_time.png -------------------------------------------------------------------------------- /docs/images/inspection_model_group_11_feature_group_importances.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/inspection_model_group_11_feature_group_importances.png -------------------------------------------------------------------------------- /docs/images/inspection_model_group_11_feature_importances.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/inspection_model_group_11_feature_importances.png -------------------------------------------------------------------------------- /docs/images/inspection_model_group_11_rayid_curve.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/inspection_model_group_11_rayid_curve.png 
-------------------------------------------------------------------------------- /docs/images/inspections.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/inspections.png -------------------------------------------------------------------------------- /docs/images/inspections_dt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/inspections_dt.png -------------------------------------------------------------------------------- /docs/images/inspections_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/inspections_over_time.png -------------------------------------------------------------------------------- /docs/images/model_7_tree_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/model_7_tree_0.png -------------------------------------------------------------------------------- /docs/images/outcomes-eis.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/outcomes-eis.png -------------------------------------------------------------------------------- /docs/images/outcomes-inspections.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/outcomes-inspections.png -------------------------------------------------------------------------------- 
/docs/images/rolling-origin.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/rolling-origin.png -------------------------------------------------------------------------------- /docs/images/sanjose-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/sanjose-2.png -------------------------------------------------------------------------------- /docs/images/simple_test_skeleton.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/simple_test_skeleton.png -------------------------------------------------------------------------------- /docs/images/timechop.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/timechop.png -------------------------------------------------------------------------------- /docs/images/timechop_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/timechop_1.png -------------------------------------------------------------------------------- /docs/images/timechop_10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/timechop_10.png -------------------------------------------------------------------------------- /docs/images/timechop_2.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/timechop_2.png -------------------------------------------------------------------------------- /docs/images/timechop_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/timechop_3.png -------------------------------------------------------------------------------- /docs/images/timechop_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/timechop_4.png -------------------------------------------------------------------------------- /docs/images/timechop_5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/timechop_5.png -------------------------------------------------------------------------------- /docs/images/timechop_6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/timechop_6.png -------------------------------------------------------------------------------- /docs/images/timechop_7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/timechop_7.png -------------------------------------------------------------------------------- /docs/images/timechop_8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/timechop_8.png 
-------------------------------------------------------------------------------- /docs/images/timechop_9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/timechop_9.png -------------------------------------------------------------------------------- /docs/images/timechop_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/timechop_example.png -------------------------------------------------------------------------------- /docs/images/timechop_inspections_test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/timechop_inspections_test.png -------------------------------------------------------------------------------- /docs/images/timechop_withoutblocks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/timechop_withoutblocks.png -------------------------------------------------------------------------------- /docs/images/timechop_withoutrows.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/timechop_withoutrows.png -------------------------------------------------------------------------------- /docs/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Page Moved 6 | 7 | 8 | This page has moved. Click here to go to the new page. 
9 | 10 | 11 | -------------------------------------------------------------------------------- /docs/js/jquery.stickytableheaders.min.js: -------------------------------------------------------------------------------- 1 | !function(a,b){"use strict";function c(c,g){var h=this;h.$el=a(c),h.el=c,h.id=e++,h.$window=a(b),h.$document=a(document),h.$el.bind("destroyed",a.proxy(h.teardown,h)),h.$clonedHeader=null,h.$originalHeader=null,h.isSticky=!1,h.hasBeenSticky=!1,h.leftOffset=null,h.topOffset=null,h.init=function(){h.$el.each(function(){var b=a(this);b.css("padding",0),h.$originalHeader=a("thead:first",this),h.$clonedHeader=h.$originalHeader.clone(),b.trigger("clonedHeader."+d,[h.$clonedHeader]),h.$clonedHeader.addClass("tableFloatingHeader"),h.$clonedHeader.css("display","none"),h.$originalHeader.addClass("tableFloatingHeaderOriginal"),h.$originalHeader.after(h.$clonedHeader),h.$printStyle=a(''),a("head").append(h.$printStyle)}),h.setOptions(g),h.updateWidth(),h.toggleHeaders(),h.bind()},h.destroy=function(){h.$el.unbind("destroyed",h.teardown),h.teardown()},h.teardown=function(){h.isSticky&&h.$originalHeader.css("position","static"),a.removeData(h.el,"plugin_"+d),h.unbind(),h.$clonedHeader.remove(),h.$originalHeader.removeClass("tableFloatingHeaderOriginal"),h.$originalHeader.css("visibility","visible"),h.$printStyle.remove(),h.el=null,h.$el=null},h.bind=function(){h.$scrollableArea.on("scroll."+d,h.toggleHeaders),h.isWindowScrolling||(h.$window.on("scroll."+d+h.id,h.setPositionValues),h.$window.on("resize."+d+h.id,h.toggleHeaders)),h.$scrollableArea.on("resize."+d,h.toggleHeaders),h.$scrollableArea.on("resize."+d,h.updateWidth)},h.unbind=function(){h.$scrollableArea.off("."+d,h.toggleHeaders),h.isWindowScrolling||(h.$window.off("."+d+h.id,h.setPositionValues),h.$window.off("."+d+h.id,h.toggleHeaders)),h.$scrollableArea.off("."+d,h.updateWidth)},h.toggleHeaders=function(){h.$el&&h.$el.each(function(){var 
b,c=a(this),d=h.isWindowScrolling?isNaN(h.options.fixedOffset)?h.options.fixedOffset.outerHeight():h.options.fixedOffset:h.$scrollableArea.offset().top+(isNaN(h.options.fixedOffset)?0:h.options.fixedOffset),e=c.offset(),f=h.$scrollableArea.scrollTop()+d,g=h.$scrollableArea.scrollLeft(),i=h.isWindowScrolling?f>e.top:d>e.top,j=(h.isWindowScrolling?f:0)a||a+h.$window.height()>h.$document.height()||0>b||b+h.$window.width()>h.$document.width()||h.$originalHeader.css({top:h.topOffset-(h.isWindowScrolling?0:a),left:h.leftOffset-(h.isWindowScrolling?0:b)})},h.updateWidth=function(){if(h.isSticky){h.$originalHeaderCells||(h.$originalHeaderCells=a("th,td",h.$originalHeader)),h.$clonedHeaderCells||(h.$clonedHeaderCells=a("th,td",h.$clonedHeader));var b=h.getWidth(h.$clonedHeaderCells);h.setWidth(b,h.$clonedHeaderCells,h.$originalHeaderCells),h.$originalHeader.css("width",h.$clonedHeader.width())}},h.getWidth=function(c){var d=[];return c.each(function(c){var e,f=a(this);if("border-box"===f.css("box-sizing"))e=f[0].getBoundingClientRect().width;else{var g=a("th",h.$originalHeader);if("collapse"===g.css("border-collapse"))if(b.getComputedStyle)e=parseFloat(b.getComputedStyle(this,null).width);else{var i=parseFloat(f.css("padding-left")),j=parseFloat(f.css("padding-right")),k=parseFloat(f.css("border-width"));e=f.outerWidth()-i-j-k}else e=f.width()}d[c]=e}),d},h.setWidth=function(a,b,c){b.each(function(b){var d=a[b];c.eq(b).css({"min-width":d,"max-width":d})})},h.resetWidth=function(b,c){b.each(function(b){var d=a(this);c.eq(b).css({"min-width":d.css("min-width"),"max-width":d.css("max-width")})})},h.setOptions=function(c){h.options=a.extend({},f,c),h.$scrollableArea=a(h.options.scrollableArea),h.isWindowScrolling=h.$scrollableArea[0]===b},h.updateOptions=function(a){h.setOptions(a),h.unbind(),h.bind(),h.updateWidth(),h.toggleHeaders()},h.init()}var d="stickyTableHeaders",e=0,f={fixedOffset:0,leftOffset:0,marginTop:0,scrollableArea:b};a.fn[d]=function(b){return 
this.each(function(){var e=a.data(this,"plugin_"+d);e?"string"==typeof b?e[b].apply(e):e.updateOptions(b):"destroy"!==b&&a.data(this,"plugin_"+d,new c(this,b))})}}(jQuery,window); -------------------------------------------------------------------------------- /docs/js/readtheorg.js: -------------------------------------------------------------------------------- 1 | 2 | $(function() { 3 | $('.note').before("

Note

"); 4 | $('.seealso').before("

See also

"); 5 | $('.warning').before("

Warning

"); 6 | $('.caution').before("

Caution

"); 7 | $('.attention').before("

Attention

"); 8 | $('.tip').before("

Tip

"); 9 | $('.important').before("

Important

"); 10 | $('.hint').before("

Hint

"); 11 | $('.error').before("

Error

"); 12 | $('.danger').before("

Danger

"); 13 | }); 14 | 15 | $( document ).ready(function() { 16 | 17 | // Shift nav in mobile when clicking the menu. 18 | $(document).on('click', "[data-toggle='wy-nav-top']", function() { 19 | $("[data-toggle='wy-nav-shift']").toggleClass("shift"); 20 | $("[data-toggle='rst-versions']").toggleClass("shift"); 21 | }); 22 | // Close menu when you click a link. 23 | $(document).on('click', ".wy-menu-vertical .current ul li a", function() { 24 | $("[data-toggle='wy-nav-shift']").removeClass("shift"); 25 | $("[data-toggle='rst-versions']").toggleClass("shift"); 26 | }); 27 | $(document).on('click', "[data-toggle='rst-current-version']", function() { 28 | $("[data-toggle='rst-versions']").toggleClass("shift-up"); 29 | }); 30 | // Make tables responsive 31 | $("table.docutils:not(.field-list)").wrap("
"); 32 | }); 33 | 34 | $( document ).ready(function() { 35 | $('#text-table-of-contents ul').first().addClass('nav'); 36 | // ScrollSpy also requires that we use 37 | // a Bootstrap nav component. 38 | $('body').scrollspy({target: '#text-table-of-contents'}); 39 | 40 | // add sticky table headers 41 | $('table').stickyTableHeaders(); 42 | 43 | // set the height of tableOfContents 44 | var $postamble = $('#postamble'); 45 | var $tableOfContents = $('#table-of-contents'); 46 | $tableOfContents.css({paddingBottom: $postamble.outerHeight()}); 47 | 48 | // add TOC button 49 | var toggleSidebar = $(''); 50 | $('#content').prepend(toggleSidebar); 51 | 52 | // add close button when sidebar showed in mobile screen 53 | var closeBtn = $('Close'); 54 | var tocTitle = $('#table-of-contents').find('h2'); 55 | tocTitle.append(closeBtn); 56 | }); 57 | 58 | window.SphinxRtdTheme = (function (jquery) { 59 | var stickyNav = (function () { 60 | var navBar, 61 | win, 62 | stickyNavCssClass = 'stickynav', 63 | applyStickNav = function () { 64 | if (navBar.height() <= win.height()) { 65 | navBar.addClass(stickyNavCssClass); 66 | } else { 67 | navBar.removeClass(stickyNavCssClass); 68 | } 69 | }, 70 | enable = function () { 71 | applyStickNav(); 72 | win.on('resize', applyStickNav); 73 | }, 74 | init = function () { 75 | navBar = jquery('nav.wy-nav-side:first'); 76 | win = jquery(window); 77 | }; 78 | jquery(init); 79 | return { 80 | enable : enable 81 | }; 82 | }()); 83 | return { 84 | StickyNav : stickyNav 85 | }; 86 | }($)); 87 | -------------------------------------------------------------------------------- /docs/sql/create_cleaned_inspections_table.sql: -------------------------------------------------------------------------------- 1 | create schema if not exists cleaned; 2 | 3 | drop table if exists cleaned.inspections cascade; 4 | 5 | create table cleaned.inspections as ( 6 | with cleaned as ( 7 | select 8 | inspection::integer, 9 | btrim(lower(results)) as result, 10 | 
license_num::integer, 11 | btrim(lower(dba_name)) as facility, 12 | btrim(lower(aka_name)) as facility_aka, 13 | case when 14 | facility_type is null then 'unknown' 15 | else btrim(lower(facility_type)) 16 | end as facility_type, 17 | lower(substring(risk from '\((.+)\)')) as risk, 18 | btrim(lower(address)) as address, 19 | zip as zip_code, 20 | substring( 21 | btrim(lower(regexp_replace(type, 'liquor', 'task force', 'gi'))) 22 | from 'canvass|task force|complaint|food poisoning|consultation|license|tag removal') as type, 23 | date, 24 | -- point(longitude, latitude) as location 25 | ST_SetSRID(ST_MakePoint(longitude, latitude), 4326)::geography as location -- We use geography so the measurements are in meters 26 | from raw.inspections 27 | where zip is not null -- removing NULL zip codes 28 | ) 29 | 30 | select * from cleaned where type is not null 31 | ); 32 | -------------------------------------------------------------------------------- /docs/sql/create_semantic_tables.sql: -------------------------------------------------------------------------------- 1 | create schema if not exists semantic; 2 | 3 | drop table if exists semantic.entities cascade; 4 | 5 | create table semantic.entities as ( 6 | with entities as ( 7 | select 8 | distinct on ( 9 | license_num, 10 | facility, 11 | facility_aka, 12 | facility_type, 13 | address 14 | ) 15 | license_num, 16 | facility, 17 | facility_aka, 18 | facility_type, 19 | address, 20 | zip_code, 21 | location, 22 | min(date) over (partition by license_num, facility, facility_aka, facility_type, address) as start_time, 23 | max(case when result in ('out of business', 'business not located') 24 | then date 25 | else NULL 26 | end) 27 | over (partition by license_num, facility, facility_aka, address) as end_time 28 | from cleaned.inspections 29 | order by 30 | license_num, facility, facility_aka, facility_type, address, 31 | date asc -- IMPORTANT!! 
32 | ) 33 | 34 | select 35 | row_number() over (order by start_time asc ) as entity_id, 36 | license_num, 37 | facility, 38 | facility_aka, 39 | facility_type, 40 | address, 41 | zip_code, 42 | location, 43 | start_time, 44 | end_time, 45 | daterange(start_time, end_time) as activity_period 46 | from entities 47 | ); 48 | 49 | create index entities_ix on semantic.entities (entity_id); 50 | create index entities_license_num_ix on semantic.entities (license_num); 51 | create index entities_facility_ix on semantic.entities (facility); 52 | create index entities_facility_type_ix on semantic.entities (facility_type); 53 | create index entities_zip_code_ix on semantic.entities (zip_code); 54 | 55 | -- Spatial index 56 | create index entities_location_gix on semantic.entities using gist (location); 57 | 58 | create index entities_full_key_ix on semantic.entities (license_num, facility, facility_aka, facility_type, address); 59 | 60 | drop table if exists semantic.events cascade; 61 | 62 | create table semantic.events as ( 63 | 64 | with entities as ( 65 | select * from semantic.entities 66 | ), 67 | 68 | inspections as ( 69 | select 70 | i.inspection, i.type, i.date, i.risk, i.result, 71 | i.license_num, i.facility, i.facility_aka, 72 | i.facility_type, i.address, i.zip_code, i.location, 73 | jsonb_agg( 74 | jsonb_build_object( 75 | 'code', v.code, 76 | 'severity', v.severity, 77 | 'description', v.description, 78 | 'comment', v.comment 79 | ) 80 | order by code 81 | ) as violations 82 | from 83 | cleaned.inspections as i 84 | inner join 85 | cleaned.violations as v 86 | on i.inspection = v.inspection 87 | group by 88 | i.inspection, i.type, i.license_num, i.facility, 89 | i.facility_aka, i.facility_type, i.address, i.zip_code, i.location, 90 | i.date, i.risk, i.result 91 | ) 92 | 93 | select 94 | i.inspection as event_id, 95 | e.entity_id, i.type, i.date, i.risk, i.result, 96 | e.facility_type, e.zip_code, e.location, 97 | i.violations 98 | from 99 | entities as e 100 | 
inner join 101 | inspections as i 102 | using (license_num, facility, facility_aka, facility_type, address, zip_code) 103 | ); 104 | 105 | -- Add some indices 106 | create index events_entity_ix on semantic.events (entity_id asc nulls last); 107 | create index events_event_ix on semantic.events (event_id asc nulls last); 108 | create index events_type_ix on semantic.events (type); 109 | create index events_date_ix on semantic.events(date asc nulls last); 110 | create index events_facility_type_ix on semantic.events (facility_type); 111 | create index events_zip_code_ix on semantic.events (zip_code); 112 | 113 | -- Spatial index 114 | create index events_location_gix on semantic.events using gist (location); 115 | 116 | -- JSONB indices 117 | create index events_violations on semantic.events using gin(violations); 118 | create index events_violations_json_path on semantic.events using gin(violations jsonb_path_ops); 119 | 120 | create index events_event_entity_zip_code_date on semantic.events (event_id asc nulls last, entity_id asc nulls last, zip_code, date desc nulls last); 121 | -------------------------------------------------------------------------------- /docs/sql/create_violations_table.sql: -------------------------------------------------------------------------------- 1 | drop table if exists cleaned.violations cascade; 2 | 3 | create table cleaned.violations as ( 4 | select 5 | inspection::integer, 6 | license_num::integer, 7 | date::date, 8 | btrim(tuple[1]) as code, 9 | btrim(tuple[2]) as description, 10 | btrim(tuple[3]) as comment, 11 | (case 12 | when btrim(tuple[1]) = '' then NULL 13 | when btrim(tuple[1])::int between 1 and 14 then 'critical' -- From the documentation 14 | when btrim(tuple[1])::int between 15 and 29 then 'serious' 15 | else 'minor' 16 | end 17 | ) as severity from 18 | ( 19 | select 20 | inspection, 21 | license_num, 22 | date, 23 | regexp_split_to_array( -- Create an array we will split the code, description, comment 24 | 
regexp_split_to_table( -- Create a row per each comment we split by | 25 | coalesce( -- If there isn't a violation add '- Comments:' 26 | regexp_replace(violations, '[\n\r]+', '', 'g' ) -- Remove line breaks 27 | , '- Comments:') 28 | , '\|') -- Split the violations 29 | , '(?<=\d+)\.\s*|\s*-\s*Comments:') -- Split each violation in three 30 | as tuple 31 | from raw.inspections 32 | where results in ('Fail', 'Pass', 'Pass w/ Conditions') and license_num is not null 33 | ) as t 34 | ); 35 | -------------------------------------------------------------------------------- /docs/triage/experiments/eis_01.yaml: -------------------------------------------------------------------------------- 1 | config_version: 'v6' 2 | 3 | model_comment: 'eis: 01' 4 | 5 | user_metadata: 6 | label_definition: 'inspected' 7 | experiment_type: 'eis' 8 | description: | 9 | EIS 01 10 | purpose: 'model creation' 11 | org: 'DSaPP' 12 | team: 'Tutorial' 13 | author: 'Your name here' 14 | etl_date: '2019-02-21' 15 | 16 | model_group_keys: 17 | - 'class_path' 18 | - 'parameters' 19 | - 'feature_names' 20 | - 'feature_groups' 21 | - 'cohort_name' 22 | - 'state' 23 | - 'label_name' 24 | - 'label_timespan' 25 | - 'training_as_of_date_frequency' 26 | - 'max_training_history' 27 | - 'label_definition' 28 | - 'experiment_type' 29 | - 'org' 30 | - 'team' 31 | - 'author' 32 | - 'purpose' 33 | - 'etl_date' 34 | 35 | label_config: 36 | query: | 37 | select 38 | entity_id, 39 | True::integer as outcome 40 | from semantic.events 41 | where '{as_of_date}'::timestamp <= date 42 | and date < '{as_of_date}'::timestamp + interval '{label_timespan}' 43 | group by entity_id 44 | include_missing_labels_in_train_as: False 45 | name: 'inspected' 46 | 47 | cohort_config: 48 | query: | 49 | with buckets as ( 50 | select *, ntile(5) over (order by number_of_inspections asc) as bucket 51 | from ( 52 | select entity_id, count(*) as number_of_inspections 53 | from semantic.events 54 | group by entity_id 55 | ) as t 56 | ) 
57 | select e.entity_id 58 | from semantic.entities as e 59 | inner join 60 | buckets as b 61 | using (entity_id) 62 | where 63 | daterange(start_time, end_time, '[]') @> '{as_of_date}'::date 64 | and bucket in (5) 65 | name: 'active_facilities' 66 | 67 | temporal_config: 68 | feature_start_time: '2010-01-04' 69 | feature_end_time: '2019-01-01' 70 | label_start_time: '2015-02-01' 71 | label_end_time: '2019-01-01' 72 | 73 | model_update_frequency: '1y' 74 | training_label_timespans: ['1month'] 75 | training_as_of_date_frequencies: '1month' 76 | 77 | test_durations: '1y' 78 | test_label_timespans: ['1month'] 79 | test_as_of_date_frequencies: '1month' 80 | 81 | max_training_histories: '5y' 82 | 83 | feature_aggregations: 84 | - 85 | prefix: 'inspections' 86 | from_obj: 'semantic.events' 87 | knowledge_date_column: 'date' 88 | 89 | aggregates_imputation: 90 | count: 91 | type: 'zero_noflag' 92 | 93 | aggregates: 94 | - 95 | quantity: 96 | total: "*" 97 | metrics: 98 | - 'count' 99 | 100 | intervals: ['1month', '3month', '6month', '1y', 'all'] 101 | 102 | groups: 103 | - 'entity_id' 104 | 105 | - 106 | prefix: 'risks' 107 | from_obj: 'semantic.events' 108 | knowledge_date_column: 'date' 109 | 110 | categoricals_imputation: 111 | sum: 112 | type: 'zero' 113 | avg: 114 | type: 'zero' 115 | 116 | categoricals: 117 | - 118 | column: 'risk' 119 | choices: ['low', 'medium', 'high'] 120 | metrics: 121 | - 'sum' 122 | - 'avg' 123 | 124 | intervals: ['1month', '3month', '6month', '1y', 'all'] 125 | 126 | groups: 127 | - 'entity_id' 128 | - 'zip_code' 129 | 130 | - 131 | prefix: 'results' 132 | from_obj: 'semantic.events' 133 | knowledge_date_column: 'date' 134 | 135 | categoricals_imputation: 136 | all: 137 | type: 'zero' 138 | 139 | categoricals: 140 | - 141 | column: 'result' 142 | choice_query: 'select distinct result from semantic.events' 143 | metrics: 144 | - 'sum' 145 | - 'avg' 146 | 147 | intervals: ['1month', '3month', '6month', '1y', 'all'] 148 | 149 | groups: 150 | - 
'entity_id' 151 | 152 | - 153 | prefix: 'inspection_types' 154 | from_obj: 'semantic.events' 155 | knowledge_date_column: 'date' 156 | 157 | categoricals_imputation: 158 | sum: 159 | type: 'zero_noflag' 160 | 161 | categoricals: 162 | - 163 | column: 'type' 164 | choice_query: 'select distinct type from semantic.events where type is not null' 165 | metrics: 166 | - 'sum' 167 | 168 | intervals: ['1month', '3month', '6month', '1y', 'all'] 169 | 170 | groups: 171 | - 'entity_id' 172 | - 'zip_code' 173 | 174 | feature_group_definition: 175 | prefix: 176 | - 'inspections' 177 | - 'results' 178 | - 'risks' 179 | - 'inspection_types' 180 | 181 | feature_group_strategies: ['all', 'leave-one-out', 'leave-one-in'] 182 | 183 | grid_config: 184 | 'sklearn.tree.DecisionTreeClassifier': 185 | max_depth: [2,null] 186 | 'sklearn.ensemble.RandomForestClassifier': 187 | max_features: ['sqrt'] 188 | criterion: ['gini'] 189 | n_estimators: [500] 190 | min_samples_leaf: [1] 191 | min_samples_split: [50] 192 | 'sklearn.dummy.DummyClassifier': 193 | strategy: [most_frequent] 194 | 195 | scoring: 196 | testing_metric_groups: 197 | - 198 | metrics: [precision@, recall@] 199 | thresholds: 200 | percentiles: [1.0, 2.0, 3.0, 4.0, 5.0, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100] 201 | top_n: [1, 5, 10, 25, 50, 100, 250, 500, 1000] 202 | 203 | 204 | training_metric_groups: 205 | - 206 | metrics: [accuracy] 207 | - 208 | metrics: [precision@, recall@] 209 | thresholds: 210 | percentiles: [1.0, 2.0, 3.0, 4.0, 5.0, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100] 211 | top_n: [1, 5, 10, 25, 50, 100, 250, 500, 1000] 212 | -------------------------------------------------------------------------------- /docs/triage/experiments/inspections-training.yaml: -------------------------------------------------------------------------------- 1 | config_version: 'v3' 2 | 3 | model_comment: 'test_triage_inspections' 4 | 5 | temporal_config: 6 | 
feature_start_time=np.min(df.date) 7 | feature_end_time=np.max(df.date) 8 | label_start_time=np.min(df.date) 9 | label_end_time=np.max(df.date) 10 | 11 | model_update_frequency='3months' 12 | training_label_timespans='1day' 13 | training_as_of_date_frequencies='1day' 14 | max_training_histories='1year' 15 | 16 | test_durations='1day' 17 | test_label_timespans='3month' 18 | test_as_of_date_frequencies='1day' 19 | 20 | events_table: 'inspections.events' 21 | 22 | feature_aggregations: 23 | - 24 | # Number of violations of a specific code and proportion, grouped by entity 25 | prefix: 'violations' 26 | from_obj: 'cleaned.violations' 27 | knowledge_date_column: 'knowledge_date' 28 | 29 | categoricals: 30 | - 31 | column: 'violation_code' 32 | choice_query: 'select distinct violation_code from cleaned.violations' 33 | metrics: 34 | - 'sum' 35 | - 'avg' 36 | 37 | intervals: 38 | - '1 y' 39 | 40 | groups: 41 | - 'entity_id' 42 | 43 | - # inspections in the last year associated with this entity 44 | prefix: 'inspections' 45 | from_obj: 'cleaned.inspections' 46 | knowledge_date_column: 'date' 47 | aggregates: 48 | - 49 | quantity: '*' 50 | metrics: 51 | - 'count' 52 | intervals: 53 | - '1 y' 54 | 55 | groups: 56 | - 'license_num' 57 | 58 | - # inspections that happened in the last year grouped by type of facility 59 | prefix: 'inspections' 60 | from_obj: 'cleaned.inspections' 61 | knowledge_date_column: 'date' 62 | 63 | aggregates: 64 | - 65 | quantity: '*' 66 | metrics: 67 | - 'count' 68 | intervals: 69 | - '1 y' 70 | 71 | groups: 72 | - 'facility_type' 73 | 74 | - # inspections that happened in the last year grouped by zip code 75 | prefix: 'inspections' 76 | from_obj: 'cleaned.inspections' 77 | knowledge_date_column: 'date' 78 | 79 | aggregates: 80 | - 81 | quantity: '*' 82 | metrics: 83 | - 'count' 84 | intervals: 85 | - '1 y' 86 | 87 | groups: 88 | - 'zip_code' 89 | 90 | feature_group_strategies: ['all'] 91 | 92 | model_group_keys: [] 93 | 94 | grid_config: 95 | 
'sklearn.tree.DecisionTreeClassifier': 96 | criterion: ['gini'] 97 | max_depth: [3] 98 | min_samples_split: [10] 99 | 100 | scoring: 101 | metric_groups: 102 | - 103 | metrics: ['precision@', 'recall@', 'fpr@'] 104 | thresholds: 105 | percentiles: [1.0, 2.0, 5.0, 10.0, 25.0] 106 | top_n: [25, 75, 150, 300, 500, 1000, 1500] 107 | -------------------------------------------------------------------------------- /docs/triage/experiments/inspections_baseline.yaml: -------------------------------------------------------------------------------- 1 | config_version: 'v6' 2 | 3 | model_comment: 'inspections: baseline' 4 | 5 | user_metadata: 6 | label_definition: 'failed' 7 | experiment_type: 'inspections prioritization' 8 | description: | 9 | Baseline calculation 10 | purpose: 'baseline' 11 | org: 'DSaPP' 12 | team: 'Tutorial' 13 | author: 'Your name here' 14 | etl_date: '2019-02-21' 15 | 16 | model_group_keys: 17 | - 'class_path' 18 | - 'parameters' 19 | - 'feature_names' 20 | - 'feature_groups' 21 | - 'cohort_name' 22 | - 'state' 23 | - 'label_name' 24 | - 'label_timespan' 25 | - 'training_as_of_date_frequency' 26 | - 'max_training_history' 27 | - 'label_definition' 28 | - 'experiment_type' 29 | - 'org' 30 | - 'team' 31 | - 'author' 32 | - 'purpose' 33 | - 'etl_date' 34 | 35 | temporal_config: 36 | feature_start_time: '2010-01-04' 37 | feature_end_time: '2019-01-01' 38 | label_start_time: '2015-02-01' 39 | label_end_time: '2019-01-01' 40 | 41 | model_update_frequency: '1y' 42 | training_label_timespans: ['1month'] 43 | training_as_of_date_frequencies: '1month' 44 | 45 | test_durations: '1y' 46 | test_label_timespans: ['1month'] 47 | test_as_of_date_frequencies: '1month' 48 | 49 | max_training_histories: '5y' 50 | 51 | label_config: 52 | query: | 53 | select 54 | entity_id, 55 | bool_or(result = 'fail')::integer as outcome 56 | from semantic.events 57 | where '{as_of_date}'::timestamp <= date 58 | and date < '{as_of_date}'::timestamp + interval '{label_timespan}' 59 | 
group by entity_id 60 | name: 'failed_inspections' 61 | 62 | cohort_config: 63 | query: | 64 | with buckets as ( 65 | select *, ntile(5) over (order by number_of_inspections asc) as bucket 66 | from ( 67 | select entity_id, count(*) as number_of_inspections 68 | from semantic.events 69 | group by entity_id 70 | ) as t 71 | ) 72 | select e.entity_id 73 | from semantic.entities as e 74 | inner join 75 | buckets as b 76 | using (entity_id) 77 | where 78 | daterange(start_time, end_time, '[]') @> '{as_of_date}'::date 79 | and bucket in (5) 80 | name: 'active_facilities' 81 | 82 | feature_aggregations: 83 | - 84 | prefix: 'inspections' 85 | from_obj: 'semantic.events' 86 | knowledge_date_column: 'date' 87 | 88 | aggregates_imputation: 89 | count: 90 | type: 'zero_noflag' 91 | 92 | aggregates: 93 | - 94 | quantity: 95 | total: "*" 96 | metrics: 97 | - 'count' 98 | 99 | intervals: ['all'] 100 | 101 | groups: 102 | - 'entity_id' 103 | 104 | feature_group_definition: 105 | prefix: 106 | - 'inspections' 107 | 108 | feature_group_strategies: ['all'] 109 | 110 | grid_config: 111 | 'sklearn.dummy.DummyClassifier': 112 | strategy: [prior,uniform, most_frequent] 113 | 114 | scoring: 115 | testing_metric_groups: 116 | - 117 | metrics: [precision@, recall@] 118 | thresholds: 119 | percentiles: [1.0, 2.0, 3.0, 4.0, 5.0, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100] 120 | top_n: [1, 5, 10, 25, 50, 100, 250, 500, 1000] 121 | 122 | training_metric_groups: 123 | - 124 | metrics: [accuracy] 125 | - 126 | metrics: [precision@, recall@] 127 | thresholds: 128 | percentiles: [1.0, 2.0, 3.0, 4.0, 5.0, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100] 129 | top_n: [1, 5, 10, 25, 50, 100, 250, 500, 1000] 130 | -------------------------------------------------------------------------------- /docs/triage/experiments/inspections_dt.yaml: -------------------------------------------------------------------------------- 1 | config_version: 
'v6' 2 | 3 | model_comment: 'inspections: DT' 4 | 5 | user_metadata: 6 | label_definition: 'failed' 7 | experiment_type: 'inspections prioritization' 8 | description: | 9 | Decision Tree Classifier 10 | purpose: 'data mining' 11 | org: 'DSaPP' 12 | team: 'Tutorial' 13 | author: 'Your name here' 14 | etl_date: '2019-02-21' 15 | 16 | model_group_keys: 17 | - 'class_path' 18 | - 'parameters' 19 | - 'feature_names' 20 | - 'feature_groups' 21 | - 'cohort_name' 22 | - 'state' 23 | - 'label_name' 24 | - 'label_timespan' 25 | - 'training_as_of_date_frequency' 26 | - 'max_training_history' 27 | - 'label_definition' 28 | - 'experiment_type' 29 | - 'org' 30 | - 'team' 31 | - 'author' 32 | - 'purpose' 33 | - 'etl_date' 34 | 35 | temporal_config: 36 | feature_start_time: '2010-01-04' 37 | feature_end_time: '2019-01-01' 38 | label_start_time: '2015-02-01' 39 | label_end_time: '2019-01-01' 40 | 41 | model_update_frequency: '1y' 42 | training_label_timespans: ['1month'] 43 | training_as_of_date_frequencies: '1month' 44 | 45 | test_durations: '1y' 46 | test_label_timespans: ['1month'] 47 | test_as_of_date_frequencies: '1month' 48 | 49 | max_training_histories: '5y' 50 | 51 | label_config: 52 | query: | 53 | select 54 | entity_id, 55 | bool_or(result = 'fail')::integer as outcome 56 | from semantic.events 57 | where '{as_of_date}'::timestamp <= date 58 | and date < '{as_of_date}'::timestamp + interval '{label_timespan}' 59 | group by entity_id 60 | name: 'failed_inspections' 61 | 62 | cohort_config: 63 | query: | 64 | with buckets as ( 65 | select *, ntile(5) over (order by number_of_inspections asc) as bucket 66 | from ( 67 | select entity_id, count(*) as number_of_inspections 68 | from semantic.events 69 | group by entity_id 70 | ) as t 71 | ) 72 | select e.entity_id 73 | from semantic.entities as e 74 | inner join 75 | buckets as b 76 | using (entity_id) 77 | where 78 | daterange(start_time, end_time, '[]') @> '{as_of_date}'::date 79 | and bucket in (5) 80 | name: 
'active_facilities' 81 | 82 | feature_aggregations: 83 | - 84 | prefix: 'inspections' 85 | from_obj: 'semantic.events' 86 | knowledge_date_column: 'date' 87 | 88 | aggregates_imputation: 89 | count: 90 | type: 'zero_noflag' 91 | 92 | aggregates: 93 | - 94 | quantity: 95 | total: "*" 96 | metrics: 97 | - 'count' 98 | 99 | intervals: ['1month', '3month', '6month', '1y', 'all'] 100 | 101 | groups: 102 | - 'entity_id' 103 | 104 | - 105 | prefix: 'risks' 106 | from_obj: 'semantic.events' 107 | knowledge_date_column: 'date' 108 | 109 | categoricals_imputation: 110 | sum: 111 | type: 'zero' 112 | avg: 113 | type: 'zero' 114 | 115 | categoricals: 116 | - 117 | column: 'risk' 118 | choices: ['low', 'medium', 'high'] 119 | metrics: 120 | - 'sum' 121 | - 'avg' 122 | 123 | intervals: ['1month', '3month', '6month', '1y', 'all'] 124 | 125 | groups: 126 | - 'entity_id' 127 | - 'zip_code' 128 | 129 | - 130 | prefix: 'results' 131 | from_obj: 'semantic.events' 132 | knowledge_date_column: 'date' 133 | 134 | categoricals_imputation: 135 | all: 136 | type: 'zero' 137 | 138 | categoricals: 139 | - 140 | column: 'result' 141 | choice_query: 'select distinct result from semantic.events' 142 | metrics: 143 | - 'sum' 144 | - 'avg' 145 | 146 | intervals: ['1month', '3month', '6month', '1y', 'all'] 147 | 148 | groups: 149 | - 'entity_id' 150 | 151 | - 152 | prefix: 'inspection_types' 153 | from_obj: 'semantic.events' 154 | knowledge_date_column: 'date' 155 | 156 | categoricals_imputation: 157 | sum: 158 | type: 'zero_noflag' 159 | 160 | categoricals: 161 | - 162 | column: 'type' 163 | choice_query: 'select distinct type from semantic.events where type is not null' 164 | metrics: 165 | - 'sum' 166 | 167 | intervals: ['1month', '3month', '6month', '1y', 'all'] 168 | 169 | groups: 170 | - 'entity_id' 171 | - 'zip_code' 172 | 173 | grid_config: 174 | 'sklearn.tree.DecisionTreeClassifier': 175 | max_depth: [2,10,~] 176 | min_samples_split: [2,5] 177 | 178 | feature_group_definition: 179 | 
prefix: 180 | - 'inspections' 181 | - 'results' 182 | - 'risks' 183 | - 'inspection_types' 184 | 185 | feature_group_strategies: ['all'] 186 | 187 | scoring: 188 | testing_metric_groups: 189 | - 190 | metrics: [precision@, recall@] 191 | thresholds: 192 | percentiles: [1.0, 2.0, 3.0, 4.0, 5.0, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100] 193 | top_n: [1, 5, 10, 25, 50, 100, 250, 500, 1000] 194 | 195 | 196 | training_metric_groups: 197 | - 198 | metrics: [accuracy] 199 | - 200 | metrics: [precision@, recall@] 201 | thresholds: 202 | percentiles: [1.0, 2.0, 3.0, 4.0, 5.0, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100] 203 | top_n: [1, 5, 10, 25, 50, 100, 250, 500, 1000] 204 | -------------------------------------------------------------------------------- /docs/triage/experiments/inspections_label_failed_01.yaml: -------------------------------------------------------------------------------- 1 | config_version: 'v6' 2 | 3 | model_comment: 'inspections: advanced' 4 | 5 | user_metadata: 6 | label_definition: 'failed' 7 | experiment_type: 'inspections prioritization' 8 | description: | 9 | Using Ensamble methods 10 | purpose: 'trying ensamble algorithms' 11 | org: 'DSaPP' 12 | team: 'Tutorial' 13 | author: 'Your name here' 14 | etl_date: '2019-02-21' 15 | 16 | model_group_keys: 17 | - 'class_path' 18 | - 'parameters' 19 | - 'feature_names' 20 | - 'feature_groups' 21 | - 'cohort_name' 22 | - 'state' 23 | - 'label_name' 24 | - 'label_timespan' 25 | - 'training_as_of_date_frequency' 26 | - 'max_training_history' 27 | - 'label_definition' 28 | - 'experiment_type' 29 | - 'org' 30 | - 'team' 31 | - 'author' 32 | - 'purpose' 33 | - 'etl_date' 34 | 35 | temporal_config: 36 | feature_start_time: '2010-01-04' 37 | feature_end_time: '2019-01-01' 38 | label_start_time: '2015-02-01' 39 | label_end_time: '2019-01-01' 40 | 41 | model_update_frequency: '1y' 42 | training_label_timespans: ['1month'] 43 | 
training_as_of_date_frequencies: '1month' 44 | 45 | test_durations: '1y' 46 | test_label_timespans: ['1month'] 47 | test_as_of_date_frequencies: '1month' 48 | 49 | max_training_histories: '5y' 50 | 51 | label_config: 52 | query: | 53 | select 54 | entity_id, 55 | bool_or(result = 'fail')::integer as outcome 56 | from semantic.events 57 | where '{as_of_date}'::timestamp <= date 58 | and date < '{as_of_date}'::timestamp + interval '{label_timespan}' 59 | group by entity_id 60 | name: 'failed_inspections' 61 | 62 | 63 | cohort_config: 64 | query: | 65 | with buckets as ( 66 | select *, ntile(5) over (order by number_of_inspections asc) as bucket 67 | from ( 68 | select entity_id, count(*) as number_of_inspections 69 | from semantic.events 70 | group by entity_id 71 | ) as t 72 | ) 73 | select e.entity_id 74 | from semantic.entities as e 75 | inner join 76 | buckets as b 77 | using (entity_id) 78 | where 79 | daterange(start_time, end_time, '[]') @> '{as_of_date}'::date 80 | and bucket in (5) 81 | name: 'active_facilities' 82 | 83 | feature_aggregations: 84 | - 85 | prefix: 'inspections' 86 | from_obj: 'semantic.events' 87 | knowledge_date_column: 'date' 88 | 89 | aggregates_imputation: 90 | count: 91 | type: 'zero_noflag' 92 | 93 | aggregates: 94 | - 95 | quantity: 96 | total: "*" 97 | metrics: 98 | - 'count' 99 | 100 | intervals: ['1month', '3month', '6month', '1y', 'all'] 101 | 102 | groups: 103 | - 'entity_id' 104 | 105 | - 106 | prefix: 'risks' 107 | from_obj: 'semantic.events' 108 | knowledge_date_column: 'date' 109 | 110 | categoricals_imputation: 111 | sum: 112 | type: 'zero' 113 | avg: 114 | type: 'zero' 115 | 116 | categoricals: 117 | - 118 | column: 'risk' 119 | choices: ['low', 'medium', 'high'] 120 | metrics: 121 | - 'sum' 122 | - 'avg' 123 | 124 | intervals: ['1month', '3month', '6month', '1y', 'all'] 125 | 126 | groups: 127 | - 'entity_id' 128 | - 'zip_code' 129 | 130 | - 131 | prefix: 'results' 132 | from_obj: 'semantic.events' 133 | 
knowledge_date_column: 'date' 134 | 135 | categoricals_imputation: 136 | all: 137 | type: 'zero' 138 | 139 | categoricals: 140 | - 141 | column: 'result' 142 | choice_query: 'select distinct result from semantic.events' 143 | metrics: 144 | - 'sum' 145 | - 'avg' 146 | 147 | intervals: ['1month', '3month', '6month', '1y', 'all'] 148 | 149 | groups: 150 | - 'entity_id' 151 | 152 | - 153 | prefix: 'inspection_types' 154 | from_obj: 'semantic.events' 155 | knowledge_date_column: 'date' 156 | 157 | categoricals_imputation: 158 | sum: 159 | type: 'zero_noflag' 160 | 161 | categoricals: 162 | - 163 | column: 'type' 164 | choice_query: 'select distinct type from semantic.events where type is not null' 165 | metrics: 166 | - 'sum' 167 | 168 | intervals: ['1month', '3month', '6month', '1y', 'all'] 169 | 170 | groups: 171 | - 'entity_id' 172 | - 'zip_code' 173 | 174 | feature_group_definition: 175 | prefix: 176 | - 'inspections' 177 | - 'results' 178 | - 'risks' 179 | - 'inspection_types' 180 | 181 | feature_group_strategies: ['all', 'leave-one-in', 'leave-one-out'] 182 | 183 | grid_config: 184 | 'sklearn.ensemble.RandomForestClassifier': 185 | max_features: ['sqrt'] 186 | criterion: ['gini'] 187 | n_estimators: [100, 250] 188 | min_samples_split: [2,10] 189 | 190 | scoring: 191 | testing_metric_groups: 192 | - 193 | metrics: [precision@, recall@] 194 | thresholds: 195 | percentiles: [1.0, 2.0, 3.0, 4.0, 5.0, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100] 196 | top_n: [1, 5, 10, 25, 50, 100, 250, 500, 1000] 197 | 198 | training_metric_groups: 199 | - 200 | metrics: [accuracy] 201 | - 202 | metrics: [precision@, recall@] 203 | thresholds: 204 | percentiles: [1.0, 2.0, 3.0, 4.0, 5.0, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100] 205 | top_n: [1, 5, 10, 25, 50, 100, 250, 500, 1000] 206 | -------------------------------------------------------------------------------- /docs/triage/experiments/simple_test_skeleton.yaml: 
-------------------------------------------------------------------------------- 1 | config_version: 'v6' 2 | 3 | model_comment: 'simple_test_skeleton' 4 | 5 | user_metadata: 6 | label_definition: 'failed_inspection' 7 | experiment_type: 'test' 8 | org: 'DSaPP' 9 | team: 'Tutorial' 10 | author: 'Adolfo De Unanue' 11 | etl_date: '2019-02-21' 12 | 13 | temporal_config: 14 | feature_start_time: '2014-01-01' 15 | feature_end_time: '2018-01-01' 16 | label_start_time: '2014-01-02' 17 | label_end_time: '2018-01-01' 18 | 19 | model_update_frequency: '1y' 20 | 21 | max_training_histories: '1y' 22 | training_label_timespans: ['1y'] 23 | training_as_of_date_frequencies: '1month' 24 | 25 | test_durations: '0d' 26 | test_label_timespans: ['1y'] 27 | test_as_of_date_frequencies: '1month' 28 | 29 | cohort_config: 30 | query: | 31 | select entity_id 32 | from semantic.entities 33 | where 34 | license_num in (1596210, 1874347, 1142451) 35 | and daterange(start_time, end_time, '[]') @> '{as_of_date}'::date 36 | name: 'test_facilities' 37 | 38 | label_config: 39 | query: | 40 | select 41 | entity_id, 42 | bool_or(result = 'fail')::integer as outcome 43 | from semantic.events 44 | where '{as_of_date}'::timestamp <= date 45 | and date < '{as_of_date}'::timestamp + interval '{label_timespan}' 46 | group by entity_id 47 | name: 'failed_inspections' 48 | 49 | grid_config: 50 | 'sklearn.dummy.DummyClassifier': 51 | strategy: [most_frequent] 52 | 53 | feature_aggregations: 54 | - 55 | prefix: 'inspections' 56 | from_obj: 'semantic.events' 57 | knowledge_date_column: 'date' 58 | 59 | aggregates_imputation: 60 | count: 61 | type: 'zero_noflag' 62 | 63 | aggregates: 64 | - 65 | quantity: 66 | total: "*" 67 | metrics: 68 | - 'count' 69 | 70 | intervals: ['1month', '3month', '6month', '1y', 'all'] 71 | 72 | groups: 73 | - 'entity_id' 74 | 75 | 76 | - 77 | prefix: 'risks' 78 | from_obj: 'semantic.events' 79 | knowledge_date_column: 'date' 80 | 81 | categoricals_imputation: 82 | sum: 83 | type: 
'zero' 84 | avg: 85 | type: 'zero' 86 | 87 | categoricals: 88 | - 89 | column: 'risk' 90 | choices: ['low', 'medium', 'high'] 91 | metrics: 92 | - 'sum' 93 | - 'avg' 94 | 95 | intervals: ['1month', '3month', '6month', '1y', 'all'] 96 | 97 | groups: 98 | - 'entity_id' 99 | - 'zip_code' 100 | 101 | - 102 | prefix: 'results' 103 | from_obj: 'semantic.events' 104 | knowledge_date_column: 'date' 105 | 106 | categoricals_imputation: 107 | all: 108 | type: 'zero' 109 | 110 | categoricals: 111 | - 112 | column: 'result' 113 | choice_query: 'select distinct result from semantic.events' 114 | metrics: 115 | - 'sum' 116 | - 'avg' 117 | 118 | intervals: 119 | - '6month' 120 | 121 | groups: 122 | - 'entity_id' 123 | 124 | feature_group_definition: 125 | prefix: 126 | - 'results' 127 | - 'risks' 128 | - 'inspections' 129 | 130 | feature_group_strategies: ['all'] 131 | 132 | model_group_keys: 133 | - 'class_path' 134 | - 'parameters' 135 | - 'feature_names' 136 | - 'feature_groups' 137 | - 'cohort_name' 138 | - 'state' 139 | - 'label_name' 140 | - 'label_timespan' 141 | - 'training_as_of_date_frequency' 142 | - 'max_training_history' 143 | - 'label_definition' 144 | - 'experiment_type' 145 | - 'org' 146 | - 'team' 147 | - 'author' 148 | - 'etl_date' 149 | 150 | scoring: 151 | testing_metric_groups: 152 | - 153 | metrics: ['precision@', 'recall@'] 154 | thresholds: 155 | percentiles: [1.0, 5.0, 10.0, 25.0, 50.0, 75.0, 100.0] 156 | top_n: [1, 5, 10, 25, 50, 100, 150, 300, 500, 1000, 1500] 157 | training_metric_groups: 158 | - 159 | metrics: ['accuracy'] 160 | - 161 | metrics: ['precision@', 'recall@'] 162 | thresholds: 163 | percentiles: [1.0, 5.0, 10.0, 25.0, 50.0, 75.0, 100.0] 164 | top_n: [1, 5, 10, 25, 50, 100, 150, 300, 500, 1000, 1500] 165 | -------------------------------------------------------------------------------- /docs/triage/images/distance_from_best_precision@10_pct.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/triage/images/distance_from_best_precision@10_pct.png -------------------------------------------------------------------------------- /docs/triage/images/eis_01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/triage/images/eis_01.png -------------------------------------------------------------------------------- /docs/triage/images/inspections_baseline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/triage/images/inspections_baseline.png -------------------------------------------------------------------------------- /docs/triage/images/inspections_dt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/triage/images/inspections_dt.png -------------------------------------------------------------------------------- /docs/triage/images/inspections_label_failed_01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/triage/images/inspections_label_failed_01.png -------------------------------------------------------------------------------- /docs/triage/images/metric_over_time_precision@10_pct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/triage/images/metric_over_time_precision@10_pct.png -------------------------------------------------------------------------------- /docs/triage/images/precision@10_pct_next_time.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/triage/images/precision@10_pct_next_time.png -------------------------------------------------------------------------------- /docs/triage/images/regret_distance_from_best_rules_precision@10_pct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/triage/images/regret_distance_from_best_rules_precision@10_pct.png -------------------------------------------------------------------------------- /docs/triage/images/regret_over_time_precision@10_pct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/triage/images/regret_over_time_precision@10_pct.png -------------------------------------------------------------------------------- /docs/triage/images/simple_test_skeleton.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/triage/images/simple_test_skeleton.png -------------------------------------------------------------------------------- /infrastructure/aws_batch/credentials.filter.example: -------------------------------------------------------------------------------- 1 | { 2 | "environment": [ 3 | { 4 | "name": "AWS_ACCESS_KEY_ID", 5 | "value": .Credentials.AccessKeyId 6 | }, 7 | { 8 | "name": "AWS_SECRET_ACCESS_KEY", 9 | "value": .Credentials.SecretAccessKey 10 | }, 11 | { 12 | "name": "AWS_SESSION_TOKEN", 13 | "value": .Credentials.SessionToken 14 | } 15 | ] 16 | } 17 | -------------------------------------------------------------------------------- /infrastructure/aws_batch/triage-job-definition.json.example: 
-------------------------------------------------------------------------------- 1 | { 2 | "containerProperties": { 3 | "command": [ 4 | "--tb", 5 | "Ref::experiment_file", 6 | "--project-path", 7 | "Ref::output_path", 8 | "Ref::replace", 9 | "Ref::save_predictions", 10 | "Ref::profile", 11 | "Ref::validate" 12 | ], 13 | "image": "AWS_ACCOUNT.dkr.ecr.us-west-2.amazonaws.com/YOUR_TRIAGE_IMAGE", 14 | "jobRoleArn": "arn:aws:iam::AWS_ACCOUNT:role/dsappBatchJobRole", 15 | "memory": 16000, 16 | "vcpus": 1 17 | }, 18 | "jobDefinitionName": "triage-cli-experiment", 19 | "retryStrategy": { 20 | "attempts": 1 21 | }, 22 | "type": "container" 23 | } 24 | -------------------------------------------------------------------------------- /infrastructure/aws_batch/triage-overrides.json.example: -------------------------------------------------------------------------------- 1 | { 2 | "environment": [ 3 | { 4 | "name":"AWS_DEFAULT_REGION", 5 | "value":"us-west-2" 6 | }, 7 | { 8 | "name":"AWS_JOB_QUEUE", 9 | "value":"" 10 | }, 11 | { 12 | "name":"POSTGRES_PASSWORD", 13 | "value":"" 14 | }, 15 | { 16 | "name":"POSTGRES_USER", 17 | "value":"" 18 | }, 19 | { 20 | "name":"POSTGRES_DB", 21 | "value":"" 22 | }, 23 | { 24 | "name":"POSTGRES_PORT", 25 | "value":"" 26 | }, 27 | { 28 | "name":"POSTGRES_HOST", 29 | "value":"" 30 | } 31 | ] 32 | } 33 | -------------------------------------------------------------------------------- /infrastructure/bastion/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.6-stretch 2 | 3 | ## Installing clients: register the PostgreSQL APT repository, then install the PostgreSQL 9.6 packages (PostGIS, contrib, pgRouting, libpq). The repository signing key is fetched over HTTPS so it cannot be swapped in transit before being trusted by apt-key. 4 | RUN sh -c "echo 'deb http://apt.postgresql.org/pub/repos/apt/ stretch-pgdg main' > /etc/apt/sources.list.d/pgdg.list" && \ 5 | wget --quiet -O - https://www.postgresql.org/media/keys/ACCC4CF8.asc | apt-key add - && \ 6 | apt-get -y update && \ 7 | apt-get -y install less postgresql-9.6-postgis-2.2 \ 8 | postgresql-contrib-9.6 \ 9 | libpq-dev postgresql-9.6-pgrouting 10 | 11 | COPY
session.key . 12 | COPY requirements.txt . 13 | 14 | RUN pip install --no-cache-dir -r requirements.txt 15 | 16 | WORKDIR triage 17 | -------------------------------------------------------------------------------- /infrastructure/bastion/requirements.txt: -------------------------------------------------------------------------------- 1 | ipython 2 | jupyter 3 | 4 | ## DSaPP stuff 5 | git+https://github.com/dssg/triage.git 6 | -------------------------------------------------------------------------------- /infrastructure/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | 3 | services: 4 | food_db: 5 | build: 6 | context: ./food_db 7 | image: tutorial/db 8 | container_name: food_db 9 | env_file: ../.env 10 | volumes: 11 | - "../data:/tmp/raw-data" 12 | ports: 13 | - "5434:5432" 14 | 15 | bastion: 16 | build: 17 | context: ./bastion 18 | image: tutorial/bastion 19 | container_name: tutorial_bastion 20 | command: bash 21 | #user: ${UID}:${GID} 22 | tty: true 23 | env_file: ../.env 24 | environment: 25 | DATABASE_URL: 'postgresql://${POSTGRES_USER}:${POSTGRES_PASSWORD}@food_db/food' 26 | volumes: 27 | - "../data:/data" 28 | - "../triage:/triage" 29 | - "../src/sql:/sql" 30 | ports: 31 | - "56406-56410:56406-56410" 32 | 33 | triage: 34 | build: 35 | context: ./triage 36 | image: tutorial/triage:v3.3.0 37 | container_name: tutorial_triage 38 | env_file: ../.env 39 | environment: 40 | DATABASE_URL: 'postgresql://${POSTGRES_USER}:${POSTGRES_PASSWORD}@food_db/food' 41 | TRIAGE_OUTPUT_PATH: '/triage/output' 42 | volumes: 43 | - "../triage:/triage" 44 | - "../src/sql:/sql" 45 | -------------------------------------------------------------------------------- /infrastructure/env_example: -------------------------------------------------------------------------------- 1 | POSTGRES_HOST=0.0.0.0 2 | POSTGRES_USER=food_user 3 | POSTGRES_DB=food 4 | POSTGRES_PORT=5434 5 | POSTGRES_PASSWORD=your_password 6 | 7 | 
UID=1000 8 | GID=1000 9 | -------------------------------------------------------------------------------- /infrastructure/food_db/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM postgres:10 2 | 3 | ## PostGIS activation 4 | RUN apt-get -y update \ 5 | && apt-get -y install wget \ 6 | && wget --quiet -O - http://apt.postgresql.org/pub/repos/apt/ACCC4CF8.asc | apt-key add - \ 7 | && apt-get -y update \ 8 | && apt-get -y install postgresql-10-postgis-2.4 postgis postgresql-10-pgrouting 9 | 10 | 11 | ## DB setup 12 | ADD activate_postgis.sql /docker-entrypoint-initdb.d/ 13 | ADD create_inspections_table.sql /docker-entrypoint-initdb.d/ 14 | ADD create_extensions.sql /docker-entrypoint-initdb.d/ 15 | ADD nuke_triage.sql /docker-entrypoint-initdb.d/ 16 | 17 | RUN chown postgres:postgres /docker-entrypoint-initdb.d/*.sql 18 | -------------------------------------------------------------------------------- /infrastructure/food_db/activate_postgis.sql: -------------------------------------------------------------------------------- 1 | CREATE SCHEMA postgis; 2 | 3 | ALTER DATABASE food SET search_path=public, postgis, contrib; 4 | 5 | CREATE EXTENSION postgis SCHEMA postgis; 6 | CREATE EXTENSION pgrouting; 7 | -------------------------------------------------------------------------------- /infrastructure/food_db/create_extensions.sql: -------------------------------------------------------------------------------- 1 | create extension fuzzystrmatch; 2 | -------------------------------------------------------------------------------- /infrastructure/food_db/create_inspections_table.sql: -------------------------------------------------------------------------------- 1 | create schema if not exists raw; 2 | 3 | create table raw.inspections ( 4 | inspection varchar not null, 5 | DBA_Name varchar, 6 | AKA_Name varchar, 7 | license_Num decimal, 8 | facility_type varchar, 9 | risk varchar, 10 | address varchar, 11 | city 
varchar, 12 | state varchar, 13 | zip varchar, 14 | date date, 15 | type varchar, 16 | results varchar, 17 | violations varchar, 18 | latitude decimal, 19 | longitude decimal, 20 | location varchar 21 | ); 22 | -------------------------------------------------------------------------------- /infrastructure/food_db/nuke_triage.sql: -------------------------------------------------------------------------------- 1 | create or replace function nuke_triage() 2 | returns text as $result$ 3 | -- Drops the schemas and the cohort_/labels_ tables named below, then returns a confirmation message. 4 | declare 5 | result text; 6 | query text; 7 | 8 | begin 9 | 10 | execute 'drop schema if exists model_metadata cascade'; 11 | raise notice 'model_metadata deleted'; 12 | execute 'drop schema if exists features cascade'; 13 | raise notice 'features deleted'; 14 | execute 'drop schema if exists train_results cascade'; 15 | raise notice 'train_results deleted'; 16 | execute 'drop schema if exists test_results cascade'; 17 | raise notice 'test_results deleted'; 18 | 19 | execute 'drop table if exists results_schema_versions'; 20 | raise notice 'results_schema_versions deleted'; 21 | 22 | 23 | select into query 24 | string_agg( 25 | format('drop table %I.%I cascade;', schemaname, tablename), E'\n' -- schema-qualified: pg_tables lists every schema, so an unqualified name fails when the table is outside the search_path 26 | ) 27 | from pg_tables 28 | where tablename ~ 'cohort_|labels_'; 29 | 30 | 31 | 32 | if query is not null then 33 | raise notice '%', query; 34 | execute query; 35 | else 36 | raise notice 'no labels or states tables from triage found'; 37 | end if; 38 | 39 | return 'triage was send to the oblivion. Long live to triage!'; 40 | end; 41 | $result$ language plpgsql; 42 | -------------------------------------------------------------------------------- /infrastructure/triage/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.6 2 | 3 | LABEL triage.version="v3.3.0" \ 4 | triage.from="cli" \ 5 | creator="Center for Data Science and Public Policy (DSaPP)" \ 6 | maintainer="Adolfo De Unánue " 7 | 8 | RUN apt update 9 | 10 | COPY requirements.txt .
11 | 12 | RUN pip install --no-cache-dir -r requirements.txt 13 | 14 | RUN mkdir triage 15 | 16 | WORKDIR triage 17 | 18 | ENTRYPOINT [ "triage", "experiment" ] 19 | -------------------------------------------------------------------------------- /infrastructure/triage/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /infrastructure/triage/requirements.txt: -------------------------------------------------------------------------------- 1 | ## DSaPP stuff 2 | git+https://github.com/dssg/triage.git 3 | -------------------------------------------------------------------------------- /infrastructure/triage/setup.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from setuptools import setup 4 | 5 | setup( 6 | name='triage_experiment', 7 | version='0.1', 8 | py_modules=['triage_experiment'], 9 | entry_points=''' 10 | [console_scripts] 11 | triage_experiment=triage_experiment:triage 12 | ''', 13 | ) 14 | -------------------------------------------------------------------------------- /infrastructure/triage/triage_experiment.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import os 4 | import yaml 5 | 6 | import datetime 7 | 8 | import click 9 | 10 | from triage.experiments import SingleThreadedExperiment 11 | from triage.component.catwalk.utils import filename_friendly_hash 12 | from triage import create_engine 13 | 14 | from utils import show_timechop, show_features_queries, show_model, audit_experiment 15 | 16 | import logging 17 | 18 | logging_level = logging.WARNING 19 | 20 | logging.basicConfig( 21 | format="%(name)-30s %(asctime)s %(levelname)10s %(process)6d %(filename)-24s %(lineno)4d: %(message)s", 22 | datefmt = "%d/%m/%Y %I:%M:%S %p", 23 | level=logging_level, 24 | handlers=[logging.StreamHandler()] 25 
| ) 26 | 27 | @click.group() 28 | @click.option('--config_file', type=click.Path(), 29 | help="""Triage's experiment congiguration file name 30 | NOTE: It's assumed that the file is located inside 31 | triage/experiments)""", 32 | required=True) 33 | @click.option('--triage_db', envvar='TRIAGE_DB_URL', type=click.STRING, 34 | help="""DB URL, in the form of 'postgresql://user:password@host_db:host_port/db', 35 | by default it gets this from the environment (TRIAGE_DB_URL)""", 36 | required=True) 37 | @click.option('--replace/--no-replace', 38 | help="Triage will (or won't) replace all the matrices and models", 39 | default=True) ## Default True so it matches the default behaviour of Triage 40 | @click.option('--debug', is_flag=True, 41 | help="Activate to get a lot of information in your screen") 42 | @click.pass_context 43 | def triage(ctx, config_file, triage_db, replace, debug): 44 | 45 | config_file = os.path.join(os.sep, "triage", "experiments", config_file) 46 | 47 | click.echo(f"Using the config file {config_file}") 48 | 49 | with open(config_file) as f: 50 | experiments = yaml.load(f) 51 | 52 | click.echo(f"The output (matrices and models) of this experiment will be stored in triage/output") 53 | click.echo(f"Using data stored in {triage_db}") 54 | click.echo(f"The experiment will utilize any preexisting matrix or model: {not replace}") 55 | click.echo(f"Creating experiment object") 56 | 57 | experiment = SingleThreadedExperiment( 58 | config=experiments, 59 | db_engine=create_engine(triage_db), 60 | project_path='/triage/output', 61 | cleanup=True, 62 | replace=replace 63 | ) 64 | 65 | ctx.obj = experiment 66 | 67 | if debug: 68 | logging.basicConfig(level=logging.DEBUG) 69 | click.echo("Debug enabled (Expect A LOT of output at the screen!!!)") 70 | 71 | click.echo("Experiment loaded") 72 | 73 | @triage.command() 74 | @click.pass_obj 75 | def validate(experiment): 76 | click.echo("Validating experiment's configuration") 77 | experiment.validate() 78 | 79 | 
click.echo(""" 80 | The experiment configuration doesn't contain any obvious errors. 81 | Any error that occurs from now on, possibly will be related to hit the maximum 82 | number of columns allowed or collision in 83 | the column names, both due to PostgreSQL limitations. 84 | """) 85 | 86 | click.echo("The experiment looks in good shape. May the force be with you") 87 | 88 | @triage.command() 89 | @click.pass_obj 90 | def run(experiment): 91 | start_time = datetime.datetime.now() 92 | 93 | click.echo("Executing experiment") 94 | experiment.run() 95 | click.echo("Done") 96 | 97 | end_time = datetime.datetime.now() 98 | click.echo(f"Experiment completed in {end_time - start_time} seconds") 99 | 100 | @triage.command() 101 | @click.pass_obj 102 | def show_feature_generators(experiment): 103 | pass 104 | 105 | @triage.command() 106 | @click.pass_obj 107 | def show_temporal_blocks(experiment): 108 | click.echo("Generating temporal blocks image") 109 | chopper = experiment.chopper 110 | file_name = f"{experiment.config['model_comment'].replace(' ', '_')}.svg" 111 | image_path=show_timechop(chopper, file_name=file_name) 112 | click.echo("Image stored in:") 113 | click.echo(image_path) 114 | return image_path 115 | 116 | @triage.command() 117 | @click.pass_obj 118 | @click.option('--model', 119 | help="Model to plot", 120 | required=True) 121 | def show_model_plot(experiment, model): 122 | click.echo("Generating model image") 123 | image_path = show_model(model) 124 | click.echo("Image stored in: ") 125 | click.echo(image_path) 126 | 127 | return image_path 128 | 129 | 130 | @triage.command() 131 | @click.pass_obj 132 | @click.option('--metric', 133 | help="Model to plot", 134 | required=True) 135 | @click.option('--rules', 136 | help="Path to selection rules", 137 | required=True) 138 | def audit_models(experiment, metric, rules): 139 | click.echo("Auditing experiment") 140 | experiment_hash = filename_friendly_hash(experiment.config) 141 | 142 | with 
open(f"/triage/selection_rules/{rules}") as f: 143 | rules = yaml.load(f) 144 | 145 | metric, k = metric.split('@') 146 | 147 | audit_experiment(experiment_hash, f"{metric}@", k, rules) 148 | -------------------------------------------------------------------------------- /infrastructure/web/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nginx 2 | 3 | COPY default.conf /etc/nginx/conf.d/default.conf 4 | 5 | RUN chown -R nginx:nginx /usr/share/nginx/html/ 6 | 7 | VOLUME /usr/share/nginx/html 8 | -------------------------------------------------------------------------------- /infrastructure/web/default.conf: -------------------------------------------------------------------------------- 1 | server { 2 | listen 80; 3 | server_name localhost; 4 | 5 | #charset koi8-r; 6 | #access_log /var/log/nginx/log/host.access.log main; 7 | 8 | location / { 9 | root /usr/share/nginx/html; 10 | index index.html index.htm; 11 | } 12 | 13 | #error_page 404 /404.html; 14 | 15 | # redirect server error pages to the static page /50x.html 16 | # 17 | error_page 500 502 503 504 /50x.html; 18 | location = /50x.html { 19 | root /usr/share/nginx/html; 20 | } 21 | 22 | # proxy the PHP scripts to Apache listening on 127.0.0.1:80 23 | # 24 | #location ~ \.php$ { 25 | # proxy_pass http://127.0.0.1; 26 | #} 27 | 28 | # pass the PHP scripts to FastCGI server listening on 127.0.0.1:9000 29 | # 30 | #location ~ \.php$ { 31 | # root html; 32 | # fastcgi_pass 127.0.0.1:9000; 33 | # fastcgi_index index.php; 34 | # fastcgi_param SCRIPT_FILENAME /scripts$fastcgi_script_name; 35 | # include fastcgi_params; 36 | #} 37 | 38 | # deny access to .htaccess files, if Apache's document root 39 | # concurs with nginx's one 40 | # 41 | #location ~ /\.ht { 42 | # deny all; 43 | #} 44 | } -------------------------------------------------------------------------------- /org/00_instructions.org: 
-------------------------------------------------------------------------------- 1 | #+STARTUP: showeverything 2 | #+STARTUP: nohideblocks 3 | #+STARTUP: indent 4 | #+STARTUP: align 5 | #+STARTUP: inlineimages 6 | #+STARTUP: latexpreview 7 | #+PROPERTY: header-args:sql :engine postgresql 8 | #+PROPERTY: header-args:sql+ :dbhost 0.0.0.0 9 | #+PROPERTY: header-args:sql+ :dbport 5434 10 | #+PROPERTY: header-args:sql+ :dbuser food_user 11 | #+PROPERTY: header-args:sql+ :dbpassword some_password 12 | #+PROPERTY: header-args:sql+ :database food 13 | #+PROPERTY: header-args:sql+ :results table drawer 14 | #+PROPERTY: header-args:sql+ :cmdline -q 15 | #+PROPERTY: header-args:sh :results verbatim org 16 | #+PROPERTY: header-args:sh+ :prologue exec 2>&1 :epilogue : 17 | #+PROPERTY: header-args:ipython :session Food_inspections 18 | #+PROPERTY: header-args:ipython+ :results raw drawer 19 | #+OPTIONS: broken-links:mark 20 | #+OPTIONS: tasks:todo 21 | #+OPTIONS: LaTeX:t 22 | 23 | 24 | * Welcome! 25 | 26 | This tutorial will show you how to use =triage=, a data science 27 | modeling tool developed at the [[http://dsapp.uchicago.edu][Center for Data Science and Public 28 | Policy]] (DSaPP) at the University of Chicago. 29 | 30 | =triage= helps build models for three [[https://dssg.uchicago.edu/data-science-for-social-good-conference-2017/training-workshop-data-science-for-social-good-problem-templates/][common applied problems]]: (a) Early 31 | warning systems (*EWS* or *EIS*), (b) /resource prioritization/ (a.k.a "an 32 | inspections problem") and (c) interaction level predictions (a.k.a 33 | "appointment level"). These problems 34 | are difficult to model because their conceptualization and 35 | implementation are prone to error, thanks to their multi-dimensional, 36 | multi-entity, time-series structure.
37 | 38 | The latest version of this tutorial is published at [[https://dssg.github.io/dirtyduck/]] 39 | 40 | *NOTE* This tutorial is in sync with the latest version of =triage=. At 41 | this moment [[https://github.com/dssg/triage/releases/tag/v3.3.0][v3.3.0 (Arepa)]]. 42 | 43 | * Before you start 44 | ** What you need for this tutorial 45 | 46 | Install [[http://www.docker.com][Docker CE]] and [[https://docs.docker.com/compose/][Docker Compose]]. That's it. 47 | Follow the links for installation instructions. 48 | 49 | Note that if you are using =GNU/Linux= you should add your user to the 50 | =docker= group following the instructions at this [[https://docs.docker.com/install/linux/linux-postinstall/][link]]. 51 | 52 | At the moment only operating systems with *nix-type command lines are 53 | supported, such as =GNU/Linux= and =MacOS=. Recent versions of 54 | =Windows= may also work. 55 | 56 | ** How to use this tutorial 57 | 58 | First, clone this repository on your laptop 59 | 60 | #+BEGIN_EXAMPLE 61 | git clone https://github.com/dssg/dirtyduck.git 62 | #+END_EXAMPLE 63 | 64 | Second, run 65 | 66 | #+BEGIN_EXAMPLE 67 | ./tutorial.sh start 68 | #+END_EXAMPLE 69 | 70 | This will take several minutes the first time you do it.
71 | 72 | 73 | ** How you can help to improve this tutorial 74 | 75 | If you want to contribute, please follow the suggestions in the [[file:~/projects/dsapp/dirtyduck/README.org::*How%20you%20can%20help][README]] 76 | -------------------------------------------------------------------------------- /org/01_intro.org: -------------------------------------------------------------------------------- 1 | #+STARTUP: showeverything 2 | #+STARTUP: nohideblocks 3 | #+STARTUP: indent 4 | #+STARTUP: align 5 | #+STARTUP: inlineimages 6 | #+STARTUP: latexpreview 7 | #+PROPERTY: header-args:sql :engine postgresql 8 | #+PROPERTY: header-args:sql+ :dbhost 0.0.0.0 9 | #+PROPERTY: header-args:sql+ :dbport 5434 10 | #+PROPERTY: header-args:sql+ :dbuser food_user 11 | #+PROPERTY: header-args:sql+ :dbpassword some_password 12 | #+PROPERTY: header-args:sql+ :database food 13 | #+PROPERTY: header-args:sql+ :results table drawer 14 | #+PROPERTY: header-args:sql+ :cmdline -q 15 | #+PROPERTY: header-args:sh :results verbatim org 16 | #+PROPERTY: header-args:sh+ :prologue exec 2>&1 :epilogue : 17 | #+PROPERTY: header-args:ipython :session Food_inspections 18 | #+PROPERTY: header-args:ipython+ :results raw drawer 19 | #+OPTIONS: broken-links:mark 20 | #+OPTIONS: tasks:todo 21 | #+OPTIONS: LaTeX:t 22 | 23 | 24 | * Description of the problem 25 | 26 | This tutorial aims to introduce the reader to [[https://github.com/dssg/triage][triage]], a machine learning modeling tool built by the [[https://dsapp.uchicago.edu][Center for Data Science and Public Policy]]. 27 | We will use the well-known [[https://data.cityofchicago.org/Health-Human-Services/Food-Inspections/4ijn-s7e5][Chicago Food Inspections dataset]].[fn:1] 28 | 29 | We will present the two problems that =triage= was built to model[fn:5]: 30 | 31 | 1. *Resource prioritization* (internally known as the /inspections 32 | problem/)[fn:2] and 33 | 2. 
*Early warning*.[fn:3] 34 | 35 | 36 | ** Inspection Prioritization 37 | 38 | In an ideal world, inspectors would frequently visit every food 39 | facility, every day[fn:4] to ensure it meets safety standards. But 40 | the real world doesn't have enough 41 | inspectors for that to happen, so the city needs to decide how to allocate 42 | its limited inspection workforce to find and remediate as many establishments 43 | with food hazards as possible. Assuming the city can inspect $n$ facilities 44 | in the next $X$ period of time, they can define the problem like this: 45 | 46 | #+CAPTION: How to define Chicago Food Inspections as an inspection-prioritization problem: 47 | #+begin_quote 48 | Which $n$ facilities will have a food violation in the 49 | following $X$ period of time? 50 | #+end_quote 51 | 52 | If our inspection workforce is really limited, we should probably just target 53 | the most serious violations. Then we'd define the problem like this: 54 | 55 | #+CAPTION: How to define Chicago Food Inspections as an inspection-prioritization problem that targets the most serious cases: 56 | #+begin_quote 57 | Which $n$ facilities will have a critical or serious violation in the 58 | following $X$ period of time? 59 | #+end_quote 60 | 61 | 62 | ** Early Warning 63 | Using the same data set, facility owners or managers would pose the 64 | ML problem as an early warning problem. 65 | They'd like to know whether an inspector is going to visit their facility 66 | so they can prepare for it. They can define the problem like this: 67 | 68 | #+CAPTION: How to define Chicago Food Inspections as an early warning problem: 69 | #+begin_quote 70 | Will my facility be inspected in the next $X$ period of time? 71 | #+end_quote 72 | 73 | Note that in both cases, we are defining a period of time in which the 74 | event potentially will happen. 75 | 76 | ** What do they have in common? 
77 | For either problem, $X$ could be a day, a week, a month, a quarter, a year, 56 days, 78 | or some other time period. 79 | 80 | Without going into detail, both problems use data where each 81 | row describes an *event* in which an *entity* was involved, and 82 | each event has a specific *outcome* or result. 83 | 84 | The *entity* for both inspection prioritizations and early warnings 85 | in this tutorial is a food /facility/, and the *event* is an inspection. 86 | But the *outcome* differs: for inspections the outcome is /inspection failed/ 87 | or /major violation found/, while for early warning the outcome is 88 | /inspected/. 89 | 90 | ** How do they differ? 91 | 92 | Besides the obvious (i.e. the label), these ML problem formulations 93 | have very different internal structures: 94 | 95 | In the /EIS/ problem, *all* the entities of interest in a given period of 96 | time *have* a label. The /Inspections/ problem does not have that 97 | luxury: of all the existing entities of interest, only a few are 98 | /inspected/, which means that only those inspected have a label 99 | (=True/False=) while all the remaining ones don't have one. This will be 100 | reflected, for example, in the /training/ matrices: you only train on the 101 | facilities that were inspected (so you will have fewer rows in 102 | them). Another impact will be in the metrics: you need to be very 103 | careful about interpreting the metrics in an inspections 104 | problem. Finally, when you are designing the field validation of your 105 | model, you need to take this selection bias into account; if not, you 106 | will be inspecting the same facilities over and over.[fn:6] 107 | 108 | 109 | * Footnotes 110 | 111 | [fn:6] This point is particularly acute: Imagine the scenario in 112 | which the /inspections/ problem is *crime prediction* in order to send 113 | cops (inspectors) to that "risky" area (facilities)... 114 | 115 | [fn:5] It is also possible to model a "visit-level prediction" type of ML problem.
116 | 117 | [fn:4] Defined as "bakery, banquet 118 | hall, candy store, caterer, coffee shop, day care center (for ages less than 2), day care 119 | center (for ages 2 – 6), day care center (combo, for ages less than 2 and 2 – 6 120 | combined), gas station, Golden Diner, grocery store, hospital, long term care 121 | center(nursing home), liquor store, mobile food dispenser, restaurant, paleteria, school, 122 | shelter, tavern, social club, wholesaler, or Wrigley Field Rooftop" 123 | ([[https://data.cityofchicago.org/api/views/4ijn-s7e5/files/O9cwLJ4wvxQJ2MirxkNzAUCCMQiM31DMzRkckMsKlxc?download=true&filename=foodinspections_description.pdf][source]]). 124 | 125 | [fn:3] Examples include [[http://dsapp.uchicago.edu/projects/education/][Increasing High School Graduation Rates: Early 126 | Warnings and Predictive Systems]], [[http://dsapp.uchicago.edu/projects/public-safety/police-eis/][Building Data-Driven Early 127 | Intervention Systems for Police Officers]], and [[http://dsapp.uchicago.edu/projects/criminal-justice/data-driven-justice-initiative/][Data-Driven Justice 128 | Initiative: Identifying Frequent Users of Multiple Public Systems for 129 | More Effective Early Assistance]]. 130 | 131 | [fn:2] Examples include [[http://dsapp.uchicago.edu/projects/environment/][Predictive Enforcement 132 | of Hazardous Waste Regulations]] and [[http://dsapp.uchicago.edu/projects/health/lead-prevention/][Targeting Proactive Inspections for Lead Hazards]]. 133 | 134 | [fn:1] Several examples use this dataset, such as [[https://chicago.github.io/food-inspections-evaluation/][City of Chicago Food 135 | Inspection Forecasting]], [[https://youtu.be/lyDLAutA88s][PyCon 2016 keynote: Built in Super Heroes]], 136 | and [[https://youtu.be/1dKonIT-Yak][PyData 2016: Forecasting critical food violations at restaurants 137 | using open data]]. 
138 | -------------------------------------------------------------------------------- /org/07_quick_setup.org: -------------------------------------------------------------------------------- 1 | #+STARTUP: showeverything 2 | #+STARTUP: nohideblocks 3 | #+STARTUP: indent 4 | #+STARTUP: align 5 | #+STARTUP: inlineimages 6 | #+STARTUP: latexpreview 7 | #+PROPERTY: header-args:sql :engine postgresql 8 | #+PROPERTY: header-args:sql+ :dbhost 0.0.0.0 9 | #+PROPERTY: header-args:sql+ :dbport 5434 10 | #+PROPERTY: header-args:sql+ :dbuser food_user 11 | #+PROPERTY: header-args:sql+ :dbpassword some_password 12 | #+PROPERTY: header-args:sql+ :database food 13 | #+PROPERTY: header-args:sql+ :results table drawer 14 | #+PROPERTY: header-args:sql+ :exports both 15 | #+PROPERTY: header-args:sql+ :eval no-export 16 | #+PROPERTY: header-args:sql+ :cmdline -q 17 | #+PROPERTY: header-args:sh :results verbatim org 18 | #+PROPERTY: header-args:sh+ :prologue exec 2>&1 :epilogue : 19 | #+PROPERTY: header-args:ipython :session food_inspections 20 | #+PROPERTY: header-args:ipython+ :results raw drawer 21 | #+OPTIONS: broken-links:mark 22 | #+OPTIONS: tasks:todo 23 | #+OPTIONS: LaTeX:t 24 | 25 | * Appendix: For the impatient 26 | 27 | If you want to skip all the cleansing and transformation and dive 28 | directly into =triage= you can 29 | execute the following /inside bastion/: 30 | 31 | #+BEGIN_SRC sh :dir /docker:root@tutorial_bastion:/ 32 | curl "https://data.cityofchicago.org/api/views/4ijn-s7e5/rows.csv?accessType=DOWNLOAD" > data/inspections.csv 33 | 34 | psql ${DATABASE_URL} -c "\copy raw.inspections FROM '/data/inspections.csv' WITH HEADER CSV" 35 | 36 | psql ${DATABASE_URL} < /sql/create_cleaned_inspections_table.sql 37 | 38 | psql ${DATABASE_URL} < /sql/create_violations_table.sql 39 | 40 | psql ${DATABASE_URL} < /sql/create_semantic_tables.sql 41 | #+END_SRC 42 | 43 | #+RESULTS: 44 | #+BEGIN_SRC org 45 | COPY 168861 46 | CREATE SCHEMA 47 | NOTICE: table "inspections" does not
exist, skipping 48 | DROP TABLE 49 | SELECT 168046 50 | NOTICE: table "violations" does not exist, skipping 51 | DROP TABLE 52 | SELECT 632487 53 | CREATE SCHEMA 54 | NOTICE: table "entities" does not exist, skipping 55 | DROP TABLE 56 | SELECT 35360 57 | CREATE INDEX 58 | CREATE INDEX 59 | CREATE INDEX 60 | CREATE INDEX 61 | CREATE INDEX 62 | CREATE INDEX 63 | CREATE INDEX 64 | NOTICE: table "events" does not exist, skipping 65 | DROP TABLE 66 | SELECT 145123 67 | CREATE INDEX 68 | CREATE INDEX 69 | CREATE INDEX 70 | CREATE INDEX 71 | CREATE INDEX 72 | CREATE INDEX 73 | CREATE INDEX 74 | CREATE INDEX 75 | CREATE INDEX 76 | CREATE INDEX 77 | #+END_SRC 78 | 79 | 80 | If everything works, you should end with two new schemas: =cleaned= and =semantic=. 81 | 82 | You could check that (from =psql=) With 83 | #+BEGIN_SRC sql 84 | \dn 85 | #+END_SRC 86 | 87 | #+RESULTS: 88 | :RESULTS: 89 | | List of schemas | | 90 | |-----------------+----------| 91 | | Name | Owner | 92 | | cleaned | food_user | 93 | | postgis | food_user | 94 | | public | postgres | 95 | | raw | food_user | 96 | | semantic | food_user | 97 | :END: 98 | 99 | Now you can continue to the introduction to triage section. 
100 | -------------------------------------------------------------------------------- /org/100_whats_next.org: -------------------------------------------------------------------------------- 1 | #+STARTUP: showeverything 2 | #+STARTUP: nohideblocks 3 | #+STARTUP: indent 4 | #+STARTUP: align 5 | #+STARTUP: inlineimages 6 | #+STARTUP: latexpreview 7 | #+PROPERTY: header-args:sql :engine postgresql 8 | #+PROPERTY: header-args:sql+ :dbhost 0.0.0.0 9 | #+PROPERTY: header-args:sql+ :dbport 5434 10 | #+PROPERTY: header-args:sql+ :dbuser food_user 11 | #+PROPERTY: header-args:sql+ :dbpassword some_password 12 | #+PROPERTY: header-args:sql+ :database food 13 | #+PROPERTY: header-args:sql+ :results table drawer 14 | #+PROPERTY: header-args:sql+ :cmdline -q 15 | #+PROPERTY: header-args:sh :results verbatim org 16 | #+PROPERTY: header-args:sh+ :prologue exec 2>&1 :epilogue : 17 | #+PROPERTY: header-args:ipython :session Food_inspections 18 | #+PROPERTY: header-args:ipython+ :results raw drawer 19 | #+OPTIONS: broken-links:mark 20 | #+OPTIONS: tasks:todo 21 | 22 | * What's next? 23 | 24 | - Add the shape file 25 | https://data.cityofchicago.org/api/geospatial/gdcf-axmw?method=export&format=Shapefile 26 | and generate geospatial variables using =location= 27 | - Text analysis on the /violations/' =comments= column and generate 28 | new /outcomes/ or /features/? 29 | - Run some deduplication and have a better =semantic.entities=? 30 | - Routing based on the inspection list?
31 | - Add more data sources (Census, Schools, bus stops, ACS data, Yelp!): 32 | - [[https://data.cityofchicago.org/Community-Economic-Development/Business-Licenses/r5kz-chrr][Business Licenses]] 33 | - Food Inspections 34 | - [[https://data.cityofchicago.org/Public-Safety/Crimes-2001-to-present/ijzp-q8t2][Crime]] 35 | - Garbage Cart Complaints 36 | - [[https://data.cityofchicago.org/Service-Requests/311-Service-Requests-Sanitation-Code-Complaints/me59-5fac][Sanitation Complaints]] 37 | - Weather 38 | - Sanitarian Information 39 | -------------------------------------------------------------------------------- /org/audition: -------------------------------------------------------------------------------- 1 | ../triage/audition -------------------------------------------------------------------------------- /org/css/org-default.css: -------------------------------------------------------------------------------- 1 | .org-bold{font-weight:700}.org-bold-italic{font-weight:700;font-style:italic}.org-buffer-menu-buffer{font-weight:700}.org-builtin{color:#483d8b}.org-button{color:#3a5fcd;text-decoration:underline}.org-calendar-month-header{color:#00f}.org-calendar-today{text-decoration:underline}.org-calendar-weekday-header{color:#008b8b}.org-calendar-weekend-header{color:#b22222}.org-comint-highlight-input{font-weight:700}.org-comint-highlight-prompt{color:#0000cd}.org-comment,.org-comment-delimiter{color:#b22222}.org-constant{color:#008b8b}.org-diary{color:red}.org-doc{color:#8b2252}.org-error{color:red;font-weight:700}.org-escape-glyph{color:brown}.org-file-name-shadow{color:#7f7f7f}.org-fringe{background-color:#f2f2f2}.org-function-name{color:#00f}.org-glyphless-char{font-size:60%}.org-header-line{color:#333;background-color:#e5e5e5}.org-help-argument-name{font-style:italic}.org-highlight{background-color:#b4eeb4}.org-holiday{background-color:pink}.org-info-header-node{color:brown;font-weight:700;font-style:italic}.org-info-header-xref{color:#3a5fcd;text-decoration:un
derline}.org-info-index-match{background-color:#ff0}.org-info-menu-header{font-weight:700}.org-info-menu-star{color:red}.org-info-node{color:brown;font-weight:700;font-style:italic}.org-info-title-1{font-size:172%;font-weight:700}.org-info-title-2{font-size:144%;font-weight:700}.org-info-title-3{font-size:120%;font-weight:700}.org-info-title-4{font-weight:700}.org-info-xref{color:#3a5fcd;text-decoration:underline}.org-italic{font-style:italic}.org-keyword{color:#a020f0}.org-lazy-highlight{background-color:#afeeee}.org-link{color:#3a5fcd;text-decoration:underline}.org-link-visited{color:#8b008b;text-decoration:underline}.org-makefile-makepp-perl{background-color:#bfefff}.org-makefile-space{background-color:#ff69b4}.org-makefile-targets{color:#00f}.org-match{background-color:#ff0}.org-next-error{background-color:gtk_selection_bg_color}.org-nobreak-space{color:brown;text-decoration:underline}.org-org-agenda-calendar-event,.org-org-agenda-calendar-sexp{color:#000;background-color:#fff}.org-org-agenda-clocking{background-color:#ff0}.org-org-agenda-column-dateline{background-color:#e5e5e5}.org-org-agenda-current-time{color:#b8860b}.org-org-agenda-date{color:#00f}.org-org-agenda-date-today{color:#00f;font-weight:700;font-style:italic}.org-org-agenda-date-weekend{color:#00f;font-weight:700}.org-org-agenda-diary{color:#000;background-color:#fff}.org-org-agenda-dimmed-todo{color:#7f7f7f}.org-org-agenda-done{color:#228b22}.org-org-agenda-filter-category,.org-org-agenda-filter-effort,.org-org-agenda-filter-regexp,.org-org-agenda-filter-tags{color:#000;background-color:#bfbfbf}.org-org-agenda-restriction-lock{background-color:#eee}.org-org-agenda-structure{color:#00f}.org-org-archived,.org-org-block{color:#7f7f7f}.org-org-block-begin-line,.org-org-block-end-line{color:#b22222}.org-org-checkbox{font-weight:700}.org-org-checkbox-statistics-done{color:#228b22;font-weight:700}.org-org-checkbox-statistics-todo{color:red;font-weight:700}.org-org-clock-overlay{color:#000;background-col
or:#d3d3d3}.org-org-code{color:#7f7f7f}.org-org-column,.org-org-column-title{background-color:#e5e5e5}.org-org-column-title{font-weight:700;text-decoration:underline}.org-org-date{color:#a020f0;text-decoration:underline}.org-org-date-selected{color:red}.org-org-default{color:#000;background-color:#fff}.org-org-document-info{color:#191970}.org-org-document-info-keyword{color:#7f7f7f}.org-org-document-title{color:#191970;font-weight:700}.org-org-done{color:#228b22;font-weight:700}.org-org-drawer{color:#00f}.org-org-ellipsis{color:#b8860b;text-decoration:underline}.org-org-footnote{color:#a020f0;text-decoration:underline}.org-org-formula{color:#b22222}.org-org-headline-done{color:#bc8f8f}.org-org-hide{color:#fff}.org-org-latex-and-related{color:#8b4513}.org-org-level-1{color:#00f}.org-org-level-2{color:sienna}.org-org-level-3{color:#a020f0}.org-org-level-4{color:#b22222}.org-org-level-5{color:#228b22}.org-org-level-6{color:#008b8b}.org-org-level-7{color:#483d8b}.org-org-level-8{color:#8b2252}.org-org-link{color:#3a5fcd;text-decoration:underline}.org-org-list-dt{font-weight:700}.org-org-macro{color:#8b4513}.org-org-meta-line{color:#b22222}.org-org-mode-line-clock{color:#000;background-color:#bfbfbf}.org-org-mode-line-clock-overrun{color:#000;background-color:red}.org-org-priority{color:#a020f0}.org-org-quote{color:#7f7f7f}.org-org-scheduled{color:#006400}.org-org-scheduled-previously{color:#b22222}.org-org-scheduled-today{color:#006400}.org-org-sexp-date,.org-org-special-keyword{color:#a020f0}.org-org-table{color:#00f}.org-org-tag,.org-org-tag-group{font-weight:700}.org-org-target{text-decoration:underline}.org-org-time-grid{color:#b8860b}.org-org-todo{color:red;font-weight:700}.org-org-upcoming-deadline{color:#b22222}.org-org-verbatim,.org-org-verse{color:#7f7f7f}.org-org-warning{color:red;font-weight:700}.org-outline-1{color:#00f}.org-outline-2{color:sienna}.org-outline-3{color:#a020f0}.org-outline-4{color:#b22222}.org-outline-5{color:#228b22}.org-outline-6{color:#008
b8b}.org-outline-7{color:#483d8b}.org-outline-8{color:#8b2252}.org-preprocessor{color:#483d8b}.org-regexp-grouping-backslash,.org-regexp-grouping-construct{font-weight:700}.org-region{background-color:gtk_selection_bg_color}.org-secondary-selection{background-color:#ff0}.org-shadow{color:#7f7f7f}.org-show-paren-match{background-color:#40e0d0}.org-show-paren-mismatch{color:#fff;background-color:#a020f0}.org-string{color:#8b2252}.org-success{color:#228b22;font-weight:700}.org-table-cell{color:#e5e5e5;background-color:#00f}.org-tooltip{color:#000;background-color:#ffffe0}.org-trailing-whitespace{background-color:red}.org-type{color:#228b22}.org-underline{text-decoration:underline}.org-variable-name{color:sienna}.org-warning{color:#ff8c00;font-weight:700}.org-warning-1{color:red;font-weight:700}.title{margin-bottom:.2em}.subtitle,.title{text-align:center}.subtitle{font-size:medium;font-weight:700;margin-top:0}.todo{color:red}.done,.todo{font-family:monospace}.done{color:green}.priority{color:orange}.priority,.tag{font-family:monospace}.tag{background-color:#eee;font-size:80%;font-weight:400;padding:2px}.timestamp{color:#bebebe}.timestamp-kwd{color:#5f9ea0}.org-right{margin-left:auto;margin-right:0;text-align:right}.org-left{margin-left:0;margin-right:auto;text-align:left}.org-center{margin-left:auto;margin-right:auto;text-align:center}.underline{text-decoration:underline}#postamble p,#preamble p{font-size:90%;margin:.2em}p.verse{margin-left:3%}pre{border:1px solid #ccc;box-shadow:3px 3px 3px #eee;font-family:monospace;margin:1.2em;overflow:auto;padding:8pt}pre.src{overflow:visible;padding-top:1.2em;position:relative}pre.src:before{background-color:#fff;border:1px solid #000;display:none;padding:3px;position:absolute;right:10px;top:-10px}pre.src:hover:before{display:inline}pre.src-bash:before,pre.src-sh:before{content:"sh"}pre.src-emacs-lisp:before{content:"Emacs 
Lisp"}pre.src-R:before{content:"R"}pre.src-perl:before{content:"Perl"}pre.src-java:before{content:"Java"}pre.src-sql:before{content:"SQL"}table{border-collapse:collapse}caption.t-above{caption-side:top}caption.t-bottom{caption-side:bottom}td,th{vertical-align:top}th.org-center,th.org-left,th.org-right{text-align:center}td.org-right{text-align:right}td.org-left{text-align:left}td.org-center{text-align:center}dt{font-weight:700}.footpara{display:inline}.footdef{margin-bottom:1em}.figure{padding:1em}.figure p{text-align:center}.inlinetask{background:#ffc;border:2px solid gray;margin:10px;padding:10px}#org-div-home-and-up{font-size:70%;text-align:right;white-space:nowrap}textarea{overflow-x:auto}.linenr{font-size:smaller}.code-highlighted{background-color:#ff0}.org-info-js_info-navigation{border-style:none}#org-info-js_console-label{font-size:10px;font-weight:700;white-space:nowrap}.org-info-js_search-highlight{background-color:#ff0;color:#000;font-weight:700} 2 | -------------------------------------------------------------------------------- /org/docker-kernel-connection.json: -------------------------------------------------------------------------------- 1 | { 2 | "shell_port": 56409, 3 | "iopub_port": 56408, 4 | "stdin_port": 56410, 5 | "control_port": 56406, 6 | "hb_port": 56407, 7 | "ip": "0.0.0.0", 8 | "key": "c2e3bb2a-f80c7b34d4fe02d7e5be87d9", 9 | "transport": "tcp", 10 | "signature_scheme": "hmac-sha256", 11 | "kernel_name": "" 12 | } -------------------------------------------------------------------------------- /org/images/AWS_Batch_Architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/AWS_Batch_Architecture.png -------------------------------------------------------------------------------- /org/images/data_road.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/data_road.png -------------------------------------------------------------------------------- /org/images/eis.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/eis.png -------------------------------------------------------------------------------- /org/images/eis_jaccard_on_lists_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/eis_jaccard_on_lists_over_time.png -------------------------------------------------------------------------------- /org/images/eis_mg_prec_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/eis_mg_prec_over_time.png -------------------------------------------------------------------------------- /org/images/eis_mg_recall_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/eis_mg_recall_over_time.png -------------------------------------------------------------------------------- /org/images/eis_model_group_64_feature_group_importances.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/eis_model_group_64_feature_group_importances.png -------------------------------------------------------------------------------- /org/images/eis_model_group_64_feature_importances.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/eis_model_group_64_feature_importances.png -------------------------------------------------------------------------------- /org/images/eis_model_group_64_rayid_curve.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/eis_model_group_64_rayid_curve.png -------------------------------------------------------------------------------- /org/images/facilities_inspected_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/facilities_inspected_over_time.png -------------------------------------------------------------------------------- /org/images/facilities_with_failed_inspections_severe_violations_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/facilities_with_failed_inspections_severe_violations_over_time.png -------------------------------------------------------------------------------- /org/images/facilities_with_inspections_failed_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/facilities_with_inspections_failed_over_time.png -------------------------------------------------------------------------------- /org/images/failed_inspections_over_time.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/failed_inspections_over_time.png -------------------------------------------------------------------------------- /org/images/failed_inspections_severe_violations_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/failed_inspections_severe_violations_over_time.png -------------------------------------------------------------------------------- /org/images/inspection_jaccard_on_lists_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/inspection_jaccard_on_lists_over_time.png -------------------------------------------------------------------------------- /org/images/inspection_mg_prec_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/inspection_mg_prec_over_time.png -------------------------------------------------------------------------------- /org/images/inspection_mg_recall_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/inspection_mg_recall_over_time.png -------------------------------------------------------------------------------- /org/images/inspection_model_group_11_feature_group_importances.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/inspection_model_group_11_feature_group_importances.png 
-------------------------------------------------------------------------------- /org/images/inspection_model_group_11_feature_importances.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/inspection_model_group_11_feature_importances.png -------------------------------------------------------------------------------- /org/images/inspection_model_group_11_rayid_curve.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/inspection_model_group_11_rayid_curve.png -------------------------------------------------------------------------------- /org/images/inspections.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/inspections.png -------------------------------------------------------------------------------- /org/images/inspections_dt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/inspections_dt.png -------------------------------------------------------------------------------- /org/images/inspections_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/inspections_over_time.png -------------------------------------------------------------------------------- /org/images/model_7_tree_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/model_7_tree_0.png 
-------------------------------------------------------------------------------- /org/images/outcomes-eis.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/outcomes-eis.png -------------------------------------------------------------------------------- /org/images/outcomes-inspections.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/outcomes-inspections.png -------------------------------------------------------------------------------- /org/images/rolling-origin.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/rolling-origin.png -------------------------------------------------------------------------------- /org/images/sanjose-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/sanjose-2.png -------------------------------------------------------------------------------- /org/images/simple_test_skeleton.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/simple_test_skeleton.png -------------------------------------------------------------------------------- /org/images/timechop.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/timechop.png -------------------------------------------------------------------------------- /org/images/timechop_1.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/timechop_1.png -------------------------------------------------------------------------------- /org/images/timechop_10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/timechop_10.png -------------------------------------------------------------------------------- /org/images/timechop_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/timechop_2.png -------------------------------------------------------------------------------- /org/images/timechop_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/timechop_3.png -------------------------------------------------------------------------------- /org/images/timechop_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/timechop_4.png -------------------------------------------------------------------------------- /org/images/timechop_5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/timechop_5.png -------------------------------------------------------------------------------- /org/images/timechop_6.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/timechop_6.png -------------------------------------------------------------------------------- /org/images/timechop_7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/timechop_7.png -------------------------------------------------------------------------------- /org/images/timechop_8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/timechop_8.png -------------------------------------------------------------------------------- /org/images/timechop_9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/timechop_9.png -------------------------------------------------------------------------------- /org/images/timechop_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/timechop_example.png -------------------------------------------------------------------------------- /org/images/timechop_inspections_test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/timechop_inspections_test.png -------------------------------------------------------------------------------- /org/images/timechop_withoutblocks.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/timechop_withoutblocks.png -------------------------------------------------------------------------------- /org/images/timechop_withoutrows.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/timechop_withoutrows.png -------------------------------------------------------------------------------- /org/index.org: -------------------------------------------------------------------------------- 1 | # -*- mode: org; -*- 2 | 3 | #+TITLE: Dirty Duck: A Guided Tour of Triage 4 | #+DESCRIPTION: 5 | #+AUTHOR: Center of Data Science for Public Policy 6 | #+EMAIL: adolfo@uchicago.edu 7 | #+STARTUP: showeverything 8 | #+STARTUP: nohideblocks 9 | #+STARTUP: indent 10 | #+STARTUP: align 11 | #+STARTUP: inlineimages 12 | #+STARTUP: latexpreview 13 | #+PROPERTY: header-args :cache yes 14 | #+PROPERTY: header-args :eval never-export 15 | #+PROPERTY: header-args:sql :engine postgresql 16 | #+PROPERTY: header-args:sql+ :dbhost 0.0.0.0 17 | #+PROPERTY: header-args:sql+ :dbport 5434 18 | #+PROPERTY: header-args:sql+ :dbuser food_user 19 | #+PROPERTY: header-args:sql+ :dbpassword some_password 20 | #+PROPERTY: header-args:sql+ :database food 21 | #+PROPERTY: header-args:sql+ :results table drawer 22 | #+PROPERTY: header-args:sql+ :exports both 23 | #+PROPERTY: header-args:sql+ :eval no-export 24 | #+PROPERTY: header-args:sql+ :cmdline -q 25 | #+PROPERTY: header-args:sh :results verbatim org 26 | #+PROPERTY: header-args:sh+ :prologue exec 2>&1 :epilogue : 27 | #+PROPERTY: header-args:ipython :session food_inspections 28 | #+PROPERTY: header-args:ipython+ :results raw drawer 29 | #+PROPERTY: header-args:ipython+ :eval no-export 30 | #+OPTIONS: broken-links:mark 31 | #+OPTIONS: tasks:todo 32 | #+OPTIONS: LaTeX:t 33 | #+SETUPFILE: tutorial.setup 34 | 35 | 
#+INCLUDE: 00_instructions.org :minlevel 1 36 | #+INCLUDE: 01_intro.org :minlevel 1 37 | #+INCLUDE: 02_infrastructure.org :minlevel 1 38 | #+INCLUDE: 03_data_preparation.org :minlevel 1 39 | #+INCLUDE: 04_triage_intro.org :minlevel 1 40 | #+INCLUDE: 05_inspections.org :minlevel 1 41 | #+INCLUDE: 06_eis.org :minlevel 1 42 | #+INCLUDE: 09_aws_batch.org :minlevel 1 43 | #+INCLUDE: 100_whats_next.org :minlevel 1 44 | #+INCLUDE: 07_quick_setup.org :minlevel 1 45 | 46 | #+EXPORT_EXCLUDE_TAGS: noexport 47 | -------------------------------------------------------------------------------- /org/js/jquery.stickytableheaders.min.js: -------------------------------------------------------------------------------- 1 | !function(a,b){"use strict";function c(c,g){var h=this;h.$el=a(c),h.el=c,h.id=e++,h.$window=a(b),h.$document=a(document),h.$el.bind("destroyed",a.proxy(h.teardown,h)),h.$clonedHeader=null,h.$originalHeader=null,h.isSticky=!1,h.hasBeenSticky=!1,h.leftOffset=null,h.topOffset=null,h.init=function(){h.$el.each(function(){var 
b=a(this);b.css("padding",0),h.$originalHeader=a("thead:first",this),h.$clonedHeader=h.$originalHeader.clone(),b.trigger("clonedHeader."+d,[h.$clonedHeader]),h.$clonedHeader.addClass("tableFloatingHeader"),h.$clonedHeader.css("display","none"),h.$originalHeader.addClass("tableFloatingHeaderOriginal"),h.$originalHeader.after(h.$clonedHeader),h.$printStyle=a(''),a("head").append(h.$printStyle)}),h.setOptions(g),h.updateWidth(),h.toggleHeaders(),h.bind()},h.destroy=function(){h.$el.unbind("destroyed",h.teardown),h.teardown()},h.teardown=function(){h.isSticky&&h.$originalHeader.css("position","static"),a.removeData(h.el,"plugin_"+d),h.unbind(),h.$clonedHeader.remove(),h.$originalHeader.removeClass("tableFloatingHeaderOriginal"),h.$originalHeader.css("visibility","visible"),h.$printStyle.remove(),h.el=null,h.$el=null},h.bind=function(){h.$scrollableArea.on("scroll."+d,h.toggleHeaders),h.isWindowScrolling||(h.$window.on("scroll."+d+h.id,h.setPositionValues),h.$window.on("resize."+d+h.id,h.toggleHeaders)),h.$scrollableArea.on("resize."+d,h.toggleHeaders),h.$scrollableArea.on("resize."+d,h.updateWidth)},h.unbind=function(){h.$scrollableArea.off("."+d,h.toggleHeaders),h.isWindowScrolling||(h.$window.off("."+d+h.id,h.setPositionValues),h.$window.off("."+d+h.id,h.toggleHeaders)),h.$scrollableArea.off("."+d,h.updateWidth)},h.toggleHeaders=function(){h.$el&&h.$el.each(function(){var 
b,c=a(this),d=h.isWindowScrolling?isNaN(h.options.fixedOffset)?h.options.fixedOffset.outerHeight():h.options.fixedOffset:h.$scrollableArea.offset().top+(isNaN(h.options.fixedOffset)?0:h.options.fixedOffset),e=c.offset(),f=h.$scrollableArea.scrollTop()+d,g=h.$scrollableArea.scrollLeft(),i=h.isWindowScrolling?f>e.top:d>e.top,j=(h.isWindowScrolling?f:0)a||a+h.$window.height()>h.$document.height()||0>b||b+h.$window.width()>h.$document.width()||h.$originalHeader.css({top:h.topOffset-(h.isWindowScrolling?0:a),left:h.leftOffset-(h.isWindowScrolling?0:b)})},h.updateWidth=function(){if(h.isSticky){h.$originalHeaderCells||(h.$originalHeaderCells=a("th,td",h.$originalHeader)),h.$clonedHeaderCells||(h.$clonedHeaderCells=a("th,td",h.$clonedHeader));var b=h.getWidth(h.$clonedHeaderCells);h.setWidth(b,h.$clonedHeaderCells,h.$originalHeaderCells),h.$originalHeader.css("width",h.$clonedHeader.width())}},h.getWidth=function(c){var d=[];return c.each(function(c){var e,f=a(this);if("border-box"===f.css("box-sizing"))e=f[0].getBoundingClientRect().width;else{var g=a("th",h.$originalHeader);if("collapse"===g.css("border-collapse"))if(b.getComputedStyle)e=parseFloat(b.getComputedStyle(this,null).width);else{var i=parseFloat(f.css("padding-left")),j=parseFloat(f.css("padding-right")),k=parseFloat(f.css("border-width"));e=f.outerWidth()-i-j-k}else e=f.width()}d[c]=e}),d},h.setWidth=function(a,b,c){b.each(function(b){var d=a[b];c.eq(b).css({"min-width":d,"max-width":d})})},h.resetWidth=function(b,c){b.each(function(b){var d=a(this);c.eq(b).css({"min-width":d.css("min-width"),"max-width":d.css("max-width")})})},h.setOptions=function(c){h.options=a.extend({},f,c),h.$scrollableArea=a(h.options.scrollableArea),h.isWindowScrolling=h.$scrollableArea[0]===b},h.updateOptions=function(a){h.setOptions(a),h.unbind(),h.bind(),h.updateWidth(),h.toggleHeaders()},h.init()}var d="stickyTableHeaders",e=0,f={fixedOffset:0,leftOffset:0,marginTop:0,scrollableArea:b};a.fn[d]=function(b){return 
this.each(function(){var e=a.data(this,"plugin_"+d);e?"string"==typeof b?e[b].apply(e):e.updateOptions(b):"destroy"!==b&&a.data(this,"plugin_"+d,new c(this,b))})}}(jQuery,window); -------------------------------------------------------------------------------- /org/js/readtheorg.js: -------------------------------------------------------------------------------- 1 | 2 | $(function() { 3 | $('.note').before("<p class='admonition-title note'>Note</p>"); 4 | $('.seealso').before("<p class='admonition-title seealso'>See also</p>"); 5 | $('.warning').before("<p class='admonition-title warning'>Warning</p>"); 6 | $('.caution').before("<p class='admonition-title caution'>Caution</p>"); 7 | $('.attention').before("<p class='admonition-title attention'>Attention</p>"); 8 | $('.tip').before("<p class='admonition-title tip'>Tip</p>"); 9 | $('.important').before("<p class='admonition-title important'>Important</p>"); 10 | $('.hint').before("<p class='admonition-title hint'>Hint</p>"); 11 | $('.error').before("<p class='admonition-title error'>Error</p>"); 12 | $('.danger').before("<p class='admonition-title danger'>Danger</p>"); 13 | }); 14 | 15 | $( document ).ready(function() { 16 | 17 | // Shift nav in mobile when clicking the menu. 18 | $(document).on('click', "[data-toggle='wy-nav-top']", function() { 19 | $("[data-toggle='wy-nav-shift']").toggleClass("shift"); 20 | $("[data-toggle='rst-versions']").toggleClass("shift"); 21 | }); 22 | // Close menu when you click a link. 23 | $(document).on('click', ".wy-menu-vertical .current ul li a", function() { 24 | $("[data-toggle='wy-nav-shift']").removeClass("shift"); 25 | $("[data-toggle='rst-versions']").toggleClass("shift"); 26 | }); 27 | $(document).on('click', "[data-toggle='rst-current-version']", function() { 28 | $("[data-toggle='rst-versions']").toggleClass("shift-up"); 29 | }); 30 | // Make tables responsive 31 | $("table.docutils:not(.field-list)").wrap("<div class='wy-table-responsive'></div>");
32 | }); 33 | 34 | $( document ).ready(function() { 35 | $('#text-table-of-contents ul').first().addClass('nav'); 36 | // ScrollSpy also requires that we use 37 | // a Bootstrap nav component. 38 | $('body').scrollspy({target: '#text-table-of-contents'}); 39 | 40 | // add sticky table headers 41 | $('table').stickyTableHeaders(); 42 | 43 | // set the height of tableOfContents 44 | var $postamble = $('#postamble'); 45 | var $tableOfContents = $('#table-of-contents'); 46 | $tableOfContents.css({paddingBottom: $postamble.outerHeight()}); 47 | 48 | // add TOC button 49 | var toggleSidebar = $(''); 50 | $('#content').prepend(toggleSidebar); 51 | 52 | // add a close button for when the sidebar is shown on mobile screens 53 | var closeBtn = $('Close'); 54 | var tocTitle = $('#table-of-contents').find('h2'); 55 | tocTitle.append(closeBtn); 56 | }); 57 | 58 | window.SphinxRtdTheme = (function (jquery) { 59 | var stickyNav = (function () { 60 | var navBar, 61 | win, 62 | stickyNavCssClass = 'stickynav', 63 | applyStickNav = function () { 64 | if (navBar.height() <= win.height()) { 65 | navBar.addClass(stickyNavCssClass); 66 | } else { 67 | navBar.removeClass(stickyNavCssClass); 68 | } 69 | }, 70 | enable = function () { 71 | applyStickNav(); 72 | win.on('resize', applyStickNav); 73 | }, 74 | init = function () { 75 | navBar = jquery('nav.wy-nav-side:first'); 76 | win = jquery(window); 77 | }; 78 | jquery(init); 79 | return { 80 | enable : enable 81 | }; 82 | }()); 83 | return { 84 | StickyNav : stickyNav 85 | }; 86 | }($)); 87 | -------------------------------------------------------------------------------- /org/js/stickytableheaders-license.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2011 Jonas Mosbech 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining 4 | a copy of this software and associated documentation files (the 5 | "Software"), to deal in the Software without restriction, including 6 |
without limitation the rights to use, copy, modify, merge, publish, 7 | distribute, sublicense, and/or sell copies of the Software, and to 8 | permit persons to whom the Software is furnished to do so, subject to 9 | the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be 12 | included in all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 17 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /org/publish.el: -------------------------------------------------------------------------------- 1 | (require 'package) 2 | 3 | (when (<= emacs-major-version 27) 4 | (package-initialize) ;; Not needed in Emacs 27 5 | ; Disable loading package again after init.el 6 | ) 7 | 8 | 9 | (unless (package-installed-p 'use-package) 10 | (package-refresh-contents) 11 | (package-install 'use-package)) 12 | 13 | (eval-when-compile 14 | (require 'use-package)) 15 | 16 | (use-package htmlize 17 | :defer t 18 | ) 19 | 20 | (require 'ox-publish) 21 | (setq org-publish-project-alist 22 | '( 23 | 24 | ("dirtyduck-notes" 25 | :base-directory "~/projects/dsapp/dirtyduck/org/" 26 | :base-extension "org" 27 | :exclude "[[:digit:]][[:digit:]]_.*\\.org" 28 | :publishing-directory "~/projects/dsapp/dirtyduck/docs/" 29 | :recursive t 30 | :publishing-function org-html-publish-to-html 31 | :headline-levels 4 ; Just the default for this project.
32 | :auto-preamble t 33 | :sitemap-title "Dirtyduck" 34 | ) 35 | 36 | ("dirtyduck-notes-md" 37 | :base-directory "~/projects/dsapp/dirtyduck/org/" 38 | :base-extension "org" 39 | :exclude "[[:digit:]][[:digit:]]_.*\\.org" 40 | :publishing-directory "~/projects/dsapp/dirtyduck/docs/" 41 | :recursive t 42 | :publishing-function org-gfm-export-to-markdown 43 | :headline-levels 4 ; Just the default for this project. 44 | :auto-preamble t 45 | :sitemap-title "Dirtyduck" 46 | ) 47 | 48 | ("dirtyduck-static" 49 | :base-directory "~/projects/dsapp/dirtyduck/org/" 50 | :base-extension "css\\|js\\|png\\|jpg\\|gif\\|pdf\\|mp3\\|ogg\\|swf\\|sql\\|svg\\|yaml" 51 | :publishing-directory "~/projects/dsapp/dirtyduck/docs/" 52 | :recursive t 53 | :publishing-function org-publish-attachment 54 | ) 55 | 56 | 57 | ("dirtyduck" :components ("dirtyduck-static" "dirtyduck-notes" "dirtyduck-notes-md")) 58 | 59 | )) 60 | -------------------------------------------------------------------------------- /org/ref.bib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/ref.bib -------------------------------------------------------------------------------- /org/sql/create_cleaned_inspections_table.sql: -------------------------------------------------------------------------------- 1 | create schema if not exists cleaned; 2 | 3 | drop table if exists cleaned.inspections cascade; 4 | 5 | create table cleaned.inspections as ( 6 | with cleaned as ( 7 | select 8 | inspection::integer, 9 | btrim(lower(results)) as result, 10 | license_num::integer, 11 | btrim(lower(dba_name)) as facility, 12 | btrim(lower(aka_name)) as facility_aka, 13 | case when 14 | facility_type is null then 'unknown' 15 | else btrim(lower(facility_type)) 16 | end as facility_type, 17 | lower(substring(risk from '\((.+)\)')) as risk, 18 | btrim(lower(address)) as address, 19 | zip as zip_code, 20 |
substring( 21 | btrim(lower(regexp_replace(type, 'liquor', 'task force', 'gi'))) 22 | from 'canvass|task force|complaint|food poisoning|consultation|license|tag removal') as type, 23 | date, 24 | -- point(longitude, latitude) as location 25 | ST_SetSRID(ST_MakePoint(longitude, latitude), 4326)::geography as location -- We use geography so the measurements are in meters 26 | from raw.inspections 27 | where zip is not null -- removing NULL zip codes 28 | ) 29 | 30 | select * from cleaned where type is not null 31 | ); 32 | -------------------------------------------------------------------------------- /org/sql/create_semantic_tables.sql: -------------------------------------------------------------------------------- 1 | create schema if not exists semantic; 2 | 3 | drop table if exists semantic.entities cascade; 4 | 5 | create table semantic.entities as ( 6 | with entities as ( 7 | select 8 | distinct on ( 9 | license_num, 10 | facility, 11 | facility_aka, 12 | facility_type, 13 | address 14 | ) 15 | license_num, 16 | facility, 17 | facility_aka, 18 | facility_type, 19 | address, 20 | zip_code, 21 | location, 22 | min(date) over (partition by license_num, facility, facility_aka, facility_type, address) as start_time, 23 | max(case when result in ('out of business', 'business not located') 24 | then date 25 | else NULL 26 | end) 27 | over (partition by license_num, facility, facility_aka, address) as end_time 28 | from cleaned.inspections 29 | order by 30 | license_num, facility, facility_aka, facility_type, address, 31 | date asc -- IMPORTANT!! 
32 | ) 33 | 34 | select 35 | row_number() over (order by start_time asc ) as entity_id, 36 | license_num, 37 | facility, 38 | facility_aka, 39 | facility_type, 40 | address, 41 | zip_code, 42 | location, 43 | start_time, 44 | end_time, 45 | daterange(start_time, end_time) as activity_period 46 | from entities 47 | ); 48 | 49 | create index entities_ix on semantic.entities (entity_id); 50 | create index entities_license_num_ix on semantic.entities (license_num); 51 | create index entities_facility_ix on semantic.entities (facility); 52 | create index entities_facility_type_ix on semantic.entities (facility_type); 53 | create index entities_zip_code_ix on semantic.entities (zip_code); 54 | 55 | -- Spatial index 56 | create index entities_location_gix on semantic.entities using gist (location); 57 | 58 | create index entities_full_key_ix on semantic.entities (license_num, facility, facility_aka, facility_type, address); 59 | 60 | drop table if exists semantic.events cascade; 61 | 62 | create table semantic.events as ( 63 | 64 | with entities as ( 65 | select * from semantic.entities 66 | ), 67 | 68 | inspections as ( 69 | select 70 | i.inspection, i.type, i.date, i.risk, i.result, 71 | i.license_num, i.facility, i.facility_aka, 72 | i.facility_type, i.address, i.zip_code, i.location, 73 | jsonb_agg( 74 | jsonb_build_object( 75 | 'code', v.code, 76 | 'severity', v.severity, 77 | 'description', v.description, 78 | 'comment', v.comment 79 | ) 80 | order by code 81 | ) as violations 82 | from 83 | cleaned.inspections as i 84 | inner join 85 | cleaned.violations as v 86 | on i.inspection = v.inspection 87 | group by 88 | i.inspection, i.type, i.license_num, i.facility, 89 | i.facility_aka, i.facility_type, i.address, i.zip_code, i.location, 90 | i.date, i.risk, i.result 91 | ) 92 | 93 | select 94 | i.inspection as event_id, 95 | e.entity_id, i.type, i.date, i.risk, i.result, 96 | e.facility_type, e.zip_code, e.location, 97 | i.violations 98 | from 99 | entities as e 100 | 
inner join 101 | inspections as i 102 | using (license_num, facility, facility_aka, facility_type, address, zip_code) 103 | ); 104 | 105 | -- Add some indices 106 | create index events_entity_ix on semantic.events (entity_id asc nulls last); 107 | create index events_event_ix on semantic.events (event_id asc nulls last); 108 | create index events_type_ix on semantic.events (type); 109 | create index events_date_ix on semantic.events(date asc nulls last); 110 | create index events_facility_type_ix on semantic.events (facility_type); 111 | create index events_zip_code_ix on semantic.events (zip_code); 112 | 113 | -- Spatial index 114 | create index events_location_gix on semantic.events using gist (location); 115 | 116 | -- JSONB indices 117 | create index events_violations on semantic.events using gin(violations); 118 | create index events_violations_json_path on semantic.events using gin(violations jsonb_path_ops); 119 | 120 | create index events_event_entity_zip_code_date on semantic.events (event_id asc nulls last, entity_id asc nulls last, zip_code, date desc nulls last); 121 | -------------------------------------------------------------------------------- /org/sql/create_violations_table.sql: -------------------------------------------------------------------------------- 1 | drop table if exists cleaned.violations cascade; 2 | 3 | create table cleaned.violations as ( 4 | select 5 | inspection::integer, 6 | license_num::integer, 7 | date::date, 8 | btrim(tuple[1]) as code, 9 | btrim(tuple[2]) as description, 10 | btrim(tuple[3]) as comment, 11 | (case 12 | when btrim(tuple[1]) = '' then NULL 13 | when btrim(tuple[1])::int between 1 and 14 then 'critical' -- From the documentation 14 | when btrim(tuple[1])::int between 15 and 29 then 'serious' 15 | else 'minor' 16 | end 17 | ) as severity from 18 | ( 19 | select 20 | inspection, 21 | license_num, 22 | date, 23 | regexp_split_to_array( -- Create an array we will split the code, description, comment 24 | 
regexp_split_to_table( -- Create one row per violation by splitting on | 25 | coalesce( -- If there isn't a violation add '- Comments:' 26 | regexp_replace(violations, '[\n\r]+', '', 'g' ) -- Remove line breaks 27 | , '- Comments:') 28 | , '\|') -- Split the violations 29 | , '(?<=\d+)\.\s*|\s*-\s*Comments:') -- Split each violation into three parts: code, description, comment 30 | as tuple 31 | from raw.inspections 32 | where results in ('Fail', 'Pass', 'Pass w/ Conditions') and license_num is not null 33 | ) as t 34 | ); 35 | -------------------------------------------------------------------------------- /org/triage/experiments: -------------------------------------------------------------------------------- 1 | ../../triage/experiments -------------------------------------------------------------------------------- /org/triage/images: -------------------------------------------------------------------------------- 1 | ../../triage/images -------------------------------------------------------------------------------- /org/tutorial.setup: -------------------------------------------------------------------------------- 1 | # -*- mode: org; -*- 2 | 3 | #+LANGUAGE: en 4 | # +OPTIONS: toc:nil h:4 html-postamble:nil html-preamble:t tex:t f:t 5 | # +INFOJS_OPT: view:info toc:t ltoc:f mouse:underline buttons:0 path:http://thomasf.github.io/solarized-css/org-info.min.js 6 | # +HTML_HEAD: 7 | 8 | #+HTML_HEAD: 9 | #+HTML_HEAD: 10 | 11 | #+HTML_HEAD: 12 | #+HTML_HEAD: 13 | #+HTML_HEAD: 14 | #+HTML_HEAD: 15 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | Sphinx 2 | sphinx_rtd_theme 3 | coverage 4 | flake8 5 | mkdocs 6 | tox 7 | tox-pyenv 8 | nose 9 | mock 10 | colorama 11 | ipython 12 | jupyter 13 | httpie 14 | psycopg2-binary 15 | -------------------------------------------------------------------------------- /requirements.txt: 
-------------------------------------------------------------------------------- 1 | ## DSaPP stuff 2 | git+https://github.com/dssg/triage.git 3 | -------------------------------------------------------------------------------- /scratch.org: -------------------------------------------------------------------------------- 1 | 2 | 3 | * Temp stuff 4 | 5 | 6 | Before doing that, let's check how many different =dba_name= we have. 7 | 8 | #+BEGIN_SRC sql :results table drawer 9 | select 10 | count(distinct dba_name) as different_names 11 | from inspections; 12 | #+END_SRC 13 | 14 | #+RESULTS: 15 | :RESULTS: 16 | | different_names | 17 | |----------------| 18 | | 25107 | 19 | :END: 20 | 21 | #+BEGIN_SRC sql :results table drawer 22 | select 23 | dba_name, 24 | btrim(upper(regexp_replace(replace(dba_name, '''', ''), '[^a-zA-Z0-9 ]', '', 'g'))) as cleaned_name 25 | from inspections 26 | limit 30 27 | #+END_SRC 28 | 29 | #+RESULTS: 30 | :RESULTS: 31 | | dba_name | cleaned_name | 32 | |----------------------------------------------+---------------------------------------------| 33 | | D AND Y GROCERY | D AND Y GROCERY | 34 | | ONE STOP FOOD MARKET | ONE STOP FOOD MARKET | 35 | | CITGO | CITGO | 36 | | KHAN DOLLAR STATION | KHAN DOLLAR STATION | 37 | | FOSTER & BROADWAY BP/AUTOTECH | FOSTER BROADWAY BPAUTOTECH | 38 | | Rizzo's Bar & Inn | RIZZOS BAR INN | 39 | | Rizzo's Bar & Inn | RIZZOS BAR INN | 40 | | SAVE-A-LOT #882 | SAVEALOT 882 | 41 | | MEDITERRANEAN EXPRESS | MEDITERRANEAN EXPRESS | 42 | | SWEET FREAKS | SWEET FREAKS | 43 | | MINGHIN CUISINE KITCHEN | MINGHIN CUISINE KITCHEN | 44 | | HAPPY GROCERY & DOLLAR | HAPPY GROCERY DOLLAR | 45 | | ARDEN RESTAURANT | ARDEN RESTAURANT | 46 | | TBD | TBD | 47 | | MAGGIE GYROS & CHICKEN | MAGGIE GYROS CHICKEN | 48 | | WOLCOTT TAP | WOLCOTT TAP | 49 | | WOLCOTT TAP | WOLCOTT TAP | 50 | | 3JJJ'S BETTER TASTE JAMAICAN JERK RESTAURANT | 3JJJS BETTER TASTE JAMAICAN JERK RESTAURANT | 51 | | THE HARDING TAVERN | THE HARDING TAVERN | 52 | 
| ZACATACOS, II. INC | ZACATACOS II INC | 53 | | ONESTI PIZZERIA INC | ONESTI PIZZERIA INC | 54 | | 3JJJ'S BETTER TASTE JAMAICAN JERK RESTAURANT | 3JJJS BETTER TASTE JAMAICAN JERK RESTAURANT | 55 | | NORMAN'S | NORMANS | 56 | | MCCB | MCCB | 57 | | CHECKERS DRIVE-IN RESTAURANTS, INC | CHECKERS DRIVEIN RESTAURANTS INC | 58 | | Rizzo's Bar & Inn | RIZZOS BAR INN | 59 | | GRILL 87 | GRILL 87 | 60 | | KFC | KFC | 61 | | PACO'S TACOS 2 | PACOS TACOS 2 | 62 | | MARTINI CLUB | MARTINI CLUB | 63 | :END: 64 | -------------------------------------------------------------------------------- /triage/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/triage/.gitkeep -------------------------------------------------------------------------------- /triage/audition/eis/distance_from_best_precision@10_pct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/triage/audition/eis/distance_from_best_precision@10_pct.png -------------------------------------------------------------------------------- /triage/audition/eis/metric_over_time_precision@10_pct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/triage/audition/eis/metric_over_time_precision@10_pct.png -------------------------------------------------------------------------------- /triage/audition/eis/precision@10_pct_next_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/triage/audition/eis/precision@10_pct_next_time.png -------------------------------------------------------------------------------- 
/triage/audition/eis/regret_distance_from_best_rules_precision@10_pct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/triage/audition/eis/regret_distance_from_best_rules_precision@10_pct.png -------------------------------------------------------------------------------- /triage/audition/eis/regret_over_time_precision@10_pct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/triage/audition/eis/regret_over_time_precision@10_pct.png -------------------------------------------------------------------------------- /triage/audition/eis/results_model_group_ids.json: -------------------------------------------------------------------------------- 1 | {"best_current_value_precision@_10_pct": [70, 66, 64], "best_average_value_precision@_10_pct": [66, 64, 72], "lowest_metric_variance_precision@_10_pct": [65, 67, 69], "most_frequent_best_dist_precision@_10_pct_0.05": [64, 66, 70]} -------------------------------------------------------------------------------- /triage/audition/inspections/distance_from_best_precision@10_pct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/triage/audition/inspections/distance_from_best_precision@10_pct.png -------------------------------------------------------------------------------- /triage/audition/inspections/metric_over_time_precision@10_pct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/triage/audition/inspections/metric_over_time_precision@10_pct.png -------------------------------------------------------------------------------- 
/triage/audition/inspections/precision@10_pct_next_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/triage/audition/inspections/precision@10_pct_next_time.png -------------------------------------------------------------------------------- /triage/audition/inspections/regret_distance_from_best_rules_precision@10_pct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/triage/audition/inspections/regret_distance_from_best_rules_precision@10_pct.png -------------------------------------------------------------------------------- /triage/audition/inspections/regret_over_time_precision@10_pct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/triage/audition/inspections/regret_over_time_precision@10_pct.png -------------------------------------------------------------------------------- /triage/audition/inspections/results_model_group_ids.json: -------------------------------------------------------------------------------- 1 | {"best_current_value_precision@_10_pct": [41, 45, 32], "best_average_value_precision@_10_pct": [41, 11, 45], "lowest_metric_variance_precision@_10_pct": [6, 20, 27], "most_frequent_best_dist_precision@_10_pct_0.05": [10, 11, 12]} -------------------------------------------------------------------------------- /triage/eis_audition_config.yaml: -------------------------------------------------------------------------------- 1 | # CHOOSE MODEL GROUPS 2 | model_groups: 3 | query: | 4 | select distinct(model_group_id) 5 | from model_metadata.model_groups 6 | where model_config ->> 'experiment_type' ~ 'eis' 7 | # CHOOSE TIMESTAMPS/TRAIN END TIMES 8 | time_stamps: 9 | query: | 10 
| select distinct train_end_time 11 | from model_metadata.models 12 | where model_group_id in ({}) 13 | and extract(day from train_end_time) in (1) 14 | and train_end_time >= '2015-01-01' 15 | # FILTER 16 | filter: 17 | metric: 'precision@' # metric of interest 18 | parameter: '10_pct' # parameter of interest 19 | max_from_best: 1.0 # The maximum value that the given metric can be worse than the best model for a given train end time. 20 | threshold_value: 0.0 # The worst absolute value that the given metric should be. 21 | distance_table: 'eis_distance_table' # name of the distance table 22 | models_table: 'models' # name of the models table 23 | 24 | # RULES 25 | rules: 26 | - 27 | shared_parameters: 28 | - 29 | metric: 'precision@' 30 | parameter: '10_pct' 31 | 32 | selection_rules: 33 | - 34 | name: 'best_current_value' # Pick the model group with the best current metric value 35 | n: 3 36 | - 37 | name: 'best_average_value' # Pick the model with the highest average metric value 38 | n: 3 39 | - 40 | name: 'lowest_metric_variance' # Pick the model with the lowest metric variance 41 | n: 3 42 | - 43 | name: 'most_frequent_best_dist' # Pick the model that is most frequently within `dist_from_best_case` 44 | dist_from_best_case: [0.05] 45 | n: 3 46 | -------------------------------------------------------------------------------- /triage/eis_crosstabs_config.yaml: -------------------------------------------------------------------------------- 1 | output: 2 | schema: 'test_results' 3 | table: 'eis_crosstabs' 4 | 5 | thresholds: 6 | rank_abs: [50] 7 | rank_pct: [5] 8 | 9 | #(optional): a list of entity_ids to subset on the crosstabs analysis 10 | entity_id_list: [] 11 | 12 | models_list_query: "select unnest(ARRAY[226]) :: int as model_id" 13 | 14 | as_of_dates_query: "select generate_series('2017-12-01'::date, '2018-09-01'::date, interval '1month') as as_of_date" 15 | 16 | #don't change this query unless strictly necessary. 
It is just validating pairs of (model_id,as_of_date) 17 | #it is just a join with distinct (model_id, as_of_date) in a predictions table 18 | models_dates_join_query: | 19 | select model_id, 20 | as_of_date 21 | from models_list_query as m 22 | cross join as_of_dates_query a join (select distinct model_id, as_of_date from test_results.predictions) as p 23 | using (model_id, as_of_date) 24 | 25 | #features_query must join models_dates_join_query with 1 or more features table using as_of_date 26 | features_query: | 27 | select m.model_id, m.as_of_date, f4.entity_id, f4.results_entity_id_1month_result_fail_avg, f4.results_entity_id_3month_result_fail_avg, f4.results_entity_id_6month_result_fail_avg, 28 | f2.inspection_types_zip_code_1month_type_canvass_sum, f3.risks_zip_code_1month_risk_high_sum, f4.results_entity_id_6month_result_pass_avg, 29 | f3.risks_entity_id_all_risk_high_sum, f2.inspection_types_zip_code_3month_type_canvass_sum, f4.results_entity_id_6month_result_pass_sum, 30 | f2.inspection_types_entity_id_all_type_canvass_sum 31 | from features.inspection_types_aggregation_imputed as f2 32 | inner join features.risks_aggregation_imputed as f3 using (entity_id, as_of_date) 33 | inner join features.results_aggregation_imputed as f4 using (entity_id, as_of_date) 34 | inner join models_dates_join_query as m using (as_of_date) 35 | 36 | #the predictions query must return model_id, as_of_date, entity_id, score, label_value, rank_abs and rank_pct 37 | #it must join models_dates_join_query using both model_id and as_of_date 38 | predictions_query: | 39 | select model_id, 40 | as_of_date, 41 | entity_id, 42 | score, 43 | label_value, 44 | coalesce(rank_abs, row_number() over (partition by (model_id, as_of_date) order by score desc)) as rank_abs, 45 | coalesce(rank_pct*100, ntile(100) over (partition by (model_id, as_of_date) order by score desc)) as rank_pct 46 | from test_results.predictions 47 | join models_dates_join_query using(model_id, as_of_date) 48 | where 
model_id in (select model_id from models_list_query) 49 | and as_of_date in (select as_of_date from as_of_dates_query) 50 | -------------------------------------------------------------------------------- /triage/eis_postmodeling_config.yaml: -------------------------------------------------------------------------------- 1 | # Postmodeling Configuration File 2 | 3 | project_path: '/triage' # Project path defined in triage with matrices and models 4 | audition_output_path: '/triage/audition/eis/results_model_group_ids.json' 5 | 6 | thresholds: # Thresholds for defining positive predictions 7 | rank_abs: [50, 100, 250] 8 | rank_pct: [5, 10, 25] 9 | 10 | baseline_query: | # SQL query for defining a baseline for comparison in plots. It needs a metric and parameter 11 | select g.model_group_id, 12 | m.model_id, 13 | extract('year' from m.evaluation_end_time) as as_of_date_year, 14 | m.metric, 15 | m.parameter, 16 | m.value, 17 | m.num_labeled_examples, 18 | m.num_labeled_above_threshold, 19 | m.num_positive_labels 20 | from test_results.evaluations m 21 | left join model_metadata.models g 22 | using(model_id) 23 | where g.model_group_id = 81 24 | and metric = 'precision@' 25 | and parameter = '10_pct' 26 | 27 | max_depth_error_tree: 5 # For error trees, how deep should the decision trees go? 
28 | n_features_plots: 10 # Number of features for importances 29 | figsize: [12, 12] # Default size for plots 30 | fontsize: 20 # Default fontsize for plots 31 | -------------------------------------------------------------------------------- /triage/experiments/eis_01.yaml: -------------------------------------------------------------------------------- 1 | config_version: 'v6' 2 | 3 | model_comment: 'eis: 01' 4 | 5 | user_metadata: 6 | label_definition: 'inspected' 7 | experiment_type: 'eis' 8 | description: | 9 | EIS 01 10 | purpose: 'model creation' 11 | org: 'DSaPP' 12 | team: 'Tutorial' 13 | author: 'Your name here' 14 | etl_date: '2019-02-21' 15 | 16 | model_group_keys: 17 | - 'class_path' 18 | - 'parameters' 19 | - 'feature_names' 20 | - 'feature_groups' 21 | - 'cohort_name' 22 | - 'state' 23 | - 'label_name' 24 | - 'label_timespan' 25 | - 'training_as_of_date_frequency' 26 | - 'max_training_history' 27 | - 'label_definition' 28 | - 'experiment_type' 29 | - 'org' 30 | - 'team' 31 | - 'author' 32 | - 'purpose' 33 | - 'etl_date' 34 | 35 | label_config: 36 | query: | 37 | select 38 | entity_id, 39 | True::integer as outcome 40 | from semantic.events 41 | where '{as_of_date}'::timestamp <= date 42 | and date < '{as_of_date}'::timestamp + interval '{label_timespan}' 43 | group by entity_id 44 | include_missing_labels_in_train_as: False 45 | name: 'inspected' 46 | 47 | cohort_config: 48 | query: | 49 | with buckets as ( 50 | select *, ntile(5) over (order by number_of_inspections asc) as bucket 51 | from ( 52 | select entity_id, count(*) as number_of_inspections 53 | from semantic.events 54 | group by entity_id 55 | ) as t 56 | ) 57 | select e.entity_id 58 | from semantic.entities as e 59 | inner join 60 | buckets as b 61 | using (entity_id) 62 | where 63 | daterange(start_time, end_time, '[]') @> '{as_of_date}'::date 64 | and bucket in (5) 65 | name: 'active_facilities' 66 | 67 | temporal_config: 68 | feature_start_time: '2010-01-04' 69 | feature_end_time: 
'2019-01-01' 70 | label_start_time: '2015-02-01' 71 | label_end_time: '2019-01-01' 72 | 73 | model_update_frequency: '1y' 74 | training_label_timespans: ['1month'] 75 | training_as_of_date_frequencies: '1month' 76 | 77 | test_durations: '1y' 78 | test_label_timespans: ['1month'] 79 | test_as_of_date_frequencies: '1month' 80 | 81 | max_training_histories: '5y' 82 | 83 | feature_aggregations: 84 | - 85 | prefix: 'inspections' 86 | from_obj: 'semantic.events' 87 | knowledge_date_column: 'date' 88 | 89 | aggregates_imputation: 90 | count: 91 | type: 'zero_noflag' 92 | 93 | aggregates: 94 | - 95 | quantity: 96 | total: "*" 97 | metrics: 98 | - 'count' 99 | 100 | intervals: ['1month', '3month', '6month', '1y', 'all'] 101 | 102 | groups: 103 | - 'entity_id' 104 | 105 | - 106 | prefix: 'risks' 107 | from_obj: 'semantic.events' 108 | knowledge_date_column: 'date' 109 | 110 | categoricals_imputation: 111 | sum: 112 | type: 'zero' 113 | avg: 114 | type: 'zero' 115 | 116 | categoricals: 117 | - 118 | column: 'risk' 119 | choices: ['low', 'medium', 'high'] 120 | metrics: 121 | - 'sum' 122 | - 'avg' 123 | 124 | intervals: ['1month', '3month', '6month', '1y', 'all'] 125 | 126 | groups: 127 | - 'entity_id' 128 | - 'zip_code' 129 | 130 | - 131 | prefix: 'results' 132 | from_obj: 'semantic.events' 133 | knowledge_date_column: 'date' 134 | 135 | categoricals_imputation: 136 | all: 137 | type: 'zero' 138 | 139 | categoricals: 140 | - 141 | column: 'result' 142 | choice_query: 'select distinct result from semantic.events' 143 | metrics: 144 | - 'sum' 145 | - 'avg' 146 | 147 | intervals: ['1month', '3month', '6month', '1y', 'all'] 148 | 149 | groups: 150 | - 'entity_id' 151 | 152 | - 153 | prefix: 'inspection_types' 154 | from_obj: 'semantic.events' 155 | knowledge_date_column: 'date' 156 | 157 | categoricals_imputation: 158 | sum: 159 | type: 'zero_noflag' 160 | 161 | categoricals: 162 | - 163 | column: 'type' 164 | choice_query: 'select distinct type from semantic.events where type is 
not null' 165 | metrics: 166 | - 'sum' 167 | 168 | intervals: ['1month', '3month', '6month', '1y', 'all'] 169 | 170 | groups: 171 | - 'entity_id' 172 | - 'zip_code' 173 | 174 | feature_group_definition: 175 | prefix: 176 | - 'inspections' 177 | - 'results' 178 | - 'risks' 179 | - 'inspection_types' 180 | 181 | feature_group_strategies: ['all', 'leave-one-out', 'leave-one-in'] 182 | 183 | grid_config: 184 | 'sklearn.tree.DecisionTreeClassifier': 185 | max_depth: [2,null] 186 | 'sklearn.ensemble.RandomForestClassifier': 187 | max_features: ['sqrt'] 188 | criterion: ['gini'] 189 | n_estimators: [500] 190 | min_samples_leaf: [1] 191 | min_samples_split: [50] 192 | 'sklearn.dummy.DummyClassifier': 193 | strategy: [most_frequent] 194 | 195 | scoring: 196 | testing_metric_groups: 197 | - 198 | metrics: [precision@, recall@] 199 | thresholds: 200 | percentiles: [1.0, 2.0, 3.0, 4.0, 5.0, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100] 201 | top_n: [1, 5, 10, 25, 50, 100, 250, 500, 1000] 202 | 203 | 204 | training_metric_groups: 205 | - 206 | metrics: [accuracy] 207 | - 208 | metrics: [precision@, recall@] 209 | thresholds: 210 | percentiles: [1.0, 2.0, 3.0, 4.0, 5.0, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100] 211 | top_n: [1, 5, 10, 25, 50, 100, 250, 500, 1000] 212 | -------------------------------------------------------------------------------- /triage/experiments/inspections-training.yaml: -------------------------------------------------------------------------------- 1 | config_version: 'v3' 2 | 3 | model_comment: 'test_triage_inspections' 4 | 5 | temporal_config: 6 | feature_start_time=np.min(df.date) 7 | feature_end_time=np.max(df.date) 8 | label_start_time=np.min(df.date) 9 | label_end_time=np.max(df.date) 10 | 11 | model_update_frequency='3months' 12 | training_label_timespans='1day' 13 | training_as_of_date_frequencies='1day' 14 | max_training_histories='1year' 15 | 16 | test_durations='1day' 17 | 
test_label_timespans='3month' 18 | test_as_of_date_frequencies='1day' 19 | 20 | events_table: 'inspections.events' 21 | 22 | feature_aggregations: 23 | - 24 | # Number of violations of a specific code and proportion, grouped by entity 25 | prefix: 'violations' 26 | from_obj: 'cleaned.violations' 27 | knowledge_date_column: 'knowledge_date' 28 | 29 | categoricals: 30 | - 31 | column: 'violation_code' 32 | choice_query: 'select distinct violation_code from cleaned.violations' 33 | metrics: 34 | - 'sum' 35 | - 'avg' 36 | 37 | intervals: 38 | - '1 y' 39 | 40 | groups: 41 | - 'entity_id' 42 | 43 | - # inspections in the last year associated with this entity 44 | prefix: 'inspections' 45 | from_obj: 'cleaned.inspections' 46 | knowledge_date_column: 'date' 47 | aggregates: 48 | - 49 | quantity: '*' 50 | metrics: 51 | - 'count' 52 | intervals: 53 | - '1 y' 54 | 55 | groups: 56 | - 'license_num' 57 | 58 | - # inspections that happened in the last year grouped by type of facility 59 | prefix: 'inspections' 60 | from_obj: 'cleaned.inspections' 61 | knowledge_date_column: 'date' 62 | 63 | aggregates: 64 | - 65 | quantity: '*' 66 | metrics: 67 | - 'count' 68 | intervals: 69 | - '1 y' 70 | 71 | groups: 72 | - 'facility_type' 73 | 74 | - # inspections that happened in the last year grouped by zip code 75 | prefix: 'inspections' 76 | from_obj: 'cleaned.inspections' 77 | knowledge_date_column: 'date' 78 | 79 | aggregates: 80 | - 81 | quantity: '*' 82 | metrics: 83 | - 'count' 84 | intervals: 85 | - '1 y' 86 | 87 | groups: 88 | - 'zip_code' 89 | 90 | feature_group_strategies: ['all'] 91 | 92 | model_group_keys: [] 93 | 94 | grid_config: 95 | 'sklearn.tree.DecisionTreeClassifier': 96 | criterion: ['gini'] 97 | max_depth: [3] 98 | min_samples_split: [10] 99 | 100 | scoring: 101 | metric_groups: 102 | - 103 | metrics: ['precision@', 'recall@', 'fpr@'] 104 | thresholds: 105 | percentiles: [1.0, 2.0, 5.0, 10.0, 25.0] 106 | top_n: [25, 75, 150, 300, 500, 1000, 1500] 107 | 
-------------------------------------------------------------------------------- /triage/experiments/inspections_baseline.yaml: -------------------------------------------------------------------------------- 1 | config_version: 'v6' 2 | 3 | model_comment: 'inspections: baseline' 4 | 5 | user_metadata: 6 | label_definition: 'failed' 7 | experiment_type: 'inspections prioritization' 8 | description: | 9 | Baseline calculation 10 | purpose: 'baseline' 11 | org: 'DSaPP' 12 | team: 'Tutorial' 13 | author: 'Your name here' 14 | etl_date: '2019-02-21' 15 | 16 | model_group_keys: 17 | - 'class_path' 18 | - 'parameters' 19 | - 'feature_names' 20 | - 'feature_groups' 21 | - 'cohort_name' 22 | - 'state' 23 | - 'label_name' 24 | - 'label_timespan' 25 | - 'training_as_of_date_frequency' 26 | - 'max_training_history' 27 | - 'label_definition' 28 | - 'experiment_type' 29 | - 'org' 30 | - 'team' 31 | - 'author' 32 | - 'purpose' 33 | - 'etl_date' 34 | 35 | temporal_config: 36 | feature_start_time: '2010-01-04' 37 | feature_end_time: '2019-01-01' 38 | label_start_time: '2015-02-01' 39 | label_end_time: '2019-01-01' 40 | 41 | model_update_frequency: '1y' 42 | training_label_timespans: ['1month'] 43 | training_as_of_date_frequencies: '1month' 44 | 45 | test_durations: '1y' 46 | test_label_timespans: ['1month'] 47 | test_as_of_date_frequencies: '1month' 48 | 49 | max_training_histories: '5y' 50 | 51 | label_config: 52 | query: | 53 | select 54 | entity_id, 55 | bool_or(result = 'fail')::integer as outcome 56 | from semantic.events 57 | where '{as_of_date}'::timestamp <= date 58 | and date < '{as_of_date}'::timestamp + interval '{label_timespan}' 59 | group by entity_id 60 | name: 'failed_inspections' 61 | 62 | cohort_config: 63 | query: | 64 | with buckets as ( 65 | select *, ntile(5) over (order by number_of_inspections asc) as bucket 66 | from ( 67 | select entity_id, count(*) as number_of_inspections 68 | from semantic.events 69 | group by entity_id 70 | ) as t 71 | ) 72 | select 
e.entity_id 73 | from semantic.entities as e 74 | inner join 75 | buckets as b 76 | using (entity_id) 77 | where 78 | daterange(start_time, end_time, '[]') @> '{as_of_date}'::date 79 | and bucket in (5) 80 | name: 'active_facilities' 81 | 82 | feature_aggregations: 83 | - 84 | prefix: 'inspections' 85 | from_obj: 'semantic.events' 86 | knowledge_date_column: 'date' 87 | 88 | aggregates_imputation: 89 | count: 90 | type: 'zero_noflag' 91 | 92 | aggregates: 93 | - 94 | quantity: 95 | total: "*" 96 | metrics: 97 | - 'count' 98 | 99 | intervals: ['all'] 100 | 101 | groups: 102 | - 'entity_id' 103 | 104 | feature_group_definition: 105 | prefix: 106 | - 'inspections' 107 | 108 | feature_group_strategies: ['all'] 109 | 110 | grid_config: 111 | 'sklearn.dummy.DummyClassifier': 112 | strategy: [prior,uniform, most_frequent] 113 | 114 | scoring: 115 | testing_metric_groups: 116 | - 117 | metrics: [precision@, recall@] 118 | thresholds: 119 | percentiles: [1.0, 2.0, 3.0, 4.0, 5.0, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100] 120 | top_n: [1, 5, 10, 25, 50, 100, 250, 500, 1000] 121 | 122 | training_metric_groups: 123 | - 124 | metrics: [accuracy] 125 | - 126 | metrics: [precision@, recall@] 127 | thresholds: 128 | percentiles: [1.0, 2.0, 3.0, 4.0, 5.0, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100] 129 | top_n: [1, 5, 10, 25, 50, 100, 250, 500, 1000] 130 | -------------------------------------------------------------------------------- /triage/experiments/inspections_dt.yaml: -------------------------------------------------------------------------------- 1 | config_version: 'v6' 2 | 3 | model_comment: 'inspections: DT' 4 | 5 | user_metadata: 6 | label_definition: 'failed' 7 | experiment_type: 'inspections prioritization' 8 | description: | 9 | Decision Tree Classifier 10 | purpose: 'data mining' 11 | org: 'DSaPP' 12 | team: 'Tutorial' 13 | author: 'Your name here' 14 | etl_date: '2019-02-21' 15 | 16 | 
model_group_keys: 17 | - 'class_path' 18 | - 'parameters' 19 | - 'feature_names' 20 | - 'feature_groups' 21 | - 'cohort_name' 22 | - 'state' 23 | - 'label_name' 24 | - 'label_timespan' 25 | - 'training_as_of_date_frequency' 26 | - 'max_training_history' 27 | - 'label_definition' 28 | - 'experiment_type' 29 | - 'org' 30 | - 'team' 31 | - 'author' 32 | - 'purpose' 33 | - 'etl_date' 34 | 35 | temporal_config: 36 | feature_start_time: '2010-01-04' 37 | feature_end_time: '2019-01-01' 38 | label_start_time: '2015-02-01' 39 | label_end_time: '2019-01-01' 40 | 41 | model_update_frequency: '1y' 42 | training_label_timespans: ['1month'] 43 | training_as_of_date_frequencies: '1month' 44 | 45 | test_durations: '1y' 46 | test_label_timespans: ['1month'] 47 | test_as_of_date_frequencies: '1month' 48 | 49 | max_training_histories: '5y' 50 | 51 | label_config: 52 | query: | 53 | select 54 | entity_id, 55 | bool_or(result = 'fail')::integer as outcome 56 | from semantic.events 57 | where '{as_of_date}'::timestamp <= date 58 | and date < '{as_of_date}'::timestamp + interval '{label_timespan}' 59 | group by entity_id 60 | name: 'failed_inspections' 61 | 62 | cohort_config: 63 | query: | 64 | with buckets as ( 65 | select *, ntile(5) over (order by number_of_inspections asc) as bucket 66 | from ( 67 | select entity_id, count(*) as number_of_inspections 68 | from semantic.events 69 | group by entity_id 70 | ) as t 71 | ) 72 | select e.entity_id 73 | from semantic.entities as e 74 | inner join 75 | buckets as b 76 | using (entity_id) 77 | where 78 | daterange(start_time, end_time, '[]') @> '{as_of_date}'::date 79 | and bucket in (5) 80 | name: 'active_facilities' 81 | 82 | feature_aggregations: 83 | - 84 | prefix: 'inspections' 85 | from_obj: 'semantic.events' 86 | knowledge_date_column: 'date' 87 | 88 | aggregates_imputation: 89 | count: 90 | type: 'zero_noflag' 91 | 92 | aggregates: 93 | - 94 | quantity: 95 | total: "*" 96 | metrics: 97 | - 'count' 98 | 99 | intervals: ['1month', 
'3month', '6month', '1y', 'all'] 100 | 101 | groups: 102 | - 'entity_id' 103 | 104 | - 105 | prefix: 'risks' 106 | from_obj: 'semantic.events' 107 | knowledge_date_column: 'date' 108 | 109 | categoricals_imputation: 110 | sum: 111 | type: 'zero' 112 | avg: 113 | type: 'zero' 114 | 115 | categoricals: 116 | - 117 | column: 'risk' 118 | choices: ['low', 'medium', 'high'] 119 | metrics: 120 | - 'sum' 121 | - 'avg' 122 | 123 | intervals: ['1month', '3month', '6month', '1y', 'all'] 124 | 125 | groups: 126 | - 'entity_id' 127 | - 'zip_code' 128 | 129 | - 130 | prefix: 'results' 131 | from_obj: 'semantic.events' 132 | knowledge_date_column: 'date' 133 | 134 | categoricals_imputation: 135 | all: 136 | type: 'zero' 137 | 138 | categoricals: 139 | - 140 | column: 'result' 141 | choice_query: 'select distinct result from semantic.events' 142 | metrics: 143 | - 'sum' 144 | - 'avg' 145 | 146 | intervals: ['1month', '3month', '6month', '1y', 'all'] 147 | 148 | groups: 149 | - 'entity_id' 150 | 151 | - 152 | prefix: 'inspection_types' 153 | from_obj: 'semantic.events' 154 | knowledge_date_column: 'date' 155 | 156 | categoricals_imputation: 157 | sum: 158 | type: 'zero_noflag' 159 | 160 | categoricals: 161 | - 162 | column: 'type' 163 | choice_query: 'select distinct type from semantic.events where type is not null' 164 | metrics: 165 | - 'sum' 166 | 167 | intervals: ['1month', '3month', '6month', '1y', 'all'] 168 | 169 | groups: 170 | - 'entity_id' 171 | - 'zip_code' 172 | 173 | grid_config: 174 | 'sklearn.tree.DecisionTreeClassifier': 175 | max_depth: [2,10,~] 176 | min_samples_split: [2,5] 177 | 178 | feature_group_definition: 179 | prefix: 180 | - 'inspections' 181 | - 'results' 182 | - 'risks' 183 | - 'inspection_types' 184 | 185 | feature_group_strategies: ['all'] 186 | 187 | scoring: 188 | testing_metric_groups: 189 | - 190 | metrics: [precision@, recall@] 191 | thresholds: 192 | percentiles: [1.0, 2.0, 3.0, 4.0, 5.0, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 
80, 85, 90, 95, 100] 193 | top_n: [1, 5, 10, 25, 50, 100, 250, 500, 1000] 194 | 195 | 196 | training_metric_groups: 197 | - 198 | metrics: [accuracy] 199 | - 200 | metrics: [precision@, recall@] 201 | thresholds: 202 | percentiles: [1.0, 2.0, 3.0, 4.0, 5.0, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100] 203 | top_n: [1, 5, 10, 25, 50, 100, 250, 500, 1000] 204 | -------------------------------------------------------------------------------- /triage/experiments/inspections_label_failed_01.yaml: -------------------------------------------------------------------------------- 1 | config_version: 'v6' 2 | 3 | model_comment: 'inspections: advanced' 4 | 5 | user_metadata: 6 | label_definition: 'failed' 7 | experiment_type: 'inspections prioritization' 8 | description: | 9 | Using Ensamble methods 10 | purpose: 'trying ensamble algorithms' 11 | org: 'DSaPP' 12 | team: 'Tutorial' 13 | author: 'Your name here' 14 | etl_date: '2019-02-21' 15 | 16 | model_group_keys: 17 | - 'class_path' 18 | - 'parameters' 19 | - 'feature_names' 20 | - 'feature_groups' 21 | - 'cohort_name' 22 | - 'state' 23 | - 'label_name' 24 | - 'label_timespan' 25 | - 'training_as_of_date_frequency' 26 | - 'max_training_history' 27 | - 'label_definition' 28 | - 'experiment_type' 29 | - 'org' 30 | - 'team' 31 | - 'author' 32 | - 'purpose' 33 | - 'etl_date' 34 | 35 | temporal_config: 36 | feature_start_time: '2010-01-04' 37 | feature_end_time: '2019-01-01' 38 | label_start_time: '2015-02-01' 39 | label_end_time: '2019-01-01' 40 | 41 | model_update_frequency: '1y' 42 | training_label_timespans: ['1month'] 43 | training_as_of_date_frequencies: '1month' 44 | 45 | test_durations: '1y' 46 | test_label_timespans: ['1month'] 47 | test_as_of_date_frequencies: '1month' 48 | 49 | max_training_histories: '5y' 50 | 51 | label_config: 52 | query: | 53 | select 54 | entity_id, 55 | bool_or(result = 'fail')::integer as outcome 56 | from semantic.events 57 | where '{as_of_date}'::timestamp <= 
date 58 | and date < '{as_of_date}'::timestamp + interval '{label_timespan}' 59 | group by entity_id 60 | name: 'failed_inspections' 61 | 62 | 63 | cohort_config: 64 | query: | 65 | with buckets as ( 66 | select *, ntile(5) over (order by number_of_inspections asc) as bucket 67 | from ( 68 | select entity_id, count(*) as number_of_inspections 69 | from semantic.events 70 | group by entity_id 71 | ) as t 72 | ) 73 | select e.entity_id 74 | from semantic.entities as e 75 | inner join 76 | buckets as b 77 | using (entity_id) 78 | where 79 | daterange(start_time, end_time, '[]') @> '{as_of_date}'::date 80 | and bucket in (5) 81 | name: 'active_facilities' 82 | 83 | feature_aggregations: 84 | - 85 | prefix: 'inspections' 86 | from_obj: 'semantic.events' 87 | knowledge_date_column: 'date' 88 | 89 | aggregates_imputation: 90 | count: 91 | type: 'zero_noflag' 92 | 93 | aggregates: 94 | - 95 | quantity: 96 | total: "*" 97 | metrics: 98 | - 'count' 99 | 100 | intervals: ['1month', '3month', '6month', '1y', 'all'] 101 | 102 | groups: 103 | - 'entity_id' 104 | 105 | - 106 | prefix: 'risks' 107 | from_obj: 'semantic.events' 108 | knowledge_date_column: 'date' 109 | 110 | categoricals_imputation: 111 | sum: 112 | type: 'zero' 113 | avg: 114 | type: 'zero' 115 | 116 | categoricals: 117 | - 118 | column: 'risk' 119 | choices: ['low', 'medium', 'high'] 120 | metrics: 121 | - 'sum' 122 | - 'avg' 123 | 124 | intervals: ['1month', '3month', '6month', '1y', 'all'] 125 | 126 | groups: 127 | - 'entity_id' 128 | - 'zip_code' 129 | 130 | - 131 | prefix: 'results' 132 | from_obj: 'semantic.events' 133 | knowledge_date_column: 'date' 134 | 135 | categoricals_imputation: 136 | all: 137 | type: 'zero' 138 | 139 | categoricals: 140 | - 141 | column: 'result' 142 | choice_query: 'select distinct result from semantic.events' 143 | metrics: 144 | - 'sum' 145 | - 'avg' 146 | 147 | intervals: ['1month', '3month', '6month', '1y', 'all'] 148 | 149 | groups: 150 | - 'entity_id' 151 | 152 | - 153 | 
prefix: 'inspection_types' 154 | from_obj: 'semantic.events' 155 | knowledge_date_column: 'date' 156 | 157 | categoricals_imputation: 158 | sum: 159 | type: 'zero_noflag' 160 | 161 | categoricals: 162 | - 163 | column: 'type' 164 | choice_query: 'select distinct type from semantic.events where type is not null' 165 | metrics: 166 | - 'sum' 167 | 168 | intervals: ['1month', '3month', '6month', '1y', 'all'] 169 | 170 | groups: 171 | - 'entity_id' 172 | - 'zip_code' 173 | 174 | feature_group_definition: 175 | prefix: 176 | - 'inspections' 177 | - 'results' 178 | - 'risks' 179 | - 'inspection_types' 180 | 181 | feature_group_strategies: ['all', 'leave-one-in', 'leave-one-out'] 182 | 183 | grid_config: 184 | 'sklearn.ensemble.RandomForestClassifier': 185 | max_features: ['sqrt'] 186 | criterion: ['gini'] 187 | n_estimators: [100, 250] 188 | min_samples_split: [2,10] 189 | 190 | scoring: 191 | testing_metric_groups: 192 | - 193 | metrics: [precision@, recall@] 194 | thresholds: 195 | percentiles: [1.0, 2.0, 3.0, 4.0, 5.0, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100] 196 | top_n: [1, 5, 10, 25, 50, 100, 250, 500, 1000] 197 | 198 | training_metric_groups: 199 | - 200 | metrics: [accuracy] 201 | - 202 | metrics: [precision@, recall@] 203 | thresholds: 204 | percentiles: [1.0, 2.0, 3.0, 4.0, 5.0, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100] 205 | top_n: [1, 5, 10, 25, 50, 100, 250, 500, 1000] 206 | -------------------------------------------------------------------------------- /triage/experiments/simple_test_skeleton.yaml: -------------------------------------------------------------------------------- 1 | config_version: 'v6' 2 | 3 | model_comment: 'simple_test_skeleton' 4 | 5 | user_metadata: 6 | label_definition: 'failed_inspection' 7 | experiment_type: 'test' 8 | org: 'DSaPP' 9 | team: 'Tutorial' 10 | author: 'Adolfo De Unanue' 11 | etl_date: '2019-02-21' 12 | 13 | temporal_config: 14 | feature_start_time: 
'2014-01-01' 15 | feature_end_time: '2018-01-01' 16 | label_start_time: '2014-01-02' 17 | label_end_time: '2018-01-01' 18 | 19 | model_update_frequency: '1y' 20 | 21 | max_training_histories: '1y' 22 | training_label_timespans: ['1y'] 23 | training_as_of_date_frequencies: '1month' 24 | 25 | test_durations: '0d' 26 | test_label_timespans: ['1y'] 27 | test_as_of_date_frequencies: '1month' 28 | 29 | cohort_config: 30 | query: | 31 | select entity_id 32 | from semantic.entities 33 | where 34 | license_num in (1596210, 1874347, 1142451) 35 | and daterange(start_time, end_time, '[]') @> '{as_of_date}'::date 36 | name: 'test_facilities' 37 | 38 | label_config: 39 | query: | 40 | select 41 | entity_id, 42 | bool_or(result = 'fail')::integer as outcome 43 | from semantic.events 44 | where '{as_of_date}'::timestamp <= date 45 | and date < '{as_of_date}'::timestamp + interval '{label_timespan}' 46 | group by entity_id 47 | name: 'failed_inspections' 48 | 49 | grid_config: 50 | 'sklearn.dummy.DummyClassifier': 51 | strategy: [most_frequent] 52 | 53 | feature_aggregations: 54 | - 55 | prefix: 'inspections' 56 | from_obj: 'semantic.events' 57 | knowledge_date_column: 'date' 58 | 59 | aggregates_imputation: 60 | count: 61 | type: 'zero_noflag' 62 | 63 | aggregates: 64 | - 65 | quantity: 66 | total: "*" 67 | metrics: 68 | - 'count' 69 | 70 | intervals: ['1month', '3month', '6month', '1y', 'all'] 71 | 72 | groups: 73 | - 'entity_id' 74 | 75 | 76 | - 77 | prefix: 'risks' 78 | from_obj: 'semantic.events' 79 | knowledge_date_column: 'date' 80 | 81 | categoricals_imputation: 82 | sum: 83 | type: 'zero' 84 | avg: 85 | type: 'zero' 86 | 87 | categoricals: 88 | - 89 | column: 'risk' 90 | choices: ['low', 'medium', 'high'] 91 | metrics: 92 | - 'sum' 93 | - 'avg' 94 | 95 | intervals: ['1month', '3month', '6month', '1y', 'all'] 96 | 97 | groups: 98 | - 'entity_id' 99 | - 'zip_code' 100 | 101 | - 102 | prefix: 'results' 103 | from_obj: 'semantic.events' 104 | knowledge_date_column: 'date' 105 
| 106 | categoricals_imputation: 107 | all: 108 | type: 'zero' 109 | 110 | categoricals: 111 | - 112 | column: 'result' 113 | choice_query: 'select distinct result from semantic.events' 114 | metrics: 115 | - 'sum' 116 | - 'avg' 117 | 118 | intervals: 119 | - '6month' 120 | 121 | groups: 122 | - 'entity_id' 123 | 124 | feature_group_definition: 125 | prefix: 126 | - 'results' 127 | - 'risks' 128 | - 'inspections' 129 | 130 | feature_group_strategies: ['all'] 131 | 132 | model_group_keys: 133 | - 'class_path' 134 | - 'parameters' 135 | - 'feature_names' 136 | - 'feature_groups' 137 | - 'cohort_name' 138 | - 'state' 139 | - 'label_name' 140 | - 'label_timespan' 141 | - 'training_as_of_date_frequency' 142 | - 'max_training_history' 143 | - 'label_definition' 144 | - 'experiment_type' 145 | - 'org' 146 | - 'team' 147 | - 'author' 148 | - 'etl_date' 149 | 150 | scoring: 151 | testing_metric_groups: 152 | - 153 | metrics: ['precision@', 'recall@'] 154 | thresholds: 155 | percentiles: [1.0, 5.0, 10.0, 25.0, 50.0, 75.0, 100.0] 156 | top_n: [1, 5, 10, 25, 50, 100, 150, 300, 500, 1000, 1500] 157 | training_metric_groups: 158 | - 159 | metrics: ['accuracy'] 160 | - 161 | metrics: ['precision@', 'recall@'] 162 | thresholds: 163 | percentiles: [1.0, 5.0, 10.0, 25.0, 50.0, 75.0, 100.0] 164 | top_n: [1, 5, 10, 25, 50, 100, 150, 300, 500, 1000, 1500] 165 | -------------------------------------------------------------------------------- /triage/inspection_audition_config.yaml: -------------------------------------------------------------------------------- 1 | # CHOOSE MODEL GROUPS 2 | model_groups: 3 | query: | 4 | select distinct(model_group_id) 5 | from model_metadata.model_groups 6 | where model_config ->> 'experiment_type' ~ 'inspection' 7 | # CHOOSE TIMESTAMPS/TRAIN END TIMES 8 | time_stamps: 9 | query: | 10 | select distinct train_end_time 11 | from model_metadata.models 12 | where model_group_id in ({}) 13 | and extract(day from train_end_time) in (1) 14 | and 
train_end_time >= '2015-01-01' 15 | # FILTER 16 | filter: 17 | metric: 'precision@' # metric of interest 18 | parameter: '10_pct' # parameter of interest 19 | max_from_best: 1.0 # The maximum value that the given metric can be worse than the best model for a given train end time. 20 | threshold_value: 0.0 # The worst absolute value that the given metric should be. 21 | distance_table: 'inspections_distance_table' # name of the distance table 22 | models_table: 'models' # name of the models table 23 | 24 | # RULES 25 | rules: 26 | - 27 | shared_parameters: 28 | - 29 | metric: 'precision@' 30 | parameter: '10_pct' 31 | 32 | selection_rules: 33 | - 34 | name: 'best_current_value' # Pick the model group with the best current metric value 35 | n: 3 36 | - 37 | name: 'best_average_value' # Pick the model with the highest average metric value 38 | n: 3 39 | - 40 | name: 'lowest_metric_variance' # Pick the model with the lowest metric variance 41 | n: 3 42 | - 43 | name: 'most_frequent_best_dist' # Pick the model that is most frequently within `dist_from_best_case` 44 | dist_from_best_case: [0.05] 45 | n: 3 46 | -------------------------------------------------------------------------------- /triage/inspection_postmodeling_config.yaml: -------------------------------------------------------------------------------- 1 | # Postmodeling Configuration File 2 | 3 | project_path: '/triage' # Project path defined in triage with matrices and models 4 | model_group_id: 5 | - 41 6 | - 32 7 | - 45 8 | - 11 9 | 10 | thresholds: # Thresholds for defining positive predictions 11 | rank_abs: [50, 100, 250] 12 | rank_pct: [5, 10, 25] 13 | 14 | baseline_query: | # SQL query for defining a baseline for comparison in plots. 
It needs a metric and parameter 15 | select g.model_group_id, 16 | m.model_id, 17 | extract('year' from m.evaluation_end_time) as as_of_date_year, 18 | m.metric, 19 | m.parameter, 20 | m.value, 21 | m.num_labeled_examples, 22 | m.num_labeled_above_threshold, 23 | m.num_positive_labels 24 | from test_results.evaluations m 25 | left join model_metadata.models g 26 | using(model_id) 27 | where g.model_group_id = 1 28 | and metric = 'precision@' 29 | and parameter = '10_pct' 30 | 31 | max_depth_error_tree: 5 # For error trees, how deep should the decision trees go? 32 | n_features_plots: 10 # Number of features for importances 33 | figsize: [12, 12] # Default size for plots 34 | fontsize: 20 # Default fontsize for plots 35 | -------------------------------------------------------------------------------- /triage/output/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/triage/output/.gitkeep -------------------------------------------------------------------------------- /triage/output/images/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/triage/output/images/.gitkeep -------------------------------------------------------------------------------- /triage/output/images/model_7_tree_0.svg: -------------------------------------------------------------------------------- 1 | 2 | 4 | 6 | 7 | 9 | 10 | Tree 11 | 12 | 13 | 0 14 | 15 | inspections_zip_code_3month_type_complaint_sum ≤ 0.5 16 | gini = 0.353 17 | samples = 26018 18 | value = [20060, 5958] 19 | class = y 20 | 0 21 | 22 | 23 | 1 24 | 25 | gini = 0.375 26 | samples = 21708 27 | value = [16281, 5427] 28 | class = y 29 | 0 30 | 31 | 32 | 0->1 33 | 34 | 35 | True 36 | 37 | 38 | 2 39 | 40 | gini = 0.216 41 | samples = 4310 42 | value = [3779, 531] 43 | class =
y 44 | 0 45 | 46 | 47 | 0->2 48 | 49 | 50 | False 51 | 52 | 53 | 54 | -------------------------------------------------------------------------------- /triage/selection_rules/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/triage/selection_rules/.gitkeep -------------------------------------------------------------------------------- /triage/selection_rules/rules.yaml: -------------------------------------------------------------------------------- 1 | - 2 | shared_parameters: 3 | - 4 | metric: 'precision@' 5 | parameter: '50_abs' 6 | selection_rules: 7 | - 8 | name: best_current_value 9 | n: 1 10 | - 11 | name: best_average_value 12 | n: 1 13 | - 14 | name: lowest_metric_variance 15 | n: 1 16 | - 17 | name: most_frequent_best_dist 18 | dist_from_best_case: [0.05] 19 | n: 1 20 | -------------------------------------------------------------------------------- /triage/session.key: -------------------------------------------------------------------------------- 1 | c2e3bb2a-f80c7b34d4fe02d7e5be87d9 2 | -------------------------------------------------------------------------------- /tutorial.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e -u 4 | 5 | PROJECT="triage-dirtyduck" 6 | PROJECT_HOME="$( cd "$( dirname "$0" )" && pwd )" 7 | INFRASTRUCTURE_HOME="${PROJECT_HOME}/infrastructure" 8 | 9 | cd "$INFRASTRUCTURE_HOME" 10 | 11 | function help_menu () { 12 | cat << EOF 13 | Usage: ${0} {start|stop|build|rebuild|run|logs|status|destroy|all|} 14 | 15 | OPTIONS: 16 | -h|help Show this message 17 | start 18 | stop 19 | rebuild 20 | status 21 | destroy 22 | -t|triage 23 | -a|all 24 | 25 | INFRASTRUCTURE: 26 | Build the infrastructure: 27 | $ ./tutorial.sh start 28 | 29 | Check the status of the containers: 30 | $ ./tutorial.sh status 31 | 32 | Stop the tutorial's 
infrastructure: 33 | $ ./tutorial.sh stop 34 | 35 | Destroy all the resources related to the tutorial: 36 | $ ./tutorial.sh destroy 37 | 38 | View the infrastructure logs: 39 | $ ./tutorial.sh -l 40 | 41 | EXPERIMENTS: 42 | NOTE: 43 | The following commands assume that "sample_experiments.yaml" 44 | is located inside the triage/experiments directory 45 | 46 | Run one experiment: 47 | $ ./tutorial.sh -t --config_file sample_experiment_config.yaml run 48 | 49 | Run one experiment, do not replace existing matrices or models, and enable debug: 50 | $ ./tutorial.sh -t --config_file sample_experiment_config.yaml --no-replace --debug run 51 | 52 | Validate experiment configuration file: 53 | $ ./tutorial.sh triage --config_file sample_experiment_config.yaml validate 54 | 55 | Show the experiment's temporal cross-validation blocks: 56 | $ ./tutorial.sh -t --config_file sample_experiment_config.yaml show-temporal-blocks 57 | 58 | Plot model number 4 (for Decision Trees and Random Forests): 59 | $ ./tutorial.sh -t --config_file sample_experiment_config.yaml show_model_plot --model 4 60 | 61 | Triage help: 62 | $ ./tutorial.sh triage --help 63 | 64 | EOF 65 | } 66 | 67 | function start_infrastructure () { 68 | docker-compose --project-name ${PROJECT} up -d food_db 69 | #tyra reverseproxy api 70 | } 71 | 72 | function stop_infrastructure () { 73 | docker-compose --project-name ${PROJECT} stop 74 | } 75 | 76 | function build_images () { 77 | docker-compose --project-name ${PROJECT} build "${@}" 78 | } 79 | 80 | function destroy () { 81 | docker-compose --project-name ${PROJECT} down --rmi all --remove-orphans --volumes 82 | } 83 | 84 | function infrastructure_logs () { 85 | docker-compose --project-name ${PROJECT} logs -f -t 86 | } 87 | 88 | function status () { 89 | docker-compose --project-name ${PROJECT} ps 90 | } 91 | 92 | function bastion () { 93 | docker-compose --project-name ${PROJECT} run --service-ports --rm --name tutorial_bastion bastion 94 | } 95 | 96 | function 
triage () {
    # Run the `triage` compose service in a one-off container named
    # `triage_experiment` (removed on exit via --rm), forwarding every
    # argument exactly as received.
    docker-compose --project-name "${PROJECT}" run --rm --name triage_experiment triage "${@}"
}

# Build the images, start the infrastructure and print the container status.
function all () {
    build_images
    start_infrastructure
    status
}


# Nothing to dispatch without arguments: show the usage text and stop.
if [[ $# -eq 0 ]] ; then
    help_menu
    exit 0
fi

# Dispatch on the first argument only. No `shift` is performed: nothing reads
# the positional parameters after the dispatch, and the former trailing `shift`
# failed under `set -e` whenever a single argument was given, making every
# one-word command exit with status 1.
case "$1" in
    start)
        start_infrastructure
        ;;
    stop)
        stop_infrastructure
        ;;
    build)
        build_images
        ;;
    rebuild)
        build_images --no-cache
        ;;
    -d|destroy)
        destroy
        ;;
    -l|logs)
        infrastructure_logs
        ;;
    status)
        status
        ;;
    -t|triage)
        # Quote the expansion so arguments containing spaces or glob
        # characters reach triage unchanged.
        triage "${@:2}"
        ;;
    bastion)
        bastion
        ;;
    # `all` and `help` are accepted as well because the usage text
    # advertises them; the original `--all` / `--help` spellings still work.
    -a|--all|all)
        all
        ;;
    -h|--help|help)
        help_menu
        ;;
    *)
        # Unknown command: report on stderr and signal failure to the caller.
        echo "${1} is not a valid flag, try running: ${0} --help" >&2
        exit 1
        ;;
esac

cd - > /dev/null