├── .gitignore
├── AUTHORS
├── README.org
├── aws_env.example
├── data
    └── .gitkeep
├── deploy.sh
├── docs
    ├── audition
    │   ├── eis
    │   │   ├── distance_from_best_precision@10_pct.png
    │   │   ├── metric_over_time_precision@10_pct.png
    │   │   ├── precision@10_pct_next_time.png
    │   │   ├── regret_distance_from_best_rules_precision@10_pct.png
    │   │   └── regret_over_time_precision@10_pct.png
    │   └── inspections
    │   │   ├── distance_from_best_precision@10_pct.png
    │   │   ├── metric_over_time_precision@10_pct.png
    │   │   ├── precision@10_pct_next_time.png
    │   │   ├── regret_distance_from_best_rules_precision@10_pct.png
    │   │   └── regret_over_time_precision@10_pct.png
    ├── css
    │   ├── htmlize.css
    │   ├── org-default.css
    │   ├── org.css
    │   ├── readtheorg.css
    │   └── rtd-full.css
    ├── eis_postmodeling_config.yaml
    ├── images
    │   ├── AWS_Batch_Architecture.png
    │   ├── AWS_Batch_Architecture.svg
    │   ├── data_road.png
    │   ├── eis.png
    │   ├── eis_jaccard_on_lists_over_time.png
    │   ├── eis_mg_prec_over_time.png
    │   ├── eis_mg_recall_over_time.png
    │   ├── eis_model_group_64_feature_group_importances.png
    │   ├── eis_model_group_64_feature_importances.png
    │   ├── eis_model_group_64_rayid_curve.png
    │   ├── facilities_inspected_over_time.png
    │   ├── facilities_with_failed_inspections_severe_violations_over_time.png
    │   ├── facilities_with_inspections_failed_over_time.png
    │   ├── failed_inspections_over_time.png
    │   ├── failed_inspections_severe_violations_over_time.png
    │   ├── inspection_jaccard_on_lists_over_time.png
    │   ├── inspection_mg_prec_over_time.png
    │   ├── inspection_mg_recall_over_time.png
    │   ├── inspection_model_group_11_feature_group_importances.png
    │   ├── inspection_model_group_11_feature_importances.png
    │   ├── inspection_model_group_11_rayid_curve.png
    │   ├── inspections.png
    │   ├── inspections_dt.png
    │   ├── inspections_over_time.png
    │   ├── model_7_tree_0.png
    │   ├── outcomes-eis.png
    │   ├── outcomes-inspections.png
    │   ├── rolling-origin.png
    │   ├── sanjose-2.png
    │   ├── simple_test_skeleton.png
    │   ├── timechop.png
    │   ├── timechop_1.png
    │   ├── timechop_1.svg
    │   ├── timechop_10.png
    │   ├── timechop_10.svg
    │   ├── timechop_2.png
    │   ├── timechop_2.svg
    │   ├── timechop_3.png
    │   ├── timechop_3.svg
    │   ├── timechop_4.png
    │   ├── timechop_4.svg
    │   ├── timechop_5.png
    │   ├── timechop_5.svg
    │   ├── timechop_6.png
    │   ├── timechop_6.svg
    │   ├── timechop_7.png
    │   ├── timechop_7.svg
    │   ├── timechop_8.png
    │   ├── timechop_8.svg
    │   ├── timechop_9.png
    │   ├── timechop_9.svg
    │   ├── timechop_example.png
    │   ├── timechop_inspections_test.png
    │   ├── timechop_withoutblocks.png
    │   └── timechop_withoutrows.png
    ├── index.html
    ├── index.md
    ├── js
    │   ├── jquery.stickytableheaders.min.js
    │   └── readtheorg.js
    ├── sql
    │   ├── create_cleaned_inspections_table.sql
    │   ├── create_semantic_tables.sql
    │   └── create_violations_table.sql
    └── triage
    │   ├── experiments
    │       ├── eis_01.yaml
    │       ├── inspections-training.yaml
    │       ├── inspections_baseline.yaml
    │       ├── inspections_dt.yaml
    │       ├── inspections_label_failed_01.yaml
    │       └── simple_test_skeleton.yaml
    │   └── images
    │       ├── distance_from_best_precision@10_pct.png
    │       ├── eis_01.png
    │       ├── inspections_baseline.png
    │       ├── inspections_dt.png
    │       ├── inspections_label_failed_01.png
    │       ├── metric_over_time_precision@10_pct.png
    │       ├── precision@10_pct_next_time.png
    │       ├── regret_distance_from_best_rules_precision@10_pct.png
    │       ├── regret_over_time_precision@10_pct.png
    │       └── simple_test_skeleton.png
├── infrastructure
    ├── aws_batch
    │   ├── credentials.filter.example
    │   ├── triage-job-definition.json.example
    │   └── triage-overrides.json.example
    ├── bastion
    │   ├── Dockerfile
    │   └── requirements.txt
    ├── docker-compose.yml
    ├── env_example
    ├── food_db
    │   ├── Dockerfile
    │   ├── activate_postgis.sql
    │   ├── create_extensions.sql
    │   ├── create_inspections_table.sql
    │   └── nuke_triage.sql
    ├── triage
    │   ├── Dockerfile
    │   ├── __init__.py
    │   ├── requirements.txt
    │   ├── setup.py
    │   ├── triage_experiment.py
    │   └── utils.py
    └── web
    │   ├── Dockerfile
    │   └── default.conf
├── org
    ├── 00_instructions.org
    ├── 01_intro.org
    ├── 02_infrastructure.org
    ├── 03_data_preparation.org
    ├── 04_triage_intro.org
    ├── 05_inspections.org
    ├── 06_eis.org
    ├── 07_quick_setup.org
    ├── 08_postmodeling.org
    ├── 09_aws_batch.org
    ├── 100_whats_next.org
    ├── audition
    ├── css
    │   ├── htmlize.css
    │   ├── org-default.css
    │   ├── org.css
    │   ├── readtheorg.css
    │   └── rtd-full.css
    ├── docker-kernel-connection.json
    ├── images
    │   ├── AWS_Batch_Architecture.png
    │   ├── AWS_Batch_Architecture.svg
    │   ├── data_road.png
    │   ├── eis.png
    │   ├── eis_jaccard_on_lists_over_time.png
    │   ├── eis_mg_prec_over_time.png
    │   ├── eis_mg_recall_over_time.png
    │   ├── eis_model_group_64_feature_group_importances.png
    │   ├── eis_model_group_64_feature_importances.png
    │   ├── eis_model_group_64_rayid_curve.png
    │   ├── facilities_inspected_over_time.png
    │   ├── facilities_with_failed_inspections_severe_violations_over_time.png
    │   ├── facilities_with_inspections_failed_over_time.png
    │   ├── failed_inspections_over_time.png
    │   ├── failed_inspections_severe_violations_over_time.png
    │   ├── inspection_jaccard_on_lists_over_time.png
    │   ├── inspection_mg_prec_over_time.png
    │   ├── inspection_mg_recall_over_time.png
    │   ├── inspection_model_group_11_feature_group_importances.png
    │   ├── inspection_model_group_11_feature_importances.png
    │   ├── inspection_model_group_11_rayid_curve.png
    │   ├── inspections.png
    │   ├── inspections_dt.png
    │   ├── inspections_over_time.png
    │   ├── model_7_tree_0.png
    │   ├── outcomes-eis.png
    │   ├── outcomes-inspections.png
    │   ├── rolling-origin.png
    │   ├── sanjose-2.png
    │   ├── simple_test_skeleton.png
    │   ├── timechop.png
    │   ├── timechop_1.png
    │   ├── timechop_1.svg
    │   ├── timechop_10.png
    │   ├── timechop_10.svg
    │   ├── timechop_2.png
    │   ├── timechop_2.svg
    │   ├── timechop_3.png
    │   ├── timechop_3.svg
    │   ├── timechop_4.png
    │   ├── timechop_4.svg
    │   ├── timechop_5.png
    │   ├── timechop_5.svg
    │   ├── timechop_6.png
    │   ├── timechop_6.svg
    │   ├── timechop_7.png
    │   ├── timechop_7.svg
    │   ├── timechop_8.png
    │   ├── timechop_8.svg
    │   ├── timechop_9.png
    │   ├── timechop_9.svg
    │   ├── timechop_example.png
    │   ├── timechop_inspections_test.png
    │   ├── timechop_withoutblocks.png
    │   └── timechop_withoutrows.png
    ├── index.org
    ├── js
    │   ├── jquery.stickytableheaders.min.js
    │   ├── readtheorg.js
    │   └── stickytableheaders-license.txt
    ├── publish.el
    ├── ref.bib
    ├── sql
    │   ├── create_cleaned_inspections_table.sql
    │   ├── create_semantic_tables.sql
    │   └── create_violations_table.sql
    ├── triage
    │   ├── experiments
    │   └── images
    └── tutorial.setup
├── requirements-dev.txt
├── requirements.txt
├── scratch.org
├── triage
    ├── .gitkeep
    ├── audition
    │   ├── eis
    │   │   ├── distance_from_best_precision@10_pct.png
    │   │   ├── metric_over_time_precision@10_pct.png
    │   │   ├── precision@10_pct_next_time.png
    │   │   ├── regret_distance_from_best_rules_precision@10_pct.png
    │   │   ├── regret_over_time_precision@10_pct.png
    │   │   └── results_model_group_ids.json
    │   └── inspections
    │   │   ├── distance_from_best_precision@10_pct.png
    │   │   ├── metric_over_time_precision@10_pct.png
    │   │   ├── precision@10_pct_next_time.png
    │   │   ├── regret_distance_from_best_rules_precision@10_pct.png
    │   │   ├── regret_over_time_precision@10_pct.png
    │   │   └── results_model_group_ids.json
    ├── eis_audition_config.yaml
    ├── eis_crosstabs_config.yaml
    ├── eis_postmodeling_config.yaml
    ├── experiments
    │   ├── eis_01.yaml
    │   ├── inspections-training.yaml
    │   ├── inspections_baseline.yaml
    │   ├── inspections_dt.yaml
    │   ├── inspections_label_failed_01.yaml
    │   └── simple_test_skeleton.yaml
    ├── inspection_audition_config.yaml
    ├── inspection_postmodeling_config.yaml
    ├── output
    │   ├── .gitkeep
    │   └── images
    │   │   ├── .gitkeep
    │   │   ├── eis.svg
    │   │   ├── inspections.svg
    │   │   ├── inspections_dt.svg
    │   │   ├── inspections_test.svg
    │   │   ├── model_7_tree_0.svg
    │   │   └── simple_test_skeleton.svg
    ├── selection_rules
    │   ├── .gitkeep
    │   └── rules.yaml
    └── session.key
└── tutorial.sh


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | env/
 12 | build/
 13 | develop-eggs/
 14 | dist/
 15 | downloads/
 16 | eggs/
 17 | .eggs/
 18 | lib/
 19 | lib64/
 20 | parts/
 21 | sdist/
 22 | var/
 23 | *.egg-info/
 24 | .installed.cfg
 25 | *.egg
 26 | 
 27 | # PyInstaller
 28 | #  Usually these files are written by a python script from a template
 29 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 30 | *.manifest
 31 | *.spec
 32 | 
 33 | # Installer logs
 34 | pip-log.txt
 35 | pip-delete-this-directory.txt
 36 | 
 37 | # Unit test / coverage reports
 38 | htmlcov/
 39 | .tox/
 40 | .coverage
 41 | .coverage.*
 42 | .cache
 43 | nosetests.xml
 44 | coverage.xml
 45 | *,cover
 46 | .hypothesis/
 47 | 
 48 | # Translations
 49 | *.mo
 50 | *.pot
 51 | 
 52 | # Django stuff:
 53 | *.log
 54 | local_settings.py
 55 | 
 56 | # Flask stuff:
 57 | instance/
 58 | .webassets-cache
 59 | 
 60 | # Scrapy stuff:
 61 | .scrapy
 62 | 
 63 | # Sphinx documentation
 64 | docs/_build/
 65 | 
 66 | # PyBuilder
 67 | target/
 68 | 
 69 | # IPython Notebook
 70 | .ipynb_checkpoints
 71 | 
 72 | # pyenv
 73 | .python-version
 74 | 
 75 | # celery beat schedule file
 76 | celerybeat-schedule
 77 | 
 78 | # dotenv
 79 | .env*
 80 | 
 81 | # triage database config
 82 | triage/database.yaml
 83 | 
 84 | # virtualenv
 85 | venv/
 86 | ENV/
 87 | 
 88 | # Spyder project settings
 89 | .spyderproject
 90 | 
 91 | # Rope project settings
 92 | .ropeproject
 93 | 
 94 | 
 95 | # Porquerías de MacOS
 96 | .DS_Store
 97 | 
 98 | # Documentación de Sphinx
 99 | docs/_build
100 | 
101 | # Los datos no se suben a git
102 | **/data/
103 | 
104 | # Bases de datos
105 | *.db
106 | 
107 | # Logs de instalación
108 | pip-log.txt
109 | pip-delete-this-directory.txt
110 | 
111 | # Pruebas unitarias / Coverage
112 | htmlcov/
113 | .tox/
114 | .coverage
115 | .coverage.*
116 | .cache
117 | nosetests.xml
118 | coverage.xml
119 | *,cover
120 | 
121 | # Documentación de Mkdocs
122 | site/
123 | 
124 | # Archivos de datos
125 | *.xlsx
126 | *.dat
127 | *.csv
128 | *.tsv
129 | *.psv
130 | *.sqlite
131 | *.doc
132 | *.docx
133 | *.odt
134 | *.ods
135 | *.xls*
136 | *.pdf
137 | *.ppt*
138 | *.sqlite
139 | *.pkl
140 | 
141 | # De la construcción de imágenes y contenedores
142 | **/.built*
143 | **/*_built*
144 | **/.data_built*
145 | **/.infrastructure_built*
146 | **/.network_built*
147 | **/.running*
148 | 
149 | # Basura de Emacs
150 | **/.#*
151 | 
152 | # VIM
153 | *.swp
154 | 
155 | # From the makefiles
156 | **/*.built
157 | **/*.pushed
158 | 
159 | **/*_SUCCESS
160 | 
161 | **/*development*
162 | **/*staging*
163 | 
164 | **/triage-generated/*
165 | **/matrices/*
166 | **/trained_models/*
167 | triage/images/*
168 | 
169 | **/profiling_stats/*
170 | 
171 | **/ltximg/*
172 | .aws_env
173 | infrastructure/**/*.json
174 | infrastructure/**/*.filter
175 | 


--------------------------------------------------------------------------------
/AUTHORS:
--------------------------------------------------------------------------------
1 | Adolfo De Unanue <adolfo@uchicago.edu>
2 | Joseph Walsh     <jtwalsh@uchicago.edu>
3 | Hans Koening
4 | Arthi Ramachandran <aramachandran1@medicine.bsd.uchicago.edu>
5 | Iván Higuera
6 | Kit Rodolfa
7 | 


--------------------------------------------------------------------------------
/README.org:
--------------------------------------------------------------------------------
  1 | #+TITLE: Dirty Duck: A Guided Tour of Triage
  2 | #+AUTHOR: Center for Data Science and Public Policy
  3 | #+EMAIL: adolfo@uchicago.edu
  4 | #+STARTUP: showeverything
  5 | #+STARTUP: nohideblocks
  6 | #+STARTUP: Indent
  7 | 
  8 | 
  9 | 
 10 | * THIS REPOSITORY HAS BEEN ARCHIVED. DIRTYDUCK IS NOW PART OF TRIAGE. PLEASE GO [[https://github.com/dssg/triage][HERE]] TO CONTINUE WITH THE SOURCE CODE OR [[https://dssg.github.io/triage/dirtyduck/docs/][HERE]] TO CONTINUE WITH THE TUTORIAL
 11 | 
 12 | 
 13 | 
 14 | * Welcome!
 15 | 
 16 | This tutorial will show you how to use =triage=, a data science
 17 |  modeling tool developed at the [[http://dsapp.uchicago.edu][Center for Data Science and Public
 18 |  Policy]] (DSaPP) at the University of Chicago.
 19 | 
 20 | =triage= helps build models for three [[https://dssg.uchicago.edu/data-science-for-social-good-conference-2017/training-workshop-data-science-for-social-good-problem-templates/][common applied problems]]: (a) Early
 21 | warning systems (*EWS* or *EIS*), (b) /resource prioritization/ (a.k.a "an
 22 | inspections problem") and (c) interaction level predictions (a.k.a
 23 | "appointment level"). These problems
 24 | are difficult to model because their conceptualization and
 25 | implementation are prone to error, thanks to their multi-dimensional,
 26 | multi-entity, time-series structure.
 27 | 
 28 | The latest version of this tutorial is published at [[https://dssg.github.io/dirtyduck/]]
 29 | 
 30 | *NOTE* This tutorial is in sync with the latest version of =triage=,
 31 | currently [[https://github.com/dssg/triage/releases/tag/v3.3.0][v3.3.0 (Arepa)]].
 32 | 
 33 | * What you need for this tutorial
 34 | 
 35 | Install [[http://www.docker.com][Docker CE]] and [[https://docs.docker.com/compose/][Docker Compose]]. That's it.
 36 | Follow the links for installation instructions.
 37 | 
 38 | Note that if you are using =GNU/Linux= you should add your user to the
 39 | =docker= group following the instructions at this [[https://docs.docker.com/install/linux/linux-postinstall/][link]].
 40 | 
 41 | At the moment only operating systems with *nix-type command lines are
 42 | supported, such as =GNU/Linux= and =MacOS=. Recent versions of
 43 | =Windows= may also work.
 44 | 
 45 | * How to use this tutorial
 46 | 
 47 | First, clone this repository on your laptop
 48 | 
 49 | #+BEGIN_EXAMPLE
 50 |  git clone https://github.com/dssg/dirtyduck.git
 51 | #+END_EXAMPLE
 52 | 
 53 | Second, run
 54 | 
 55 | #+BEGIN_EXAMPLE
 56 | ./tutorial.sh start
 57 | #+END_EXAMPLE
 58 | 
 59 | This will take several minutes the first time you do it.
 60 | 
 61 | * How you can help
 62 | 
 63 | Help is always welcome! You can report errors, improve
 64 | the tutorial, or propose improvements to
 65 | =triage=. These three cases are discussed below.
 66 | 
 67 | ** How to report errors
 68 | 
 69 | There are almost surely errors. Please open an [[https://github.com/dssg/dirtyduck/issues][issue]] and
 70 | we will try to issue a fix as soon as possible.
 71 | 
 72 | ** How to improve the tutorial
 73 | 
 74 | This tutorial was created following the practices of [[https://www-cs-faculty.stanford.edu/~knuth/lp.html][Literate
 75 | Programming]] using [[https://orgmode.org/][org-mode]][fn:1] in [[https://www.gnu.org/software/emacs/][GNU Emacs]][fn:2]. That means the tutorial is a /live/ document that mixes code and text.
 76 | 
 77 | The steps to help are:
 78 | 
 79 | - clone the repository
 80 | - edit the source =org= files in the =org= folder
 81 | - From your terminal run
 82 | 
 83 | #+BEGIN_SRC shell
 84 | emacs  --batch -l org/publish.el org/index.org --eval '(org-publish "dirtyduck" t)'
 85 | #+END_SRC
 86 | 
 87 | #+RESULTS:
 88 | 
 89 | - create a *pull request*.
 90 | 
 91 | 
 92 | ** How to help develop =triage=
 93 | 
 94 | Go to the [[https://github.com/dssg/triage][triage]] repository and follow the instructions there.
 95 | 
 96 | * Footnotes
 97 | 
 98 | [fn:2] But it is supported in =vim= if you install a plugin.
 99 | 
100 | [fn:1] It's similar to =markdown= so you won't have any problem.
101 | 


--------------------------------------------------------------------------------
/aws_env.example:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | # Settings template; deploy.sh sources a file named .aws_env — presumably a filled-in copy of this one (confirm)
 3 | PROJECT_NAME=dirtyduck  # used by deploy.sh in the image name dsapp/<PROJECT_NAME>/<task>
 4 | TRIAGE_VERSION=3.3.0  # used by deploy.sh as the version tag of the pushed images
 5 | ENV=development  # only printed by the info report of deploy.sh
 6 | AWS_REGISTRY={your-ecr-registry}  # placeholder: registry prefix deploy.sh tags and pushes images to
 7 | AWS_JOB_QUEUE={your-job-queue}  # placeholder: passed by deploy.sh to `aws batch submit-job --job-queue`
 8 | POSTGRES_DB={postgresql://user:password@db_server/dbname}  # placeholder: only printed by the info report of deploy.sh
 9 | S3_BUCKET={your-bucket}  # placeholder: bucket whose experiments/ prefix deploy.sh syncs with triage/experiments/
10 | 


--------------------------------------------------------------------------------
/data/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/data/.gitkeep


--------------------------------------------------------------------------------
/deploy.sh:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env bash
  2 | 
  3 | source .aws_env
  4 | 
  5 | # Exit the script as soon as something fails (-e) or if a variable is not defined (-u)
  6 | set -e -u
  7 | 
  8 | function info () {
  9 | 	echo "##############################################"
 10 | 	echo "#                                            #"
 11 | 	echo "#     Project: ${PROJECT_NAME}                    #"
 12 |     echo "#     Triage ver. ${TRIAGE_VERSION}                      #"
 13 | 	echo "#                                            #"
 14 | 	echo "##############################################"
 15 | 	echo "Environment: ${ENV}"
 16 | 	echo "ECR Registry: ${AWS_REGISTRY}"
 17 | 	echo "BATCH JOB QUEUE: ${AWS_JOB_QUEUE}"
 18 |     echo "DB: ${POSTGRES_DB}"
 19 | 	echo "S3 Bucket: ${S3_BUCKET}"
 20 | 	python --version
 21 | 	pyenv --version
 22 | 	pip --version
 23 | }
 24 | 
 25 | function sync_to_s3 () {
 26 | 
 27 |     echo "##############################################"
 28 | 	echo "#                                            #"
 29 | 	echo "#  Uploading changes to s3://${S3_BUCKET} "
 30 |     echo "#                                            #"
 31 | 	echo "##############################################"
 32 | 
 33 | 	aws s3 sync triage/experiments/ s3://${S3_BUCKET}/experiments
 34 | }
 35 | 
 36 | function sync_from_s3 () {
 37 | 
 38 |     echo "##############################################"
 39 | 	echo "#                                            #"
 40 | 	echo "#  Getting changes from s3://${S3_BUCKET} "
 41 |     echo "#                                            #"
 42 | 	echo "##############################################"
 43 | 
 44 | 	aws s3 sync s3://${S3_BUCKET}/experiments/ triage/experiments/
 45 | }
 46 | 
 47 | function update_jobs () {
 48 | 	echo "Updating the job definition of the following tasks: ${PROJECT_NAME}"
 49 | 
 50 | 	echo "+----------------------------------------+"
 51 | 	echo "|                                        |"
 52 | 	echo "| Updating  ${PROJECT_NAME} job definition"
 53 | 	echo "|                                        |"
 54 | 	echo "+----------------------------------------+"
 55 | 
 56 | 	aws batch register-job-definition --cli-input-json file://infrastructure/aws_batch/triage-job-definition.json
 57 | }
 58 | 
 59 | function update_images () {
 60 |     echo "Updating images related to this project"
 61 | 
 62 |     tasks=triage
 63 | 
 64 | 	echo "Updating the image of the following tasks: ${tasks}"
 65 | 
 66 | 	for task in ${tasks}
 67 | 	do
 68 | 		echo "+----------------------------------------+"
 69 | 		echo "|                                        |"
 70 | 		echo "| Updating ${task} image"
 71 | 		echo "|                                        |"
 72 | 		echo "+----------------------------------------+"
 73 | 		docker build --no-cache --tag dsapp/${PROJECT_NAME}/${task} infrastructure/${task}
 74 |         docker tag dsapp/${PROJECT_NAME}/${task} ${AWS_REGISTRY}/dsapp/${PROJECT_NAME}/${task}:${TRIAGE_VERSION}
 75 | 		docker tag dsapp/${PROJECT_NAME}/${task} ${AWS_REGISTRY}/dsapp/${PROJECT_NAME}/${task}:latest
 76 | 
 77 | 		eval "$(aws ecr get-login --no-include-email --region us-west-2)"
 78 | 
 79 | 		docker push "${AWS_REGISTRY}"/dsapp/"${PROJECT_NAME}"/${task}:"${TRIAGE_VERSION}"
 80 | 		docker push "${AWS_REGISTRY}"/dsapp/"${PROJECT_NAME}"/${task}:latest
 81 | 	done
 82 | 
 83 | }
 84 | 
 85 | function update_triage_cli_image () {
 86 |     tasks=triage-cli
 87 | 
 88 | 	echo "Updating the image of the following tasks: ${tasks}"
 89 | 
 90 | 	for task in ${tasks}
 91 | 	do
 92 | 		echo "+----------------------------------------+"
 93 | 		echo "|                                        |"
 94 | 		echo "| Updating ${task} image"
 95 | 		echo "|                                        |"
 96 | 		echo "+----------------------------------------+"
 97 | 		docker build --no-cache --tag dsapp/${task} infrastructure/triage
 98 |         docker tag dsapp/${task} ${AWS_REGISTRY}/dsapp/${task}:${TRIAGE_VERSION}
 99 | 		docker tag dsapp/${task} ${AWS_REGISTRY}/dsapp/${task}:latest
100 | 
101 | 		eval "$(aws ecr get-login --no-include-email --region us-west-2)"
102 | 
103 | 		docker push "${AWS_REGISTRY}"/dsapp/${task}:"${TRIAGE_VERSION}"
104 | 		docker push "${AWS_REGISTRY}"/dsapp/${task}:latest
105 | 	done
106 | 
107 | }
108 | 
109 | 
110 | 
#######################################
# Submit a triage experiment to AWS Batch with temporary session credentials
# merged into the container overrides.
# Globals:   AWS_JOB_QUEUE (read)
# Arguments: $1  - job name
#            $2  - path of the JSON file holding the container overrides
#            $3  - value handed to `aws batch submit-job --parameters`
#            $4… - optional extra words; each one is appended, as a single
#                  element, to the `.command` array of the overrides
# Outputs:   progress messages on stdout, plus the aws output
# Returns:   2 when fewer than three arguments are given; otherwise non-zero
#            as soon as an aws or jq step fails
#######################################
function run_experiment () {
  if (( $# < 3 )); then
    echo "usage: run_experiment JOB_NAME OVERRIDES_FILE PARAMETERS [COMMAND_ARG...]" >&2
    return 2
  fi

  local job_name=$1
  echo "Running job ${job_name}"

  local environment_overrides=$2
  echo "Using environment_overrides: ${environment_overrides}"

  local parameters=$3
  echo "Using parameters: ${parameters}"

  local session creds overrides cmd

  # Retrieve temporary session credentials for current user
  session=$(aws sts get-session-token --duration-seconds 129600) || return  # 36 h

  # Restructure these to mirror pipeline overrides
  creds=$(jq -f infrastructure/aws_batch/credentials.filter <<< "${session}") || return

  # Merge these AWS session credentials into *all* pipeline overrides
  overrides=$(
    jq --arg creds "${creds}" \
      '.environment += ($creds|fromjson|.environment)' \
      < "${environment_overrides}"
  ) || return

  if (( $# > 3 )); then
    echo "Adding ${*:4} to the command"

    # Iterate over the positional parameters themselves so that an argument
    # containing spaces stays one command element, and feed the JSON through a
    # here-string so it is never word-split or glob-expanded by the shell.
    for cmd in "${@:4}"; do
      overrides=$(jq --arg cmds "${cmd}" '.command |= .+ [$cmds]' <<< "${overrides}") || return
    done
  fi

  aws batch submit-job --job-queue "${AWS_JOB_QUEUE}" \
    --job-name "${job_name}" \
    --job-definition triage-cli-experiment \
    --container-overrides "${overrides}" \
    --parameters "${parameters}"
}
156 | 
#######################################
# Run an experiment with the default container overrides file.
# Arguments: $1  - job name
#            $2… - forwarded unchanged to run_experiment (parameters, then any
#                  extra command words)
# Returns:   the exit status of run_experiment
#######################################
function run() {
  # "$1" is quoted so a job name can never be split into several arguments.
  run_experiment "$1" infrastructure/aws_batch/triage-overrides.json "${@:2}"
}
160 | 
161 | 
#######################################
# Print the command line help.
# The text mirrors the flags handled by the dispatcher at the bottom of the
# script: -b is documented, -t is described as building the triage-cli image,
# the sync flags are spelled with hyphens, and the -r example carries the job
# name that run_experiment expects as its first argument.
# Globals:   PROJECT_NAME, S3_BUCKET (read)
# Arguments: none
# Outputs:   the help text, on stdout
#######################################
function help_menu () {
cat << EOF
Usage: ${0} (-h | -i | -b | -t | -u | -r | --sync-{to,from}-s3 )
OPTIONS:
   -h|--help                   Show this message
   -i|--info                   Show information about the environment
   -b|--update-images          Build the ${PROJECT_NAME}'s triage image and push it to the AWS ECR
   -t|--update-triage-image    Build the triage-cli image and push it to the AWS ECR
   -u|--update-jobs            Update the ${PROJECT_NAME}'s triage job definition in AWS Batch
   -r|--run-experiment         Run experiments on ${PROJECT_NAME} data (needs a job name and the job parameters)
   --sync-to-s3                Uploads the experiments and configuration files to ${S3_BUCKET}
   --sync-from-s3              Gets the experiments and configuration files from ${S3_BUCKET}
EXAMPLES:
   Build and push the images to your AWS ECR:
        $ ./deploy.sh -b
   Update the job's definitions:
        $ ./deploy.sh -u
   Sync your experiment config files:
        $ ./deploy.sh --sync-to-s3
   Run triage experiments:
        $ ./deploy.sh -r my-experiment --experiment_file=s3://${S3_BUCKET}/experiments/test.yaml,output_path=s3://${S3_BUCKET}/triage,replace=--replace

EOF
}
185 | 
# With no arguments there is nothing to do: show the usage and stop.
if [[ $# -eq 0 ]]; then
  help_menu
  exit 0
fi

# Dispatch on the first argument only; the remaining ones belong to the
# selected action (currently only -r takes any).
#
# No `shift` is done here. Shifting in every arm and once more after `esac`
# made the last shift fail on an empty argument list, so under `set -e` every
# successful single-flag run ended with exit status 1, while an unknown flag
# ended with 0.
case "${1}" in
  -b|--update-images)
    update_images
    ;;
  -t|--update-triage-image)
    update_triage_cli_image
    ;;
  -u|--update-jobs)
    update_jobs
    ;;
  -r|--run-experiment)
    if [[ $# -lt 3 ]]; then
      echo "${1} needs a job name and the job parameters, try running: ${0} --help" >&2
      exit 1
    fi
    # Quoted so every argument reaches the job exactly as it was typed.
    run "${@:2}"
    ;;
  -a|--all)
    # No `all` function is defined in this script; call one only if the
    # sourced settings happen to provide it.
    if declare -F all > /dev/null; then
      all
    else
      echo "${1} is not implemented, try running: ${0} --help" >&2
      exit 1
    fi
    ;;
  -i|--info)
    info
    ;;
  --sync-from-s3)
    sync_from_s3
    ;;
  --sync-to-s3)
    sync_to_s3
    ;;
  -h|--help)
    help_menu
    ;;
  *)
    echo "${1} is not a valid flag, try running: ${0} --help" >&2
    exit 1
    ;;
esac
234 | 


--------------------------------------------------------------------------------
/docs/audition/eis/distance_from_best_precision@10_pct.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/audition/eis/distance_from_best_precision@10_pct.png


--------------------------------------------------------------------------------
/docs/audition/eis/metric_over_time_precision@10_pct.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/audition/eis/metric_over_time_precision@10_pct.png


--------------------------------------------------------------------------------
/docs/audition/eis/precision@10_pct_next_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/audition/eis/precision@10_pct_next_time.png


--------------------------------------------------------------------------------
/docs/audition/eis/regret_distance_from_best_rules_precision@10_pct.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/audition/eis/regret_distance_from_best_rules_precision@10_pct.png


--------------------------------------------------------------------------------
/docs/audition/eis/regret_over_time_precision@10_pct.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/audition/eis/regret_over_time_precision@10_pct.png


--------------------------------------------------------------------------------
/docs/audition/inspections/distance_from_best_precision@10_pct.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/audition/inspections/distance_from_best_precision@10_pct.png


--------------------------------------------------------------------------------
/docs/audition/inspections/metric_over_time_precision@10_pct.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/audition/inspections/metric_over_time_precision@10_pct.png


--------------------------------------------------------------------------------
/docs/audition/inspections/precision@10_pct_next_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/audition/inspections/precision@10_pct_next_time.png


--------------------------------------------------------------------------------
/docs/audition/inspections/regret_distance_from_best_rules_precision@10_pct.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/audition/inspections/regret_distance_from_best_rules_precision@10_pct.png


--------------------------------------------------------------------------------
/docs/audition/inspections/regret_over_time_precision@10_pct.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/audition/inspections/regret_over_time_precision@10_pct.png


--------------------------------------------------------------------------------
/docs/css/org-default.css:
--------------------------------------------------------------------------------
1 | .org-bold{font-weight:700}.org-bold-italic{font-weight:700;font-style:italic}.org-buffer-menu-buffer{font-weight:700}.org-builtin{color:#483d8b}.org-button{color:#3a5fcd;text-decoration:underline}.org-calendar-month-header{color:#00f}.org-calendar-today{text-decoration:underline}.org-calendar-weekday-header{color:#008b8b}.org-calendar-weekend-header{color:#b22222}.org-comint-highlight-input{font-weight:700}.org-comint-highlight-prompt{color:#0000cd}.org-comment,.org-comment-delimiter{color:#b22222}.org-constant{color:#008b8b}.org-diary{color:red}.org-doc{color:#8b2252}.org-error{color:red;font-weight:700}.org-escape-glyph{color:brown}.org-file-name-shadow{color:#7f7f7f}.org-fringe{background-color:#f2f2f2}.org-function-name{color:#00f}.org-glyphless-char{font-size:60%}.org-header-line{color:#333;background-color:#e5e5e5}.org-help-argument-name{font-style:italic}.org-highlight{background-color:#b4eeb4}.org-holiday{background-color:pink}.org-info-header-node{color:brown;font-weight:700;font-style:italic}.org-info-header-xref{color:#3a5fcd;text-decoration:underline}.org-info-index-match{background-color:#ff0}.org-info-menu-header{font-weight:700}.org-info-menu-star{color:red}.org-info-node{color:brown;font-weight:700;font-style:italic}.org-info-title-1{font-size:172%;font-weight:700}.org-info-title-2{font-size:144%;font-weight:700}.org-info-title-3{font-size:120%;font-weight:700}.org-info-title-4{font-weight:700}.org-info-xref{color:#3a5fcd;text-decoration:underline}.org-italic{font-style:italic}.org-keyword{color:#a020f0}.org-lazy-highlight{background-color:#afeeee}.org-link{color:#3a5fcd;text-decoration:underline}.org-link-visited{color:#8b008b;text-decoration:underline}.org-makefile-makepp-perl{background-color:#bfefff}.org-makefile-space{background-color:#ff69b4}.org-makefile-targets{color:#00f}.org-match{background-color:#ff0}.org-next-error{background-color:gtk_selection_bg_color}.org-nobreak-space{color:brown;text-decoration:underline}.org-org-agenda-calenda
r-event,.org-org-agenda-calendar-sexp{color:#000;background-color:#fff}.org-org-agenda-clocking{background-color:#ff0}.org-org-agenda-column-dateline{background-color:#e5e5e5}.org-org-agenda-current-time{color:#b8860b}.org-org-agenda-date{color:#00f}.org-org-agenda-date-today{color:#00f;font-weight:700;font-style:italic}.org-org-agenda-date-weekend{color:#00f;font-weight:700}.org-org-agenda-diary{color:#000;background-color:#fff}.org-org-agenda-dimmed-todo{color:#7f7f7f}.org-org-agenda-done{color:#228b22}.org-org-agenda-filter-category,.org-org-agenda-filter-effort,.org-org-agenda-filter-regexp,.org-org-agenda-filter-tags{color:#000;background-color:#bfbfbf}.org-org-agenda-restriction-lock{background-color:#eee}.org-org-agenda-structure{color:#00f}.org-org-archived,.org-org-block{color:#7f7f7f}.org-org-block-begin-line,.org-org-block-end-line{color:#b22222}.org-org-checkbox{font-weight:700}.org-org-checkbox-statistics-done{color:#228b22;font-weight:700}.org-org-checkbox-statistics-todo{color:red;font-weight:700}.org-org-clock-overlay{color:#000;background-color:#d3d3d3}.org-org-code{color:#7f7f7f}.org-org-column,.org-org-column-title{background-color:#e5e5e5}.org-org-column-title{font-weight:700;text-decoration:underline}.org-org-date{color:#a020f0;text-decoration:underline}.org-org-date-selected{color:red}.org-org-default{color:#000;background-color:#fff}.org-org-document-info{color:#191970}.org-org-document-info-keyword{color:#7f7f7f}.org-org-document-title{color:#191970;font-weight:700}.org-org-done{color:#228b22;font-weight:700}.org-org-drawer{color:#00f}.org-org-ellipsis{color:#b8860b;text-decoration:underline}.org-org-footnote{color:#a020f0;text-decoration:underline}.org-org-formula{color:#b22222}.org-org-headline-done{color:#bc8f8f}.org-org-hide{color:#fff}.org-org-latex-and-related{color:#8b4513}.org-org-level-1{color:#00f}.org-org-level-2{color:sienna}.org-org-level-3{color:#a020f0}.org-org-level-4{color:#b22222}.org-org-level-5{color:#228b22}.org-org-level
-6{color:#008b8b}.org-org-level-7{color:#483d8b}.org-org-level-8{color:#8b2252}.org-org-link{color:#3a5fcd;text-decoration:underline}.org-org-list-dt{font-weight:700}.org-org-macro{color:#8b4513}.org-org-meta-line{color:#b22222}.org-org-mode-line-clock{color:#000;background-color:#bfbfbf}.org-org-mode-line-clock-overrun{color:#000;background-color:red}.org-org-priority{color:#a020f0}.org-org-quote{color:#7f7f7f}.org-org-scheduled{color:#006400}.org-org-scheduled-previously{color:#b22222}.org-org-scheduled-today{color:#006400}.org-org-sexp-date,.org-org-special-keyword{color:#a020f0}.org-org-table{color:#00f}.org-org-tag,.org-org-tag-group{font-weight:700}.org-org-target{text-decoration:underline}.org-org-time-grid{color:#b8860b}.org-org-todo{color:red;font-weight:700}.org-org-upcoming-deadline{color:#b22222}.org-org-verbatim,.org-org-verse{color:#7f7f7f}.org-org-warning{color:red;font-weight:700}.org-outline-1{color:#00f}.org-outline-2{color:sienna}.org-outline-3{color:#a020f0}.org-outline-4{color:#b22222}.org-outline-5{color:#228b22}.org-outline-6{color:#008b8b}.org-outline-7{color:#483d8b}.org-outline-8{color:#8b2252}.org-preprocessor{color:#483d8b}.org-regexp-grouping-backslash,.org-regexp-grouping-construct{font-weight:700}.org-region{background-color:gtk_selection_bg_color}.org-secondary-selection{background-color:#ff0}.org-shadow{color:#7f7f7f}.org-show-paren-match{background-color:#40e0d0}.org-show-paren-mismatch{color:#fff;background-color:#a020f0}.org-string{color:#8b2252}.org-success{color:#228b22;font-weight:700}.org-table-cell{color:#e5e5e5;background-color:#00f}.org-tooltip{color:#000;background-color:#ffffe0}.org-trailing-whitespace{background-color:red}.org-type{color:#228b22}.org-underline{text-decoration:underline}.org-variable-name{color:sienna}.org-warning{color:#ff8c00;font-weight:700}.org-warning-1{color:red;font-weight:700}.title{margin-bottom:.2em}.subtitle,.title{text-align:center}.subtitle{font-size:medium;font-weight:700;margin-top:0}.todo{
color:red}.done,.todo{font-family:monospace}.done{color:green}.priority{color:orange}.priority,.tag{font-family:monospace}.tag{background-color:#eee;font-size:80%;font-weight:400;padding:2px}.timestamp{color:#bebebe}.timestamp-kwd{color:#5f9ea0}.org-right{margin-left:auto;margin-right:0;text-align:right}.org-left{margin-left:0;margin-right:auto;text-align:left}.org-center{margin-left:auto;margin-right:auto;text-align:center}.underline{text-decoration:underline}#postamble p,#preamble p{font-size:90%;margin:.2em}p.verse{margin-left:3%}pre{border:1px solid #ccc;box-shadow:3px 3px 3px #eee;font-family:monospace;margin:1.2em;overflow:auto;padding:8pt}pre.src{overflow:visible;padding-top:1.2em;position:relative}pre.src:before{background-color:#fff;border:1px solid #000;display:none;padding:3px;position:absolute;right:10px;top:-10px}pre.src:hover:before{display:inline}pre.src-bash:before,pre.src-sh:before{content:"sh"}pre.src-emacs-lisp:before{content:"Emacs Lisp"}pre.src-R:before{content:"R"}pre.src-perl:before{content:"Perl"}pre.src-java:before{content:"Java"}pre.src-sql:before{content:"SQL"}table{border-collapse:collapse}caption.t-above{caption-side:top}caption.t-bottom{caption-side:bottom}td,th{vertical-align:top}th.org-center,th.org-left,th.org-right{text-align:center}td.org-right{text-align:right}td.org-left{text-align:left}td.org-center{text-align:center}dt{font-weight:700}.footpara{display:inline}.footdef{margin-bottom:1em}.figure{padding:1em}.figure p{text-align:center}.inlinetask{background:#ffc;border:2px solid gray;margin:10px;padding:10px}#org-div-home-and-up{font-size:70%;text-align:right;white-space:nowrap}textarea{overflow-x:auto}.linenr{font-size:smaller}.code-highlighted{background-color:#ff0}.org-info-js_info-navigation{border-style:none}#org-info-js_console-label{font-size:10px;font-weight:700;white-space:nowrap}.org-info-js_search-highlight{background-color:#ff0;color:#000;font-weight:700}
2 | 


--------------------------------------------------------------------------------
/docs/eis_postmodeling_config.yaml:
--------------------------------------------------------------------------------
 1 | # Postmodeling Configuration File
 2 | 
 3 |   project_path: '/triage' # Project path defined in triage with matrices and models
 4 |   model_group_id: # List of model group ids [optional if an audition_output_path is given]
 5 |         - 40
 6 |         - 7
 7 |         - 156
 8 | 
 9 |   thresholds: # Thresholds for defining positive predictions
10 |         rank_abs: [50, 100, 250]
11 |         rank_pct: [5, 10, 25]
12 | 
13 |   baseline_query: | # SQL query for defining a baseline for comparison in plots. It needs a metric and parameter
14 |       SELECT g.model_group_id,
15 |              m.model_id,
16 |              EXTRACT('YEAR' FROM m.evaluation_end_time) AS as_of_date_year,
17 |              m.metric,
18 |              m.parameter,
19 |              m.value,
20 |              m.num_labeled_examples,
21 |              m.num_labeled_above_threshold,
22 |              m.num_positive_labels
23 |        FROM test_results.evaluations m
24 |        LEFT JOIN model_metadata.models g
25 |        USING(model_id)
26 |        WHERE g.model_group_id = 1
27 |              AND metric = 'precision@'
28 |              AND parameter = '10_pct'
29 | 
30 |   max_depth_error_tree: 5 # For error trees, how deep the decision trees should go
31 |   n_features_plots: 10 # Number of features for importances
32 |   figsize: [12, 12] # Default size for plots
33 |   fontsize: 20 # Default fontsize for plots
34 | 


--------------------------------------------------------------------------------
/docs/images/AWS_Batch_Architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/AWS_Batch_Architecture.png


--------------------------------------------------------------------------------
/docs/images/data_road.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/data_road.png


--------------------------------------------------------------------------------
/docs/images/eis.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/eis.png


--------------------------------------------------------------------------------
/docs/images/eis_jaccard_on_lists_over_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/eis_jaccard_on_lists_over_time.png


--------------------------------------------------------------------------------
/docs/images/eis_mg_prec_over_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/eis_mg_prec_over_time.png


--------------------------------------------------------------------------------
/docs/images/eis_mg_recall_over_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/eis_mg_recall_over_time.png


--------------------------------------------------------------------------------
/docs/images/eis_model_group_64_feature_group_importances.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/eis_model_group_64_feature_group_importances.png


--------------------------------------------------------------------------------
/docs/images/eis_model_group_64_feature_importances.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/eis_model_group_64_feature_importances.png


--------------------------------------------------------------------------------
/docs/images/eis_model_group_64_rayid_curve.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/eis_model_group_64_rayid_curve.png


--------------------------------------------------------------------------------
/docs/images/facilities_inspected_over_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/facilities_inspected_over_time.png


--------------------------------------------------------------------------------
/docs/images/facilities_with_failed_inspections_severe_violations_over_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/facilities_with_failed_inspections_severe_violations_over_time.png


--------------------------------------------------------------------------------
/docs/images/facilities_with_inspections_failed_over_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/facilities_with_inspections_failed_over_time.png


--------------------------------------------------------------------------------
/docs/images/failed_inspections_over_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/failed_inspections_over_time.png


--------------------------------------------------------------------------------
/docs/images/failed_inspections_severe_violations_over_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/failed_inspections_severe_violations_over_time.png


--------------------------------------------------------------------------------
/docs/images/inspection_jaccard_on_lists_over_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/inspection_jaccard_on_lists_over_time.png


--------------------------------------------------------------------------------
/docs/images/inspection_mg_prec_over_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/inspection_mg_prec_over_time.png


--------------------------------------------------------------------------------
/docs/images/inspection_mg_recall_over_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/inspection_mg_recall_over_time.png


--------------------------------------------------------------------------------
/docs/images/inspection_model_group_11_feature_group_importances.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/inspection_model_group_11_feature_group_importances.png


--------------------------------------------------------------------------------
/docs/images/inspection_model_group_11_feature_importances.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/inspection_model_group_11_feature_importances.png


--------------------------------------------------------------------------------
/docs/images/inspection_model_group_11_rayid_curve.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/inspection_model_group_11_rayid_curve.png


--------------------------------------------------------------------------------
/docs/images/inspections.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/inspections.png


--------------------------------------------------------------------------------
/docs/images/inspections_dt.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/inspections_dt.png


--------------------------------------------------------------------------------
/docs/images/inspections_over_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/inspections_over_time.png


--------------------------------------------------------------------------------
/docs/images/model_7_tree_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/model_7_tree_0.png


--------------------------------------------------------------------------------
/docs/images/outcomes-eis.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/outcomes-eis.png


--------------------------------------------------------------------------------
/docs/images/outcomes-inspections.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/outcomes-inspections.png


--------------------------------------------------------------------------------
/docs/images/rolling-origin.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/rolling-origin.png


--------------------------------------------------------------------------------
/docs/images/sanjose-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/sanjose-2.png


--------------------------------------------------------------------------------
/docs/images/simple_test_skeleton.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/simple_test_skeleton.png


--------------------------------------------------------------------------------
/docs/images/timechop.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/timechop.png


--------------------------------------------------------------------------------
/docs/images/timechop_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/timechop_1.png


--------------------------------------------------------------------------------
/docs/images/timechop_10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/timechop_10.png


--------------------------------------------------------------------------------
/docs/images/timechop_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/timechop_2.png


--------------------------------------------------------------------------------
/docs/images/timechop_3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/timechop_3.png


--------------------------------------------------------------------------------
/docs/images/timechop_4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/timechop_4.png


--------------------------------------------------------------------------------
/docs/images/timechop_5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/timechop_5.png


--------------------------------------------------------------------------------
/docs/images/timechop_6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/timechop_6.png


--------------------------------------------------------------------------------
/docs/images/timechop_7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/timechop_7.png


--------------------------------------------------------------------------------
/docs/images/timechop_8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/timechop_8.png


--------------------------------------------------------------------------------
/docs/images/timechop_9.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/timechop_9.png


--------------------------------------------------------------------------------
/docs/images/timechop_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/timechop_example.png


--------------------------------------------------------------------------------
/docs/images/timechop_inspections_test.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/timechop_inspections_test.png


--------------------------------------------------------------------------------
/docs/images/timechop_withoutblocks.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/timechop_withoutblocks.png


--------------------------------------------------------------------------------
/docs/images/timechop_withoutrows.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/images/timechop_withoutrows.png


--------------------------------------------------------------------------------
/docs/index.html:
--------------------------------------------------------------------------------
 1 | 
 2 | <html>
 3 | <head>
 4 | <meta http-equiv="refresh" content="2;url=https://dssg.github.io/triage/dirtyduck/docs/" />
 5 | <title>Page Moved</title>
 6 | </head>
 7 | <body>
 8 | This page has moved. Click <a href="https://dssg.github.io/triage/dirtyduck/docs/">here</a> to go to the new page.
 9 | </body>
10 | </html>
11 | 


--------------------------------------------------------------------------------
/docs/js/jquery.stickytableheaders.min.js:
--------------------------------------------------------------------------------
1 | !function(a,b){"use strict";function c(c,g){var h=this;h.$el=a(c),h.el=c,h.id=e++,h.$window=a(b),h.$document=a(document),h.$el.bind("destroyed",a.proxy(h.teardown,h)),h.$clonedHeader=null,h.$originalHeader=null,h.isSticky=!1,h.hasBeenSticky=!1,h.leftOffset=null,h.topOffset=null,h.init=function(){h.$el.each(function(){var b=a(this);b.css("padding",0),h.$originalHeader=a("thead:first",this),h.$clonedHeader=h.$originalHeader.clone(),b.trigger("clonedHeader."+d,[h.$clonedHeader]),h.$clonedHeader.addClass("tableFloatingHeader"),h.$clonedHeader.css("display","none"),h.$originalHeader.addClass("tableFloatingHeaderOriginal"),h.$originalHeader.after(h.$clonedHeader),h.$printStyle=a('<style type="text/css" media="print">.tableFloatingHeader{display:none !important;}.tableFloatingHeaderOriginal{position:static !important;}</style>'),a("head").append(h.$printStyle)}),h.setOptions(g),h.updateWidth(),h.toggleHeaders(),h.bind()},h.destroy=function(){h.$el.unbind("destroyed",h.teardown),h.teardown()},h.teardown=function(){h.isSticky&&h.$originalHeader.css("position","static"),a.removeData(h.el,"plugin_"+d),h.unbind(),h.$clonedHeader.remove(),h.$originalHeader.removeClass("tableFloatingHeaderOriginal"),h.$originalHeader.css("visibility","visible"),h.$printStyle.remove(),h.el=null,h.$el=null},h.bind=function(){h.$scrollableArea.on("scroll."+d,h.toggleHeaders),h.isWindowScrolling||(h.$window.on("scroll."+d+h.id,h.setPositionValues),h.$window.on("resize."+d+h.id,h.toggleHeaders)),h.$scrollableArea.on("resize."+d,h.toggleHeaders),h.$scrollableArea.on("resize."+d,h.updateWidth)},h.unbind=function(){h.$scrollableArea.off("."+d,h.toggleHeaders),h.isWindowScrolling||(h.$window.off("."+d+h.id,h.setPositionValues),h.$window.off("."+d+h.id,h.toggleHeaders)),h.$scrollableArea.off("."+d,h.updateWidth)},h.toggleHeaders=function(){h.$el&&h.$el.each(function(){var 
b,c=a(this),d=h.isWindowScrolling?isNaN(h.options.fixedOffset)?h.options.fixedOffset.outerHeight():h.options.fixedOffset:h.$scrollableArea.offset().top+(isNaN(h.options.fixedOffset)?0:h.options.fixedOffset),e=c.offset(),f=h.$scrollableArea.scrollTop()+d,g=h.$scrollableArea.scrollLeft(),i=h.isWindowScrolling?f>e.top:d>e.top,j=(h.isWindowScrolling?f:0)<e.top+c.height()-h.$clonedHeader.height()-(h.isWindowScrolling?0:d);i&&j?(b=e.left-g+h.options.leftOffset,h.$originalHeader.css({position:"fixed","margin-top":h.options.marginTop,left:b,"z-index":3}),h.leftOffset=b,h.topOffset=d,h.$clonedHeader.css("display",""),h.isSticky||(h.isSticky=!0,h.updateWidth()),h.setPositionValues()):h.isSticky&&(h.$originalHeader.css("position","static"),h.$clonedHeader.css("display","none"),h.isSticky=!1,h.resetWidth(a("td,th",h.$clonedHeader),a("td,th",h.$originalHeader)))})},h.setPositionValues=function(){var a=h.$window.scrollTop(),b=h.$window.scrollLeft();!h.isSticky||0>a||a+h.$window.height()>h.$document.height()||0>b||b+h.$window.width()>h.$document.width()||h.$originalHeader.css({top:h.topOffset-(h.isWindowScrolling?0:a),left:h.leftOffset-(h.isWindowScrolling?0:b)})},h.updateWidth=function(){if(h.isSticky){h.$originalHeaderCells||(h.$originalHeaderCells=a("th,td",h.$originalHeader)),h.$clonedHeaderCells||(h.$clonedHeaderCells=a("th,td",h.$clonedHeader));var b=h.getWidth(h.$clonedHeaderCells);h.setWidth(b,h.$clonedHeaderCells,h.$originalHeaderCells),h.$originalHeader.css("width",h.$clonedHeader.width())}},h.getWidth=function(c){var d=[];return c.each(function(c){var e,f=a(this);if("border-box"===f.css("box-sizing"))e=f[0].getBoundingClientRect().width;else{var g=a("th",h.$originalHeader);if("collapse"===g.css("border-collapse"))if(b.getComputedStyle)e=parseFloat(b.getComputedStyle(this,null).width);else{var i=parseFloat(f.css("padding-left")),j=parseFloat(f.css("padding-right")),k=parseFloat(f.css("border-width"));e=f.outerWidth()-i-j-k}else 
e=f.width()}d[c]=e}),d},h.setWidth=function(a,b,c){b.each(function(b){var d=a[b];c.eq(b).css({"min-width":d,"max-width":d})})},h.resetWidth=function(b,c){b.each(function(b){var d=a(this);c.eq(b).css({"min-width":d.css("min-width"),"max-width":d.css("max-width")})})},h.setOptions=function(c){h.options=a.extend({},f,c),h.$scrollableArea=a(h.options.scrollableArea),h.isWindowScrolling=h.$scrollableArea[0]===b},h.updateOptions=function(a){h.setOptions(a),h.unbind(),h.bind(),h.updateWidth(),h.toggleHeaders()},h.init()}var d="stickyTableHeaders",e=0,f={fixedOffset:0,leftOffset:0,marginTop:0,scrollableArea:b};a.fn[d]=function(b){return this.each(function(){var e=a.data(this,"plugin_"+d);e?"string"==typeof b?e[b].apply(e):e.updateOptions(b):"destroy"!==b&&a.data(this,"plugin_"+d,new c(this,b))})}}(jQuery,window);


--------------------------------------------------------------------------------
/docs/js/readtheorg.js:
--------------------------------------------------------------------------------
 1 | 
 2 | $(function() {  // jQuery DOM-ready shorthand: insert a title <p> immediately before each admonition-class element
 3 |     $('.note').before("<p class='admonition-title note'>Note</p>");
 4 |     $('.seealso').before("<p class='admonition-title seealso'>See also</p>");
 5 |     $('.warning').before("<p class='admonition-title warning'>Warning</p>");
 6 |     $('.caution').before("<p class='admonition-title caution'>Caution</p>");
 7 |     $('.attention').before("<p class='admonition-title attention'>Attention</p>");
 8 |     $('.tip').before("<p class='admonition-title tip'>Tip</p>");
 9 |     $('.important').before("<p class='admonition-title important'>Important</p>");
10 |     $('.hint').before("<p class='admonition-title hint'>Hint</p>");
11 |     $('.error').before("<p class='admonition-title error'>Error</p>");
12 |     $('.danger').before("<p class='admonition-title danger'>Danger</p>");
13 | });
14 | 
15 | $( document ).ready(function() {  // Register delegated click handlers for the navigation and wrap docutils tables
16 | 
17 |     // Shift nav in mobile when clicking the menu.
18 |     $(document).on('click', "[data-toggle='wy-nav-top']", function() {
19 |       $("[data-toggle='wy-nav-shift']").toggleClass("shift");
20 |       $("[data-toggle='rst-versions']").toggleClass("shift");
21 |     });
22 |     // Close menu when you click a link.
23 |     $(document).on('click', ".wy-menu-vertical .current ul li a", function() {
24 |       $("[data-toggle='wy-nav-shift']").removeClass("shift");
25 |       $("[data-toggle='rst-versions']").toggleClass("shift");  // NOTE(review): toggles rather than removes "shift", unlike the line above - confirm this is intended
26 |     });
27 |     $(document).on('click', "[data-toggle='rst-current-version']", function() {  // Clicking the current-version element toggles "shift-up" on the versions element
28 |       $("[data-toggle='rst-versions']").toggleClass("shift-up");
29 |     });
30 |     // Make tables responsive
31 |     $("table.docutils:not(.field-list)").wrap("<div class='wy-table-responsive'></div>");  // field-list tables are excluded by the selector
32 | });
33 | 
34 | $( document ).ready(function() {  // Set up the table of contents: scrollspy, sticky table headers, padding, and open/close links
35 |     $('#text-table-of-contents ul').first().addClass('nav');
36 |                                         // ScrollSpy also requires that we use
37 |                                         // a Bootstrap nav component.
38 |     $('body').scrollspy({target: '#text-table-of-contents'});
39 | 
40 |     // add sticky table headers
41 |     $('table').stickyTableHeaders();
42 | 
43 |     // pad the bottom of the table of contents by the postamble's outer height
44 |     var $postamble = $('#postamble');
45 |     var $tableOfContents = $('#table-of-contents');
46 |     $tableOfContents.css({paddingBottom: $postamble.outerHeight()});
47 | 
48 |     // prepend a "Table of Contents" link to #content that points at the sidebar
49 |     var toggleSidebar = $('<div id="toggle-sidebar"><a href="#table-of-contents"><h2>Table of Contents</h2></a></div>');
50 |     $('#content').prepend(toggleSidebar);
51 | 
52 |     // append a "Close" link to the sidebar heading (for when the sidebar is shown on mobile screens)
53 |     var closeBtn = $('<a class="close-sidebar" href="#">Close</a>');
54 |     var tocTitle = $('#table-of-contents').find('h2');
55 |     tocTitle.append(closeBtn);
56 | });
57 | 
58 | window.SphinxRtdTheme = (function (jquery) {  // Exposes { StickyNav: { enable } } on window; receives $ as `jquery`
59 |     var stickyNav = (function () {
60 |         var navBar,
61 |             win,
62 |             stickyNavCssClass = 'stickynav',
63 |             applyStickNav = function () {  // add the sticky class only while the nav bar is no taller than the window
64 |                 if (navBar.height() <= win.height()) {
65 |                     navBar.addClass(stickyNavCssClass);
66 |                 } else {
67 |                     navBar.removeClass(stickyNavCssClass);
68 |                 }
69 |             },
70 |             enable = function () {  // apply once now, then re-evaluate on every window resize
71 |                 applyStickNav();
72 |                 win.on('resize', applyStickNav);
73 |             },
74 |             init = function () {  // look up the first side nav and the window; navBar/win are undefined until this runs
75 |                 navBar = jquery('nav.wy-nav-side:first');
76 |                 win    = jquery(window);
77 |             };
78 |         jquery(init);  // run init via jQuery's DOM-ready shorthand
79 |         return {
80 |             enable : enable
81 |         };
82 |     }());
83 |     return {
84 |         StickyNav : stickyNav
85 |     };
86 | }($));
87 | 


--------------------------------------------------------------------------------
/docs/sql/create_cleaned_inspections_table.sql:
--------------------------------------------------------------------------------
 1 |   create schema if not exists cleaned;
 2 | 
 3 | drop table if exists cleaned.inspections cascade;  -- recreate from scratch; cascade also drops dependent objects
 4 | 
 5 | create table cleaned.inspections as (  -- normalized copy of raw.inspections: trimmed/lower-cased text, integer ids, geography point
 6 |         with cleaned as (
 7 |         select
 8 |             inspection::integer,
 9 |             btrim(lower(results)) as result,
10 |             license_num::integer,
11 |             btrim(lower(dba_name)) as facility,
12 |             btrim(lower(aka_name)) as facility_aka,
13 |             case when
14 |             facility_type is null then 'unknown'  -- missing facility types become 'unknown'
15 |             else btrim(lower(facility_type))
16 |             end as facility_type,
17 |             lower(substring(risk from '\((.+)\)')) as risk,  -- keep only the text inside the parentheses of risk, lower-cased
18 |             btrim(lower(address)) as address,
19 |             zip as zip_code,
20 |             substring(
21 |                 btrim(lower(regexp_replace(type, 'liquor', 'task force', 'gi')))  -- rewrite 'liquor' as 'task force' (all occurrences, case-insensitive)
22 |             from 'canvass|task force|complaint|food poisoning|consultation|license|tag removal') as type,  -- keep the matched inspection-type keyword; NULL when none matches
23 |             date,
24 |             -- point(longitude, latitude) as location
25 |             ST_SetSRID(ST_MakePoint(longitude, latitude), 4326)::geography as location  -- We use geography so the measurements are in meters
26 |         from raw.inspections
27 |         where zip is not null  -- removing NULL zip codes
28 |             )
29 | 
30 |     select * from cleaned where type is not null  -- drop rows whose type matched none of the keywords above
31 |         );
32 | 


--------------------------------------------------------------------------------
/docs/sql/create_semantic_tables.sql:
--------------------------------------------------------------------------------
  1 | create schema if not exists semantic;
  2 | 
  3 | drop table if exists semantic.entities cascade;
  4 | -- semantic.entities: one row per distinct (license_num, facility, facility_aka, facility_type, address) in cleaned.inspections.
  5 | create table semantic.entities as (
  6 |         with entities as (
  7 |         select
  8 |             distinct on (
  9 |                 license_num,
 10 |                 facility,
 11 |                 facility_aka,
 12 |                 facility_type,
 13 |                 address
 14 |                 )
 15 |             license_num,
 16 |             facility,
 17 |             facility_aka,
 18 |             facility_type,
 19 |             address,
 20 |             zip_code,
 21 |             location,
 22 |             min(date) over (partition by license_num, facility, facility_aka, facility_type, address) as start_time,  -- earliest inspection date of the entity
 23 |             max(case when result in ('out of business', 'business not located')
 24 |                 then date
 25 |                 else NULL
 26 |                 end)  -- latest date with one of those two results; NULL when the entity never had one
 27 |             over (partition by license_num, facility, facility_aka, facility_type, address) as end_time  -- same partition as start_time, so end_time is never earlier than start_time and daterange() below cannot raise
 28 |         from cleaned.inspections
 29 |         order by
 30 |             license_num, facility, facility_aka, facility_type, address,
 31 |             date asc -- IMPORTANT: distinct on keeps the first row per key, so zip_code and location come from the earliest inspection
 32 |             )
 33 | -- entity_id numbers the entities by ascending start_time (entities sharing a start_time are ordered arbitrarily).
 34 |     select
 35 |         row_number() over (order by start_time asc ) as entity_id,
 36 |         license_num,
 37 |         facility,
 38 |         facility_aka,
 39 |         facility_type,
 40 |         address,
 41 |         zip_code,
 42 |         location,
 43 |         start_time,
 44 |         end_time,
 45 |         daterange(start_time, end_time) as activity_period  -- default '[)' bounds; a NULL end_time leaves the range open-ended
 46 |     from entities
 47 |         );
 48 | 
 49 | create index entities_ix on semantic.entities (entity_id);
 50 | create index entities_license_num_ix on semantic.entities (license_num);
 51 | create index entities_facility_ix on semantic.entities (facility);
 52 | create index entities_facility_type_ix on semantic.entities (facility_type);
 53 | create index entities_zip_code_ix on semantic.entities (zip_code);
 54 | 
 55 | -- Spatial index
 56 | create index entities_location_gix on semantic.entities using gist (location);
 57 | -- Composite index over the five columns that identify an entity
 58 | create index entities_full_key_ix on semantic.entities (license_num, facility, facility_aka, facility_type, address);
 59 | 
 60 | -- semantic.events: one row per inspection that has rows in cleaned.violations, tagged with
 61 | -- the entity_id of its facility and carrying its violations as a JSONB array.
 62 | drop table if exists semantic.events cascade;
 63 | 
 64 | create table semantic.events as (
 65 |     with inspections as (
 66 |         -- inner join: inspections without any row in cleaned.violations are left out
 67 |         select
 68 |             i.inspection, i.type, i.date, i.risk, i.result,
 69 |             i.license_num, i.facility, i.facility_aka,
 70 |             i.facility_type, i.address, i.zip_code, i.location,
 71 |             jsonb_agg(
 72 |                 jsonb_build_object(
 73 |                     'code', v.code,
 74 |                     'severity', v.severity,
 75 |                     'description', v.description,
 76 |                     'comment', v.comment
 77 |                 )
 78 |                 order by v.code
 79 |             ) as violations
 80 |         from cleaned.inspections as i
 81 |         inner join cleaned.violations as v
 82 |             on v.inspection = i.inspection
 83 |         group by
 84 |             i.inspection, i.type, i.date, i.risk, i.result,
 85 |             i.license_num, i.facility, i.facility_aka,
 86 |             i.facility_type, i.address, i.zip_code, i.location
 87 |     )
 88 |     -- NOTE(review): USING compares with "=", so a row whose key column is NULL never joins; confirm that is intended.
 89 |     select
 90 |         i.inspection as event_id,
 91 |         e.entity_id,
 92 |         i.type, i.date, i.risk, i.result,
 93 |         e.facility_type, e.zip_code, e.location,
 94 |         i.violations
 95 |     from semantic.entities as e
 96 |     inner join inspections as i
 97 |         using (license_num, facility, facility_aka, facility_type, address, zip_code)
 98 | );
 99 | 
100 | -- Single-column indices
101 | create index events_entity_ix on semantic.events (entity_id asc nulls last);
102 | create index events_event_ix on semantic.events (event_id asc nulls last);
103 | create index events_type_ix on semantic.events (type);
104 | create index events_date_ix on semantic.events (date asc nulls last);
105 | create index events_facility_type_ix on semantic.events (facility_type);
106 | create index events_zip_code_ix on semantic.events (zip_code);
107 | 
108 | -- Spatial index
109 | create index events_location_gix on semantic.events using gist (location);
110 | 
111 | -- JSONB indices (default GIN operator class and jsonb_path_ops)
112 | create index events_violations on semantic.events using gin (violations);
113 | create index events_violations_json_path on semantic.events using gin (violations jsonb_path_ops);
114 | 
115 | -- Composite index
116 | create index events_event_entity_zip_code_date on semantic.events (event_id asc nulls last, entity_id asc nulls last, zip_code, date desc nulls last);
117 | 


--------------------------------------------------------------------------------
/docs/sql/create_violations_table.sql:
--------------------------------------------------------------------------------
 1 | -- cleaned.violations: one row per '|'-separated entry of raw.inspections.violations,
 2 | -- split into code, description and comment, plus a severity derived from the code.
 3 | drop table if exists cleaned.violations cascade;
 4 | 
 5 | create table cleaned.violations as (
 6 |     with split_violations as (
 7 |         select
 8 |             inspection,
 9 |             license_num,
10 |             date,
11 |             regexp_split_to_array(          -- 3. split each entry into code / description / comment
12 |                 regexp_split_to_table(      -- 2. one row per entry (entries are separated by |)
13 |                     coalesce(               -- 1b. a NULL violations text becomes '- Comments:'
14 |                         regexp_replace(violations, '[\n\r]+', '', 'g'),  -- 1a. remove line breaks
15 |                         '- Comments:'
16 |                     ),
17 |                     '\|'
18 |                 ),
19 |                 '(?<=\d+)\.\s*|\s*-\s*Comments:'
20 |             ) as parts
21 |         from raw.inspections
22 |         where license_num is not null
23 |           and results in ('Fail', 'Pass', 'Pass w/ Conditions')
24 |     )
25 |     select
26 |         inspection::integer,
27 |         license_num::integer,
28 |         date::date,
29 |         btrim(parts[1]) as code,
30 |         btrim(parts[2]) as description,
31 |         btrim(parts[3]) as comment,
32 |         -- the empty-code check comes first so that '' is never cast to int
33 |         case
34 |             when btrim(parts[1]) = '' then NULL
35 |             when btrim(parts[1])::int between 1 and 14 then 'critical' -- From the documentation
36 |             when btrim(parts[1])::int between 15 and 29 then 'serious'
37 |             else 'minor'
38 |         end as severity
39 |     from split_violations
40 | );
41 | 


--------------------------------------------------------------------------------
/docs/triage/experiments/eis_01.yaml:
--------------------------------------------------------------------------------
  1 | config_version: 'v6'
  2 | 
  3 | model_comment: 'eis: 01'
  4 | 
  5 | user_metadata:
  6 |   label_definition: 'inspected'
  7 |   experiment_type: 'eis'
  8 |   description: |
  9 |     EIS 01
 10 |   purpose: 'model creation'
 11 |   org: 'DSaPP'
 12 |   team: 'Tutorial'
 13 |   author: 'Your name here'
 14 |   etl_date: '2019-02-21'
 15 | 
 16 | model_group_keys:
 17 |   - 'class_path'
 18 |   - 'parameters'
 19 |   - 'feature_names'
 20 |   - 'feature_groups'
 21 |   - 'cohort_name'
 22 |   - 'state'
 23 |   - 'label_name'
 24 |   - 'label_timespan'
 25 |   - 'training_as_of_date_frequency'
 26 |   - 'max_training_history'
 27 |   - 'label_definition'
 28 |   - 'experiment_type'
 29 |   - 'org'
 30 |   - 'team'
 31 |   - 'author'
 32 |   - 'purpose'
 33 |   - 'etl_date'
 34 | # Label: outcome 1 for every entity with at least one event in [as_of_date, as_of_date + label_timespan).
 35 | label_config:
 36 |   query: |
 37 |     select
 38 |     entity_id,
 39 |     True::integer as outcome
 40 |     from semantic.events
 41 |     where '{as_of_date}'::timestamp <= date
 42 |     and date < '{as_of_date}'::timestamp + interval '{label_timespan}'
 43 |     group by entity_id
 44 |   include_missing_labels_in_train_as: False  # NOTE(review): presumably labels entities absent from the query result as negatives; confirm in the triage docs
 45 |   name: 'inspected'
 46 | # Cohort: entities in the highest of five event-count buckets whose [start_time, end_time] range contains as_of_date.
 47 | cohort_config:
 48 |   query: |  # NOTE(review): number_of_inspections counts every row of semantic.events, including events dated after as_of_date
 49 |     with buckets as (
 50 |     select *, ntile(5) over (order by number_of_inspections asc) as bucket
 51 |     from (
 52 |     select entity_id, count(*) as number_of_inspections
 53 |     from semantic.events
 54 |     group by entity_id
 55 |     ) as t
 56 |     )
 57 |     select e.entity_id
 58 |     from semantic.entities as e
 59 |     inner join
 60 |     buckets as b
 61 |     using (entity_id)
 62 |     where
 63 |     daterange(start_time, end_time, '[]') @> '{as_of_date}'::date
 64 |     and bucket in (5)
 65 |   name: 'active_facilities'
 66 | 
 67 | temporal_config:
 68 |     feature_start_time: '2010-01-04'
 69 |     feature_end_time: '2019-01-01'
 70 |     label_start_time: '2015-02-01'
 71 |     label_end_time: '2019-01-01'
 72 | 
 73 |     model_update_frequency: '1y'
 74 |     training_label_timespans: ['1month']
 75 |     training_as_of_date_frequencies: '1month'
 76 | 
 77 |     test_durations: '1y'
 78 |     test_label_timespans: ['1month']
 79 |     test_as_of_date_frequencies: '1month'
 80 | 
 81 |     max_training_histories: '5y'
 82 | 
 83 | feature_aggregations:
 84 |   -
 85 |     prefix: 'inspections'
 86 |     from_obj: 'semantic.events'
 87 |     knowledge_date_column: 'date'
 88 | 
 89 |     aggregates_imputation:
 90 |       count:
 91 |         type: 'zero_noflag'
 92 | 
 93 |     aggregates:
 94 |       -
 95 |         quantity:
 96 |           total: "*"
 97 |         metrics:
 98 |           - 'count'
 99 | 
100 |     intervals: ['1month', '3month', '6month', '1y', 'all']
101 | 
102 |     groups:
103 |       - 'entity_id'
104 | 
105 |   -
106 |     prefix: 'risks'
107 |     from_obj: 'semantic.events'
108 |     knowledge_date_column: 'date'
109 | 
110 |     categoricals_imputation:
111 |       sum:
112 |         type: 'zero'
113 |       avg:
114 |         type: 'zero'
115 | 
116 |     categoricals:
117 |       -
118 |         column: 'risk'
119 |         choices: ['low', 'medium', 'high']
120 |         metrics:
121 |           - 'sum'
122 |           - 'avg'
123 | 
124 |     intervals: ['1month', '3month', '6month', '1y', 'all']
125 | 
126 |     groups:
127 |       - 'entity_id'
128 |       - 'zip_code'
129 | 
130 |   -
131 |     prefix: 'results'
132 |     from_obj: 'semantic.events'
133 |     knowledge_date_column: 'date'
134 | 
135 |     categoricals_imputation:
136 |       all:
137 |         type: 'zero'
138 | 
139 |     categoricals:
140 |       -
141 |         column: 'result'
142 |         choice_query: 'select distinct result from semantic.events'
143 |         metrics:
144 |           - 'sum'
145 |           - 'avg'
146 | 
147 |     intervals: ['1month', '3month', '6month', '1y', 'all']
148 | 
149 |     groups:
150 |       - 'entity_id'
151 | 
152 |   -
153 |     prefix: 'inspection_types'
154 |     from_obj: 'semantic.events'
155 |     knowledge_date_column: 'date'
156 | 
157 |     categoricals_imputation:
158 |       sum:
159 |         type: 'zero_noflag'
160 | 
161 |     categoricals:
162 |       -
163 |         column: 'type'
164 |         choice_query: 'select distinct type from semantic.events where type is not null'
165 |         metrics:
166 |           - 'sum'
167 | 
168 |     intervals: ['1month', '3month', '6month', '1y', 'all']
169 | 
170 |     groups:
171 |       - 'entity_id'
172 |       - 'zip_code'
173 | 
174 | feature_group_definition:
175 |    prefix:
176 |      - 'inspections'
177 |      - 'results'
178 |      - 'risks'
179 |      - 'inspection_types'
180 | 
181 | feature_group_strategies: ['all', 'leave-one-out', 'leave-one-in']
182 | 
183 | grid_config:
184 |     'sklearn.tree.DecisionTreeClassifier':
185 |         max_depth: [2,null]
186 |     'sklearn.ensemble.RandomForestClassifier':
187 |         max_features: ['sqrt']
188 |         criterion: ['gini']
189 |         n_estimators: [500]
190 |         min_samples_leaf: [1]
191 |         min_samples_split: [50]
192 |     'sklearn.dummy.DummyClassifier':
193 |         strategy: [most_frequent]
194 | 
195 | scoring:
196 |     testing_metric_groups:
197 |         -
198 |           metrics: [precision@, recall@]
199 |           thresholds:
200 |             percentiles: [1.0, 2.0, 3.0, 4.0, 5.0, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100]
201 |             top_n: [1, 5, 10, 25, 50, 100, 250, 500, 1000]
202 | 
203 | 
204 |     training_metric_groups:
205 |       -
206 |         metrics: [accuracy]
207 |       -
208 |         metrics: [precision@, recall@]
209 |         thresholds:
210 |           percentiles: [1.0, 2.0, 3.0, 4.0, 5.0, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100]
211 |           top_n: [1, 5, 10, 25, 50, 100, 250, 500, 1000]
212 | 


--------------------------------------------------------------------------------
/docs/triage/experiments/inspections-training.yaml:
--------------------------------------------------------------------------------
  1 | config_version: 'v3'
  2 | 
  3 | model_comment: 'test_triage_inspections'
  4 | 
  5 | temporal_config:  # FIXME: the np.min/np.max values below are unevaluated placeholders; replace them with literal dates before use
  6 |   feature_start_time: 'np.min(df.date)'
  7 |   feature_end_time: 'np.max(df.date)'
  8 |   label_start_time: 'np.min(df.date)'
  9 |   label_end_time: 'np.max(df.date)'
 10 | 
 11 |   model_update_frequency: '3months'
 12 |   training_label_timespans: '1day'
 13 |   training_as_of_date_frequencies: '1day'
 14 |   max_training_histories: '1year'
 15 | 
 16 |   test_durations: '1day'
 17 |   test_label_timespans: '3month'
 18 |   test_as_of_date_frequencies: '1day'
 19 | 
 20 | events_table: 'inspections.events'
 21 | 
 22 | feature_aggregations:
 23 |   -
 24 |     # Number of violations of a specific code and proportion, grouped by entity
 25 |     prefix: 'violations'
 26 |     from_obj: 'cleaned.violations'
 27 |     knowledge_date_column: 'knowledge_date'
 28 | 
 29 |     categoricals:
 30 |       -
 31 |         column: 'violation_code'
 32 |         choice_query: 'select distinct violation_code from cleaned.violations'
 33 |         metrics:
 34 |           - 'sum'
 35 |           - 'avg'
 36 | 
 37 |     intervals:
 38 |       - '1 y'
 39 | 
 40 |     groups:
 41 |       - 'entity_id'
 42 | 
 43 |   -  # inspections in the last year associated with this entity
 44 |     prefix: 'inspections'
 45 |     from_obj: 'cleaned.inspections'
 46 |     knowledge_date_column: 'date'
 47 |     aggregates:
 48 |       -
 49 |           quantity: '*'
 50 |           metrics:
 51 |               - 'count'
 52 |     intervals:
 53 |       - '1 y'
 54 | 
 55 |     groups:
 56 |       - 'license_num'
 57 | 
 58 |   - # inspections that happened in the last year grouped  by type of facility
 59 |     prefix: 'inspections'
 60 |     from_obj: 'cleaned.inspections'
 61 |     knowledge_date_column: 'date'
 62 | 
 63 |     aggregates:
 64 |       -
 65 |           quantity: '*'
 66 |           metrics:
 67 |               - 'count'
 68 |     intervals:
 69 |       - '1 y'
 70 | 
 71 |     groups:
 72 |       - 'facility_type'
 73 | 
 74 |   - # inspections that happened in the last year grouped  by zip code
 75 |     prefix: 'inspections'
 76 |     from_obj: 'cleaned.inspections'
 77 |     knowledge_date_column: 'date'
 78 | 
 79 |     aggregates:
 80 |       -
 81 |           quantity: '*'
 82 |           metrics:
 83 |               - 'count'
 84 |     intervals:
 85 |       - '1 y'
 86 | 
 87 |     groups:
 88 |       - 'zip_code'
 89 | 
 90 | feature_group_strategies: ['all']
 91 | 
 92 | model_group_keys: []
 93 | 
 94 | grid_config:
 95 |   'sklearn.tree.DecisionTreeClassifier':
 96 |     criterion: ['gini']
 97 |     max_depth: [3]
 98 |     min_samples_split: [10]
 99 | 
100 | scoring:
101 |   metric_groups:
102 |     -
103 |       metrics: ['precision@', 'recall@', 'fpr@']
104 |       thresholds:
105 |         percentiles: [1.0, 2.0, 5.0, 10.0, 25.0]
106 |         top_n: [25, 75, 150, 300, 500, 1000, 1500]
107 | 


--------------------------------------------------------------------------------
/docs/triage/experiments/inspections_baseline.yaml:
--------------------------------------------------------------------------------
  1 | config_version: 'v6'
  2 | 
  3 | model_comment: 'inspections: baseline'
  4 | 
  5 | user_metadata:
  6 |     label_definition: 'failed'
  7 |     experiment_type: 'inspections prioritization'
  8 |     description: |
  9 |       Baseline calculation
 10 |     purpose: 'baseline'
 11 |     org: 'DSaPP'
 12 |     team: 'Tutorial'
 13 |     author: 'Your name here'
 14 |     etl_date: '2019-02-21'
 15 | 
 16 | model_group_keys:
 17 |   - 'class_path'
 18 |   - 'parameters'
 19 |   - 'feature_names'
 20 |   - 'feature_groups'
 21 |   - 'cohort_name'
 22 |   - 'state'
 23 |   - 'label_name'
 24 |   - 'label_timespan'
 25 |   - 'training_as_of_date_frequency'
 26 |   - 'max_training_history'
 27 |   - 'label_definition'
 28 |   - 'experiment_type'
 29 |   - 'org'
 30 |   - 'team'
 31 |   - 'author'
 32 |   - 'purpose'
 33 |   - 'etl_date'
 34 | 
 35 | temporal_config:
 36 |     feature_start_time: '2010-01-04'
 37 |     feature_end_time: '2019-01-01'
 38 |     label_start_time: '2015-02-01'
 39 |     label_end_time: '2019-01-01'
 40 | 
 41 |     model_update_frequency: '1y'
 42 |     training_label_timespans: ['1month']
 43 |     training_as_of_date_frequencies: '1month'
 44 | 
 45 |     test_durations: '1y'
 46 |     test_label_timespans: ['1month']
 47 |     test_as_of_date_frequencies: '1month'
 48 | 
 49 |     max_training_histories: '5y'
 50 | # Label: outcome 1 when any event of the entity in [as_of_date, as_of_date + label_timespan) has result 'fail'; entities with no event in that window get no row.
 51 | label_config:
 52 |   query: |
 53 |     select
 54 |     entity_id,
 55 |     bool_or(result = 'fail')::integer as outcome
 56 |     from semantic.events
 57 |     where '{as_of_date}'::timestamp <= date
 58 |     and date < '{as_of_date}'::timestamp + interval '{label_timespan}'
 59 |     group by entity_id
 60 |   name: 'failed_inspections'
 61 | # Cohort: entities in the highest of five event-count buckets whose [start_time, end_time] range contains as_of_date.
 62 | cohort_config:
 63 |   query: |  # NOTE(review): number_of_inspections counts every row of semantic.events, including events dated after as_of_date
 64 |     with buckets as (
 65 |     select *, ntile(5) over (order by number_of_inspections asc) as bucket
 66 |     from (
 67 |     select entity_id, count(*) as number_of_inspections
 68 |     from semantic.events
 69 |     group by entity_id
 70 |     ) as t
 71 |     )
 72 |     select e.entity_id
 73 |     from semantic.entities as e
 74 |     inner join
 75 |     buckets as b
 76 |     using (entity_id)
 77 |     where
 78 |     daterange(start_time, end_time, '[]') @> '{as_of_date}'::date
 79 |     and bucket in (5)
 80 |   name: 'active_facilities'
 81 | 
 82 | feature_aggregations:
 83 |   -
 84 |     prefix: 'inspections'
 85 |     from_obj: 'semantic.events'
 86 |     knowledge_date_column: 'date'
 87 | 
 88 |     aggregates_imputation:
 89 |       count:
 90 |         type: 'zero_noflag'
 91 | 
 92 |     aggregates:
 93 |       -
 94 |         quantity:
 95 |           total: "*"
 96 |         metrics:
 97 |           - 'count'
 98 | 
 99 |     intervals: ['all']
100 | 
101 |     groups:
102 |       - 'entity_id'
103 | 
104 | feature_group_definition:
105 |    prefix:
106 |      - 'inspections'
107 | 
108 | feature_group_strategies: ['all']
109 | 
110 | grid_config:
111 |     'sklearn.dummy.DummyClassifier':
112 |         strategy: [prior,uniform, most_frequent]
113 | 
114 | scoring:
115 |     testing_metric_groups:
116 |         -
117 |           metrics: [precision@, recall@]
118 |           thresholds:
119 |             percentiles: [1.0, 2.0, 3.0, 4.0, 5.0, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100]
120 |             top_n: [1, 5, 10, 25, 50, 100, 250, 500, 1000]
121 | 
122 |     training_metric_groups:
123 |       -
124 |         metrics: [accuracy]
125 |       -
126 |         metrics: [precision@, recall@]
127 |         thresholds:
128 |           percentiles: [1.0, 2.0, 3.0, 4.0, 5.0, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100]
129 |           top_n: [1, 5, 10, 25, 50, 100, 250, 500, 1000]
130 | 


--------------------------------------------------------------------------------
/docs/triage/experiments/inspections_dt.yaml:
--------------------------------------------------------------------------------
  1 | config_version: 'v6'
  2 | 
  3 | model_comment: 'inspections: DT'
  4 | 
  5 | user_metadata:
  6 |   label_definition: 'failed'
  7 |   experiment_type: 'inspections prioritization'
  8 |   description: |
  9 |     Decision Tree Classifier
 10 |   purpose: 'data mining'
 11 |   org: 'DSaPP'
 12 |   team: 'Tutorial'
 13 |   author: 'Your name here'
 14 |   etl_date: '2019-02-21'
 15 | 
 16 | model_group_keys:
 17 |   - 'class_path'
 18 |   - 'parameters'
 19 |   - 'feature_names'
 20 |   - 'feature_groups'
 21 |   - 'cohort_name'
 22 |   - 'state'
 23 |   - 'label_name'
 24 |   - 'label_timespan'
 25 |   - 'training_as_of_date_frequency'
 26 |   - 'max_training_history'
 27 |   - 'label_definition'
 28 |   - 'experiment_type'
 29 |   - 'org'
 30 |   - 'team'
 31 |   - 'author'
 32 |   - 'purpose'
 33 |   - 'etl_date'
 34 | 
 35 | temporal_config:
 36 |     feature_start_time: '2010-01-04'
 37 |     feature_end_time: '2019-01-01'
 38 |     label_start_time: '2015-02-01'
 39 |     label_end_time: '2019-01-01'
 40 | 
 41 |     model_update_frequency: '1y'
 42 |     training_label_timespans: ['1month']
 43 |     training_as_of_date_frequencies: '1month'
 44 | 
 45 |     test_durations: '1y'
 46 |     test_label_timespans: ['1month']
 47 |     test_as_of_date_frequencies: '1month'
 48 | 
 49 |     max_training_histories: '5y'
 50 | # Label: outcome 1 when any event of the entity in [as_of_date, as_of_date + label_timespan) has result 'fail'; entities with no event in that window get no row.
 51 | label_config:
 52 |   query: |
 53 |     select
 54 |     entity_id,
 55 |     bool_or(result = 'fail')::integer as outcome
 56 |     from semantic.events
 57 |     where '{as_of_date}'::timestamp <= date
 58 |     and date < '{as_of_date}'::timestamp + interval '{label_timespan}'
 59 |     group by entity_id
 60 |   name: 'failed_inspections'
 61 | # Cohort: entities in the highest of five event-count buckets whose [start_time, end_time] range contains as_of_date.
 62 | cohort_config:
 63 |   query: |  # NOTE(review): number_of_inspections counts every row of semantic.events, including events dated after as_of_date
 64 |     with buckets as (
 65 |     select *, ntile(5) over (order by number_of_inspections asc) as bucket
 66 |     from (
 67 |     select entity_id, count(*) as number_of_inspections
 68 |     from semantic.events
 69 |     group by entity_id
 70 |     ) as t
 71 |     )
 72 |     select e.entity_id
 73 |     from semantic.entities as e
 74 |     inner join
 75 |     buckets as b
 76 |     using (entity_id)
 77 |     where
 78 |     daterange(start_time, end_time, '[]') @> '{as_of_date}'::date
 79 |     and bucket in (5)
 80 |   name: 'active_facilities'
 81 | 
 82 | feature_aggregations:
 83 |   -
 84 |     prefix: 'inspections'
 85 |     from_obj: 'semantic.events'
 86 |     knowledge_date_column: 'date'
 87 | 
 88 |     aggregates_imputation:
 89 |       count:
 90 |         type: 'zero_noflag'
 91 | 
 92 |     aggregates:
 93 |       -
 94 |         quantity:
 95 |           total: "*"
 96 |         metrics:
 97 |           - 'count'
 98 | 
 99 |     intervals: ['1month', '3month', '6month', '1y', 'all']
100 | 
101 |     groups:
102 |       - 'entity_id'
103 | 
104 |   -
105 |     prefix: 'risks'
106 |     from_obj: 'semantic.events'
107 |     knowledge_date_column: 'date'
108 | 
109 |     categoricals_imputation:
110 |       sum:
111 |         type: 'zero'
112 |       avg:
113 |         type: 'zero'
114 | 
115 |     categoricals:
116 |       -
117 |         column: 'risk'
118 |         choices: ['low', 'medium', 'high']
119 |         metrics:
120 |           - 'sum'
121 |           - 'avg'
122 | 
123 |     intervals: ['1month', '3month', '6month', '1y', 'all']
124 | 
125 |     groups:
126 |       - 'entity_id'
127 |       - 'zip_code'
128 | 
129 |   -
130 |     prefix: 'results'
131 |     from_obj: 'semantic.events'
132 |     knowledge_date_column: 'date'
133 | 
134 |     categoricals_imputation:
135 |       all:
136 |         type: 'zero'
137 | 
138 |     categoricals:
139 |       -
140 |         column: 'result'
141 |         choice_query: 'select distinct result from semantic.events'
142 |         metrics:
143 |           - 'sum'
144 |           - 'avg'
145 | 
146 |     intervals: ['1month', '3month', '6month', '1y', 'all']
147 | 
148 |     groups:
149 |       - 'entity_id'
150 | 
151 |   -
152 |     prefix: 'inspection_types'
153 |     from_obj: 'semantic.events'
154 |     knowledge_date_column: 'date'
155 | 
156 |     categoricals_imputation:
157 |       sum:
158 |         type: 'zero_noflag'
159 | 
160 |     categoricals:
161 |       -
162 |         column: 'type'
163 |         choice_query: 'select distinct type from semantic.events where type is not null'
164 |         metrics:
165 |           - 'sum'
166 | 
167 |     intervals: ['1month', '3month', '6month', '1y', 'all']
168 | 
169 |     groups:
170 |       - 'entity_id'
171 |       - 'zip_code'
172 | 
173 | grid_config:  # keyed by classifier class path; each hyperparameter maps to a list of candidate values
174 |     'sklearn.tree.DecisionTreeClassifier':
175 |         max_depth: [2,10,~]  # '~' is YAML null
176 |         min_samples_split: [2,5]
177 | 
178 | feature_group_definition:
179 |    prefix:
180 |      - 'inspections'
181 |      - 'results'
182 |      - 'risks'
183 |      - 'inspection_types'
184 | 
185 | feature_group_strategies: ['all']
186 | 
187 | scoring:
188 |     testing_metric_groups:
189 |         -
190 |           metrics: [precision@, recall@]
191 |           thresholds:
192 |             percentiles: [1.0, 2.0, 3.0, 4.0, 5.0, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100]
193 |             top_n: [1, 5, 10, 25, 50, 100, 250, 500, 1000]
194 | 
195 | 
196 |     training_metric_groups:
197 |       -
198 |         metrics: [accuracy]
199 |       -
200 |         metrics: [precision@, recall@]
201 |         thresholds:
202 |           percentiles: [1.0, 2.0, 3.0, 4.0, 5.0, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100]
203 |           top_n: [1, 5, 10, 25, 50, 100, 250, 500, 1000]
204 | 


--------------------------------------------------------------------------------
/docs/triage/experiments/inspections_label_failed_01.yaml:
--------------------------------------------------------------------------------
  1 | config_version: 'v6'
  2 | 
  3 | model_comment: 'inspections: advanced'
  4 | 
  5 | user_metadata:
  6 |   label_definition: 'failed'
  7 |   experiment_type: 'inspections prioritization'
  8 |   description: |
  9 |     Using Ensemble methods
 10 |   purpose: 'trying ensamble algorithms'  # spelling kept as-is: 'purpose' is listed in model_group_keys, so changing it would presumably change model-group identity
 11 |   org: 'DSaPP'
 12 |   team: 'Tutorial'
 13 |   author: 'Your name here'
 14 |   etl_date: '2019-02-21'
 15 | 
 16 | model_group_keys:
 17 |   - 'class_path'
 18 |   - 'parameters'
 19 |   - 'feature_names'
 20 |   - 'feature_groups'
 21 |   - 'cohort_name'
 22 |   - 'state'
 23 |   - 'label_name'
 24 |   - 'label_timespan'
 25 |   - 'training_as_of_date_frequency'
 26 |   - 'max_training_history'
 27 |   - 'label_definition'
 28 |   - 'experiment_type'
 29 |   - 'org'
 30 |   - 'team'
 31 |   - 'author'
 32 |   - 'purpose'
 33 |   - 'etl_date'
 34 | 
 35 | temporal_config:
 36 |     feature_start_time: '2010-01-04'
 37 |     feature_end_time: '2019-01-01'
 38 |     label_start_time: '2015-02-01'
 39 |     label_end_time: '2019-01-01'
 40 | 
 41 |     model_update_frequency: '1y'
 42 |     training_label_timespans: ['1month']
 43 |     training_as_of_date_frequencies: '1month'
 44 | 
 45 |     test_durations: '1y'
 46 |     test_label_timespans: ['1month']
 47 |     test_as_of_date_frequencies: '1month'
 48 | 
 49 |     max_training_histories: '5y'
 50 | # Label: outcome 1 when any event of the entity in [as_of_date, as_of_date + label_timespan) has result 'fail'; entities with no event in that window get no row.
 51 | label_config:
 52 |   query: |
 53 |     select
 54 |     entity_id,
 55 |     bool_or(result = 'fail')::integer as outcome
 56 |     from semantic.events
 57 |     where '{as_of_date}'::timestamp <= date
 58 |     and date < '{as_of_date}'::timestamp + interval '{label_timespan}'
 59 |     group by entity_id
 60 |   name: 'failed_inspections'
 61 | 
 62 | # Cohort: entities in the highest of five event-count buckets whose [start_time, end_time] range contains as_of_date.
 63 | cohort_config:
 64 |   query: |  # NOTE(review): number_of_inspections counts every row of semantic.events, including events dated after as_of_date
 65 |     with buckets as (
 66 |     select *, ntile(5) over (order by number_of_inspections asc) as bucket
 67 |     from (
 68 |     select entity_id, count(*) as number_of_inspections
 69 |     from semantic.events
 70 |     group by entity_id
 71 |     ) as t
 72 |     )
 73 |     select e.entity_id
 74 |     from semantic.entities as e
 75 |     inner join
 76 |     buckets as b
 77 |     using (entity_id)
 78 |     where
 79 |     daterange(start_time, end_time, '[]') @> '{as_of_date}'::date
 80 |     and bucket in (5)
 81 |   name: 'active_facilities'
 82 | 
 83 | feature_aggregations:
 84 |   -
 85 |     prefix: 'inspections'
 86 |     from_obj: 'semantic.events'
 87 |     knowledge_date_column: 'date'
 88 | 
 89 |     aggregates_imputation:
 90 |       count:
 91 |         type: 'zero_noflag'
 92 | 
 93 |     aggregates:
 94 |       -
 95 |         quantity:
 96 |           total: "*"
 97 |         metrics:
 98 |           - 'count'
 99 | 
100 |     intervals: ['1month', '3month', '6month', '1y', 'all']
101 | 
102 |     groups:
103 |       - 'entity_id'
104 | 
105 |   -
106 |     prefix: 'risks'
107 |     from_obj: 'semantic.events'
108 |     knowledge_date_column: 'date'
109 | 
110 |     categoricals_imputation:
111 |       sum:
112 |         type: 'zero'
113 |       avg:
114 |         type: 'zero'
115 | 
116 |     categoricals:
117 |       -
118 |         column: 'risk'
119 |         choices: ['low', 'medium', 'high']
120 |         metrics:
121 |           - 'sum'
122 |           - 'avg'
123 | 
124 |     intervals: ['1month', '3month', '6month', '1y', 'all']
125 | 
126 |     groups:
127 |       - 'entity_id'
128 |       - 'zip_code'
129 | 
130 |   -
131 |     prefix: 'results'
132 |     from_obj: 'semantic.events'
133 |     knowledge_date_column: 'date'
134 | 
135 |     categoricals_imputation:
136 |       all:
137 |         type: 'zero'
138 | 
139 |     categoricals:
140 |       -
141 |         column: 'result'
142 |         choice_query: 'select distinct result from semantic.events'
143 |         metrics:
144 |           - 'sum'
145 |           - 'avg'
146 | 
147 |     intervals: ['1month', '3month', '6month', '1y', 'all']
148 | 
149 |     groups:
150 |       - 'entity_id'
151 | 
152 |   -
153 |     prefix: 'inspection_types'
154 |     from_obj: 'semantic.events'
155 |     knowledge_date_column: 'date'
156 | 
157 |     categoricals_imputation:
158 |       sum:
159 |         type: 'zero_noflag'
160 | 
161 |     categoricals:
162 |       -
163 |         column: 'type'
164 |         choice_query: 'select distinct type from semantic.events where type is not null'
165 |         metrics:
166 |           - 'sum'
167 | 
168 |     intervals: ['1month', '3month', '6month', '1y', 'all']
169 | 
170 |     groups:
171 |       - 'entity_id'
172 |       - 'zip_code'
173 | 
174 | feature_group_definition:
175 |    prefix:
176 |      - 'inspections'
177 |      - 'results'
178 |      - 'risks'
179 |      - 'inspection_types'
180 | 
181 | feature_group_strategies: ['all', 'leave-one-in', 'leave-one-out']
182 | 
183 | grid_config:
184 |     'sklearn.ensemble.RandomForestClassifier':
185 |         max_features: ['sqrt']
186 |         criterion: ['gini']
187 |         n_estimators: [100, 250]
188 |         min_samples_split: [2,10]
189 | 
190 | scoring:
191 |     testing_metric_groups:
192 |         -
193 |           metrics: [precision@, recall@]
194 |           thresholds:
195 |             percentiles: [1.0, 2.0, 3.0, 4.0, 5.0, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100]
196 |             top_n: [1, 5, 10, 25, 50, 100, 250, 500, 1000]
197 | 
198 |     training_metric_groups:
199 |       -
200 |         metrics: [accuracy]
201 |       -
202 |         metrics: [precision@, recall@]
203 |         thresholds:
204 |           percentiles: [1.0, 2.0, 3.0, 4.0, 5.0, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100]
205 |           top_n: [1, 5, 10, 25, 50, 100, 250, 500, 1000]
206 | 


--------------------------------------------------------------------------------
/docs/triage/experiments/simple_test_skeleton.yaml:
--------------------------------------------------------------------------------
  1 | config_version: 'v6'
  2 | 
  3 | model_comment: 'simple_test_skeleton'
  4 | 
  5 | user_metadata:
  6 |   label_definition: 'failed_inspection'
  7 |   experiment_type: 'test'
  8 |   org: 'DSaPP'
  9 |   team: 'Tutorial'
 10 |   author: 'Adolfo De Unanue'
 11 |   etl_date: '2019-02-21'
 12 | 
 13 | temporal_config:
 14 |     feature_start_time: '2014-01-01'
 15 |     feature_end_time: '2018-01-01'
 16 |     label_start_time: '2014-01-02'
 17 |     label_end_time: '2018-01-01'
 18 | 
 19 |     model_update_frequency: '1y'
 20 | 
 21 |     max_training_histories: '1y'
 22 |     training_label_timespans: ['1y']
 23 |     training_as_of_date_frequencies: '1month'
 24 | 
 25 |     test_durations: '0d'
 26 |     test_label_timespans: ['1y']
 27 |     test_as_of_date_frequencies: '1month'
 28 | 
 29 | cohort_config:
 30 |     query: |
 31 |       select entity_id
 32 |       from semantic.entities
 33 |       where
 34 |       license_num in (1596210, 1874347, 1142451)
 35 |       and daterange(start_time, end_time, '[]') @> '{as_of_date}'::date
 36 |     name: 'test_facilities'
 37 | 
 38 | label_config:
 39 |   query: |
 40 |     select
 41 |     entity_id,
 42 |     bool_or(result = 'fail')::integer as outcome
 43 |     from semantic.events
 44 |     where '{as_of_date}'::timestamp <= date
 45 |     and date < '{as_of_date}'::timestamp + interval '{label_timespan}'
 46 |     group by entity_id
 47 |   name: 'failed_inspections'
 48 | 
 49 | grid_config:
 50 |     'sklearn.dummy.DummyClassifier':
 51 |         strategy: [most_frequent]
 52 | 
 53 | feature_aggregations:
 54 |   -
 55 |     prefix: 'inspections'
 56 |     from_obj: 'semantic.events'
 57 |     knowledge_date_column: 'date'
 58 | 
 59 |     aggregates_imputation:
 60 |       count:
 61 |         type: 'zero_noflag'
 62 | 
 63 |     aggregates:
 64 |       -
 65 |         quantity:
 66 |           total: "*"
 67 |         metrics:
 68 |           - 'count'
 69 | 
 70 |     intervals: ['1month', '3month', '6month', '1y', 'all']
 71 | 
 72 |     groups:
 73 |       - 'entity_id'
 74 | 
 75 | 
 76 |   -
 77 |     prefix: 'risks'
 78 |     from_obj: 'semantic.events'
 79 |     knowledge_date_column: 'date'
 80 | 
 81 |     categoricals_imputation:
 82 |       sum:
 83 |         type: 'zero'
 84 |       avg:
 85 |         type: 'zero'
 86 | 
 87 |     categoricals:
 88 |       -
 89 |         column: 'risk'
 90 |         choices: ['low', 'medium', 'high']
 91 |         metrics:
 92 |           - 'sum'
 93 |           - 'avg'
 94 | 
 95 |     intervals: ['1month', '3month', '6month', '1y', 'all']
 96 | 
 97 |     groups:
 98 |       - 'entity_id'
 99 |       - 'zip_code'
100 | 
101 |   -
102 |     prefix: 'results'
103 |     from_obj: 'semantic.events'
104 |     knowledge_date_column: 'date'
105 | 
106 |     categoricals_imputation:
107 |       all:
108 |         type: 'zero'
109 | 
110 |     categoricals:
111 |       -
112 |         column: 'result'
113 |         choice_query: 'select distinct result from semantic.events'
114 |         metrics:
115 |           - 'sum'
116 |           - 'avg'
117 | 
118 |     intervals:
119 |       - '6month'
120 | 
121 |     groups:
122 |       - 'entity_id'
123 | 
124 | feature_group_definition:
125 |   prefix:
126 |     - 'results'
127 |     - 'risks'
128 |     - 'inspections'
129 | 
130 | feature_group_strategies: ['all']
131 | 
132 | model_group_keys:
133 |   - 'class_path'
134 |   - 'parameters'
135 |   - 'feature_names'
136 |   - 'feature_groups'
137 |   - 'cohort_name'
138 |   - 'state'
139 |   - 'label_name'
140 |   - 'label_timespan'
141 |   - 'training_as_of_date_frequency'
142 |   - 'max_training_history'
143 |   - 'label_definition'
144 |   - 'experiment_type'
145 |   - 'org'
146 |   - 'team'
147 |   - 'author'
148 |   - 'etl_date'
149 | 
150 | scoring:
151 |   testing_metric_groups:
152 |     -
153 |       metrics: ['precision@', 'recall@']
154 |       thresholds:
155 |         percentiles: [1.0, 5.0, 10.0, 25.0, 50.0, 75.0, 100.0]
156 |         top_n: [1, 5, 10, 25, 50, 100, 150, 300, 500, 1000, 1500]
157 |   training_metric_groups:
158 |     -
159 |       metrics: ['accuracy']
160 |     -
161 |       metrics: ['precision@', 'recall@']
162 |       thresholds:
163 |         percentiles: [1.0, 5.0, 10.0, 25.0, 50.0, 75.0, 100.0]
164 |         top_n: [1, 5, 10, 25, 50, 100, 150, 300, 500, 1000, 1500]
165 | 


--------------------------------------------------------------------------------
/docs/triage/images/distance_from_best_precision@10_pct.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/triage/images/distance_from_best_precision@10_pct.png


--------------------------------------------------------------------------------
/docs/triage/images/eis_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/triage/images/eis_01.png


--------------------------------------------------------------------------------
/docs/triage/images/inspections_baseline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/triage/images/inspections_baseline.png


--------------------------------------------------------------------------------
/docs/triage/images/inspections_dt.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/triage/images/inspections_dt.png


--------------------------------------------------------------------------------
/docs/triage/images/inspections_label_failed_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/triage/images/inspections_label_failed_01.png


--------------------------------------------------------------------------------
/docs/triage/images/metric_over_time_precision@10_pct.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/triage/images/metric_over_time_precision@10_pct.png


--------------------------------------------------------------------------------
/docs/triage/images/precision@10_pct_next_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/triage/images/precision@10_pct_next_time.png


--------------------------------------------------------------------------------
/docs/triage/images/regret_distance_from_best_rules_precision@10_pct.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/triage/images/regret_distance_from_best_rules_precision@10_pct.png


--------------------------------------------------------------------------------
/docs/triage/images/regret_over_time_precision@10_pct.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/triage/images/regret_over_time_precision@10_pct.png


--------------------------------------------------------------------------------
/docs/triage/images/simple_test_skeleton.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/docs/triage/images/simple_test_skeleton.png


--------------------------------------------------------------------------------
/infrastructure/aws_batch/credentials.filter.example:
--------------------------------------------------------------------------------
 1 | {
 2 |         "environment": [
 3 |                 {
 4 |                         "name": "AWS_ACCESS_KEY_ID",
 5 |                         "value": .Credentials.AccessKeyId
 6 |                 },
 7 |                 {
 8 |                         "name": "AWS_SECRET_ACCESS_KEY",
 9 |                         "value": .Credentials.SecretAccessKey
10 |                 },
11 |                 {
12 |                         "name": "AWS_SESSION_TOKEN",
13 |                         "value": .Credentials.SessionToken
14 |                 }
15 |         ]
16 | }
17 | 


--------------------------------------------------------------------------------
/infrastructure/aws_batch/triage-job-definition.json.example:
--------------------------------------------------------------------------------
 1 | {
 2 |   "containerProperties": {
 3 |     "command": [
 4 |       "--tb",
 5 |       "Ref::experiment_file",
 6 |       "--project-path",
 7 |       "Ref::output_path",
 8 |       "Ref::replace",
 9 |       "Ref::save_predictions",
10 |       "Ref::profile",
11 |       "Ref::validate"
12 |     ],
13 |     "image": "AWS_ACCOUNT.dkr.ecr.us-west-2.amazonaws.com/YOUR_TRIAGE_IMAGE",
14 |     "jobRoleArn": "arn:aws:iam::AWS_ACCOUNT:role/dsappBatchJobRole",
15 |     "memory": 16000,
16 |     "vcpus": 1
17 |   },
18 |   "jobDefinitionName": "triage-cli-experiment",
19 |   "retryStrategy": {
20 |     "attempts": 1
21 |   },
22 |   "type": "container"
23 | }
24 | 


--------------------------------------------------------------------------------
/infrastructure/aws_batch/triage-overrides.json.example:
--------------------------------------------------------------------------------
 1 | {
 2 |     "environment": [
 3 |         {
 4 |             "name":"AWS_DEFAULT_REGION",
 5 |             "value":"us-west-2"
 6 |         },
 7 |         {
 8 |             "name":"AWS_JOB_QUEUE",
 9 |             "value":""
10 |         },
11 |         {
12 |             "name":"POSTGRES_PASSWORD",
13 |             "value":""
14 |         },
15 |         {
16 |             "name":"POSTGRES_USER",
17 |             "value":""
18 |         },
19 |         {
20 |             "name":"POSTGRES_DB",
21 |             "value":""
22 |         },
23 |         {
24 |             "name":"POSTGRES_PORT",
25 |             "value":""
26 |         },
27 |         {
28 |             "name":"POSTGRES_HOST",
29 |             "value":""
30 |         }
31 |     ]
32 | }
33 | 


--------------------------------------------------------------------------------
/infrastructure/bastion/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM python:3.6-stretch
 2 | 
 3 | ## Installing clients (the PGDG signing key is fetched over HTTPS so it cannot be swapped in transit)
 4 | RUN  sh -c "echo 'deb http://apt.postgresql.org/pub/repos/apt/ stretch-pgdg main' > /etc/apt/sources.list.d/pgdg.list" && \
 5 |      wget --quiet -O - https://www.postgresql.org/media/keys/ACCC4CF8.asc | apt-key add - && \
 6 |      apt-get -y update && \
 7 |      apt-get -y install less postgresql-9.6-postgis-2.2 \
 8 |      postgresql-contrib-9.6 \
 9 |      libpq-dev postgresql-9.6-pgrouting
10 | ## NOTE(review): session.key ends up inside the image layers -- confirm it is not a secret
11 | COPY session.key .
12 | COPY requirements.txt .
13 | 
14 | RUN pip install --no-cache-dir -r requirements.txt
15 | ## Absolute path: no earlier WORKDIR is set in this file, so this is the same /triage directory
16 | WORKDIR /triage
17 | 


--------------------------------------------------------------------------------
/infrastructure/bastion/requirements.txt:
--------------------------------------------------------------------------------
1 | ipython
2 | jupyter
3 | 
4 | ## DSaPP stuff (pinned to the triage release this tutorial is written against)
5 | git+https://github.com/dssg/triage.git@v3.3.0
6 | 


--------------------------------------------------------------------------------
/infrastructure/docker-compose.yml:
--------------------------------------------------------------------------------
 1 | version: "3"
 2 | 
 3 | services:
 4 |   food_db:
 5 |     build:
 6 |       context: ./food_db
 7 |     image: tutorial/db
 8 |     container_name: food_db
 9 |     env_file: ../.env
10 |     volumes:
11 |       - "../data:/tmp/raw-data"
12 |     ports:
13 |       - "5434:5432"
14 | 
15 |   bastion:
16 |     build:
17 |       context: ./bastion
18 |     image: tutorial/bastion
19 |     container_name: tutorial_bastion
20 |     command: bash
21 |     #user: ${UID}:${GID}
22 |     tty: true
23 |     env_file: ../.env
24 |     environment:
25 |       DATABASE_URL: 'postgresql://${POSTGRES_USER}:${POSTGRES_PASSWORD}@food_db/food'
26 |     volumes:
27 |       - "../data:/data"
28 |       - "../triage:/triage"
29 |       - "../src/sql:/sql"
30 |     ports:
31 |       - "56406-56410:56406-56410"
32 | 
33 |   triage:
34 |     build:
35 |       context: ./triage
36 |     image: tutorial/triage:v3.3.0
37 |     container_name: tutorial_triage
38 |     env_file: ../.env
39 |     environment:
40 |       DATABASE_URL: 'postgresql://${POSTGRES_USER}:${POSTGRES_PASSWORD}@food_db/food'
41 |       TRIAGE_OUTPUT_PATH: '/triage/output'
42 |     volumes:
43 |       - "../triage:/triage"
44 |       - "../src/sql:/sql"
45 | 


--------------------------------------------------------------------------------
/infrastructure/env_example:
--------------------------------------------------------------------------------
1 | POSTGRES_HOST=0.0.0.0
2 | POSTGRES_USER=food_user
3 | POSTGRES_DB=food
4 | POSTGRES_PORT=5434
5 | POSTGRES_PASSWORD=your_password
6 | 
7 | UID=1000
8 | GID=1000
9 | 


--------------------------------------------------------------------------------
/infrastructure/food_db/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM postgres:10
 2 | 
 3 | ## PostGIS activation (ca-certificates is installed explicitly so the HTTPS key download can be verified)
 4 | RUN apt-get -y update \
 5 |     && apt-get -y  install wget ca-certificates \
 6 |     && wget --quiet -O - https://www.postgresql.org/media/keys/ACCC4CF8.asc | apt-key add - \
 7 |     && apt-get -y update \
 8 |     && apt-get -y install postgresql-10-postgis-2.4 postgis postgresql-10-pgrouting
 9 | 
10 | 
11 | ## DB setup (COPY, not ADD: these are plain local files, no URL fetch or archive extraction is wanted)
12 | COPY activate_postgis.sql /docker-entrypoint-initdb.d/
13 | COPY create_inspections_table.sql /docker-entrypoint-initdb.d/
14 | COPY create_extensions.sql /docker-entrypoint-initdb.d/
15 | COPY nuke_triage.sql /docker-entrypoint-initdb.d/
16 | 
17 | RUN chown postgres:postgres /docker-entrypoint-initdb.d/*.sql
18 | 


--------------------------------------------------------------------------------
/infrastructure/food_db/activate_postgis.sql:
--------------------------------------------------------------------------------
1 | CREATE SCHEMA postgis;
2 | 
3 | ALTER DATABASE food SET search_path=public, postgis, contrib;
4 | 
5 | CREATE EXTENSION postgis SCHEMA postgis;
6 | CREATE EXTENSION pgrouting;
7 | 


--------------------------------------------------------------------------------
/infrastructure/food_db/create_extensions.sql:
--------------------------------------------------------------------------------
1 | create extension fuzzystrmatch;
2 | 


--------------------------------------------------------------------------------
/infrastructure/food_db/create_inspections_table.sql:
--------------------------------------------------------------------------------
 1 | create schema if not exists raw;
 2 | 
 3 | create table raw.inspections (
 4 |        inspection varchar not null,
 5 |        DBA_Name varchar,
 6 |        AKA_Name varchar,
 7 |        license_Num decimal,
 8 |        facility_type varchar,
 9 |        risk varchar,
10 |        address varchar,
11 |        city varchar,
12 |        state varchar,
13 |        zip varchar,
14 |        date date,
15 |        type varchar,
16 |        results varchar,
17 |        violations varchar,
18 |        latitude decimal,
19 |        longitude decimal,
20 |        location varchar
21 | );
22 | 


--------------------------------------------------------------------------------
/infrastructure/food_db/nuke_triage.sql:
--------------------------------------------------------------------------------
 1 | create or replace function nuke_triage()
 2 |     returns text as $result$
 3 |     -- Drops the schemas and tables listed below and returns a confirmation text.
 4 |     declare
 5 |     -- newline-separated DROP TABLE statements for the cohort_/labels_ tables
 6 |     query text;
 7 | 
 8 |     begin
 9 | 
10 |     execute 'drop schema if exists model_metadata cascade';
11 |     raise notice 'model_metadata deleted';
12 |     execute 'drop schema if exists features cascade';
13 |     raise notice 'features deleted';
14 |     execute 'drop schema if exists train_results cascade';
15 |     raise notice 'train_results deleted';
16 |     execute 'drop schema if exists test_results cascade';
17 |     raise notice 'test_results deleted';
18 | 
19 |     execute 'drop table if exists results_schema_versions';
20 |     raise notice 'results_schema_versions deleted';
21 |     -- pg_tables spans every schema, so each name is schema-qualified and only schemas
22 |     -- on the current search_path are considered (an unqualified DROP could not reach the rest).
23 | select into query
24 |     string_agg(
25 |         format('drop table %I.%I cascade;', schemaname, tablename), E'\n'
26 |         )
27 | from   pg_tables
28 | where  tablename ~ 'cohort_|labels_'
29 |   and  schemaname = any(current_schemas(false));
30 | 
31 |     -- string_agg yields NULL when no table matched, hence the null check
32 |     if query is not null then
33 |     raise notice '%', query;
34 |     execute query;
35 |     else
36 |     raise notice 'no  labels or states tables from triage found';
37 |     end if;
38 | 
39 |     return 'triage was send to the oblivion. Long live to triage!';
40 |     end;
41 |     $result$ language plpgsql;
42 | 


--------------------------------------------------------------------------------
/infrastructure/triage/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM python:3.6
 2 | 
 3 | LABEL triage.version="v3.3.0" \
 4 |       triage.from="cli" \
 5 |       creator="Center for Data Science and Public Policy (DSaPP)" \
 6 |       maintainer="Adolfo De Unánue <adolfo@uchicago.edu>"
 7 | 
 8 | ## No apt packages are installed in this image, so no stand-alone 'apt update' layer is kept
 9 | 
10 | COPY requirements.txt .
11 | 
12 | RUN pip install --no-cache-dir -r requirements.txt
13 | 
14 | ## WORKDIR creates the directory when it is missing; the absolute path does not depend on a previous WORKDIR
15 | 
16 | WORKDIR /triage
17 | 
18 | ENTRYPOINT [ "triage", "experiment" ]
19 | 


--------------------------------------------------------------------------------
/infrastructure/triage/__init__.py:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/infrastructure/triage/requirements.txt:
--------------------------------------------------------------------------------
1 | ## DSaPP stuff (pinned so the installed code matches the v3.3.0 label of the image)
2 | git+https://github.com/dssg/triage.git@v3.3.0
3 | 


--------------------------------------------------------------------------------
/infrastructure/triage/setup.py:
--------------------------------------------------------------------------------
 1 | # coding: utf-8
 2 | 
 3 | from setuptools import setup
 4 | 
 5 | setup(
 6 |     name='triage_experiment',
 7 |     version='0.1',
 8 |     py_modules=['triage_experiment'],
 9 |     entry_points='''
10 |         [console_scripts]
11 |         triage_experiment=triage_experiment:triage
12 |     ''',
13 | )
14 | 


--------------------------------------------------------------------------------
/infrastructure/triage/triage_experiment.py:
--------------------------------------------------------------------------------
  1 | # coding: utf-8
  2 | 
  3 | import os
  4 | import yaml
  5 | 
  6 | import datetime
  7 | 
  8 | import click
  9 | 
 10 | from triage.experiments import SingleThreadedExperiment
 11 | from triage.component.catwalk.utils import filename_friendly_hash
 12 | from triage import create_engine
 13 | 
 14 | from utils import show_timechop, show_features_queries, show_model, audit_experiment
 15 | 
 16 | import logging
 17 | 
 18 | logging_level = logging.WARNING
 19 | 
 20 | logging.basicConfig(
 21 |     format="%(name)-30s  %(asctime)s %(levelname)10s %(process)6d  %(filename)-24s  %(lineno)4d: %(message)s",
 22 |     datefmt = "%d/%m/%Y %I:%M:%S %p",
 23 |     level=logging_level,
 24 |     handlers=[logging.StreamHandler()]
 25 | )
 26 | 
 27 | @click.group()
 28 | @click.option('--config_file', type=click.Path(),
 29 |               help="""Triage's experiment congiguration file name
 30 |                       NOTE: It's assumed that the file is located inside
 31 |                       triage/experiments)""",
 32 |               required=True)
 33 | @click.option('--triage_db', envvar='TRIAGE_DB_URL', type=click.STRING,
 34 |                 help="""DB URL, in the form of 'postgresql://user:password@host_db:host_port/db',
 35 |                         by default it gets this from the environment (TRIAGE_DB_URL)""",
 36 |               required=True)
 37 | @click.option('--replace/--no-replace',
 38 |               help="Triage will (or won't) replace all the matrices and models",
 39 |               default=True)  ## Default True so it matches the default behaviour of Triage
 40 | @click.option('--debug', is_flag=True,
 41 |               help="Activate to get a lot of information in your screen")
 42 | @click.pass_context
 43 | def triage(ctx, config_file, triage_db, replace, debug):
 44 | 
 45 |     config_file = os.path.join(os.sep, "triage", "experiments", config_file)
 46 | 
 47 |     click.echo(f"Using the config file {config_file}")
 48 | 
 49 |     with open(config_file) as f:
 50 |         experiments = yaml.load(f)
 51 | 
 52 |     click.echo(f"The output (matrices and models) of this experiment will be stored in triage/output")
 53 |     click.echo(f"Using data stored in {triage_db}")
 54 |     click.echo(f"The experiment will utilize any preexisting matrix or model: {not replace}")
 55 |     click.echo(f"Creating experiment object")
 56 | 
 57 |     experiment = SingleThreadedExperiment(
 58 |         config=experiments,
 59 |         db_engine=create_engine(triage_db),
 60 |         project_path='/triage/output',
 61 |         cleanup=True,
 62 |         replace=replace
 63 |     )
 64 | 
 65 |     ctx.obj = experiment
 66 | 
 67 |     if debug:
 68 |         logging.basicConfig(level=logging.DEBUG)
 69 |         click.echo("Debug enabled (Expect A LOT of output at the screen!!!)")
 70 | 
 71 |     click.echo("Experiment loaded")
 72 | 
 73 | @triage.command()
 74 | @click.pass_obj
 75 | def validate(experiment):
 76 |     click.echo("Validating experiment's configuration")
 77 |     experiment.validate()
 78 | 
 79 |     click.echo("""
 80 |            The experiment configuration doesn't contain any obvious errors.
 81 |            Any error that occurs from now on, possibly will be related to hit the maximum
 82 |            number of columns allowed or collision in
 83 |            the column names, both due to PostgreSQL limitations.
 84 |     """)
 85 | 
 86 |     click.echo("The experiment looks in good shape. May the force be with you")
 87 | 
 88 | @triage.command()
 89 | @click.pass_obj
 90 | def run(experiment):
 91 |     start_time = datetime.datetime.now()
 92 | 
 93 |     click.echo("Executing experiment")
 94 |     experiment.run()
 95 |     click.echo("Done")
 96 | 
 97 |     end_time = datetime.datetime.now()
 98 |     click.echo(f"Experiment completed in {end_time - start_time} seconds")
 99 | 
100 | @triage.command()
101 | @click.pass_obj
102 | def show_feature_generators(experiment):
103 |     pass  # placeholder: the command is registered but does nothing yet
104 | 
105 | @triage.command()
106 | @click.pass_obj
107 | def show_temporal_blocks(experiment):
108 |     click.echo("Generating temporal blocks image")
109 |     chopper = experiment.chopper
110 |     file_name = f"{experiment.config['model_comment'].replace(' ', '_')}.svg"
111 |     image_path=show_timechop(chopper, file_name=file_name)
112 |     click.echo("Image stored in:")
113 |     click.echo(image_path)
114 |     return image_path
115 | 
116 | @triage.command()
117 | @click.pass_obj
118 | @click.option('--model',
119 |               help="Model to plot",
120 |               required=True)
121 | def show_model_plot(experiment, model):
122 |     click.echo("Generating model image")
123 |     image_path = show_model(model)
124 |     click.echo("Image stored in: ")
125 |     click.echo(image_path)
126 | 
127 |     return image_path
128 | 
129 | 
130 | @triage.command()
131 | @click.pass_obj
132 | @click.option('--metric',
133 |               help="Model to plot",
134 |               required=True)
135 | @click.option('--rules',
136 |               help="Path to selection rules",
137 |               required=True)
138 | def audit_models(experiment, metric, rules):
139 |     click.echo("Auditing experiment")
140 |     experiment_hash = filename_friendly_hash(experiment.config)
141 | 
142 |     with open(f"/triage/selection_rules/{rules}") as f:
143 |         rules = yaml.load(f)
144 | 
145 |     metric, k = metric.split('@')
146 | 
147 |     audit_experiment(experiment_hash, f"{metric}@", k, rules)
148 | 


--------------------------------------------------------------------------------
/infrastructure/web/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM nginx
2 | 
3 | COPY default.conf  /etc/nginx/conf.d/default.conf
4 | 
5 | RUN chown -R nginx:nginx /usr/share/nginx/html/
6 | 
7 | VOLUME /usr/share/nginx/html
8 | 


--------------------------------------------------------------------------------
/infrastructure/web/default.conf:
--------------------------------------------------------------------------------
 1 | server {
 2 |     listen       80;
 3 |     server_name  localhost;
 4 | 
 5 |     #charset koi8-r;
 6 |     #access_log  /var/log/nginx/log/host.access.log  main;
 7 | 
 8 |     location / {
 9 |         root   /usr/share/nginx/html;
10 |         index  index.html index.htm;
11 |     }
12 | 
13 |     #error_page  404              /404.html;
14 | 
15 |     # redirect server error pages to the static page /50x.html
16 |     #
17 |     error_page   500 502 503 504  /50x.html;
18 |     location = /50x.html {
19 |         root   /usr/share/nginx/html;
20 |     }
21 | 
22 |     # proxy the PHP scripts to Apache listening on 127.0.0.1:80
23 |     #
24 |     #location ~ \.php$ {
25 |     #    proxy_pass   http://127.0.0.1;
26 |     #}
27 | 
28 |     # pass the PHP scripts to FastCGI server listening on 127.0.0.1:9000
29 |     #
30 |     #location ~ \.php$ {
31 |     #    root           html;
32 |     #    fastcgi_pass   127.0.0.1:9000;
33 |     #    fastcgi_index  index.php;
34 |     #    fastcgi_param  SCRIPT_FILENAME  /scripts$fastcgi_script_name;
35 |     #    include        fastcgi_params;
36 |     #}
37 | 
38 |     # deny access to .htaccess files, if Apache's document root
39 |     # concurs with nginx's one
40 |     #
41 |     #location ~ /\.ht {
42 |     #    deny  all;
43 |     #}
44 | }


--------------------------------------------------------------------------------
/org/00_instructions.org:
--------------------------------------------------------------------------------
 1 | #+STARTUP: showeverything
 2 | #+STARTUP: nohideblocks
 3 | #+STARTUP: indent
 4 | #+STARTUP: align
 5 | #+STARTUP: inlineimages
 6 | #+STARTUP: latexpreview
 7 | #+PROPERTY: header-args:sql :engine postgresql
 8 | #+PROPERTY: header-args:sql+ :dbhost 0.0.0.0
 9 | #+PROPERTY: header-args:sql+ :dbport 5434
10 | #+PROPERTY: header-args:sql+ :dbuser food_user
11 | #+PROPERTY: header-args:sql+ :dbpassword some_password
12 | #+PROPERTY: header-args:sql+ :database food
13 | #+PROPERTY: header-args:sql+ :results table drawer
14 | #+PROPERTY: header-args:sql+ :cmdline -q
15 | #+PROPERTY: header-args:sh  :results verbatim org
16 | #+PROPERTY: header-args:sh+ :prologue exec 2>&1 :epilogue :
17 | #+PROPERTY: header-args:ipython   :session Food_inspections
18 | #+PROPERTY: header-args:ipython+ :results raw drawer
19 | #+OPTIONS: broken-links:mark
20 | #+OPTIONS: tasks:todo
21 | #+OPTIONS: LaTeX:t
22 | 
23 | 
24 | * Welcome!
25 | 
26 | This tutorial will show you how to use =triage=, a data science
27 |  modeling tool developed at the [[http://dsapp.uchicago.edu][Center for Data Science and Public
28 |  Policy]] (DSaPP) at the University of Chicago.
29 | 
30 | =triage= helps build models for three [[https://dssg.uchicago.edu/data-science-for-social-good-conference-2017/training-workshop-data-science-for-social-good-problem-templates/][common applied problems]]: (a) Early
31 | warning systems (*EWS* or *EIS*), (b) /resource prioritization/ (a.k.a "an
32 | inspections problem") and (c) interaction level predictions (a.k.a
33 | "appointment level"). These problems
34 | are difficult to model because their conceptualization and
35 | implementation are prone to error, thanks to their multi-dimensional,
36 | multi-entity, time-series structure.
37 | 
38 | The latest version of this tutorial is published at [[https://dssg.github.io/dirtyduck/]]
39 | 
40 | *NOTE* This tutorial is in sync with the latest version of =triage=, which at
41 | this moment is [[https://github.com/dssg/triage/releases/tag/v3.3.0][v3.3.0 (Arepa)]].
42 | 
43 | * Before you start
44 | ** What you need for this tutorial
45 | 
46 | Install [[http://www.docker.com][Docker CE]] and [[https://docs.docker.com/compose/][Docker Compose]]. That's it.
47 | Follow the links for installation instructions.
48 | 
49 | Note that if you are using =GNU/Linux= you should add your user to the
50 | =docker= group following the instructions at this [[https://docs.docker.com/install/linux/linux-postinstall/][link]].
51 | 
52 | At the moment only operating systems with *nix-type command lines are
53 | supported, such as =GNU/Linux= and =MacOS=. Recent versions of
54 | =Windows= may also work.
55 | 
56 | ** How to use this tutorial
57 | 
58 | First, clone this repository on your laptop
59 | 
60 | #+BEGIN_EXAMPLE
61 |  git clone https://github.com/dssg/dirtyduck.git
62 | #+END_EXAMPLE
63 | 
64 | Second, run
65 | 
66 | #+BEGIN_EXAMPLE
67 | ./tutorial.sh start
68 | #+END_EXAMPLE
69 | 
70 | This will take several minutes the first time you do it.
71 | 
72 | 
73 | ** How you can help to improve this tutorial
74 | 
75 | If you want to contribute, please follow the suggestions in the [[file:../README.org::*How%20you%20can%20help][README]].
76 | 


--------------------------------------------------------------------------------
/org/01_intro.org:
--------------------------------------------------------------------------------
  1 | #+STARTUP: showeverything
  2 | #+STARTUP: nohideblocks
  3 | #+STARTUP: indent
  4 | #+STARTUP: align
  5 | #+STARTUP: inlineimages
  6 | #+STARTUP: latexpreview
  7 | #+PROPERTY: header-args:sql :engine postgresql
  8 | #+PROPERTY: header-args:sql+ :dbhost 0.0.0.0
  9 | #+PROPERTY: header-args:sql+ :dbport 5434
 10 | #+PROPERTY: header-args:sql+ :dbuser food_user
 11 | #+PROPERTY: header-args:sql+ :dbpassword some_password
 12 | #+PROPERTY: header-args:sql+ :database food
 13 | #+PROPERTY: header-args:sql+ :results table drawer
 14 | #+PROPERTY: header-args:sql+ :cmdline -q
 15 | #+PROPERTY: header-args:sh  :results verbatim org
 16 | #+PROPERTY: header-args:sh+ :prologue exec 2>&1 :epilogue :
 17 | #+PROPERTY: header-args:ipython   :session Food_inspections
 18 | #+PROPERTY: header-args:ipython+ :results raw drawer
 19 | #+OPTIONS: broken-links:mark
 20 | #+OPTIONS: tasks:todo
 21 | #+OPTIONS: LaTeX:t
 22 | 
 23 | 
 24 | * Description of the problem
 25 | 
 26 |   This tutorial aims to introduce the reader to [[https://github.com/dssg/triage][triage]], a machine learning modeling tool built by the [[https://dsapp.uchicago.edu][Center for Data Science and Public Policy]].
 27 |   We will use the well-known [[https://data.cityofchicago.org/Health-Human-Services/Food-Inspections/4ijn-s7e5][Chicago Food Inspections dataset]].[fn:1]
 28 | 
 29 |   We will present the two problems that =triage= was built to model[fn:5]:
 30 | 
 31 |   1. *Resource prioritization* (internally known as the /inspections
 32 |      problem/)[fn:2] and
 33 |   2. *Early warning*.[fn:3]
 34 | 
 35 | 
 36 | ** Inspection Prioritization
 37 | 
 38 |   In an ideal world, inspectors would visit every food
 39 |   facility[fn:4] every day to ensure it meets safety standards. But
 40 |   the real world doesn't have enough
 41 |   inspectors for that to happen, so the city needs to decide how to allocate
 42 |   its limited inspection workforce to find and remediate as many establishments
 43 |   with food hazards as possible. Assuming the city can inspect $n$ facilities
 44 |   in the next $X$ period of time, they can define the problem like this:
 45 | 
 46 |   #+CAPTION: How to define Chicago Food Inspections as an inspection-prioritization problem:
 47 |   #+begin_quote
 48 |     Which $n$ facilities will have a food violation in the
 49 |     following $X$ period of time?
 50 |   #+end_quote
 51 | 
 52 |   If our inspection workforce is really limited, we should probably just target
 53 |   the most serious violations. Then we'd define the problem like this:
 54 | 
 55 |   #+CAPTION: How to define Chicago Food Inspections as an inspection-prioritization problem that targets the most serious cases:
 56 |   #+begin_quote
 57 |     Which $n$ facilities will have a critical or serious violation in the
 58 |     following $X$ period of time?
 59 |   #+end_quote
 60 | 
 61 | 
 62 | ** Early Warning
 63 |   Using the same data set, facility owners or managers would pose the
 64 |   ML problem as an early warning problem.
 65 |   They'd like to know whether an inspector is going to visit their facility
 66 |   so they can prepare for it. They can define the problem like this:
 67 | 
 68 |   #+CAPTION: How to define Chicago Food Inspections as an early warning problem:
 69 |   #+begin_quote
 70 |     Will my facility be inspected in the next $X$ period of time?
 71 |   #+end_quote
 72 | 
 73 | Note that in both cases, we are defining a period of time in which the
 74 | event potentially will happen.
 75 | 
 76 | ** What do they have in common?
 77 |   For either problem, $X$ could be a day, a week, a month, a quarter, a year, 56 days,
 78 |   or some other time period.
 79 | 
 80 |   Without going into detail, both problems use data where each
 81 |   row describes an *event* in which an *entity* was involved, and
 82 |   each event has a specific *outcome* or result.
 83 | 
 84 |   The *entity* for both inspection prioritizations and early warnings
 85 |   in this tutorial is a food /facility/, and the *event* is an inspection.
 86 |   But the *outcome* differs: for inspections the outcome is /inspection failed/
 87 |   or /major violation found/, while for early warning the outcome is
 88 |   /inspected/.
 89 | 
 90 | ** How do they differ?
 91 | 
 92 | Besides the obvious (i.e. the label), these two ML problem formulations
 93 | have very different internal structures:
 94 | 
 95 | In the /EIS/ problem, *all* the entities of interest in a given period of
 96 | time *have* a label. The /Inspections/ problem does not have that
 97 | luxury: of all the existing entities of interest, only a few are
 98 | /inspected/, which means that only those inspected have a label
 99 | (=True/False=), while all the remaining ones don't have one. This will be
100 | reflected, for example, in the /training/ matrices: you only train on the
101 | facilities that were inspected (so you will have fewer rows in
102 | them). Another impact will be on the metrics: you need to be very
103 | careful about interpreting the metrics in an inspections
104 | problem. Finally, when you are designing the field validation of your
105 | model, you need to take this selection bias into account; if not, you
106 | will be inspecting the same facilities over and over.[fn:6]
107 | 
108 | 
109 | * Footnotes
110 | 
111 | [fn:6] This point is particularly acute: Imagine the scenario in
112 | which the /inspections/ problem is *crime prediction* in order to send
113 | cops (inspectors) to that "risky" area (facilities)...
114 | 
115 | [fn:5] It is also possible to model a "visit-level prediction" type of ML problem.
116 | 
117 | [fn:4] Defined as "bakery, banquet
118 | hall, candy store, caterer, coffee shop, day care center (for ages less than 2), day care
119 | center (for ages 2 – 6), day care center (combo, for ages less than 2 and 2 – 6
120 | combined), gas station, Golden Diner, grocery store, hospital, long term care
121 | center(nursing home), liquor store, mobile food dispenser, restaurant, paleteria, school,
122 | shelter, tavern, social club, wholesaler, or Wrigley Field Rooftop"
123 | ([[https://data.cityofchicago.org/api/views/4ijn-s7e5/files/O9cwLJ4wvxQJ2MirxkNzAUCCMQiM31DMzRkckMsKlxc?download=true&filename=foodinspections_description.pdf][source]]).
124 | 
125 | [fn:3] Examples include [[http://dsapp.uchicago.edu/projects/education/][Increasing High School Graduation Rates: Early
126 | Warnings and Predictive Systems]], [[http://dsapp.uchicago.edu/projects/public-safety/police-eis/][Building Data-Driven Early
127 | Intervention Systems for Police Officers]], and [[http://dsapp.uchicago.edu/projects/criminal-justice/data-driven-justice-initiative/][Data-Driven Justice
128 | Initiative: Identifying Frequent Users of Multiple Public Systems for
129 | More Effective Early Assistance]].
130 | 
131 | [fn:2] Examples include [[http://dsapp.uchicago.edu/projects/environment/][Predictive Enforcement
132 | of Hazardous Waste Regulations]] and [[http://dsapp.uchicago.edu/projects/health/lead-prevention/][Targeting Proactive Inspections for Lead Hazards]].
133 | 
134 | [fn:1] Several examples use this dataset, such as [[https://chicago.github.io/food-inspections-evaluation/][City of Chicago Food
135 | Inspection Forecasting]],  [[https://youtu.be/lyDLAutA88s][PyCon 2016 keynote: Built in Super Heroes]],
136 | and [[https://youtu.be/1dKonIT-Yak][PyData 2016: Forecasting critical food violations at restaurants
137 | using open data]].
138 | 


--------------------------------------------------------------------------------
/org/07_quick_setup.org:
--------------------------------------------------------------------------------
  1 | #+STARTUP: showeverything
  2 | #+STARTUP: nohideblocks
  3 | #+STARTUP: indent
  4 | #+STARTUP: align
  5 | #+STARTUP: inlineimages
  6 | #+STARTUP: latexpreview
  7 | #+PROPERTY: header-args:sql :engine postgresql
  8 | #+PROPERTY: header-args:sql+ :dbhost 0.0.0.0
  9 | #+PROPERTY: header-args:sql+ :dbport 5434
 10 | #+PROPERTY: header-args:sql+ :dbuser food_user
 11 | #+PROPERTY: header-args:sql+ :dbpassword some_password
 12 | #+PROPERTY: header-args:sql+ :database food
 13 | #+PROPERTY: header-args:sql+ :results table drawer
 14 | #+PROPERTY: header-args:sql+ :exports both
 15 | #+PROPERTY: header-args:sql+ :eval no-export
 16 | #+PROPERTY: header-args:sql+ :cmdline -q
 17 | #+PROPERTY: header-args:sh  :results verbatim org
 18 | #+PROPERTY: header-args:sh+ :prologue exec 2>&1 :epilogue :
 19 | #+PROPERTY: header-args:ipython   :session food_inspections
 20 | #+PROPERTY: header-args:ipython+ :results raw drawer
 21 | #+OPTIONS: broken-links:mark
 22 | #+OPTIONS: tasks:todo
 23 | #+OPTIONS: LaTeX:t
 24 | 
 25 | * Appendix: For the impatient
 26 | 
 27 | If you want to skip all the cleansing and transformation and dive
 28 | directly into =triage= you can
 29 | execute the following /inside bastion/:
 30 | 
 31 | #+BEGIN_SRC sh :dir /docker:root@tutorial_bastion:/
 32 |      curl "https://data.cityofchicago.org/api/views/4ijn-s7e5/rows.csv?accessType=DOWNLOAD" > data/inspections.csv
 33 | 
 34 |      psql ${DATABASE_URL} -c "\copy raw.inspections FROM '/data/inspections.csv' WITH HEADER CSV"
 35 | 
 36 |      psql ${DATABASE_URL} < /sql/create_cleaned_inspections_table.sql
 37 | 
 38 |      psql ${DATABASE_URL} < /sql/create_violations_table.sql
 39 | 
 40 |      psql ${DATABASE_URL} < /sql/create_semantic_tables.sql
 41 | #+END_SRC
 42 | 
 43 | #+RESULTS:
 44 | #+BEGIN_SRC org
 45 | COPY 168861
 46 | CREATE SCHEMA
 47 | NOTICE:  table "inspections" does not exist, skipping
 48 | DROP TABLE
 49 | SELECT 168046
 50 | NOTICE:  table "violations" does not exist, skipping
 51 | DROP TABLE
 52 | SELECT 632487
 53 | CREATE SCHEMA
 54 | NOTICE:  table "entities" does not exist, skipping
 55 | DROP TABLE
 56 | SELECT 35360
 57 | CREATE INDEX
 58 | CREATE INDEX
 59 | CREATE INDEX
 60 | CREATE INDEX
 61 | CREATE INDEX
 62 | CREATE INDEX
 63 | CREATE INDEX
 64 | NOTICE:  table "events" does not exist, skipping
 65 | DROP TABLE
 66 | SELECT 145123
 67 | CREATE INDEX
 68 | CREATE INDEX
 69 | CREATE INDEX
 70 | CREATE INDEX
 71 | CREATE INDEX
 72 | CREATE INDEX
 73 | CREATE INDEX
 74 | CREATE INDEX
 75 | CREATE INDEX
 76 | CREATE INDEX
 77 | #+END_SRC
 78 | 
 79 | 
 80 | If everything works, you should end with two new schemas: =cleaned= and =semantic=.
 81 | 
 82 | You can check that (from =psql=) with:
 83 | #+BEGIN_SRC sql
 84 | \dn
 85 | #+END_SRC
 86 | 
 87 | #+RESULTS:
 88 | :RESULTS:
 89 | | List of schemas |          |
 90 | |-----------------+----------|
 91 | | Name            | Owner    |
 92 | | cleaned         | food_user |
 93 | | postgis         | food_user |
 94 | | public          | postgres |
 95 | | raw             | food_user |
 96 | | semantic        | food_user |
 97 | :END:
 98 | 
 99 | Now you can continue to the introduction to triage section.
100 | 


--------------------------------------------------------------------------------
/org/100_whats_next.org:
--------------------------------------------------------------------------------
 1 | #+STARTUP: showeverything
 2 | #+STARTUP: nohideblocks
 3 | #+STARTUP: indent
 4 | #+STARTUP: align
 5 | #+STARTUP: inlineimages
 6 | #+STARTUP: latexpreview
 7 | #+PROPERTY: header-args:sql :engine postgresql
 8 | #+PROPERTY: header-args:sql+ :dbhost 0.0.0.0
 9 | #+PROPERTY: header-args:sql+ :dbport 5434
10 | #+PROPERTY: header-args:sql+ :dbuser food_user
11 | #+PROPERTY: header-args:sql+ :dbpassword some_password
12 | #+PROPERTY: header-args:sql+ :database food
13 | #+PROPERTY: header-args:sql+ :results table drawer
14 | #+PROPERTY: header-args:sql+ :cmdline -q
15 | #+PROPERTY: header-args:sh  :results verbatim org
16 | #+PROPERTY: header-args:sh+ :prologue exec 2>&1 :epilogue :
17 | #+PROPERTY: header-args:ipython   :session Food_inspections
18 | #+PROPERTY: header-args:ipython+ :results raw drawer
19 | #+OPTIONS: broken-links:mark
20 | #+OPTIONS: tasks:todo
21 | 
22 | * What's next?
23 | 
24 |   - Add the shape file
25 |     https://data.cityofchicago.org/api/geospatial/gdcf-axmw?method=export&format=Shapefile
26 |     and generate geospatial variables using =location=
27 |   - Text analysis on the /violations/' =comments= column and generate
28 |     new /outcomes/ or /features/?
29 |   - Run some deduplication to get a better =semantic.entities=?
30 |   - Routing based on the inspection list?
31 |   - Add more data sources (Census, Schools, bus stops, ACS data, Yelp!):
32 |     - [[https://data.cityofchicago.org/Community-Economic-Development/Business-Licenses/r5kz-chrr][Business Licenses]]
33 |     - Food Inspections
34 |     - [[https://data.cityofchicago.org/Public-Safety/Crimes-2001-to-present/ijzp-q8t2][Crime]]
35 |     - Garbage Cart Complaints
36 |     - [[https://data.cityofchicago.org/Service-Requests/311-Service-Requests-Sanitation-Code-Complaints/me59-5fac][Sanitation Complaints]]
37 |     - Weather
38 |     - Sanitarian Information
39 | 


--------------------------------------------------------------------------------
/org/audition:
--------------------------------------------------------------------------------
1 | ../triage/audition


--------------------------------------------------------------------------------
/org/css/org-default.css:
--------------------------------------------------------------------------------
1 | .org-bold{font-weight:700}.org-bold-italic{font-weight:700;font-style:italic}.org-buffer-menu-buffer{font-weight:700}.org-builtin{color:#483d8b}.org-button{color:#3a5fcd;text-decoration:underline}.org-calendar-month-header{color:#00f}.org-calendar-today{text-decoration:underline}.org-calendar-weekday-header{color:#008b8b}.org-calendar-weekend-header{color:#b22222}.org-comint-highlight-input{font-weight:700}.org-comint-highlight-prompt{color:#0000cd}.org-comment,.org-comment-delimiter{color:#b22222}.org-constant{color:#008b8b}.org-diary{color:red}.org-doc{color:#8b2252}.org-error{color:red;font-weight:700}.org-escape-glyph{color:brown}.org-file-name-shadow{color:#7f7f7f}.org-fringe{background-color:#f2f2f2}.org-function-name{color:#00f}.org-glyphless-char{font-size:60%}.org-header-line{color:#333;background-color:#e5e5e5}.org-help-argument-name{font-style:italic}.org-highlight{background-color:#b4eeb4}.org-holiday{background-color:pink}.org-info-header-node{color:brown;font-weight:700;font-style:italic}.org-info-header-xref{color:#3a5fcd;text-decoration:underline}.org-info-index-match{background-color:#ff0}.org-info-menu-header{font-weight:700}.org-info-menu-star{color:red}.org-info-node{color:brown;font-weight:700;font-style:italic}.org-info-title-1{font-size:172%;font-weight:700}.org-info-title-2{font-size:144%;font-weight:700}.org-info-title-3{font-size:120%;font-weight:700}.org-info-title-4{font-weight:700}.org-info-xref{color:#3a5fcd;text-decoration:underline}.org-italic{font-style:italic}.org-keyword{color:#a020f0}.org-lazy-highlight{background-color:#afeeee}.org-link{color:#3a5fcd;text-decoration:underline}.org-link-visited{color:#8b008b;text-decoration:underline}.org-makefile-makepp-perl{background-color:#bfefff}.org-makefile-space{background-color:#ff69b4}.org-makefile-targets{color:#00f}.org-match{background-color:#ff0}.org-next-error{background-color:gtk_selection_bg_color}.org-nobreak-space{color:brown;text-decoration:underline}.org-org-agenda-calenda
r-event,.org-org-agenda-calendar-sexp{color:#000;background-color:#fff}.org-org-agenda-clocking{background-color:#ff0}.org-org-agenda-column-dateline{background-color:#e5e5e5}.org-org-agenda-current-time{color:#b8860b}.org-org-agenda-date{color:#00f}.org-org-agenda-date-today{color:#00f;font-weight:700;font-style:italic}.org-org-agenda-date-weekend{color:#00f;font-weight:700}.org-org-agenda-diary{color:#000;background-color:#fff}.org-org-agenda-dimmed-todo{color:#7f7f7f}.org-org-agenda-done{color:#228b22}.org-org-agenda-filter-category,.org-org-agenda-filter-effort,.org-org-agenda-filter-regexp,.org-org-agenda-filter-tags{color:#000;background-color:#bfbfbf}.org-org-agenda-restriction-lock{background-color:#eee}.org-org-agenda-structure{color:#00f}.org-org-archived,.org-org-block{color:#7f7f7f}.org-org-block-begin-line,.org-org-block-end-line{color:#b22222}.org-org-checkbox{font-weight:700}.org-org-checkbox-statistics-done{color:#228b22;font-weight:700}.org-org-checkbox-statistics-todo{color:red;font-weight:700}.org-org-clock-overlay{color:#000;background-color:#d3d3d3}.org-org-code{color:#7f7f7f}.org-org-column,.org-org-column-title{background-color:#e5e5e5}.org-org-column-title{font-weight:700;text-decoration:underline}.org-org-date{color:#a020f0;text-decoration:underline}.org-org-date-selected{color:red}.org-org-default{color:#000;background-color:#fff}.org-org-document-info{color:#191970}.org-org-document-info-keyword{color:#7f7f7f}.org-org-document-title{color:#191970;font-weight:700}.org-org-done{color:#228b22;font-weight:700}.org-org-drawer{color:#00f}.org-org-ellipsis{color:#b8860b;text-decoration:underline}.org-org-footnote{color:#a020f0;text-decoration:underline}.org-org-formula{color:#b22222}.org-org-headline-done{color:#bc8f8f}.org-org-hide{color:#fff}.org-org-latex-and-related{color:#8b4513}.org-org-level-1{color:#00f}.org-org-level-2{color:sienna}.org-org-level-3{color:#a020f0}.org-org-level-4{color:#b22222}.org-org-level-5{color:#228b22}.org-org-level
-6{color:#008b8b}.org-org-level-7{color:#483d8b}.org-org-level-8{color:#8b2252}.org-org-link{color:#3a5fcd;text-decoration:underline}.org-org-list-dt{font-weight:700}.org-org-macro{color:#8b4513}.org-org-meta-line{color:#b22222}.org-org-mode-line-clock{color:#000;background-color:#bfbfbf}.org-org-mode-line-clock-overrun{color:#000;background-color:red}.org-org-priority{color:#a020f0}.org-org-quote{color:#7f7f7f}.org-org-scheduled{color:#006400}.org-org-scheduled-previously{color:#b22222}.org-org-scheduled-today{color:#006400}.org-org-sexp-date,.org-org-special-keyword{color:#a020f0}.org-org-table{color:#00f}.org-org-tag,.org-org-tag-group{font-weight:700}.org-org-target{text-decoration:underline}.org-org-time-grid{color:#b8860b}.org-org-todo{color:red;font-weight:700}.org-org-upcoming-deadline{color:#b22222}.org-org-verbatim,.org-org-verse{color:#7f7f7f}.org-org-warning{color:red;font-weight:700}.org-outline-1{color:#00f}.org-outline-2{color:sienna}.org-outline-3{color:#a020f0}.org-outline-4{color:#b22222}.org-outline-5{color:#228b22}.org-outline-6{color:#008b8b}.org-outline-7{color:#483d8b}.org-outline-8{color:#8b2252}.org-preprocessor{color:#483d8b}.org-regexp-grouping-backslash,.org-regexp-grouping-construct{font-weight:700}.org-region{background-color:gtk_selection_bg_color}.org-secondary-selection{background-color:#ff0}.org-shadow{color:#7f7f7f}.org-show-paren-match{background-color:#40e0d0}.org-show-paren-mismatch{color:#fff;background-color:#a020f0}.org-string{color:#8b2252}.org-success{color:#228b22;font-weight:700}.org-table-cell{color:#e5e5e5;background-color:#00f}.org-tooltip{color:#000;background-color:#ffffe0}.org-trailing-whitespace{background-color:red}.org-type{color:#228b22}.org-underline{text-decoration:underline}.org-variable-name{color:sienna}.org-warning{color:#ff8c00;font-weight:700}.org-warning-1{color:red;font-weight:700}.title{margin-bottom:.2em}.subtitle,.title{text-align:center}.subtitle{font-size:medium;font-weight:700;margin-top:0}.todo{
color:red}.done,.todo{font-family:monospace}.done{color:green}.priority{color:orange}.priority,.tag{font-family:monospace}.tag{background-color:#eee;font-size:80%;font-weight:400;padding:2px}.timestamp{color:#bebebe}.timestamp-kwd{color:#5f9ea0}.org-right{margin-left:auto;margin-right:0;text-align:right}.org-left{margin-left:0;margin-right:auto;text-align:left}.org-center{margin-left:auto;margin-right:auto;text-align:center}.underline{text-decoration:underline}#postamble p,#preamble p{font-size:90%;margin:.2em}p.verse{margin-left:3%}pre{border:1px solid #ccc;box-shadow:3px 3px 3px #eee;font-family:monospace;margin:1.2em;overflow:auto;padding:8pt}pre.src{overflow:visible;padding-top:1.2em;position:relative}pre.src:before{background-color:#fff;border:1px solid #000;display:none;padding:3px;position:absolute;right:10px;top:-10px}pre.src:hover:before{display:inline}pre.src-bash:before,pre.src-sh:before{content:"sh"}pre.src-emacs-lisp:before{content:"Emacs Lisp"}pre.src-R:before{content:"R"}pre.src-perl:before{content:"Perl"}pre.src-java:before{content:"Java"}pre.src-sql:before{content:"SQL"}table{border-collapse:collapse}caption.t-above{caption-side:top}caption.t-bottom{caption-side:bottom}td,th{vertical-align:top}th.org-center,th.org-left,th.org-right{text-align:center}td.org-right{text-align:right}td.org-left{text-align:left}td.org-center{text-align:center}dt{font-weight:700}.footpara{display:inline}.footdef{margin-bottom:1em}.figure{padding:1em}.figure p{text-align:center}.inlinetask{background:#ffc;border:2px solid gray;margin:10px;padding:10px}#org-div-home-and-up{font-size:70%;text-align:right;white-space:nowrap}textarea{overflow-x:auto}.linenr{font-size:smaller}.code-highlighted{background-color:#ff0}.org-info-js_info-navigation{border-style:none}#org-info-js_console-label{font-size:10px;font-weight:700;white-space:nowrap}.org-info-js_search-highlight{background-color:#ff0;color:#000;font-weight:700}
2 | 


--------------------------------------------------------------------------------
/org/docker-kernel-connection.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "shell_port": 56409,
 3 |   "iopub_port": 56408,
 4 |   "stdin_port": 56410,
 5 |   "control_port": 56406,
 6 |   "hb_port": 56407,
 7 |   "ip": "0.0.0.0",
 8 |   "key": "c2e3bb2a-f80c7b34d4fe02d7e5be87d9",
 9 |   "transport": "tcp",
10 |   "signature_scheme": "hmac-sha256",
11 |   "kernel_name": ""
12 | }


--------------------------------------------------------------------------------
/org/images/AWS_Batch_Architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/AWS_Batch_Architecture.png


--------------------------------------------------------------------------------
/org/images/data_road.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/data_road.png


--------------------------------------------------------------------------------
/org/images/eis.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/eis.png


--------------------------------------------------------------------------------
/org/images/eis_jaccard_on_lists_over_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/eis_jaccard_on_lists_over_time.png


--------------------------------------------------------------------------------
/org/images/eis_mg_prec_over_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/eis_mg_prec_over_time.png


--------------------------------------------------------------------------------
/org/images/eis_mg_recall_over_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/eis_mg_recall_over_time.png


--------------------------------------------------------------------------------
/org/images/eis_model_group_64_feature_group_importances.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/eis_model_group_64_feature_group_importances.png


--------------------------------------------------------------------------------
/org/images/eis_model_group_64_feature_importances.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/eis_model_group_64_feature_importances.png


--------------------------------------------------------------------------------
/org/images/eis_model_group_64_rayid_curve.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/eis_model_group_64_rayid_curve.png


--------------------------------------------------------------------------------
/org/images/facilities_inspected_over_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/facilities_inspected_over_time.png


--------------------------------------------------------------------------------
/org/images/facilities_with_failed_inspections_severe_violations_over_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/facilities_with_failed_inspections_severe_violations_over_time.png


--------------------------------------------------------------------------------
/org/images/facilities_with_inspections_failed_over_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/facilities_with_inspections_failed_over_time.png


--------------------------------------------------------------------------------
/org/images/failed_inspections_over_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/failed_inspections_over_time.png


--------------------------------------------------------------------------------
/org/images/failed_inspections_severe_violations_over_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/failed_inspections_severe_violations_over_time.png


--------------------------------------------------------------------------------
/org/images/inspection_jaccard_on_lists_over_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/inspection_jaccard_on_lists_over_time.png


--------------------------------------------------------------------------------
/org/images/inspection_mg_prec_over_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/inspection_mg_prec_over_time.png


--------------------------------------------------------------------------------
/org/images/inspection_mg_recall_over_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/inspection_mg_recall_over_time.png


--------------------------------------------------------------------------------
/org/images/inspection_model_group_11_feature_group_importances.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/inspection_model_group_11_feature_group_importances.png


--------------------------------------------------------------------------------
/org/images/inspection_model_group_11_feature_importances.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/inspection_model_group_11_feature_importances.png


--------------------------------------------------------------------------------
/org/images/inspection_model_group_11_rayid_curve.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/inspection_model_group_11_rayid_curve.png


--------------------------------------------------------------------------------
/org/images/inspections.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/inspections.png


--------------------------------------------------------------------------------
/org/images/inspections_dt.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/inspections_dt.png


--------------------------------------------------------------------------------
/org/images/inspections_over_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/inspections_over_time.png


--------------------------------------------------------------------------------
/org/images/model_7_tree_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/model_7_tree_0.png


--------------------------------------------------------------------------------
/org/images/outcomes-eis.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/outcomes-eis.png


--------------------------------------------------------------------------------
/org/images/outcomes-inspections.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/outcomes-inspections.png


--------------------------------------------------------------------------------
/org/images/rolling-origin.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/rolling-origin.png


--------------------------------------------------------------------------------
/org/images/sanjose-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/sanjose-2.png


--------------------------------------------------------------------------------
/org/images/simple_test_skeleton.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/simple_test_skeleton.png


--------------------------------------------------------------------------------
/org/images/timechop.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/timechop.png


--------------------------------------------------------------------------------
/org/images/timechop_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/timechop_1.png


--------------------------------------------------------------------------------
/org/images/timechop_10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/timechop_10.png


--------------------------------------------------------------------------------
/org/images/timechop_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/timechop_2.png


--------------------------------------------------------------------------------
/org/images/timechop_3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/timechop_3.png


--------------------------------------------------------------------------------
/org/images/timechop_4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/timechop_4.png


--------------------------------------------------------------------------------
/org/images/timechop_5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/timechop_5.png


--------------------------------------------------------------------------------
/org/images/timechop_6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/timechop_6.png


--------------------------------------------------------------------------------
/org/images/timechop_7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/timechop_7.png


--------------------------------------------------------------------------------
/org/images/timechop_8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/timechop_8.png


--------------------------------------------------------------------------------
/org/images/timechop_9.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/timechop_9.png


--------------------------------------------------------------------------------
/org/images/timechop_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/timechop_example.png


--------------------------------------------------------------------------------
/org/images/timechop_inspections_test.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/timechop_inspections_test.png


--------------------------------------------------------------------------------
/org/images/timechop_withoutblocks.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/timechop_withoutblocks.png


--------------------------------------------------------------------------------
/org/images/timechop_withoutrows.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/images/timechop_withoutrows.png


--------------------------------------------------------------------------------
/org/index.org:
--------------------------------------------------------------------------------
 1 | # -*- mode: org; -*-
 2 | 
 3 | #+TITLE: Dirty Duck: A Guided Tour of Triage
 4 | #+DESCRIPTION:
 5 | #+AUTHOR: Center of Data Science for Public Policy
 6 | #+EMAIL: adolfo@uchicago.edu
 7 | #+STARTUP: showeverything
 8 | #+STARTUP: nohideblocks
 9 | #+STARTUP: indent
10 | #+STARTUP: align
11 | #+STARTUP: inlineimages
12 | #+STARTUP: latexpreview
13 | #+PROPERTY: header-args :cache yes
14 | #+PROPERTY: header-args :eval never-export
15 | #+PROPERTY: header-args:sql :engine postgresql
16 | #+PROPERTY: header-args:sql+ :dbhost 0.0.0.0
17 | #+PROPERTY: header-args:sql+ :dbport 5434
18 | #+PROPERTY: header-args:sql+ :dbuser food_user
19 | #+PROPERTY: header-args:sql+ :dbpassword some_password
20 | #+PROPERTY: header-args:sql+ :database food
21 | #+PROPERTY: header-args:sql+ :results table drawer
22 | #+PROPERTY: header-args:sql+ :exports both
23 | #+PROPERTY: header-args:sql+ :eval no-export
24 | #+PROPERTY: header-args:sql+ :cmdline -q
25 | #+PROPERTY: header-args:sh  :results verbatim org
26 | #+PROPERTY: header-args:sh+ :prologue exec 2>&1 :epilogue :
27 | #+PROPERTY: header-args:ipython  :session food_inspections
28 | #+PROPERTY: header-args:ipython+ :results raw drawer
29 | #+PROPERTY: header-args:ipython+ :eval no-export
30 | #+OPTIONS: broken-links:mark
31 | #+OPTIONS: tasks:todo
32 | #+OPTIONS: LaTeX:t
33 | #+SETUPFILE: tutorial.setup
34 | 
35 | #+INCLUDE: 00_instructions.org :minlevel 1
36 | #+INCLUDE: 01_intro.org :minlevel 1
37 | #+INCLUDE: 02_infrastructure.org :minlevel 1
38 | #+INCLUDE: 03_data_preparation.org :minlevel 1
39 | #+INCLUDE: 04_triage_intro.org :minlevel 1
40 | #+INCLUDE: 05_inspections.org :minlevel 1
41 | #+INCLUDE: 06_eis.org :minlevel 1
42 | #+INCLUDE: 09_aws_batch.org :minlevel 1
43 | #+INCLUDE: 100_whats_next.org :minlevel 1
44 | #+INCLUDE: 07_quick_setup.org :minlevel 1
45 | 
46 | #+EXPORT_EXCLUDE_TAGS: noexport
47 | 


--------------------------------------------------------------------------------
/org/js/jquery.stickytableheaders.min.js:
--------------------------------------------------------------------------------
1 | !function(a,b){"use strict";function c(c,g){var h=this;h.$el=a(c),h.el=c,h.id=e++,h.$window=a(b),h.$document=a(document),h.$el.bind("destroyed",a.proxy(h.teardown,h)),h.$clonedHeader=null,h.$originalHeader=null,h.isSticky=!1,h.hasBeenSticky=!1,h.leftOffset=null,h.topOffset=null,h.init=function(){h.$el.each(function(){var b=a(this);b.css("padding",0),h.$originalHeader=a("thead:first",this),h.$clonedHeader=h.$originalHeader.clone(),b.trigger("clonedHeader."+d,[h.$clonedHeader]),h.$clonedHeader.addClass("tableFloatingHeader"),h.$clonedHeader.css("display","none"),h.$originalHeader.addClass("tableFloatingHeaderOriginal"),h.$originalHeader.after(h.$clonedHeader),h.$printStyle=a('<style type="text/css" media="print">.tableFloatingHeader{display:none !important;}.tableFloatingHeaderOriginal{position:static !important;}</style>'),a("head").append(h.$printStyle)}),h.setOptions(g),h.updateWidth(),h.toggleHeaders(),h.bind()},h.destroy=function(){h.$el.unbind("destroyed",h.teardown),h.teardown()},h.teardown=function(){h.isSticky&&h.$originalHeader.css("position","static"),a.removeData(h.el,"plugin_"+d),h.unbind(),h.$clonedHeader.remove(),h.$originalHeader.removeClass("tableFloatingHeaderOriginal"),h.$originalHeader.css("visibility","visible"),h.$printStyle.remove(),h.el=null,h.$el=null},h.bind=function(){h.$scrollableArea.on("scroll."+d,h.toggleHeaders),h.isWindowScrolling||(h.$window.on("scroll."+d+h.id,h.setPositionValues),h.$window.on("resize."+d+h.id,h.toggleHeaders)),h.$scrollableArea.on("resize."+d,h.toggleHeaders),h.$scrollableArea.on("resize."+d,h.updateWidth)},h.unbind=function(){h.$scrollableArea.off("."+d,h.toggleHeaders),h.isWindowScrolling||(h.$window.off("."+d+h.id,h.setPositionValues),h.$window.off("."+d+h.id,h.toggleHeaders)),h.$scrollableArea.off("."+d,h.updateWidth)},h.toggleHeaders=function(){h.$el&&h.$el.each(function(){var 
b,c=a(this),d=h.isWindowScrolling?isNaN(h.options.fixedOffset)?h.options.fixedOffset.outerHeight():h.options.fixedOffset:h.$scrollableArea.offset().top+(isNaN(h.options.fixedOffset)?0:h.options.fixedOffset),e=c.offset(),f=h.$scrollableArea.scrollTop()+d,g=h.$scrollableArea.scrollLeft(),i=h.isWindowScrolling?f>e.top:d>e.top,j=(h.isWindowScrolling?f:0)<e.top+c.height()-h.$clonedHeader.height()-(h.isWindowScrolling?0:d);i&&j?(b=e.left-g+h.options.leftOffset,h.$originalHeader.css({position:"fixed","margin-top":h.options.marginTop,left:b,"z-index":3}),h.leftOffset=b,h.topOffset=d,h.$clonedHeader.css("display",""),h.isSticky||(h.isSticky=!0,h.updateWidth()),h.setPositionValues()):h.isSticky&&(h.$originalHeader.css("position","static"),h.$clonedHeader.css("display","none"),h.isSticky=!1,h.resetWidth(a("td,th",h.$clonedHeader),a("td,th",h.$originalHeader)))})},h.setPositionValues=function(){var a=h.$window.scrollTop(),b=h.$window.scrollLeft();!h.isSticky||0>a||a+h.$window.height()>h.$document.height()||0>b||b+h.$window.width()>h.$document.width()||h.$originalHeader.css({top:h.topOffset-(h.isWindowScrolling?0:a),left:h.leftOffset-(h.isWindowScrolling?0:b)})},h.updateWidth=function(){if(h.isSticky){h.$originalHeaderCells||(h.$originalHeaderCells=a("th,td",h.$originalHeader)),h.$clonedHeaderCells||(h.$clonedHeaderCells=a("th,td",h.$clonedHeader));var b=h.getWidth(h.$clonedHeaderCells);h.setWidth(b,h.$clonedHeaderCells,h.$originalHeaderCells),h.$originalHeader.css("width",h.$clonedHeader.width())}},h.getWidth=function(c){var d=[];return c.each(function(c){var e,f=a(this);if("border-box"===f.css("box-sizing"))e=f[0].getBoundingClientRect().width;else{var g=a("th",h.$originalHeader);if("collapse"===g.css("border-collapse"))if(b.getComputedStyle)e=parseFloat(b.getComputedStyle(this,null).width);else{var i=parseFloat(f.css("padding-left")),j=parseFloat(f.css("padding-right")),k=parseFloat(f.css("border-width"));e=f.outerWidth()-i-j-k}else 
e=f.width()}d[c]=e}),d},h.setWidth=function(a,b,c){b.each(function(b){var d=a[b];c.eq(b).css({"min-width":d,"max-width":d})})},h.resetWidth=function(b,c){b.each(function(b){var d=a(this);c.eq(b).css({"min-width":d.css("min-width"),"max-width":d.css("max-width")})})},h.setOptions=function(c){h.options=a.extend({},f,c),h.$scrollableArea=a(h.options.scrollableArea),h.isWindowScrolling=h.$scrollableArea[0]===b},h.updateOptions=function(a){h.setOptions(a),h.unbind(),h.bind(),h.updateWidth(),h.toggleHeaders()},h.init()}var d="stickyTableHeaders",e=0,f={fixedOffset:0,leftOffset:0,marginTop:0,scrollableArea:b};a.fn[d]=function(b){return this.each(function(){var e=a.data(this,"plugin_"+d);e?"string"==typeof b?e[b].apply(e):e.updateOptions(b):"destroy"!==b&&a.data(this,"plugin_"+d,new c(this,b))})}}(jQuery,window);


--------------------------------------------------------------------------------
/org/js/readtheorg.js:
--------------------------------------------------------------------------------
 1 | 
 2 | $(function() {
 3 |     $('.note').before("<p class='admonition-title note'>Note</p>");
 4 |     $('.seealso').before("<p class='admonition-title seealso'>See also</p>");
 5 |     $('.warning').before("<p class='admonition-title warning'>Warning</p>");
 6 |     $('.caution').before("<p class='admonition-title caution'>Caution</p>");
 7 |     $('.attention').before("<p class='admonition-title attention'>Attention</p>");
 8 |     $('.tip').before("<p class='admonition-title tip'>Tip</p>");
 9 |     $('.important').before("<p class='admonition-title important'>Important</p>");
10 |     $('.hint').before("<p class='admonition-title hint'>Hint</p>");
11 |     $('.error').before("<p class='admonition-title error'>Error</p>");
12 |     $('.danger').before("<p class='admonition-title danger'>Danger</p>");
13 | });
14 | 
15 | $( document ).ready(function() {
16 | 
17 |     // Shift nav in mobile when clicking the menu.
18 |     $(document).on('click', "[data-toggle='wy-nav-top']", function() {
19 |       $("[data-toggle='wy-nav-shift']").toggleClass("shift");
20 |       $("[data-toggle='rst-versions']").toggleClass("shift");
21 |     });
22 |     // Close menu when you click a link.
23 |     $(document).on('click', ".wy-menu-vertical .current ul li a", function() {
24 |       $("[data-toggle='wy-nav-shift']").removeClass("shift");
25 |       $("[data-toggle='rst-versions']").toggleClass("shift");
26 |     });
27 |     $(document).on('click', "[data-toggle='rst-current-version']", function() {
28 |       $("[data-toggle='rst-versions']").toggleClass("shift-up");
29 |     });
30 |     // Make tables responsive
31 |     $("table.docutils:not(.field-list)").wrap("<div class='wy-table-responsive'></div>");
32 | });
33 | 
34 | $( document ).ready(function() {
35 |     $('#text-table-of-contents ul').first().addClass('nav');
36 |                                         // ScrollSpy also requires that we use
37 |                                         // a Bootstrap nav component.
38 |     $('body').scrollspy({target: '#text-table-of-contents'});
39 | 
40 |     // add sticky table headers
41 |     $('table').stickyTableHeaders();
42 | 
43 |     // set the height of tableOfContents
44 |     var $postamble = $('#postamble');
45 |     var $tableOfContents = $('#table-of-contents');
46 |     $tableOfContents.css({paddingBottom: $postamble.outerHeight()});
47 | 
48 |     // add TOC button
49 |     var toggleSidebar = $('<div id="toggle-sidebar"><a href="#table-of-contents"><h2>Table of Contents</h2></a></div>');
50 |     $('#content').prepend(toggleSidebar);
51 | 
52 |     // add close button when sidebar showed in mobile screen
53 |     var closeBtn = $('<a class="close-sidebar" href="#">Close</a>');
54 |     var tocTitle = $('#table-of-contents').find('h2');
55 |     tocTitle.append(closeBtn);
56 | });
57 | 
58 | window.SphinxRtdTheme = (function (jquery) {
59 |     var stickyNav = (function () {
60 |         var navBar,
61 |             win,
62 |             stickyNavCssClass = 'stickynav',
63 |             applyStickNav = function () {
64 |                 if (navBar.height() <= win.height()) {
65 |                     navBar.addClass(stickyNavCssClass);
66 |                 } else {
67 |                     navBar.removeClass(stickyNavCssClass);
68 |                 }
69 |             },
70 |             enable = function () {
71 |                 applyStickNav();
72 |                 win.on('resize', applyStickNav);
73 |             },
74 |             init = function () {
75 |                 navBar = jquery('nav.wy-nav-side:first');
76 |                 win    = jquery(window);
77 |             };
78 |         jquery(init);
79 |         return {
80 |             enable : enable
81 |         };
82 |     }());
83 |     return {
84 |         StickyNav : stickyNav
85 |     };
86 | }($));
87 | 


--------------------------------------------------------------------------------
/org/js/stickytableheaders-license.txt:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2011 Jonas Mosbech
 2 | 
 3 | Permission is hereby granted, free of charge, to any person obtaining
 4 | a copy of this software and associated documentation files (the
 5 | "Software"), to deal in the Software without restriction, including
 6 | without limitation the rights to use, copy, modify, merge, publish,
 7 | distribute, sublicense, and/or sell copies of the Software, and to
 8 | permit persons to whom the Software is furnished to do so, subject to
 9 | the following conditions:
10 | 
11 | The above copyright notice and this permission notice shall be
12 | included in all copies or substantial portions of the Software.
13 | 
14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


--------------------------------------------------------------------------------
/org/publish.el:
--------------------------------------------------------------------------------
 1 | ;; Bootstrap package.el and make sure `use-package' is available.
 2 | (require 'package)
 3 | 
 4 | ;; NOTE(review): the guard is <= 27, so this still runs on Emacs 27 itself,
 5 | ;; although the earlier comment said it is not needed there -- confirm intent.
 6 | (if (<= emacs-major-version 27)
 7 |     (package-initialize))
 8 | 
 9 | ;; Refresh the archives and install `use-package' when it is missing.
10 | (when (not (package-installed-p 'use-package))
11 |   (package-refresh-contents)
12 |   (package-install 'use-package))
13 | 
14 | ;; Pull in the `use-package' macro.
15 | (eval-when-compile (require 'use-package))
16 | 
17 | ;; `htmlize' is declared with deferred loading.
18 | (use-package htmlize :defer t)
19 | 
20 | (require 'ox-publish)
21 | (setq org-publish-project-alist
22 |       '(
23 |         ;; HTML export of the org files; numbered chapters are excluded (index.org #+INCLUDEs them).
24 |         ("dirtyduck-notes"
25 |          :base-directory "~/projects/dsapp/dirtyduck/org/"
26 |          :base-extension "org"
27 |          :exclude "[[:digit:]][[:digit:]]_.*\\.org" ; "\\." is a literal dot; a single backslash is dropped by the Lisp reader
28 |          :publishing-directory "~/projects/dsapp/dirtyduck/docs/"
29 |          :recursive t
30 |          :publishing-function org-html-publish-to-html
31 |          :headline-levels 4       ; Just the default for this project.
32 |          :auto-preamble t
33 |          :sitemap-title "Dirtyduck"
34 |          )
35 |         ;; GitHub-flavoured Markdown export of the same files (needs ox-gfm).
36 |         ("dirtyduck-notes-md"
37 |          :base-directory "~/projects/dsapp/dirtyduck/org/"
38 |          :base-extension "org"
39 |          :exclude "[[:digit:]][[:digit:]]_.*\\.org"
40 |          :publishing-directory "~/projects/dsapp/dirtyduck/docs/"
41 |          :recursive t
42 |          :publishing-function org-gfm-publish-to-gfm ; takes (plist filename pub-dir); the export command does not
43 |          :headline-levels 4       ; Just the default for this project.
44 |          :auto-preamble t
45 |          :sitemap-title "Dirtyduck"
46 |          )
47 |         ;; Static assets are copied verbatim.
48 |         ("dirtyduck-static"
49 |          :base-directory "~/projects/dsapp/dirtyduck/org/"
50 |          :base-extension "css\\|js\\|png\\|jpg\\|gif\\|pdf\\|mp3\\|ogg\\|swf\\|sql\\|svg\\|yaml"
51 |          :publishing-directory "~/projects/dsapp/dirtyduck/docs/"
52 |          :recursive t
53 |          :publishing-function org-publish-attachment
54 |          )
55 | 
56 |         ;; Meta-project that publishes the three components above.
57 |         ("dirtyduck" :components ("dirtyduck-static" "dirtyduck-notes"  "dirtyduck-notes-md"))
58 | 
59 |         ))
60 | 


--------------------------------------------------------------------------------
/org/ref.bib:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/org/ref.bib


--------------------------------------------------------------------------------
/org/sql/create_cleaned_inspections_table.sql:
--------------------------------------------------------------------------------
 1 | -- Build cleaned.inspections: a normalized copy of raw.inspections.
 2 | create schema if not exists cleaned;
 3 | 
 4 | drop table if exists cleaned.inspections cascade;
 5 | 
 6 | create table cleaned.inspections as (
 7 |     with normalized as (
 8 |         select
 9 |             inspection::integer,
10 |             btrim(lower(results)) as result,
11 |             license_num::integer,
12 |             btrim(lower(dba_name)) as facility,
13 |             btrim(lower(aka_name)) as facility_aka,
14 |             -- Missing facility types are labelled 'unknown'
15 |             coalesce(btrim(lower(facility_type)), 'unknown') as facility_type,
16 |             -- Keep only the text between the parentheses of the risk column
17 |             lower(substring(risk from '\((.+)\)')) as risk,
18 |             btrim(lower(address)) as address,
19 |             zip as zip_code,
20 |             -- Relabel 'liquor' as 'task force', then keep the first matching keyword
21 |             substring(
22 |                 btrim(lower(regexp_replace(type, 'liquor', 'task force', 'gi')))
23 |             from 'canvass|task force|complaint|food poisoning|consultation|license|tag removal') as type,
24 |             date,
25 |             -- geography is used so the measurements are in meters
26 |             ST_SetSRID(ST_MakePoint(longitude, latitude), 4326)::geography as location
27 |         from raw.inspections
28 |         where zip is not null  -- drop rows without a zip code
29 |     )
30 |     select * from normalized where type is not null  -- rows whose type matched no keyword are dropped
31 | );
32 | 


--------------------------------------------------------------------------------
/org/sql/create_semantic_tables.sql:
--------------------------------------------------------------------------------
  1 | create schema if not exists semantic;
  2 | 
  3 | drop table if exists semantic.entities cascade;
  4 | 
  5 | -- semantic.entities: one row per distinct
  6 | -- (license_num, facility, facility_aka, facility_type, address) combination.
  7 | create table semantic.entities as (
  8 |     with first_seen as (
  9 |         select
 10 |             distinct on (license_num, facility, facility_aka, facility_type, address)
 11 |             license_num,
 12 |             facility,
 13 |             facility_aka,
 14 |             facility_type,
 15 |             address,
 16 |             zip_code,
 17 |             location,
 18 |             -- Earliest inspection date within the entity key
 19 |             min(date) over entity_key as start_time,
 20 |             -- Latest date with an 'out of business' / 'business not located' result.
 21 |             -- NOTE(review): this window leaves facility_type out of the partition,
 22 |             -- unlike start_time and the distinct on key -- confirm it is intended.
 23 |             max(date) filter (where result in ('out of business', 'business not located'))
 24 |                 over closure_key as end_time
 25 |         from cleaned.inspections
 26 |         window
 27 |             entity_key as (partition by license_num, facility, facility_aka, facility_type, address),
 28 |             closure_key as (partition by license_num, facility, facility_aka, address)
 29 |         order by
 30 |             license_num, facility, facility_aka, facility_type, address,
 31 |             date asc -- IMPORTANT: distinct on keeps the first row per key, i.e. the earliest date
 32 |     )
 33 | 
 34 |     select
 35 |         row_number() over (order by start_time) as entity_id,
 36 |         license_num,
 37 |         facility,
 38 |         facility_aka,
 39 |         facility_type,
 40 |         address,
 41 |         zip_code,
 42 |         location,
 43 |         start_time,
 44 |         end_time,
 45 |         daterange(start_time, end_time) as activity_period
 46 |     from first_seen
 47 | );
 48 | 
 49 | create index entities_ix on semantic.entities (entity_id);
 50 | create index entities_license_num_ix on semantic.entities (license_num);
 51 | create index entities_facility_ix on semantic.entities (facility);
 52 | create index entities_facility_type_ix on semantic.entities (facility_type);
 53 | create index entities_zip_code_ix on semantic.entities (zip_code);
 54 | 
 55 | -- Spatial index (GiST) on the location column
 56 | create index entities_location_gix on semantic.entities using gist (location);
 57 | 
 58 | create index entities_full_key_ix on semantic.entities (license_num, facility, facility_aka, facility_type, address);  -- composite index over the five columns of the distinct on key
 59 | 
 60 | drop table if exists semantic.events cascade;
 61 | 
 62 | -- semantic.events: one row per inspection, linked to its entity, with the
 63 | -- inspection's violations collected into a single jsonb array.
 64 | create table semantic.events as (
 65 | 
 66 |     with inspections as (
 67 |         -- One row per inspection together with its aggregated violations
 68 |         select
 69 |             i.inspection, i.type, i.date, i.risk, i.result,
 70 |             i.license_num, i.facility, i.facility_aka,
 71 |             i.facility_type, i.address, i.zip_code, i.location,
 72 |             jsonb_agg(
 73 |                 jsonb_build_object(
 74 |                     'code', v.code,
 75 |                     'severity', v.severity,
 76 |                     'description', v.description,
 77 |                     'comment', v.comment
 78 |                 )
 79 |                 -- array elements are sorted by violation code
 80 |                 order by code
 81 |             ) as violations
 82 |         from cleaned.inspections as i
 83 |         join cleaned.violations as v
 84 |             on v.inspection = i.inspection
 85 |         group by
 86 |             i.inspection, i.type, i.date, i.risk, i.result,
 87 |             i.license_num, i.facility, i.facility_aka,
 88 |             i.facility_type, i.address, i.zip_code, i.location
 89 |     )
 90 | 
 91 |     -- Attach every inspection to its entity; facility_type, zip_code and
 92 |     -- location are read from the entity row.
 93 |     select
 94 |         i.inspection as event_id,
 95 |         e.entity_id, i.type, i.date, i.risk, i.result,
 96 |         e.facility_type, e.zip_code, e.location,
 97 |         i.violations
 98 |     from semantic.entities as e
 99 |     join inspections as i
100 |         -- NOTE(review): USING compares with "=", so rows whose facility_aka
101 |         -- (or any other key column) is NULL never match -- confirm intended.
102 |         using (license_num, facility, facility_aka, facility_type, address, zip_code)
103 | );
104 | 
105 | -- Single-column indices (default index method) on semantic.events
106 | create index events_entity_ix on semantic.events (entity_id asc nulls last);
107 | create index events_event_ix on semantic.events (event_id asc nulls last);
108 | create index events_type_ix on semantic.events (type);
109 | create index events_date_ix on semantic.events(date asc nulls last);
110 | create index events_facility_type_ix on semantic.events  (facility_type);
111 | create index events_zip_code_ix on semantic.events  (zip_code);
112 | 
113 | -- Spatial index (GiST) on the location column
114 | create index events_location_gix on semantic.events using gist (location);
115 | 
116 | -- JSONB indices: GIN with the default operator class, and GIN with jsonb_path_ops
117 | create index events_violations on semantic.events using gin(violations);
118 | create index events_violations_json_path on semantic.events using gin(violations jsonb_path_ops);
119 | 
120 | create index events_event_entity_zip_code_date on semantic.events (event_id asc nulls last, entity_id asc nulls last, zip_code, date desc nulls last);  -- composite index, most recent date first
121 | 


--------------------------------------------------------------------------------
/org/sql/create_violations_table.sql:
--------------------------------------------------------------------------------
 1 | -- Build cleaned.violations: one row per violation entry of each inspection.
 2 | drop table if exists cleaned.violations cascade;
 3 | 
 4 | create table cleaned.violations as (
 5 |     with split_violations as (
 6 |         select
 7 |             inspection, license_num, date,
 8 |             regexp_split_to_array(      -- split each entry into code, description, comment
 9 |                 regexp_split_to_table(  -- one row per entry; entries are separated by |
10 |                     coalesce(           -- no violations text: use a bare '- Comments:' entry
11 |                         regexp_replace(violations, '[\n\r]+', '', 'g'),  -- remove line breaks
12 |                         '- Comments:'),
13 |                     '\|'),
14 |                 '(?<=\d+)\.\s*|\s*-\s*Comments:') as tuple
15 |         from raw.inspections
16 |         where license_num is not null
17 |             and results in ('Fail', 'Pass', 'Pass w/ Conditions')
18 |     )
19 | 
20 |     select
21 |         inspection::integer,
22 |         license_num::integer,
23 |         date::date,
24 |         btrim(tuple[1]) as code,
25 |         btrim(tuple[2]) as description,
26 |         btrim(tuple[3]) as comment,
27 |         case
28 |             when btrim(tuple[1]) = '' then NULL  -- checked first so the empty code is never cast to int
29 |             when btrim(tuple[1])::int between 1 and 14 then 'critical' -- From the documentation
30 |             when btrim(tuple[1])::int between 15 and 29 then 'serious'
31 |             else 'minor'
32 |         end as severity
33 |     from split_violations
34 | );
35 | 


--------------------------------------------------------------------------------
/org/triage/experiments:
--------------------------------------------------------------------------------
1 | ../../triage/experiments


--------------------------------------------------------------------------------
/org/triage/images:
--------------------------------------------------------------------------------
1 | ../../triage/images


--------------------------------------------------------------------------------
/org/tutorial.setup:
--------------------------------------------------------------------------------
 1 | # -*- mode: org; -*-
 2 | 
 3 | #+LANGUAGE:  en
 4 | # +OPTIONS: toc:nil h:4 html-postamble:nil html-preamble:t tex:t f:t
 5 | # +INFOJS_OPT: view:info toc:t ltoc:f mouse:underline buttons:0 path:http://thomasf.github.io/solarized-css/org-info.min.js
 6 | # +HTML_HEAD: <link rel="stylesheet" type="text/css" href="css/org.css"/>
 7 | 
 8 | #+HTML_HEAD: <link rel="stylesheet" type="text/css" href="css/htmlize.css"/>
 9 | #+HTML_HEAD: <link rel="stylesheet" type="text/css" href="css/readtheorg.css"/>
10 | 
11 | #+HTML_HEAD: <script src="https://ajax.googleapis.com/ajax/libs/jquery/2.1.3/jquery.min.js"></script>
12 | #+HTML_HEAD: <script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.4/js/bootstrap.min.js"></script>
13 | #+HTML_HEAD: <script type="text/javascript" src="js/jquery.stickytableheaders.min.js"></script>
14 | #+HTML_HEAD: <script type="text/javascript" src="js/readtheorg.js"></script>
15 | 


--------------------------------------------------------------------------------
/requirements-dev.txt:
--------------------------------------------------------------------------------
 1 | Sphinx
 2 | sphinx_rtd_theme
 3 | coverage
 4 | flake8
 5 | mkdocs
 6 | tox
 7 | tox-pyenv
 8 | nose
 9 | mock
10 | colorama
11 | ipython
12 | jupyter
13 | httpie
14 | psycopg2-binary
15 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | ## DSaPP stuff
2 | git+https://github.com/dssg/triage.git
3 | 


--------------------------------------------------------------------------------
/scratch.org:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | * Temp stuff
 4 | 
 5 | 
 6 |   Before doing that, let's check how many distinct =dba_name= values we have.
 7 | 
 8 |   #+BEGIN_SRC sql :results table drawer
 9 |     select
10 |     count(distinct dba_name) as different_names
11 |     from inspections;
12 |   #+END_SRC
13 | 
14 |   #+RESULTS:
15 |   :RESULTS:
16 |   | different_names |
17 |   |----------------|
18 |   |          25107 |
19 |   :END:
20 | 
21 |   #+BEGIN_SRC sql :results table drawer
22 |     select
23 |     dba_name,
24 |     btrim(upper(regexp_replace(replace(dba_name, '''', ''), '[^a-zA-Z0-9 ]', '', 'g'))) as cleaned_name
25 |     from inspections
26 |     limit 30
27 |   #+END_SRC
28 | 
29 |   #+RESULTS:
30 |   :RESULTS:
31 |   | dba_name                                      | cleaned_name                                 |
32 |   |----------------------------------------------+---------------------------------------------|
33 |   | D AND Y GROCERY                              | D AND Y GROCERY                             |
34 |   | ONE STOP FOOD MARKET                         | ONE STOP FOOD MARKET                        |
35 |   | CITGO                                        | CITGO                                       |
36 |   | KHAN DOLLAR STATION                          | KHAN DOLLAR STATION                         |
37 |   | FOSTER & BROADWAY BP/AUTOTECH                | FOSTER  BROADWAY BPAUTOTECH                 |
38 |   | Rizzo's Bar & Inn                            | RIZZOS BAR  INN                             |
39 |   | Rizzo's Bar & Inn                            | RIZZOS BAR  INN                             |
40 |   | SAVE-A-LOT #882                              | SAVEALOT 882                                |
41 |   | MEDITERRANEAN EXPRESS                        | MEDITERRANEAN EXPRESS                       |
42 |   | SWEET FREAKS                                 | SWEET FREAKS                                |
43 |   | MINGHIN CUISINE KITCHEN                      | MINGHIN CUISINE KITCHEN                     |
44 |   | HAPPY GROCERY & DOLLAR                       | HAPPY GROCERY  DOLLAR                       |
45 |   | ARDEN RESTAURANT                             | ARDEN RESTAURANT                            |
46 |   | TBD                                          | TBD                                         |
47 |   | MAGGIE GYROS & CHICKEN                       | MAGGIE GYROS  CHICKEN                       |
48 |   | WOLCOTT TAP                                  | WOLCOTT TAP                                 |
49 |   | WOLCOTT TAP                                  | WOLCOTT TAP                                 |
50 |   | 3JJJ'S BETTER TASTE JAMAICAN JERK RESTAURANT | 3JJJS BETTER TASTE JAMAICAN JERK RESTAURANT |
51 |   | THE HARDING TAVERN                           | THE HARDING TAVERN                          |
52 |   | ZACATACOS, II. INC                           | ZACATACOS II INC                            |
53 |   | ONESTI PIZZERIA INC                          | ONESTI PIZZERIA INC                         |
54 |   | 3JJJ'S BETTER TASTE JAMAICAN JERK RESTAURANT | 3JJJS BETTER TASTE JAMAICAN JERK RESTAURANT |
55 |   | NORMAN'S                                     | NORMANS                                     |
56 |   | MCCB                                         | MCCB                                        |
57 |   | CHECKERS DRIVE-IN RESTAURANTS, INC           | CHECKERS DRIVEIN RESTAURANTS INC            |
58 |   | Rizzo's Bar & Inn                            | RIZZOS BAR  INN                             |
59 |   | GRILL 87                                     | GRILL 87                                    |
60 |   | KFC                                          | KFC                                         |
61 |   | PACO'S TACOS 2                               | PACOS TACOS 2                               |
62 |   | MARTINI CLUB                                 | MARTINI CLUB                                |
63 |   :END:
64 | 


--------------------------------------------------------------------------------
/triage/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/triage/.gitkeep


--------------------------------------------------------------------------------
/triage/audition/eis/distance_from_best_precision@10_pct.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/triage/audition/eis/distance_from_best_precision@10_pct.png


--------------------------------------------------------------------------------
/triage/audition/eis/metric_over_time_precision@10_pct.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/triage/audition/eis/metric_over_time_precision@10_pct.png


--------------------------------------------------------------------------------
/triage/audition/eis/precision@10_pct_next_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/triage/audition/eis/precision@10_pct_next_time.png


--------------------------------------------------------------------------------
/triage/audition/eis/regret_distance_from_best_rules_precision@10_pct.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/triage/audition/eis/regret_distance_from_best_rules_precision@10_pct.png


--------------------------------------------------------------------------------
/triage/audition/eis/regret_over_time_precision@10_pct.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/triage/audition/eis/regret_over_time_precision@10_pct.png


--------------------------------------------------------------------------------
/triage/audition/eis/results_model_group_ids.json:
--------------------------------------------------------------------------------
1 | {"best_current_value_precision@_10_pct": [70, 66, 64], "best_average_value_precision@_10_pct": [66, 64, 72], "lowest_metric_variance_precision@_10_pct": [65, 67, 69], "most_frequent_best_dist_precision@_10_pct_0.05": [64, 66, 70]}


--------------------------------------------------------------------------------
/triage/audition/inspections/distance_from_best_precision@10_pct.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/triage/audition/inspections/distance_from_best_precision@10_pct.png


--------------------------------------------------------------------------------
/triage/audition/inspections/metric_over_time_precision@10_pct.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/triage/audition/inspections/metric_over_time_precision@10_pct.png


--------------------------------------------------------------------------------
/triage/audition/inspections/precision@10_pct_next_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/triage/audition/inspections/precision@10_pct_next_time.png


--------------------------------------------------------------------------------
/triage/audition/inspections/regret_distance_from_best_rules_precision@10_pct.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/triage/audition/inspections/regret_distance_from_best_rules_precision@10_pct.png


--------------------------------------------------------------------------------
/triage/audition/inspections/regret_over_time_precision@10_pct.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/triage/audition/inspections/regret_over_time_precision@10_pct.png


--------------------------------------------------------------------------------
/triage/audition/inspections/results_model_group_ids.json:
--------------------------------------------------------------------------------
1 | {"best_current_value_precision@_10_pct": [41, 45, 32], "best_average_value_precision@_10_pct": [41, 11, 45], "lowest_metric_variance_precision@_10_pct": [6, 20, 27], "most_frequent_best_dist_precision@_10_pct_0.05": [10, 11, 12]}


--------------------------------------------------------------------------------
/triage/eis_audition_config.yaml:
--------------------------------------------------------------------------------
 1 | # CHOOSE MODEL GROUPS
 2 | model_groups:
 3 |     query: |
 4 |         select distinct(model_group_id)
 5 |         from model_metadata.model_groups
 6 |         where model_config ->> 'experiment_type' ~ 'eis'
 7 | # CHOOSE TIMESTAMPS/TRAIN END TIMES
 8 | time_stamps:
 9 |     query: |
10 |         select distinct train_end_time
11 |         from model_metadata.models
12 |         where model_group_id in ({})
13 |         and extract(day from train_end_time) in (1)
14 |         and train_end_time >= '2015-01-01'
15 | # FILTER
16 | filter:
17 |     metric: 'precision@' # metric of interest
18 |     parameter: '10_pct' # parameter of interest
19 |     max_from_best: 1.0 # The maximum amount by which the given metric may be worse than the best model's value for a given train end time.
20 |     threshold_value: 0.0 # The worst absolute value that the given metric is allowed to take.
21 |     distance_table: 'eis_distance_table' # name of the distance table
22 |     models_table: 'models' # name of the models table
23 | 
24 | # RULES
25 | rules:
26 |     -
27 |         shared_parameters:
28 |             -
29 |                 metric: 'precision@'
30 |                 parameter: '10_pct'
31 | 
32 |         selection_rules:
33 |             -
34 |                 name: 'best_current_value' # Pick the model group with the best current metric value
35 |                 n: 3
36 |             -
37 |                 name: 'best_average_value' # Pick the model group with the highest average metric value
38 |                 n: 3
39 |             -
40 |                 name: 'lowest_metric_variance' # Pick the model group with the lowest metric variance
41 |                 n: 3
42 |             -
43 |                 name: 'most_frequent_best_dist' # Pick the model group that is most frequently within `dist_from_best_case` of the best-performing model group
44 |                 dist_from_best_case: [0.05]
45 |                 n: 3
46 | 


--------------------------------------------------------------------------------
/triage/eis_crosstabs_config.yaml:
--------------------------------------------------------------------------------
 1 | output:
 2 |   schema: 'test_results'
 3 |   table: 'eis_crosstabs'
 4 | 
 5 | thresholds:
 6 |     rank_abs: [50]
 7 |     rank_pct: [5]
 8 | 
 9 | # (optional) a list of entity_ids to restrict the crosstabs analysis to
10 | entity_id_list: []
11 | 
12 | models_list_query: "select unnest(ARRAY[226]) :: int as model_id"
13 | 
14 | as_of_dates_query: "select generate_series('2017-12-01'::date, '2018-09-01'::date, interval '1month')  as as_of_date"
15 | 
16 | # Don't change this query unless strictly necessary. It only validates pairs of (model_id, as_of_date):
17 | # it joins them against the distinct (model_id, as_of_date) pairs in a predictions table.
18 | models_dates_join_query: |
19 |   select model_id,
20 |   as_of_date
21 |   from models_list_query as m
22 |   cross join as_of_dates_query a join (select distinct model_id, as_of_date from test_results.predictions) as p
23 |   using (model_id, as_of_date)
24 | 
25 | # features_query must join models_dates_join_query with one or more features tables using as_of_date
26 | features_query: |
27 |   select m.model_id, m.as_of_date, f4.entity_id, f4.results_entity_id_1month_result_fail_avg, f4.results_entity_id_3month_result_fail_avg, f4.results_entity_id_6month_result_fail_avg,
28 |   f2.inspection_types_zip_code_1month_type_canvass_sum, f3.risks_zip_code_1month_risk_high_sum, f4.results_entity_id_6month_result_pass_avg,
29 |   f3.risks_entity_id_all_risk_high_sum, f2.inspection_types_zip_code_3month_type_canvass_sum, f4.results_entity_id_6month_result_pass_sum,
30 |   f2.inspection_types_entity_id_all_type_canvass_sum
31 |   from features.inspection_types_aggregation_imputed as f2
32 |   inner join features.risks_aggregation_imputed as f3 using (entity_id, as_of_date)
33 |   inner join features.results_aggregation_imputed as f4 using (entity_id, as_of_date)
34 |   inner join models_dates_join_query as m using (as_of_date)
35 | 
36 | #the predictions query must return model_id, as_of_date, entity_id, score, label_value, rank_abs and rank_pct
37 | #it must join models_dates_join_query using both model_id and as_of_date
38 | predictions_query: |
39 |   select model_id,
40 |       as_of_date,
41 |       entity_id,
42 |       score,
43 |       label_value,
44 |       coalesce(rank_abs, row_number() over (partition by (model_id, as_of_date) order by score desc)) as rank_abs,
45 |       coalesce(rank_pct*100, ntile(100) over (partition by (model_id, as_of_date) order by score desc)) as rank_pct
46 |       from test_results.predictions
47 |       join models_dates_join_query using(model_id, as_of_date)
48 |       where model_id in (select model_id from models_list_query)
49 |       and as_of_date in (select as_of_date from as_of_dates_query)
50 | 


--------------------------------------------------------------------------------
/triage/eis_postmodeling_config.yaml:
--------------------------------------------------------------------------------
 1 | # Postmodeling Configuration File
 2 | 
 3 |   project_path: '/triage' # Project path defined in triage with matrices and models
 4 |   audition_output_path: '/triage/audition/eis/results_model_group_ids.json'
 5 | 
 6 |   thresholds: # Thresholds for defining positive predictions
 7 |         rank_abs: [50, 100, 250]
 8 |         rank_pct: [5, 10, 25]
 9 | 
10 |   baseline_query: | # SQL query for defining a baseline for comparison in plots. It needs a metric and parameter
11 |       select g.model_group_id,
12 |              m.model_id,
13 |              extract('year' from m.evaluation_end_time) as as_of_date_year,
14 |              m.metric,
15 |              m.parameter,
16 |              m.value,
17 |              m.num_labeled_examples,
18 |              m.num_labeled_above_threshold,
19 |              m.num_positive_labels
20 |        from test_results.evaluations m
21 |        left join model_metadata.models g
22 |        using(model_id)
23 |        where g.model_group_id = 81
24 |              and metric = 'precision@'
25 |              and parameter = '10_pct'
26 | 
27 |   max_depth_error_tree: 5 # For error trees, how deep should the decision trees go?
28 |   n_features_plots: 10 # Number of features for importances
29 |   figsize: [12, 12] # Default size for plots
30 |   fontsize: 20 # Default fontsize for plots
31 | 


--------------------------------------------------------------------------------
/triage/experiments/eis_01.yaml:
--------------------------------------------------------------------------------
  1 | config_version: 'v6'
  2 | 
  3 | model_comment: 'eis: 01'
  4 | 
  5 | user_metadata:
  6 |   label_definition: 'inspected'
  7 |   experiment_type: 'eis'
  8 |   description: |
  9 |     EIS 01
 10 |   purpose: 'model creation'
 11 |   org: 'DSaPP'
 12 |   team: 'Tutorial'
 13 |   author: 'Your name here'
 14 |   etl_date: '2019-02-21'
 15 | 
 16 | model_group_keys:
 17 |   - 'class_path'
 18 |   - 'parameters'
 19 |   - 'feature_names'
 20 |   - 'feature_groups'
 21 |   - 'cohort_name'
 22 |   - 'state'
 23 |   - 'label_name'
 24 |   - 'label_timespan'
 25 |   - 'training_as_of_date_frequency'
 26 |   - 'max_training_history'
 27 |   - 'label_definition'
 28 |   - 'experiment_type'
 29 |   - 'org'
 30 |   - 'team'
 31 |   - 'author'
 32 |   - 'purpose'
 33 |   - 'etl_date'
 34 | 
 35 | label_config:
 36 |   query: |
 37 |     select
 38 |     entity_id,
 39 |     True::integer as outcome
 40 |     from semantic.events
 41 |     where '{as_of_date}'::timestamp <= date
 42 |     and date < '{as_of_date}'::timestamp + interval '{label_timespan}'
 43 |     group by entity_id
 44 |   include_missing_labels_in_train_as: False
 45 |   name: 'inspected'
 46 | 
 47 | cohort_config:
 48 |   query: |
 49 |     with buckets as (
 50 |     select *, ntile(5) over (order by number_of_inspections asc) as bucket
 51 |     from (
 52 |     select entity_id, count(*) as number_of_inspections
 53 |     from semantic.events
 54 |     group by entity_id
 55 |     ) as t
 56 |     )
 57 |     select e.entity_id
 58 |     from semantic.entities as e
 59 |     inner join
 60 |     buckets as b
 61 |     using (entity_id)
 62 |     where
 63 |     daterange(start_time, end_time, '[]') @> '{as_of_date}'::date
 64 |     and bucket in (5)
 65 |   name: 'active_facilities'
 66 | 
 67 | temporal_config:
 68 |     feature_start_time: '2010-01-04'
 69 |     feature_end_time: '2019-01-01'
 70 |     label_start_time: '2015-02-01'
 71 |     label_end_time: '2019-01-01'
 72 | 
 73 |     model_update_frequency: '1y'
 74 |     training_label_timespans: ['1month']
 75 |     training_as_of_date_frequencies: '1month'
 76 | 
 77 |     test_durations: '1y'
 78 |     test_label_timespans: ['1month']
 79 |     test_as_of_date_frequencies: '1month'
 80 | 
 81 |     max_training_histories: '5y'
 82 | 
 83 | feature_aggregations:
 84 |   -
 85 |     prefix: 'inspections'
 86 |     from_obj: 'semantic.events'
 87 |     knowledge_date_column: 'date'
 88 | 
 89 |     aggregates_imputation:
 90 |       count:
 91 |         type: 'zero_noflag'
 92 | 
 93 |     aggregates:
 94 |       -
 95 |         quantity:
 96 |           total: "*"
 97 |         metrics:
 98 |           - 'count'
 99 | 
100 |     intervals: ['1month', '3month', '6month', '1y', 'all']
101 | 
102 |     groups:
103 |       - 'entity_id'
104 | 
105 |   -
106 |     prefix: 'risks'
107 |     from_obj: 'semantic.events'
108 |     knowledge_date_column: 'date'
109 | 
110 |     categoricals_imputation:
111 |       sum:
112 |         type: 'zero'
113 |       avg:
114 |         type: 'zero'
115 | 
116 |     categoricals:
117 |       -
118 |         column: 'risk'
119 |         choices: ['low', 'medium', 'high']
120 |         metrics:
121 |           - 'sum'
122 |           - 'avg'
123 | 
124 |     intervals: ['1month', '3month', '6month', '1y', 'all']
125 | 
126 |     groups:
127 |       - 'entity_id'
128 |       - 'zip_code'
129 | 
130 |   -
131 |     prefix: 'results'
132 |     from_obj: 'semantic.events'
133 |     knowledge_date_column: 'date'
134 | 
135 |     categoricals_imputation:
136 |       all:
137 |         type: 'zero'
138 | 
139 |     categoricals:
140 |       -
141 |         column: 'result'
142 |         choice_query: 'select distinct result from semantic.events'
143 |         metrics:
144 |           - 'sum'
145 |           - 'avg'
146 | 
147 |     intervals: ['1month', '3month', '6month', '1y', 'all']
148 | 
149 |     groups:
150 |       - 'entity_id'
151 | 
152 |   -
153 |     prefix: 'inspection_types'
154 |     from_obj: 'semantic.events'
155 |     knowledge_date_column: 'date'
156 | 
157 |     categoricals_imputation:
158 |       sum:
159 |         type: 'zero_noflag'
160 | 
161 |     categoricals:
162 |       -
163 |         column: 'type'
164 |         choice_query: 'select distinct type from semantic.events where type is not null'
165 |         metrics:
166 |           - 'sum'
167 | 
168 |     intervals: ['1month', '3month', '6month', '1y', 'all']
169 | 
170 |     groups:
171 |       - 'entity_id'
172 |       - 'zip_code'
173 | 
174 | feature_group_definition:
175 |    prefix:
176 |      - 'inspections'
177 |      - 'results'
178 |      - 'risks'
179 |      - 'inspection_types'
180 | 
181 | feature_group_strategies: ['all', 'leave-one-out', 'leave-one-in']
182 | 
183 | grid_config:
184 |     'sklearn.tree.DecisionTreeClassifier':
185 |         max_depth: [2,null]
186 |     'sklearn.ensemble.RandomForestClassifier':
187 |         max_features: ['sqrt']
188 |         criterion: ['gini']
189 |         n_estimators: [500]
190 |         min_samples_leaf: [1]
191 |         min_samples_split: [50]
192 |     'sklearn.dummy.DummyClassifier':
193 |         strategy: [most_frequent]
194 | 
195 | scoring:
196 |     testing_metric_groups:
197 |         -
198 |           metrics: [precision@, recall@]
199 |           thresholds:
200 |             percentiles: [1.0, 2.0, 3.0, 4.0, 5.0, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100]
201 |             top_n: [1, 5, 10, 25, 50, 100, 250, 500, 1000]
202 | 
203 | 
204 |     training_metric_groups:
205 |       -
206 |         metrics: [accuracy]
207 |       -
208 |         metrics: [precision@, recall@]
209 |         thresholds:
210 |           percentiles: [1.0, 2.0, 3.0, 4.0, 5.0, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100]
211 |           top_n: [1, 5, 10, 25, 50, 100, 250, 500, 1000]
212 | 


--------------------------------------------------------------------------------
/triage/experiments/inspections-training.yaml:
--------------------------------------------------------------------------------
  1 | config_version: 'v3'
  2 | 
  3 | model_comment: 'test_triage_inspections'
  4 | 
  5 | temporal_config:
  6 |   feature_start_time=np.min(df.date)
  7 |   feature_end_time=np.max(df.date)
  8 |   label_start_time=np.min(df.date)
  9 |   label_end_time=np.max(df.date)
 10 | 
 11 |   model_update_frequency='3months'
 12 |   training_label_timespans='1day'
 13 |   training_as_of_date_frequencies='1day'
 14 |   max_training_histories='1year'
 15 | 
 16 |   test_durations='1day'
 17 |   test_label_timespans='3month'
 18 |   test_as_of_date_frequencies='1day'
 19 | 
 20 | events_table: 'inspections.events'
 21 | 
 22 | feature_aggregations:
 23 |   -
 24 |     # Count ('sum') and proportion ('avg') of violations for each violation code, grouped by entity
 25 |     prefix: 'violations'
 26 |     from_obj: 'cleaned.violations'
 27 |     knowledge_date_column: 'knowledge_date'
 28 | 
 29 |     categoricals:
 30 |       -
 31 |         column: 'violation_code'
 32 |         choice_query: 'select distinct violation_code from cleaned.violations'
 33 |         metrics:
 34 |           - 'sum'
 35 |           - 'avg'
 36 | 
 37 |     intervals:
 38 |       - '1 y'
 39 | 
 40 |     groups:
 41 |       - 'entity_id'
 42 | 
 43 |   -  # inspections in the last year associated with this entity
 44 |     prefix: 'inspections'
 45 |     from_obj: 'cleaned.inspections'
 46 |     knowledge_date_column: 'date'
 47 |     aggregates:
 48 |       -
 49 |           quantity: '*'
 50 |           metrics:
 51 |               - 'count'
 52 |     intervals:
 53 |       - '1 y'
 54 | 
 55 |     groups:
 56 |       - 'license_num'
 57 | 
 58 |   - # inspections that happened in the last year grouped by type of facility
 59 |     prefix: 'inspections'
 60 |     from_obj: 'cleaned.inspections'
 61 |     knowledge_date_column: 'date'
 62 | 
 63 |     aggregates:
 64 |       -
 65 |           quantity: '*'
 66 |           metrics:
 67 |               - 'count'
 68 |     intervals:
 69 |       - '1 y'
 70 | 
 71 |     groups:
 72 |       - 'facility_type'
 73 | 
 74 |   - # inspections that happened in the last year grouped by zip code
 75 |     prefix: 'inspections'
 76 |     from_obj: 'cleaned.inspections'
 77 |     knowledge_date_column: 'date'
 78 | 
 79 |     aggregates:
 80 |       -
 81 |           quantity: '*'
 82 |           metrics:
 83 |               - 'count'
 84 |     intervals:
 85 |       - '1 y'
 86 | 
 87 |     groups:
 88 |       - 'zip_code'
 89 | 
 90 | feature_group_strategies: ['all']
 91 | 
 92 | model_group_keys: []
 93 | 
 94 | grid_config:
 95 |   'sklearn.tree.DecisionTreeClassifier':
 96 |     criterion: ['gini']
 97 |     max_depth: [3]
 98 |     min_samples_split: [10]
 99 | 
100 | scoring:
101 |   metric_groups:
102 |     -
103 |       metrics: ['precision@', 'recall@', 'fpr@']
104 |       thresholds:
105 |         percentiles: [1.0, 2.0, 5.0, 10.0, 25.0]
106 |         top_n: [25, 75, 150, 300, 500, 1000, 1500]
107 | 


--------------------------------------------------------------------------------
/triage/experiments/inspections_baseline.yaml:
--------------------------------------------------------------------------------
  1 | config_version: 'v6'
  2 | 
  3 | model_comment: 'inspections: baseline'
  4 | 
  5 | user_metadata:
  6 |     label_definition: 'failed'
  7 |     experiment_type: 'inspections prioritization'
  8 |     description: |
  9 |       Baseline calculation
 10 |     purpose: 'baseline'
 11 |     org: 'DSaPP'
 12 |     team: 'Tutorial'
 13 |     author: 'Your name here'
 14 |     etl_date: '2019-02-21'
 15 | 
 16 | model_group_keys:
 17 |   - 'class_path'
 18 |   - 'parameters'
 19 |   - 'feature_names'
 20 |   - 'feature_groups'
 21 |   - 'cohort_name'
 22 |   - 'state'
 23 |   - 'label_name'
 24 |   - 'label_timespan'
 25 |   - 'training_as_of_date_frequency'
 26 |   - 'max_training_history'
 27 |   - 'label_definition'
 28 |   - 'experiment_type'
 29 |   - 'org'
 30 |   - 'team'
 31 |   - 'author'
 32 |   - 'purpose'
 33 |   - 'etl_date'
 34 | 
 35 | temporal_config:
 36 |     feature_start_time: '2010-01-04'
 37 |     feature_end_time: '2019-01-01'
 38 |     label_start_time: '2015-02-01'
 39 |     label_end_time: '2019-01-01'
 40 | 
 41 |     model_update_frequency: '1y'
 42 |     training_label_timespans: ['1month']
 43 |     training_as_of_date_frequencies: '1month'
 44 | 
 45 |     test_durations: '1y'
 46 |     test_label_timespans: ['1month']
 47 |     test_as_of_date_frequencies: '1month'
 48 | 
 49 |     max_training_histories: '5y'
 50 | 
 51 | label_config:
 52 |   query: |
 53 |     select
 54 |     entity_id,
 55 |     bool_or(result = 'fail')::integer as outcome
 56 |     from semantic.events
 57 |     where '{as_of_date}'::timestamp <= date
 58 |     and date < '{as_of_date}'::timestamp + interval '{label_timespan}'
 59 |     group by entity_id
 60 |   name: 'failed_inspections'
 61 | 
 62 | cohort_config:
 63 |   query: |
 64 |     with buckets as (
 65 |     select *, ntile(5) over (order by number_of_inspections asc) as bucket
 66 |     from (
 67 |     select entity_id, count(*) as number_of_inspections
 68 |     from semantic.events
 69 |     group by entity_id
 70 |     ) as t
 71 |     )
 72 |     select e.entity_id
 73 |     from semantic.entities as e
 74 |     inner join
 75 |     buckets as b
 76 |     using (entity_id)
 77 |     where
 78 |     daterange(start_time, end_time, '[]') @> '{as_of_date}'::date
 79 |     and bucket in (5)
 80 |   name: 'active_facilities'
 81 | 
 82 | feature_aggregations:
 83 |   -
 84 |     prefix: 'inspections'
 85 |     from_obj: 'semantic.events'
 86 |     knowledge_date_column: 'date'
 87 | 
 88 |     aggregates_imputation:
 89 |       count:
 90 |         type: 'zero_noflag'
 91 | 
 92 |     aggregates:
 93 |       -
 94 |         quantity:
 95 |           total: "*"
 96 |         metrics:
 97 |           - 'count'
 98 | 
 99 |     intervals: ['all']
100 | 
101 |     groups:
102 |       - 'entity_id'
103 | 
104 | feature_group_definition:
105 |    prefix:
106 |      - 'inspections'
107 | 
108 | feature_group_strategies: ['all']
109 | 
110 | grid_config:
111 |     'sklearn.dummy.DummyClassifier':
112 |         strategy: [prior,uniform, most_frequent]
113 | 
114 | scoring:
115 |     testing_metric_groups:
116 |         -
117 |           metrics: [precision@, recall@]
118 |           thresholds:
119 |             percentiles: [1.0, 2.0, 3.0, 4.0, 5.0, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100]
120 |             top_n: [1, 5, 10, 25, 50, 100, 250, 500, 1000]
121 | 
122 |     training_metric_groups:
123 |       -
124 |         metrics: [accuracy]
125 |       -
126 |         metrics: [precision@, recall@]
127 |         thresholds:
128 |           percentiles: [1.0, 2.0, 3.0, 4.0, 5.0, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100]
129 |           top_n: [1, 5, 10, 25, 50, 100, 250, 500, 1000]
130 | 


--------------------------------------------------------------------------------
/triage/experiments/inspections_dt.yaml:
--------------------------------------------------------------------------------
  1 | config_version: 'v6'
  2 | 
  3 | model_comment: 'inspections: DT'
  4 | 
  5 | user_metadata:
  6 |   label_definition: 'failed'
  7 |   experiment_type: 'inspections prioritization'
  8 |   description: |
  9 |     Decision Tree Classifier
 10 |   purpose: 'data mining'
 11 |   org: 'DSaPP'
 12 |   team: 'Tutorial'
 13 |   author: 'Your name here'
 14 |   etl_date: '2019-02-21'
 15 | 
 16 | model_group_keys:
 17 |   - 'class_path'
 18 |   - 'parameters'
 19 |   - 'feature_names'
 20 |   - 'feature_groups'
 21 |   - 'cohort_name'
 22 |   - 'state'
 23 |   - 'label_name'
 24 |   - 'label_timespan'
 25 |   - 'training_as_of_date_frequency'
 26 |   - 'max_training_history'
 27 |   - 'label_definition'
 28 |   - 'experiment_type'
 29 |   - 'org'
 30 |   - 'team'
 31 |   - 'author'
 32 |   - 'purpose'
 33 |   - 'etl_date'
 34 | 
 35 | temporal_config:
 36 |     feature_start_time: '2010-01-04'
 37 |     feature_end_time: '2019-01-01'
 38 |     label_start_time: '2015-02-01'
 39 |     label_end_time: '2019-01-01'
 40 | 
 41 |     model_update_frequency: '1y'
 42 |     training_label_timespans: ['1month']
 43 |     training_as_of_date_frequencies: '1month'
 44 | 
 45 |     test_durations: '1y'
 46 |     test_label_timespans: ['1month']
 47 |     test_as_of_date_frequencies: '1month'
 48 | 
 49 |     max_training_histories: '5y'
 50 | 
 51 | label_config:
 52 |   query: |
 53 |     select
 54 |     entity_id,
 55 |     bool_or(result = 'fail')::integer as outcome
 56 |     from semantic.events
 57 |     where '{as_of_date}'::timestamp <= date
 58 |     and date < '{as_of_date}'::timestamp + interval '{label_timespan}'
 59 |     group by entity_id
 60 |   name: 'failed_inspections'
 61 | 
 62 | cohort_config:
 63 |   query: |
 64 |     with buckets as (
 65 |     select *, ntile(5) over (order by number_of_inspections asc) as bucket
 66 |     from (
 67 |     select entity_id, count(*) as number_of_inspections
 68 |     from semantic.events
 69 |     group by entity_id
 70 |     ) as t
 71 |     )
 72 |     select e.entity_id
 73 |     from semantic.entities as e
 74 |     inner join
 75 |     buckets as b
 76 |     using (entity_id)
 77 |     where
 78 |     daterange(start_time, end_time, '[]') @> '{as_of_date}'::date
 79 |     and bucket in (5)
 80 |   name: 'active_facilities'
 81 | 
 82 | feature_aggregations:
 83 |   -
 84 |     prefix: 'inspections'
 85 |     from_obj: 'semantic.events'
 86 |     knowledge_date_column: 'date'
 87 | 
 88 |     aggregates_imputation:
 89 |       count:
 90 |         type: 'zero_noflag'
 91 | 
 92 |     aggregates:
 93 |       -
 94 |         quantity:
 95 |           total: "*"
 96 |         metrics:
 97 |           - 'count'
 98 | 
 99 |     intervals: ['1month', '3month', '6month', '1y', 'all']
100 | 
101 |     groups:
102 |       - 'entity_id'
103 | 
104 |   -
105 |     prefix: 'risks'
106 |     from_obj: 'semantic.events'
107 |     knowledge_date_column: 'date'
108 | 
109 |     categoricals_imputation:
110 |       sum:
111 |         type: 'zero'
112 |       avg:
113 |         type: 'zero'
114 | 
115 |     categoricals:
116 |       -
117 |         column: 'risk'
118 |         choices: ['low', 'medium', 'high']
119 |         metrics:
120 |           - 'sum'
121 |           - 'avg'
122 | 
123 |     intervals: ['1month', '3month', '6month', '1y', 'all']
124 | 
125 |     groups:
126 |       - 'entity_id'
127 |       - 'zip_code'
128 | 
129 |   -
130 |     prefix: 'results'
131 |     from_obj: 'semantic.events'
132 |     knowledge_date_column: 'date'
133 | 
134 |     categoricals_imputation:
135 |       all:
136 |         type: 'zero'
137 | 
138 |     categoricals:
139 |       -
140 |         column: 'result'
141 |         choice_query: 'select distinct result from semantic.events'
142 |         metrics:
143 |           - 'sum'
144 |           - 'avg'
145 | 
146 |     intervals: ['1month', '3month', '6month', '1y', 'all']
147 | 
148 |     groups:
149 |       - 'entity_id'
150 | 
151 |   -
152 |     prefix: 'inspection_types'
153 |     from_obj: 'semantic.events'
154 |     knowledge_date_column: 'date'
155 | 
156 |     categoricals_imputation:
157 |       sum:
158 |         type: 'zero_noflag'
159 | 
160 |     categoricals:
161 |       -
162 |         column: 'type'
163 |         choice_query: 'select distinct type from semantic.events where type is not null'
164 |         metrics:
165 |           - 'sum'
166 | 
167 |     intervals: ['1month', '3month', '6month', '1y', 'all']
168 | 
169 |     groups:
170 |       - 'entity_id'
171 |       - 'zip_code'
172 | 
173 | grid_config:
174 |     'sklearn.tree.DecisionTreeClassifier':
175 |         max_depth: [2,10,~]
176 |         min_samples_split: [2,5]
177 | 
178 | feature_group_definition:
179 |    prefix:
180 |      - 'inspections'
181 |      - 'results'
182 |      - 'risks'
183 |      - 'inspection_types'
184 | 
185 | feature_group_strategies: ['all']
186 | 
187 | scoring:
188 |     testing_metric_groups:
189 |         -
190 |           metrics: [precision@, recall@]
191 |           thresholds:
192 |             percentiles: [1.0, 2.0, 3.0, 4.0, 5.0, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100]
193 |             top_n: [1, 5, 10, 25, 50, 100, 250, 500, 1000]
194 | 
195 | 
196 |     training_metric_groups:
197 |       -
198 |         metrics: [accuracy]
199 |       -
200 |         metrics: [precision@, recall@]
201 |         thresholds:
202 |           percentiles: [1.0, 2.0, 3.0, 4.0, 5.0, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100]
203 |           top_n: [1, 5, 10, 25, 50, 100, 250, 500, 1000]
204 | 


--------------------------------------------------------------------------------
/triage/experiments/inspections_label_failed_01.yaml:
--------------------------------------------------------------------------------
  1 | config_version: 'v6'
  2 | 
  3 | model_comment: 'inspections: advanced'
  4 | 
  5 | user_metadata:
  6 |   label_definition: 'failed'
  7 |   experiment_type: 'inspections prioritization'
  8 |   description: |
  9 |     Using Ensamble methods
 10 |   purpose: 'trying ensamble algorithms'
 11 |   org: 'DSaPP'
 12 |   team: 'Tutorial'
 13 |   author: 'Your name here'
 14 |   etl_date: '2019-02-21'
 15 | 
 16 | model_group_keys:
 17 |   - 'class_path'
 18 |   - 'parameters'
 19 |   - 'feature_names'
 20 |   - 'feature_groups'
 21 |   - 'cohort_name'
 22 |   - 'state'
 23 |   - 'label_name'
 24 |   - 'label_timespan'
 25 |   - 'training_as_of_date_frequency'
 26 |   - 'max_training_history'
 27 |   - 'label_definition'
 28 |   - 'experiment_type'
 29 |   - 'org'
 30 |   - 'team'
 31 |   - 'author'
 32 |   - 'purpose'
 33 |   - 'etl_date'
 34 | 
 35 | temporal_config:
 36 |     feature_start_time: '2010-01-04'
 37 |     feature_end_time: '2019-01-01'
 38 |     label_start_time: '2015-02-01'
 39 |     label_end_time: '2019-01-01'
 40 | 
 41 |     model_update_frequency: '1y'
 42 |     training_label_timespans: ['1month']
 43 |     training_as_of_date_frequencies: '1month'
 44 | 
 45 |     test_durations: '1y'
 46 |     test_label_timespans: ['1month']
 47 |     test_as_of_date_frequencies: '1month'
 48 | 
 49 |     max_training_histories: '5y'
 50 | 
 51 | label_config:
 52 |   query: |
 53 |     select
 54 |     entity_id,
 55 |     bool_or(result = 'fail')::integer as outcome
 56 |     from semantic.events
 57 |     where '{as_of_date}'::timestamp <= date
 58 |     and date < '{as_of_date}'::timestamp + interval '{label_timespan}'
 59 |     group by entity_id
 60 |   name: 'failed_inspections'
 61 | 
 62 | 
 63 | cohort_config:
 64 |   query: |
 65 |     with buckets as (
 66 |     select *, ntile(5) over (order by number_of_inspections asc) as bucket
 67 |     from (
 68 |     select entity_id, count(*) as number_of_inspections
 69 |     from semantic.events
 70 |     group by entity_id
 71 |     ) as t
 72 |     )
 73 |     select e.entity_id
 74 |     from semantic.entities as e
 75 |     inner join
 76 |     buckets as b
 77 |     using (entity_id)
 78 |     where
 79 |     daterange(start_time, end_time, '[]') @> '{as_of_date}'::date
 80 |     and bucket in (5)
 81 |   name: 'active_facilities'
 82 | 
 83 | feature_aggregations:
 84 |   -
 85 |     prefix: 'inspections'
 86 |     from_obj: 'semantic.events'
 87 |     knowledge_date_column: 'date'
 88 | 
 89 |     aggregates_imputation:
 90 |       count:
 91 |         type: 'zero_noflag'
 92 | 
 93 |     aggregates:
 94 |       -
 95 |         quantity:
 96 |           total: "*"
 97 |         metrics:
 98 |           - 'count'
 99 | 
100 |     intervals: ['1month', '3month', '6month', '1y', 'all']
101 | 
102 |     groups:
103 |       - 'entity_id'
104 | 
105 |   -
106 |     prefix: 'risks'
107 |     from_obj: 'semantic.events'
108 |     knowledge_date_column: 'date'
109 | 
110 |     categoricals_imputation:
111 |       sum:
112 |         type: 'zero'
113 |       avg:
114 |         type: 'zero'
115 | 
116 |     categoricals:
117 |       -
118 |         column: 'risk'
119 |         choices: ['low', 'medium', 'high']
120 |         metrics:
121 |           - 'sum'
122 |           - 'avg'
123 | 
124 |     intervals: ['1month', '3month', '6month', '1y', 'all']
125 | 
126 |     groups:
127 |       - 'entity_id'
128 |       - 'zip_code'
129 | 
130 |   -
131 |     prefix: 'results'
132 |     from_obj: 'semantic.events'
133 |     knowledge_date_column: 'date'
134 | 
135 |     categoricals_imputation:
136 |       all:
137 |         type: 'zero'
138 | 
139 |     categoricals:
140 |       -
141 |         column: 'result'
142 |         choice_query: 'select distinct result from semantic.events'
143 |         metrics:
144 |           - 'sum'
145 |           - 'avg'
146 | 
147 |     intervals: ['1month', '3month', '6month', '1y', 'all']
148 | 
149 |     groups:
150 |       - 'entity_id'
151 | 
152 |   -
153 |     prefix: 'inspection_types'
154 |     from_obj: 'semantic.events'
155 |     knowledge_date_column: 'date'
156 | 
157 |     categoricals_imputation:
158 |       sum:
159 |         type: 'zero_noflag'
160 | 
161 |     categoricals:
162 |       -
163 |         column: 'type'
164 |         choice_query: 'select distinct type from semantic.events where type is not null'
165 |         metrics:
166 |           - 'sum'
167 | 
168 |     intervals: ['1month', '3month', '6month', '1y', 'all']
169 | 
170 |     groups:
171 |       - 'entity_id'
172 |       - 'zip_code'
173 | 
174 | feature_group_definition:
175 |    prefix:
176 |      - 'inspections'
177 |      - 'results'
178 |      - 'risks'
179 |      - 'inspection_types'
180 | 
181 | feature_group_strategies: ['all', 'leave-one-in', 'leave-one-out']
182 | 
183 | grid_config:
184 |     'sklearn.ensemble.RandomForestClassifier':
185 |         max_features: ['sqrt']
186 |         criterion: ['gini']
187 |         n_estimators: [100, 250]
188 |         min_samples_split: [2,10]
189 | 
190 | scoring:
191 |     testing_metric_groups:
192 |         -
193 |           metrics: [precision@, recall@]
194 |           thresholds:
195 |             percentiles: [1.0, 2.0, 3.0, 4.0, 5.0, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100]
196 |             top_n: [1, 5, 10, 25, 50, 100, 250, 500, 1000]
197 | 
198 |     training_metric_groups:
199 |       -
200 |         metrics: [accuracy]
201 |       -
202 |         metrics: [precision@, recall@]
203 |         thresholds:
204 |           percentiles: [1.0, 2.0, 3.0, 4.0, 5.0, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100]
205 |           top_n: [1, 5, 10, 25, 50, 100, 250, 500, 1000]
206 | 


--------------------------------------------------------------------------------
/triage/experiments/simple_test_skeleton.yaml:
--------------------------------------------------------------------------------
  1 | config_version: 'v6'
  2 | 
  3 | model_comment: 'simple_test_skeleton'
  4 | 
  5 | user_metadata:
  6 |   label_definition: 'failed_inspection'
  7 |   experiment_type: 'test'
  8 |   org: 'DSaPP'
  9 |   team: 'Tutorial'
 10 |   author: 'Adolfo De Unanue'
 11 |   etl_date: '2019-02-21'
 12 | 
 13 | temporal_config:
 14 |     feature_start_time: '2014-01-01'
 15 |     feature_end_time: '2018-01-01'
 16 |     label_start_time: '2014-01-02'
 17 |     label_end_time: '2018-01-01'
 18 | 
 19 |     model_update_frequency: '1y'
 20 | 
 21 |     max_training_histories: '1y'
 22 |     training_label_timespans: ['1y']
 23 |     training_as_of_date_frequencies: '1month'
 24 | 
 25 |     test_durations: '0d'
 26 |     test_label_timespans: ['1y']
 27 |     test_as_of_date_frequencies: '1month'
 28 | 
 29 | cohort_config:
 30 |     query: |
 31 |       select entity_id
 32 |       from semantic.entities
 33 |       where
 34 |       license_num in (1596210, 1874347, 1142451)
 35 |       and daterange(start_time, end_time, '[]') @> '{as_of_date}'::date
 36 |     name: 'test_facilities'
 37 | 
 38 | label_config:
 39 |   query: |
 40 |     select
 41 |     entity_id,
 42 |     bool_or(result = 'fail')::integer as outcome
 43 |     from semantic.events
 44 |     where '{as_of_date}'::timestamp <= date
 45 |     and date < '{as_of_date}'::timestamp + interval '{label_timespan}'
 46 |     group by entity_id
 47 |   name: 'failed_inspections'
 48 | 
 49 | grid_config:
 50 |     'sklearn.dummy.DummyClassifier':
 51 |         strategy: [most_frequent]
 52 | 
 53 | feature_aggregations:
 54 |   -
 55 |     prefix: 'inspections'
 56 |     from_obj: 'semantic.events'
 57 |     knowledge_date_column: 'date'
 58 | 
 59 |     aggregates_imputation:
 60 |       count:
 61 |         type: 'zero_noflag'
 62 | 
 63 |     aggregates:
 64 |       -
 65 |         quantity:
 66 |           total: "*"
 67 |         metrics:
 68 |           - 'count'
 69 | 
 70 |     intervals: ['1month', '3month', '6month', '1y', 'all']
 71 | 
 72 |     groups:
 73 |       - 'entity_id'
 74 | 
 75 | 
 76 |   -
 77 |     prefix: 'risks'
 78 |     from_obj: 'semantic.events'
 79 |     knowledge_date_column: 'date'
 80 | 
 81 |     categoricals_imputation:
 82 |       sum:
 83 |         type: 'zero'
 84 |       avg:
 85 |         type: 'zero'
 86 | 
 87 |     categoricals:
 88 |       -
 89 |         column: 'risk'
 90 |         choices: ['low', 'medium', 'high']
 91 |         metrics:
 92 |           - 'sum'
 93 |           - 'avg'
 94 | 
 95 |     intervals: ['1month', '3month', '6month', '1y', 'all']
 96 | 
 97 |     groups:
 98 |       - 'entity_id'
 99 |       - 'zip_code'
100 | 
101 |   -
102 |     prefix: 'results'
103 |     from_obj: 'semantic.events'
104 |     knowledge_date_column: 'date'
105 | 
106 |     categoricals_imputation:
107 |       all:
108 |         type: 'zero'
109 | 
110 |     categoricals:
111 |       -
112 |         column: 'result'
113 |         choice_query: 'select distinct result from semantic.events'
114 |         metrics:
115 |           - 'sum'
116 |           - 'avg'
117 | 
118 |     intervals:
119 |       - '6month'
120 | 
121 |     groups:
122 |       - 'entity_id'
123 | 
124 | feature_group_definition:
125 |   prefix:
126 |     - 'results'
127 |     - 'risks'
128 |     - 'inspections'
129 | 
130 | feature_group_strategies: ['all']
131 | 
132 | model_group_keys:
133 |   - 'class_path'
134 |   - 'parameters'
135 |   - 'feature_names'
136 |   - 'feature_groups'
137 |   - 'cohort_name'
138 |   - 'state'
139 |   - 'label_name'
140 |   - 'label_timespan'
141 |   - 'training_as_of_date_frequency'
142 |   - 'max_training_history'
143 |   - 'label_definition'
144 |   - 'experiment_type'
145 |   - 'org'
146 |   - 'team'
147 |   - 'author'
148 |   - 'etl_date'
149 | 
150 | scoring:
151 |   testing_metric_groups:
152 |     -
153 |       metrics: ['precision@', 'recall@']
154 |       thresholds:
155 |         percentiles: [1.0, 5.0, 10.0, 25.0, 50.0, 75.0, 100.0]
156 |         top_n: [1, 5, 10, 25, 50, 100, 150, 300, 500, 1000, 1500]
157 |   training_metric_groups:
158 |     -
159 |       metrics: ['accuracy']
160 |     -
161 |       metrics: ['precision@', 'recall@']
162 |       thresholds:
163 |         percentiles: [1.0, 5.0, 10.0, 25.0, 50.0, 75.0, 100.0]
164 |         top_n: [1, 5, 10, 25, 50, 100, 150, 300, 500, 1000, 1500]
165 | 


--------------------------------------------------------------------------------
/triage/inspection_audition_config.yaml:
--------------------------------------------------------------------------------
 1 | # CHOOSE MODEL GROUPS
 2 | model_groups:
 3 |     query: |
 4 |         select distinct(model_group_id)
 5 |         from model_metadata.model_groups
 6 |         where model_config ->> 'experiment_type' ~ 'inspection'
 7 | # CHOOSE TIMESTAMPS/TRAIN END TIMES
 8 | time_stamps:
 9 |     query: |
10 |         select distinct train_end_time
11 |         from model_metadata.models
12 |         where model_group_id in ({})
13 |         and extract(day from train_end_time) in (1)
14 |         and train_end_time >= '2015-01-01'
15 | # FILTER
16 | filter:
17 |     metric: 'precision@' # metric of interest
18 |     parameter: '10_pct' # parameter of interest
19 |     max_from_best: 1.0 # The maximum value that the given metric can be worse than the best model for a given train end time.
20 |     threshold_value: 0.0 # The worst absolute value that the given metric should be.
21 |     distance_table: 'inspections_distance_table' # name of the distance table
22 |     models_table: 'models' # name of the models table
23 | 
24 | # RULES
25 | rules:
26 |     -
27 |         shared_parameters:
28 |             -
29 |                 metric: 'precision@'
30 |                 parameter: '10_pct'
31 | 
32 |         selection_rules:
33 |             -
34 |                 name: 'best_current_value' # Pick the model group with the best current metric value
35 |                 n: 3
36 |             -
37 |                 name: 'best_average_value' # Pick the model with the highest average metric value
38 |                 n: 3
39 |             -
40 |                 name: 'lowest_metric_variance' # Pick the model with the lowest metric variance
41 |                 n: 3
42 |             -
43 |                 name: 'most_frequent_best_dist' # Pick the model that is most frequently within `dist_from_best_case`
44 |                 dist_from_best_case: [0.05]
45 |                 n: 3
46 | 


--------------------------------------------------------------------------------
/triage/inspection_postmodeling_config.yaml:
--------------------------------------------------------------------------------
 1 | # Postmodeling Configuration File
 2 | 
 3 | project_path: '/triage' # Project path defined in triage with matrices and models
 4 | model_group_id:
 5 |   - 41
 6 |   - 32
 7 |   - 45
 8 |   - 11
 9 | 
10 | thresholds: # Thresholds for defining positive predictions
11 |   rank_abs: [50, 100, 250]
12 |   rank_pct: [5, 10, 25]
13 | 
14 | baseline_query: | # SQL query for defining a baseline for comparison in plots. It needs a metric and parameter
15 |       select g.model_group_id,
16 |              m.model_id,
17 |              extract('year' from m.evaluation_end_time) as as_of_date_year,
18 |              m.metric,
19 |              m.parameter,
20 |              m.value,
21 |              m.num_labeled_examples,
22 |              m.num_labeled_above_threshold,
23 |              m.num_positive_labels
24 |        from test_results.evaluations m
25 |        left join model_metadata.models g
26 |        using(model_id)
27 |        where g.model_group_id = 1
28 |              and metric = 'precision@'
29 |              and parameter = '10_pct'
30 | 
 31 | max_depth_error_tree: 5 # For error trees, how deep should the decision trees go?
32 | n_features_plots: 10 # Number of features for importances
33 | figsize: [12, 12] # Default size for plots
34 | fontsize: 20 # Default fontsize for plots
35 | 


--------------------------------------------------------------------------------
/triage/output/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/triage/output/.gitkeep


--------------------------------------------------------------------------------
/triage/output/images/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/triage/output/images/.gitkeep


--------------------------------------------------------------------------------
/triage/output/images/model_7_tree_0.svg:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8" standalone="no"?>
 2 | <!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
 3 |  "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
 4 | <!-- Generated by graphviz version 2.38.0 (20140413.2041)
 5 |  -->
 6 | <!-- Title: Tree Pages: 1 -->
 7 | <svg width="421pt" height="186pt"
 8 |  viewBox="0.00 0.00 421.00 186.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
 9 | <g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 182)">
10 | <title>Tree</title>
11 | <polygon fill="white" stroke="none" points="-4,4 -4,-182 417,-182 417,4 -4,4"/>
12 | <!-- 0 -->
13 | <g id="node1" class="node"><title>0</title>
14 | <path fill="#e58139" fill-opacity="0.701961" stroke="black" d="M401,-178C401,-178 12,-178 12,-178 6,-178 0,-172 0,-166 0,-166 0,-112 0,-112 0,-106 6,-100 12,-100 12,-100 401,-100 401,-100 407,-100 413,-106 413,-112 413,-112 413,-166 413,-166 413,-172 407,-178 401,-178"/>
15 | <text text-anchor="start" x="8" y="-163.8" font-family="Helvetica,sans-Serif" font-size="14.00">inspections_zip_code_3month_type_complaint_sum ≤ 0.5</text>
16 | <text text-anchor="start" x="163" y="-149.8" font-family="Helvetica,sans-Serif" font-size="14.00">gini = 0.353</text>
17 | <text text-anchor="start" x="146" y="-135.8" font-family="Helvetica,sans-Serif" font-size="14.00">samples = 26018</text>
18 | <text text-anchor="start" x="126.5" y="-121.8" font-family="Helvetica,sans-Serif" font-size="14.00">value = [20060, 5958]</text>
19 | <text text-anchor="start" x="171.5" y="-107.8" font-family="Helvetica,sans-Serif" font-size="14.00">class = y</text>
20 | <text text-anchor="start" x="233.5" y="-107.8" font-family="Helvetica,sans-Serif" baseline-shift="sub" font-size="14.00">0</text>
21 | </g>
22 | <!-- 1 -->
23 | <g id="node2" class="node"><title>1</title>
24 | <path fill="#e58139" fill-opacity="0.666667" stroke="black" d="M190.5,-64C190.5,-64 38.5,-64 38.5,-64 32.5,-64 26.5,-58 26.5,-52 26.5,-52 26.5,-12 26.5,-12 26.5,-6 32.5,-0 38.5,-0 38.5,-0 190.5,-0 190.5,-0 196.5,-0 202.5,-6 202.5,-12 202.5,-12 202.5,-52 202.5,-52 202.5,-58 196.5,-64 190.5,-64"/>
25 | <text text-anchor="start" x="71" y="-49.8" font-family="Helvetica,sans-Serif" font-size="14.00">gini = 0.375</text>
26 | <text text-anchor="start" x="54" y="-35.8" font-family="Helvetica,sans-Serif" font-size="14.00">samples = 21708</text>
27 | <text text-anchor="start" x="34.5" y="-21.8" font-family="Helvetica,sans-Serif" font-size="14.00">value = [16281, 5427]</text>
28 | <text text-anchor="start" x="79.5" y="-7.8" font-family="Helvetica,sans-Serif" font-size="14.00">class = y</text>
29 | <text text-anchor="start" x="141.5" y="-7.8" font-family="Helvetica,sans-Serif" baseline-shift="sub" font-size="14.00">0</text>
30 | </g>
31 | <!-- 0&#45;&gt;1 -->
32 | <g id="edge1" class="edge"><title>0&#45;&gt;1</title>
33 | <path fill="none" stroke="black" d="M173.026,-99.7956C164.998,-90.6335 156.419,-80.8428 148.374,-71.6603"/>
34 | <polygon fill="black" stroke="black" points="150.987,-69.332 141.765,-64.1172 145.722,-73.9451 150.987,-69.332"/>
35 | <text text-anchor="middle" x="140.118" y="-85.3629" font-family="Helvetica,sans-Serif" font-size="14.00">True</text>
36 | </g>
37 | <!-- 2 -->
38 | <g id="node3" class="node"><title>2</title>
39 | <path fill="#e58139" fill-opacity="0.858824" stroke="black" d="M366.5,-64C366.5,-64 232.5,-64 232.5,-64 226.5,-64 220.5,-58 220.5,-52 220.5,-52 220.5,-12 220.5,-12 220.5,-6 226.5,-0 232.5,-0 232.5,-0 366.5,-0 366.5,-0 372.5,-0 378.5,-6 378.5,-12 378.5,-12 378.5,-52 378.5,-52 378.5,-58 372.5,-64 366.5,-64"/>
40 | <text text-anchor="start" x="256" y="-49.8" font-family="Helvetica,sans-Serif" font-size="14.00">gini = 0.216</text>
41 | <text text-anchor="start" x="243.5" y="-35.8" font-family="Helvetica,sans-Serif" font-size="14.00">samples = 4310</text>
42 | <text text-anchor="start" x="228.5" y="-21.8" font-family="Helvetica,sans-Serif" font-size="14.00">value = [3779, 531]</text>
43 | <text text-anchor="start" x="264.5" y="-7.8" font-family="Helvetica,sans-Serif" font-size="14.00">class = y</text>
44 | <text text-anchor="start" x="326.5" y="-7.8" font-family="Helvetica,sans-Serif" baseline-shift="sub" font-size="14.00">0</text>
45 | </g>
46 | <!-- 0&#45;&gt;2 -->
47 | <g id="edge2" class="edge"><title>0&#45;&gt;2</title>
48 | <path fill="none" stroke="black" d="M240.338,-99.7956C248.453,-90.6335 257.125,-80.8428 265.258,-71.6603"/>
49 | <polygon fill="black" stroke="black" points="267.929,-73.9237 271.939,-64.1172 262.689,-69.2824 267.929,-73.9237"/>
50 | <text text-anchor="middle" x="273.451" y="-85.3714" font-family="Helvetica,sans-Serif" font-size="14.00">False</text>
51 | </g>
52 | </g>
53 | </svg>
54 | 


--------------------------------------------------------------------------------
/triage/selection_rules/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/dirtyduck/db78911a6c35dd000145f75993650413c435c721/triage/selection_rules/.gitkeep


--------------------------------------------------------------------------------
/triage/selection_rules/rules.yaml:
--------------------------------------------------------------------------------
 1 | -
 2 |   shared_parameters:
 3 |     -
 4 |       metric: 'precision@'
 5 |       parameter: '50_abs'
 6 |   selection_rules:
 7 |     -
 8 |       name: best_current_value
 9 |       n: 1
10 |     -
11 |       name: best_average_value
12 |       n: 1
13 |     -
14 |       name: lowest_metric_variance
15 |       n: 1
16 |     -
17 |       name: most_frequent_best_dist
18 |       dist_from_best_case: [0.05]
19 |       n: 1
20 | 


--------------------------------------------------------------------------------
/triage/session.key:
--------------------------------------------------------------------------------
1 | c2e3bb2a-f80c7b34d4fe02d7e5be87d9
2 | 


--------------------------------------------------------------------------------
/tutorial.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | set -e -u
  4 | 
  5 | PROJECT="triage-dirtyduck"
  6 | PROJECT_HOME="$( cd "$( dirname "$0" )" && pwd )"
  7 | INFRASTRUCTURE_HOME="${PROJECT_HOME}/infrastructure"
  8 | 
  9 | cd "$INFRASTRUCTURE_HOME"
 10 | 
 11 | function help_menu () {
 12 | cat << EOF
 13 | Usage: ${0} {start|stop|build|rebuild|run|logs|status|destroy|all|}
 14 | 
 15 | OPTIONS:
 16 |    -h|help             Show this message
 17 |    start
 18 |    stop
 19 |    rebuild
 20 |    status
 21 |    destroy
 22 |    -t|triage
 23 |    -a|all
 24 | 
 25 | INFRASTRUCTURE:
 26 |    Build the infrastructure:
 27 |         $ ./tutorial.sh start
 28 | 
 29 |    Check the status of the containers:
 30 |         $ ./tutorial.sh status
 31 | 
 32 |    Stop the tutorial's infrastructure:
 33 |         $ ./tutorial.sh stop
 34 | 
 35 |    Destroy all the resources related to the tutorial:
 36 |         $ ./tutorial.sh destroy
 37 | 
 38 |    View the infrastructure logs:
 39 |         $ ./tutorial.sh -l
 40 | 
 41 | EXPERIMENTS:
 42 |    NOTE:
 43 |       The following commands assume that "sample_experiments.yaml"
 44 |       is located inside the triage/experiments directory
 45 | 
 46 |    Run one experiment:
 47 |         $ ./tutorial.sh -t --config_file sample_experiment_config.yaml run
 48 | 
 49 |    Run one experiment, do not replace existing matrices or models, and enable debug:
 50 |         $ ./tutorial.sh -t --config_file sample_experiment_config.yaml --no-replace --debug run
 51 | 
 52 |    Validate experiment configuration file:
 53 |         $ ./tutorial.sh triage --config_file sample_experiment_config.yaml validate
 54 | 
 55 |    Show the experiment's temporal cross-validation blocks:
 56 |         $ ./tutorial.sh -t --config_file sample_experiment_config.yaml show-temporal-blocks
 57 | 
 58 |    Plot model number 4 (for Decision Trees and Random Forests):
 59 |         $ ./tutorial.sh -t --config_file sample_experiment_config.yaml show_model_plot --model 4
 60 | 
 61 |    Triage help:
 62 |         $ ./tutorial.sh triage --help
 63 | 
 64 | EOF
 65 | }
 66 | 
 67 | function start_infrastructure () {
 68 |     docker-compose --project-name ${PROJECT} up -d food_db
 69 | 	#tyra reverseproxy api
 70 | }
 71 | 
 72 | function stop_infrastructure () {
 73 | 	docker-compose  --project-name ${PROJECT} stop
 74 | }
 75 | 
 76 | function build_images () {
 77 | 	docker-compose  --project-name ${PROJECT} build "${@}"
 78 | }
 79 | 
 80 | function destroy () {
 81 | 	docker-compose  --project-name ${PROJECT} down --rmi all --remove-orphans --volumes
 82 | }
 83 | 
 84 | function infrastructure_logs () {
 85 |     docker-compose --project-name ${PROJECT} logs -f -t
 86 | }
 87 | 
 88 | function status () {
 89 | 	docker-compose --project-name ${PROJECT} ps
 90 | }
 91 | 
 92 | function bastion () {
 93 |     docker-compose --project-name ${PROJECT} run --service-ports  --rm --name tutorial_bastion bastion
 94 | }
 95 | 
 96 | function triage () {
 97 | 	docker-compose  --project-name ${PROJECT} run --rm --name triage_experiment triage "${@}"
 98 | }
 99 | 
# Convenience entry point: build the images, start the infrastructure and
# then print the container status, in that order.
function all () {
    local step
    for step in build_images start_infrastructure status; do
        "${step}"
    done
}
105 | 
106 | 
# Dispatch the first command-line argument to the matching helper.
# Arguments: $1   - command or flag (see help_menu).
#            $2.. - only used by -t|triage, which forwards them unchanged.
# Returns:   the status of the helper that ran; 0 when only the help text
#            was shown; 1 for an unknown command.
#
# There are deliberately no `shift` calls: nothing reads the positional
# parameters after the dispatch, and shifting an already exhausted list
# returns non-zero, which under `set -e` used to abort a successful
# single-argument run (e.g. `start`) with exit status 1.
function main () {
    # Without arguments there is nothing to run: show the usage text.
    if [[ $# -eq 0 ]] ; then
        help_menu
        return 0
    fi

    case "$1" in
        start)
            start_infrastructure
            ;;
        stop)
            stop_infrastructure
            ;;
        build)
            build_images
            ;;
        rebuild)
            build_images --no-cache
            ;;
        -d|destroy)
            destroy
            ;;
        -l|logs)
            infrastructure_logs
            ;;
        status)
            status
            ;;
        -t|triage)
            # Quoted so that arguments containing spaces or glob characters
            # reach triage exactly as the user typed them.
            triage "${@:2}"
            ;;
        bastion)
            bastion
            ;;
        -a|--all|all)
            all
            ;;
        -h|--help|help)
            help_menu
            ;;
        *)
            # Diagnostics go to stderr and the failure is reported to the
            # caller regardless of how many arguments were given.
            echo "${1} is not a valid flag, try running: ${0} --help" >&2
            return 1
            ;;
    esac
}

main "$@"

# Go back to the directory the script was started from, silently.
cd - > /dev/null
165 | 


--------------------------------------------------------------------------------