├── .bandit.yml ├── .codeclimate.yml ├── .dockerignore ├── .editorconfig ├── .github └── workflows │ ├── build-mkdocs.yaml │ ├── publish-to-pypi.yml │ └── test.yaml ├── .gitignore ├── .python-version.current ├── .pyup.yml ├── .travis.yml ├── AUTHORS.rst ├── CONTRIBUTING.md ├── Dockerfile ├── HISTORY.rst ├── LICENSE ├── MANIFEST.in ├── README.md ├── develop ├── dirtyduck ├── .dockerignore ├── docker-compose.yml └── food_db │ ├── 00_create_extensions.sql │ ├── 01_create_inspections_table.sql │ ├── 02_create_cleaned_inspections_table.sql │ ├── 03_create_violations_table.sql │ ├── 04_create_semantic_tables.sql │ ├── Dockerfile │ └── inspections_2014_2017.csv.bz2 ├── docs ├── __init__.py ├── md_autogen.py ├── mkdocs.yml ├── sources │ ├── api │ │ ├── audition │ │ │ ├── audition-config.md │ │ │ ├── auditioner.md │ │ │ ├── database-dependencies.md │ │ │ ├── index.md │ │ │ └── selection_rules.md │ │ └── timechop │ │ │ ├── index.md │ │ │ ├── plotting.md │ │ │ └── timechop.md │ ├── audition │ │ ├── audition_intro.md │ │ ├── images │ │ │ └── sanjose-2.png │ │ └── model_selection.md │ ├── db.md │ ├── dirtyduck │ │ ├── .swp │ │ ├── AUTHORS │ │ ├── aws_batch.md │ │ ├── choose_your_own_adventure.md │ │ ├── data_preparation.md │ │ ├── dirty_duckling.md │ │ ├── eis.md │ │ ├── for_the_impatient.md │ │ ├── images │ │ │ ├── AWS_Batch_Architecture.png │ │ │ ├── AWS_Batch_Architecture.svg │ │ │ ├── EDA │ │ │ │ ├── facilities_inspected_over_time.png │ │ │ │ ├── facilities_inspections_over_time.png │ │ │ │ ├── facilities_with_failed_inspections_severe_violations_over_time.png │ │ │ │ ├── facilities_with_inspections_failed_over_time.png │ │ │ │ ├── failed_inspections_over_time.png │ │ │ │ ├── failed_inspections_severe_violations_over_time.png │ │ │ │ └── inspections_over_time.png │ │ │ ├── audition │ │ │ │ ├── eis │ │ │ │ │ ├── distance_from_best_precision@10_pct.png │ │ │ │ │ ├── metric_over_time_precision@10_pct.png │ │ │ │ │ ├── precision@10_pct_next_time.png │ │ │ │ │ ├── regret_distance_from_best_rules_precision@10_pct.png │ │ │ │ │ └── regret_over_time_precision@10_pct.png │ │ │ │ └── inspections │ │ │ │ │ ├── distance_from_best_precision@10_pct.png │ │ │ │ │ ├── metric_over_time_precision@10_pct.png │ │ │ │ │ ├── precision@10_pct_next_time.png │ │ │ │ │ ├── regret_distance_from_best_rules_precision@10_pct.png │ │ │ │ │ └── regret_over_time_precision@10_pct.png │ │ │ ├── data_road.png │ │ │ ├── outcomes-eis.png │ │ │ ├── outcomes-inspections.png │ │ │ ├── postmodeling │ │ │ │ ├── eis_jaccard_on_lists_over_time.png │ │ │ │ ├── eis_mg_prec_over_time.png │ │ │ │ ├── eis_mg_recall_over_time.png │ │ │ │ ├── eis_model_group_64_feature_group_importances.png │ │ │ │ ├── eis_model_group_64_feature_importances.png │ │ │ │ ├── eis_postmodeling_config.yaml │ │ │ │ ├── inspection_jaccard_on_lists_over_time.png │ │ │ │ ├── inspection_mg_prec_over_time.png │ │ │ │ ├── inspection_mg_recall_over_time.png │ │ │ │ ├── inspection_model_group_39_model_125_feature_group_importances.png │ │ │ │ ├── inspection_model_group_39_model_125_feature_importances.png │ │ │ │ ├── inspection_model_group_39_model_125_rayid_curve.png │ │ │ │ └── inspection_postmodeling_config.yaml │ │ │ ├── quickstart.png │ │ │ ├── rolling-origin.png │ │ │ ├── sanjose-2.png │ │ │ ├── timechop │ │ │ │ ├── timechop_1.png │ │ │ │ ├── timechop_10.png │ │ │ │ ├── timechop_2.png │ │ │ │ ├── timechop_3.png │ │ │ │ ├── timechop_4.png │ │ │ │ ├── timechop_5.png │ │ │ │ ├── timechop_6.png │ │ │ │ ├── timechop_7.png │ │ │ │ ├── timechop_8.png │ │ │ │ └── timechop_9.png │ │ 
│ └── triage │ │ │ │ ├── distance_from_best_precision@10_pct.png │ │ │ │ ├── eis_01.png │ │ │ │ ├── inspections_baseline.png │ │ │ │ ├── inspections_dt.png │ │ │ │ ├── inspections_label_failed_01.png │ │ │ │ ├── metric_over_time_precision@10_pct.png │ │ │ │ ├── precision@10_pct_next_time.png │ │ │ │ ├── regret_distance_from_best_rules_precision@10_pct.png │ │ │ │ ├── regret_over_time_precision@10_pct.png │ │ │ │ ├── results_model_group_ids.json │ │ │ │ └── simple_test_skeleton.png │ │ ├── index.md │ │ ├── infrastructure.md │ │ ├── inspections.md │ │ ├── ml_governance.md │ │ ├── problem_description.md │ │ ├── triage_intro.md │ │ └── who_is_this_tutorial_for.md │ ├── experiments │ │ ├── algorithm.md │ │ ├── architecture.md │ │ ├── cohort-labels.md │ │ ├── experiment-config.md │ │ ├── feature-testing.md │ │ ├── features.md │ │ ├── featuretest-cli.png │ │ ├── featuretest-result.png │ │ ├── prediction-ranking.md │ │ ├── running.md │ │ ├── temporal-validation.md │ │ ├── temporal_config_graph.png │ │ ├── timechops.png │ │ ├── upgrade-to-v5.md │ │ ├── upgrade-to-v6.md │ │ ├── upgrade-to-v7.md │ │ └── upgrade-to-v8.md │ ├── index.md │ ├── js │ │ └── mermaid.min.js │ ├── postmodeling │ │ ├── index.md │ │ ├── postmodeling-config.md │ │ └── postmodeling_general_flow.png │ ├── predictlist │ │ └── index.md │ ├── quickstart.md │ ├── triage.experiments.base.md │ ├── triage.experiments.multicore.md │ ├── triage.experiments.singlethreaded.md │ ├── triage_docs.css │ └── triage_project_workflow.md └── update_docs.py ├── example ├── aws_batch │ ├── aws_env.example │ ├── credentials.filter.example │ ├── deploy.sh │ ├── triage-job-definition.json.example │ └── triage-overrides.json.example ├── cohort │ └── past_events.sql ├── colab │ └── colab_triage.ipynb ├── config │ ├── README.md │ ├── audition.yaml │ ├── database.yaml │ ├── dirty-duckling.yaml │ ├── experiment.yaml │ ├── postmodeling_config.yaml │ └── postmodeling_crosstabs.yaml ├── dirtyduck │ ├── audition │ │ ├── eis_audition_config.yaml │ │ ├── inspection_audition_config.yaml │ │ └── inspections │ │ │ ├── distance_from_best_precision@10_pct.png │ │ │ ├── distance_from_best_precision@15_pct.png │ │ │ ├── metric_over_time_precision@10_pct.png │ │ │ ├── metric_over_time_precision@15_pct.png │ │ │ ├── precision@10_pct_next_time.png │ │ │ ├── precision@15_pct_next_time.png │ │ │ ├── regret_distance_from_best_rules_precision@10_pct.png │ │ │ ├── regret_distance_from_best_rules_precision@15_pct.png │ │ │ ├── regret_over_time_precision@10_pct.png │ │ │ ├── regret_over_time_precision@15_pct.png │ │ │ └── results_model_group_ids.json │ ├── crosstabs │ │ └── eis_crosstabs_config.yaml │ ├── experiments │ │ ├── dirty-duckling.yaml │ │ ├── eis_01.yaml │ │ ├── eis_02.yaml │ │ ├── inspections_baseline.yaml │ │ ├── inspections_dt.yaml │ │ ├── inspections_label_failed_01.yaml │ │ ├── inspections_label_failed_02.yaml │ │ └── simple_test_skeleton.yaml │ ├── images │ │ ├── distance_from_best_precision@10_pct.png │ │ ├── eis_01.png │ │ ├── inspections_baseline.png │ │ ├── inspections_dt.png │ │ ├── inspections_label_failed_01.png │ │ ├── metric_over_time_precision@10_pct.png │ │ ├── precision@10_pct_next_time.png │ │ ├── regret_distance_from_best_rules_precision@10_pct.png │ │ ├── regret_over_time_precision@10_pct.png │ │ ├── results_model_group_ids.json │ │ └── simple_test_skeleton.png │ ├── output │ │ ├── .gitkeep │ │ └── images │ │ │ ├── .gitkeep │ │ │ ├── eis.svg │ │ │ ├── inspections.svg │ │ │ ├── inspections_dt.svg │ │ │ ├── inspections_test.svg │ │ │ ├── 
model_7_tree_0.svg │ │ │ └── simple_test_skeleton.svg │ └── postmodeling │ │ ├── database.yaml │ │ ├── eis_postmodeling_config.yaml │ │ ├── inspection_jaccard_on_lists_over_time.png │ │ ├── inspection_mg_prec_over_time.png │ │ ├── inspection_mg_recall_over_time.png │ │ ├── inspection_model_group_39_model_125_feature_group_importances.png │ │ ├── inspection_model_group_39_model_125_feature_importances.png │ │ ├── inspection_model_group_39_model_125_rayid_curve.png │ │ └── inspection_postmodeling_config.yaml └── label │ └── events.sql ├── manage.py ├── pytest.ini ├── requirement ├── dev.txt ├── extras-rq.txt ├── include │ ├── build.txt │ ├── lint.txt │ └── test-management.txt ├── main.txt └── test.txt ├── setup.cfg ├── setup.py ├── src ├── tests │ ├── __init__.py │ ├── architect_tests │ │ ├── README.md │ │ ├── __init__.py │ │ ├── test_builders.py │ │ ├── test_entity_date_table_generators.py │ │ ├── test_feature_dictionary_creator.py │ │ ├── test_feature_generators.py │ │ ├── test_feature_group_creator.py │ │ ├── test_feature_group_mixer.py │ │ ├── test_integration.py │ │ ├── test_label_generators.py │ │ ├── test_planner.py │ │ └── utils.py │ ├── audition_tests │ │ ├── __init__.py │ │ ├── test_audition.py │ │ ├── test_distance_from_best.py │ │ ├── test_model_group_performance.py │ │ ├── test_plotting.py │ │ ├── test_preaudition.py │ │ ├── test_regrets.py │ │ ├── test_rules_maker.py │ │ ├── test_selection_rule_grid.py │ │ ├── test_selection_rule_performance.py │ │ ├── test_selection_rules.py │ │ ├── test_thresholding.py │ │ └── utils.py │ ├── catwalk_tests │ │ ├── README.md │ │ ├── __init__.py │ │ ├── test_baselines.py │ │ ├── test_estimators.py │ │ ├── test_evaluation.py │ │ ├── test_feature_importances.py │ │ ├── test_individual_importance.py │ │ ├── test_individual_importance_uniform.py │ │ ├── test_integration.py │ │ ├── test_metrics.py │ │ ├── test_model_grouping.py │ │ ├── test_model_trainers.py │ │ ├── test_predictors.py │ │ ├── test_protected_groups_generators.py │ │ ├── test_storage.py │ │ ├── test_utils.py │ │ └── utils.py │ ├── collate_tests │ │ ├── __init__.py │ │ ├── create_inspections_subset.py │ │ ├── food_inspections_subset.csv │ │ ├── initialize_db.py │ │ ├── test_collate.py │ │ ├── test_from_obj.py │ │ ├── test_helpers.py │ │ ├── test_imputation_output.py │ │ ├── test_imputations.py │ │ ├── test_integration.py │ │ └── test_spacetime.py │ ├── conftest.py │ ├── example_schema.yaml │ ├── postmodeling_tests │ │ ├── test_add_predictions.py │ │ ├── test_crosstabs.py │ │ ├── test_model_evaluator.py │ │ ├── test_model_group_evaluator.py │ │ └── test_without_predictions.py │ ├── results_tests │ │ ├── __init__.py │ │ ├── factories.py │ │ ├── test_factories.py │ │ ├── test_upgrade_if_clean.py │ │ └── test_valid_schema.py │ ├── test_cli.py │ ├── test_database_reflection.py │ ├── test_defaults.py │ ├── test_experiments.py │ ├── test_partial_experiments.py │ ├── test_predictlist.py │ ├── test_tracking_experiments.py │ ├── test_utils.py │ ├── test_utils_pandas.py │ ├── test_validation.py │ ├── test_validation_primitives.py │ ├── timechop_tests │ │ ├── __init__.py │ │ ├── test_plotting.py │ │ ├── test_timechop.py │ │ └── test_utils.py │ └── utils.py └── triage │ ├── __init__.py │ ├── cli.py │ ├── component │ ├── __init__.py │ ├── architect │ │ ├── README.md │ │ ├── __init__.py │ │ ├── builders.py │ │ ├── database_reflection.py │ │ ├── entity_date_table_generators.py │ │ ├── feature_dictionary_creator.py │ │ ├── feature_generators.py │ │ ├── feature_group_creator.py │ │ ├── 
feature_group_mixer.py │ │ ├── features.py │ │ ├── label_generators.py │ │ ├── planner.py │ │ ├── utils.py │ │ └── validations.py │ ├── audition │ │ ├── Audition_Tutorial.ipynb │ │ ├── README.md │ │ ├── __init__.py │ │ ├── distance_from_best.py │ │ ├── metric_directionality.py │ │ ├── model_group_performance.py │ │ ├── plotting.py │ │ ├── pre_audition.py │ │ ├── regrets.py │ │ ├── rules_maker.py │ │ ├── selection_rule_grid.py │ │ ├── selection_rule_performance.py │ │ ├── selection_rules.py │ │ ├── thresholding.py │ │ └── utils.py │ ├── catwalk │ │ ├── README.rst │ │ ├── __init__.py │ │ ├── baselines │ │ │ ├── __init__.py │ │ │ ├── rankers.py │ │ │ └── thresholders.py │ │ ├── db.py │ │ ├── estimators │ │ │ ├── __init__.py │ │ │ ├── classifiers.py │ │ │ ├── dsapp_scaler.org │ │ │ └── transformers.py │ │ ├── evaluation.py │ │ ├── exceptions.py │ │ ├── feature_importances.py │ │ ├── individual_importance │ │ │ ├── __init__.py │ │ │ └── uniform.py │ │ ├── metrics.py │ │ ├── model_grouping.py │ │ ├── model_trainers.py │ │ ├── predictors.py │ │ ├── protected_groups_generators.py │ │ ├── storage.py │ │ ├── subsetters.py │ │ └── utils.py │ ├── collate │ │ ├── README.rst │ │ ├── __init__.py │ │ ├── collate.py │ │ ├── from_obj.py │ │ ├── imputations.py │ │ ├── spacetime.py │ │ └── sql.py │ ├── postmodeling │ │ ├── __init__.py │ │ ├── add_predictions.py │ │ ├── add_predictions_example_config.yaml │ │ ├── base.py │ │ ├── crosstabs.py │ │ ├── deprecated │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── db_credentials_example.yaml │ │ │ ├── model_analyzer.py │ │ │ ├── model_evaluator.py │ │ │ ├── model_group_evaluator.py │ │ │ ├── parameters.py │ │ │ ├── postmodeling_analyzer.py │ │ │ ├── postmodeling_tutorial.ipynb │ │ │ └── utils │ │ │ │ ├── __init__.py │ │ │ │ └── aux_funcs.py │ │ ├── error_analysis.py │ │ ├── error_analysis_example.ipynb │ │ ├── example_experiment_summary_given_experiment_hashes.ipynb │ │ ├── example_experiment_summary_report_specific_experiment_wSubsets.ipynb │ │ ├── example_generate_experiment_summary_report_automatically_after_experiment_run.ipynb │ │ ├── example_triage_report.html │ │ ├── experiment_summarizer.py │ │ ├── experiment_summary_report_template.ipynb │ │ ├── fairness │ │ │ ├── __init__.py │ │ │ ├── aequitas_example.ipynb │ │ │ └── aequitas_utils.py │ │ ├── list_analysis.py │ │ ├── postmodeling_config.yaml │ │ ├── postmodeling_report_example_acdhs_housing.ipynb │ │ ├── readme.md │ │ └── utils │ │ │ └── __init__.py │ ├── results_schema │ │ ├── README.md │ │ ├── __init__.py │ │ ├── alembic.ini │ │ ├── alembic │ │ │ ├── README │ │ │ ├── __init__.py │ │ │ ├── env.py │ │ │ ├── script.py.mako │ │ │ └── versions │ │ │ │ ├── 079a74c15e8b_merge_b097e47ba829_with_cdd0dc9d9870.py │ │ │ │ ├── 0bca1ba9706e_add_matrix_uuid_to_eval.py │ │ │ │ ├── 0d44655e35fd_.py │ │ │ │ ├── 1b990cbc04e4_production_schema.py │ │ │ │ ├── 2446a931de7a_changing_column_names_and_removing_.py │ │ │ │ ├── 264245ddfce2_.py │ │ │ │ ├── 264786a9fe85_add_label_value_to_prodcution_table.py │ │ │ │ ├── 38f37d013686_associate_experiments_with_models_and_.py │ │ │ │ ├── 3ce027594a5c_add_hashes_to_runs.py │ │ │ │ ├── 45219f25072b_hash_partitioning_predictions_tables.py │ │ │ │ ├── 4ae804cc0977_.py │ │ │ │ ├── 50e1f1bc2cac_add_subsets.py │ │ │ │ ├── 5dd2ba8222b1_add_run_type.py │ │ │ │ ├── 609c7cc51794_rankify_predictions.py │ │ │ │ ├── 670289044eb2_add_production_prediction_metadata.py │ │ │ │ ├── 72ac5cbdca05_change_importance_to_float.py │ │ │ │ ├── 7d57d1cf3429_.py │ │ │ │ ├── 89a8ce240bae_.py │ │ │ │ ├── 
8b3f167d0418_.py │ │ │ │ ├── 8cef808549dd_.py │ │ │ │ ├── 97cf99b7348f_evaluation_randomness.py │ │ │ │ ├── 9bbfdcf8bab0_.py │ │ │ │ ├── __init__.py │ │ │ │ ├── a20104116533_.py │ │ │ │ ├── a98acf92fd48_add_nuke_triage_function.py │ │ │ │ ├── b097e47ba829_remove_random_seed_from_experiments.py │ │ │ │ ├── b4d7569d31cb_aequitas.py │ │ │ │ ├── cdd0dc9d9870_rename_production_schema_and_prediction_table.py │ │ │ │ ├── ce5b50ffa8e2_break_ties_in_list_predictions.py │ │ │ │ ├── cfd5c3386014_add_experiment_runs.py │ │ │ │ ├── d0ac573eaf1a_model_group_stored_procedure.py │ │ │ │ └── fa1760d35710_.py │ │ ├── example_db_config.yaml │ │ ├── schema.py │ │ └── sql │ │ │ ├── model_group_stored_procedure.sql │ │ │ ├── nuke_triage.sql │ │ │ ├── predictions_hash_partitioning.sql │ │ │ └── undo_predictions_hash_partitioning.sql │ └── timechop │ │ ├── README.md │ │ ├── __init__.py │ │ ├── plotting.py │ │ ├── timechop.py │ │ └── utils.py │ ├── config │ └── logging.yaml │ ├── database_reflection.py │ ├── experiments │ ├── __init__.py │ ├── base.py │ ├── defaults.py │ ├── model_grid_presets.yaml │ ├── multicore.py │ ├── rq.py │ ├── singlethreaded.py │ └── validate.py │ ├── predictlist │ ├── __init__.py │ └── utils.py │ ├── tracking.py │ ├── util │ ├── __init__.py │ ├── conf.py │ ├── db.py │ ├── defaults.py │ ├── introspection.py │ ├── pandas.py │ ├── random.py │ └── structs.py │ └── validation_primitives.py ├── tox.ini └── tutorial.sh /.bandit.yml: -------------------------------------------------------------------------------- 1 | skips: ['B101'] 2 | -------------------------------------------------------------------------------- /.codeclimate.yml: -------------------------------------------------------------------------------- 1 | version: "2" # required to adjust maintainability checks 2 | plugins: 3 | pep8: 4 | enabled: true 5 | checks: 6 | argument-count: 7 | config: 8 | threshold: 5 9 | complex-logic: 10 | config: 11 | threshold: 4 12 | file-lines: 13 | config: 14 | threshold: 400 15 | method-complexity: 16 | config: 17 | threshold: 5 18 | method-count: 19 | config: 20 | threshold: 20 21 | method-lines: 22 | config: 23 | threshold: 25 24 | nested-control-flow: 25 | config: 26 | threshold: 4 27 | return-statements: 28 | config: 29 | threshold: 4 30 | similar-code: 31 | enabled: false 32 | identical-code: 33 | enabled: false 34 | exclude_patterns: 35 | - "src/triage/component/results_schema/alembic/versions" 36 | - "src/triage/component/results_schema/alembic/env.py" 37 | - "src/triage/component/results_schema/schema.py" 38 | - "docs/" 39 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | .cache 2 | .config 3 | docker 4 | docs 5 | example 6 | develop 7 | .git 8 | .hypothesis 9 | .local 10 | .pytest_cache 11 | tox.ini 12 | .travis.yml 13 | tutorial.sh 14 | .pyup.yml 15 | .python-version 16 | .python-version.current 17 | *.rst 18 | *.yml 19 | *.inc 20 | CONTRIBUTING.md 21 | database.yaml 22 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | # http://editorconfig.org 2 | 3 | root = true 4 | 5 | [*] 6 | indent_style = space 7 | indent_size = 4 8 | trim_trailing_whitespace = true 9 | insert_final_newline = true 10 | charset = utf-8 11 | end_of_line = lf 12 | 13 | [*.bat] 14 | indent_style = tab 15 | end_of_line = crlf 16 | 17 | [LICENSE] 18 | 
insert_final_newline = false 19 | 20 | [Makefile] 21 | indent_style = tab 22 | -------------------------------------------------------------------------------- /.github/workflows/build-mkdocs.yaml: -------------------------------------------------------------------------------- 1 | name: Build Docs and Serve to GitHub Pages 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | paths: 8 | - 'docs/**' 9 | 10 | jobs: 11 | docs: 12 | name: Build Docs and Serve to GitHub Pages 13 | runs-on: ubuntu-latest 14 | steps: 15 | - name: Checkout repository 16 | uses: actions/checkout@v4 17 | with: 18 | fetch-depth: 0 19 | 20 | - name: Set up Python 3.9 21 | uses: actions/setup-python@v4 22 | with: 23 | python-version: 3.9 24 | 25 | - name: Install dependencies 26 | run: | 27 | pip install --upgrade pip && pip install -r requirement/dev.txt 28 | pip install git+https://github.com/dssg/triage.git@master 29 | git config user.name 'github-actions[bot]' && git config user.email 'github-actions[bot]@users.noreply.github.com' 30 | 31 | - name: Publish docs 32 | run: mkdocs gh-deploy -f "$(pwd)/docs/mkdocs.yml" 33 | -------------------------------------------------------------------------------- /.github/workflows/publish-to-pypi.yml: -------------------------------------------------------------------------------- 1 | name: Publish Python distributions to PyPI 2 | 3 | on: 4 | push: 5 | tags: 6 | - '*' 7 | 8 | jobs: 9 | build: 10 | name: Build and publish Python distributions to PyPI 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v4 14 | - name: Set up Python 15 | uses: actions/setup-python@v4 16 | with: 17 | python-version: 3.9 18 | - name: Install pypa/build 19 | run: >- 20 | python -m 21 | pip install 22 | build 23 | --user 24 | - name: Build a binary wheel and a source tarball 25 | run: python -m build 26 | - name: Store the distribution packages 27 | uses: actions/upload-artifact@v4 28 | with: 29 | name: python-package-distributions 30 | path: dist/ 31 | publish-to-pypi: 32 | name: Publish Python distribution to PyPI 33 | if: startsWith(github.ref, 'refs/tags/') 34 | needs: 35 | - build 36 | runs-on: ubuntu-latest 37 | environment: 38 | name: pypi 39 | url: https://pypi.org/p/triage 40 | steps: 41 | - name: Download all the dists 42 | uses: actions/download-artifact@v4 43 | with: 44 | name: python-package-distributions 45 | path: dist/ 46 | - name: Publish distribution to PyPI 47 | uses: pypa/gh-action-pypi-publish@release/v1 48 | with: 49 | password: ${{ secrets.PYPI_API_TOKEN }} 50 | 51 | -------------------------------------------------------------------------------- /.github/workflows/test.yaml: -------------------------------------------------------------------------------- 1 | name: Python package 2 | 3 | on: 4 | push: 5 | tags: 6 | - '*' 7 | 8 | 9 | jobs: 10 | build: 11 | runs-on: ubuntu-latest 12 | strategy: 13 | matrix: 14 | python-version: ['3.9', '3.10'] 15 | 16 | steps: 17 | - uses: actions/checkout@v3 18 | - name: Set up Python ${{ matrix.python-version }} 19 | uses: actions/setup-python@v4 20 | with: 21 | python-version: ${{ matrix.python-version }} 22 | - name: Install system dependencies 23 | run: | 24 | sudo apt-get update 25 | sudo apt-get install -y libblas-dev liblapack-dev libatlas-base-dev gfortran 26 | - name: Install dependencies 27 | run: | 28 | python -m pip install --upgrade pip 29 | pip install flake8 pytest 30 | pip install -r requirement/include/build.txt 31 | pip install -r requirement/include/test-management.txt 32 | - name: Test with tox 33 | run: |
tox 35 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | *.egg-info/ 3 | .eggs/ 4 | build/ 5 | dist/ 6 | .cache/ 7 | .coverage* 8 | .tox/ 9 | docs/site/ 10 | **/.hypothesis/ 11 | 12 | /.install.*.bash.inc 13 | /.python-version 14 | .DS_Store 15 | 16 | .ipynb_checkpoints/ 17 | venv/ 18 | my_db_config.yaml 19 | database.yaml 20 | database*.yaml 21 | dirtyduck/triage/** 22 | 23 | *~ 24 | **/trained_models/** 25 | **/matrices 26 | 27 | *.swp 28 | src/triage/component/postmodeling/postmodeling_tmp.ipynb 29 | src/triage/component/postmodeling/model_class_testing.py 30 | src/triage/component/postmodeling/model_class_testing.ipynb 31 | src/triage/component/postmodeling/model_groups_testing.ipynb -------------------------------------------------------------------------------- /.python-version.current: -------------------------------------------------------------------------------- 1 | triage-3.9.10 2 | -------------------------------------------------------------------------------- /.pyup.yml: -------------------------------------------------------------------------------- 1 | # autogenerated pyup.io config file 2 | # see https://pyup.io/docs/configuration/ for all available options 3 | 4 | schedule: every month 5 | 6 | requirements: 7 | - requirement/dev.txt 8 | - requirement/main.txt 9 | - requirement/test.txt 10 | - requirement/include/build.txt 11 | - requirement/include/lint.txt 12 | - requirement/include/test-management.txt 13 | -------------------------------------------------------------------------------- /AUTHORS.rst: -------------------------------------------------------------------------------- 1 | ======= 2 | Credits 3 | ======= 4 | 5 | Development Lead 6 | ---------------- 7 | 8 | Center for Data Science and Public Policy 9 | 10 | Contributors 11 | ------------ 12 | 13 | - Adolfo De Unánue 14 | - Andrea Navarrete 15 | - Avishek Kumar 16 | - Benedict Kuester 17 | - Eddie Lin 18 | - Eric Potash 19 | - Erika Salomon 20 | - Hannes Koenig 21 | - Jesse London 22 | - Joe Walsh 23 | - Kit Rodolfa 24 | - Klaus Ackermann 25 | - Matt Bauman 26 | - Rayid Ghani 27 | - Tristan Crockett 28 | -------------------------------------------------------------------------------- /HISTORY.rst: -------------------------------------------------------------------------------- 1 | ======= 2 | History 3 | ======= 4 | 5 | 0.1.0 (2016-10-19) 6 | ------------------ 7 | 8 | * First release on PyPI. 9 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Created by Data Science and Public Policy, University of Chicago 2 | 3 | MIT License 4 | 5 | Copyright (c) 2019 Data Science and Public Policy, University of Chicago 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 
16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include AUTHORS.rst 2 | include CONTRIBUTING.md 3 | include HISTORY.rst 4 | include LICENSE 5 | include README.md 6 | 7 | recursive-include requirement *.txt 8 | recursive-include src alembic.ini 9 | recursive-include src *.sql 10 | recursive-include src/tests * 11 | recursive-include docs *.rst conf.py Makefile make.bat *.jpg *.png *.gif 12 | recursive-include src *.yaml 13 | 14 | recursive-exclude * __pycache__ 15 | recursive-exclude * *.py[co] 16 | -------------------------------------------------------------------------------- /dirtyduck/.dockerignore: -------------------------------------------------------------------------------- 1 | food_db/* 2 | docker-compose.yml 3 | -------------------------------------------------------------------------------- /dirtyduck/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.7" 2 | 3 | services: 4 | food_db: 5 | build: 6 | context: ./food_db 7 | image: dirtyduck/db 8 | container_name: dirtyduck_db 9 | environment: 10 | - POSTGRES_HOST=0.0.0.0 11 | - POSTGRES_USER=food_user 12 | - POSTGRES_PORT=5432 13 | - POSTGRES_PASSWORD=some_password 14 | - POSTGRES_DB=food 15 | volumes: 16 | - db-data:/var/lib/postgresql/data 17 | ports: 18 | - "5434:5432" 19 | 20 | bastion: 21 | build: 22 | context: ..
23 | dockerfile: Dockerfile 24 | target: development 25 | image: dsapp/triage:development 26 | container_name: dirtyduck_bastion 27 | tty: true 28 | environment: 29 | - PS1=\[$$(tput setaf 4)$$(tput bold)[\]\u@$$(tput setaf 2)$$(tput smul)dirtyduck$$(tput rmul)$$(tput setaf 4)$$:\\w]#\[$$(tput sgr0) ]\ 30 | - TRIAGE_IMAGE=dirtyduck 31 | - DATABASE_URL=postgresql://food_user:some_password@dirtyduck_db/food 32 | - TRIAGE_OUTPUT=/triage-output 33 | volumes: 34 | - "../example/dirtyduck:/dirtyduck" 35 | - "../dirtyduck-output:/triage-output" 36 | working_dir: /dirtyduck 37 | 38 | volumes: 39 | db-data: 40 | -------------------------------------------------------------------------------- /dirtyduck/food_db/00_create_extensions.sql: -------------------------------------------------------------------------------- 1 | create extension postgis; 2 | 3 | create extension postgis_raster; 4 | create extension postgis_topology; 5 | create extension postgis_sfcgal; 6 | 7 | 8 | 9 | create extension if not exists fuzzystrmatch; 10 | create extension if not exists unaccent; 11 | create extension if not exists pg_trgm; 12 | create extension if not exists bloom; 13 | 14 | create extension if not exists citext; 15 | 16 | create extension if not exists cube; 17 | 18 | create extension if not exists file_fdw; 19 | create extension if not exists postgres_fdw; 20 | 21 | create extension if not exists earthdistance; 22 | -------------------------------------------------------------------------------- /dirtyduck/food_db/01_create_inspections_table.sql: -------------------------------------------------------------------------------- 1 | create schema if not exists raw; 2 | 3 | drop table if exists raw.inspections; 4 | create table if not exists raw.inspections ( 5 | inspection text not null, 6 | DBA_Name text, 7 | AKA_Name text, 8 | license_Num decimal, 9 | facility_type text, 10 | risk text, 11 | address text, 12 | city text, 13 | state text, 14 | zip text, 15 | date date, 16 | type text, 17 | results text, 18 | violations text, 19 | latitude decimal, 20 | longitude decimal, 21 | location text 22 | ); 23 | 24 | copy raw.inspections from program 'bzcat /tmp/inspections_2014_2017.csv.bz2' HEADER CSV QUOTE '"'; 25 | -------------------------------------------------------------------------------- /dirtyduck/food_db/02_create_cleaned_inspections_table.sql: -------------------------------------------------------------------------------- 1 | create schema if not exists cleaned; 2 | 3 | drop table if exists cleaned.inspections cascade; 4 | 5 | create table cleaned.inspections as ( 6 | with cleaned as ( 7 | select 8 | inspection::integer, 9 | btrim(lower(results)) as result, 10 | license_num::integer, 11 | btrim(lower(dba_name)) as facility, 12 | btrim(lower(aka_name)) as facility_aka, 13 | case when 14 | facility_type is null then 'unknown' 15 | else btrim(lower(facility_type)) 16 | end as facility_type, 17 | lower(substring(risk from '\((.+)\)')) as risk, 18 | btrim(lower(address)) as address, 19 | zip as zip_code, 20 | substring( 21 | btrim(lower(regexp_replace(type, 'liquor', 'task force', 'gi'))) 22 | from 'canvass|task force|complaint|food poisoning|consultation|license|tag removal') as type, 23 | date, 24 | -- point(longitude, latitude) as location 25 | ST_SetSRID(ST_MakePoint(longitude, latitude), 4326)::geography as location -- We use geography so the measurements are in meters 26 | from raw.inspections 27 | where zip is not null -- removing NULL zip codes 28 | ) 29 | 30 | select * from cleaned where type is not 
null 31 | ); 32 | -------------------------------------------------------------------------------- /dirtyduck/food_db/03_create_violations_table.sql: -------------------------------------------------------------------------------- 1 | drop table if exists cleaned.violations cascade; 2 | 3 | create table cleaned.violations as ( 4 | select 5 | inspection::integer, 6 | license_num::integer, 7 | date::date, 8 | btrim(tuple[1]) as code, 9 | lower(btrim(tuple[2])) as description, 10 | lower(btrim(tuple[3])) as comment, 11 | (case 12 | when btrim(tuple[1]) = '' then NULL 13 | when btrim(tuple[1])::int between 1 and 14 then 'critical' -- From the documentation 14 | when btrim(tuple[1])::int between 15 and 29 then 'serious' 15 | else 'minor' 16 | end 17 | ) as severity from 18 | ( 19 | select 20 | inspection, 21 | license_num, 22 | date, 23 | regexp_split_to_array( -- Create an array we will split the code, description, comment 24 | regexp_split_to_table( -- Create a row per each comment we split by | 25 | coalesce( -- If there isn't a violation add '- Comments:' 26 | regexp_replace(violations, '[\n\r]+', '', 'g' ) -- Remove line breaks 27 | , '- Comments:') 28 | , '\|') -- Split the violations 29 | , '(?<=\d+)\.\s*|\s*-\s*Comments:') -- Split each violation in three 30 | -- , '\.\s*|\s*-\s*Comments:') -- Split each violation in three (Use this if your PostgreSQL is kind of old) 31 | as tuple 32 | from raw.inspections 33 | where results in ('Fail', 'Pass', 'Pass w/ Conditions') and license_num is not null 34 | ) as t 35 | ); 36 | -------------------------------------------------------------------------------- /dirtyduck/food_db/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM postgres:12 2 | 3 | ## Installing PostGIS 4 | RUN apt-get -y update \ 5 | && apt-get -y install wget \ 6 | && wget --quiet -O - http://apt.postgresql.org/pub/repos/apt/ACCC4CF8.asc | apt-key add - \ 7 | && apt-get -y update \ 8 | && apt-get -y install postgresql-12-postgis-3 postgis postgresql-12-pgrouting bzip2 9 | 10 | ## Chicago Food Inspections 11 | ## From https://data.cityofchicago.org/api/views/4ijn-s7e5/rows.csv?accessType=DOWNLOAD 12 | ## Only the rows between the years 2014 and 2017 (inclusive) are included 13 | COPY inspections_2014_2017.csv.bz2 /tmp 14 | 15 | ## DB setup 16 | ADD 00_create_extensions.sql /docker-entrypoint-initdb.d/ 17 | ADD 01_create_inspections_table.sql /docker-entrypoint-initdb.d/ 18 | ADD 02_create_cleaned_inspections_table.sql /docker-entrypoint-initdb.d/ 19 | ADD 03_create_violations_table.sql /docker-entrypoint-initdb.d/ 20 | ADD 04_create_semantic_tables.sql /docker-entrypoint-initdb.d/ 21 | 22 | RUN chown postgres:postgres /docker-entrypoint-initdb.d/*.sql 23 | -------------------------------------------------------------------------------- /dirtyduck/food_db/inspections_2014_2017.csv.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/dirtyduck/food_db/inspections_2014_2017.csv.bz2 -------------------------------------------------------------------------------- /docs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/__init__.py -------------------------------------------------------------------------------- /docs/sources/api/audition/auditioner.md:
-------------------------------------------------------------------------------- 1 | The `Auditioner` class is the main entry point for the Audition module. Users pass its constructor a database connection, information about the model groups to be evaluated, and a specification for a filter to prune the worst-performing models. 2 | 3 | Other methods allow users to define more complex selection rules, list selected models, or plot results from the selection process. 4 | 5 | ::: triage.component.audition 6 | options: 7 | show_root_toc_entry: false 8 | group_by_category: true 9 | show_category_heading: true 10 | show_if_no_docstring: true 11 | -------------------------------------------------------------------------------- /docs/sources/api/audition/index.md: -------------------------------------------------------------------------------- 1 | ## Audition Reference 2 | 3 | Audition is the Triage model selection module. It simplifies the process of comparing multiple model groups trained across time. 4 | 5 | Find user-focused documentation for Audition [here](../../audition/audition_intro.md). 6 | 7 | |Page|| 8 | |-|-| 9 | |[Auditioner](auditioner.md)|The Auditioner class is the main entry point for Audition.| 10 | |[Selection Rules](selection_rules.md)|The Audition selection rules implement a range of criteria for identifying best-performing models.| 11 | |[Audition Config](audition-config.md)|Users of the Triage CLI can specify settings for Audition in an Audition config file.| 12 | |[Database Dependencies](database-dependencies.md)|The database schema from which Audition reads model training results.| 13 | 14 | -------------------------------------------------------------------------------- /docs/sources/api/audition/selection_rules.md: -------------------------------------------------------------------------------- 1 | ## Selection Rules 2 | 3 | Triage uses *selection rules* to compare the performance of trained model groups over time, and select a model group for future predictions. A selection rule tries to predict the best-performing model group in some train/test period, based on the historical performance of each model group on some metric. 4 | 5 | For example, a simple selection rule might predict that the best-performing model group during one train/test period will perform best in the following period. 6 | 7 | A selection rule can be evaluated by calculating its *regret*, or the difference between the performance of its selected model group and the best-performing model group in some period. 8 | 9 | Triage supports 8 model selection rules. Each is represented internally by one of the following functions: 10 | 11 | ::: triage.component.audition.selection_rules 12 | options: 13 | heading_level: 3 14 | show_root_toc_entry: false 15 | selection: 16 | filters: 17 | - "!^BoundSelectionRule" 18 | - "!^_" 19 | 20 | ## RuleMakers 21 | 22 | Triage uses `RuleMaker` classes to conveniently format the parameter grids accepted by `make_selection_rule_grid`. Each type of `RuleMaker` class holds methods that build parameter grids for a subset of the available selection rules. 23 | 24 | The arguments of each `add_rule_` method map to the arguments of the corresponding model selection function.
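For example, a grid that combines a couple of simple rules with a random baseline might be built like this. (This is a minimal sketch: the `add_rule_` method names and arguments shown are assumptions based on the selection-rule functions above, and the generated reference below is authoritative.)

```python
from triage.component.audition.rules_maker import (
    SimpleRuleMaker,
    RandomGroupRuleMaker,
    create_selection_grid,
)

# Rules that pick the model group that did best on a metric, either in
# the most recent train/test period or on average across periods.
simple = SimpleRuleMaker()
simple.add_rule_best_current_value(metric="precision@", parameter="100_abs")
simple.add_rule_best_average_value(metric="precision@", parameter="100_abs")

# A baseline that picks a model group at random, useful for comparison.
random_baseline = RandomGroupRuleMaker(n=1)

# Format the combined parameter grid expected by make_selection_rule_grid.
grid = create_selection_grid(simple, random_baseline)
```

The resulting grid can then be registered with an `Auditioner` (via its `register_selection_rule_grid` method) to compute each rule's regret over time.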
25 | 26 | 27 | ::: triage.component.audition.rules_maker 28 | options: 29 | show_if_no_docstring: true 30 | show_category_heading: false 31 | show_root_heading: false 32 | show_root_toc_entry: false 33 | heading_level: 3 34 | selection: 35 | members: 36 | - SimpleRuleMaker 37 | - TwoMetricsRuleMaker 38 | - RandomGroupRuleMaker 39 | 40 | ## Selection Grid 41 | 42 | ::: triage.component.audition.selection_rule_grid 43 | options: 44 | heading_level: 3 45 | show_root_toc_entry: false -------------------------------------------------------------------------------- /docs/sources/api/timechop/index.md: -------------------------------------------------------------------------------- 1 | ## Timechop Reference 2 | 3 | Timechop handles temporal logic in the Triage Experiment pipeline. 4 | 5 | 6 | |Page|| 7 | |-|-| 8 | |[Timechop](timechop.md)|The Timechop class is the main entry point for Timechop.| 9 | |[Plotting](plotting.md)|Tools for visualizing Timechop's temporal splits.| -------------------------------------------------------------------------------- /docs/sources/api/timechop/plotting.md: -------------------------------------------------------------------------------- 1 | ::: triage.component.timechop.plotting.visualize_chops 2 | options: 3 | show_root_toc_entry: False 4 | group_by_category: True 5 | show_category_heading: True 6 | show_if_no_docstring: True 7 | -------------------------------------------------------------------------------- /docs/sources/api/timechop/timechop.md: -------------------------------------------------------------------------------- 1 | ::: triage.component.timechop.timechop 2 | options: 3 | show_root_toc_entry: False 4 | group_by_category: True 5 | show_category_heading: True 6 | show_if_no_docstring: True -------------------------------------------------------------------------------- /docs/sources/audition/images/sanjose-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/audition/images/sanjose-2.png -------------------------------------------------------------------------------- /docs/sources/db.md: -------------------------------------------------------------------------------- 1 | # Triage database provisioner 2 | 3 | This document explains the purpose and behavior of the Triage database provisioner, accessed from the Triage CLI. It is optional and only intended for use if you don't have an existing Postgres database to use for Triage. 4 | 5 | The Triage database provisioner is just a single command: 6 | 7 | `triage db up` 8 | 9 | This command attempts to use Docker to spawn a new Postgres 12 database. If successful, it will prompt you for a password for the database user, and populate the connection information in `database.yaml` *in the directory where you ran it*. The next time you run `triage db up`, it will look for the existing container and reuse it. 10 | 11 | At this point, you can use the database either from Triage or anything else that can connect to Postgres (e.g. [psql](https://www.postgresql.org/docs/13/app-psql.html) or [dbeaver](https://dbeaver.io/)), using the credentials in the autogenerated `database.yaml`. 12 | 13 | ## Troubleshooting 14 | 15 | ### No Docker 16 | The command does require some version of Docker. We recommend getting it from the [official Docker downloads page](https://docs.docker.com/get-docker/).
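To confirm that Docker is installed and its daemon is reachable before retrying, you can run:

```sh
docker --version   # prints the client version if Docker is installed
docker info        # errors out if the Docker daemon is not running
```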
17 | 18 | ### Can't log in 19 | Because of the way Docker volumes work, if you manually remove the Docker container created by `triage db up`, the volume will still be around. This is usually fine, but the superuser credential information will persist as well, which means the next time you spawn the database, *the Postgres server will not take the new credential information into account*. Under normal usage (simply calling `triage db up` and never removing the container), you will never run into this situation. But if you do, and you would like to use a new username/password, you will have to remove the volume before recreating the container. This can be done with `docker volume rm triage-db-data`. This will also remove all of the stored data in Postgres, so beware! 20 | -------------------------------------------------------------------------------- /docs/sources/dirtyduck/.swp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/.swp -------------------------------------------------------------------------------- /docs/sources/dirtyduck/AUTHORS: -------------------------------------------------------------------------------- 1 | Adolfo De Unanue 2 | Joseph Walsh 3 | Hans Koening 4 | Arthi Ramachandran 5 | Iván Higuera 6 | Kit Rodolfa 7 | -------------------------------------------------------------------------------- /docs/sources/dirtyduck/choose_your_own_adventure.md: -------------------------------------------------------------------------------- 1 | # How to use this tutorial 2 | 3 | 4 | - You are interested in *learning* how to use `triage` and have a *lot* of time: 5 | - [Problem description](problem_description.md) 6 | - [Infrastructure](infrastructure.md) 7 | - [Data preparation](data_preparation.md) 8 | - [Resource prioritization](inspections.md) 9 | - [Early warning systems](eis.md) 10 | - [A deeper look into triage](triage_intro.md) 11 | - [Scaling up](aws_batch.md) 12 | - You want to know about `triage` 13 | - [A deeper look into triage](triage_intro.md) 14 | - [Model governance](ml_governance.md) 15 | - [Model selection](../audition/audition_intro.md) 16 | - You want to learn about case studies 17 | - [Quick setup](for_the_impatient.md) 18 | - [Resource prioritization](inspections.md) and/or [Early warning systems](eis.md) 19 | - You *already* know `triage` but want to use it on the cloud 20 | - [Scaling up](aws_batch.md) 21 | - You *just* want to use the database for your own purposes 22 | - [Quick setup](for_the_impatient.md) 23 | -------------------------------------------------------------------------------- /docs/sources/dirtyduck/for_the_impatient.md: -------------------------------------------------------------------------------- 1 | # For the impatient 2 | 3 | If you want to skip all the cleaning and transformations and dive directly into `triage`, you can execute the following *inside bastion*: 4 | 5 | ```sh 6 | psql ${DATABASE_URL} -c "\copy raw.inspections from program 'curl \"https://data.cityofchicago.org/api/views/4ijn-s7e5/rows.csv?accessType=DOWNLOAD\"' HEADER CSV" 7 | 8 | psql ${DATABASE_URL} < /sql/create_cleaned_inspections_table.sql 9 | 10 | psql ${DATABASE_URL} < /sql/create_violations_table.sql 11 | 12 | psql ${DATABASE_URL} < /sql/create_semantic_tables.sql 13 | ``` 14 | 15 | If everything works, you should end up with two new schemas: `cleaned` and `semantic`.
16 | 17 | You can check that (from `psql`) with 18 | 19 | ```sql 20 | \dn 21 | ``` 22 | 23 | which should list the schemas: 24 | 25 | | Name | Owner | 26 | |----------|------------| 27 | | cleaned | food\_user | 28 | | postgis | food\_user | 29 | | public | postgres | 30 | | raw | food\_user | 31 | | semantic | food\_user | 32 | -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/AWS_Batch_Architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/AWS_Batch_Architecture.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/EDA/facilities_inspected_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/EDA/facilities_inspected_over_time.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/EDA/facilities_inspections_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/EDA/facilities_inspections_over_time.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/EDA/facilities_with_failed_inspections_severe_violations_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/EDA/facilities_with_failed_inspections_severe_violations_over_time.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/EDA/facilities_with_inspections_failed_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/EDA/facilities_with_inspections_failed_over_time.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/EDA/failed_inspections_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/EDA/failed_inspections_over_time.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/EDA/failed_inspections_severe_violations_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/EDA/failed_inspections_severe_violations_over_time.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/EDA/inspections_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/EDA/inspections_over_time.png
-------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/audition/eis/distance_from_best_precision@10_pct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/audition/eis/distance_from_best_precision@10_pct.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/audition/eis/metric_over_time_precision@10_pct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/audition/eis/metric_over_time_precision@10_pct.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/audition/eis/precision@10_pct_next_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/audition/eis/precision@10_pct_next_time.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/audition/eis/regret_distance_from_best_rules_precision@10_pct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/audition/eis/regret_distance_from_best_rules_precision@10_pct.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/audition/eis/regret_over_time_precision@10_pct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/audition/eis/regret_over_time_precision@10_pct.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/audition/inspections/distance_from_best_precision@10_pct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/audition/inspections/distance_from_best_precision@10_pct.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/audition/inspections/metric_over_time_precision@10_pct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/audition/inspections/metric_over_time_precision@10_pct.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/audition/inspections/precision@10_pct_next_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/audition/inspections/precision@10_pct_next_time.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/audition/inspections/regret_distance_from_best_rules_precision@10_pct.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/audition/inspections/regret_distance_from_best_rules_precision@10_pct.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/audition/inspections/regret_over_time_precision@10_pct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/audition/inspections/regret_over_time_precision@10_pct.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/data_road.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/data_road.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/outcomes-eis.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/outcomes-eis.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/outcomes-inspections.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/outcomes-inspections.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/postmodeling/eis_jaccard_on_lists_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/postmodeling/eis_jaccard_on_lists_over_time.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/postmodeling/eis_mg_prec_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/postmodeling/eis_mg_prec_over_time.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/postmodeling/eis_mg_recall_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/postmodeling/eis_mg_recall_over_time.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/postmodeling/eis_model_group_64_feature_group_importances.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/postmodeling/eis_model_group_64_feature_group_importances.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/postmodeling/eis_model_group_64_feature_importances.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/postmodeling/eis_model_group_64_feature_importances.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/postmodeling/eis_postmodeling_config.yaml: -------------------------------------------------------------------------------- 1 | # Postmodeling Configuration File 2 | 3 | project_path: '/triage' # Project path defined in triage with matrices and models 4 | audition_output_path: '/triage/audition/eis/results_model_group_ids.json' 5 | 6 | thresholds: # Thresholds for defining positive predictions 7 | rank_abs: [50, 100, 250] 8 | rank_pct: [5, 10, 25] 9 | 10 | baseline_query: | # SQL query for defining a baseline for comparison in plots. It needs a metric and parameter 11 | select g.model_group_id, 12 | m.model_id, 13 | extract('year' from m.evaluation_end_time) as as_of_date_year, 14 | m.metric, 15 | m.parameter, 16 | m.stochastic_value, 17 | m.num_labeled_examples, 18 | m.num_labeled_above_threshold, 19 | m.num_positive_labels 20 | from test_results.evaluations m 21 | left join model_metadata.models g 22 | using(model_id) 23 | where g.model_group_id = 20 24 | and metric = 'precision@' 25 | and parameter = '10_pct' 26 | 27 | max_depth_error_tree: 5 # For error trees, how deep should the decision trees go? 28 | n_features_plots: 10 # Number of features for importances 29 | figsize: [12, 12] # Default size for plots 30 | fontsize: 20 # Default fontsize for plots 31 | -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/postmodeling/inspection_jaccard_on_lists_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/postmodeling/inspection_jaccard_on_lists_over_time.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/postmodeling/inspection_mg_prec_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/postmodeling/inspection_mg_prec_over_time.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/postmodeling/inspection_mg_recall_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/postmodeling/inspection_mg_recall_over_time.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/postmodeling/inspection_model_group_39_model_125_feature_group_importances.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/postmodeling/inspection_model_group_39_model_125_feature_group_importances.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/postmodeling/inspection_model_group_39_model_125_feature_importances.png:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/postmodeling/inspection_model_group_39_model_125_feature_importances.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/postmodeling/inspection_model_group_39_model_125_rayid_curve.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/postmodeling/inspection_model_group_39_model_125_rayid_curve.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/postmodeling/inspection_postmodeling_config.yaml: -------------------------------------------------------------------------------- 1 | # Postmodeling Configuration File 2 | 3 | project_path: '/triage' # Project path defined in triage with matrices and models 4 | 5 | model_group_id: 6 | - 39 7 | - 9 8 | - 29 9 | - 30 10 | 11 | thresholds: # Thresholds for defining positive predictions 12 | rank_abs: [50, 100, 250] 13 | rank_pct: [5, 10, 25] 14 | 15 | baseline_query: | # SQL query for defining a baseline for comparison in plots. It needs a metric and parameter 16 | select g.model_group_id, 17 | m.model_id, 18 | extract('year' from m.evaluation_end_time) as as_of_date_year, 19 | m.metric, 20 | m.parameter, 21 | m.stochastic_value, 22 | m.num_labeled_examples, 23 | m.num_labeled_above_threshold, 24 | m.num_positive_labels 25 | from test_results.evaluations as m 26 | left join model_metadata.models as g 27 | using(model_id) 28 | where g.model_group_id = 1 29 | and metric = 'precision@' 30 | and parameter = '15_pct' 31 | 32 | max_depth_error_tree: 5 # For error trees, how deep should the decision trees go?
33 | n_features_plots: 10 # Number of features for importances 34 | figsize: [12, 12] # Default size for plots 35 | fontsize: 20 # Default fontsize for plots 36 | -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/quickstart.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/quickstart.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/rolling-origin.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/rolling-origin.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/sanjose-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/sanjose-2.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/timechop/timechop_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/timechop/timechop_1.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/timechop/timechop_10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/timechop/timechop_10.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/timechop/timechop_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/timechop/timechop_2.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/timechop/timechop_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/timechop/timechop_3.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/timechop/timechop_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/timechop/timechop_4.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/timechop/timechop_5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/timechop/timechop_5.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/timechop/timechop_6.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/timechop/timechop_6.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/timechop/timechop_7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/timechop/timechop_7.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/timechop/timechop_8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/timechop/timechop_8.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/timechop/timechop_9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/timechop/timechop_9.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/triage/distance_from_best_precision@10_pct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/triage/distance_from_best_precision@10_pct.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/triage/eis_01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/triage/eis_01.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/triage/inspections_baseline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/triage/inspections_baseline.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/triage/inspections_dt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/triage/inspections_dt.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/triage/inspections_label_failed_01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/triage/inspections_label_failed_01.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/triage/metric_over_time_precision@10_pct.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/triage/metric_over_time_precision@10_pct.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/triage/precision@10_pct_next_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/triage/precision@10_pct_next_time.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/triage/regret_distance_from_best_rules_precision@10_pct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/triage/regret_distance_from_best_rules_precision@10_pct.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/triage/regret_over_time_precision@10_pct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/triage/regret_over_time_precision@10_pct.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/triage/results_model_group_ids.json: -------------------------------------------------------------------------------- 1 | {"best_current_value_precision@_10_pct": [7, 6, 5], "best_average_value_precision@_10_pct": [6, 7, 4], "lowest_metric_variance_precision@_10_pct": [1, 2, 3], "most_frequent_best_dist_precision@_10_pct_0.05": [6, 4, 5]} -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/triage/simple_test_skeleton.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/triage/simple_test_skeleton.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/who_is_this_tutorial_for.md: -------------------------------------------------------------------------------- 1 | # Who is this tutorial for? 2 | 3 | We created this tutorial with two roles in mind: 4 | 5 | 6 | - A data scientist/ML practitioner who wants to focus 7 | on the problem at hand, not on the nitty-gritty details of 8 | how to configure and set up a machine learning pipeline, model 9 | governance, model selection, etc. 10 | 11 | - A policy maker with some technical background who wants to 12 | learn how to pose their policy problem as a machine learning 13 | problem. 14 | -------------------------------------------------------------------------------- /docs/sources/experiments/feature-testing.md: -------------------------------------------------------------------------------- 1 | # Testing a Feature Aggregation 2 | 3 | Developing features for Triage experiments can be a daunting task. There are a lot of things to configure, a small amount of configuration can result in a ton of SQL, and it can take a long time to validate your feature configuration in the context of an Experiment being run on real data.
4 | 5 | To speed up the process of iterating on features, you can run a list of feature aggregations, without imputation, on just one as-of-date. This functionality can be accessed through the `triage` command line tool or called directly from code (say, in a Jupyter notebook) using the `FeatureGenerator` component. 6 | 7 | ## Using Triage CLI 8 | ![triage featuretest cli help screen](featuretest-cli.png) 9 | 10 | The command-line interface for testing features takes in two arguments: 11 | - An experiment config file. Refer to the [example_experiment_config.yaml](https://github.com/dssg/triage/blob/master/example/config/experiment.yaml)'s `feature_aggregations` section. It consists of a YAML list, with one or more feature_aggregation rows present. 12 | - An as-of-date. This should be in the format `2016-01-01`. 13 | 14 | Example: `triage experiment featuretest example/config/experiment.yaml 2016-01-01` 15 | 16 | All given feature aggregations will be processed for the given date. You will see a bunch of queries pass by in your terminal, populating tables in the `features_test` schema, which you can inspect afterwards. 17 | 18 | ![triage feature test result](featuretest-result.png) 19 | 20 | ## Using Python Code 21 | If you'd like to call this from a notebook or from any other Python code, the arguments look similar but are a bit different. You have to supply your own SQLAlchemy database engine to create a `FeatureGenerator` object, and then call the `create_features_before_imputation` method with your feature config as a list of dictionaries, along with an as-of-date as a string. Make sure your logging level is set to INFO if you want to see all of the queries. 22 | 23 | ```python 24 | from triage.component.architect.feature_generators import FeatureGenerator 25 | from triage.util.db import create_engine 26 | import logging 27 | 28 | logging.basicConfig(level=logging.INFO) 29 | 30 | # create a db_engine 31 | db_url = 'your db url here' 32 | db_engine = create_engine(db_url) 33 | 34 | feature_config = [{ 35 | 'prefix': 'aprefix', 36 | 'aggregates': [ 37 | { 38 | 'quantity': 'quantity_one', 39 | 'metrics': ['sum', 'count'], 40 | }, 41 | ], 42 | 'categoricals': [ 43 | { 44 | 'column': 'cat_one', 45 | 'choices': ['good', 'bad'], 46 | 'metrics': ['sum'] 47 | }, 48 | ], 49 | 'intervals': ['all'], 50 | 'knowledge_date_column': 'knowledge_date', 51 | 'from_obj': 'data' 52 | }] 53 | 54 | FeatureGenerator(db_engine, 'features_test').create_features_before_imputation( 55 | feature_aggregation_config=feature_config, 56 | feature_dates=['2016-01-01'] 57 | ) 58 | ``` 59 | -------------------------------------------------------------------------------- /docs/sources/experiments/features.md: -------------------------------------------------------------------------------- 1 | # Feature Generation Recipe Book 2 | 3 | This document is a collection of 'collate' aggregate features that we have found useful to create in Triage but that may not be apparent at first. 4 | 5 | For an introduction to feature generation in Triage, refer to [Dirty Duck Feature Generation](https://dssg.github.io/dirtyduck/#orgaae2e66). 6 | 7 | ## Age 8 | 9 | You can calculate age from a date of birth column using the `collate_date` special variable. This variable is marked as a placeholder in the feature quantity input, but is replaced with each as-of-date when features are being calculated. Combined with the Postgres `age` function, this calculates a person's age at each as-of-date as a feature.
10 | 11 | For this example, let's assume you have a column called 'dob' that is a timestamp (or anything that can be cast to a date) in your source table. The `feature_aggregation`'s quantity would be: 12 | 13 | ```EXTRACT(YEAR FROM AGE('{collate_date}'::DATE, dob::DATE))``` 14 | 15 | If Triage is calculating this for the as-of-date '2016-01-01', it will internally expand the `collate_date` out to: 16 | ```EXTRACT(YEAR FROM AGE('2016-01-01'::DATE, dob::DATE))``` 17 | 18 | In context, a feature aggregate that uses age may look more like: 19 | 20 | ```yaml 21 | aggregates: 22 | - # age in years 23 | quantity: 24 | age: "EXTRACT(YEAR FROM AGE('{collate_date}'::DATE, dob::DATE))" 25 | metrics: ['max'] 26 | ``` 27 | 28 | 29 | Here, we call the feature 'age', and since everything in collate is defined as an aggregate, we pick 'max'. Any records for the same person and as-of-date should have the same 'dob', so many aggregates would arrive at the same answer (e.g. 'min', 'avg'); in these cases, 'max' is the standard aggregate metric of choice in Triage. 30 | -------------------------------------------------------------------------------- /docs/sources/experiments/featuretest-cli.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/experiments/featuretest-cli.png -------------------------------------------------------------------------------- /docs/sources/experiments/featuretest-result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/experiments/featuretest-result.png -------------------------------------------------------------------------------- /docs/sources/experiments/prediction-ranking.md: -------------------------------------------------------------------------------- 1 | # Prediction Ranking 2 | 3 | The predictions tables in the `train_results` and `test_results` 4 | schemas contain several different flavors of rankings, covering 5 | absolute vs percentile ranking and whether or not ties exist. 6 | 7 | ## Ranking columns 8 | 9 | | Column name | Behavior | 10 | | ----------- | ------- | 11 | | rank_abs_with_ties | Absolute ranking, with ties. Ranks will skip after a set of ties, so if two entities are tied at rank 3, the next entity after them will have rank 5. | 12 | | rank_pct_with_ties | Percentile ranking, with ties. Percentiles will skip after a set of ties, so if two entities out of ten are tied at 0.1 (tenth percentile), the next entity after them will have 0.3 (thirtieth percentile). At most five decimal places. | 13 | | rank_abs_no_ties | Absolute ranking, with no ties. Ties are broken according to a configured choice: 'best', 'worst', or 'random', which is recorded in the `prediction_metadata` table | 14 | | rank_pct_no_ties | Percentile ranking, with no ties. Ties are broken according to a configured choice: 'best', 'worst', or 'random', which is recorded in the `prediction_metadata` table. At most five decimal places. | 15 | 16 | 17 | ## Viewing prediction metadata 18 | 19 | The `prediction_metadata` table contains information about how ties 20 | were broken. There is one row per model/matrix combination. For each 21 | model and matrix, it records: 22 | 23 | - `tiebreaker_ordering` - The tiebreaker ordering rule (e.g. 'random', 24 | 'best', 'worst') used for the corresponding predictions.
25 | - `random_seed` - The random seed, if 'random' was the ordering 26 | used; otherwise None 27 | - `predictions_saved` - Whether or not predictions were saved. If it's 28 | false, you shouldn't expect to find any predictions, but the row is 29 | inserted as a record that the prediction was performed. 30 | 31 | There is one `prediction_metadata` table in each of the 32 | `train_results`, `test_results` schemas (in other words, wherever 33 | there is a companion `predictions` table). 34 | 35 | 36 | 37 | ## Subsequent runs 38 | 39 | If you run Triage Experiments with `replace=False`, and you change 40 | nothing except for the `rank_tiebreaker` in experiment config, ranking 41 | will be redone and the row in `prediction_metadata` updated. You don't 42 | have to run a full experiment if that's all you want to do; you could 43 | follow the directions for backfilling ranks above, which will redo the 44 | ranking for an individual model/matrix pair. However, changing the 45 | `rank_tiebreaker` in experiment config and re-running the experiment 46 | is a handy way of redoing all of them at once. 47 | -------------------------------------------------------------------------------- /docs/sources/experiments/temporal_config_graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/experiments/temporal_config_graph.png -------------------------------------------------------------------------------- /docs/sources/experiments/timechops.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/experiments/timechops.png -------------------------------------------------------------------------------- /docs/sources/experiments/upgrade-to-v6.md: -------------------------------------------------------------------------------- 1 | # Upgrading your experiment configuration to v6 2 | 3 | 4 | This document details the steps needed to update a triage v5 configuration to 5 | v6, mimicking the old behavior. 6 | 7 | Experiment configuration v6 includes only one change from v5: When specifying 8 | the `cohort_config`, if a `query` is given, the `{as_of_date}` is no longer 9 | quoted or cast by Triage. Instead, the user must perform the quoting and 10 | casting, as is done already for the `label_config`.
11 | 12 | Old: 13 | 14 | ``` 15 | cohort_config: 16 | query: | 17 | SELECT DISTINCT entity_id 18 | FROM semantic.events 19 | WHERE event = 'booking' 20 | AND startdt <@ daterange(({as_of_date} - '3 years'::interval)::date, {as_of_date}) 21 | AND enddt < {as_of_date} 22 | LIMIT 100 23 | name: 'booking_last_3_years_limit_100' 24 | ``` 25 | 26 | New: 27 | 28 | ``` 29 | cohort_config: 30 | query: | 31 | SELECT DISTINCT entity_id 32 | FROM semantic.events 33 | WHERE event = 'booking' 34 | AND startdt <@ daterange(('{as_of_date}'::date - '3 years'::interval)::date, '{as_of_date}'::date) 35 | AND enddt < '{as_of_date}' 36 | LIMIT 100 37 | name: 'booking_last_3_years_limit_100' 38 | ``` 39 | 40 | ## Upgrading the experiment config version 41 | 42 | At this point, you should be able to bump the top-level experiment config version to v6: 43 | 44 | Old: 45 | 46 | ``` 47 | config_version: 'v5' 48 | ``` 49 | 50 | New: 51 | 52 | ``` 53 | config_version: 'v6' 54 | ``` 55 | 56 | -------------------------------------------------------------------------------- /docs/sources/experiments/upgrade-to-v7.md: -------------------------------------------------------------------------------- 1 | # Upgrading your experiment configuration to v7 2 | 3 | 4 | This document details the steps needed to update a triage v6 configuration to 5 | v7, mimicking the old behavior. 6 | 7 | Experiment configuration v7 includes only one change from v6: the addition of a mandatory `random_seed`, which is set at the beginning of the experiment and affects all subsequent random numbers. It is expected to be an integer. 8 | 9 | Old: 10 | ```yaml 11 | 12 | config_version: 'v6' 13 | 14 | # EXPERIMENT METADATA 15 | ``` 16 | 17 | New: 18 | ```yaml 19 | 20 | config_version: 'v7' 21 | 22 | # EXPERIMENT METADATA 23 | # random_seed will be set in Python at the beginning of the experiment and 24 | # affect the generation of all model seeds 25 | random_seed: 23895478 26 | ``` 27 | -------------------------------------------------------------------------------- /docs/sources/experiments/upgrade-to-v8.md: -------------------------------------------------------------------------------- 1 | # Upgrading your experiment configuration to v8 2 | 3 | 4 | This document details the steps needed to update a triage v7 configuration to 5 | v8, mimicking the old behavior. 6 | 7 | Experiment configuration v8 includes only one change from v7: the `groups` key is no longer supported in the feature configuration (all features must be grouped only at the `entity_id` level).
8 | 9 | Old: 10 | ```yaml 11 | 12 | config_version: 'v7' 13 | 14 | # FEATURE GENERATION 15 | feature_aggregations: 16 | - 17 | prefix: 'inspections' 18 | from_obj: 'semantic.events' 19 | knowledge_date_column: 'date' 20 | 21 | aggregates_imputation: 22 | count: 23 | type: 'zero_noflag' 24 | 25 | aggregates: 26 | - 27 | quantity: 28 | total: "*" 29 | metrics: 30 | - 'count' 31 | 32 | intervals: ['all'] 33 | 34 | groups: 35 | - 'entity_id' 36 | ``` 37 | 38 | New: 39 | ```yaml 40 | 41 | config_version: 'v8' 42 | 43 | # FEATURE GENERATION 44 | feature_aggregations: 45 | - 46 | prefix: 'inspections' 47 | from_obj: 'semantic.events' 48 | knowledge_date_column: 'date' 49 | 50 | aggregates_imputation: 51 | count: 52 | type: 'zero_noflag' 53 | 54 | aggregates: 55 | - 56 | quantity: 57 | total: "*" 58 | metrics: 59 | - 'count' 60 | 61 | intervals: ['all'] 62 | ``` 63 | -------------------------------------------------------------------------------- /docs/sources/index.md: -------------------------------------------------------------------------------- 1 | # Triage 2 | 3 | [![Build Status](https://travis-ci.org/dssg/triage.svg?branch=master)](https://travis-ci.org/dssg/triage) 4 | [![codecov](https://codecov.io/gh/dssg/triage/branch/master/graph/badge.svg)](https://codecov.io/gh/dssg/triage) 5 | [![codeclimate](https://codeclimate.com/github/dssg/triage.png)](https://codeclimate.com/github/dssg/triage) 6 | 7 | 8 | ## What is Triage? 9 | 10 | Triage is an open source machine learning toolkit to help data scientists, machine learning developers, and analysts quickly prototype, build and evaluate end-to-end predictive risk modeling systems for public policy and social good problems. 11 | 12 | While many tools (sklearn, keras, pytorch, etc.) exist to build ML models, an end-to-end project requires a lot more than just building models. Developing AI/ML/data science systems requires making many design decisions that need to match how the system is going to be deployed and used. These choices then get turned into modeling choices and code. Triage lets you focus on the problem you’re solving and guides you through design choices you need to make at each step of the machine learning pipeline. 13 | 14 | ## How to get started with Triage? 15 | 16 | ### [Go through a quick online tutorial with sample data (no setup required)](https://colab.research.google.com/github/dssg/triage/blob/master/example/colab/colab_triage.ipynb) 17 | 18 | ### [Go through a more in-depth tutorial with sample data](dirtyduck/index.md) 19 | 20 | ### [Get started with your own project and data](quickstart.md) 21 | 22 | 23 | ## Background 24 | 25 | Triage was initially developed at the University of Chicago's [Center For Data Science and Public Policy](http://dsapp.uchicago.edu) and is now being maintained and extended at Carnegie Mellon University. 26 | 27 | -------------------------------------------------------------------------------- /docs/sources/postmodeling/postmodeling-config.md: -------------------------------------------------------------------------------- 1 | ## Postmodeling Configuration 2 | 3 | The Triage Postmodeling module is controlled by two config files: `postmodeling_config.yaml` and `postmodeling_crosstabs.yaml`. 4 | 5 | ### Postmodeling Configuration File 6 | Configuration for the Triage Postmodeling module. An example `postmodeling_config.yaml` file can be found [here](https://github.com/dssg/triage/blob/master/example/config/postmodeling_config.yaml). 
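7 | 8 | A minimal sketch of the file's overall shape (the values here are illustrative, not recommended defaults, and `baseline_query` is omitted; see the linked example file for a complete version; each key is described below): 9 | 10 | ```yaml 11 | project_path: '/path/to/triage/output' 12 | audition_output_path: 'results_model_group_ids.json' 13 | model_group_id: [19, 43] # optional if an audition_output_path is given 14 | thresholds: 15 | rank_abs: [50] 16 | rank_pct: [10] 17 | max_depth_error_tree: 5 18 | n_features_plots: 10 19 | figsize: [12, 12] 20 | fontsize: 20 21 | ```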
22 | 23 | - `project_path`: Project path defined in triage with matrices and models 24 | - `audition_output_path`: Audition output path 25 | - `model_group_id`: List of model_group_ids [optional if an `audition_output_path` is given] 26 | - `thresholds`: Thresholds for defining positive predictions 27 | - `baseline_query`: SQL query for defining a baseline for comparison in plots. It needs a metric and parameter 28 | - `max_depth_error_tree`: For error trees, how deep should the decision trees go? 29 | - `n_features_plots`: Number of features for importances 30 | - `figsize`: Default size for plots 31 | - `fontsize`: Default fontsize for plots 32 | 33 | 34 | ### Postmodeling Crosstabs Configuration File 35 | Configuration for crosstabs in Triage's Postmodeling module. An example `postmodeling_crosstabs.yaml` file can be found [here](https://github.com/dssg/triage/blob/master/example/config/postmodeling_crosstabs.yaml). 36 | 37 | - `output`: Define the schema and table for crosstabs 38 | - `thresholds`: Thresholds for defining positive predictions 39 | - `entity_id_list`: (optional) a list of `entity_ids` to subset the crosstabs analysis on 40 | - `models_list_query`: SQL query for getting `model_id`s 41 | - `as_of_dates_query`: SQL query for getting `as_of_date`s 42 | - `models_dates_join_query`: don't change the default query unless strictly necessary. It is just validating pairs of (`model_id`, `as_of_date`) in a predictions table 43 | - `features_query`: the features query must join `models_dates_join_query` with one or more features tables using `as_of_date` 44 | - `predictions_query`: the predictions query must return `model_id`, `as_of_date`, `entity_id`, `score`, `label_value`, `rank_abs` and `rank_pct`. It must join `models_dates_join_query` using both `model_id` and `as_of_date`. 45 | 46 | -------------------------------------------------------------------------------- /docs/sources/postmodeling/postmodeling_general_flow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/postmodeling/postmodeling_general_flow.png -------------------------------------------------------------------------------- /docs/sources/triage_docs.css: -------------------------------------------------------------------------------- 1 | /* Indents and adds a border to lower headings. */ 2 | div.doc-contents:not(.first) { 3 | padding-left: 25px; 4 | border-left: 4px solid rgb(230, 230, 230); 5 | margin-bottom: 80px; 6 | } 7 | 8 | /* Don't capitalize names. */ 9 | h5.doc-heading { 10 | text-transform: none !important; 11 | } 12 | 13 | /* Don't use vertical space on hidden ToC entries. */ 14 | h6.hidden-toc { 15 | margin: 0 !important; 16 | position: relative; 17 | top: -70px; 18 | } 19 | h6.hidden-toc::before { 20 | margin-top: 0 !important; 21 | padding-top: 0 !important; 22 | } 23 | 24 | /* Don't show permalink of hidden ToC entries. 25 | h6.hidden-toc a.headerlink { 26 | display: none; 27 | } */ 28 | 29 | /* Avoid breaking parameter names, etc. in table cells. */ 30 | td code { 31 | word-break: normal !important; 32 | } 33 | 34 | /* For pieces of Markdown rendered in table cells.
*/ 35 | td p { 36 | margin-top: 0 !important; 37 | margin-bottom: 0 !important; 38 | } 39 | -------------------------------------------------------------------------------- /docs/update_docs.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | 3 | from md_autogen import MarkdownAPIGenerator 4 | from md_autogen import to_md_file 5 | 6 | from triage import experiments 7 | 8 | 9 | def generate_api_docs(): 10 | modules = [ 11 | experiments.base, 12 | experiments.singlethreaded, 13 | experiments.multicore 14 | ] 15 | 16 | md_gen = MarkdownAPIGenerator("triage", "https://github.com/dssg/triage/tree/master") 17 | for m in modules: 18 | md_string = md_gen.module2md(m) 19 | to_md_file(md_string, m.__name__, "docs/sources") 20 | 21 | 22 | def update_index_md(): 23 | shutil.copyfile('README.md', 'docs/sources/index.md') 24 | 25 | 26 | def copy_templates(): 27 | shutil.rmtree('docs/sources', ignore_errors=True) 28 | shutil.copytree('docs/templates', 'docs/sources') 29 | 30 | 31 | if __name__ == "__main__": 32 | copy_templates() 33 | update_index_md() 34 | generate_api_docs() 35 | -------------------------------------------------------------------------------- /example/aws_batch/aws_env.example: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | PROJECT_NAME=dirtyduck 4 | TRIAGE_VERSION=3.3.0 5 | ENV=development 6 | AWS_REGISTRY={your-ecr-registry} 7 | AWS_JOB_QUEUE={your-job-queue} 8 | POSTGRES_DB={postgresql://user:password@db_server/dbname} 9 | S3_BUCKET={your-bucket} 10 | -------------------------------------------------------------------------------- /example/aws_batch/credentials.filter.example: -------------------------------------------------------------------------------- 1 | { 2 | "environment": [ 3 | { 4 | "name": "AWS_ACCESS_KEY_ID", 5 | "value": .Credentials.AccessKeyId 6 | }, 7 | { 8 | "name": "AWS_SECRET_ACCESS_KEY", 9 | "value": .Credentials.SecretAccessKey 10 | }, 11 | { 12 | "name": "AWS_SESSION_TOKEN", 13 | "value": .Credentials.SessionToken 14 | } 15 | ] 16 | } 17 | -------------------------------------------------------------------------------- /example/aws_batch/triage-job-definition.json.example: -------------------------------------------------------------------------------- 1 | { 2 | "containerProperties": { 3 | "command": [ 4 | "--tb", 5 | "Ref::experiment_file", 6 | "--project-path", 7 | "Ref::output_path", 8 | "Ref::replace", 9 | "Ref::save_predictions", 10 | "Ref::profile", 11 | "Ref::validate" 12 | ], 13 | "image": "AWS_ACCOUNT.dkr.ecr.us-west-2.amazonaws.com/YOUR_TRIAGE_IMAGE", 14 | "jobRoleArn": "arn:aws:iam::AWS_ACCOUNT:role/dsappBatchJobRole", 15 | "memory": 16000, 16 | "vcpus": 1 17 | }, 18 | "jobDefinitionName": "triage-cli-experiment", 19 | "retryStrategy": { 20 | "attempts": 1 21 | }, 22 | "type": "container" 23 | } 24 | -------------------------------------------------------------------------------- /example/aws_batch/triage-overrides.json.example: -------------------------------------------------------------------------------- 1 | { 2 | "environment": [ 3 | { 4 | "name":"AWS_DEFAULT_REGION", 5 | "value":"us-west-2" 6 | }, 7 | { 8 | "name":"AWS_JOB_QUEUE", 9 | "value":"" 10 | }, 11 | { 12 | "name":"POSTGRES_PASSWORD", 13 | "value":"" 14 | }, 15 | { 16 | "name":"POSTGRES_USER", 17 | "value":"" 18 | }, 19 | { 20 | "name":"POSTGRES_DB", 21 | "value":"" 22 | }, 23 | { 24 | "name":"POSTGRES_PORT", 25 | "value":"" 26 | }, 27 | {
"name":"POSTGRES_HOST", 29 | "value":"" 30 | } 31 | ] 32 | } 33 | -------------------------------------------------------------------------------- /example/cohort/past_events.sql: -------------------------------------------------------------------------------- 1 | select entity_id 2 | from events 3 | where outcome_date < '{as_of_date}' 4 | -------------------------------------------------------------------------------- /example/config/README.md: -------------------------------------------------------------------------------- 1 | ### Triage Example Config Files 2 | 3 | This folder contains examples of the config files that control Triage. These config files exist to demonstrate the format and syntax of Triage's config files, and provide templates for implementing new projects in triage. 4 | 5 | #### audition.yaml 6 | 7 | An example of the config file that controls Audition, the Triage model selection module. Find additional documentation for the Audition config file [here](https://dssg.github.io/triage/dirtyduck/audition/audition-config/). 8 | 9 | #### database.yaml 10 | 11 | Triage requires a database connection for source data and [model governance](https://dssg.github.io/triage/dirtyduck/ml_governance/). Use a file of this format to specify your connection. 12 | 13 | #### dirty-duckling.yaml 14 | 15 | A Triage experiment config file used in [Dirty Duckling](https://dssg.github.io/triage/dirtyduck/), the Triage tutorial. 16 | 17 | #### experiment.yaml 18 | 19 | An example of an experiment config file. Experiment configs control behavior of the Triage experiment pipeline, which handles feature and label generation, model training, and model evaluation. Find more documentation for the Triage experiment config file [here](https://dssg.github.io/triage/experiments/experiment-config/). 20 | 21 | #### postmodeling_config.yaml & postmodeling_crosstabs.yaml 22 | 23 | Controls the Triage Postmodeling module. Postmodeling is currently under development. It provides a set of tools for evaluating and investigating trained models. More documentation is available [here](https://dssg.github.io/triage/postmodeling/postmodeling-config). -------------------------------------------------------------------------------- /example/config/audition.yaml: -------------------------------------------------------------------------------- 1 | # CHOOSE MODEL GROUPS 2 | # Audition needs a bunch of model_group_ids to help you select the models. 3 | # The query is to choose what the model groups you want to include in the first round. 4 | model_groups: 5 | query: | 6 | SELECT DISTINCT(model_group_id) 7 | FROM triage_metadata.model_groups 8 | 9 | # CHOOSE TIMESTAMPS/TRAIN END TIMES 10 | # The timestamps when audition happens for each model group. 11 | # There's a hard rule in Audition that all of the chosen model groups for audition should 12 | # have the same train end times as the timestamps or the subset of the timestamps from this 13 | # query, otherwise those model groups with unmatched train end times will be pruned in the 14 | # first round. 
18 | time_stamps: 19 | query: | 20 | SELECT DISTINCT train_end_time 21 | FROM triage_metadata.models 22 | WHERE model_group_id IN ({}) 23 | AND EXTRACT(DAY FROM train_end_time) IN (1) 24 | AND train_end_time >= '2012-01-01' 25 | 26 | # FILTER 27 | # Configuration for the Auditioner 28 | filter: 29 | metric: 'precision@' # metric of interest 30 | parameter: '50_abs' # parameter of interest 31 | max_from_best: 1.0 # The maximum value that the given metric can be worse than the best model for a given train end time. 32 | threshold_value: 0.0 # The worst absolute value that the given metric should be. 33 | distance_table: 'distance_table' # name of the distance table 34 | models_table: 'models' # name of the models table 35 | agg_type: 'worst' # Optional: how to aggregate multiple metric values if multiple models exist for a model group/train end time. 36 | 37 | # RULES 38 | # The selection rules for Audition to simulate the model selection process for each timestamp. 39 | # More rules can be found in the README. 40 | # The metric and parameter in shared_parameters should match those in the filter section. 41 | rules: 42 | - 43 | shared_parameters: 44 | - 45 | metric: 'precision@' 46 | parameter: '50_abs' 47 | selection_rules: 48 | - 49 | name: 'best_current_value' # Pick the model group with the best current metric value 50 | n: 3 51 | - 52 | name: 'best_average_value' # Pick the model with the highest average metric value 53 | n: 3 54 | - 55 | name: 'lowest_metric_variance' # Pick the model with the lowest metric variance 56 | n: 3 57 | - 58 | name: 'most_frequent_best_dist' # Pick the model that is most frequently within `dist_from_best_case` 59 | dist_from_best_case: [0.05] 60 | n: 3 61 | 62 | -------------------------------------------------------------------------------- /example/config/database.yaml: -------------------------------------------------------------------------------- 1 | # Connecting to the database requires a configuration file like this one 2 | 3 | # address.of.database.server 4 | host: 0.0.0.0 5 | user: food_user 6 | db: food 7 | # user password 8 | pass: some_password 9 | # connection port 10 | port: 5434 11 | -------------------------------------------------------------------------------- /example/config/dirty-duckling.yaml: -------------------------------------------------------------------------------- 1 | config_version: 'v8' 2 | 3 | model_comment: 'dirtyduck-quickstart' 4 | 5 | random_seed: 1234 6 | 7 | temporal_config: 8 | label_timespans: ['3months'] 9 | 10 | label_config: 11 | query: | 12 | select 13 | entity_id, 14 | bool_or(result = 'fail')::integer as outcome 15 | from semantic.events 16 | where '{as_of_date}'::timestamp <= date 17 | and date < '{as_of_date}'::timestamp + interval '{label_timespan}' 18 | group by entity_id 19 | name: 'failed_inspections' 20 | 21 | feature_aggregations: 22 | - 23 | prefix: 'inspections' 24 | from_obj: 'semantic.events' 25 | knowledge_date_column: 'date' 26 | 27 | aggregates_imputation: 28 | count: 29 | type: 'zero_noflag' 30 | 31 | aggregates: 32 | - 33 | quantity: 34 | total: "*" 35 | metrics: 36 | - 'count' 37 | 38 | intervals: ['all'] 39 | 40 | model_grid_preset: 'quickstart' 41 | 42 | scoring: 43 | testing_metric_groups: 44 | - 45 | metrics: [precision@] 46 | thresholds: 47 | percentiles: [1] 48 | 49 | 50 | training_metric_groups: 51 | - 52 | metrics: [precision@] 53 | thresholds: 54 | percentiles: [1] 55 | --------------------------------------------------------------------------------
/example/config/postmodeling_config.yaml: -------------------------------------------------------------------------------- 1 | # Postmodeling Configuration File 2 | 3 | project_path: 'triage_output/output/' # Project path defined in triage with matrices and models 4 | audition_output_path: 'results_model_group_ids.json' # Audition output path 5 | model_group_id: # List of model_group_ids [optional if an audition_output_path is given] 6 | - 19 7 | - 43 8 | - 55 9 | 10 | thresholds: # Thresholds for defining positive predictions 11 | rank_abs: [10, 20] 12 | rank_pct: [10, 25, 50] 13 | 14 | baseline_query: | # SQL query for defining a baseline for comparison in plots. It needs a metric and parameter 15 | SELECT g.model_group_id, 16 | m.model_id, 17 | EXTRACT('YEAR' FROM m.evaluation_end_time) AS as_of_date_year, 18 | m.metric, 19 | m.parameter, 20 | m.value, 21 | m.num_labeled_examples, 22 | m.num_labeled_above_threshold, 23 | m.num_positive_labels 24 | FROM test_results.evaluations m 25 | LEFT JOIN triage_metadata.models g 26 | USING(model_id) 27 | WHERE g.model_group_id IN (1, 2, 3) 28 | AND metric = 'precision@' 29 | AND parameter = '10.0_pct' 30 | 31 | max_depth_error_tree: 5 # For error trees, how deep should the decision trees go? 32 | n_features_plots: 10 # Number of features for importances 33 | figsize: [12, 12] # Default size for plots 34 | fontsize: 20 # Default fontsize for plots 35 | -------------------------------------------------------------------------------- /example/config/postmodeling_crosstabs.yaml: -------------------------------------------------------------------------------- 1 | output: 2 | schema: 'test_results' 3 | table: 'crosstabs' 4 | 5 | thresholds: 6 | rank_abs: [50] 7 | rank_pct: [] 8 | 9 | #(optional): a list of entity_ids to subset on the crosstabs analysis 10 | entity_id_list: [] 11 | 12 | models_list_query: "select unnest(ARRAY[44, 86]) :: int as model_id" 13 | 14 | as_of_dates_query: "select unnest(ARRAY['2016-01-13','2017-01-13']) :: date as as_of_date" 15 | 16 | #don't change this query unless strictly necessary.
It is just validating pairs of (model_id,as_of_date) 17 | #it is just a join with distinct (model_id, as_of_date) in a predictions table 18 | models_dates_join_query: " 19 | select model_id, 20 | as_of_date 21 | from models_list_query m 22 | cross join as_of_dates_query a join (select distinct model_id, as_of_date from test_results.predictions) p 23 | using (model_id, as_of_date)" 24 | 25 | #features_query must join models_dates_join_query with one or more features tables using as_of_date 26 | features_query: " 27 | select m.model_id, f1.* 28 | from features.inspections_aggregation_imputed f1 join 29 | models_dates_join_query m using (as_of_date)" 30 | 31 | #the predictions query must return model_id, as_of_date, entity_id, score, label_value, rank_abs and rank_pct 32 | #it must join models_dates_join_query using both model_id and as_of_date 33 | predictions_query: " 34 | select model_id, 35 | as_of_date, 36 | entity_id, 37 | score, 38 | label_value, 39 | coalesce(rank_abs_no_ties, row_number() over (partition by (model_id, as_of_date) order by score desc)) as rank_abs, 40 | coalesce(rank_pct_no_ties*100, ntile(100) over (partition by (model_id, as_of_date) order by score desc)) as rank_pct 41 | from test_results.predictions 42 | JOIN models_dates_join_query USING(model_id, as_of_date) 43 | where model_id IN (select model_id from models_list_query) 44 | AND as_of_date in (select as_of_date from as_of_dates_query)" 45 | -------------------------------------------------------------------------------- /example/dirtyduck/audition/eis_audition_config.yaml: -------------------------------------------------------------------------------- 1 | # CHOOSE MODEL GROUPS 2 | model_groups: 3 | query: | 4 | select distinct(model_group_id) 5 | from triage_metadata.model_groups 6 | where model_config ->> 'experiment_type' ~ 'eis' 7 | # CHOOSE TIMESTAMPS/TRAIN END TIMES 8 | time_stamps: 9 | query: | 10 | select distinct train_end_time 11 | from triage_metadata.models 12 | where model_group_id in ({}) 13 | and extract(day from train_end_time) in (1) 14 | and train_end_time >= '2014-01-01' 15 | # FILTER 16 | filter: 17 | metric: 'precision@' # metric of interest 18 | parameter: '10_pct' # parameter of interest 19 | max_from_best: 1.0 # The maximum value that the given metric can be worse than the best model for a given train end time. 20 | threshold_value: 0.0 # The worst absolute value that the given metric should be.
21 | distance_table: 'eis_distance_table' # name of the distance table 22 | models_table: 'models' # name of the models table 23 | 24 | # RULES 25 | rules: 26 | - 27 | shared_parameters: 28 | - 29 | metric: 'precision@' 30 | parameter: '10_pct' 31 | 32 | selection_rules: 33 | - 34 | name: 'best_current_value' # Pick the model group with the best current metric value 35 | n: 5 36 | - 37 | name: 'best_average_value' # Pick the model with the highest average metric value 38 | n: 5 39 | - 40 | name: 'lowest_metric_variance' # Pick the model with the lowest metric variance 41 | n: 5 42 | - 43 | name: 'most_frequent_best_dist' # Pick the model that is most frequently within `dist_from_best_case` 44 | dist_from_best_case: [0.05] 45 | n: 5 46 | -------------------------------------------------------------------------------- /example/dirtyduck/audition/inspection_audition_config.yaml: -------------------------------------------------------------------------------- 1 | # CHOOSE MODEL GROUPS 2 | model_groups: 3 | query: | 4 | select distinct(model_group_id) 5 | from triage_metadata.model_groups 6 | where model_config ->> 'experiment_type' ~ 'inspection' 7 | # CHOOSE TIMESTAMPS/TRAIN END TIMES 8 | time_stamps: 9 | query: | 10 | select distinct train_end_time 11 | from triage_metadata.models 12 | where model_group_id in ({}) 13 | and extract(day from train_end_time) in (1) 14 | and train_end_time >= '2014-01-01' 15 | # FILTER 16 | filter: 17 | metric: 'precision@' # metric of interest 18 | parameter: '10_pct' # parameter of interest 19 | max_from_best: 1.0 # The maximum value that the given metric can be worse than the best model for a given train end time. 20 | threshold_value: 0.0 # The worst absolute value that the given metric should be. 21 | distance_table: 'inspections_distance_table' # name of the distance table 22 | models_table: 'models' # name of the models table 23 | 24 | # RULES 25 | rules: 26 | - 27 | shared_parameters: 28 | - 29 | metric: 'precision@' 30 | parameter: '10_pct' 31 | 32 | selection_rules: 33 | - 34 | name: 'best_current_value' # Pick the model group with the best current metric value 35 | n: 3 36 | - 37 | name: 'best_average_value' # Pick the model with the highest average metric value 38 | n: 3 39 | - 40 | name: 'lowest_metric_variance' # Pick the model with the lowest metric variance 41 | n: 3 42 | - 43 | name: 'most_frequent_best_dist' # Pick the model that is most frequently within `dist_from_best_case` 44 | dist_from_best_case: [0.05] 45 | n: 3 46 | -------------------------------------------------------------------------------- /example/dirtyduck/audition/inspections/distance_from_best_precision@10_pct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/example/dirtyduck/audition/inspections/distance_from_best_precision@10_pct.png -------------------------------------------------------------------------------- /example/dirtyduck/audition/inspections/distance_from_best_precision@15_pct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/example/dirtyduck/audition/inspections/distance_from_best_precision@15_pct.png -------------------------------------------------------------------------------- /example/dirtyduck/audition/inspections/metric_over_time_precision@10_pct.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/example/dirtyduck/audition/inspections/metric_over_time_precision@10_pct.png -------------------------------------------------------------------------------- /example/dirtyduck/audition/inspections/metric_over_time_precision@15_pct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/example/dirtyduck/audition/inspections/metric_over_time_precision@15_pct.png -------------------------------------------------------------------------------- /example/dirtyduck/audition/inspections/precision@10_pct_next_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/example/dirtyduck/audition/inspections/precision@10_pct_next_time.png -------------------------------------------------------------------------------- /example/dirtyduck/audition/inspections/precision@15_pct_next_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/example/dirtyduck/audition/inspections/precision@15_pct_next_time.png -------------------------------------------------------------------------------- /example/dirtyduck/audition/inspections/regret_distance_from_best_rules_precision@10_pct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/example/dirtyduck/audition/inspections/regret_distance_from_best_rules_precision@10_pct.png -------------------------------------------------------------------------------- /example/dirtyduck/audition/inspections/regret_distance_from_best_rules_precision@15_pct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/example/dirtyduck/audition/inspections/regret_distance_from_best_rules_precision@15_pct.png -------------------------------------------------------------------------------- /example/dirtyduck/audition/inspections/regret_over_time_precision@10_pct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/example/dirtyduck/audition/inspections/regret_over_time_precision@10_pct.png -------------------------------------------------------------------------------- /example/dirtyduck/audition/inspections/regret_over_time_precision@15_pct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/example/dirtyduck/audition/inspections/regret_over_time_precision@15_pct.png -------------------------------------------------------------------------------- /example/dirtyduck/audition/inspections/results_model_group_ids.json: -------------------------------------------------------------------------------- 1 | {"best_current_value_precision@_10_pct": [39, 30, 9], "best_average_value_precision@_10_pct": [39, 9, 29], "lowest_metric_variance_precision@_10_pct": [1, 5, 19], 
"most_frequent_best_dist_precision@_10_pct_0.05": [8, 9, 10]} -------------------------------------------------------------------------------- /example/dirtyduck/crosstabs/eis_crosstabs_config.yaml: -------------------------------------------------------------------------------- 1 | output: 2 | schema: 'test_results' 3 | table: 'eis_crosstabs' 4 | 5 | thresholds: 6 | rank_abs: [50] 7 | rank_pct: [5] 8 | 9 | #(optional): a list of entity_ids to subset on the crosstabs analysis 10 | entity_id_list: [] 11 | 12 | models_list_query: "select unnest(ARRAY[226]) :: int as model_id" 13 | 14 | as_of_dates_query: "select generate_series('2017-12-01'::date, '2018-09-01'::date, interval '1month') as as_of_date" 15 | 16 | #don't change this query unless strictly necessary. It is just validating pairs of (model_id,as_of_date) 17 | #it is just a join with distinct (model_id, as_of_date) in a predictions table 18 | models_dates_join_query: | 19 | select model_id, 20 | as_of_date 21 | from models_list_query as m 22 | cross join as_of_dates_query a join (select distinct model_id, as_of_date from test_results.predictions) as p 23 | using (model_id, as_of_date) 24 | 25 | #features_query must join models_dates_join_query with 1 or more features table using as_of_date 26 | features_query: | 27 | select m.model_id, m.as_of_date, f4.entity_id, f4.results_entity_id_1month_result_fail_avg, f4.results_entity_id_3month_result_fail_avg, f4.results_entity_id_6month_result_fail_avg, 28 | f2.inspection_types_entity_id_1month_type_canvass_sum, f3.risks_entity_id_1month_risk_high_sum, f4.results_entity_id_6month_result_pass_avg, 29 | f3.risks_entity_id_all_risk_high_sum, f2.inspection_types_entity_id_3month_type_canvass_sum, f4.results_entity_id_6month_result_pass_sum, 30 | f2.inspection_types_entity_id_all_type_canvass_sum 31 | from features.inspection_types_aggregation_imputed as f2 32 | inner join features.risks_aggregation_imputed as f3 using (entity_id, as_of_date) 33 | inner join features.results_aggregation_imputed as f4 using (entity_id, as_of_date) 34 | inner join models_dates_join_query as m using (as_of_date) 35 | 36 | #the predictions query must return model_id, as_of_date, entity_id, score, label_value, rank_abs and rank_pct 37 | #it must join models_dates_join_query using both model_id and as_of_date 38 | predictions_query: | 39 | select model_id, 40 | as_of_date, 41 | entity_id, 42 | score, 43 | label_value, 44 | coalesce(rank_abs_no_ties, row_number() over (partition by (model_id, as_of_date) order by score desc)) as rank_abs, 45 | coalesce(rank_pct_no_ties*100, ntile(100) over (partition by (model_id, as_of_date) order by score desc)) as rank_pct 46 | from test_results.predictions 47 | join models_dates_join_query using(model_id, as_of_date) 48 | where model_id in (select model_id from models_list_query) 49 | and as_of_date in (select as_of_date from as_of_dates_query) 50 | -------------------------------------------------------------------------------- /example/dirtyduck/experiments/dirty-duckling.yaml: -------------------------------------------------------------------------------- 1 | config_version: 'v8' 2 | 3 | model_comment: 'dirtyduck-quickstart' 4 | 5 | random_seed: 1234 6 | 7 | temporal_config: 8 | label_timespans: ['3months'] 9 | 10 | label_config: 11 | query: | 12 | select 13 | entity_id, 14 | bool_or(result = 'fail')::integer as outcome 15 | from semantic.events 16 | where '{as_of_date}'::timestamp <= date 17 | and date < '{as_of_date}'::timestamp + interval '{label_timespan}' 18 | group by 
entity_id 19 | name: 'failed_inspections' 20 | 21 | feature_aggregations: 22 | - 23 | prefix: 'inspections' 24 | from_obj: 'semantic.events' 25 | knowledge_date_column: 'date' 26 | 27 | aggregates_imputation: 28 | count: 29 | type: 'zero_noflag' 30 | 31 | aggregates: 32 | - 33 | quantity: 34 | total: "*" 35 | metrics: 36 | - 'count' 37 | 38 | intervals: ['all'] 39 | 40 | model_grid_preset: 'quickstart' 41 | 42 | scoring: 43 | testing_metric_groups: 44 | - 45 | metrics: [precision@] 46 | thresholds: 47 | percentiles: [10] 48 | 49 | 50 | training_metric_groups: 51 | - 52 | metrics: [precision@] 53 | thresholds: 54 | percentiles: [10] 55 | -------------------------------------------------------------------------------- /example/dirtyduck/images/distance_from_best_precision@10_pct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/example/dirtyduck/images/distance_from_best_precision@10_pct.png -------------------------------------------------------------------------------- /example/dirtyduck/images/eis_01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/example/dirtyduck/images/eis_01.png -------------------------------------------------------------------------------- /example/dirtyduck/images/inspections_baseline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/example/dirtyduck/images/inspections_baseline.png -------------------------------------------------------------------------------- /example/dirtyduck/images/inspections_dt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/example/dirtyduck/images/inspections_dt.png -------------------------------------------------------------------------------- /example/dirtyduck/images/inspections_label_failed_01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/example/dirtyduck/images/inspections_label_failed_01.png -------------------------------------------------------------------------------- /example/dirtyduck/images/metric_over_time_precision@10_pct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/example/dirtyduck/images/metric_over_time_precision@10_pct.png -------------------------------------------------------------------------------- /example/dirtyduck/images/precision@10_pct_next_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/example/dirtyduck/images/precision@10_pct_next_time.png -------------------------------------------------------------------------------- /example/dirtyduck/images/regret_distance_from_best_rules_precision@10_pct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/example/dirtyduck/images/regret_distance_from_best_rules_precision@10_pct.png 
-------------------------------------------------------------------------------- /example/dirtyduck/images/regret_over_time_precision@10_pct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/example/dirtyduck/images/regret_over_time_precision@10_pct.png -------------------------------------------------------------------------------- /example/dirtyduck/images/results_model_group_ids.json: -------------------------------------------------------------------------------- 1 | {"best_current_value_precision@_10_pct": [7, 6, 5], "best_average_value_precision@_10_pct": [6, 7, 4], "lowest_metric_variance_precision@_10_pct": [1, 2, 3], "most_frequent_best_dist_precision@_10_pct_0.05": [6, 4, 5]} -------------------------------------------------------------------------------- /example/dirtyduck/images/simple_test_skeleton.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/example/dirtyduck/images/simple_test_skeleton.png -------------------------------------------------------------------------------- /example/dirtyduck/output/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/example/dirtyduck/output/.gitkeep -------------------------------------------------------------------------------- /example/dirtyduck/output/images/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/example/dirtyduck/output/images/.gitkeep -------------------------------------------------------------------------------- /example/dirtyduck/postmodeling/database.yaml: -------------------------------------------------------------------------------- 1 | host: food_db 2 | user: food_user 3 | password: some_password 4 | port: 5432 5 | dbname: food 6 | -------------------------------------------------------------------------------- /example/dirtyduck/postmodeling/eis_postmodeling_config.yaml: -------------------------------------------------------------------------------- 1 | # Postmodeling Configuration File 2 | 3 | project_path: '/triage' # Project path defined in triage with matrices and models 4 | audition_output_path: '/triage/audition/eis/results_model_group_ids.json' 5 | 6 | thresholds: # Thresholds for defining positive predictions 7 | rank_abs: [50, 100, 250] 8 | rank_pct: [5, 10, 25] 9 | 10 | baseline_query: | # SQL query for defining a baseline for comparison in plots. It needs a metric and parameter 11 | select g.model_group_id, 12 | m.model_id, 13 | extract('year' from m.evaluation_end_time) as as_of_date_year, 14 | m.metric, 15 | m.parameter, 16 | m.stochastic_value, 17 | m.num_labeled_examples, 18 | m.num_labeled_above_threshold, 19 | m.num_positive_labels 20 | from test_results.evaluations m 21 | left join triage_metadata.models g 22 | using(model_id) 23 | where g.model_group_id = 20 24 | and metric = 'precision@' 25 | and parameter = '10_pct' 26 | 27 | max_depth_error_tree: 5 # For error trees, how deep should the decision trees go?
28 | n_features_plots: 10 # Number of features for importances 29 | figsize: [12, 12] # Default size for plots 30 | fontsize: 20 # Default fontsize for plots 31 | -------------------------------------------------------------------------------- /example/dirtyduck/postmodeling/inspection_jaccard_on_lists_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/example/dirtyduck/postmodeling/inspection_jaccard_on_lists_over_time.png -------------------------------------------------------------------------------- /example/dirtyduck/postmodeling/inspection_mg_prec_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/example/dirtyduck/postmodeling/inspection_mg_prec_over_time.png -------------------------------------------------------------------------------- /example/dirtyduck/postmodeling/inspection_mg_recall_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/example/dirtyduck/postmodeling/inspection_mg_recall_over_time.png -------------------------------------------------------------------------------- /example/dirtyduck/postmodeling/inspection_model_group_39_model_125_feature_group_importances.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/example/dirtyduck/postmodeling/inspection_model_group_39_model_125_feature_group_importances.png -------------------------------------------------------------------------------- /example/dirtyduck/postmodeling/inspection_model_group_39_model_125_feature_importances.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/example/dirtyduck/postmodeling/inspection_model_group_39_model_125_feature_importances.png -------------------------------------------------------------------------------- /example/dirtyduck/postmodeling/inspection_model_group_39_model_125_rayid_curve.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/example/dirtyduck/postmodeling/inspection_model_group_39_model_125_rayid_curve.png -------------------------------------------------------------------------------- /example/dirtyduck/postmodeling/inspection_postmodeling_config.yaml: -------------------------------------------------------------------------------- 1 | # Postmodeling Configuration File 2 | 3 | project_path: '/triage' # Project path defined in triage with matrices and models 4 | 5 | model_group_id: 6 | - 39 7 | - 9 8 | - 29 9 | - 30 10 | 11 | thresholds: # Thresholds for defining positive predictions 12 | rank_abs: [50, 100, 250] 13 | rank_pct: [5, 10, 25] 14 | 15 | baseline_query: | # SQL query for defining a baseline for comparison in plots. 
It needs a metric and parameter 16 | select g.model_group_id, 17 | m.model_id, 18 | extract('year' from m.evaluation_end_time) as as_of_date_year, 19 | m.metric, 20 | m.parameter, 21 | m.stochastic_value, 22 | m.num_labeled_examples, 23 | m.num_labeled_above_threshold, 24 | m.num_positive_labels 25 | from test_results.evaluations as m 26 | left join triage_metadata.models as g 27 | using(model_id) 28 | where g.model_group_id = 1 29 | and metric = 'precision@' 30 | and parameter = '15_pct' 31 | 32 | max_depth_error_tree: 5 # For error trees, how deep should the decision trees go? 33 | n_features_plots: 10 # Number of features for importances 34 | figsize: [12, 12] # Default size for plots 35 | fontsize: 20 # Default fontsize for plots 36 | -------------------------------------------------------------------------------- /example/label/events.sql: -------------------------------------------------------------------------------- 1 | select 2 | entity_id, 3 | bool_or(outcome::bool)::integer as outcome 4 | from events 5 | where outcome_date >= '{as_of_date}' 6 | and outcome_date < '{as_of_date}'::timestamp + interval '{label_timespan}' 7 | group by entity_id 8 | -------------------------------------------------------------------------------- /manage.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from pathlib import Path 3 | 4 | from argcmdr import local, LocalRoot, Local 5 | from plumbum import local as plumlocal 6 | 7 | 8 | ROOT_PATH = Path(__file__).parent.resolve() 9 | 10 | 11 | class Development(LocalRoot): 12 | """Commands to aid in Triage library development""" 13 | pass 14 | 15 | 16 | @Development.register 17 | @local('remainder', metavar='alembic arguments', nargs=argparse.REMAINDER) 18 | def alembic(context, args): 19 | """Configuration wrapper to use the Alembic schema migrations library for Triage development.
20 | Try `alembic -h` or `manage alembic -- -h` to see a description of all 21 | the available subcommands""" 22 | return context.local['env'][ 23 | 'PYTHONPATH=' + str(ROOT_PATH / 'src'), 24 | 'alembic', 25 | '-c', ROOT_PATH / 'src' / 'triage' / 'component' / 'results_schema' / 'alembic.ini', 26 | '-x', 'db_config_file=database.yaml', 27 | args.remainder, 28 | ] 29 | 30 | 31 | @Development.register 32 | class Docs(Local): 33 | """View Triage documentation through local server""" 34 | def prepare(self, args): 35 | yield plumlocal['python']['docs/update_docs.py'] 36 | with plumlocal.cwd(ROOT_PATH / 'docs'): 37 | yield plumlocal['mkdocs']['serve'] 38 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | testpaths = src/tests/ 3 | -------------------------------------------------------------------------------- /requirement/dev.txt: -------------------------------------------------------------------------------- 1 | -r include/build.txt 2 | bumpversion==0.6.0 3 | mkdocs==1.6.1 4 | pymdown-extensions==10.8 5 | mkdocs-material==9.4.6 6 | mkdocstrings==0.29.1 7 | mkdocstrings-python==1.16.10 8 | black==22.3.0 9 | -------------------------------------------------------------------------------- /requirement/extras-rq.txt: -------------------------------------------------------------------------------- 1 | rq==1.4.3 # pyup: ignore 2 | redis 3 | -------------------------------------------------------------------------------- /requirement/include/build.txt: -------------------------------------------------------------------------------- 1 | wheel==0.38.2 2 | -------------------------------------------------------------------------------- /requirement/include/lint.txt: -------------------------------------------------------------------------------- 1 | flake8==4.0.1 2 | -------------------------------------------------------------------------------- /requirement/include/test-management.txt: -------------------------------------------------------------------------------- 1 | codecov==2.1.13 2 | coverage>=4.4 3 | tox==3.25.0 4 | -------------------------------------------------------------------------------- /requirement/main.txt: -------------------------------------------------------------------------------- 1 | polars==0.18.2 2 | pyarrow>=12.0.1 3 | numpy==1.26.0 4 | pandas==1.5.0 5 | alembic==1.7.7 6 | SQLAlchemy==1.3.18 # pyup: ignore 7 | PyYAML==6.0.2 8 | psycopg2-binary==2.9.3 9 | boto3==1.22.4 10 | click==8.1.3 11 | inflection==0.5.1 12 | sqlalchemy-postgres-copy==0.5.0 13 | retrying==1.3.3 14 | Dickens==1.0.1 15 | signalled-timeout==1.0.0 16 | wrapt==1.14.0 17 | argcmdr==0.7.0 18 | sqlparse==0.4.4 19 | pebble==4.6.3 20 | adjustText==0.7.3 21 | graphviz==0.20 22 | requests==2.31.0 23 | coloredlogs==15.0.1 24 | verboselogs==1.7 25 | s3fs==0.4.2 # pyup: ignore 26 | scikit-learn==1.6.1 27 | matplotlib==3.5.1 28 | seaborn==0.11.2 29 | ohio==0.5.0 30 | aequitas==0.42.0 31 | plotly==6.0.1 32 | jupyter==1.0.0 -------------------------------------------------------------------------------- /requirement/test.txt: -------------------------------------------------------------------------------- 1 | -r include/lint.txt 2 | -r include/test-management.txt 3 | parsedatetime==2.6 4 | csvkit==1.0.7 5 | factory_boy==3.2.1 6 | testing.postgresql==1.3.0 7 | pytest==6.2.5 #<4.0.0 # pyup: ignore 8 | pytest-cov==3.0.0 9 | moto==3.1.7 10 | fakeredis==1.7.1 11 | hypothesis==6.46.1 12 
| -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 5.5.1 3 | commit = True 4 | tag = True 5 | 6 | [bumpversion:file:setup.py] 7 | search = version='{current_version}' 8 | replace = version='{new_version}' 9 | 10 | [bumpversion:file:src/triage/__init__.py] 11 | search = __version__ = '{current_version}' 12 | replace = __version__ = '{new_version}' 13 | 14 | [bdist_wheel] 15 | universal = 1 16 | 17 | [flake8] 18 | exclude = docs 19 | 20 | [pycodestyle] 21 | max-line-length = 88 22 | statistics = True 23 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import re 4 | from pathlib import Path 5 | from setuptools import find_packages, setup 6 | 7 | 8 | ROOT_PATH = Path(__file__).parent 9 | 10 | LICENSE_PATH = ROOT_PATH / "LICENSE" 11 | 12 | README_PATH = ROOT_PATH / "README.md" 13 | 14 | REQUIREMENTS_PATH = ROOT_PATH / "requirement" / "main.txt" 15 | 16 | REQUIREMENTS_TEST_PATH = ROOT_PATH / "requirement" / "test.txt" 17 | 18 | REQUIREMENTS_RQ_PATH = ROOT_PATH / "requirement" / "extras-rq.txt" 19 | 20 | 21 | def stream_requirements(fd): 22 | """For a given requirements file descriptor, generate lines of 23 | distribution requirements, ignoring comments and chained requirement 24 | files. 25 | 26 | """ 27 | for line in fd: 28 | cleaned = re.sub(r"#.*$", "", line).strip() 29 | if cleaned and not cleaned.startswith("-r"): 30 | yield cleaned 31 | 32 | 33 | with REQUIREMENTS_PATH.open() as requirements_file: 34 | REQUIREMENTS = list(stream_requirements(requirements_file)) 35 | 36 | 37 | with REQUIREMENTS_TEST_PATH.open() as test_requirements_file: 38 | REQUIREMENTS_TEST = REQUIREMENTS[:] 39 | REQUIREMENTS_TEST.extend(stream_requirements(test_requirements_file)) 40 | 41 | with REQUIREMENTS_RQ_PATH.open() as rq_requirements_file: 42 | RQ_REQUIREMENTS = list(stream_requirements(rq_requirements_file)) 43 | 44 | 45 | setup( 46 | name='triage', 47 | version='5.5.1', 48 | description="Risk modeling and prediction", 49 | long_description=README_PATH.read_text(), 50 | long_description_content_type="text/markdown", 51 | author="Center for Data Science and Public Policy", 52 | author_email="datascifellows@gmail.com", 53 | url="https://dssg.github.io/triage/", 54 | project_urls={ 55 | "Documentation": "https://dssg.github.io/triage/", 56 | "Source Code": "https://github.com/dssg/triage", 57 | "Tutorial": "https://dssg.github.io/triage/dirtyduck/", 58 | }, 59 | packages=find_packages("src", exclude=["tests", "tests.*"]), 60 | package_dir={"": "src"}, 61 | include_package_data=True, 62 | install_requires=REQUIREMENTS, 63 | entry_points={ 64 | "console_scripts": ["triage = triage.cli:execute"], 65 | }, 66 | extras_require={"rq": RQ_REQUIREMENTS}, 67 | license="MIT License", 68 | zip_safe=False, 69 | keywords="triage", 70 | classifiers=[ 71 | "Development Status :: 2 - Pre-Alpha", 72 | "Intended Audience :: Developers", 73 | "License :: OSI Approved :: MIT License", 74 | "Natural Language :: English", 75 | "Programming Language :: Python :: 3", 76 | "Programming Language :: Python :: 3.8", 77 | "Programming Language :: Python :: 3.9", 78 | "Programming Language :: Python :: 3.10", 79 | ], 80 | python_requires=">=3.8", 81 | test_suite="tests", 82 |
tests_require=REQUIREMENTS_TEST, 83 | ) 84 | -------------------------------------------------------------------------------- /src/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | -------------------------------------------------------------------------------- /src/tests/architect_tests/README.md: -------------------------------------------------------------------------------- 1 | Write some tests! 2 | -------------------------------------------------------------------------------- /src/tests/architect_tests/__init__.py: -------------------------------------------------------------------------------- 1 | """Tests for the app.""" 2 | -------------------------------------------------------------------------------- /src/tests/audition_tests/__init__.py: -------------------------------------------------------------------------------- 1 | """Tests for the app.""" 2 | -------------------------------------------------------------------------------- /src/tests/audition_tests/test_model_group_performance.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import patch 2 | 3 | import numpy as np 4 | import testing.postgresql 5 | from sqlalchemy import create_engine 6 | 7 | from triage.component.audition.model_group_performance import ( 8 | ModelGroupPerformancePlotter, 9 | ) 10 | 11 | from .utils import create_sample_distance_table 12 | 13 | 14 | def test_ModelGroupPerformancePlotter_generate_plot_data(): 15 | with testing.postgresql.Postgresql() as postgresql: 16 | engine = create_engine(postgresql.url()) 17 | distance_table, model_groups = create_sample_distance_table(engine) 18 | plotter = ModelGroupPerformancePlotter(distance_table) 19 | df = plotter.generate_plot_data( 20 | metric="precision@", 21 | parameter="100_abs", 22 | model_group_ids=[1, 2], 23 | train_end_times=["2014-01-01", "2015-01-01"], 24 | ) 25 | assert sorted(df["model_type"].unique()) == [ 26 | "best case", 27 | "mySpikeClassifier", 28 | "myStableClassifier", 29 | ] 30 | for value in df[df["model_group_id"] == 1]["raw_value"].values: 31 | assert np.isclose(value, 0.5) 32 | 33 | 34 | def test_ModelGroupPerformancePlotter_plot_all(): 35 | with patch( 36 | "triage.component.audition.model_group_performance.plot_cats" 37 | ) as plot_patch: 38 | with testing.postgresql.Postgresql() as postgresql: 39 | engine = create_engine(postgresql.url()) 40 | distance_table, model_groups = create_sample_distance_table(engine) 41 | plotter = ModelGroupPerformancePlotter(distance_table) 42 | plotter.plot_all( 43 | [{"metric": "precision@", "parameter": "100_abs"}], 44 | model_group_ids=[1, 2], 45 | train_end_times=["2014-01-01", "2015-01-01"], 46 | ) 47 | assert plot_patch.called 48 | args, kwargs = plot_patch.call_args 49 | assert "raw_value" in kwargs["frame"] 50 | assert "train_end_time" in kwargs["frame"] 51 | assert kwargs["x_col"] == "train_end_time" 52 | assert kwargs["y_col"] == "raw_value" 53 | -------------------------------------------------------------------------------- /src/tests/audition_tests/test_plotting.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import patch 2 | 3 | import pandas as pd 4 | from matplotlib import lines as mlines 5 | 6 | from triage.component.audition.plotting import ( 7 | generate_plot_lines, 8 | category_colordict, 9 | category_styledict, 10 | plot_cats, 11 | ) 12 | 13 | 14 | def test_generate_plot_lines(): 15 | 
colordict = {"cat1": "#001122", "cat2": "#112233", "cat3": "#223344"} 16 | styledict = {"cat1": "-", "cat2": "--", "cat3": "-"} 17 | plot_lines = generate_plot_lines(colordict, lambda x: "Cat {}".format(x), styledict) 18 | assert len(plot_lines) == 3 19 | for line in plot_lines: 20 | assert type(line) == mlines.Line2D 21 | assert "Cat " in line._label 22 | assert "-" in line._linestyle 23 | if line._label == "Cat 2": 24 | assert line._linestyle == "--" 25 | 26 | 27 | def test_category_colordict(): 28 | cmap_name = "tab10" 29 | categories = ["Cat1", "Cat2", "Cat3", "Cat4"] 30 | colordict = category_colordict(cmap_name, categories) 31 | assert len(colordict.keys()) == 4 32 | 33 | 34 | def test_category_colordict_with_highlight(): 35 | cmap_name = "tab10" 36 | colordict_with_highlight = category_colordict( 37 | cmap_name, ["Cat1", "Cat2", "Cat3", "Cat4"], "Cat2" 38 | ) 39 | colordict_without_highlight = category_colordict( 40 | cmap_name, ["Cat1", "Cat3", "Cat4"] 41 | ) 42 | for cat in ["Cat1", "Cat3", "Cat4"]: 43 | assert colordict_with_highlight[cat] == colordict_without_highlight[cat] 44 | assert colordict_with_highlight["Cat2"] == "#000000" 45 | 46 | 47 | def test_category_styledict(): 48 | colordict = {"cat1": "#001122", "cat2": "#112233", "cat3": "#223344"} 49 | assert category_styledict(colordict, "cat3") == { 50 | "cat1": "-", 51 | "cat2": "-", 52 | "cat3": "--", 53 | } 54 | 55 | 56 | def test_plot_cats(): 57 | test_df = pd.DataFrame.from_dict( 58 | { 59 | "cats": ["tuxedo", "maine coon", "lion!"], 60 | "groups": ["i", "dont", "know"], 61 | "col1": [1, 2, 3], 62 | "col2": [4, 5, 6], 63 | "col3": [7, 8, 9], 64 | } 65 | ) 66 | # hard to make many assertions, but we can make sure it gets to the end 67 | # and shows the contents 68 | with patch("triage.component.audition.plotting.plt.show") as show_patch: 69 | plot_cats(test_df, "col1", "col2", cat_col="cats", grp_col="groups") 70 | assert show_patch.called 71 | -------------------------------------------------------------------------------- /src/tests/catwalk_tests/README.md: -------------------------------------------------------------------------------- 1 | Write some tests! 
2 | -------------------------------------------------------------------------------- /src/tests/catwalk_tests/__init__.py: -------------------------------------------------------------------------------- 1 | """Tests for the app.""" 2 | -------------------------------------------------------------------------------- /src/tests/catwalk_tests/test_estimators.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import pytest 4 | 5 | from triage.component.catwalk.estimators.transformers import CutOff 6 | from triage.component.catwalk.estimators.classifiers import ScaledLogisticRegression 7 | 8 | from sklearn import linear_model 9 | 10 | from sklearn import datasets 11 | from sklearn import preprocessing 12 | from sklearn.pipeline import Pipeline 13 | from sklearn.model_selection import train_test_split 14 | 15 | 16 | @pytest.fixture 17 | def data(): 18 | dataset = datasets.load_breast_cancer() 19 | X = dataset.data 20 | y = dataset.target 21 | 22 | X_train, X_test, y_train, y_test = train_test_split( 23 | X, y, test_size=0.3, random_state=12345 24 | ) 25 | 26 | return {"X_train": X_train, "X_test": X_test, "y_train": y_train, "y_test": y_test} 27 | 28 | 29 | def test_cutoff_warning(): 30 | X_data = [1, 2, 0.5, 0.7, 100, -1, -23, 0] 31 | 32 | cutoff = CutOff() 33 | 34 | with pytest.raises(ValueError): 35 | cutoff.fit_transform(X_data) 36 | 37 | 38 | def test_cutoff_transformer(): 39 | cutoff = CutOff() 40 | 41 | X_data = np.array([1, 2, 0.5, 0.7, 100, -1, -23, 0]).reshape(-1,1) 42 | 43 | assert np.all(cutoff.fit_transform(X_data) == np.array([1, 1, 0.5, 0.7, 1, 0, 0, 0]).reshape(-1,1)) 44 | 45 | 46 | def test_cutoff_inside_a_pipeline(data): 47 | minmax_scaler = preprocessing.MinMaxScaler() 48 | dsapp_cutoff = CutOff() 49 | 50 | pipeline = Pipeline( 51 | [("minmax_scaler", minmax_scaler), ("dsapp_cutoff", dsapp_cutoff)] 52 | ) 53 | 54 | pipeline.fit(data["X_train"], data["y_train"]) 55 | 56 | X_fake_new_data = data["X_test"][-1, :].reshape(1, -1) + 0.5 57 | 58 | mms = preprocessing.MinMaxScaler().fit(data["X_train"]) 59 | 60 | assert np.all( 61 | (mms.transform(X_fake_new_data) > 1) 62 | == (pipeline.transform(X_fake_new_data) == 1) 63 | ) 64 | 65 | 66 | def test_dsapp_lr(data): 67 | dsapp_lr = ScaledLogisticRegression() 68 | dsapp_lr.fit(data["X_train"], data["y_train"]) 69 | 70 | minmax_scaler = preprocessing.MinMaxScaler() 71 | dsapp_cutoff = CutOff() 72 | lr = linear_model.LogisticRegression(solver='lbfgs') 73 | 74 | pipeline = Pipeline( 75 | [("minmax_scaler", minmax_scaler), ("dsapp_cutoff", dsapp_cutoff), ("lr", lr)] 76 | ) 77 | 78 | pipeline.fit(data["X_train"], data["y_train"]) 79 | 80 | assert np.all(dsapp_lr.predict(data["X_test"]) == pipeline.predict(data["X_test"])) 81 | -------------------------------------------------------------------------------- /src/tests/catwalk_tests/test_feature_importances.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from triage.component.catwalk.feature_importances import ( 4 | get_feature_importances, 5 | ) 6 | 7 | from sklearn import datasets 8 | from sklearn.svm import SVC 9 | from sklearn.dummy import DummyClassifier 10 | from sklearn.ensemble import RandomForestClassifier 11 | from sklearn.linear_model import LogisticRegression 12 | 13 | from sklearn.model_selection import train_test_split 14 | 15 | 16 | @pytest.fixture 17 | def trained_models(): 18 | dataset = datasets.load_breast_cancer() 19 | X = dataset.data 20 | y = 
dataset.target 21 | 22 | X_train, X_test, y_train, y_test = train_test_split( 23 | X, y, test_size=0.3, random_state=12345 24 | ) 25 | 26 | rf = RandomForestClassifier(n_estimators=100) 27 | rf.fit(X_train, y_train) 28 | 29 | lr = LogisticRegression(solver='liblinear') 30 | lr.fit(X_train, y_train) 31 | 32 | svc_w_linear_kernel = SVC(kernel="linear", gamma='auto') 33 | svc_w_linear_kernel.fit(X_train, y_train) 34 | 35 | svc_wo_linear_kernel = SVC(gamma='auto') 36 | svc_wo_linear_kernel.fit(X_train, y_train) 37 | 38 | dummy = DummyClassifier(strategy='stratified') 39 | dummy.fit(X_train, y_train) 40 | 41 | return { 42 | "RF": rf, 43 | "LR": lr, 44 | "SVC_w_linear_kernel": svc_w_linear_kernel, 45 | "Dummy": dummy, 46 | "SVC_wo_linear_kernel": svc_wo_linear_kernel, 47 | } 48 | 49 | def test_correct_feature_importances_for_lr(trained_models): 50 | feature_importances = get_feature_importances(trained_models["LR"]) 51 | 52 | # The intercept is ignored, so only the 30 feature odds ratios are returned 53 | assert feature_importances.shape == (30,) 54 | 55 | 56 | def test_correct_feature_importances_for_rf(trained_models): 57 | feature_importances = get_feature_importances(trained_models["RF"]) 58 | assert feature_importances.shape == (30,) 59 | 60 | 61 | def test_correct_feature_importances_for_svc_w_linear_kernel(trained_models): 62 | feature_importances = get_feature_importances( 63 | trained_models["SVC_w_linear_kernel"]) 64 | assert feature_importances.shape == (30,) 65 | 66 | 67 | def test_correct_feature_importances_for_svc_wo_linear_kernel(trained_models): 68 | feature_importances = get_feature_importances( 69 | trained_models["SVC_wo_linear_kernel"] 70 | ) 71 | assert feature_importances is None 72 | 73 | 74 | def test_correct_feature_importances_for_dummy(trained_models): 75 | feature_importances = get_feature_importances(trained_models["Dummy"]) 76 | assert feature_importances is None 77 | -------------------------------------------------------------------------------- /src/tests/catwalk_tests/test_individual_importance_uniform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from triage.component.catwalk.individual_importance.uniform import uniform_distribution 4 | from tests.utils import rig_engines, get_matrix_store, matrix_metadata_creator 5 | import datetime 6 | 7 | from tests.results_tests.factories import ( 8 | ModelFactory, 9 | FeatureImportanceFactory, 10 | ) 11 | 12 | 13 | def test_uniform_distribution(): 14 | with rig_engines() as (db_engine, project_storage): 15 | model = ModelFactory() 16 | feature_importances = [ 17 | FeatureImportanceFactory(model_rel=model, feature="feature_{}".format(i)) 18 | for i in range(0, 10) 19 | ] 20 | data_dict = {"entity_id": [1, 1], "as_of_date": ["2016-01-01", "2017-01-01"], "label": [0, 1]} 21 | for imp in feature_importances: 22 | data_dict[imp.feature] = [0.5, 0.5] 23 | metadata = matrix_metadata_creator() 24 | test_store = get_matrix_store( 25 | project_storage, 26 | pd.DataFrame.from_dict(data_dict), 27 | metadata, 28 | ) 29 | results = uniform_distribution( 30 | db_engine, 31 | model_id=model.model_id, 32 | as_of_date=datetime.date(2016, 1, 1), 33 | test_matrix_store=test_store, 34 | n_ranks=5, 35 | ) 36 | 37 | assert len(results) == 5 # 5 features x 1 entity for this as_of_date 38 | for result in results: 39 | assert "entity_id" in result 40 | assert "feature_name" in result 41 | assert "score" in result 42 | assert "feature_value" in result 43 | assert result["feature_value"] == 0.5 44 | assert result["score"]
>= 0 45 | assert result["score"] <= 1 46 | assert isinstance(result["feature_name"], str) 47 | assert result["entity_id"] in [1, 2] 48 | -------------------------------------------------------------------------------- /src/tests/catwalk_tests/test_metrics.py: -------------------------------------------------------------------------------- 1 | from triage.component.catwalk.metrics import fpr 2 | from triage.component.catwalk.evaluation import ModelEvaluator 3 | 4 | 5 | def test_metric_directionality(): 6 | """All metrics must be wrapped using the @Metric decorator available 7 | in catwalk.metrics to provide an `greater_is_better` attribute which must 8 | be one of True or False. 9 | """ 10 | for met in ModelEvaluator.available_metrics.values(): 11 | assert hasattr(met, "greater_is_better") 12 | assert met.greater_is_better in (True, False) 13 | 14 | 15 | def test_fpr(): 16 | predictions_binary = [1, 1, 1, 0, 0, 0, 0, 0] 17 | labels = [1, 1, 0, 1, 0, 0, 0, 1] 18 | 19 | result = fpr([], predictions_binary, labels, []) 20 | # false positives = 1 21 | # total negatives = 4 22 | assert result == 0.25 23 | -------------------------------------------------------------------------------- /src/tests/catwalk_tests/utils.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import random 3 | import tempfile 4 | from contextlib import contextmanager 5 | import pytest 6 | 7 | import numpy as np 8 | import pandas as pd 9 | import yaml 10 | 11 | from triage.component.catwalk.storage import ( 12 | ProjectStorage, 13 | ) 14 | from triage.util.structs import FeatureNameList 15 | 16 | 17 | def fake_labels(length): 18 | return np.array([random.choice([True, False]) for i in range(0, length)]) 19 | 20 | 21 | @pytest.fixture 22 | def sample_metadata(): 23 | return { 24 | "feature_start_time": datetime.date(2012, 12, 20), 25 | "end_time": datetime.date(2016, 12, 20), 26 | "label_name": "label", 27 | "as_of_date_frequency": "1w", 28 | "max_training_history": "5y", 29 | "state": "default", 30 | "cohort_name": "default", 31 | "label_timespan": "1y", 32 | "metta-uuid": "1234", 33 | "feature_names": FeatureNameList(["ft1", "ft2"]), 34 | "feature_groups": ["all: True"], 35 | "indices": ["entity_id"], 36 | } 37 | 38 | 39 | @pytest.fixture 40 | def sample_df(): 41 | return pd.DataFrame.from_dict( 42 | { 43 | "entity_id": [1, 2], 44 | "feature_one": [3, 4], 45 | "feature_two": [5, 6], 46 | "label": ["good", "bad"], 47 | } 48 | ).set_index("entity_id") 49 | 50 | 51 | @pytest.fixture 52 | def sample_matrix_store(sample_df, sample_metadata): 53 | with tempfile.TemporaryDirectory() as tempdir: 54 | project_storage = ProjectStorage(tempdir) 55 | store = project_storage.matrix_storage_engine().get_store("1234") 56 | store.matrix = sample_df 57 | store.metadata = sample_metadata 58 | return store 59 | -------------------------------------------------------------------------------- /src/tests/collate_tests/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | -------------------------------------------------------------------------------- /src/tests/collate_tests/create_inspections_subset.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | import random 3 | from contextlib import contextmanager 4 | 5 | import pandas as pd 6 | import requests 7 | from tqdm import tqdm 8 | import re 9 | 10 | 11 | @contextmanager 12 | def download(url): 13 | "download 
`url` to a file, returning the file name" 14 | with tempfile.NamedTemporaryFile(mode="wb") as f: 15 | response = requests.get(url, stream=True) 16 | for data in tqdm(response.iter_content()): 17 | f.write(data) 18 | f.flush() 19 | yield f.name 20 | 21 | 22 | def create_subset(src, dest, n=250): 23 | "Given a csv file `src`, create a subset `dest` with `n` unique entities" 24 | df = pd.read_csv(src) 25 | lics = pd.unique(df["License #"]) 26 | sublics = lics[random.sample(range(0, len(lics)), n)] 27 | subset = df[df["License #"].isin(sublics)] 28 | # Make the column names a little more readable 29 | subset.columns = map(clean_column_name, subset.columns) 30 | subset.to_csv(dest, index=False) 31 | 32 | 33 | def clean_column_name(col): 34 | col = col.lower() 35 | col = col.replace(" ", "_") 36 | col = col.replace("#", "no") 37 | return re.sub(r"[\W]+", "", col) 38 | 39 | 40 | if __name__ == "__main__": 41 | # download the entire Chicago restaurant inspections CSV file 42 | with download( 43 | "https://data.cityofchicago.org/api/views/4ijn-s7e5/rows.csv?accessType=DOWNLOAD" 44 | ) as f: 45 | create_subset(f, "food_inspections_subset.csv") 46 | -------------------------------------------------------------------------------- /src/tests/collate_tests/initialize_db.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import subprocess 3 | from sqlalchemy import create_engine 4 | 5 | 6 | DATA_NAME = "food_inspections_subset.csv" 7 | DATA_PATH = pathlib.Path(__file__).with_name(DATA_NAME) 8 | 9 | 10 | def handler(database): 11 | engine = create_engine(database.url()) 12 | connection = engine.connect() 13 | try: 14 | load_data(connection) 15 | finally: 16 | connection.close() 17 | 18 | 19 | def load_data(connection): 20 | connection.execute("DROP TABLE IF EXISTS food_inspections") 21 | subprocess.run( 22 | [ 23 | "csvsql", 24 | "-v", 25 | "--no-constraints", 26 | "--tables", 27 | "food_inspections", 28 | "--insert", 29 | "--db", 30 | str(connection.engine.url), 31 | str(DATA_PATH), 32 | ], 33 | check=True, 34 | ) 35 | connection.execute("CREATE INDEX ON food_inspections(license_no, inspection_date)") 36 | 37 | # create a state table for license/date 38 | connection.execute("DROP TABLE IF EXISTS inspection_states") 39 | connection.execute( 40 | """\ 41 | CREATE TABLE inspection_states AS ( 42 | SELECT license_no, date 43 | FROM (SELECT DISTINCT license_no FROM food_inspections) a 44 | CROSS JOIN (SELECT DISTINCT inspection_date as date FROM food_inspections) b 45 | )""" 46 | ) 47 | connection.execute("CREATE INDEX ON inspection_states(license_no, date)") 48 | 49 | # create an alternate state table with a different date column 50 | connection.execute("DROP TABLE IF EXISTS inspection_states_diff_colname") 51 | connection.execute( 52 | """\ 53 | CREATE TABLE inspection_states_diff_colname 54 | AS select license_no, date as aggregation_date 55 | FROM inspection_states 56 | """ 57 | ) 58 | connection.execute( 59 | """\ 60 | CREATE INDEX ON 61 | inspection_states_diff_colname(license_no, aggregation_date) 62 | """ 63 | ) 64 | 65 | # create a state table for license only 66 | connection.execute("DROP TABLE IF EXISTS all_licenses") 67 | connection.execute( 68 | """\ 69 | CREATE TABLE all_licenses AS ( 70 | SELECT DISTINCT license_no FROM food_inspections 71 | )""" 72 | ) 73 | connection.execute("CREATE INDEX ON all_licenses(license_no)") 74 | -------------------------------------------------------------------------------- /src/tests/example_schema.yaml:
-------------------------------------------------------------------------------- 1 | --- 2 | entities: 3 | - 4 | name: house 5 | attributes: 6 | siding: str 7 | construction_year: int 8 | - 9 | name: kid 10 | attributes: 11 | age: int 12 | - 13 | name: insurance_policy 14 | attributes: 15 | deductible: bool 16 | - 17 | name: address 18 | spatial: True 19 | attributes: 20 | street_address: str 21 | - 22 | name: inspection 23 | event: True 24 | attributes: 25 | result: bool 26 | 27 | relationships: 28 | - 29 | name: residency 30 | entity_one: house 31 | entity_two: kid 32 | type: m2m 33 | temporal: True 34 | - 35 | entity_one: house 36 | entity_two: insurance_policy 37 | type: o2m 38 | temporal: True 39 | - 40 | entity_one: house 41 | entity_two: address 42 | type: o2m 43 | - 44 | entity_one: house 45 | entity_two: inspection 46 | type: o2m 47 | primary_entity: kid 48 | outcome_variable: lead_level 49 | -------------------------------------------------------------------------------- /src/tests/postmodeling_tests/test_crosstabs.py: -------------------------------------------------------------------------------- 1 | from triage.component.postmodeling.crosstabs import run_crosstabs 2 | from triage.database_reflection import table_has_data 3 | 4 | 5 | def test_run_crosstabs(finished_experiment, crosstabs_config): 6 | run_crosstabs(finished_experiment.db_engine, crosstabs_config) 7 | expected_table_name = ( 8 | crosstabs_config.output["schema"] + "." + crosstabs_config.output["table"] 9 | ) 10 | table_has_data(expected_table_name, finished_experiment.db_engine) 11 | -------------------------------------------------------------------------------- /src/tests/postmodeling_tests/test_without_predictions.py: -------------------------------------------------------------------------------- 1 | from triage.component.postmodeling.deprecated.model_group_evaluator import ModelGroupEvaluator 2 | from triage.component.postmodeling.deprecated.model_evaluator import ModelEvaluator 3 | import pytest 4 | 5 | 6 | @pytest.fixture(scope="module") 7 | def model_group_evaluator(finished_experiment_without_predictions): 8 | return ModelGroupEvaluator((1, 1), finished_experiment_without_predictions.db_engine) 9 | 10 | 11 | @pytest.fixture(scope="module") 12 | def model_evaluator(finished_experiment_without_predictions): 13 | return ModelEvaluator(1, 1, finished_experiment_without_predictions.db_engine) 14 | 15 | 16 | def test_ModelGroupEvaluator_metadata(model_group_evaluator): 17 | assert all(value for metadata_row in model_group_evaluator.metadata for key, value in metadata_row.items() ) 18 | 19 | 20 | def test_ModelGroupEvaluator_predictions(model_group_evaluator): 21 | with pytest.raises(RuntimeError): 22 | model_group_evaluator.predictions 23 | 24 | 25 | def test_ModelEvaluator_metadata(model_evaluator): 26 | assert all(value for key, value in model_evaluator.metadata.items()) 27 | 28 | 29 | def test_ModelEvaluator_predictions(model_evaluator): 30 | with pytest.raises(RuntimeError): 31 | model_evaluator.predictions 32 | -------------------------------------------------------------------------------- /src/tests/results_tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/src/tests/results_tests/__init__.py -------------------------------------------------------------------------------- /src/tests/results_tests/test_upgrade_if_clean.py: 
-------------------------------------------------------------------------------- 1 | from triage.component import results_schema 2 | from alembic import command, script 3 | import pytest 4 | 5 | 6 | def test_upgrade_if_clean_upgrades_if_clean(db_engine): 7 | results_schema.upgrade_if_clean(db_engine.url) 8 | db_version = db_engine.execute("select version_num from results_schema_versions").scalar() 9 | alembic_cfg = results_schema.alembic_config(db_engine.url) 10 | assert db_version == script.ScriptDirectory.from_config(alembic_cfg).get_current_head() 11 | 12 | 13 | def test_upgrade_if_clean_does_not_upgrade_if_not_clean(db_engine): 14 | command.upgrade(results_schema.alembic_config(dburl=db_engine.url), "head") 15 | command.downgrade(results_schema.alembic_config(dburl=db_engine.url), "-1") 16 | with pytest.raises(ValueError): 17 | results_schema.upgrade_if_clean(db_engine.url) 18 | -------------------------------------------------------------------------------- /src/tests/results_tests/test_valid_schema.py: -------------------------------------------------------------------------------- 1 | import testing.postgresql 2 | from sqlalchemy import create_engine 3 | 4 | from triage.component.results_schema import Base 5 | 6 | 7 | def test_full_schema(): 8 | with testing.postgresql.Postgresql() as postgres: 9 | engine = create_engine(postgres.url()) 10 | Base.metadata.create_all(bind=engine) 11 | -------------------------------------------------------------------------------- /src/tests/test_utils_pandas.py: -------------------------------------------------------------------------------- 1 | from triage.util.pandas import downcast_matrix 2 | from triage.component.catwalk.storage import MatrixStore 3 | from .utils import matrix_creator 4 | 5 | 6 | def test_downcast_matrix(): 7 | df = matrix_creator().set_index(MatrixStore.indices) 8 | downcasted_df = downcast_matrix(df) 9 | 10 | # make sure the contents are equivalent 11 | assert((downcasted_df == df).all().all()) 12 | 13 | # make sure the memory usage is lower because there would be no point of this otherwise 14 | assert downcasted_df.memory_usage().sum() < df.memory_usage().sum() 15 | -------------------------------------------------------------------------------- /src/tests/test_validation.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy import create_engine 2 | import testing.postgresql 3 | from unittest import mock 4 | 5 | from triage.component.catwalk.db import ensure_db 6 | 7 | from tests.utils import sample_config, populate_source_data, open_side_effect 8 | from triage.experiments.validate import ExperimentValidator 9 | 10 | 11 | def test_experiment_validator(): 12 | with testing.postgresql.Postgresql() as postgresql: 13 | db_engine = create_engine(postgresql.url()) 14 | ensure_db(db_engine) 15 | populate_source_data(db_engine) 16 | with mock.patch( 17 | "triage.util.conf.open", side_effect=open_side_effect 18 | ) as mock_file: 19 | ExperimentValidator(db_engine).run(sample_config("query")) 20 | ExperimentValidator(db_engine).run(sample_config("filepath")) 21 | -------------------------------------------------------------------------------- /src/tests/test_validation_primitives.py: -------------------------------------------------------------------------------- 1 | from triage.validation_primitives import string_is_tablesafe 2 | from hypothesis import given, example 3 | from hypothesis.strategies import text, characters 4 | 5 | 6 | # test with a variety of strings based on letters 
and numbers auto-generated by hypothesis 7 | # and also add a hardcoded example that includes underscores because those are fine 8 | @given(text(alphabet=characters(whitelist_categories=('Ll', 'Nd')), min_size=1)) 9 | @example('a_valid_name') 10 | def test_string_is_tablesafe(s): 11 | assert string_is_tablesafe(s) 12 | 13 | 14 | # test with a variety of strings based on unsafe characters auto-generated by hypothesis 15 | # and also add a hardcoded example that should be bad because it has spaces 16 | @given(text(alphabet='/ "A')) 17 | @example('spaces are not valid') 18 | @example('Neither_are_CAPITAL_letters') 19 | def test_string_is_not_tablesafe(s): 20 | assert not string_is_tablesafe(s) 21 | -------------------------------------------------------------------------------- /src/tests/timechop_tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/src/tests/timechop_tests/__init__.py -------------------------------------------------------------------------------- /src/tests/timechop_tests/test_plotting.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import patch 2 | from unittest import TestCase 3 | import yaml 4 | import matplotlib 5 | 6 | matplotlib.use("Agg") 7 | from triage.component.timechop import Timechop # noqa 8 | from triage.component.timechop.plotting import visualize_chops # noqa 9 | 10 | 11 | class VisualizeChopTest(TestCase): 12 | @property 13 | def chopper(self): 14 | # create a valid Timechop chopper 15 | # least brittle current way of doing this is by loading the 16 | # example_experiment_config.yaml file, because that is a 17 | # diligently updated file. If Timechop config changes, the 18 | # example config should change too 19 | with open("example/config/experiment.yaml") as fd: 20 | experiment_config = yaml.full_load(fd) 21 | return Timechop(**(experiment_config["temporal_config"])) 22 | 23 | # hard to make many assertions, but we can make sure it gets to the end 24 | # and shows the contents. 
25 | 26 | # we do one such test case to work out each combination of boolean arguments 27 | def test_default_args(self): 28 | with patch("triage.component.timechop.plotting.plt.show") as show_patch: 29 | visualize_chops(self.chopper) 30 | assert show_patch.called 31 | 32 | def test_no_as_of_times(self): 33 | with patch("triage.component.timechop.plotting.plt.show") as show_patch: 34 | visualize_chops(self.chopper, show_as_of_times=False) 35 | assert show_patch.called 36 | 37 | def test_no_boundaries(self): 38 | with patch("triage.component.timechop.plotting.plt.show") as show_patch: 39 | visualize_chops(self.chopper, show_boundaries=False) 40 | assert show_patch.called 41 | 42 | def test_no_boundaries_or_as_of_times(self): 43 | with patch("triage.component.timechop.plotting.plt.show") as show_patch: 44 | visualize_chops(self.chopper, show_as_of_times=False, show_boundaries=False) 45 | assert show_patch.called 46 | -------------------------------------------------------------------------------- /src/tests/timechop_tests/test_utils.py: -------------------------------------------------------------------------------- 1 | from triage.component.timechop.utils import convert_to_list 2 | 3 | 4 | def test_convert_to_list(): 5 | tests = [ 6 | {"val": "1 day", "expected_result": ["1 day"]}, 7 | {"val": ["1 day"], "expected_result": ["1 day"]}, 8 | {"val": 1, "expected_result": [1]}, 9 | ] 10 | for test in tests: 11 | assert convert_to_list(test["val"]) == test["expected_result"] 12 | -------------------------------------------------------------------------------- /src/triage/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | __author__ = """Center for Data Science and Public Policy""" 4 | __email__ = "datascifellows@gmail.com" 5 | __version__ = '5.5.1' # do not change to double-quotes, it will screw up bumpversion 6 | 7 | import logging 8 | import logging.config 9 | import yaml 10 | import pathlib 11 | 12 | 13 | logging_config = pathlib.Path(__file__).parent / 'config' / 'logging.yaml' 14 | 15 | with open(logging_config, 'r') as f: 16 | config = yaml.safe_load(f.read()) 17 | logging.config.dictConfig(config) 18 | 19 | 20 | from .util.db import create_engine 21 | 22 | 23 | 24 | __all__ = ('create_engine',) 25 | -------------------------------------------------------------------------------- /src/triage/component/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/src/triage/component/__init__.py -------------------------------------------------------------------------------- /src/triage/component/architect/README.md: -------------------------------------------------------------------------------- 1 | # The Architect 2 | 3 | Plan, design, and build train and test matrices 4 | 5 | [![Build Status](https://travis-ci.org/dssg/architect.svg?branch=master)](https://travis-ci.org/dssg/architect) 6 | [![codecov](https://codecov.io/gh/dssg/architect/branch/master/graph/badge.svg)](https://codecov.io/gh/dssg/architect) 7 | [![codeclimate](https://codeclimate.com/github/dssg/architect.png)](https://codeclimate.com/github/dssg/architect) 8 | 9 | In order to run classification algorithms on source data, this data must be properly organized into design matrices. 
Converting cleaned data into these matrices is not a trivial task; the process of creating the needed features and labels for an experiment from source data can be complicated, creating the matrices themselves out of features and labels can be inefficient, and there is opportunity at each step to leak data backwards in time, giving a model trained on a matrix an unfair advantage. 10 | 11 | The Architect addresses these issues with functionality aimed at all tasks between cleaned source data (in a PostgreSQL database) and design matrices. 12 | 13 | ## Components 14 | 15 | - [LabelGenerator](architect/label_generators.py): Create binary labels suitable for a design matrix by querying a database table containing outcome events. 16 | - [FeatureGenerator](architect/feature_generators.py): Create aggregate features suitable for a design matrix from a set of database tables containing events. Uses [collate](https://github.com/dssg/collate/) to build aggregation SQL queries. 17 | - [FeatureGroupCreator](architect/feature_group_creator.py), [FeatureGroupMixer](architect/feature_group_mixer.py): Create groupings of features, and mix them using different strategies (like 'leave one out') to test their effectiveness. 18 | - [Planner](architect/planner.py), [Builder](architect/builders.py): Build all design matrices needed for an experiment, taking into account different labels, state configurations, and feature groups. 19 | 20 | In addition to being usable individually to assist in different aspects of building matrices in your project, the Architect components are integrated into [triage](https://github.com/dssg/triage) as part of an entire modeling experiment that incorporates later tasks like model training and testing. 21 | 22 | ## Distributing, Building & Testing 23 | 24 | The Architect is a Python package distributable via `setuptools`. It may be installed directly using `easy_install` or `pip`, or listed as a dependency of another package (namely `triage`), under the package name `matrix-architect`. 25 | 26 | To build this package for development, its dependencies may be installed using `pip`: 27 | 28 | pip install -r requirements_dev.txt 29 | 30 | (or, without test and development dependencies, using **requirements.txt**). 31 | 32 | And, having built for development, to run tests: 33 | 34 | pytest 35 | -------------------------------------------------------------------------------- /src/triage/component/architect/__init__.py: -------------------------------------------------------------------------------- 1 | """Main application""" 2 | from .planner import Planner 3 | from .
import builders 4 | 5 | __all__ = ("Planner", "builders") 6 | -------------------------------------------------------------------------------- /src/triage/component/architect/feature_dictionary_creator.py: -------------------------------------------------------------------------------- 1 | import verboselogs, logging 2 | logger = verboselogs.VerboseLogger(__name__) 3 | 4 | from triage.component.architect.utils import str_in_sql 5 | from triage.util.structs import FeatureNameList 6 | 7 | 8 | class FeatureDictionaryCreator: 9 | def __init__(self, features_schema_name, db_engine): 10 | self.features_schema_name = features_schema_name 11 | self.db_engine = db_engine 12 | 13 | def _tables_to_include(self, feature_table_names): 14 | return [ 15 | feature_table 16 | for feature_table in feature_table_names 17 | if "aggregation_imputed" in feature_table 18 | ] 19 | 20 | def feature_dictionary(self, feature_table_names, index_column_lookup): 21 | """ Create a dictionary of feature names, where keys are feature tables 22 | and values are lists of feature names. 23 | 24 | :return: feature_dictionary 25 | :rtype: dict 26 | """ 27 | feature_dictionary = {} 28 | 29 | # iterate! store each table name + features names as key-value pair 30 | for feature_table_name in self._tables_to_include(feature_table_names): 31 | feature_names = [ 32 | row[0] 33 | for row in self.db_engine.execute( 34 | self._build_feature_names_query( 35 | feature_table_name, index_column_lookup[feature_table_name] 36 | ) 37 | ) 38 | ] 39 | feature_dictionary[feature_table_name] = FeatureNameList(feature_names) 40 | logger.spam(f"Feature dictionary built: {feature_dictionary}") 41 | return feature_dictionary 42 | 43 | def _build_feature_names_query(self, table_name, index_columns): 44 | """ For a given feature table, get the names of the feature columns. 
45 | 46 | :param table_name: name of the feature table 47 | :type table_name: str 48 | 49 | :return: names of the feature columns in given table 50 | :rtype: list 51 | """ 52 | # format the query that gets column names, 53 | # excluding indices from result 54 | feature_names_query = f""" 55 | SELECT column_name 56 | FROM information_schema.columns 57 | WHERE table_name = '{table_name}' AND 58 | table_schema = '{self.features_schema_name}' AND 59 | column_name NOT IN ({str_in_sql(index_columns)}) 60 | """ 61 | logger.spam( 62 | f"Extracting all possible feature names for table {table_name} with query {feature_names_query}" 63 | ) 64 | 65 | return feature_names_query 66 | -------------------------------------------------------------------------------- /src/triage/component/architect/features.py: -------------------------------------------------------------------------------- 1 | from triage.component.architect.feature_generators import FeatureGenerator 2 | from triage.component.architect.feature_dictionary_creator import ( 3 | FeatureDictionaryCreator, 4 | ) 5 | from triage.component.architect.feature_group_creator import FeatureGroupCreator 6 | from triage.component.architect.feature_group_mixer import FeatureGroupMixer 7 | 8 | __all__ = ( 9 | "FeatureGenerator", 10 | "FeatureDictionaryCreator", 11 | "FeatureGroupCreator", 12 | "FeatureGroupMixer", 13 | ) 14 | -------------------------------------------------------------------------------- /src/triage/component/audition/utils.py: -------------------------------------------------------------------------------- 1 | def make_list(a): 2 | return [a] if not isinstance(a, list) else a 3 | 4 | 5 | def str_in_sql(values): 6 | return ",".join(map(lambda x: "'{}'".format(x), values)) 7 | -------------------------------------------------------------------------------- /src/triage/component/catwalk/README.rst: -------------------------------------------------------------------------------- 1 | ======= 2 | Catwalk 3 | ======= 4 | 5 | Training, testing, and evaluating machine learning classifier models 6 | 7 | At the core of many predictive analytics applications is the need to train classifiers on large sets of design matrices, test and temporally cross-validate them, and generate evaluation metrics about them. 8 | 9 | Python's scikit-learn package provides much of this functionality, but it is not trivial to design large experiments with it in a persistable way. Catwalk builds upon the functionality offered by scikit-learn by implementing: 10 | 11 | - Saving of modeling results and metadata in a Postgres database for later analysis 12 | - Exposure of computationally-intensive tasks as discrete workloads that can be used with different parallelization solutions (e.g. multiprocessing, Celery) 13 | - Different model persistence strategies such as on-filesystem or Amazon S3, that can be easily switched between 14 | - Hashing classifier model configuration to only retrain a model if necessary.
15 | - Various best practices in areas like input scaling for different classifier types and feature importance 16 | - Common scikit-learn model evaluation metrics as well as the ability to bundle custom evaluation metrics 17 | - Custom model wrappers for classifiers 18 | - 'Baseline' classes that generate classifications or predictions based on pre-determined rules, to be used for evaluating predictive models against simple heuristics 19 | -------------------------------------------------------------------------------- /src/triage/component/catwalk/baselines/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/src/triage/component/catwalk/baselines/__init__.py -------------------------------------------------------------------------------- /src/triage/component/catwalk/db.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | from sqlalchemy import create_engine 3 | from sqlalchemy.engine.url import URL 4 | from sqlalchemy.pool import QueuePool 5 | 6 | from triage.component.results_schema import Base 7 | 8 | 9 | def ensure_db(engine): 10 | Base.metadata.create_all(engine) 11 | 12 | 13 | def connect(poolclass=QueuePool): 14 | with open("database.yaml") as fd: 15 | config = yaml.full_load(fd) 16 | dburl = URL( 17 | "postgres", 18 | host=config["host"], 19 | username=config["user"], 20 | database=config["db"], 21 | password=config["pass"], 22 | port=config["port"], 23 | ) 24 | return create_engine(dburl, poolclass=poolclass) 25 | -------------------------------------------------------------------------------- /src/triage/component/catwalk/estimators/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/src/triage/component/catwalk/estimators/__init__.py -------------------------------------------------------------------------------- /src/triage/component/catwalk/estimators/transformers.py: -------------------------------------------------------------------------------- 1 | import verboselogs, logging 2 | logger = verboselogs.VerboseLogger(__name__) 3 | 4 | import numpy as np 5 | from sklearn.base import BaseEstimator, TransformerMixin 6 | from sklearn.utils import check_array 7 | 8 | class CutOff(BaseEstimator, TransformerMixin): 9 | """Transform features by cutting values outside the established range 10 | 11 | Args: 12 | feature_range: Range of allowed values, default=`(0,1)` 13 | 14 | Usage: 15 | The recommended way of using this is:: 16 | 17 | from sklearn.pipeline import Pipeline 18 | 19 | minmax_scaler = preprocessing.MinMaxScaler() 20 | dsapp_cutoff = CutOff() 21 | lr = linear_model.LogisticRegression() 22 | 23 | pipeline = Pipeline([ 24 | ('minmax_scaler',minmax_scaler), 25 | ('dsapp_cutoff', dsapp_cutoff), 26 | ('lr', lr) 27 | ]) 28 | 29 | pipeline.fit(X_train, y_train) 30 | pipeline.predict(X_test) 31 | 32 | """ 33 | 34 | def __init__(self, feature_range=(0, 1), copy=True): 35 | self.feature_range = feature_range 36 | self.copy = copy 37 | 38 | 39 | def fit(self, X, y=None): 40 | return self 41 | 42 | 43 | def transform(self, X): 44 | feature_range = self.feature_range 45 | 46 | X = check_array(X, copy=self.copy, ensure_2d=True) 47 | 48 | if np.any(X > feature_range[1]) or np.any(X < feature_range[0]): 49 | logger.notice( 50 | f"You got feature values that are out of the range:
--------------------------------------------------------------------------------
/src/triage/component/catwalk/exceptions.py:
--------------------------------------------------------------------------------
1 | __all__ = ["BaselineFeatureNotInMatrix"]
2 | 
3 | 
4 | class BaselineFeatureNotInMatrix(KeyError):
5 |     """ This error is used to allow feature mixing and baseline classes to be
6 |     included in the same experiment.
7 | 
8 |     Without error handling, the baseline classes would cause the experiment to
9 |     end prematurely when they received a matrix without the required feature
10 |     (if, for example, leave-one-out feature mixing is enabled). Raising this
11 |     error will cause the model to be skipped gracefully.
12 |     """
13 | 
--------------------------------------------------------------------------------
/src/triage/component/catwalk/feature_importances.py:
--------------------------------------------------------------------------------
1 | import verboselogs, logging
2 | logger = verboselogs.VerboseLogger(__name__)
3 | 
4 | 
5 | import numpy as np
6 | import sklearn.linear_model
7 | from sklearn.svm import SVC
8 | from triage.component.catwalk.estimators.classifiers import ScaledLogisticRegression
9 | 
10 | 
11 | def _ad_hoc_feature_importances(model):
12 |     """
13 |     Get the "ad-hoc feature importances" for scikit-learn's models
14 |     lacking the `feature_importances_` attribute
15 | 
16 |     Args:
17 |         model: A trained model that does not have a `feature_importances_` attribute
18 | 
19 |     Returns:
20 |         At the moment, this method only returns the odds ratios of the
21 |         coefficients given by sklearn's implementation of the
22 |         LogisticRegression (the intercept is deliberately ignored; see the
23 |         commented-out line below), or the raw coefficients of a
24 |         linear-kernel SVC. For any other model it returns None.
25 |     """
26 |     feature_importances = None
27 | 
28 |     if (isinstance(model, (sklearn.linear_model.LogisticRegression)) or
29 |         isinstance(model, (ScaledLogisticRegression))):
30 |         coef_odds_ratio = np.exp(model.coef_)
31 |         # intercept_odds_ratio = np.exp(model.intercept_[:,np.newaxis])
32 |         # We are ignoring the intercept
33 | 
34 |         # NOTE: We need to squeeze this array so it has the correct dimensions
35 |         feature_importances = coef_odds_ratio.squeeze()
36 | 
37 |     elif isinstance(model, (SVC)) and (model.get_params()["kernel"] == "linear"):
38 |         feature_importances = model.coef_.squeeze()
39 | 
40 |     return feature_importances
41 | 
42 | 
43 | def get_feature_importances(model):
44 |     """
45 |     Get feature importances (from scikit-learn) of a trained model.
46 | 
47 |     Args:
48 |         model: Trained model
49 | 
50 |     Returns:
51 |         Feature importances, or failing that, None
52 |     """
53 |     feature_importances = None
54 | 
55 |     if hasattr(model, "feature_importances_"):
56 |         feature_importances = model.feature_importances_
57 | 
58 |     else:
59 |         logger.warning(
60 |             "The selected algorithm doesn't support a standard way "
61 |             "of calculating the importance of each feature used. "
62 |             "Falling back to ad-hoc methods "
63 |             "(e.g. for LogisticRegression we return odds ratios instead of coefficients)"
64 |         )
65 | 
66 |         feature_importances = _ad_hoc_feature_importances(model)
67 | 
68 |     # if we just ended up with a scalar (e.g., single feature logit), ensure we return an array
69 |     if isinstance(feature_importances, np.ndarray) and feature_importances.shape == ():
70 |         feature_importances = feature_importances.reshape((1,))
71 | 
72 |     return feature_importances
73 | 
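A small end-to-end check of the odds-ratio fallback in `get_feature_importances`, using toy data (nothing Triage-specific is assumed; the fallback warning above is logged along the way):

    >>> import numpy as np
    >>> from sklearn.linear_model import LogisticRegression
    >>> X = np.array([[0., 1.], [1., 0.], [1., 1.], [0., 0.]])
    >>> y = np.array([1, 0, 1, 0])
    >>> importances = get_feature_importances(LogisticRegression().fit(X, y))
    >>> importances.shape  # one odds ratio, exp(coef), per feature
    (2,)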
--------------------------------------------------------------------------------
/src/triage/component/collate/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from .collate import available_imputations, Aggregation, Aggregate, Compare, Categorical
3 | from .from_obj import FromObj
4 | from .spacetime import SpacetimeAggregation
5 | 
6 | __all__ = [
7 |     "available_imputations",
8 |     "Aggregation",
9 |     "Aggregate",
10 |     "FromObj",
11 |     "Compare",
12 |     "Categorical",
13 |     "SpacetimeAggregation",
14 | ]
15 | __author__ = """DSaPP Researchers"""
16 | __email__ = "datascifellows@gmail.com"
17 | 
--------------------------------------------------------------------------------
/src/triage/component/collate/sql.py:
--------------------------------------------------------------------------------
1 | import sqlalchemy.sql.expression as ex
2 | from sqlalchemy.ext.compiler import compiles
3 | 
4 | 
5 | def make_sql_clause(s, constructor):
6 |     if not isinstance(s, ex.ClauseElement):
7 |         return constructor(s)
8 |     else:
9 |         return s
10 | 
11 | 
12 | class CreateTableAs(ex.Executable, ex.ClauseElement):
13 |     def __init__(self, name, query):
14 |         self.name = name
15 |         self.query = query
16 | 
17 | 
18 | @compiles(CreateTableAs)
19 | def _create_table_as(element, compiler, **kw):
20 |     return "CREATE TABLE %s AS %s" % (element.name, compiler.process(element.query))
21 | 
22 | 
23 | class InsertFromSelect(ex.Executable, ex.ClauseElement):
24 |     def __init__(self, name, query):
25 |         self.name = name
26 |         self.query = query
27 | 
28 | 
29 | @compiles(InsertFromSelect)
30 | def _insert_from_select(element, compiler, **kw):
31 |     return "INSERT INTO %s (%s)" % (element.name, compiler.process(element.query))
32 | 
33 | 
34 | def to_sql_name(name):
35 |     return name.replace('"', "")
36 | 
--------------------------------------------------------------------------------
/src/triage/component/postmodeling/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/src/triage/component/postmodeling/__init__.py
--------------------------------------------------------------------------------
/src/triage/component/postmodeling/add_predictions_example_config.yaml:
--------------------------------------------------------------------------------
1 | # Path where the models and matrices are stored
2 | project_path: 'path/to/models/and/matrices'
3 | 
4 | # Model group ids we need predictions for
5 | # List of integers
6 | model_group_ids:
7 |   - 1
8 |   - 2
9 | 
10 | # The following parameters are optional
11 | # These will help narrow down the model_ids in the above model groups in case you are not interested in all the models in a group
12 | # If these are not specified, all the models in the group will be scored
13 | # Either (or both) can be specified independently of the other
14 | 
15 | # Narrowing down by the experiment hash(es)
16 | # If this is provided, only the model ids relevant to these experiment hashes will be scored
17 | experiments:
18 |   - 'experiment_hash1'
19 |   - 'experiment_hash2'
20 | 
21 | # Narrowing down by the train_end_time
22 | # Here you can score models that are trained with data from a certain time period
23 | # range_start_date is the start of the period and range_end_date is its end
24 | # All models (in the above model groups) that have a train_end_time falling within this range will be scored
25 | # Range end points are inclusive
26 | # If you only specify one limit, an open ended interval is used.
27 | train_end_times:
28 |   range_start_date: '1970-01-01' # If only this is specified, all train_end_times on and after this date will be included
29 |   range_end_date: '1980-01-01' # If only this is specified, all train_end_times on and before this date will be included
30 | 
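Stepping back to the `CreateTableAs` and `InsertFromSelect` clauses defined in `collate/sql.py` above: since they render through SQLAlchemy's compiler, the generated SQL can be inspected with a plain `print`. A minimal sketch (the table name and query are made up):

    >>> import sqlalchemy as sa
    >>> print(CreateTableAs("features.tmp_agg", sa.text("SELECT 1 AS x")))
    CREATE TABLE features.tmp_agg AS SELECT 1 AS x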
--------------------------------------------------------------------------------
/src/triage/component/postmodeling/deprecated/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/src/triage/component/postmodeling/deprecated/__init__.py
--------------------------------------------------------------------------------
/src/triage/component/postmodeling/deprecated/db_credentials_example.yaml:
--------------------------------------------------------------------------------
1 | host:
2 | dbname:
3 | user:
4 | password:
5 | port:
6 | role: ''
7 | 
--------------------------------------------------------------------------------
/src/triage/component/postmodeling/deprecated/parameters.py:
--------------------------------------------------------------------------------
1 | """
2 | Postmodeling parameters
3 | 
4 | This module contains the parameters class used across all the
5 | postmodeling functions within the ModelEvaluator and ModelGroupEvaluator
6 | classes. The class is initialized from the 'postmodeling_parameters.yaml'
7 | file
8 | 
9 | """
10 | 
11 | import yaml
12 | import json
13 | 
14 | import verboselogs, logging
15 | logger = verboselogs.VerboseLogger(__name__)
16 | 
17 | class PostmodelParameters:
18 |     '''
19 |     PostmodelParameters reads all parameters from a 'yaml' file and stores them
20 |     on this object to be used by other functions. Different metrics can be
21 |     passed to this object; by default it reads from a
22 |     'postmodeling_parameters.yaml' file, but an Audition config file can be
23 |     passed and the needed parameters will be parsed from it
24 |     '''
25 |     def __init__(self, path_params):
26 | 
27 |         with open(path_params) as f:
28 |             params = yaml.full_load(f)
29 | 
30 |         # Assign dict elements to Parameters object and flatten
31 |         # thresholds
32 |         self.__dict__.update(params)
33 |         self.figsize = tuple(self.figsize)
34 | 
35 |         try:
36 |             if self.audition_output_path is not None:
37 |                 with open(self.audition_output_path) as f:
38 |                     json_models = json.load(f)
39 | 
40 |                 list_models = [model for model_list in json_models.values()
41 |                                for model in model_list]
42 |                 self.model_group_id = list_models
43 | 
44 |         except AttributeError:
45 |             logger.exception(
46 |                 f'''No audition output file was defined. I will use the models
47 |                 defined in the {path_params} configuration file.'''
48 |             )
49 | 
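A minimal usage sketch for `PostmodelParameters` (the path follows the docstring's default; the YAML is assumed to contain at least a `figsize` key, since the constructor coerces it to a tuple, and the values shown are hypothetical):

    >>> params = PostmodelParameters('postmodeling_parameters.yaml')
    >>> params.figsize
    (12, 6)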
--------------------------------------------------------------------------------
/src/triage/component/postmodeling/deprecated/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/src/triage/component/postmodeling/deprecated/utils/__init__.py
--------------------------------------------------------------------------------
/src/triage/component/postmodeling/deprecated/utils/aux_funcs.py:
--------------------------------------------------------------------------------
1 | '''
2 | Auxiliary functions and helpers:
3 | 
4 | This set of functions contains helpers to format data
5 | (i.e., prediction matrices, etc.) for plotting. These functions
6 | are called by both the Model class and the ModelGroup class in
7 | evaluation.py.
8 | '''
9 | 
10 | from sqlalchemy import create_engine
11 | from sqlalchemy.sql import text
12 | from collections import namedtuple
13 | import yaml
14 | 
15 | import verboselogs, logging
16 | logger = verboselogs.VerboseLogger(__name__)
17 | 
18 | 
19 | ModelEvaluator = namedtuple('ModelEvaluator',
20 |                             ('model_group_id', 'model_id'))
21 | 
22 | 
23 | def create_pgconn(credentials_yaml):
24 |     '''
25 |     Create a SQLAlchemy engine for a SQL connection, abiding by the new
26 |     dssg/dsapp db user configuration.
27 | 
28 |     Arguments:
29 |         - credentials_yaml: .yaml file with db credentials
30 |     '''
31 |     with open(credentials_yaml) as f:
32 |         configs = yaml.full_load(f)
33 |     try:
34 |         conn = create_engine("postgresql://{user}:{password}@{host}:{port}/{dbname}".format(**configs))
35 |     except Exception:
36 |         logger.error("Error connecting to db.")
37 |         raise
38 |     return conn
39 | 
40 | 
41 | def get_models_ids(audited_model_group_ids, conn):
42 |     '''
43 |     This helper function retrieves the model_id's from a set
44 |     of model_group_ids and instantiates each model as a
45 |     ModelEvaluator namedtuple.
46 | 
47 |     Arguments:
48 |         - audited_model_group_ids: List of model_group_ids
49 |           (ideally from Audition's output)
50 |         - conn: sql engine
51 | 
52 |     This function will return a list of ModelEvaluator objects
53 |     '''
54 | 
55 |     query = conn.execute(text("""
56 |         SELECT model_group_id,
57 |                model_id
58 |         FROM triage_metadata.models
59 |         WHERE model_group_id = ANY(:ids);
60 |     """), ids=audited_model_group_ids)
61 | 
62 |     return [ModelEvaluator._make(row) for row in query]
63 | 
--------------------------------------------------------------------------------
/src/triage/component/postmodeling/fairness/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/src/triage/component/postmodeling/fairness/__init__.py
--------------------------------------------------------------------------------
/src/triage/component/postmodeling/fairness/aequitas_utils.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | 
3 | def get_aequitas_results(engine, parameter, schema="test_results", table="aequitas", model_id=None, subset_hash="", tie_breaker="worst"):
4 |     ''' This function returns the current contents of the aequitas table.
5 | 
6 |     Args:
7 |         - engine: SQLAlchemy engine connected to the database
8 |         - parameter: A string that indicates any parameters for the metric (ex. `100_abs` indicates top-100 entities)
9 |         - schema: Database schema to find the table within
10 |         - table: Database table to select data from
11 |         - model_id: A model_id, to query only for results of that model
12 |         - subset_hash: Identifies the subset for the evaluation
13 |         - tie_breaker: Indicates how ties are broken
14 | 
15 |     Returns: A DataFrame, corresponding to schema.table
16 |     '''
17 | 
18 |     query = f"""SELECT * FROM {schema}.{table}
19 |                 WHERE parameter = '{parameter}'
20 |                 AND subset_hash = '{subset_hash}'
21 |                 AND tie_breaker = '{tie_breaker}'
22 |             """
23 |     if model_id:
24 |         query += f" AND model_id = {model_id}"
25 |     return pd.read_sql(query, con=engine)
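A hedged usage sketch for `get_aequitas_results`; the connection string and model id are placeholders, and a populated `test_results.aequitas` table is assumed:

    >>> from sqlalchemy import create_engine
    >>> engine = create_engine("postgresql://user:pass@host:5432/db")  # hypothetical DSN
    >>> df = get_aequitas_results(engine, parameter="100_abs", model_id=42)

Note that `parameter`, `subset_hash`, and `tie_breaker` are interpolated directly into the query string, so they should come from trusted configuration rather than user input.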
--------------------------------------------------------------------------------
/src/triage/component/postmodeling/postmodeling_config.yaml:
--------------------------------------------------------------------------------
1 | error_analysis:
2 |   # Size of the list.
3 |   k: [100]
4 | 
5 |   # Parameter grid to try on the DecisionTreeClassifier trained for the error analysis.
6 |   model_params:
7 |     max_depth: [5]
8 | 
9 |   # Flag to define if plots are going to be displayed (True) or saved (False)
10 |   view_plots: False
--------------------------------------------------------------------------------
/src/triage/component/postmodeling/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/src/triage/component/postmodeling/utils/__init__.py
--------------------------------------------------------------------------------
/src/triage/component/results_schema/alembic.ini:
--------------------------------------------------------------------------------
1 | [alembic]
2 | script_location = %(here)s/alembic
3 | 
4 | [exclude]
5 | tables = predictions_\d+
6 | 
--------------------------------------------------------------------------------
/src/triage/component/results_schema/alembic/README:
--------------------------------------------------------------------------------
1 | Generic single-database configuration.
--------------------------------------------------------------------------------
/src/triage/component/results_schema/alembic/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/src/triage/component/results_schema/alembic/__init__.py
--------------------------------------------------------------------------------
/src/triage/component/results_schema/alembic/script.py.mako:
--------------------------------------------------------------------------------
1 | """${message}
2 | 
3 | Revision ID: ${up_revision}
4 | Revises: ${down_revision | comma,n}
5 | Create Date: ${create_date}
6 | 
7 | """
8 | from alembic import op
9 | import sqlalchemy as sa
10 | ${imports if imports else ""}
11 | 
12 | # revision identifiers, used by Alembic.
13 | revision = ${repr(up_revision)} 14 | down_revision = ${repr(down_revision)} 15 | branch_labels = ${repr(branch_labels)} 16 | depends_on = ${repr(depends_on)} 17 | 18 | 19 | def upgrade(): 20 | ${upgrades if upgrades else "pass"} 21 | 22 | 23 | def downgrade(): 24 | ${downgrades if downgrades else "pass"} 25 | -------------------------------------------------------------------------------- /src/triage/component/results_schema/alembic/versions/079a74c15e8b_merge_b097e47ba829_with_cdd0dc9d9870.py: -------------------------------------------------------------------------------- 1 | """merge b097e47ba829 with cdd0dc9d9870 2 | 3 | Revision ID: 079a74c15e8b 4 | Revises: b097e47ba829, cdd0dc9d9870 5 | Create Date: 2021-05-30 20:49:19.039280 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = '079a74c15e8b' 14 | down_revision = ('b097e47ba829', 'cdd0dc9d9870') 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade(): 20 | pass 21 | 22 | 23 | def downgrade(): 24 | pass 25 | -------------------------------------------------------------------------------- /src/triage/component/results_schema/alembic/versions/0bca1ba9706e_add_matrix_uuid_to_eval.py: -------------------------------------------------------------------------------- 1 | """add_matrix_uuid_to_eval 2 | 3 | Revision ID: 0bca1ba9706e 4 | Revises: 38f37d013686 5 | Create Date: 2019-02-05 13:19:50.172109 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | from sqlalchemy.dialects import postgresql 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = '0bca1ba9706e' 14 | down_revision = '38f37d013686' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade(): 20 | # ### commands auto generated by Alembic - please adjust! ### 21 | op.add_column('evaluations', sa.Column('matrix_uuid', sa.Text(), nullable=True), schema='test_results') 22 | op.create_foreign_key(None, 'evaluations', 'matrices', ['matrix_uuid'], ['matrix_uuid'], source_schema='test_results', referent_schema='model_metadata') 23 | op.add_column('evaluations', sa.Column('matrix_uuid', sa.Text(), nullable=True), schema='train_results') 24 | op.create_foreign_key(None, 'evaluations', 'matrices', ['matrix_uuid'], ['matrix_uuid'], source_schema='train_results', referent_schema='model_metadata') 25 | # ### end Alembic commands ### 26 | 27 | 28 | def downgrade(): 29 | # ### commands auto generated by Alembic - please adjust! ### 30 | op.drop_constraint(None, 'evaluations', schema='train_results', type_='foreignkey') 31 | op.drop_column('evaluations', 'matrix_uuid', schema='train_results') 32 | op.drop_constraint(None, 'evaluations', schema='test_results', type_='foreignkey') 33 | op.drop_column('evaluations', 'matrix_uuid', schema='test_results') 34 | # ### end Alembic commands ### 35 | -------------------------------------------------------------------------------- /src/triage/component/results_schema/alembic/versions/0d44655e35fd_.py: -------------------------------------------------------------------------------- 1 | """empty message 2 | 3 | Revision ID: 0d44655e35fd 4 | Revises: 8b3f167d0418 5 | Create Date: 2017-08-31 14:14:54.187073 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | 11 | 12 | # revision identifiers, used by Alembic. 
13 | revision = "0d44655e35fd" 14 | down_revision = "8b3f167d0418" 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade(): 20 | # ### commands auto generated by Alembic - please adjust! ### 21 | op.create_table( 22 | "individual_importances", 23 | sa.Column("model_id", sa.Integer(), nullable=False), 24 | sa.Column("entity_id", sa.BigInteger(), nullable=False), 25 | sa.Column("as_of_date", sa.DateTime(), nullable=False), 26 | sa.Column("feature", sa.String(), nullable=False), 27 | sa.Column("method", sa.String(), nullable=False), 28 | sa.Column("importance_score", sa.Text(), nullable=True), 29 | sa.ForeignKeyConstraint(["model_id"], ["results.models.model_id"]), 30 | sa.PrimaryKeyConstraint( 31 | "model_id", "entity_id", "as_of_date", "feature", "method" 32 | ), 33 | schema="results", 34 | ) 35 | op.create_table( 36 | "list_predictions", 37 | sa.Column("model_id", sa.Integer(), nullable=False), 38 | sa.Column("entity_id", sa.BigInteger(), nullable=False), 39 | sa.Column("as_of_date", sa.DateTime(), nullable=False), 40 | sa.Column("score", sa.Numeric(), nullable=True), 41 | sa.Column("rank_abs", sa.Integer(), nullable=True), 42 | sa.Column("rank_pct", sa.Float(), nullable=True), 43 | sa.Column("matrix_uuid", sa.Text(), nullable=True), 44 | sa.Column("test_label_window", sa.Interval(), nullable=True), 45 | sa.ForeignKeyConstraint(["model_id"], ["results.models.model_id"]), 46 | sa.PrimaryKeyConstraint("model_id", "entity_id", "as_of_date"), 47 | schema="results", 48 | ) 49 | # ### end Alembic commands ### 50 | 51 | 52 | def downgrade(): 53 | # ### commands auto generated by Alembic - please adjust! ### 54 | op.drop_table("list_predictions", schema="results") 55 | op.drop_table("individual_importances", schema="results") 56 | # ### end Alembic commands ### 57 | -------------------------------------------------------------------------------- /src/triage/component/results_schema/alembic/versions/1b990cbc04e4_production_schema.py: -------------------------------------------------------------------------------- 1 | """empty message 2 | 3 | Revision ID: 1b990cbc04e4 4 | Revises: 0bca1ba9706e 5 | Create Date: 2019-02-20 16:41:22.810452 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = '1b990cbc04e4' 14 | down_revision = '45219f25072b' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade(): 20 | op.execute("CREATE SCHEMA IF NOT EXISTS production") 21 | op.execute("ALTER TABLE triage_metadata.list_predictions SET SCHEMA production;") 22 | 23 | 24 | def downgrade(): 25 | op.execute("ALTER TABLE production.list_predictions SET SCHEMA triage_metadata;") 26 | op.execute("DROP SCHEMA IF EXISTS production") 27 | -------------------------------------------------------------------------------- /src/triage/component/results_schema/alembic/versions/2446a931de7a_changing_column_names_and_removing_.py: -------------------------------------------------------------------------------- 1 | """Changing column names and removing redundancies in table names 2 | 3 | Revision ID: 2446a931de7a 4 | Revises: 89a8ce240bae 5 | Create Date: 2018-05-24 17:07:20.567789 6 | 7 | """ 8 | from alembic import op 9 | 10 | # revision identifiers, used by Alembic. 11 | revision = "2446a931de7a" 12 | down_revision = "89a8ce240bae" 13 | branch_labels = None 14 | depends_on = None 15 | 16 | 17 | def upgrade(): 18 | # ### commands auto generated by Alembic - please adjust! 
###
19 |     op.execute(
20 |         "ALTER TABLE test_results.test_evaluations RENAME TO evaluations;"
21 |         + "ALTER TABLE test_results.test_predictions RENAME TO predictions;"
22 |         + "ALTER TABLE train_results.train_evaluations RENAME TO evaluations;"
23 |         + "ALTER TABLE train_results.train_predictions RENAME TO predictions;"
24 |     )
25 | 
26 |     op.alter_column(
27 |         "matrices",
28 |         "n_examples",
29 |         new_column_name="num_observations",
30 |         schema="model_metadata",
31 |     )
32 |     op.alter_column(
33 |         "model_groups",
34 |         "model_parameters",
35 |         new_column_name="hyperparameters",
36 |         schema="model_metadata",
37 |     )
38 |     op.alter_column(
39 |         "models",
40 |         "model_parameters",
41 |         new_column_name="hyperparameters",
42 |         schema="model_metadata",
43 |     )
44 |     # ### end Alembic commands ###
45 | 
46 | 
47 | def downgrade():
48 |     # ### commands auto generated by Alembic - please adjust! ###
49 |     op.execute(
50 |         "ALTER TABLE test_results.evaluations RENAME TO test_evaluations;"
51 |         + "ALTER TABLE test_results.predictions RENAME TO test_predictions;"
52 |         + "ALTER TABLE train_results.evaluations RENAME TO train_evaluations;"
53 |         + "ALTER TABLE train_results.predictions RENAME TO train_predictions;"
54 |     )
55 | 
56 |     op.alter_column(
57 |         "matrices",
58 |         "num_observations",
59 |         new_column_name="n_examples",
60 |         schema="model_metadata",
61 |     )
62 |     op.alter_column(
63 |         "model_groups",
64 |         "hyperparameters",
65 |         new_column_name="model_parameters",
66 |         schema="model_metadata",
67 |     )
68 |     op.alter_column(
69 |         "models",
70 |         "hyperparameters",
71 |         new_column_name="model_parameters",
72 |         schema="model_metadata",
73 |     )
74 | 
75 |     # ### end Alembic commands ###
76 | 
--------------------------------------------------------------------------------
/src/triage/component/results_schema/alembic/versions/264245ddfce2_.py:
--------------------------------------------------------------------------------
1 | """empty message
2 | 
3 | Revision ID: 264245ddfce2
4 | Revises: 0d44655e35fd
5 | Create Date: 2017-09-01 14:26:01.107455
6 | 
7 | """
8 | from alembic import op
9 | import sqlalchemy as sa
10 | 
11 | 
12 | # revision identifiers, used by Alembic.
13 | revision = "264245ddfce2"
14 | down_revision = "0d44655e35fd"
15 | branch_labels = None
16 | depends_on = None
17 | 
18 | 
19 | def upgrade():
20 |     # ### commands auto generated by Alembic - please adjust! ###
21 |     op.add_column(
22 |         "individual_importances",
23 |         sa.Column("feature_value", sa.Float(), nullable=True),
24 |         schema="results",
25 |     )
26 |     # ### end Alembic commands ###
27 | 
28 | 
29 | def downgrade():
30 |     # ### commands auto generated by Alembic - please adjust! ###
31 |     op.drop_column("individual_importances", "feature_value", schema="results")
32 |     # ### end Alembic commands ###
33 | 
--------------------------------------------------------------------------------
/src/triage/component/results_schema/alembic/versions/264786a9fe85_add_label_value_to_prodcution_table.py:
--------------------------------------------------------------------------------
1 | """add label_value to production table
2 | 
3 | Revision ID: 264786a9fe85
4 | Revises: 1b990cbc04e4
5 | Create Date: 2019-02-26 13:17:05.365654
6 | 
7 | """
8 | from alembic import op
9 | import sqlalchemy as sa
10 | 
11 | 
12 | # revision identifiers, used by Alembic.
13 | revision = '264786a9fe85' 14 | down_revision = '1b990cbc04e4' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade(): 20 | op.drop_table("list_predictions", schema="production") 21 | op.create_table( 22 | "list_predictions", 23 | sa.Column("model_id", sa.Integer(), nullable=False), 24 | sa.Column("entity_id", sa.BigInteger(), nullable=False), 25 | sa.Column("as_of_date", sa.DateTime(), nullable=False), 26 | sa.Column("score", sa.Numeric(), nullable=True), 27 | sa.Column('label_value', sa.Integer, nullable=True), 28 | sa.Column("rank_abs", sa.Integer(), nullable=True), 29 | sa.Column("rank_pct", sa.Float(), nullable=True), 30 | sa.Column("matrix_uuid", sa.Text(), nullable=True), 31 | sa.Column("test_label_timespan", sa.Interval(), nullable=True), 32 | sa.ForeignKeyConstraint(["model_id"], ["triage_metadata.models.model_id"]), 33 | sa.PrimaryKeyConstraint("model_id", "entity_id", "as_of_date"), 34 | schema="production", 35 | ) 36 | 37 | 38 | def downgrade(): 39 | op.drop_table("list_predictions", schema="production") 40 | op.create_table( 41 | "list_predictions", 42 | sa.Column("model_id", sa.Integer(), nullable=False), 43 | sa.Column("entity_id", sa.BigInteger(), nullable=False), 44 | sa.Column("as_of_date", sa.DateTime(), nullable=False), 45 | sa.Column("score", sa.Numeric(), nullable=True), 46 | sa.Column("rank_abs", sa.Integer(), nullable=True), 47 | sa.Column("rank_pct", sa.Float(), nullable=True), 48 | sa.Column("matrix_uuid", sa.Text(), nullable=True), 49 | sa.Column("test_label_timespan", sa.Interval(), nullable=True), 50 | sa.ForeignKeyConstraint(["model_id"], ["triage_metadata.models.model_id"]), 51 | sa.PrimaryKeyConstraint("model_id", "entity_id", "as_of_date"), 52 | schema="results", 53 | ) 54 | 55 | -------------------------------------------------------------------------------- /src/triage/component/results_schema/alembic/versions/38f37d013686_associate_experiments_with_models_and_.py: -------------------------------------------------------------------------------- 1 | """Associate experiments with models and matrices 2 | 3 | Revision ID: 38f37d013686 4 | Revises: d0ac573eaf1a 5 | Create Date: 2018-10-18 16:31:43.181779 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | from sqlalchemy.dialects import postgresql 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = '38f37d013686' 14 | down_revision = 'd0ac573eaf1a' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade(): 20 | # ### commands auto generated by Alembic - please adjust! 
### 21 | op.create_table('experiment_matrices', 22 | sa.Column('experiment_hash', sa.String(), nullable=False), 23 | sa.Column('matrix_uuid', sa.String(), nullable=False), 24 | sa.ForeignKeyConstraint(['experiment_hash'], ['model_metadata.experiments.experiment_hash'], ), 25 | sa.PrimaryKeyConstraint('experiment_hash', 'matrix_uuid'), 26 | schema='model_metadata' 27 | ) 28 | op.create_table('experiment_models', 29 | sa.Column('experiment_hash', sa.String(), nullable=False), 30 | sa.Column('model_hash', sa.String(), nullable=False), 31 | sa.ForeignKeyConstraint(['experiment_hash'], ['model_metadata.experiments.experiment_hash'], ), 32 | sa.PrimaryKeyConstraint('experiment_hash', 'model_hash'), 33 | schema='model_metadata' 34 | ) 35 | op.add_column('matrices', sa.Column('built_by_experiment', sa.String(), nullable=True), schema='model_metadata') 36 | op.create_foreign_key(None, 'matrices', 'experiments', ['built_by_experiment'], ['experiment_hash'], source_schema='model_metadata', referent_schema='model_metadata') 37 | 38 | op.alter_column('models', 'experiment_hash', new_column_name='built_by_experiment', schema='model_metadata') 39 | # ### end Alembic commands ### 40 | 41 | 42 | def downgrade(): 43 | # ### commands auto generated by Alembic - please adjust! ### 44 | op.alter_column('models', 'built_by_experiment', new_column_name='experiment_hash', schema='model_metadata') 45 | op.drop_constraint(None, 'matrices', schema='model_metadata', type_='foreignkey') 46 | op.drop_column('matrices', 'built_by_experiment', schema='model_metadata') 47 | op.drop_table('experiment_models', schema='model_metadata') 48 | op.drop_table('experiment_matrices', schema='model_metadata') 49 | # ### end Alembic commands ### 50 | -------------------------------------------------------------------------------- /src/triage/component/results_schema/alembic/versions/3ce027594a5c_add_hashes_to_runs.py: -------------------------------------------------------------------------------- 1 | """add hashes to runs 2 | 3 | Revision ID: 3ce027594a5c 4 | Revises: 5dd2ba8222b1 5 | Create Date: 2022-03-25 12:58:38.370271 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | 11 | 12 | # revision identifiers, used by Alembic. 
13 | revision = '3ce027594a5c' 14 | down_revision = '5dd2ba8222b1' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade(): 20 | op.add_column('triage_runs', sa.Column('cohort_table_name', sa.String(), nullable=True), schema='triage_metadata') 21 | op.add_column('triage_runs', sa.Column('labels_table_name', sa.String(), nullable=True), schema='triage_metadata') 22 | op.add_column('triage_runs', sa.Column('bias_hash', sa.String(), nullable=True), schema='triage_metadata') 23 | 24 | 25 | def downgrade(): 26 | op.drop_column('triage_runs', 'bias_hash', schema='triage_metadata') 27 | op.drop_column('triage_runs', 'labels_table_name', schema='triage_metadata') 28 | op.drop_column('triage_runs', 'cohort_table_name', schema='triage_metadata') 29 | -------------------------------------------------------------------------------- /src/triage/component/results_schema/alembic/versions/45219f25072b_hash_partitioning_predictions_tables.py: -------------------------------------------------------------------------------- 1 | """hash-partitioning predictions tables 2 | 3 | Revision ID: 45219f25072b 4 | Revises: a98acf92fd48 5 | Create Date: 2020-08-21 09:29:04.751933 6 | 7 | """ 8 | from alembic import op 9 | import os 10 | 11 | import verboselogs, logging 12 | logger = verboselogs.VerboseLogger(__name__) 13 | 14 | 15 | # revision identifiers, used by Alembic. 16 | revision = '45219f25072b' 17 | down_revision = 'a98acf92fd48' 18 | branch_labels = None 19 | depends_on = None 20 | 21 | 22 | def get_pg_major_version(op): 23 | conn = op.get_bind() 24 | pg_major_version = conn.execute('show server_version').fetchone()[0].split('.')[0] 25 | logger.debug(f'PostgreSQL major version {pg_major_version}') 26 | return int(pg_major_version) 27 | 28 | 29 | def upgrade(): 30 | 31 | pg_major_version = get_pg_major_version(op) 32 | 33 | if pg_major_version >= 11: 34 | logger.info(f'PostgreSQL 11 or greater found (PostgreSQL {pg_major_version}): Using hash partitioning') 35 | hash_partitioning_filename = os.path.join( 36 | os.path.dirname(__file__), "../../sql/predictions_hash_partitioning.sql" 37 | ) 38 | with open(hash_partitioning_filename) as fd: 39 | stmt = fd.read() 40 | op.execute(stmt) 41 | else: 42 | logger.info(f'No hash partitioning implemented because PostgreSQL 11 or greater not found (using: PostgreSQL {pg_major_version})') 43 | 44 | 45 | def downgrade(): 46 | 47 | pg_major_version = get_pg_major_version(op) 48 | 49 | if pg_major_version >= 11: 50 | logger.info(f'PostgreSQL 11 or greater found (PostgreSQL {pg_major_version}): Removing hash partitioning') 51 | undo_hash_partitioning_filename = os.path.join( 52 | os.path.dirname(__file__), "../../sql/undo_predictions_hash_partitioning.sql" 53 | ) 54 | with open(undo_hash_partitioning_filename) as fd: 55 | stmt = fd.read() 56 | op.execute(stmt) 57 | else: 58 | logger.info(f'No hash partitioning implemented because PostgreSQL 11 or greater not found (using: PostgreSQL {pg_major_version})') 59 | -------------------------------------------------------------------------------- /src/triage/component/results_schema/alembic/versions/4ae804cc0977_.py: -------------------------------------------------------------------------------- 1 | """empty message 2 | 3 | Revision ID: 4ae804cc0977 4 | Revises: 9bbfdcf8bab0 5 | Create Date: 2020-07-19 01:35:54.419099 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | from sqlalchemy.dialects import postgresql 11 | 12 | # revision identifiers, used by Alembic. 
13 | revision = '4ae804cc0977' 14 | down_revision = '9bbfdcf8bab0' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade(): 20 | # ### commands auto generated by Alembic - please adjust! ### 21 | op.add_column('models', sa.Column('built_in_experiment_run', sa.Integer(), nullable=True), schema='triage_metadata') 22 | # ### end Alembic commands ### 23 | 24 | 25 | def downgrade(): 26 | # ### commands auto generated by Alembic - please adjust! ### 27 | op.drop_column('models', 'built_in_experiment_run', schema='triage_metadata') 28 | # ### end Alembic commands ### 29 | -------------------------------------------------------------------------------- /src/triage/component/results_schema/alembic/versions/670289044eb2_add_production_prediction_metadata.py: -------------------------------------------------------------------------------- 1 | """Add production prediction metadata 2 | 3 | Revision ID: 670289044eb2 4 | Revises: ce5b50ffa8e2 5 | Create Date: 2021-01-08 22:27:23.433813 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | from sqlalchemy.dialects import postgresql 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = '670289044eb2' 14 | down_revision = 'ce5b50ffa8e2' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade(): 20 | # ### commands auto generated by Alembic - please adjust! ### 21 | op.create_table('prediction_metadata', 22 | sa.Column('model_id', sa.Integer(), nullable=False), 23 | sa.Column('matrix_uuid', sa.Text(), nullable=False), 24 | sa.Column('tiebreaker_ordering', sa.Text(), nullable=True), 25 | sa.Column('random_seed', sa.Integer(), nullable=True), 26 | sa.Column('predictions_saved', sa.Boolean(), nullable=True), 27 | sa.ForeignKeyConstraint(['matrix_uuid'], ['triage_metadata.matrices.matrix_uuid'], ), 28 | sa.ForeignKeyConstraint(['model_id'], ['triage_metadata.models.model_id'], ), 29 | sa.PrimaryKeyConstraint('model_id', 'matrix_uuid'), 30 | schema='production' 31 | ) 32 | # ### end Alembic commands ### 33 | 34 | 35 | def downgrade(): 36 | # ### commands auto generated by Alembic - please adjust! ### 37 | op.drop_table('prediction_metadata', schema='production') 38 | # ### end Alembic commands ### 39 | -------------------------------------------------------------------------------- /src/triage/component/results_schema/alembic/versions/72ac5cbdca05_change_importance_to_float.py: -------------------------------------------------------------------------------- 1 | """Change importance to float 2 | 3 | Revision ID: 72ac5cbdca05 4 | Revises: 264245ddfce2 5 | Create Date: 2017-09-01 14:31:09.302828 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | 11 | 12 | # revision identifiers, used by Alembic. 
13 | revision = "72ac5cbdca05" 14 | down_revision = "264245ddfce2" 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade(): 20 | op.alter_column( 21 | table_name="individual_importances", 22 | column_name="importance_score", 23 | type_=sa.Float(), 24 | schema="results", 25 | postgresql_using="importance_score::double precision", 26 | ) 27 | 28 | 29 | def downgrade(): 30 | op.alter_column( 31 | table_name="individual_importances", 32 | column_name="importance_score", 33 | type_=sa.Text(), 34 | schema="results", 35 | ) 36 | -------------------------------------------------------------------------------- /src/triage/component/results_schema/alembic/versions/7d57d1cf3429_.py: -------------------------------------------------------------------------------- 1 | """empty message 2 | 3 | Revision ID: 7d57d1cf3429 4 | Revises: 72ac5cbdca05 5 | Create Date: 2017-11-06 11:34:23.046005 6 | 7 | """ 8 | from alembic import op 9 | 10 | # revision identifiers, used by Alembic. 11 | revision = "7d57d1cf3429" 12 | down_revision = "72ac5cbdca05" 13 | branch_labels = None 14 | depends_on = None 15 | 16 | 17 | def upgrade(): 18 | op.alter_column( 19 | "evaluations", "example_frequency", new_column_name="as_of_date_frequency" 20 | ) 21 | op.alter_column( 22 | "models", "train_label_window", new_column_name="training_label_timespan" 23 | ) 24 | op.alter_column( 25 | "predictions", "test_label_window", new_column_name="test_label_timespan" 26 | ) 27 | op.alter_column( 28 | "list_predictions", "test_label_window", new_column_name="test_label_timespan" 29 | ) 30 | 31 | 32 | def downgrade(): 33 | op.alter_column( 34 | "evaluations", "as_of_date_frequency", new_column_name="example_frequency" 35 | ) 36 | op.alter_column( 37 | "models", "training_label_timespan", new_column_name="train_label_window" 38 | ) 39 | op.alter_column( 40 | "predictions", "test_label_timespan", new_column_name="test_label_window" 41 | ) 42 | op.alter_column( 43 | "list_predictions", "test_label_timespan", new_column_name="test_label_window" 44 | ) 45 | -------------------------------------------------------------------------------- /src/triage/component/results_schema/alembic/versions/8cef808549dd_.py: -------------------------------------------------------------------------------- 1 | """empty message 2 | 3 | Revision ID: 8cef808549dd 4 | Revises: b4d7569d31cb 5 | Create Date: 2020-06-02 21:26:32.528991 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | from sqlalchemy.dialects import postgresql 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = '8cef808549dd' 14 | down_revision = 'b4d7569d31cb' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade(): 20 | # ### commands auto generated by Alembic - please adjust! ### 21 | op.add_column('experiment_runs', sa.Column('python_version', sa.String(), nullable=True), schema='model_metadata') 22 | op.create_index(op.f('ix_model_metadata_models_model_hash'), 'models', ['model_hash'], unique=True, schema='model_metadata') 23 | op.drop_index('ix_results_models_model_hash', table_name='models', schema='model_metadata') 24 | # ### end Alembic commands ### 25 | 26 | 27 | def downgrade(): 28 | # ### commands auto generated by Alembic - please adjust! 
### 29 | op.create_index('ix_results_models_model_hash', 'models', ['model_hash'], unique=True, schema='model_metadata') 30 | op.drop_index(op.f('ix_model_metadata_models_model_hash'), table_name='models', schema='model_metadata') 31 | op.drop_column('experiment_runs', 'python_version', schema='model_metadata') 32 | # ### end Alembic commands ### 33 | -------------------------------------------------------------------------------- /src/triage/component/results_schema/alembic/versions/9bbfdcf8bab0_.py: -------------------------------------------------------------------------------- 1 | """empty message 2 | 3 | Revision ID: 9bbfdcf8bab0 4 | Revises: fa1760d35710 5 | Create Date: 2020-07-19 01:04:23.442598 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | from sqlalchemy.dialects import postgresql 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = '9bbfdcf8bab0' 14 | down_revision = 'fa1760d35710' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade(): 20 | # ### commands auto generated by Alembic - please adjust! ### 21 | op.add_column('experiment_runs', sa.Column('random_seed', sa.Integer(), nullable=True), schema='triage_metadata') 22 | # ### end Alembic commands ### 23 | 24 | 25 | def downgrade(): 26 | # ### commands auto generated by Alembic - please adjust! ### 27 | op.drop_column('experiment_runs', 'random_seed', schema='triage_metadata') 28 | # ### end Alembic commands ### 29 | -------------------------------------------------------------------------------- /src/triage/component/results_schema/alembic/versions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/src/triage/component/results_schema/alembic/versions/__init__.py -------------------------------------------------------------------------------- /src/triage/component/results_schema/alembic/versions/a20104116533_.py: -------------------------------------------------------------------------------- 1 | """empty message 2 | 3 | Revision ID: a20104116533 4 | Revises: 8cef808549dd 5 | Create Date: 2020-06-11 16:32:41.319128 6 | 7 | """ 8 | import os 9 | from alembic import op 10 | import sqlalchemy as sa 11 | from sqlalchemy.dialects import postgresql 12 | 13 | # revision identifiers, used by Alembic. 14 | revision = 'a20104116533' 15 | down_revision = '8cef808549dd' 16 | branch_labels = None 17 | depends_on = None 18 | 19 | 20 | def upgrade(): 21 | # ### commands auto generated by Alembic - please adjust! 
###
22 |     op.execute("CREATE SCHEMA IF NOT EXISTS triage_metadata")
23 |     op.execute(
24 |         "ALTER TABLE model_metadata.experiment_matrices SET SCHEMA triage_metadata;"
25 |         + " ALTER TABLE model_metadata.experiment_models SET SCHEMA triage_metadata;"
26 |         + " ALTER TABLE model_metadata.experiment_runs SET SCHEMA triage_metadata;"
27 |         + " ALTER TABLE model_metadata.experiments SET SCHEMA triage_metadata;"
28 |         + " ALTER TABLE model_metadata.list_predictions SET SCHEMA triage_metadata;"
29 |         + " ALTER TABLE model_metadata.matrices SET SCHEMA triage_metadata;"
30 |         + " ALTER TABLE model_metadata.model_groups SET SCHEMA triage_metadata;"
31 |         + " ALTER TABLE model_metadata.models SET SCHEMA triage_metadata;"
32 |         + " ALTER TABLE model_metadata.subsets SET SCHEMA triage_metadata;"
33 |     )
34 | 
35 |     op.execute("DROP SCHEMA IF EXISTS model_metadata")
36 | 
37 |     ## We update (replace) the function
38 |     group_proc_filename = os.path.join(
39 |         os.path.dirname(__file__), "../../sql/model_group_stored_procedure.sql"
40 |     )
41 |     with open(group_proc_filename) as fd:
42 |         stmt = fd.read()
43 |     op.execute(stmt)
44 | 
45 |     # ### end Alembic commands ###
46 | 
47 | 
48 | def downgrade():
49 |     # ### commands auto generated by Alembic - please adjust! ###
50 |     op.execute("CREATE SCHEMA IF NOT EXISTS model_metadata")
51 | 
52 |     op.execute(
53 |         "ALTER TABLE triage_metadata.experiment_matrices SET SCHEMA model_metadata;"
54 |         + " ALTER TABLE triage_metadata.experiment_models SET SCHEMA model_metadata;"
55 |         + " ALTER TABLE triage_metadata.experiment_runs SET SCHEMA model_metadata;"
56 |         + " ALTER TABLE triage_metadata.experiments SET SCHEMA model_metadata;"
57 |         + " ALTER TABLE triage_metadata.matrices SET SCHEMA model_metadata;"
58 |         + " ALTER TABLE triage_metadata.model_groups SET SCHEMA model_metadata;"
59 |         + " ALTER TABLE triage_metadata.models SET SCHEMA model_metadata;"
60 |         + " ALTER TABLE triage_metadata.subsets SET SCHEMA model_metadata;"
61 |     )
62 | 
63 |     # ### end Alembic commands ###
64 | 
--------------------------------------------------------------------------------
/src/triage/component/results_schema/alembic/versions/a98acf92fd48_add_nuke_triage_function.py:
--------------------------------------------------------------------------------
1 | """add nuke triage function
2 | 
3 | Revision ID: a98acf92fd48
4 | Revises: 4ae804cc0977
5 | Create Date: 2020-07-19 01:46:02.751987
6 | 
7 | """
8 | from alembic import op
9 | import os
10 | 
11 | # revision identifiers, used by Alembic.
12 | revision = 'a98acf92fd48'
13 | down_revision = '4ae804cc0977'
14 | branch_labels = None
15 | depends_on = None
16 | 
17 | 
18 | def upgrade():
19 |     nuke_triage_filename = os.path.join(
20 |         os.path.dirname(__file__), "../../sql/nuke_triage.sql"
21 |     )
22 |     with open(nuke_triage_filename) as fd:
23 |         stmt = fd.read()
24 |     op.execute(stmt)
25 | 
26 | 
27 | 
28 | def downgrade():
29 |     pass
30 | 
--------------------------------------------------------------------------------
/src/triage/component/results_schema/alembic/versions/cdd0dc9d9870_rename_production_schema_and_prediction_table.py:
--------------------------------------------------------------------------------
1 | """rename production schema and list_predictions to triage_production and predictions
2 | 
3 | Revision ID: cdd0dc9d9870
4 | Revises: 670289044eb2
5 | Create Date: 2021-04-13 00:53:56.098572
6 | 
7 | """
8 | from alembic import op
9 | import sqlalchemy as sa
10 | from sqlalchemy.dialects import postgresql
11 | 
12 | # revision identifiers, used by Alembic.
13 | revision = 'cdd0dc9d9870' 14 | down_revision = '670289044eb2' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade(): 20 | op.execute("CREATE SCHEMA IF NOT EXISTS triage_production") 21 | op.execute("ALTER TABLE production.list_predictions SET SCHEMA triage_production;") 22 | op.execute("ALTER TABLE production.prediction_metadata SET SCHEMA triage_production") 23 | op.execute("ALTER TABLE triage_production.list_predictions RENAME TO predictions") 24 | 25 | 26 | def downgrade(): 27 | op.execute("ALTER TABLE triage_production.predictions SET SCHEMA production;") 28 | op.execute("ALTER TABLE triage_production.prediction_metadata SET SCHEMA production") 29 | op.execute("ALTER TABLE production.predictions RENAME TO list_predictions") 30 | op.execute("DROP SCHEMA IF EXISTS triage_production") 31 | -------------------------------------------------------------------------------- /src/triage/component/results_schema/alembic/versions/ce5b50ffa8e2_break_ties_in_list_predictions.py: -------------------------------------------------------------------------------- 1 | """Break ties in list predictions 2 | 3 | Revision ID: ce5b50ffa8e2 4 | Revises: 264786a9fe85 5 | Create Date: 2021-01-08 21:59:13.403934 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | from sqlalchemy.dialects import postgresql 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = 'ce5b50ffa8e2' 14 | down_revision = '264786a9fe85' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade(): 20 | # ### commands auto generated by Alembic - please adjust! ### 21 | op.add_column('list_predictions', sa.Column('rank_abs_with_ties', sa.Integer(), nullable=True), schema='production') 22 | op.add_column('list_predictions', sa.Column('rank_pct_with_ties', sa.Float(), nullable=True), schema='production') 23 | op.alter_column('list_predictions', 'rank_abs', new_column_name='rank_abs_no_ties', schema='production') 24 | op.alter_column('list_predictions', 'rank_pct', new_column_name='rank_pct_no_ties', schema='production') 25 | # ### end Alembic commands ### 26 | 27 | 28 | def downgrade(): 29 | # ### commands auto generated by Alembic - please adjust! ### 30 | op.alter_column('list_predictions', 'rank_abs_no_ties', new_column_name='rank_abs', schema='production') 31 | op.alter_column('list_predictions', 'rank_pct_no_ties', new_column_name='rank_pct', schema='production') 32 | op.drop_column('list_predictions', 'rank_pct_with_ties', schema='production') 33 | op.drop_column('list_predictions', 'rank_abs_with_ties', schema='production') 34 | # ### end Alembic commands ### 35 | -------------------------------------------------------------------------------- /src/triage/component/results_schema/alembic/versions/d0ac573eaf1a_model_group_stored_procedure.py: -------------------------------------------------------------------------------- 1 | """model_group_stored_procedure 2 | 3 | Revision ID: d0ac573eaf1a 4 | Revises: 2446a931de7a 5 | Create Date: 2018-06-20 17:44:27.162699 6 | 7 | """ 8 | from alembic import op 9 | import os 10 | 11 | 12 | # revision identifiers, used by Alembic. 
13 | revision = "d0ac573eaf1a" 14 | down_revision = "2446a931de7a" 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade(): 20 | group_proc_filename = os.path.join( 21 | os.path.dirname(__file__), "../../sql/model_group_stored_procedure.sql" 22 | ) 23 | with open(group_proc_filename) as fd: 24 | stmt = fd.read() 25 | op.execute(stmt) 26 | 27 | 28 | def downgrade(): 29 | pass 30 | -------------------------------------------------------------------------------- /src/triage/component/results_schema/alembic/versions/fa1760d35710_.py: -------------------------------------------------------------------------------- 1 | """empty message 2 | 3 | Revision ID: fa1760d35710 4 | Revises: a20104116533 5 | Create Date: 2020-07-16 18:07:58.229213 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | from sqlalchemy.dialects import postgresql 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = 'fa1760d35710' 14 | down_revision = 'a20104116533' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade(): 20 | op.add_column('experiments', sa.Column('random_seed', sa.Integer(), nullable=True), schema='triage_metadata') 21 | # ### end Alembic commands ### 22 | 23 | 24 | def downgrade(): 25 | op.drop_column('experiments', 'random_seed', schema='triage_metadata') 26 | # ### end Alembic commands ### 27 | -------------------------------------------------------------------------------- /src/triage/component/results_schema/example_db_config.yaml: -------------------------------------------------------------------------------- 1 | host: localhost 2 | user: results_schema 3 | pass: results_schema 4 | port: 5432 5 | db: results_schema 6 | -------------------------------------------------------------------------------- /src/triage/component/results_schema/sql/model_group_stored_procedure.sql: -------------------------------------------------------------------------------- 1 | /* 2 | Function for using the model group table. 
This function requires a table like
3 | -----------
4 | CREATE TABLE triage_metadata.model_groups
5 | (
6 |   model_group_id SERIAL PRIMARY KEY,
7 |   model_type TEXT,
8 |   hyperparameters JSONB,
9 |   feature_list TEXT [],
10 |   model_config JSONB
11 | );
12 | -----------
13 | populates the table and returns the IDs
14 | */
15 | CREATE OR REPLACE FUNCTION public.get_model_group_id(in_model_type TEXT,
16 |                                                      in_hyperparameters JSONB,
17 |                                                      in_feature_list TEXT [],
18 |                                                      in_model_config JSONB)
19 |   RETURNS INTEGER AS
20 | $BODY$
21 | DECLARE
22 |   model_group_return_id INTEGER;
23 | BEGIN
24 |   --Obtain an advisory lock on the table to avoid double execution
25 |   PERFORM pg_advisory_lock(60637);
26 | 
27 |   -- Check if the model_group_id exists, if not insert the model parameters and return the new value
28 |   SELECT *
29 |   INTO model_group_return_id
30 |   FROM triage_metadata.model_groups
31 |   WHERE
32 |     model_type = in_model_type
33 |     AND hyperparameters = in_hyperparameters
34 |     AND feature_list = ARRAY(Select unnest(in_feature_list) ORDER BY 1)
35 |     AND model_config = in_model_config ;
36 |   IF NOT FOUND
37 |   THEN
38 |     INSERT INTO triage_metadata.model_groups (model_group_id, model_type, hyperparameters, feature_list, model_config)
39 |     VALUES (DEFAULT, in_model_type, in_hyperparameters, ARRAY(Select unnest(in_feature_list) ORDER BY 1), in_model_config)
40 |     RETURNING model_group_id
41 |       INTO model_group_return_id;
42 |   END IF;
43 | 
44 |   -- Release the lock again
45 |   PERFORM pg_advisory_unlock(60637);
46 | 
47 | 
48 |   RETURN model_group_return_id;
49 | END;
50 | 
51 | $BODY$
52 | LANGUAGE plpgsql VOLATILE
53 | COST 100;
54 | 
55 | 
56 | 
57 | comment on function get_model_group_id (text, jsonb, text [], jsonb) is 'Function for using the model group table. This function requires a table like
58 | -----------
59 | CREATE TABLE triage_metadata.model_groups
60 | (
61 |   model_group_id SERIAL PRIMARY KEY,
62 |   model_type TEXT,
63 |   hyperparameters JSONB,
64 |   feature_list TEXT [],
65 |   model_config JSONB
66 | );
67 | -----------
68 | populates the table and returns the IDs';
69 | 
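A sketch of invoking the stored procedure from Python via SQLAlchemy (the model type, hyperparameters, and feature names below are made up; the `triage_metadata.model_groups` table must already exist):

    >>> from sqlalchemy import create_engine, text
    >>> engine = create_engine("postgresql://user:pass@host:5432/db")  # hypothetical DSN
    >>> with engine.connect() as conn:
    ...     model_group_id = conn.execute(text(
    ...         """SELECT get_model_group_id(
    ...                'sklearn.tree.DecisionTreeClassifier',
    ...                '{"max_depth": 5}'::jsonb,
    ...                ARRAY['feature_a', 'feature_b'],
    ...                '{}'::jsonb)"""
    ...     )).scalar()

Because the function takes a `pg_advisory_lock` before the select-or-insert, concurrent callers asking for the same configuration get the same id back instead of creating duplicate rows.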
--------------------------------------------------------------------------------
/src/triage/component/results_schema/sql/nuke_triage.sql:
--------------------------------------------------------------------------------
1 | /*
2 | Function for wiping out all triage tables, schemas, functions and indexes.
3 | It also deletes results_schema_versions (from alembic)
4 | Useful when you try to start clean (again) and want to recover from some
5 | previous errors
6 | */
7 | create or replace function nuke_triage()
8 | returns text as $result$
9 | 
10 | declare
11 |   result text;
12 |   query text;
13 | 
14 | begin
15 | 
16 |   execute 'drop schema if exists triage_metadata cascade';
17 |   raise notice 'triage_metadata deleted';
18 |   execute 'drop schema if exists features cascade';
19 |   raise notice 'features deleted';
20 |   execute 'drop schema if exists train_results cascade';
21 |   raise notice 'train_results deleted';
22 |   execute 'drop schema if exists test_results cascade';
23 |   raise notice 'test_results deleted';
24 | 
25 |   execute 'drop table if exists results_schema_versions';
26 |   raise notice 'results_schema_versions deleted';
27 | 
28 |   execute 'drop function if exists get_model_group_id';
29 |   raise notice 'get_model_group_id deleted';
30 | 
31 |   execute 'drop type if exists experimentrunstatus';
32 |   raise notice 'experimentrunstatus type deleted';
33 | 
34 | 
35 |   select into query
36 |     string_agg(
37 |       format('drop table %I cascade;', tablename), E'\n'
38 |     )
39 |   from pg_tables
40 |   where tablename ~ 'cohort_|labels_|ranks_';
41 | 
42 |   if query is not null then
43 |     raise notice '%', query;
44 |     execute query;
45 |   else
46 |     raise notice 'no labels or states tables from triage found';
47 |   end if;
48 | 
49 |   return 'triage was sent to oblivion. Long live triage!';
50 | end;
51 | $result$ language plpgsql;
52 | 
53 | comment on function nuke_triage () is 'Function for wiping out all triage tables, schemas, functions and indexes.
54 | It also deletes results_schema_versions (from alembic)
55 | Useful when you try to start clean (again) and want to recover from some
56 | previous errors';
57 | 
--------------------------------------------------------------------------------
/src/triage/component/timechop/__init__.py:
--------------------------------------------------------------------------------
1 | from .timechop import Timechop
2 | 
3 | __all__ = ("Timechop",)
4 | 
--------------------------------------------------------------------------------
/src/triage/component/timechop/utils.py:
--------------------------------------------------------------------------------
1 | from six import string_types
2 | 
3 | 
4 | def convert_to_list(x):
5 |     """
6 |     Given an object, if it is not a list, convert it to a list.
--------------------------------------------------------------------------------
/src/triage/component/timechop/__init__.py:
--------------------------------------------------------------------------------
from .timechop import Timechop

__all__ = ("Timechop",)
--------------------------------------------------------------------------------
/src/triage/component/timechop/utils.py:
--------------------------------------------------------------------------------
from six import string_types


def convert_to_list(x):
    """
    Given an object, convert it to a list if it is not one already.

    Strings are treated as scalars (wrapped in a single-element list)
    rather than being iterated character by character.

    Arguments:
        x (object): an object to be converted to a list

    Returns:
        list: x as a list
    """
    if isinstance(x, string_types):
        return [x]

    try:
        iter(x)
    except TypeError:
        return [x]
    else:
        return list(x)
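A short behavioral sketch (added here for illustration, not in the source) of the cases the function distinguishes:

# Assumed, illustrative behavior of convert_to_list:
from triage.component.timechop.utils import convert_to_list

assert convert_to_list("1y") == ["1y"]                        # string: wrapped, not iterated
assert convert_to_list(["1y", "6month"]) == ["1y", "6month"]  # list: passed through
assert convert_to_list(("1y", "6month")) == ["1y", "6month"]  # other iterable: converted
assert convert_to_list(42) == [42]                            # non-iterable: wrapped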
| """JSON serializer for objects not serializable by default json code""" 18 | 19 | if isinstance(obj, date): 20 | return str(obj.isoformat()) 21 | 22 | if isinstance(obj, (DateRange, DateTimeRange)): 23 | return f"[{obj.lower}, {obj.upper}]" 24 | 25 | return obj 26 | 27 | 28 | def json_dumps(d): 29 | return json.dumps(d, default=serialize_to_database) 30 | 31 | 32 | 33 | class SerializableDbEngine(wrapt.ObjectProxy): 34 | """A sqlalchemy engine that can be serialized across process boundaries. 35 | 36 | Works by saving all kwargs used to create the engine and reconstructs them later. As a result, the state won't be saved upon serialization/deserialization. 37 | """ 38 | 39 | __slots__ = ("url", "creator", "kwargs") 40 | 41 | def __init__(self, url, *, creator=sqlalchemy.create_engine, **kwargs): 42 | self.url = make_url(url) 43 | self.creator = creator 44 | self.kwargs = kwargs 45 | 46 | engine = creator(url, **kwargs) 47 | super().__init__(engine) 48 | 49 | def __reduce__(self): 50 | return (self.__reconstruct__, (self.url, self.creator, self.kwargs)) 51 | 52 | def __reduce_ex__(self, protocol): 53 | # wrapt requires reduce_ex to be implemented 54 | return self.__reduce__() 55 | 56 | @classmethod 57 | def __reconstruct__(cls, url, creator, kwargs): 58 | return cls(url, creator=creator, **kwargs) 59 | 60 | 61 | create_engine = functools.partial(SerializableDbEngine, json_serializer=json_dumps) 62 | 63 | @contextmanager 64 | def scoped_session(db_engine): 65 | """Provide a transactional scope around a series of operations.""" 66 | session = Session(bind=db_engine) 67 | try: 68 | yield session 69 | session.commit() 70 | except: 71 | session.rollback() 72 | raise 73 | finally: 74 | session.close() 75 | 76 | 77 | @contextmanager 78 | def get_for_update(db_engine, orm_class, primary_key): 79 | """ Gets object from the database to updated it """ 80 | with scoped_session(db_engine) as session: 81 | obj = session.query(orm_class).get(primary_key) 82 | yield obj 83 | session.merge(obj) 84 | -------------------------------------------------------------------------------- /src/triage/util/defaults.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/src/triage/util/defaults.py -------------------------------------------------------------------------------- /src/triage/util/introspection.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | 3 | 4 | def classpath(klass): 5 | """Return the full class path 6 | 7 | Args: 8 | klass (class): A class 9 | """ 10 | return f"{klass.__module__}.{klass.__name__}" 11 | 12 | 13 | def bind_kwargs(kallable, **kwargs): 14 | """Bind keyword arguments to a callable and return as a dictionary 15 | 16 | Args: 17 | callable (callable): any callable 18 | **kwargs: keyword arguments to bind 19 | 20 | Returns: (dict) 21 | """ 22 | call_signature = inspect.signature(kallable).bind_partial(**kwargs).arguments 23 | if 'kwargs' in call_signature: 24 | passed_kwargs = call_signature['kwargs'] 25 | else: 26 | passed_kwargs = call_signature 27 | return passed_kwargs 28 | -------------------------------------------------------------------------------- /src/triage/util/pandas.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | import pandas as pd 3 | import numpy as np 4 | 5 | import verboselogs, logging 6 | logger = 
--------------------------------------------------------------------------------
/src/triage/util/defaults.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/src/triage/util/defaults.py
--------------------------------------------------------------------------------
/src/triage/util/introspection.py:
--------------------------------------------------------------------------------
import inspect


def classpath(klass):
    """Return the full class path

    Args:
        klass (class): A class
    """
    return f"{klass.__module__}.{klass.__name__}"


def bind_kwargs(kallable, **kwargs):
    """Bind keyword arguments to a callable and return them as a dictionary

    Args:
        kallable (callable): any callable
        **kwargs: keyword arguments to bind

    Returns: (dict)
    """
    call_signature = inspect.signature(kallable).bind_partial(**kwargs).arguments
    if 'kwargs' in call_signature:
        passed_kwargs = call_signature['kwargs']
    else:
        passed_kwargs = call_signature
    return passed_kwargs
--------------------------------------------------------------------------------
/src/triage/util/pandas.py:
--------------------------------------------------------------------------------
from functools import partial
import pandas as pd
import numpy as np

import verboselogs, logging
logger = verboselogs.VerboseLogger(__name__)


def downcast_matrix(df):
    """Downcast the numeric values of a matrix.

    This makes the matrix use less memory by turning every number into a
    float32. Converting int64 columns to int32 costs more time than converting
    the whole matrix to float32, which is still less memory intensive than the
    original matrix.

    Operates on the dataframe as passed, without doing anything to the index.
    Callers may pass an index-less dataframe if they wish to re-add the index
    afterwards and save memory on the index storage.
    """
    logger.spam("Downcasting matrix.")
    logger.spam(f"Starting memory usage: {df.memory_usage(deep=True).sum()/1000000} MB")
    logger.spam(f"Initial types: \n {df.dtypes}")

    df = df.apply(lambda x: x.astype('float32'))

    logger.spam("Downcasting matrix completed.")
    logger.spam(f"Final memory usage: {df.memory_usage(deep=True).sum()/1000000} MB")
    logger.spam(f"Final data types: \n {df.dtypes}")

    return df
--------------------------------------------------------------------------------
/src/triage/util/random.py:
--------------------------------------------------------------------------------
import random


FLOAT_TO_INT_MULTIPLIER = 2000000000


def generate_python_random_seed():
    """Generate a random integer suitable for seeding the Python random generator"""
    return int(random.uniform(0, 1.0) * FLOAT_TO_INT_MULTIPLIER)
--------------------------------------------------------------------------------
/src/triage/util/structs.py:
--------------------------------------------------------------------------------
"""Classes representing simple but deep data structures that we reuse throughout
Triage code and want to display more intelligently in log files
"""


class TruncatedRepresentationList(list):
    def __repr__(self):
        total = len(self)
        if total != 1:
            return f"[{self[0]} ... {self[-1]} (Total: {total})]"
        else:
            return f"[{self[0]}] (Total: {total})"


class AsOfTimeList(TruncatedRepresentationList):
    pass


class FeatureNameList(TruncatedRepresentationList):
    pass
--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
[tox]
envlist = py3

[testenv:py3]
setenv =
    AWS_ACCESS_KEY_ID=fake
    AWS_SECRET_ACCESS_KEY=fake
    BOTO_CONFIG=/dev/null
deps = -r{toxinidir}/requirement/test.txt
commands = py.test --basetemp={envtmpdir} {posargs:-vvv --cov=triage}
extras = rq
--------------------------------------------------------------------------------
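To close, two illustrative sketches (added for this document, not present in the repository). First, the effect of downcast_matrix on a small frame:

# Assumed behavior of downcast_matrix on a toy dataframe.
import pandas as pd

from triage.util.pandas import downcast_matrix

df = pd.DataFrame({"feature_a": [1, 2, 3], "feature_b": [0.5, 1.5, 2.5]})
print(df.dtypes.tolist())                   # [dtype('int64'), dtype('float64')]
print(downcast_matrix(df).dtypes.tolist())  # [dtype('float32'), dtype('float32')]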
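And second, the truncated repr that keeps long as-of-time lists readable in log files:

# Assumed behavior of the truncated list representations.
from triage.util.structs import AsOfTimeList

times = AsOfTimeList(f"2016-{month:02d}-01" for month in range(1, 13))
print(repr(times))  # [2016-01-01 ... 2016-12-01 (Total: 12)]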