├── .bandit.yml ├── .codeclimate.yml ├── .dockerignore ├── .editorconfig ├── .github └── workflows │ ├── build-mkdocs.yaml │ ├── publish-to-pypi.yml │ └── test.yaml ├── .gitignore ├── .python-version.current ├── .pyup.yml ├── .travis.yml ├── AUTHORS.rst ├── CONTRIBUTING.md ├── Dockerfile ├── HISTORY.rst ├── LICENSE ├── MANIFEST.in ├── README.md ├── develop ├── dirtyduck ├── .dockerignore ├── docker-compose.yml └── food_db │ ├── 00_create_extensions.sql │ ├── 01_create_inspections_table.sql │ ├── 02_create_cleaned_inspections_table.sql │ ├── 03_create_violations_table.sql │ ├── 04_create_semantic_tables.sql │ ├── Dockerfile │ └── inspections_2014_2017.csv.bz2 ├── docs ├── __init__.py ├── md_autogen.py ├── mkdocs.yml ├── sources │ ├── api │ │ ├── audition │ │ │ ├── audition-config.md │ │ │ ├── auditioner.md │ │ │ ├── database-dependencies.md │ │ │ ├── index.md │ │ │ └── selection_rules.md │ │ └── timechop │ │ │ ├── index.md │ │ │ ├── plotting.md │ │ │ └── timechop.md │ ├── audition │ │ ├── audition_intro.md │ │ ├── images │ │ │ └── sanjose-2.png │ │ └── model_selection.md │ ├── db.md │ ├── dirtyduck │ │ ├── .swp │ │ ├── AUTHORS │ │ ├── aws_batch.md │ │ ├── choose_your_own_adventure.md │ │ ├── data_preparation.md │ │ ├── dirty_duckling.md │ │ ├── eis.md │ │ ├── for_the_impatient.md │ │ ├── images │ │ │ ├── AWS_Batch_Architecture.png │ │ │ ├── AWS_Batch_Architecture.svg │ │ │ ├── EDA │ │ │ │ ├── facilities_inspected_over_time.png │ │ │ │ ├── facilities_inspections_over_time.png │ │ │ │ ├── facilities_with_failed_inspections_severe_violations_over_time.png │ │ │ │ ├── facilities_with_inspections_failed_over_time.png │ │ │ │ ├── failed_inspections_over_time.png │ │ │ │ ├── failed_inspections_severe_violations_over_time.png │ │ │ │ └── inspections_over_time.png │ │ │ ├── audition │ │ │ │ ├── eis │ │ │ │ │ ├── distance_from_best_precision@10_pct.png │ │ │ │ │ ├── metric_over_time_precision@10_pct.png │ │ │ │ │ ├── precision@10_pct_next_time.png │ │ │ │ │ ├── regret_distance_from_best_rules_precision@10_pct.png │ │ │ │ │ └── regret_over_time_precision@10_pct.png │ │ │ │ └── inspections │ │ │ │ │ ├── distance_from_best_precision@10_pct.png │ │ │ │ │ ├── metric_over_time_precision@10_pct.png │ │ │ │ │ ├── precision@10_pct_next_time.png │ │ │ │ │ ├── regret_distance_from_best_rules_precision@10_pct.png │ │ │ │ │ └── regret_over_time_precision@10_pct.png │ │ │ ├── data_road.png │ │ │ ├── outcomes-eis.png │ │ │ ├── outcomes-inspections.png │ │ │ ├── postmodeling │ │ │ │ ├── eis_jaccard_on_lists_over_time.png │ │ │ │ ├── eis_mg_prec_over_time.png │ │ │ │ ├── eis_mg_recall_over_time.png │ │ │ │ ├── eis_model_group_64_feature_group_importances.png │ │ │ │ ├── eis_model_group_64_feature_importances.png │ │ │ │ ├── eis_postmodeling_config.yaml │ │ │ │ ├── inspection_jaccard_on_lists_over_time.png │ │ │ │ ├── inspection_mg_prec_over_time.png │ │ │ │ ├── inspection_mg_recall_over_time.png │ │ │ │ ├── inspection_model_group_39_model_125_feature_group_importances.png │ │ │ │ ├── inspection_model_group_39_model_125_feature_importances.png │ │ │ │ ├── inspection_model_group_39_model_125_rayid_curve.png │ │ │ │ └── inspection_postmodeling_config.yaml │ │ │ ├── quickstart.png │ │ │ ├── rolling-origin.png │ │ │ ├── sanjose-2.png │ │ │ ├── timechop │ │ │ │ ├── timechop_1.png │ │ │ │ ├── timechop_10.png │ │ │ │ ├── timechop_2.png │ │ │ │ ├── timechop_3.png │ │ │ │ ├── timechop_4.png │ │ │ │ ├── timechop_5.png │ │ │ │ ├── timechop_6.png │ │ │ │ ├── timechop_7.png │ │ │ │ ├── timechop_8.png │ │ │ │ └── timechop_9.png │ │ 
│ └── triage │ │ │ │ ├── distance_from_best_precision@10_pct.png │ │ │ │ ├── eis_01.png │ │ │ │ ├── inspections_baseline.png │ │ │ │ ├── inspections_dt.png │ │ │ │ ├── inspections_label_failed_01.png │ │ │ │ ├── metric_over_time_precision@10_pct.png │ │ │ │ ├── precision@10_pct_next_time.png │ │ │ │ ├── regret_distance_from_best_rules_precision@10_pct.png │ │ │ │ ├── regret_over_time_precision@10_pct.png │ │ │ │ ├── results_model_group_ids.json │ │ │ │ └── simple_test_skeleton.png │ │ ├── index.md │ │ ├── infrastructure.md │ │ ├── inspections.md │ │ ├── ml_governance.md │ │ ├── problem_description.md │ │ ├── triage_intro.md │ │ └── who_is_this_tutorial_for.md │ ├── experiments │ │ ├── algorithm.md │ │ ├── architecture.md │ │ ├── cohort-labels.md │ │ ├── experiment-config.md │ │ ├── feature-testing.md │ │ ├── features.md │ │ ├── featuretest-cli.png │ │ ├── featuretest-result.png │ │ ├── prediction-ranking.md │ │ ├── running.md │ │ ├── temporal-validation.md │ │ ├── temporal_config_graph.png │ │ ├── timechops.png │ │ ├── upgrade-to-v5.md │ │ ├── upgrade-to-v6.md │ │ ├── upgrade-to-v7.md │ │ └── upgrade-to-v8.md │ ├── index.md │ ├── js │ │ └── mermaid.min.js │ ├── postmodeling │ │ ├── index.md │ │ ├── postmodeling-config.md │ │ └── postmodeling_general_flow.png │ ├── predictlist │ │ └── index.md │ ├── quickstart.md │ ├── triage.experiments.base.md │ ├── triage.experiments.multicore.md │ ├── triage.experiments.singlethreaded.md │ ├── triage_docs.css │ └── triage_project_workflow.md └── update_docs.py ├── example ├── aws_batch │ ├── aws_env.example │ ├── credentials.filter.example │ ├── deploy.sh │ ├── triage-job-definition.json.example │ └── triage-overrides.json.example ├── cohort │ └── past_events.sql ├── colab │ └── colab_triage.ipynb ├── config │ ├── README.md │ ├── audition.yaml │ ├── database.yaml │ ├── dirty-duckling.yaml │ ├── experiment.yaml │ ├── postmodeling_config.yaml │ └── postmodeling_crosstabs.yaml ├── dirtyduck │ ├── audition │ │ ├── eis_audition_config.yaml │ │ ├── inspection_audition_config.yaml │ │ └── inspections │ │ │ ├── distance_from_best_precision@10_pct.png │ │ │ ├── distance_from_best_precision@15_pct.png │ │ │ ├── metric_over_time_precision@10_pct.png │ │ │ ├── metric_over_time_precision@15_pct.png │ │ │ ├── precision@10_pct_next_time.png │ │ │ ├── precision@15_pct_next_time.png │ │ │ ├── regret_distance_from_best_rules_precision@10_pct.png │ │ │ ├── regret_distance_from_best_rules_precision@15_pct.png │ │ │ ├── regret_over_time_precision@10_pct.png │ │ │ ├── regret_over_time_precision@15_pct.png │ │ │ └── results_model_group_ids.json │ ├── crosstabs │ │ └── eis_crosstabs_config.yaml │ ├── experiments │ │ ├── dirty-duckling.yaml │ │ ├── eis_01.yaml │ │ ├── eis_02.yaml │ │ ├── inspections_baseline.yaml │ │ ├── inspections_dt.yaml │ │ ├── inspections_label_failed_01.yaml │ │ ├── inspections_label_failed_02.yaml │ │ └── simple_test_skeleton.yaml │ ├── images │ │ ├── distance_from_best_precision@10_pct.png │ │ ├── eis_01.png │ │ ├── inspections_baseline.png │ │ ├── inspections_dt.png │ │ ├── inspections_label_failed_01.png │ │ ├── metric_over_time_precision@10_pct.png │ │ ├── precision@10_pct_next_time.png │ │ ├── regret_distance_from_best_rules_precision@10_pct.png │ │ ├── regret_over_time_precision@10_pct.png │ │ ├── results_model_group_ids.json │ │ └── simple_test_skeleton.png │ ├── output │ │ ├── .gitkeep │ │ └── images │ │ │ ├── .gitkeep │ │ │ ├── eis.svg │ │ │ ├── inspections.svg │ │ │ ├── inspections_dt.svg │ │ │ ├── inspections_test.svg │ │ │ ├── 
model_7_tree_0.svg │ │ │ └── simple_test_skeleton.svg │ └── postmodeling │ │ ├── database.yaml │ │ ├── eis_postmodeling_config.yaml │ │ ├── inspection_jaccard_on_lists_over_time.png │ │ ├── inspection_mg_prec_over_time.png │ │ ├── inspection_mg_recall_over_time.png │ │ ├── inspection_model_group_39_model_125_feature_group_importances.png │ │ ├── inspection_model_group_39_model_125_feature_importances.png │ │ ├── inspection_model_group_39_model_125_rayid_curve.png │ │ └── inspection_postmodeling_config.yaml └── label │ └── events.sql ├── manage.py ├── pytest.ini ├── requirement ├── dev.txt ├── extras-rq.txt ├── include │ ├── build.txt │ ├── lint.txt │ └── test-management.txt ├── main.txt └── test.txt ├── setup.cfg ├── setup.py ├── src ├── tests │ ├── __init__.py │ ├── architect_tests │ │ ├── README.md │ │ ├── __init__.py │ │ ├── test_builders.py │ │ ├── test_entity_date_table_generators.py │ │ ├── test_feature_dictionary_creator.py │ │ ├── test_feature_generators.py │ │ ├── test_feature_group_creator.py │ │ ├── test_feature_group_mixer.py │ │ ├── test_integration.py │ │ ├── test_label_generators.py │ │ ├── test_planner.py │ │ └── utils.py │ ├── audition_tests │ │ ├── __init__.py │ │ ├── test_audition.py │ │ ├── test_distance_from_best.py │ │ ├── test_model_group_performance.py │ │ ├── test_plotting.py │ │ ├── test_preaudition.py │ │ ├── test_regrets.py │ │ ├── test_rules_maker.py │ │ ├── test_selection_rule_grid.py │ │ ├── test_selection_rule_performance.py │ │ ├── test_selection_rules.py │ │ ├── test_thresholding.py │ │ └── utils.py │ ├── catwalk_tests │ │ ├── README.md │ │ ├── __init__.py │ │ ├── test_baselines.py │ │ ├── test_estimators.py │ │ ├── test_evaluation.py │ │ ├── test_feature_importances.py │ │ ├── test_individual_importance.py │ │ ├── test_individual_importance_uniform.py │ │ ├── test_integration.py │ │ ├── test_metrics.py │ │ ├── test_model_grouping.py │ │ ├── test_model_trainers.py │ │ ├── test_predictors.py │ │ ├── test_protected_groups_generators.py │ │ ├── test_storage.py │ │ ├── test_utils.py │ │ └── utils.py │ ├── collate_tests │ │ ├── __init__.py │ │ ├── create_inspections_subset.py │ │ ├── food_inspections_subset.csv │ │ ├── initialize_db.py │ │ ├── test_collate.py │ │ ├── test_from_obj.py │ │ ├── test_helpers.py │ │ ├── test_imputation_output.py │ │ ├── test_imputations.py │ │ ├── test_integration.py │ │ └── test_spacetime.py │ ├── conftest.py │ ├── example_schema.yaml │ ├── postmodeling_tests │ │ ├── test_add_predictions.py │ │ ├── test_crosstabs.py │ │ ├── test_model_evaluator.py │ │ ├── test_model_group_evaluator.py │ │ └── test_without_predictions.py │ ├── results_tests │ │ ├── __init__.py │ │ ├── factories.py │ │ ├── test_factories.py │ │ ├── test_upgrade_if_clean.py │ │ └── test_valid_schema.py │ ├── test_cli.py │ ├── test_database_reflection.py │ ├── test_defaults.py │ ├── test_experiments.py │ ├── test_partial_experiments.py │ ├── test_predictlist.py │ ├── test_tracking_experiments.py │ ├── test_utils.py │ ├── test_utils_pandas.py │ ├── test_validation.py │ ├── test_validation_primitives.py │ ├── timechop_tests │ │ ├── __init__.py │ │ ├── test_plotting.py │ │ ├── test_timechop.py │ │ └── test_utils.py │ └── utils.py └── triage │ ├── __init__.py │ ├── cli.py │ ├── component │ ├── __init__.py │ ├── architect │ │ ├── README.md │ │ ├── __init__.py │ │ ├── builders.py │ │ ├── database_reflection.py │ │ ├── entity_date_table_generators.py │ │ ├── feature_dictionary_creator.py │ │ ├── feature_generators.py │ │ ├── feature_group_creator.py │ │ ├── 
feature_group_mixer.py │ │ ├── features.py │ │ ├── label_generators.py │ │ ├── planner.py │ │ ├── utils.py │ │ └── validations.py │ ├── audition │ │ ├── Audition_Tutorial.ipynb │ │ ├── README.md │ │ ├── __init__.py │ │ ├── distance_from_best.py │ │ ├── metric_directionality.py │ │ ├── model_group_performance.py │ │ ├── plotting.py │ │ ├── pre_audition.py │ │ ├── regrets.py │ │ ├── rules_maker.py │ │ ├── selection_rule_grid.py │ │ ├── selection_rule_performance.py │ │ ├── selection_rules.py │ │ ├── thresholding.py │ │ └── utils.py │ ├── catwalk │ │ ├── README.rst │ │ ├── __init__.py │ │ ├── baselines │ │ │ ├── __init__.py │ │ │ ├── rankers.py │ │ │ └── thresholders.py │ │ ├── db.py │ │ ├── estimators │ │ │ ├── __init__.py │ │ │ ├── classifiers.py │ │ │ ├── dsapp_scaler.org │ │ │ └── transformers.py │ │ ├── evaluation.py │ │ ├── exceptions.py │ │ ├── feature_importances.py │ │ ├── individual_importance │ │ │ ├── __init__.py │ │ │ └── uniform.py │ │ ├── metrics.py │ │ ├── model_grouping.py │ │ ├── model_trainers.py │ │ ├── predictors.py │ │ ├── protected_groups_generators.py │ │ ├── storage.py │ │ ├── subsetters.py │ │ └── utils.py │ ├── collate │ │ ├── README.rst │ │ ├── __init__.py │ │ ├── collate.py │ │ ├── from_obj.py │ │ ├── imputations.py │ │ ├── spacetime.py │ │ └── sql.py │ ├── postmodeling │ │ ├── __init__.py │ │ ├── add_predictions.py │ │ ├── add_predictions_example_config.yaml │ │ ├── base.py │ │ ├── crosstabs.py │ │ ├── deprecated │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── db_credentials_example.yaml │ │ │ ├── model_analyzer.py │ │ │ ├── model_evaluator.py │ │ │ ├── model_group_evaluator.py │ │ │ ├── parameters.py │ │ │ ├── postmodeling_analyzer.py │ │ │ ├── postmodeling_tutorial.ipynb │ │ │ └── utils │ │ │ │ ├── __init__.py │ │ │ │ └── aux_funcs.py │ │ ├── error_analysis.py │ │ ├── error_analysis_example.ipynb │ │ ├── example_experiment_summary_given_experiment_hashes.ipynb │ │ ├── example_experiment_summary_report_specific_experiment_wSubsets.ipynb │ │ ├── example_generate_experiment_summary_report_automatically_after_experiment_run.ipynb │ │ ├── example_triage_report.html │ │ ├── experiment_summarizer.py │ │ ├── experiment_summary_report_template.ipynb │ │ ├── fairness │ │ │ ├── __init__.py │ │ │ ├── aequitas_example.ipynb │ │ │ └── aequitas_utils.py │ │ ├── list_analysis.py │ │ ├── postmodeling_config.yaml │ │ ├── postmodeling_report_example_acdhs_housing.ipynb │ │ ├── readme.md │ │ └── utils │ │ │ └── __init__.py │ ├── results_schema │ │ ├── README.md │ │ ├── __init__.py │ │ ├── alembic.ini │ │ ├── alembic │ │ │ ├── README │ │ │ ├── __init__.py │ │ │ ├── env.py │ │ │ ├── script.py.mako │ │ │ └── versions │ │ │ │ ├── 079a74c15e8b_merge_b097e47ba829_with_cdd0dc9d9870.py │ │ │ │ ├── 0bca1ba9706e_add_matrix_uuid_to_eval.py │ │ │ │ ├── 0d44655e35fd_.py │ │ │ │ ├── 1b990cbc04e4_production_schema.py │ │ │ │ ├── 2446a931de7a_changing_column_names_and_removing_.py │ │ │ │ ├── 264245ddfce2_.py │ │ │ │ ├── 264786a9fe85_add_label_value_to_prodcution_table.py │ │ │ │ ├── 38f37d013686_associate_experiments_with_models_and_.py │ │ │ │ ├── 3ce027594a5c_add_hashes_to_runs.py │ │ │ │ ├── 45219f25072b_hash_partitioning_predictions_tables.py │ │ │ │ ├── 4ae804cc0977_.py │ │ │ │ ├── 50e1f1bc2cac_add_subsets.py │ │ │ │ ├── 5dd2ba8222b1_add_run_type.py │ │ │ │ ├── 609c7cc51794_rankify_predictions.py │ │ │ │ ├── 670289044eb2_add_production_prediction_metadata.py │ │ │ │ ├── 72ac5cbdca05_change_importance_to_float.py │ │ │ │ ├── 7d57d1cf3429_.py │ │ │ │ ├── 89a8ce240bae_.py │ │ │ │ ├── 
8b3f167d0418_.py │ │ │ │ ├── 8cef808549dd_.py │ │ │ │ ├── 97cf99b7348f_evaluation_randomness.py │ │ │ │ ├── 9bbfdcf8bab0_.py │ │ │ │ ├── __init__.py │ │ │ │ ├── a20104116533_.py │ │ │ │ ├── a98acf92fd48_add_nuke_triage_function.py │ │ │ │ ├── b097e47ba829_remove_random_seed_from_experiments.py │ │ │ │ ├── b4d7569d31cb_aequitas.py │ │ │ │ ├── cdd0dc9d9870_rename_production_schema_and_prediction_table.py │ │ │ │ ├── ce5b50ffa8e2_break_ties_in_list_predictions.py │ │ │ │ ├── cfd5c3386014_add_experiment_runs.py │ │ │ │ ├── d0ac573eaf1a_model_group_stored_procedure.py │ │ │ │ └── fa1760d35710_.py │ │ ├── example_db_config.yaml │ │ ├── schema.py │ │ └── sql │ │ │ ├── model_group_stored_procedure.sql │ │ │ ├── nuke_triage.sql │ │ │ ├── predictions_hash_partitioning.sql │ │ │ └── undo_predictions_hash_partitioning.sql │ └── timechop │ │ ├── README.md │ │ ├── __init__.py │ │ ├── plotting.py │ │ ├── timechop.py │ │ └── utils.py │ ├── config │ └── logging.yaml │ ├── database_reflection.py │ ├── experiments │ ├── __init__.py │ ├── base.py │ ├── defaults.py │ ├── model_grid_presets.yaml │ ├── multicore.py │ ├── rq.py │ ├── singlethreaded.py │ └── validate.py │ ├── predictlist │ ├── __init__.py │ └── utils.py │ ├── tracking.py │ ├── util │ ├── __init__.py │ ├── conf.py │ ├── db.py │ ├── defaults.py │ ├── introspection.py │ ├── pandas.py │ ├── random.py │ └── structs.py │ └── validation_primitives.py ├── tox.ini └── tutorial.sh /.bandit.yml: -------------------------------------------------------------------------------- 1 | skips: ['B101'] 2 | -------------------------------------------------------------------------------- /.codeclimate.yml: -------------------------------------------------------------------------------- 1 | version: "2" # required to adjust maintainability checks 2 | plugins: 3 | pep8: 4 | enabled: true 5 | checks: 6 | argument-count: 7 | config: 8 | threshold: 5 9 | complex-logic: 10 | config: 11 | threshold: 4 12 | file-lines: 13 | config: 14 | threshold: 400 15 | method-complexity: 16 | config: 17 | threshold: 5 18 | method-count: 19 | config: 20 | threshold: 20 21 | method-lines: 22 | config: 23 | threshold: 25 24 | nested-control-flow: 25 | config: 26 | threshold: 4 27 | return-statements: 28 | config: 29 | threshold: 4 30 | similar-code: 31 | enabled: false 32 | identical-code: 33 | enabled: false 34 | exclude_patterns: 35 | - "src/triage/component/results_schema/alembic/versions" 36 | - "src/triage/component/results_schema/alembic/env.py" 37 | - "src/triage/component/results_schema/schema.py" 38 | - "docs/" 39 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | .cache 2 | .config 3 | docker 4 | docs 5 | example 6 | develop 7 | .git 8 | .hypothesis 9 | .local 10 | .pytest_cache 11 | tox.ini 12 | .travis.yml 13 | tutorial.sh 14 | .pyup.yml 15 | .python-version 16 | .python-version.current 17 | *.rst 18 | *.yml 19 | *.inc 20 | CONTRIBUTING.md 21 | database.yaml 22 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | # http://editorconfig.org 2 | 3 | root = true 4 | 5 | [*] 6 | indent_style = space 7 | indent_size = 4 8 | trim_trailing_whitespace = true 9 | insert_final_newline = true 10 | charset = utf-8 11 | end_of_line = lf 12 | 13 | [*.bat] 14 | indent_style = tab 15 | end_of_line = crlf 16 | 17 | [LICENSE] 18 | 
insert_final_newline = false 19 | 20 | [Makefile] 21 | indent_style = tab 22 | -------------------------------------------------------------------------------- /.github/workflows/build-mkdocs.yaml: -------------------------------------------------------------------------------- 1 | name: Build Docs and Serve to GitHub Pages 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | paths: 8 | - 'docs/**' 9 | 10 | jobs: 11 | docs: 12 | name: Build Docs and Serve to GitHub Pages 13 | runs-on: ubuntu-latest 14 | steps: 15 | - name: Checkout repository 16 | uses: actions/checkout@v4 17 | with: 18 | fetch-depth: 0 19 | 20 | - name: Set up Python 3.9 21 | uses: actions/setup-python@v4 22 | with: 23 | python-version: 3.9 24 | 25 | - name: Install dependencies 26 | run: | 27 | pip install --upgrade pip && pip install -r requirement/dev.txt 28 | pip install git+https://github.com/dssg/triage.git@master 29 | git config user.name 'github-actions[bot]' && git config user.email 'github-actions[bot]@users.noreply.github.com' 30 | 31 | - name: Publish docs 32 | run: mkdocs gh-deploy -f "$(pwd)/docs/mkdocs.yml" 33 | -------------------------------------------------------------------------------- /.github/workflows/publish-to-pypi.yml: -------------------------------------------------------------------------------- 1 | name: Publish Python distributions to PyPI 2 | 3 | on: 4 | push: 5 | tags: 6 | - '*' 7 | 8 | jobs: 9 | build: 10 | name: Build and publish Python distributions to PyPI 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v4 14 | - name: Set up Python 15 | uses: actions/setup-python@v4 16 | with: 17 | python-version: 3.9 18 | - name: Install pypa/build 19 | run: >- 20 | python -m 21 | pip install 22 | build 23 | --user 24 | - name: Build a binary wheel and a source tarball 25 | run: python -m build 26 | - name: Store the distribution packages 27 | uses: actions/upload-artifact@v4 28 | with: 29 | name: python-package-distributions 30 | path: dist/ 31 | publish-to-pypi: 32 | name: Publish Python distribution to PyPI 33 | if: startsWith(github.ref, 'refs/tags/') 34 | needs: 35 | - build 36 | runs-on: ubuntu-latest 37 | environment: 38 | name: pypi 39 | url: https://pypi.org/p/triage 40 | steps: 41 | - name: Download all the dists 42 | uses: actions/download-artifact@v4 43 | with: 44 | name: python-package-distributions 45 | path: dist/ 46 | - name: Publish distribution to PyPI 47 | uses: pypa/gh-action-pypi-publish@release/v1 48 | with: 49 | password: ${{ secrets.PYPI_API_TOKEN }} 50 | 51 | -------------------------------------------------------------------------------- /.github/workflows/test.yaml: -------------------------------------------------------------------------------- 1 | name: Python package 2 | 3 | on: 4 | push: 5 | tags: 6 | - '*' 7 | 8 | 9 | jobs: 10 | build: 11 | runs-on: ubuntu-latest 12 | strategy: 13 | matrix: 14 | python-version: ['3.9', '3.10'] 15 | 16 | steps: 17 | - uses: actions/checkout@v3 18 | - name: Set up Python ${{ matrix.python-version }} 19 | uses: actions/setup-python@v4 20 | with: 21 | python-version: ${{ matrix.python-version }} 22 | - name: Install system dependencies 23 | run: | 24 | sudo apt-get update 25 | sudo apt-get install -y libblas-dev liblapack-dev libatlas-base-dev gfortran 26 | - name: Install dependencies 27 | run: | 28 | python -m pip install --upgrade pip 29 | pip install flake8 pytest 30 | pip install -r requirement/include/build.txt 31 | pip install -r requirement/include/test-management.txt 32 | - name: Test with tox 33 | run: |
tox 35 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | *.egg-info/ 3 | .eggs/ 4 | build/ 5 | dist/ 6 | .cache/ 7 | .coverage* 8 | .tox/ 9 | docs/site/ 10 | **/.hypothesis/ 11 | 12 | /.install.*.bash.inc 13 | /.python-version 14 | .DS_Store 15 | 16 | .ipynb_checkpoints/ 17 | venv/ 18 | my_db_config.yaml 19 | database.yaml 20 | database*.yaml 21 | dirtyduck/triage/** 22 | 23 | *~ 24 | **/trained_models/** 25 | **/matrices 26 | 27 | *.swp 28 | src/triage/component/postmodeling/postmodeling_tmp.ipynb 29 | src/triage/component/postmodeling/model_class_testing.py 30 | src/triage/component/postmodeling/model_class_testing.ipynb 31 | src/triage/component/postmodeling/model_groups_testing.ipynb -------------------------------------------------------------------------------- /.python-version.current: -------------------------------------------------------------------------------- 1 | triage-3.9.10 2 | -------------------------------------------------------------------------------- /.pyup.yml: -------------------------------------------------------------------------------- 1 | # autogenerated pyup.io config file 2 | # see https://pyup.io/docs/configuration/ for all available options 3 | 4 | schedule: every month 5 | 6 | requirements: 7 | - requirement/dev.txt 8 | - requirement/main.txt 9 | - requirement/test.txt 10 | - requirement/include/build.txt 11 | - requirement/include/lint.txt 12 | - requirement/include/test-management.txt 13 | -------------------------------------------------------------------------------- /AUTHORS.rst: -------------------------------------------------------------------------------- 1 | ======= 2 | Credits 3 | ======= 4 | 5 | Development Lead 6 | ---------------- 7 | 8 | Center for Data Science and Public Policy 9 | 10 | Contributors 11 | ------------ 12 | 13 | - Adolfo De Unánue 14 | - Andrea Navarrete 15 | - Avishek Kumar 16 | - Benedict Kuester 17 | - Eddie Lin 18 | - Eric Potash 19 | - Erika Salomon 20 | - Hannes Koenig 21 | - Jesse London 22 | - Joe Walsh 23 | - Kit Rodolfa 24 | - Klaus Ackermann 25 | - Matt Bauman 26 | - Rayid Ghani 27 | - Tristan Crockett 28 | -------------------------------------------------------------------------------- /HISTORY.rst: -------------------------------------------------------------------------------- 1 | ======= 2 | History 3 | ======= 4 | 5 | 0.1.0 (2016-10-19) 6 | ------------------ 7 | 8 | * First release on PyPI. 9 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Created by Data Science and Public Policy, University of Chicago 2 | 3 | MIT License 4 | 5 | Copyright (c) 2019 Data Science and Public Policy, University of Chicago 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 
16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include AUTHORS.rst 2 | include CONTRIBUTING.md 3 | include HISTORY.rst 4 | include LICENSE 5 | include README.md 6 | 7 | recursive-include requirement *.txt 8 | recursive-include src alembic.ini 9 | recursive-include src *.sql 10 | recursive-include src/tests * 11 | recursive-include docs *.rst conf.py Makefile make.bat *.jpg *.png *.gif 12 | recursive-include src *.yaml 13 | 14 | recursive-exclude * __pycache__ 15 | recursive-exclude * *.py[co] 16 | -------------------------------------------------------------------------------- /dirtyduck/.dockerignore: -------------------------------------------------------------------------------- 1 | food_db/* 2 | docker-compose.yml 3 | -------------------------------------------------------------------------------- /dirtyduck/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.7" 2 | 3 | services: 4 | food_db: 5 | build: 6 | context: ./food_db 7 | image: dirtyduck/db 8 | container_name: dirtyduck_db 9 | environment: 10 | - POSTGRES_HOST=0.0.0.0 11 | - POSTGRES_USER=food_user 12 | - POSTGRES_PORT=5432 13 | - POSTGRES_PASSWORD=some_password 14 | - POSTGRES_DB=food 15 | volumes: 16 | - db-data:/var/lib/postgresql/data 17 | ports: 18 | - "5434:5432" 19 | 20 | bastion: 21 | build: 22 | context: ..
23 | dockerfile: Dockerfile 24 | target: development 25 | image: dsapp/triage:development 26 | container_name: dirtyduck_bastion 27 | tty: true 28 | environment: 29 | - PS1=\[$$(tput setaf 4)$$(tput bold)[\]\u@$$(tput setaf 2)$$(tput smul)dirtyduck$$(tput rmul)$$(tput setaf 4)$$:\\w]#\[$$(tput sgr0) ]\ 30 | - TRIAGE_IMAGE=dirtyduck 31 | - DATABASE_URL=postgresql://food_user:some_password@dirtyduck_db/food 32 | - TRIAGE_OUTPUT=/triage-output 33 | volumes: 34 | - "../example/dirtyduck:/dirtyduck" 35 | - "../dirtyduck-output:/triage-output" 36 | working_dir: /dirtyduck 37 | 38 | volumes: 39 | db-data: 40 | -------------------------------------------------------------------------------- /dirtyduck/food_db/00_create_extensions.sql: -------------------------------------------------------------------------------- 1 | create extension postgis; 2 | 3 | create extension postgis_raster; 4 | create extension postgis_topology; 5 | create extension postgis_sfcgal; 6 | 7 | 8 | 9 | create extension if not exists fuzzystrmatch; 10 | create extension if not exists unaccent; 11 | create extension if not exists pg_trgm; 12 | create extension if not exists bloom; 13 | 14 | create extension if not exists citext; 15 | 16 | create extension if not exists cube; 17 | 18 | create extension if not exists file_fdw; 19 | create extension if not exists postgres_fdw; 20 | 21 | create extension if not exists earthdistance; 22 | -------------------------------------------------------------------------------- /dirtyduck/food_db/01_create_inspections_table.sql: -------------------------------------------------------------------------------- 1 | create schema if not exists raw; 2 | 3 | drop table if exists raw.inspections; 4 | create table if not exists raw.inspections ( 5 | inspection text not null, 6 | DBA_Name text, 7 | AKA_Name text, 8 | license_Num decimal, 9 | facility_type text, 10 | risk text, 11 | address text, 12 | city text, 13 | state text, 14 | zip text, 15 | date date, 16 | type text, 17 | results text, 18 | violations text, 19 | latitude decimal, 20 | longitude decimal, 21 | location text 22 | ); 23 | 24 | copy raw.inspections from program 'bzcat /tmp/inspections_2014_2017.csv.bz2' HEADER CSV QUOTE '"'; 25 | -------------------------------------------------------------------------------- /dirtyduck/food_db/02_create_cleaned_inspections_table.sql: -------------------------------------------------------------------------------- 1 | create schema if not exists cleaned; 2 | 3 | drop table if exists cleaned.inspections cascade; 4 | 5 | create table cleaned.inspections as ( 6 | with cleaned as ( 7 | select 8 | inspection::integer, 9 | btrim(lower(results)) as result, 10 | license_num::integer, 11 | btrim(lower(dba_name)) as facility, 12 | btrim(lower(aka_name)) as facility_aka, 13 | case when 14 | facility_type is null then 'unknown' 15 | else btrim(lower(facility_type)) 16 | end as facility_type, 17 | lower(substring(risk from '\((.+)\)')) as risk, 18 | btrim(lower(address)) as address, 19 | zip as zip_code, 20 | substring( 21 | btrim(lower(regexp_replace(type, 'liquor', 'task force', 'gi'))) 22 | from 'canvass|task force|complaint|food poisoning|consultation|license|tag removal') as type, 23 | date, 24 | -- point(longitude, latitude) as location 25 | ST_SetSRID(ST_MakePoint(longitude, latitude), 4326)::geography as location -- We use geography so the measurements are in meters 26 | from raw.inspections 27 | where zip is not null -- removing NULL zip codes 28 | ) 29 | 30 | select * from cleaned where type is not 
null 31 | ); 32 | -------------------------------------------------------------------------------- /dirtyduck/food_db/03_create_violations_table.sql: -------------------------------------------------------------------------------- 1 | drop table if exists cleaned.violations cascade; 2 | 3 | create table cleaned.violations as ( 4 | select 5 | inspection::integer, 6 | license_num::integer, 7 | date::date, 8 | btrim(tuple[1]) as code, 9 | lower(btrim(tuple[2])) as description, 10 | lower(btrim(tuple[3])) as comment, 11 | (case 12 | when btrim(tuple[1]) = '' then NULL 13 | when btrim(tuple[1])::int between 1 and 14 then 'critical' -- From the documentation 14 | when btrim(tuple[1])::int between 15 and 29 then 'serious' 15 | else 'minor' 16 | end 17 | ) as severity from 18 | ( 19 | select 20 | inspection, 21 | license_num, 22 | date, 23 | regexp_split_to_array( -- Create an array we will split the code, description, comment 24 | regexp_split_to_table( -- Create a row per each comment we split by | 25 | coalesce( -- If there isn't a violation add '- Comments:' 26 | regexp_replace(violations, '[\n\r]+', '', 'g' ) -- Remove line breaks 27 | , '- Comments:') 28 | , '\|') -- Split the violations 29 | , '(?<=\d+)\.\s*|\s*-\s*Comments:') -- Split each violation in three 30 | -- , '\.\s*|\s*-\s*Comments:') -- Split each violation in three (Use this if your PostgreSQL is kind of old) 31 | as tuple 32 | from raw.inspections 33 | where results in ('Fail', 'Pass', 'Pass w/ Conditions') and license_num is not null 34 | ) as t 35 | ); 36 | -------------------------------------------------------------------------------- /dirtyduck/food_db/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM postgres:12 2 | 3 | ## Installing PostGIS 4 | RUN apt-get -y update \ 5 | && apt-get -y install wget \ 6 | && wget --quiet -O - http://apt.postgresql.org/pub/repos/apt/ACCC4CF8.asc | apt-key add - \ 7 | && apt-get -y update \ 8 | && apt-get -y install postgresql-12-postgis-3 postgis postgresql-12-pgrouting bzip2 9 | 10 | ## Chicago Food Inspections 11 | ## From https://data.cityofchicago.org/api/views/4ijn-s7e5/rows.csv?accessType=DOWNLOAD 12 | ## Only the rows between the years 2014 and 2017 (inclusive) are included 13 | COPY inspections_2014_2017.csv.bz2 /tmp 14 | 15 | ## DB setup 16 | ADD 00_create_extensions.sql /docker-entrypoint-initdb.d/ 17 | ADD 01_create_inspections_table.sql /docker-entrypoint-initdb.d/ 18 | ADD 02_create_cleaned_inspections_table.sql /docker-entrypoint-initdb.d/ 19 | ADD 03_create_violations_table.sql /docker-entrypoint-initdb.d/ 20 | ADD 04_create_semantic_tables.sql /docker-entrypoint-initdb.d/ 21 | 22 | RUN chown postgres:postgres /docker-entrypoint-initdb.d/*.sql 23 | -------------------------------------------------------------------------------- /dirtyduck/food_db/inspections_2014_2017.csv.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/dirtyduck/food_db/inspections_2014_2017.csv.bz2 -------------------------------------------------------------------------------- /docs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/__init__.py -------------------------------------------------------------------------------- /docs/sources/api/audition/auditioner.md:
-------------------------------------------------------------------------------- 1 | The `Auditioner` class is the main entry point for the Audition module. Users pass its constructor a database connection, information about the model groups to be evaluated, and a specification for a filter to prune the worst-performing models. 2 | 3 | Other methods allow users to define more complex selection rules, list selected models, or plot results from the selection process. 4 | 5 | ::: triage.component.audition 6 | options: 7 | show_root_toc_entry: false 8 | group_by_category: true 9 | show_category_heading: true 10 | show_if_no_docstring: true 11 | -------------------------------------------------------------------------------- /docs/sources/api/audition/index.md: -------------------------------------------------------------------------------- 1 | ## Audition Reference 2 | 3 | Audition is the Triage model selection module. It simplifies the process of comparing multiple model groups trained across time. 4 | 5 | Find user-focused documentation for Audition [here](../../audition/audition_intro.md). 6 | 7 | |Page|| 8 | |-|-| 9 | |[Auditioner](auditioner.md)|The Auditioner class is the main entry point for Audition.| 10 | |[Selection Rules](selection_rules.md)|The Audition selection rules implement a range of criteria for identifying best-performing models.| 11 | |[Audition Config](audition-config.md)|Users of the Triage CLI can specify settings for Audition in an Audition config file.| 12 | |[Database Dependencies](database-dependencies.md)|The database schema from which Audition reads model training results.| 13 | 14 | -------------------------------------------------------------------------------- /docs/sources/api/audition/selection_rules.md: -------------------------------------------------------------------------------- 1 | ## Selection Rules 2 | 3 | Triage uses *selection rules* to compare the performance of trained model groups over time, and select a model group for future predictions. A selection rule tries to predict the best-performing model group in some train/test period, based on the historical performance of each model group on some metric. 4 | 5 | For example, a simple selection rule might predict that the best-performing model group during one train/test period will perform best in the following period. 6 | 7 | A selection rule can be evaluated by calculating its *regret*, or the difference between the performance of its selected model group and the best-performing model group in some period. 8 | 9 | Triage supports 8 model selection rules. Each is represented internally by one of the following functions: 10 | 11 | ::: triage.component.audition.selection_rules 12 | options: 13 | heading_level: 3 14 | show_root_toc_entry: false 15 | selection: 16 | filters: 17 | - "!^BoundSelectionRule" 18 | - "!^_" 19 | 20 | ## RuleMakers 21 | 22 | Triage uses `RuleMaker` classes to conveniently format the parameter grids accepted by `make_selection_rule_grid`. Each type of `RuleMaker` class holds methods that build parameter grids for a subset of the available selection rules. 23 | 24 | The arguments of each `add_rule_` method map to the arguments of the corresponding model selection function.
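For example, a grid that combines a couple of simple rules with a random baseline might be built like this. (This is a minimal sketch: the `add_rule_` method names and arguments shown are assumptions based on the selection-rule functions above, and the generated reference below is authoritative.)

```python
from triage.component.audition.rules_maker import (
    SimpleRuleMaker,
    RandomGroupRuleMaker,
    create_selection_grid,
)

# Rules that pick the model group that did best on a metric, either in
# the most recent train/test period or on average across periods.
simple = SimpleRuleMaker()
simple.add_rule_best_current_value(metric="precision@", parameter="100_abs")
simple.add_rule_best_average_value(metric="precision@", parameter="100_abs")

# A baseline that picks a model group at random, useful for comparison.
random_baseline = RandomGroupRuleMaker(n=1)

# Format the combined parameter grid expected by make_selection_rule_grid.
grid = create_selection_grid(simple, random_baseline)
```

The resulting grid can then be registered with an `Auditioner` (via its `register_selection_rule_grid` method) to compute each rule's regret over time.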
25 | 26 | 27 | ::: triage.component.audition.rules_maker 28 | options: 29 | show_if_no_docstring: true 30 | show_category_heading: false 31 | show_root_heading: false 32 | show_root_toc_entry: false 33 | heading_level: 3 34 | selection: 35 | members: 36 | - SimpleRuleMaker 37 | - TwoMetricsRuleMaker 38 | - RandomGroupRuleMaker 39 | 40 | ## Selection Grid 41 | 42 | ::: triage.component.audition.selection_rule_grid 43 | options: 44 | heading_level: 3 45 | show_root_toc_entry: false -------------------------------------------------------------------------------- /docs/sources/api/timechop/index.md: -------------------------------------------------------------------------------- 1 | ## Timechop Reference 2 | 3 | Timechop handles temporal logic in the Triage Experiment pipeline. 4 | 5 | 6 | |Page|| 7 | |-|-| 8 | |[Timechop](timechop.md)|The Timechop class is the main entry point for Timechop.| 9 | |[Plotting](plotting.md)|Tools for visualizing Timechop's temporal splits.| -------------------------------------------------------------------------------- /docs/sources/api/timechop/plotting.md: -------------------------------------------------------------------------------- 1 | ::: triage.component.timechop.plotting.visualize_chops 2 | options: 3 | show_root_toc_entry: False 4 | group_by_category: True 5 | show_category_heading: True 6 | show_if_no_docstring: True 7 | -------------------------------------------------------------------------------- /docs/sources/api/timechop/timechop.md: -------------------------------------------------------------------------------- 1 | ::: triage.component.timechop.timechop 2 | options: 3 | show_root_toc_entry: False 4 | group_by_category: True 5 | show_category_heading: True 6 | show_if_no_docstring: True -------------------------------------------------------------------------------- /docs/sources/audition/images/sanjose-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/audition/images/sanjose-2.png -------------------------------------------------------------------------------- /docs/sources/db.md: -------------------------------------------------------------------------------- 1 | # Triage database provisioner 2 | 3 | This document explains the purpose and behavior of the Triage database provisioner, accessed from the Triage CLI. It is optional and only intended for use if you don't have an existing Postgres database to use for Triage. 4 | 5 | The Triage database provisioner is just a single command: 6 | 7 | `triage db up` 8 | 9 | This command attempts to use Docker to spawn a new Postgres 12 database. If successful, it will prompt you for a password for the database user, and populate the connection information in `database.yaml` *in the directory where you ran it*. The next time you run `triage db up`, it will look for the existing container and reuse it. 10 | 11 | At this point, you can use the database either from Triage or anything else that can connect to Postgres (e.g. [psql](https://www.postgresql.org/docs/13/app-psql.html) or [dbeaver](https://dbeaver.io/)), using the credentials in the autogenerated `database.yaml`. 12 | 13 | ## Troubleshooting 14 | 15 | ### No Docker 16 | The command does require some version of Docker. We recommend getting it from the [official Docker downloads page](https://docs.docker.com/get-docker/).
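To confirm that Docker is installed and its daemon is reachable before retrying, you can run:

```sh
docker --version   # prints the client version if Docker is installed
docker info        # errors out if the Docker daemon is not running
```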
17 | 18 | ### Can't log in 19 | Because of the way Docker volumes work, if you manually remove the Docker container created by `triage db up`, the volume will still be around. This is usually fine, but the superuser credential information will persist as well, which means the next time you spawn the database, *the Postgres server will not take the new credential information into account*. Under normal usage (simply calling `triage db up` and never removing the container), you will never run into this situation. But if you do, and you would like to use a new username/password, you will have to remove the volume before recreating the container. This can be done with `docker volume rm triage-db-data`. This will also remove all of the stored data in Postgres, so beware! 20 | -------------------------------------------------------------------------------- /docs/sources/dirtyduck/.swp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/.swp -------------------------------------------------------------------------------- /docs/sources/dirtyduck/AUTHORS: -------------------------------------------------------------------------------- 1 | Adolfo De Unanue 2 | Joseph Walsh 3 | Hans Koening 4 | Arthi Ramachandran 5 | Iván Higuera 6 | Kit Rodolfa 7 | -------------------------------------------------------------------------------- /docs/sources/dirtyduck/choose_your_own_adventure.md: -------------------------------------------------------------------------------- 1 | # How to use this tutorial 2 | 3 | 4 | - You are interested in *learning* how to use `triage` and have a *lot* of time: 5 | - [Problem description](problem_description.md) 6 | - [Infrastructure](infrastructure.md) 7 | - [Data preparation](data_preparation.md) 8 | - [Resource prioritization](inspections.md) 9 | - [Early warning systems](eis.md) 10 | - [A deeper look into triage](triage_intro.md) 11 | - [Scaling up](aws_batch.md) 12 | - You want to know about `triage` 13 | - [A deeper look into triage](triage_intro.md) 14 | - [Model governance](ml_governance.md) 15 | - [Model selection](../audition/audition_intro.md) 16 | - You want to learn about case studies 17 | - [Quick setup](for_the_impatient.md) 18 | - [Resource prioritization](inspections.md) and/or [Early warning systems](eis.md) 19 | - You *already* know `triage` but want to use it on the cloud 20 | - [Scaling up](aws_batch.md) 21 | - You *just* want to use the database for your own purposes 22 | - [Quick setup](for_the_impatient.md) 23 | -------------------------------------------------------------------------------- /docs/sources/dirtyduck/for_the_impatient.md: -------------------------------------------------------------------------------- 1 | # For the impatient 2 | 3 | If you want to skip all the cleaning and transformations and dive directly into `triage`, you can execute the following *inside bastion*: 4 | 5 | ```sh 6 | psql ${DATABASE_URL} -c "\copy raw.inspections from program 'curl \"https://data.cityofchicago.org/api/views/4ijn-s7e5/rows.csv?accessType=DOWNLOAD\"' HEADER CSV" 7 | 8 | psql ${DATABASE_URL} < /sql/create_cleaned_inspections_table.sql 9 | 10 | psql ${DATABASE_URL} < /sql/create_violations_table.sql 11 | 12 | psql ${DATABASE_URL} < /sql/create_semantic_tables.sql 13 | ``` 14 | 15 | If everything works, you should end up with two new schemas: `cleaned` and `semantic`.
16 | 17 | You can check that (from `psql`) with 18 | 19 | ```sql 20 | \dn 21 | ``` 22 | 23 | which should list the schemas: 24 | 25 | | Name | Owner | 26 | |----------|------------| 27 | | cleaned | food\_user | 28 | | postgis | food\_user | 29 | | public | postgres | 30 | | raw | food\_user | 31 | | semantic | food\_user | 32 | -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/AWS_Batch_Architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/AWS_Batch_Architecture.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/EDA/facilities_inspected_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/EDA/facilities_inspected_over_time.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/EDA/facilities_inspections_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/EDA/facilities_inspections_over_time.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/EDA/facilities_with_failed_inspections_severe_violations_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/EDA/facilities_with_failed_inspections_severe_violations_over_time.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/EDA/facilities_with_inspections_failed_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/EDA/facilities_with_inspections_failed_over_time.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/EDA/failed_inspections_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/EDA/failed_inspections_over_time.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/EDA/failed_inspections_severe_violations_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/EDA/failed_inspections_severe_violations_over_time.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/EDA/inspections_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/EDA/inspections_over_time.png
-------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/audition/eis/distance_from_best_precision@10_pct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/audition/eis/distance_from_best_precision@10_pct.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/audition/eis/metric_over_time_precision@10_pct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/audition/eis/metric_over_time_precision@10_pct.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/audition/eis/precision@10_pct_next_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/audition/eis/precision@10_pct_next_time.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/audition/eis/regret_distance_from_best_rules_precision@10_pct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/audition/eis/regret_distance_from_best_rules_precision@10_pct.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/audition/eis/regret_over_time_precision@10_pct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/audition/eis/regret_over_time_precision@10_pct.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/audition/inspections/distance_from_best_precision@10_pct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/audition/inspections/distance_from_best_precision@10_pct.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/audition/inspections/metric_over_time_precision@10_pct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/audition/inspections/metric_over_time_precision@10_pct.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/audition/inspections/precision@10_pct_next_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/audition/inspections/precision@10_pct_next_time.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/audition/inspections/regret_distance_from_best_rules_precision@10_pct.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/audition/inspections/regret_distance_from_best_rules_precision@10_pct.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/audition/inspections/regret_over_time_precision@10_pct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/audition/inspections/regret_over_time_precision@10_pct.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/data_road.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/data_road.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/outcomes-eis.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/outcomes-eis.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/outcomes-inspections.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/outcomes-inspections.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/postmodeling/eis_jaccard_on_lists_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/postmodeling/eis_jaccard_on_lists_over_time.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/postmodeling/eis_mg_prec_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/postmodeling/eis_mg_prec_over_time.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/postmodeling/eis_mg_recall_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/postmodeling/eis_mg_recall_over_time.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/postmodeling/eis_model_group_64_feature_group_importances.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/postmodeling/eis_model_group_64_feature_group_importances.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/postmodeling/eis_model_group_64_feature_importances.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/postmodeling/eis_model_group_64_feature_importances.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/postmodeling/eis_postmodeling_config.yaml: -------------------------------------------------------------------------------- 1 | # Postmodeling Configuration File 2 | 3 | project_path: '/triage' # Project path defined in triage with matrices and models 4 | audition_output_path: '/triage/audition/eis/results_model_group_ids.json' 5 | 6 | thresholds: # Thresholds for defining positive predictions 7 | rank_abs: [50, 100, 250] 8 | rank_pct: [5, 10, 25] 9 | 10 | baseline_query: | # SQL query for defining a baseline for comparison in plots. It needs a metric and parameter 11 | select g.model_group_id, 12 | m.model_id, 13 | extract('year' from m.evaluation_end_time) as as_of_date_year, 14 | m.metric, 15 | m.parameter, 16 | m.stochastic_value, 17 | m.num_labeled_examples, 18 | m.num_labeled_above_threshold, 19 | m.num_positive_labels 20 | from test_results.evaluations m 21 | left join model_metadata.models g 22 | using(model_id) 23 | where g.model_group_id = 20 24 | and metric = 'precision@' 25 | and parameter = '10_pct' 26 | 27 | max_depth_error_tree: 5 # For error trees, how deep should the decision trees go? 28 | n_features_plots: 10 # Number of features for importances 29 | figsize: [12, 12] # Default size for plots 30 | fontsize: 20 # Default fontsize for plots 31 | -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/postmodeling/inspection_jaccard_on_lists_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/postmodeling/inspection_jaccard_on_lists_over_time.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/postmodeling/inspection_mg_prec_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/postmodeling/inspection_mg_prec_over_time.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/postmodeling/inspection_mg_recall_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/postmodeling/inspection_mg_recall_over_time.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/postmodeling/inspection_model_group_39_model_125_feature_group_importances.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/postmodeling/inspection_model_group_39_model_125_feature_group_importances.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/postmodeling/inspection_model_group_39_model_125_feature_importances.png:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/postmodeling/inspection_model_group_39_model_125_feature_importances.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/postmodeling/inspection_model_group_39_model_125_rayid_curve.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/postmodeling/inspection_model_group_39_model_125_rayid_curve.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/postmodeling/inspection_postmodeling_config.yaml: -------------------------------------------------------------------------------- 1 | # Postmodeling Configuration File 2 | 3 | project_path: '/triage' # Project path defined in triage with matrices and models 4 | 5 | model_group_id: 6 | - 39 7 | - 9 8 | - 29 9 | - 30 10 | 11 | thresholds: # Thresholds for defining positive predictions 12 | rank_abs: [50, 100, 250] 13 | rank_pct: [5, 10, 25] 14 | 15 | baseline_query: | # SQL query for defining a baseline for comparison in plots. It needs a metric and parameter 16 | select g.model_group_id, 17 | m.model_id, 18 | extract('year' from m.evaluation_end_time) as as_of_date_year, 19 | m.metric, 20 | m.parameter, 21 | m.stochastic_value, 22 | m.num_labeled_examples, 23 | m.num_labeled_above_threshold, 24 | m.num_positive_labels 25 | from test_results.evaluations as m 26 | left join model_metadata.models as g 27 | using(model_id) 28 | where g.model_group_id = 1 29 | and metric = 'precision@' 30 | and parameter = '15_pct' 31 | 32 | max_depth_error_tree: 5 # For error trees, how deep should the decision trees go?
33 | n_features_plots: 10 # Number of features for importances 34 | figsize: [12, 12] # Default size for plots 35 | fontsize: 20 # Default fontsize for plots 36 | -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/quickstart.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/quickstart.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/rolling-origin.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/rolling-origin.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/sanjose-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/sanjose-2.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/timechop/timechop_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/timechop/timechop_1.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/timechop/timechop_10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/timechop/timechop_10.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/timechop/timechop_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/timechop/timechop_2.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/timechop/timechop_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/timechop/timechop_3.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/timechop/timechop_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/timechop/timechop_4.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/timechop/timechop_5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/timechop/timechop_5.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/timechop/timechop_6.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/timechop/timechop_6.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/timechop/timechop_7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/timechop/timechop_7.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/timechop/timechop_8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/timechop/timechop_8.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/timechop/timechop_9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/timechop/timechop_9.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/triage/distance_from_best_precision@10_pct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/triage/distance_from_best_precision@10_pct.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/triage/eis_01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/triage/eis_01.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/triage/inspections_baseline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/triage/inspections_baseline.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/triage/inspections_dt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/triage/inspections_dt.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/triage/inspections_label_failed_01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/triage/inspections_label_failed_01.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/triage/metric_over_time_precision@10_pct.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/triage/metric_over_time_precision@10_pct.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/triage/precision@10_pct_next_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/triage/precision@10_pct_next_time.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/triage/regret_distance_from_best_rules_precision@10_pct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/triage/regret_distance_from_best_rules_precision@10_pct.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/triage/regret_over_time_precision@10_pct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/triage/regret_over_time_precision@10_pct.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/triage/results_model_group_ids.json: -------------------------------------------------------------------------------- 1 | {"best_current_value_precision@_10_pct": [7, 6, 5], "best_average_value_precision@_10_pct": [6, 7, 4], "lowest_metric_variance_precision@_10_pct": [1, 2, 3], "most_frequent_best_dist_precision@_10_pct_0.05": [6, 4, 5]} -------------------------------------------------------------------------------- /docs/sources/dirtyduck/images/triage/simple_test_skeleton.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/dirtyduck/images/triage/simple_test_skeleton.png -------------------------------------------------------------------------------- /docs/sources/dirtyduck/who_is_this_tutorial_for.md: -------------------------------------------------------------------------------- 1 | # Who is this tutorial for? 2 | 3 | We created this tutorial with two roles in mind: 4 | 5 | 6 | - A data scientist/ML practitioner who wants to focus 7 | on the problem at hand, not on the nitty-gritty details of 8 | how to configure and set up a machine learning pipeline, model 9 | governance, model selection, etc. 10 | 11 | - A policy maker with some technical background who wants to 12 | learn how to pose their policy problem as a machine learning 13 | problem. 14 | -------------------------------------------------------------------------------- /docs/sources/experiments/feature-testing.md: -------------------------------------------------------------------------------- 1 | # Testing a Feature Aggregation 2 | 3 | Developing features for Triage experiments can be a daunting task. There are a lot of things to configure, a small amount of configuration can result in a ton of SQL, and it can take a long time to validate your feature configuration in the context of an Experiment being run on real data.
4 | 5 | To speed up the process of iterating on features, you can run a list of feature aggregations, without imputation, on just one as-of-date. This functionality can be accessed through the `triage` command line tool or called directly from code (say, in a Jupyter notebook) using the `FeatureGenerator` component. 6 | 7 | ## Using Triage CLI 8 | ![triage featuretest cli help screen](featuretest-cli.png) 9 | 10 | The command-line interface for testing features takes in two arguments: 11 | - An experiment config file. Refer to the [example_experiment_config.yaml](https://github.com/dssg/triage/blob/master/example/config/experiment.yaml)'s `feature_aggregations` section. It consists of a YAML list, with one or more feature_aggregation rows present. 12 | - An as-of-date. This should be in the format `2016-01-01`. 13 | 14 | Example: `triage experiment featuretest example/config/experiment.yaml 2016-01-01` 15 | 16 | All given feature aggregations will be processed for the given date. You will see a bunch of queries pass by in your terminal, populating tables in the `features_test` schema, which you can inspect afterwards. 17 | 18 | ![triage feature test result](featuretest-result.png) 19 | 20 | ## Using Python Code 21 | If you'd like to call this from a notebook or from any other Python code, the arguments look similar but are a bit different. You have to supply your own SQLAlchemy database engine to create a `FeatureGenerator` object, and then call the `create_features_before_imputation` method with your feature config as a list of dictionaries, along with an as-of-date as a string. Make sure your logging level is set to INFO if you want to see all of the queries. 22 | 23 | ```python 24 | from triage.component.architect.feature_generators import FeatureGenerator 25 | from triage.util.db import create_engine 26 | import logging 27 | 28 | logging.basicConfig(level=logging.INFO) 29 | 30 | # create a db_engine 31 | db_url = 'your db url here' 32 | db_engine = create_engine(db_url) 33 | 34 | feature_config = [{ 35 | 'prefix': 'aprefix', 36 | 'aggregates': [ 37 | { 38 | 'quantity': 'quantity_one', 39 | 'metrics': ['sum', 'count'], 40 | }, 41 | ], 42 | 'categoricals': [ 43 | { 44 | 'column': 'cat_one', 45 | 'choices': ['good', 'bad'], 46 | 'metrics': ['sum'] 47 | }, 48 | ], 49 | 'intervals': ['all'], 50 | 'knowledge_date_column': 'knowledge_date', 51 | 'from_obj': 'data' 52 | }] 53 | 54 | FeatureGenerator(db_engine, 'features_test').create_features_before_imputation( 55 | feature_aggregation_config=feature_config, 56 | feature_dates=['2016-01-01'] 57 | ) 58 | ``` 59 | -------------------------------------------------------------------------------- /docs/sources/experiments/features.md: -------------------------------------------------------------------------------- 1 | # Feature Generation Recipe Book 2 | 3 | This document is a collection of 'collate' aggregate features that we have found useful to create in Triage but that may not be apparent at first. 4 | 5 | For an introduction to feature generation in Triage, refer to [Dirty Duck Feature Generation](https://dssg.github.io/dirtyduck/#orgaae2e66). 6 | 7 | ## Age 8 | 9 | You can calculate age from a date of birth column using the `collate_date` special variable. This variable is marked as a placeholder in the feature quantity input, but is replaced with each as-of-date when features are being calculated. Combined with the Postgres `age` function, this calculates a person's age at each as-of-date as a feature.
10 | 11 | For this example, let's assume you have a column called 'dob' that is a timestamp (or anything that can be cast to a date) in your source table. The `feature_aggregation`'s quantity would be: 12 | 13 | ```EXTRACT(YEAR FROM AGE('{collate_date}'::DATE, dob::DATE))``` 14 | 15 | If Triage is calculating this for the as-of-date '2016-01-01', it will internally expand the `collate_date` out to: 16 | ```EXTRACT(YEAR FROM AGE('2016-01-01'::DATE, dob::DATE))``` 17 | 18 | In context, a feature aggregate that uses age may look more like: 19 | 20 | ```yaml 21 | aggregates: 22 | - # age in years 23 | quantity: 24 | age: "EXTRACT(YEAR FROM AGE('{collate_date}'::DATE, dob::DATE))" 25 | metrics: ['max'] 26 | ``` 27 | 28 | 29 | Here, we call the feature 'age', and since everything in collate is defined as an aggregate, we pick 'max'. Any records for the same person and as-of-date should have the same 'dob', so many aggregates would arrive at the same answer (e.g. 'min', 'avg'); in these cases, 'max' is the standard aggregate metric of choice in Triage. 30 | -------------------------------------------------------------------------------- /docs/sources/experiments/featuretest-cli.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/experiments/featuretest-cli.png -------------------------------------------------------------------------------- /docs/sources/experiments/featuretest-result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/experiments/featuretest-result.png -------------------------------------------------------------------------------- /docs/sources/experiments/prediction-ranking.md: -------------------------------------------------------------------------------- 1 | # Prediction Ranking 2 | 3 | The predictions tables in the `train_results` and `test_results` 4 | schemas contain several different flavors of rankings, covering 5 | absolute vs percentile ranking and whether or not ties exist. 6 | 7 | ## Ranking columns 8 | 9 | | Column name | Behavior | 10 | | ----------- | ------- | 11 | | rank_abs_with_ties | Absolute ranking, with ties. Ranks will skip after a set of ties, so if two entities are tied at rank 3, the next entity after them will have rank 5. | 12 | | rank_pct_with_ties | Percentile ranking, with ties. Percentiles will skip after a set of ties, so if two entities out of ten are tied at 0.1 (tenth percentile), the next entity after them will have 0.3 (thirtieth percentile). At most five decimal places. | 13 | | rank_abs_no_ties | Absolute ranking, with no ties. Ties are broken according to a configured choice: 'best', 'worst', or 'random', which is recorded in the `prediction_metadata` table | 14 | | rank_pct_no_ties | Percentile ranking, with no ties. Ties are broken according to a configured choice: 'best', 'worst', or 'random', which is recorded in the `prediction_metadata` table. At most five decimal places. | 15 | 16 | 17 | ## Viewing prediction metadata 18 | 19 | The `prediction_metadata` table contains information about how ties 20 | were broken. There is one row per model/matrix combination. For each 21 | model and matrix, it records: 22 | 23 | - `tiebreaker_ordering` - The tiebreaker ordering rule (e.g. 'random', 24 | 'best', 'worst') used for the corresponding predictions.
25 | - `random_seed` - The random seed, if 'random' was the ordering 26 | used; otherwise None 27 | - `predictions_saved` - Whether or not predictions were saved. If it's 28 | false, you shouldn't expect to find any predictions, but the row is 29 | inserted as a record that the prediction was performed. 30 | 31 | There is one `prediction_metadata` table in each of the 32 | `train_results`, `test_results` schemas (in other words, wherever 33 | there is a companion `predictions` table). 34 | 35 | 36 | 37 | ## Subsequent runs 38 | 39 | If you run Triage Experiments with `replace=False`, and you change 40 | nothing except for the `rank_tiebreaker` in experiment config, ranking 41 | will be redone and the row in `prediction_metadata` updated. You don't 42 | have to run a full experiment if that's all you want to do; you could 43 | follow the directions for backfilling ranks above, which will redo the 44 | ranking for an individual model/matrix pair. However, changing the 45 | `rank_tiebreaker` in experiment config and re-running the experiment 46 | is a handy way of redoing all of them at once. 47 | -------------------------------------------------------------------------------- /docs/sources/experiments/temporal_config_graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/experiments/temporal_config_graph.png -------------------------------------------------------------------------------- /docs/sources/experiments/timechops.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/experiments/timechops.png -------------------------------------------------------------------------------- /docs/sources/experiments/upgrade-to-v6.md: -------------------------------------------------------------------------------- 1 | # Upgrading your experiment configuration to v6 2 | 3 | 4 | This document details the steps needed to update a triage v5 configuration to 5 | v6, mimicking the old behavior. 6 | 7 | Experiment configuration v6 includes only one change from v5: When specifying 8 | the `cohort_config`, if a `query` is given, the `{as_of_date}` is no longer 9 | quoted or cast by Triage. Instead, the user must perform the quoting and 10 | casting, as is done already for the `label_config`.
11 | 12 | Old: 13 | 14 | ``` 15 | cohort_config: 16 | query: | 17 | SELECT DISTINCT entity_id 18 | FROM semantic.events 19 | WHERE event = 'booking' 20 | AND startdt <@ daterange(({as_of_date} - '3 years'::interval)::date, {as_of_date}) 21 | AND enddt < {as_of_date} 22 | LIMIT 100 23 | name: 'booking_last_3_years_limit_100' 24 | ``` 25 | 26 | New: 27 | 28 | ``` 29 | cohort_config: 30 | query: | 31 | SELECT DISTINCT entity_id 32 | FROM semantic.events 33 | WHERE event = 'booking' 34 | AND startdt <@ daterange(('{as_of_date}'::date - '3 years'::interval)::date, '{as_of_date}'::date) 35 | AND enddt < '{as_of_date}' 36 | LIMIT 100 37 | name: 'booking_last_3_years_limit_100' 38 | ``` 39 | 40 | ## Upgrading the experiment config version 41 | 42 | At this point, you should be able to bump the top-level experiment config version to v6: 43 | 44 | Old: 45 | 46 | ``` 47 | config_version: 'v5' 48 | ``` 49 | 50 | New: 51 | 52 | ``` 53 | config_version: 'v6' 54 | ``` 55 | 56 | -------------------------------------------------------------------------------- /docs/sources/experiments/upgrade-to-v7.md: -------------------------------------------------------------------------------- 1 | # Upgrading your experiment configuration to v7 2 | 3 | 4 | This document details the steps needed to update a triage v6 configuration to 5 | v7, mimicking the old behavior. 6 | 7 | Experiment configuration v7 includes only one change from v6: the addition of a mandatory `random_seed`, which is set at the beginning of the experiment and affects all subsequent random numbers. It is expected to be an integer. 8 | 9 | Old: 10 | ```yaml 11 | 12 | config_version: 'v6' 13 | 14 | # EXPERIMENT METADATA 15 | ``` 16 | 17 | New: 18 | ```yaml 19 | 20 | config_version: 'v7' 21 | 22 | # EXPERIMENT METADATA 23 | # random_seed will be set in Python at the beginning of the experiment and 24 | # affect the generation of all model seeds 25 | random_seed: 23895478 26 | ``` 27 | -------------------------------------------------------------------------------- /docs/sources/experiments/upgrade-to-v8.md: -------------------------------------------------------------------------------- 1 | # Upgrading your experiment configuration to v8 2 | 3 | 4 | This document details the steps needed to update a triage v7 configuration to 5 | v8, mimicking the old behavior. 6 | 7 | Experiment configuration v8 includes only one change from v7: the `groups` key is no longer supported in the feature configuration (all features must be grouped only at the `entity_id` level).
8 | 9 | Old: 10 | ```yaml 11 | 12 | config_version: 'v7' 13 | 14 | # FEATURE GENERATION 15 | feature_aggregations: 16 | - 17 | prefix: 'inspections' 18 | from_obj: 'semantic.events' 19 | knowledge_date_column: 'date' 20 | 21 | aggregates_imputation: 22 | count: 23 | type: 'zero_noflag' 24 | 25 | aggregates: 26 | - 27 | quantity: 28 | total: "*" 29 | metrics: 30 | - 'count' 31 | 32 | intervals: ['all'] 33 | 34 | groups: 35 | - 'entity_id' 36 | ``` 37 | 38 | New: 39 | ```yaml 40 | 41 | config_version: 'v8' 42 | 43 | # FEATURE GENERATION 44 | feature_aggregations: 45 | - 46 | prefix: 'inspections' 47 | from_obj: 'semantic.events' 48 | knowledge_date_column: 'date' 49 | 50 | aggregates_imputation: 51 | count: 52 | type: 'zero_noflag' 53 | 54 | aggregates: 55 | - 56 | quantity: 57 | total: "*" 58 | metrics: 59 | - 'count' 60 | 61 | intervals: ['all'] 62 | ``` 63 | -------------------------------------------------------------------------------- /docs/sources/index.md: -------------------------------------------------------------------------------- 1 | # Triage 2 | 3 | [![Build Status](https://travis-ci.org/dssg/triage.svg?branch=master)](https://travis-ci.org/dssg/triage) 4 | [![codecov](https://codecov.io/gh/dssg/triage/branch/master/graph/badge.svg)](https://codecov.io/gh/dssg/triage) 5 | [![codeclimate](https://codeclimate.com/github/dssg/triage.png)](https://codeclimate.com/github/dssg/triage) 6 | 7 | 8 | ## What is Triage? 9 | 10 | Triage is an open source machine learning toolkit to help data scientists, machine learning developers, and analysts quickly prototype, build and evaluate end-to-end predictive risk modeling systems for public policy and social good problems. 11 | 12 | While many tools (sklearn, keras, pytorch, etc.) exist to build ML models, an end-to-end project requires a lot more than just building models. Developing AI/ML/data science systems requires making many design decisions that need to match how the system is going to be deployed and used. These choices then get turned into modeling choices and code. Triage lets you focus on the problem you’re solving and guides you through design choices you need to make at each step of the machine learning pipeline. 13 | 14 | ## How to get started with Triage? 15 | 16 | ### [Go through a quick online tutorial with sample data (no setup required)](https://colab.research.google.com/github/dssg/triage/blob/master/example/colab/colab_triage.ipynb) 17 | 18 | ### [Go through a more in-depth tutorial with sample data](dirtyduck/index.md) 19 | 20 | ### [Get started with your own project and data](quickstart.md) 21 | 22 | 23 | ## Background 24 | 25 | Triage was initially developed at the University of Chicago's [Center For Data Science and Public Policy](http://dsapp.uchicago.edu) and is now being maintained and extended at Carnegie Mellon University. 26 | 27 | -------------------------------------------------------------------------------- /docs/sources/postmodeling/postmodeling-config.md: -------------------------------------------------------------------------------- 1 | ## Postmodeling Configuration 2 | 3 | The Triage Postmodeling module is controlled by two config files: `postmodeling_config.yaml` and `postmodeling_crosstabs.yaml`. 4 | 5 | ### Postmodeling Configuration File 6 | Configuration for the Triage Postmodeling module. An example `postmodeling_config.yaml` file can be found [here](https://github.com/dssg/triage/blob/master/example/config/postmodeling_config.yaml). 
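7 | 8 | A minimal sketch of the file's overall shape (the values here are illustrative, not recommended defaults, and `baseline_query` is omitted; see the linked example file for a complete version; each key is described below): 9 | 10 | ```yaml 11 | project_path: '/path/to/triage/output' 12 | audition_output_path: 'results_model_group_ids.json' 13 | model_group_id: [19, 43] # optional if an audition_output_path is given 14 | thresholds: 15 | rank_abs: [50] 16 | rank_pct: [10] 17 | max_depth_error_tree: 5 18 | n_features_plots: 10 19 | figsize: [12, 12] 20 | fontsize: 20 21 | ```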
22 | 23 | - `project_path`: Project path defined in triage with matrices and models 24 | - `audition_output_path`: Audition output path 25 | - `model_group_id`: List of model_group_ids [optional if an `audition_output_path` is given] 26 | - `thresholds`: Thresholds for defining positive predictions 27 | - `baseline_query`: SQL query for defining a baseline for comparison in plots. It needs a metric and parameter 28 | - `max_depth_error_tree`: For error trees, how deep should the decision trees go? 29 | - `n_features_plots`: Number of features for importances 30 | - `figsize`: Default size for plots 31 | - `fontsize`: Default fontsize for plots 32 | 33 | 34 | ### Postmodeling Crosstabs Configuration File 35 | Configuration for crosstabs in Triage's Postmodeling module. An example `postmodeling_crosstabs.yaml` file can be found [here](https://github.com/dssg/triage/blob/master/example/config/postmodeling_crosstabs.yaml). 36 | 37 | - `output`: Define the schema and table for crosstabs 38 | - `thresholds`: Thresholds for defining positive predictions 39 | - `entity_id_list`: (optional) a list of `entity_ids` to subset the crosstabs analysis on 40 | - `models_list_query`: SQL query for getting `model_id`s 41 | - `as_of_dates_query`: SQL query for getting `as_of_date`s 42 | - `models_dates_join_query`: don't change the default query unless strictly necessary. It is just validating pairs of (`model_id`, `as_of_date`) in a predictions table 43 | - `features_query`: the features query must join `models_dates_join_query` with one or more features tables using `as_of_date` 44 | - `predictions_query`: the predictions query must return `model_id`, `as_of_date`, `entity_id`, `score`, `label_value`, `rank_abs` and `rank_pct`. It must join `models_dates_join_query` using both `model_id` and `as_of_date`. 45 | 46 | -------------------------------------------------------------------------------- /docs/sources/postmodeling/postmodeling_general_flow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/docs/sources/postmodeling/postmodeling_general_flow.png -------------------------------------------------------------------------------- /docs/sources/triage_docs.css: -------------------------------------------------------------------------------- 1 | /* Indents and adds a border to lower headings. */ 2 | div.doc-contents:not(.first) { 3 | padding-left: 25px; 4 | border-left: 4px solid rgb(230, 230, 230); 5 | margin-bottom: 80px; 6 | } 7 | 8 | /* Don't capitalize names. */ 9 | h5.doc-heading { 10 | text-transform: none !important; 11 | } 12 | 13 | /* Don't use vertical space on hidden ToC entries. */ 14 | h6.hidden-toc { 15 | margin: 0 !important; 16 | position: relative; 17 | top: -70px; 18 | } 19 | h6.hidden-toc::before { 20 | margin-top: 0 !important; 21 | padding-top: 0 !important; 22 | } 23 | 24 | /* Don't show permalink of hidden ToC entries. 25 | h6.hidden-toc a.headerlink { 26 | display: none; 27 | } */ 28 | 29 | /* Avoid breaking parameter names, etc. in table cells. */ 30 | td code { 31 | word-break: normal !important; 32 | } 33 | 34 | /* For pieces of Markdown rendered in table cells.
*/ 35 | td p { 36 | margin-top: 0 !important; 37 | margin-bottom: 0 !important; 38 | } 39 | -------------------------------------------------------------------------------- /docs/update_docs.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | 3 | from md_autogen import MarkdownAPIGenerator 4 | from md_autogen import to_md_file 5 | 6 | from triage import experiments 7 | 8 | 9 | def generate_api_docs(): 10 | modules = [ 11 | experiments.base, 12 | experiments.singlethreaded, 13 | experiments.multicore 14 | ] 15 | 16 | md_gen = MarkdownAPIGenerator("triage", "https://github.com/dssg/triage/tree/master") 17 | for m in modules: 18 | md_string = md_gen.module2md(m) 19 | to_md_file(md_string, m.__name__, "docs/sources") 20 | 21 | 22 | def update_index_md(): 23 | shutil.copyfile('README.md', 'docs/sources/index.md') 24 | 25 | 26 | def copy_templates(): 27 | shutil.rmtree('docs/sources', ignore_errors=True) 28 | shutil.copytree('docs/templates', 'docs/sources') 29 | 30 | 31 | if __name__ == "__main__": 32 | copy_templates() 33 | update_index_md() 34 | generate_api_docs() 35 | -------------------------------------------------------------------------------- /example/aws_batch/aws_env.example: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | PROJECT_NAME=dirtyduck 4 | TRIAGE_VERSION=3.3.0 5 | ENV=development 6 | AWS_REGISTRY={your-ecr-registry} 7 | AWS_JOB_QUEUE={your-job-queue} 8 | POSTGRES_DB={postgresql://user:password@db_server/dbname} 9 | S3_BUCKET={your-bucket} 10 | -------------------------------------------------------------------------------- /example/aws_batch/credentials.filter.example: -------------------------------------------------------------------------------- 1 | { 2 | "environment": [ 3 | { 4 | "name": "AWS_ACCESS_KEY_ID", 5 | "value": .Credentials.AccessKeyId 6 | }, 7 | { 8 | "name": "AWS_SECRET_ACCESS_KEY", 9 | "value": .Credentials.SecretAccessKey 10 | }, 11 | { 12 | "name": "AWS_SESSION_TOKEN", 13 | "value": .Credentials.SessionToken 14 | } 15 | ] 16 | } 17 | -------------------------------------------------------------------------------- /example/aws_batch/triage-job-definition.json.example: -------------------------------------------------------------------------------- 1 | { 2 | "containerProperties": { 3 | "command": [ 4 | "--tb", 5 | "Ref::experiment_file", 6 | "--project-path", 7 | "Ref::output_path", 8 | "Ref::replace", 9 | "Ref::save_predictions", 10 | "Ref::profile", 11 | "Ref::validate" 12 | ], 13 | "image": "AWS_ACCOUNT.dkr.ecr.us-west-2.amazonaws.com/YOUR_TRIAGE_IMAGE", 14 | "jobRoleArn": "arn:aws:iam::AWS_ACCOUNT:role/dsappBatchJobRole", 15 | "memory": 16000, 16 | "vcpus": 1 17 | }, 18 | "jobDefinitionName": "triage-cli-experiment", 19 | "retryStrategy": { 20 | "attempts": 1 21 | }, 22 | "type": "container" 23 | } 24 | -------------------------------------------------------------------------------- /example/aws_batch/triage-overrides.json.example: -------------------------------------------------------------------------------- 1 | { 2 | "environment": [ 3 | { 4 | "name":"AWS_DEFAULT_REGION", 5 | "value":"us-west-2" 6 | }, 7 | { 8 | "name":"AWS_JOB_QUEUE", 9 | "value":"" 10 | }, 11 | { 12 | "name":"POSTGRES_PASSWORD", 13 | "value":"" 14 | }, 15 | { 16 | "name":"POSTGRES_USER", 17 | "value":"" 18 | }, 19 | { 20 | "name":"POSTGRES_DB", 21 | "value":"" 22 | }, 23 | { 24 | "name":"POSTGRES_PORT", 25 | "value":"" 26 | }, 27 | {
"name":"POSTGRES_HOST", 29 | "value":"" 30 | } 31 | ] 32 | } 33 | -------------------------------------------------------------------------------- /example/cohort/past_events.sql: -------------------------------------------------------------------------------- 1 | select entity_id 2 | from events 3 | where outcome_date < '{as_of_date}' 4 | -------------------------------------------------------------------------------- /example/config/README.md: -------------------------------------------------------------------------------- 1 | ### Triage Example Config Files 2 | 3 | This folder contains examples of the config files that control Triage. These config files exist to demonstrate the format and syntax of Triage's config files, and provide templates for implementing new projects in triage. 4 | 5 | #### audition.yaml 6 | 7 | An example of the config file that controls Audition, the Triage model selection module. Find additional documentation for the Audition config file [here](https://dssg.github.io/triage/dirtyduck/audition/audition-config/). 8 | 9 | #### database.yaml 10 | 11 | Triage requires a database connection for source data and [model governance](https://dssg.github.io/triage/dirtyduck/ml_governance/). Use a file of this format to specify your connection. 12 | 13 | #### dirty-duckling.yaml 14 | 15 | A Triage experiment config file used in [Dirty Duckling](https://dssg.github.io/triage/dirtyduck/), the Triage tutorial. 16 | 17 | #### experiment.yaml 18 | 19 | An example of an experiment config file. Experiment configs control behavior of the Triage experiment pipeline, which handles feature and label generation, model training, and model evaluation. Find more documentation for the Triage experiment config file [here](https://dssg.github.io/triage/experiments/experiment-config/). 20 | 21 | #### postmodeling_config.yaml & postmodeling_crosstabs.yaml 22 | 23 | Controls the Triage Postmodeling module. Postmodeling is currently under development. It provides a set of tools for evaluating and investigating trained models. More documentation is available [here](https://dssg.github.io/triage/postmodeling/postmodeling-config). -------------------------------------------------------------------------------- /example/config/audition.yaml: -------------------------------------------------------------------------------- 1 | # CHOOSE MODEL GROUPS 2 | # Audition needs a bunch of model_group_ids to help you select the models. 3 | # The query is to choose what the model groups you want to include in the first round. 4 | model_groups: 5 | query: | 6 | SELECT DISTINCT(model_group_id) 7 | FROM triage_metadata.model_groups 8 | 9 | # CHOOSE TIMESTAMPS/TRAIN END TIMES 10 | # The timestamps when audition happens for each model group. 11 | # There's a hard rule in Audition that all of the chosen model groups for audition should 12 | # have the same train end times as the timestamps or the subset of the timestamps from this 13 | # query, otherwise those model groups with unmatched train end times will be pruned in the 14 | # first round. 
18 | time_stamps: 19 | query: | 20 | SELECT DISTINCT train_end_time 21 | FROM triage_metadata.models 22 | WHERE model_group_id IN ({}) 23 | AND EXTRACT(DAY FROM train_end_time) IN (1) 24 | AND train_end_time >= '2012-01-01' 25 | 26 | # FILTER 27 | # Configuration for the Auditioner 28 | filter: 29 | metric: 'precision@' # metric of interest 30 | parameter: '50_abs' # parameter of interest 31 | max_from_best: 1.0 # The maximum value that the given metric can be worse than the best model for a given train end time. 32 | threshold_value: 0.0 # The worst absolute value that the given metric should be. 33 | distance_table: 'distance_table' # name of the distance table 34 | models_table: 'models' # name of the models table 35 | agg_type: 'worst' # Optional: how to aggregate multiple metric values if multiple models exist for a model group/train end time. 36 | 37 | # RULES 38 | # The selection rules for Audition to simulate the model selection process for each timestamp. 39 | # More rules can be found in the README. 40 | # The metric and parameter in shared_parameters should match those in the filter section. 41 | rules: 42 | - 43 | shared_parameters: 44 | - 45 | metric: 'precision@' 46 | parameter: '50_abs' 47 | selection_rules: 48 | - 49 | name: 'best_current_value' # Pick the model group with the best current metric value 50 | n: 3 51 | - 52 | name: 'best_average_value' # Pick the model with the highest average metric value 53 | n: 3 54 | - 55 | name: 'lowest_metric_variance' # Pick the model with the lowest metric variance 56 | n: 3 57 | - 58 | name: 'most_frequent_best_dist' # Pick the model that is most frequently within `dist_from_best_case` 59 | dist_from_best_case: [0.05] 60 | n: 3 61 | 62 | -------------------------------------------------------------------------------- /example/config/database.yaml: -------------------------------------------------------------------------------- 1 | # Connecting to the database requires a configuration file like this one 2 | 3 | # address.of.database.server 4 | host: 0.0.0.0 5 | user: food_user 6 | db: food 7 | # user password 8 | pass: some_password 9 | # connection port 10 | port: 5434 11 | -------------------------------------------------------------------------------- /example/config/dirty-duckling.yaml: -------------------------------------------------------------------------------- 1 | config_version: 'v8' 2 | 3 | model_comment: 'dirtyduck-quickstart' 4 | 5 | random_seed: 1234 6 | 7 | temporal_config: 8 | label_timespans: ['3months'] 9 | 10 | label_config: 11 | query: | 12 | select 13 | entity_id, 14 | bool_or(result = 'fail')::integer as outcome 15 | from semantic.events 16 | where '{as_of_date}'::timestamp <= date 17 | and date < '{as_of_date}'::timestamp + interval '{label_timespan}' 18 | group by entity_id 19 | name: 'failed_inspections' 20 | 21 | feature_aggregations: 22 | - 23 | prefix: 'inspections' 24 | from_obj: 'semantic.events' 25 | knowledge_date_column: 'date' 26 | 27 | aggregates_imputation: 28 | count: 29 | type: 'zero_noflag' 30 | 31 | aggregates: 32 | - 33 | quantity: 34 | total: "*" 35 | metrics: 36 | - 'count' 37 | 38 | intervals: ['all'] 39 | 40 | model_grid_preset: 'quickstart' 41 | 42 | scoring: 43 | testing_metric_groups: 44 | - 45 | metrics: [precision@] 46 | thresholds: 47 | percentiles: [1] 48 | 49 | 50 | training_metric_groups: 51 | - 52 | metrics: [precision@] 53 | thresholds: 54 | percentiles: [1] 55 | --------------------------------------------------------------------------------
/example/config/postmodeling_config.yaml: -------------------------------------------------------------------------------- 1 | # Postmodeling Configuration File 2 | 3 | project_path: 'triage_output/output/' # Project path defined in triage with matrices and models 4 | audition_output_path: 'results_model_group_ids.json' # Audition output path 5 | model_group_id: # List of model_group_ids [optional if an audition_output_path is given] 6 | - 19 7 | - 43 8 | - 55 9 | 10 | thresholds: # Thresholds for defining positive predictions 11 | rank_abs: [10, 20] 12 | rank_pct: [10, 25, 50] 13 | 14 | baseline_query: | # SQL query for defining a baseline for comparison in plots. It needs a metric and parameter 15 | SELECT g.model_group_id, 16 | m.model_id, 17 | EXTRACT('YEAR' FROM m.evaluation_end_time) AS as_of_date_year, 18 | m.metric, 19 | m.parameter, 20 | m.value, 21 | m.num_labeled_examples, 22 | m.num_labeled_above_threshold, 23 | m.num_positive_labels 24 | FROM test_results.evaluations m 25 | LEFT JOIN triage_metadata.models g 26 | USING(model_id) 27 | WHERE g.model_group_id IN (1, 2, 3) 28 | AND metric = 'precision@' 29 | AND parameter = '10.0_pct' 30 | 31 | max_depth_error_tree: 5 # For error trees, how deep should the decision trees go? 32 | n_features_plots: 10 # Number of features for importances 33 | figsize: [12, 12] # Default size for plots 34 | fontsize: 20 # Default fontsize for plots 35 | -------------------------------------------------------------------------------- /example/config/postmodeling_crosstabs.yaml: -------------------------------------------------------------------------------- 1 | output: 2 | schema: 'test_results' 3 | table: 'crosstabs' 4 | 5 | thresholds: 6 | rank_abs: [50] 7 | rank_pct: [] 8 | 9 | #(optional): a list of entity_ids to subset on the crosstabs analysis 10 | entity_id_list: [] 11 | 12 | models_list_query: "select unnest(ARRAY[44, 86]) :: int as model_id" 13 | 14 | as_of_dates_query: "select unnest(ARRAY['2016-01-13','2017-01-13']) :: date as as_of_date" 15 | 16 | #don't change this query unless strictly necessary.
It is just validating pairs of (model_id,as_of_date) 17 | #it is just a join with distinct (model_id, as_of_date) in a predictions table 18 | models_dates_join_query: " 19 | select model_id, 20 | as_of_date 21 | from models_list_query m 22 | cross join as_of_dates_query a join (select distinct model_id, as_of_date from test_results.predictions) p 23 | using (model_id, as_of_date)" 24 | 25 | #features_query must join models_dates_join_query with one or more features tables using as_of_date 26 | features_query: " 27 | select m.model_id, f1.* 28 | from features.inspections_aggregation_imputed f1 join 29 | models_dates_join_query m using (as_of_date)" 30 | 31 | #the predictions query must return model_id, as_of_date, entity_id, score, label_value, rank_abs and rank_pct 32 | #it must join models_dates_join_query using both model_id and as_of_date 33 | predictions_query: " 34 | select model_id, 35 | as_of_date, 36 | entity_id, 37 | score, 38 | label_value, 39 | coalesce(rank_abs_no_ties, row_number() over (partition by (model_id, as_of_date) order by score desc)) as rank_abs, 40 | coalesce(rank_pct_no_ties*100, ntile(100) over (partition by (model_id, as_of_date) order by score desc)) as rank_pct 41 | from test_results.predictions 42 | JOIN models_dates_join_query USING(model_id, as_of_date) 43 | where model_id IN (select model_id from models_list_query) 44 | AND as_of_date in (select as_of_date from as_of_dates_query)" 45 | -------------------------------------------------------------------------------- /example/dirtyduck/audition/eis_audition_config.yaml: -------------------------------------------------------------------------------- 1 | # CHOOSE MODEL GROUPS 2 | model_groups: 3 | query: | 4 | select distinct(model_group_id) 5 | from triage_metadata.model_groups 6 | where model_config ->> 'experiment_type' ~ 'eis' 7 | # CHOOSE TIMESTAMPS/TRAIN END TIMES 8 | time_stamps: 9 | query: | 10 | select distinct train_end_time 11 | from triage_metadata.models 12 | where model_group_id in ({}) 13 | and extract(day from train_end_time) in (1) 14 | and train_end_time >= '2014-01-01' 15 | # FILTER 16 | filter: 17 | metric: 'precision@' # metric of interest 18 | parameter: '10_pct' # parameter of interest 19 | max_from_best: 1.0 # The maximum value that the given metric can be worse than the best model for a given train end time. 20 | threshold_value: 0.0 # The worst absolute value that the given metric should be.
21 | distance_table: 'eis_distance_table' # name of the distance table 22 | models_table: 'models' # name of the models table 23 | 24 | # RULES 25 | rules: 26 | - 27 | shared_parameters: 28 | - 29 | metric: 'precision@' 30 | parameter: '10_pct' 31 | 32 | selection_rules: 33 | - 34 | name: 'best_current_value' # Pick the model group with the best current metric value 35 | n: 5 36 | - 37 | name: 'best_average_value' # Pick the model with the highest average metric value 38 | n: 5 39 | - 40 | name: 'lowest_metric_variance' # Pick the model with the lowest metric variance 41 | n: 5 42 | - 43 | name: 'most_frequent_best_dist' # Pick the model that is most frequently within `dist_from_best_case` 44 | dist_from_best_case: [0.05] 45 | n: 5 46 | -------------------------------------------------------------------------------- /example/dirtyduck/audition/inspection_audition_config.yaml: -------------------------------------------------------------------------------- 1 | # CHOOSE MODEL GROUPS 2 | model_groups: 3 | query: | 4 | select distinct(model_group_id) 5 | from triage_metadata.model_groups 6 | where model_config ->> 'experiment_type' ~ 'inspection' 7 | # CHOOSE TIMESTAMPS/TRAIN END TIMES 8 | time_stamps: 9 | query: | 10 | select distinct train_end_time 11 | from triage_metadata.models 12 | where model_group_id in ({}) 13 | and extract(day from train_end_time) in (1) 14 | and train_end_time >= '2014-01-01' 15 | # FILTER 16 | filter: 17 | metric: 'precision@' # metric of interest 18 | parameter: '10_pct' # parameter of interest 19 | max_from_best: 1.0 # The maximum value that the given metric can be worse than the best model for a given train end time. 20 | threshold_value: 0.0 # The worst absolute value that the given metric should be. 21 | distance_table: 'inspections_distance_table' # name of the distance table 22 | models_table: 'models' # name of the models table 23 | 24 | # RULES 25 | rules: 26 | - 27 | shared_parameters: 28 | - 29 | metric: 'precision@' 30 | parameter: '10_pct' 31 | 32 | selection_rules: 33 | - 34 | name: 'best_current_value' # Pick the model group with the best current metric value 35 | n: 3 36 | - 37 | name: 'best_average_value' # Pick the model with the highest average metric value 38 | n: 3 39 | - 40 | name: 'lowest_metric_variance' # Pick the model with the lowest metric variance 41 | n: 3 42 | - 43 | name: 'most_frequent_best_dist' # Pick the model that is most frequently within `dist_from_best_case` 44 | dist_from_best_case: [0.05] 45 | n: 3 46 | -------------------------------------------------------------------------------- /example/dirtyduck/audition/inspections/distance_from_best_precision@10_pct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/example/dirtyduck/audition/inspections/distance_from_best_precision@10_pct.png -------------------------------------------------------------------------------- /example/dirtyduck/audition/inspections/distance_from_best_precision@15_pct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/example/dirtyduck/audition/inspections/distance_from_best_precision@15_pct.png -------------------------------------------------------------------------------- /example/dirtyduck/audition/inspections/metric_over_time_precision@10_pct.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/example/dirtyduck/audition/inspections/metric_over_time_precision@10_pct.png -------------------------------------------------------------------------------- /example/dirtyduck/audition/inspections/metric_over_time_precision@15_pct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/example/dirtyduck/audition/inspections/metric_over_time_precision@15_pct.png -------------------------------------------------------------------------------- /example/dirtyduck/audition/inspections/precision@10_pct_next_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/example/dirtyduck/audition/inspections/precision@10_pct_next_time.png -------------------------------------------------------------------------------- /example/dirtyduck/audition/inspections/precision@15_pct_next_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/example/dirtyduck/audition/inspections/precision@15_pct_next_time.png -------------------------------------------------------------------------------- /example/dirtyduck/audition/inspections/regret_distance_from_best_rules_precision@10_pct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/example/dirtyduck/audition/inspections/regret_distance_from_best_rules_precision@10_pct.png -------------------------------------------------------------------------------- /example/dirtyduck/audition/inspections/regret_distance_from_best_rules_precision@15_pct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/example/dirtyduck/audition/inspections/regret_distance_from_best_rules_precision@15_pct.png -------------------------------------------------------------------------------- /example/dirtyduck/audition/inspections/regret_over_time_precision@10_pct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/example/dirtyduck/audition/inspections/regret_over_time_precision@10_pct.png -------------------------------------------------------------------------------- /example/dirtyduck/audition/inspections/regret_over_time_precision@15_pct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/example/dirtyduck/audition/inspections/regret_over_time_precision@15_pct.png -------------------------------------------------------------------------------- /example/dirtyduck/audition/inspections/results_model_group_ids.json: -------------------------------------------------------------------------------- 1 | {"best_current_value_precision@_10_pct": [39, 30, 9], "best_average_value_precision@_10_pct": [39, 9, 29], "lowest_metric_variance_precision@_10_pct": [1, 5, 19], 
"most_frequent_best_dist_precision@_10_pct_0.05": [8, 9, 10]} -------------------------------------------------------------------------------- /example/dirtyduck/crosstabs/eis_crosstabs_config.yaml: -------------------------------------------------------------------------------- 1 | output: 2 | schema: 'test_results' 3 | table: 'eis_crosstabs' 4 | 5 | thresholds: 6 | rank_abs: [50] 7 | rank_pct: [5] 8 | 9 | #(optional): a list of entity_ids to subset on the crosstabs analysis 10 | entity_id_list: [] 11 | 12 | models_list_query: "select unnest(ARRAY[226]) :: int as model_id" 13 | 14 | as_of_dates_query: "select generate_series('2017-12-01'::date, '2018-09-01'::date, interval '1month') as as_of_date" 15 | 16 | #don't change this query unless strictly necessary. It is just validating pairs of (model_id,as_of_date) 17 | #it is just a join with distinct (model_id, as_of_date) in a predictions table 18 | models_dates_join_query: | 19 | select model_id, 20 | as_of_date 21 | from models_list_query as m 22 | cross join as_of_dates_query a join (select distinct model_id, as_of_date from test_results.predictions) as p 23 | using (model_id, as_of_date) 24 | 25 | #features_query must join models_dates_join_query with 1 or more features table using as_of_date 26 | features_query: | 27 | select m.model_id, m.as_of_date, f4.entity_id, f4.results_entity_id_1month_result_fail_avg, f4.results_entity_id_3month_result_fail_avg, f4.results_entity_id_6month_result_fail_avg, 28 | f2.inspection_types_entity_id_1month_type_canvass_sum, f3.risks_entity_id_1month_risk_high_sum, f4.results_entity_id_6month_result_pass_avg, 29 | f3.risks_entity_id_all_risk_high_sum, f2.inspection_types_entity_id_3month_type_canvass_sum, f4.results_entity_id_6month_result_pass_sum, 30 | f2.inspection_types_entity_id_all_type_canvass_sum 31 | from features.inspection_types_aggregation_imputed as f2 32 | inner join features.risks_aggregation_imputed as f3 using (entity_id, as_of_date) 33 | inner join features.results_aggregation_imputed as f4 using (entity_id, as_of_date) 34 | inner join models_dates_join_query as m using (as_of_date) 35 | 36 | #the predictions query must return model_id, as_of_date, entity_id, score, label_value, rank_abs and rank_pct 37 | #it must join models_dates_join_query using both model_id and as_of_date 38 | predictions_query: | 39 | select model_id, 40 | as_of_date, 41 | entity_id, 42 | score, 43 | label_value, 44 | coalesce(rank_abs_no_ties, row_number() over (partition by (model_id, as_of_date) order by score desc)) as rank_abs, 45 | coalesce(rank_pct_no_ties*100, ntile(100) over (partition by (model_id, as_of_date) order by score desc)) as rank_pct 46 | from test_results.predictions 47 | join models_dates_join_query using(model_id, as_of_date) 48 | where model_id in (select model_id from models_list_query) 49 | and as_of_date in (select as_of_date from as_of_dates_query) 50 | -------------------------------------------------------------------------------- /example/dirtyduck/experiments/dirty-duckling.yaml: -------------------------------------------------------------------------------- 1 | config_version: 'v8' 2 | 3 | model_comment: 'dirtyduck-quickstart' 4 | 5 | random_seed: 1234 6 | 7 | temporal_config: 8 | label_timespans: ['3months'] 9 | 10 | label_config: 11 | query: | 12 | select 13 | entity_id, 14 | bool_or(result = 'fail')::integer as outcome 15 | from semantic.events 16 | where '{as_of_date}'::timestamp <= date 17 | and date < '{as_of_date}'::timestamp + interval '{label_timespan}' 18 | group by 
entity_id 19 | name: 'failed_inspections' 20 | 21 | feature_aggregations: 22 | - 23 | prefix: 'inspections' 24 | from_obj: 'semantic.events' 25 | knowledge_date_column: 'date' 26 | 27 | aggregates_imputation: 28 | count: 29 | type: 'zero_noflag' 30 | 31 | aggregates: 32 | - 33 | quantity: 34 | total: "*" 35 | metrics: 36 | - 'count' 37 | 38 | intervals: ['all'] 39 | 40 | model_grid_preset: 'quickstart' 41 | 42 | scoring: 43 | testing_metric_groups: 44 | - 45 | metrics: [precision@] 46 | thresholds: 47 | percentiles: [10] 48 | 49 | 50 | training_metric_groups: 51 | - 52 | metrics: [precision@] 53 | thresholds: 54 | percentiles: [10] 55 | -------------------------------------------------------------------------------- /example/dirtyduck/images/distance_from_best_precision@10_pct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/example/dirtyduck/images/distance_from_best_precision@10_pct.png -------------------------------------------------------------------------------- /example/dirtyduck/images/eis_01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/example/dirtyduck/images/eis_01.png -------------------------------------------------------------------------------- /example/dirtyduck/images/inspections_baseline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/example/dirtyduck/images/inspections_baseline.png -------------------------------------------------------------------------------- /example/dirtyduck/images/inspections_dt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/example/dirtyduck/images/inspections_dt.png -------------------------------------------------------------------------------- /example/dirtyduck/images/inspections_label_failed_01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/example/dirtyduck/images/inspections_label_failed_01.png -------------------------------------------------------------------------------- /example/dirtyduck/images/metric_over_time_precision@10_pct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/example/dirtyduck/images/metric_over_time_precision@10_pct.png -------------------------------------------------------------------------------- /example/dirtyduck/images/precision@10_pct_next_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/example/dirtyduck/images/precision@10_pct_next_time.png -------------------------------------------------------------------------------- /example/dirtyduck/images/regret_distance_from_best_rules_precision@10_pct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/example/dirtyduck/images/regret_distance_from_best_rules_precision@10_pct.png 
-------------------------------------------------------------------------------- /example/dirtyduck/images/regret_over_time_precision@10_pct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/example/dirtyduck/images/regret_over_time_precision@10_pct.png -------------------------------------------------------------------------------- /example/dirtyduck/images/results_model_group_ids.json: -------------------------------------------------------------------------------- 1 | {"best_current_value_precision@_10_pct": [7, 6, 5], "best_average_value_precision@_10_pct": [6, 7, 4], "lowest_metric_variance_precision@_10_pct": [1, 2, 3], "most_frequent_best_dist_precision@_10_pct_0.05": [6, 4, 5]} -------------------------------------------------------------------------------- /example/dirtyduck/images/simple_test_skeleton.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/example/dirtyduck/images/simple_test_skeleton.png -------------------------------------------------------------------------------- /example/dirtyduck/output/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/example/dirtyduck/output/.gitkeep -------------------------------------------------------------------------------- /example/dirtyduck/output/images/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/example/dirtyduck/output/images/.gitkeep -------------------------------------------------------------------------------- /example/dirtyduck/postmodeling/database.yaml: -------------------------------------------------------------------------------- 1 | host: food_db 2 | user: food_user 3 | password: some_password 4 | port: 5432 5 | dbname: food 6 | -------------------------------------------------------------------------------- /example/dirtyduck/postmodeling/eis_postmodeling_config.yaml: -------------------------------------------------------------------------------- 1 | # Postmodeling Configuration File 2 | 3 | project_path: '/triage' # Project path defined in triage with matrices and models 4 | audition_output_path: '/triage/audition/eis/results_model_group_ids.json' 5 | 6 | thresholds: # Thresholds for defining positive predictions 7 | rank_abs: [50, 100, 250] 8 | rank_pct: [5, 10, 25] 9 | 10 | baseline_query: | # SQL query for defining a baseline for comparison in plots. It needs a metric and parameter 11 | select g.model_group_id, 12 | m.model_id, 13 | extract('year' from m.evaluation_end_time) as as_of_date_year, 14 | m.metric, 15 | m.parameter, 16 | m.stochastic_value, 17 | m.num_labeled_examples, 18 | m.num_labeled_above_threshold, 19 | m.num_positive_labels 20 | from test_results.evaluations m 21 | left join triage_metadata.models g 22 | using(model_id) 23 | where g.model_group_id = 20 24 | and metric = 'precision@' 25 | and parameter = '10_pct' 26 | 27 | max_depth_error_tree: 5 # For error trees, how deep should the decision trees go?
28 | n_features_plots: 10 # Number of features for importances 29 | figsize: [12, 12] # Default size for plots 30 | fontsize: 20 # Default fontsize for plots 31 | -------------------------------------------------------------------------------- /example/dirtyduck/postmodeling/inspection_jaccard_on_lists_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/example/dirtyduck/postmodeling/inspection_jaccard_on_lists_over_time.png -------------------------------------------------------------------------------- /example/dirtyduck/postmodeling/inspection_mg_prec_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/example/dirtyduck/postmodeling/inspection_mg_prec_over_time.png -------------------------------------------------------------------------------- /example/dirtyduck/postmodeling/inspection_mg_recall_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/example/dirtyduck/postmodeling/inspection_mg_recall_over_time.png -------------------------------------------------------------------------------- /example/dirtyduck/postmodeling/inspection_model_group_39_model_125_feature_group_importances.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/example/dirtyduck/postmodeling/inspection_model_group_39_model_125_feature_group_importances.png -------------------------------------------------------------------------------- /example/dirtyduck/postmodeling/inspection_model_group_39_model_125_feature_importances.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/example/dirtyduck/postmodeling/inspection_model_group_39_model_125_feature_importances.png -------------------------------------------------------------------------------- /example/dirtyduck/postmodeling/inspection_model_group_39_model_125_rayid_curve.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/example/dirtyduck/postmodeling/inspection_model_group_39_model_125_rayid_curve.png -------------------------------------------------------------------------------- /example/dirtyduck/postmodeling/inspection_postmodeling_config.yaml: -------------------------------------------------------------------------------- 1 | # Postmodeling Configuration File 2 | 3 | project_path: '/triage' # Project path defined in triage with matrices and models 4 | 5 | model_group_id: 6 | - 39 7 | - 9 8 | - 29 9 | - 30 10 | 11 | thresholds: # Thresholds for defining positive predictions 12 | rank_abs: [50, 100, 250] 13 | rank_pct: [5, 10, 25] 14 | 15 | baseline_query: | # SQL query for defining a baseline for comparison in plots. 
It needs a metric and parameter 16 | select g.model_group_id, 17 | m.model_id, 18 | extract('year' from m.evaluation_end_time) as as_of_date_year, 19 | m.metric, 20 | m.parameter, 21 | m.stochastic_value, 22 | m.num_labeled_examples, 23 | m.num_labeled_above_threshold, 24 | m.num_positive_labels 25 | from test_results.evaluations as m 26 | left join triage_metadata.models as g 27 | using(model_id) 28 | where g.model_group_id = 1 29 | and metric = 'precision@' 30 | and parameter = '15_pct' 31 | 32 | max_depth_error_tree: 5 # For error trees, how deep should the decision trees go? 33 | n_features_plots: 10 # Number of features for importances 34 | figsize: [12, 12] # Default size for plots 35 | fontsize: 20 # Default fontsize for plots 36 | -------------------------------------------------------------------------------- /example/label/events.sql: -------------------------------------------------------------------------------- 1 | select 2 | entity_id, 3 | bool_or(outcome::bool)::integer as outcome 4 | from events 5 | where outcome_date >= '{as_of_date}' 6 | and outcome_date < '{as_of_date}'::timestamp + interval '{label_timespan}' 7 | group by entity_id 8 | -------------------------------------------------------------------------------- /manage.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from pathlib import Path 3 | 4 | from argcmdr import local, LocalRoot, Local 5 | from plumbum import local as plumlocal 6 | 7 | 8 | ROOT_PATH = Path(__file__).parent.resolve() 9 | 10 | 11 | class Development(LocalRoot): 12 | """Commands to aid in Triage library development""" 13 | pass 14 | 15 | 16 | @Development.register 17 | @local('remainder', metavar='alembic arguments', nargs=argparse.REMAINDER) 18 | def alembic(context, args): 19 | """Configuration wrapper to use the Alembic schema migrations library for Triage development.
20 | Try `alembic -h` or `manage alembic -- -h` to see a description of all 21 | the available subcommands""" 22 | return context.local['env'][ 23 | 'PYTHONPATH=' + str(ROOT_PATH / 'src'), 24 | 'alembic', 25 | '-c', ROOT_PATH / 'src' / 'triage' / 'component' / 'results_schema' / 'alembic.ini', 26 | '-x', 'db_config_file=database.yaml', 27 | args.remainder, 28 | ] 29 | 30 | 31 | @Development.register 32 | class Docs(Local): 33 | """View Triage documentation through local server""" 34 | def prepare(self, args): 35 | yield plumlocal['python']['docs/update_docs.py'] 36 | with plumlocal.cwd(ROOT_PATH / 'docs'): 37 | yield plumlocal['mkdocs']['serve'] 38 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | testpaths = src/tests/ 3 | -------------------------------------------------------------------------------- /requirement/dev.txt: -------------------------------------------------------------------------------- 1 | -r include/build.txt 2 | bumpversion==0.6.0 3 | mkdocs==1.6.1 4 | pymdown-extensions==10.8 5 | mkdocs-material==9.4.6 6 | mkdocstrings==0.29.1 7 | mkdocstrings-python==1.16.10 8 | black==22.3.0 9 | -------------------------------------------------------------------------------- /requirement/extras-rq.txt: -------------------------------------------------------------------------------- 1 | rq==1.4.3 # pyup: ignore 2 | redis 3 | -------------------------------------------------------------------------------- /requirement/include/build.txt: -------------------------------------------------------------------------------- 1 | wheel==0.38.2 2 | -------------------------------------------------------------------------------- /requirement/include/lint.txt: -------------------------------------------------------------------------------- 1 | flake8==4.0.1 2 | -------------------------------------------------------------------------------- /requirement/include/test-management.txt: -------------------------------------------------------------------------------- 1 | codecov==2.1.13 2 | coverage>=4.4 3 | tox==3.25.0 4 | -------------------------------------------------------------------------------- /requirement/main.txt: -------------------------------------------------------------------------------- 1 | polars==0.18.2 2 | pyarrow>=12.0.1 3 | numpy==1.26.0 4 | pandas==1.5.0 5 | alembic==1.7.7 6 | SQLAlchemy==1.3.18 # pyup: ignore 7 | PyYAML==6.0.2 8 | psycopg2-binary==2.9.3 9 | boto3==1.22.4 10 | click==8.1.3 11 | inflection==0.5.1 12 | sqlalchemy-postgres-copy==0.5.0 13 | retrying==1.3.3 14 | Dickens==1.0.1 15 | signalled-timeout==1.0.0 16 | wrapt==1.14.0 17 | argcmdr==0.7.0 18 | sqlparse==0.4.4 19 | pebble==4.6.3 20 | adjustText==0.7.3 21 | graphviz==0.20 22 | requests==2.31.0 23 | coloredlogs==15.0.1 24 | verboselogs==1.7 25 | s3fs==0.4.2 # pyup: ignore 26 | scikit-learn==1.6.1 27 | matplotlib==3.5.1 28 | seaborn==0.11.2 29 | ohio==0.5.0 30 | aequitas==0.42.0 31 | plotly==6.0.1 32 | jupyter==1.0.0 -------------------------------------------------------------------------------- /requirement/test.txt: -------------------------------------------------------------------------------- 1 | -r include/lint.txt 2 | -r include/test-management.txt 3 | parsedatetime==2.6 4 | csvkit==1.0.7 5 | factory_boy==3.2.1 6 | testing.postgresql==1.3.0 7 | pytest==6.2.5 #<4.0.0 # pyup: ignore 8 | pytest-cov==3.0.0 9 | moto==3.1.7 10 | fakeredis==1.7.1 11 | hypothesis==6.46.1 12 
| -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 5.5.1 3 | commit = True 4 | tag = True 5 | 6 | [bumpversion:file:setup.py] 7 | search = version='{current_version}' 8 | replace = version='{new_version}' 9 | 10 | [bumpversion:file:src/triage/__init__.py] 11 | search = __version__ = '{current_version}' 12 | replace = __version__ = '{new_version}' 13 | 14 | [bdist_wheel] 15 | universal = 1 16 | 17 | [flake8] 18 | exclude = docs 19 | 20 | [pycodestyle] 21 | max-line-length = 88 22 | statistics = True 23 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import re 4 | from pathlib import Path 5 | from setuptools import find_packages, setup 6 | 7 | 8 | ROOT_PATH = Path(__file__).parent 9 | 10 | LICENSE_PATH = ROOT_PATH / "LICENSE" 11 | 12 | README_PATH = ROOT_PATH / "README.md" 13 | 14 | REQUIREMENTS_PATH = ROOT_PATH / "requirement" / "main.txt" 15 | 16 | REQUIREMENTS_TEST_PATH = ROOT_PATH / "requirement" / "test.txt" 17 | 18 | REQUIREMENTS_RQ_PATH = ROOT_PATH / "requirement" / "extras-rq.txt" 19 | 20 | 21 | def stream_requirements(fd): 22 | """For a given requirements file descriptor, generate lines of 23 | distribution requirements, ignoring comments and chained requirement 24 | files. 25 | 26 | """ 27 | for line in fd: 28 | cleaned = re.sub(r"#.*$", "", line).strip() 29 | if cleaned and not cleaned.startswith("-r"): 30 | yield cleaned 31 | 32 | 33 | with REQUIREMENTS_PATH.open() as requirements_file: 34 | REQUIREMENTS = list(stream_requirements(requirements_file)) 35 | 36 | 37 | with REQUIREMENTS_TEST_PATH.open() as test_requirements_file: 38 | REQUIREMENTS_TEST = REQUIREMENTS[:] 39 | REQUIREMENTS_TEST.extend(stream_requirements(test_requirements_file)) 40 | 41 | with REQUIREMENTS_RQ_PATH.open() as rq_requirements_file: 42 | RQ_REQUIREMENTS = list(stream_requirements(rq_requirements_file)) 43 | 44 | 45 | setup( 46 | name='triage', 47 | version='5.5.1', 48 | description="Risk modeling and prediction", 49 | long_description=README_PATH.read_text(), 50 | long_description_content_type="text/markdown", 51 | author="Center for Data Science and Public Policy", 52 | author_email="datascifellows@gmail.com", 53 | url="https://dssg.github.io/triage/", 54 | project_urls={ 55 | "Documentation": "https://dssg.github.io/triage/", 56 | "Source Code": "https://github.com/dssg/triage", 57 | "Tutorial": "https://dssg.github.io/triage/dirtyduck/", 58 | }, 59 | packages=find_packages("src", exclude=["tests", "tests.*"]), 60 | package_dir={"": "src"}, 61 | include_package_data=True, 62 | install_requires=REQUIREMENTS, 63 | entry_points={ 64 | "console_scripts": ["triage = triage.cli:execute"], 65 | }, 66 | extras_require={"rq": RQ_REQUIREMENTS}, 67 | license="MIT License", 68 | zip_safe=False, 69 | keywords="triage", 70 | classifiers=[ 71 | "Development Status :: 2 - Pre-Alpha", 72 | "Intended Audience :: Developers", 73 | "License :: OSI Approved :: MIT License", 74 | "Natural Language :: English", 75 | "Programming Language :: Python :: 3", 76 | "Programming Language :: Python :: 3.8", 77 | "Programming Language :: Python :: 3.9", 78 | "Programming Language :: Python :: 3.10", 79 | ], 80 | python_requires=">=3.8", 81 | test_suite="tests", 82 |
tests_require=REQUIREMENTS_TEST, 83 | ) 84 | -------------------------------------------------------------------------------- /src/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | -------------------------------------------------------------------------------- /src/tests/architect_tests/README.md: -------------------------------------------------------------------------------- 1 | Write some tests! 2 | -------------------------------------------------------------------------------- /src/tests/architect_tests/__init__.py: -------------------------------------------------------------------------------- 1 | """Tests for the app.""" 2 | -------------------------------------------------------------------------------- /src/tests/audition_tests/__init__.py: -------------------------------------------------------------------------------- 1 | """Tests for the app.""" 2 | -------------------------------------------------------------------------------- /src/tests/audition_tests/test_model_group_performance.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import patch 2 | 3 | import numpy as np 4 | import testing.postgresql 5 | from sqlalchemy import create_engine 6 | 7 | from triage.component.audition.model_group_performance import ( 8 | ModelGroupPerformancePlotter, 9 | ) 10 | 11 | from .utils import create_sample_distance_table 12 | 13 | 14 | def test_ModelGroupPerformancePlotter_generate_plot_data(): 15 | with testing.postgresql.Postgresql() as postgresql: 16 | engine = create_engine(postgresql.url()) 17 | distance_table, model_groups = create_sample_distance_table(engine) 18 | plotter = ModelGroupPerformancePlotter(distance_table) 19 | df = plotter.generate_plot_data( 20 | metric="precision@", 21 | parameter="100_abs", 22 | model_group_ids=[1, 2], 23 | train_end_times=["2014-01-01", "2015-01-01"], 24 | ) 25 | assert sorted(df["model_type"].unique()) == [ 26 | "best case", 27 | "mySpikeClassifier", 28 | "myStableClassifier", 29 | ] 30 | for value in df[df["model_group_id"] == 1]["raw_value"].values: 31 | assert np.isclose(value, 0.5) 32 | 33 | 34 | def test_ModelGroupPerformancePlotter_plot_all(): 35 | with patch( 36 | "triage.component.audition.model_group_performance.plot_cats" 37 | ) as plot_patch: 38 | with testing.postgresql.Postgresql() as postgresql: 39 | engine = create_engine(postgresql.url()) 40 | distance_table, model_groups = create_sample_distance_table(engine) 41 | plotter = ModelGroupPerformancePlotter(distance_table) 42 | plotter.plot_all( 43 | [{"metric": "precision@", "parameter": "100_abs"}], 44 | model_group_ids=[1, 2], 45 | train_end_times=["2014-01-01", "2015-01-01"], 46 | ) 47 | assert plot_patch.called 48 | args, kwargs = plot_patch.call_args 49 | assert "raw_value" in kwargs["frame"] 50 | assert "train_end_time" in kwargs["frame"] 51 | assert kwargs["x_col"] == "train_end_time" 52 | assert kwargs["y_col"] == "raw_value" 53 | -------------------------------------------------------------------------------- /src/tests/audition_tests/test_plotting.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import patch 2 | 3 | import pandas as pd 4 | from matplotlib import lines as mlines 5 | 6 | from triage.component.audition.plotting import ( 7 | generate_plot_lines, 8 | category_colordict, 9 | category_styledict, 10 | plot_cats, 11 | ) 12 | 13 | 14 | def test_generate_plot_lines(): 15 | 
colordict = {"cat1": "#001122", "cat2": "#112233", "cat3": "#223344"} 16 | styledict = {"cat1": "-", "cat2": "--", "cat3": "-"} 17 | plot_lines = generate_plot_lines(colordict, lambda x: "Cat {}".format(x), styledict) 18 | assert len(plot_lines) == 3 19 | for line in plot_lines: 20 | assert type(line) == mlines.Line2D 21 | assert "Cat " in line._label 22 | assert "-" in line._linestyle 23 | if line._label == "Cat 2": 24 | assert line._linestyle == "--" 25 | 26 | 27 | def test_category_colordict(): 28 | cmap_name = "tab10" 29 | categories = ["Cat1", "Cat2", "Cat3", "Cat4"] 30 | colordict = category_colordict(cmap_name, categories) 31 | assert len(colordict.keys()) == 4 32 | 33 | 34 | def test_category_colordict_with_highlight(): 35 | cmap_name = "tab10" 36 | colordict_with_highlight = category_colordict( 37 | cmap_name, ["Cat1", "Cat2", "Cat3", "Cat4"], "Cat2" 38 | ) 39 | colordict_without_highlight = category_colordict( 40 | cmap_name, ["Cat1", "Cat3", "Cat4"] 41 | ) 42 | for cat in ["Cat1", "Cat3", "Cat4"]: 43 | assert colordict_with_highlight[cat] == colordict_without_highlight[cat] 44 | assert colordict_with_highlight["Cat2"] == "#000000" 45 | 46 | 47 | def test_category_styledict(): 48 | colordict = {"cat1": "#001122", "cat2": "#112233", "cat3": "#223344"} 49 | assert category_styledict(colordict, "cat3") == { 50 | "cat1": "-", 51 | "cat2": "-", 52 | "cat3": "--", 53 | } 54 | 55 | 56 | def test_plot_cats(): 57 | test_df = pd.DataFrame.from_dict( 58 | { 59 | "cats": ["tuxedo", "maine coon", "lion!"], 60 | "groups": ["i", "dont", "know"], 61 | "col1": [1, 2, 3], 62 | "col2": [4, 5, 6], 63 | "col3": [7, 8, 9], 64 | } 65 | ) 66 | # hard to make many assertions, but we can make sure it gets to the end 67 | # and shows the contents 68 | with patch("triage.component.audition.plotting.plt.show") as show_patch: 69 | plot_cats(test_df, "col1", "col2", cat_col="cats", grp_col="groups") 70 | assert show_patch.called 71 | -------------------------------------------------------------------------------- /src/tests/catwalk_tests/README.md: -------------------------------------------------------------------------------- 1 | Write some tests! 
2 | -------------------------------------------------------------------------------- /src/tests/catwalk_tests/__init__.py: -------------------------------------------------------------------------------- 1 | """Tests for the app.""" 2 | -------------------------------------------------------------------------------- /src/tests/catwalk_tests/test_estimators.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import pytest 4 | 5 | from triage.component.catwalk.estimators.transformers import CutOff 6 | from triage.component.catwalk.estimators.classifiers import ScaledLogisticRegression 7 | 8 | from sklearn import linear_model 9 | 10 | from sklearn import datasets 11 | from sklearn import preprocessing 12 | from sklearn.pipeline import Pipeline 13 | from sklearn.model_selection import train_test_split 14 | 15 | 16 | @pytest.fixture 17 | def data(): 18 | dataset = datasets.load_breast_cancer() 19 | X = dataset.data 20 | y = dataset.target 21 | 22 | X_train, X_test, y_train, y_test = train_test_split( 23 | X, y, test_size=0.3, random_state=12345 24 | ) 25 | 26 | return {"X_train": X_train, "X_test": X_test, "y_train": y_train, "y_test": y_test} 27 | 28 | 29 | def test_cutoff_warning(): 30 | X_data = [1, 2, 0.5, 0.7, 100, -1, -23, 0] 31 | 32 | cutoff = CutOff() 33 | 34 | with pytest.raises(ValueError): 35 | cutoff.fit_transform(X_data) 36 | 37 | 38 | def test_cutoff_transformer(): 39 | cutoff = CutOff() 40 | 41 | X_data = np.array([1, 2, 0.5, 0.7, 100, -1, -23, 0]).reshape(-1,1) 42 | 43 | assert np.all(cutoff.fit_transform(X_data) == np.array([1, 1, 0.5, 0.7, 1, 0, 0, 0]).reshape(-1,1)) 44 | 45 | 46 | def test_cutoff_inside_a_pipeline(data): 47 | minmax_scaler = preprocessing.MinMaxScaler() 48 | dsapp_cutoff = CutOff() 49 | 50 | pipeline = Pipeline( 51 | [("minmax_scaler", minmax_scaler), ("dsapp_cutoff", dsapp_cutoff)] 52 | ) 53 | 54 | pipeline.fit(data["X_train"], data["y_train"]) 55 | 56 | X_fake_new_data = data["X_test"][-1, :].reshape(1, -1) + 0.5 57 | 58 | mms = preprocessing.MinMaxScaler().fit(data["X_train"]) 59 | 60 | assert np.all( 61 | (mms.transform(X_fake_new_data) > 1) 62 | == (pipeline.transform(X_fake_new_data) == 1) 63 | ) 64 | 65 | 66 | def test_dsapp_lr(data): 67 | dsapp_lr = ScaledLogisticRegression() 68 | dsapp_lr.fit(data["X_train"], data["y_train"]) 69 | 70 | minmax_scaler = preprocessing.MinMaxScaler() 71 | dsapp_cutoff = CutOff() 72 | lr = linear_model.LogisticRegression(solver='lbfgs') 73 | 74 | pipeline = Pipeline( 75 | [("minmax_scaler", minmax_scaler), ("dsapp_cutoff", dsapp_cutoff), ("lr", lr)] 76 | ) 77 | 78 | pipeline.fit(data["X_train"], data["y_train"]) 79 | 80 | assert np.all(dsapp_lr.predict(data["X_test"]) == pipeline.predict(data["X_test"])) 81 | -------------------------------------------------------------------------------- /src/tests/catwalk_tests/test_feature_importances.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from triage.component.catwalk.feature_importances import ( 4 | get_feature_importances, 5 | ) 6 | 7 | from sklearn import datasets 8 | from sklearn.svm import SVC 9 | from sklearn.dummy import DummyClassifier 10 | from sklearn.ensemble import RandomForestClassifier 11 | from sklearn.linear_model import LogisticRegression 12 | 13 | from sklearn.model_selection import train_test_split 14 | 15 | 16 | @pytest.fixture 17 | def trained_models(): 18 | dataset = datasets.load_breast_cancer() 19 | X = dataset.data 20 | y = 
dataset.target 21 | 22 | X_train, X_test, y_train, y_test = train_test_split( 23 | X, y, test_size=0.3, random_state=12345 24 | ) 25 | 26 | rf = RandomForestClassifier(n_estimators=100) 27 | rf.fit(X_train, y_train) 28 | 29 | lr = LogisticRegression(solver='liblinear') 30 | lr.fit(X_train, y_train) 31 | 32 | svc_w_linear_kernel = SVC(kernel="linear", gamma='auto') 33 | svc_w_linear_kernel.fit(X_train, y_train) 34 | 35 | svc_wo_linear_kernel = SVC(gamma='auto') 36 | svc_wo_linear_kernel.fit(X_train, y_train) 37 | 38 | dummy = DummyClassifier(strategy='stratified') 39 | dummy.fit(X_train, y_train) 40 | 41 | return { 42 | "RF": rf, 43 | "LR": lr, 44 | "SVC_w_linear_kernel": svc_w_linear_kernel, 45 | "Dummy": dummy, 46 | "SVC_wo_linear_kernel": svc_wo_linear_kernel, 47 | } 48 | 49 | def test_correct_feature_importances_for_lr(trained_models): 50 | feature_importances = get_feature_importances(trained_models["LR"]) 51 | 52 | # The intercept is ignored, so only the 30 feature odds ratios are returned 53 | assert feature_importances.shape == (30,) 54 | 55 | 56 | def test_correct_feature_importances_for_rf(trained_models): 57 | feature_importances = get_feature_importances(trained_models["RF"]) 58 | assert feature_importances.shape == (30,) 59 | 60 | 61 | def test_correct_feature_importances_for_svc_w_linear_kernel(trained_models): 62 | feature_importances = get_feature_importances( 63 | trained_models["SVC_w_linear_kernel"]) 64 | assert feature_importances.shape == (30,) 65 | 66 | 67 | def test_correct_feature_importances_for_svc_wo_linear_kernel(trained_models): 68 | feature_importances = get_feature_importances( 69 | trained_models["SVC_wo_linear_kernel"] 70 | ) 71 | assert feature_importances is None 72 | 73 | 74 | def test_correct_feature_importances_for_dummy(trained_models): 75 | feature_importances = get_feature_importances(trained_models["Dummy"]) 76 | assert feature_importances is None 77 | -------------------------------------------------------------------------------- /src/tests/catwalk_tests/test_individual_importance_uniform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from triage.component.catwalk.individual_importance.uniform import uniform_distribution 4 | from tests.utils import rig_engines, get_matrix_store, matrix_metadata_creator 5 | import datetime 6 | 7 | from tests.results_tests.factories import ( 8 | ModelFactory, 9 | FeatureImportanceFactory, 10 | ) 11 | 12 | 13 | def test_uniform_distribution(): 14 | with rig_engines() as (db_engine, project_storage): 15 | model = ModelFactory() 16 | feature_importances = [ 17 | FeatureImportanceFactory(model_rel=model, feature="feature_{}".format(i)) 18 | for i in range(0, 10) 19 | ] 20 | data_dict = {"entity_id": [1, 1], "as_of_date": ["2016-01-01", "2017-01-01"], "label": [0, 1]} 21 | for imp in feature_importances: 22 | data_dict[imp.feature] = [0.5, 0.5] 23 | metadata = matrix_metadata_creator() 24 | test_store = get_matrix_store( 25 | project_storage, 26 | pd.DataFrame.from_dict(data_dict), 27 | metadata, 28 | ) 29 | results = uniform_distribution( 30 | db_engine, 31 | model_id=model.model_id, 32 | as_of_date=datetime.date(2016, 1, 1), 33 | test_matrix_store=test_store, 34 | n_ranks=5, 35 | ) 36 | 37 | assert len(results) == 5 # 5 features x 1 entity for this as_of_date 38 | for result in results: 39 | assert "entity_id" in result 40 | assert "feature_name" in result 41 | assert "score" in result 42 | assert "feature_value" in result 43 | assert result["feature_value"] == 0.5 44 | assert result["score"]
>= 0 45 | assert result["score"] <= 1 46 | assert isinstance(result["feature_name"], str) 47 | assert result["entity_id"] in [1, 2] 48 | -------------------------------------------------------------------------------- /src/tests/catwalk_tests/test_metrics.py: -------------------------------------------------------------------------------- 1 | from triage.component.catwalk.metrics import fpr 2 | from triage.component.catwalk.evaluation import ModelEvaluator 3 | 4 | 5 | def test_metric_directionality(): 6 | """All metrics must be wrapped using the @Metric decorator available 7 | in catwalk.metrics to provide an `greater_is_better` attribute which must 8 | be one of True or False. 9 | """ 10 | for met in ModelEvaluator.available_metrics.values(): 11 | assert hasattr(met, "greater_is_better") 12 | assert met.greater_is_better in (True, False) 13 | 14 | 15 | def test_fpr(): 16 | predictions_binary = [1, 1, 1, 0, 0, 0, 0, 0] 17 | labels = [1, 1, 0, 1, 0, 0, 0, 1] 18 | 19 | result = fpr([], predictions_binary, labels, []) 20 | # false positives = 1 21 | # total negatives = 4 22 | assert result == 0.25 23 | -------------------------------------------------------------------------------- /src/tests/catwalk_tests/utils.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import random 3 | import tempfile 4 | from contextlib import contextmanager 5 | import pytest 6 | 7 | import numpy as np 8 | import pandas as pd 9 | import yaml 10 | 11 | from triage.component.catwalk.storage import ( 12 | ProjectStorage, 13 | ) 14 | from triage.util.structs import FeatureNameList 15 | 16 | 17 | def fake_labels(length): 18 | return np.array([random.choice([True, False]) for i in range(0, length)]) 19 | 20 | 21 | @pytest.fixture 22 | def sample_metadata(): 23 | return { 24 | "feature_start_time": datetime.date(2012, 12, 20), 25 | "end_time": datetime.date(2016, 12, 20), 26 | "label_name": "label", 27 | "as_of_date_frequency": "1w", 28 | "max_training_history": "5y", 29 | "state": "default", 30 | "cohort_name": "default", 31 | "label_timespan": "1y", 32 | "metta-uuid": "1234", 33 | "feature_names": FeatureNameList(["ft1", "ft2"]), 34 | "feature_groups": ["all: True"], 35 | "indices": ["entity_id"], 36 | } 37 | 38 | 39 | @pytest.fixture 40 | def sample_df(): 41 | return pd.DataFrame.from_dict( 42 | { 43 | "entity_id": [1, 2], 44 | "feature_one": [3, 4], 45 | "feature_two": [5, 6], 46 | "label": ["good", "bad"], 47 | } 48 | ).set_index("entity_id") 49 | 50 | 51 | @pytest.fixture 52 | def sample_matrix_store(sample_df, sample_metadata): 53 | with tempfile.TemporaryDirectory() as tempdir: 54 | project_storage = ProjectStorage(tempdir) 55 | store = project_storage.matrix_storage_engine().get_store("1234") 56 | store.matrix = sample_df 57 | store.metadata = sample_metadata 58 | return store 59 | -------------------------------------------------------------------------------- /src/tests/collate_tests/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | -------------------------------------------------------------------------------- /src/tests/collate_tests/create_inspections_subset.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | import random 3 | from contextlib import contextmanager 4 | 5 | import pandas as pd 6 | import requests 7 | from tqdm import tqdm 8 | import re 9 | 10 | 11 | @contextmanager 12 | def download(url): 13 | "download 
`url` to a file, returning the file name" 14 | with tempfile.NamedTemporaryFile(mode="wb") as f: 15 | response = requests.get(url, stream=True) 16 | for data in tqdm(response.iter_content()): 17 | f.write(data) 18 | f.flush() 19 | yield f.name 20 | 21 | 22 | def create_subset(src, dest, n=250): 23 | "Given a csv file `src`, create a subset `dest` with `n` unique entities" 24 | df = pd.read_csv(src) 25 | lics = pd.unique(df["License #"]) 26 | sublics = lics[random.sample(range(0, len(lics)), n)] 27 | subset = df[df["License #"].isin(sublics)] 28 | # Make the column names a little more readable 29 | subset.columns = map(clean_column_name, subset.columns) 30 | subset.to_csv(dest, index=False) 31 | 32 | 33 | def clean_column_name(col): 34 | col = col.lower() 35 | col = col.replace(" ", "_") 36 | col = col.replace("#", "no") 37 | return re.sub(r"[\W]+", "", col) 38 | 39 | 40 | if __name__ == "__main__": 41 | # download the entire Chicago restaurant inspections CSV file 42 | with download( 43 | "https://data.cityofchicago.org/api/views/4ijn-s7e5/rows.csv?accessType=DOWNLOAD" 44 | ) as f: 45 | create_subset(f, "food_inspections_subset.csv") 46 | -------------------------------------------------------------------------------- /src/tests/collate_tests/initialize_db.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import subprocess 3 | from sqlalchemy import create_engine 4 | 5 | 6 | DATA_NAME = "food_inspections_subset.csv" 7 | DATA_PATH = pathlib.Path(__file__).with_name(DATA_NAME) 8 | 9 | 10 | def handler(database): 11 | engine = create_engine(database.url()) 12 | connection = engine.connect() 13 | try: 14 | load_data(connection) 15 | finally: 16 | connection.close() 17 | 18 | 19 | def load_data(connection): 20 | connection.execute("DROP TABLE IF EXISTS food_inspections") 21 | subprocess.run( 22 | [ 23 | "csvsql", 24 | "-v", 25 | "--no-constraints", 26 | "--tables", 27 | "food_inspections", 28 | "--insert", 29 | "--db", 30 | str(connection.engine.url), 31 | str(DATA_PATH), 32 | ], 33 | check=True, 34 | ) 35 | connection.execute("CREATE INDEX ON food_inspections(license_no, inspection_date)") 36 | 37 | # create a state table for license/date 38 | connection.execute("DROP TABLE IF EXISTS inspection_states") 39 | connection.execute( 40 | """\ 41 | CREATE TABLE inspection_states AS ( 42 | SELECT license_no, date 43 | FROM (SELECT DISTINCT license_no FROM food_inspections) a 44 | CROSS JOIN (SELECT DISTINCT inspection_date as date FROM food_inspections) b 45 | )""" 46 | ) 47 | connection.execute("CREATE INDEX ON inspection_states(license_no, date)") 48 | 49 | # create an alternate state table with a different date column 50 | connection.execute("DROP TABLE IF EXISTS inspection_states_diff_colname") 51 | connection.execute( 52 | """\ 53 | CREATE TABLE inspection_states_diff_colname 54 | AS select license_no, date as aggregation_date 55 | FROM inspection_states 56 | """ 57 | ) 58 | connection.execute( 59 | """\ 60 | CREATE INDEX ON 61 | inspection_states_diff_colname(license_no, aggregation_date) 62 | """ 63 | ) 64 | 65 | # create a state table for license only 66 | connection.execute("DROP TABLE IF EXISTS all_licenses") 67 | connection.execute( 68 | """\ 69 | CREATE TABLE all_licenses AS ( 70 | SELECT DISTINCT license_no FROM food_inspections 71 | )""" 72 | ) 73 | connection.execute("CREATE INDEX ON all_licenses(license_no)") 74 | -------------------------------------------------------------------------------- /src/tests/example_schema.yaml:
-------------------------------------------------------------------------------- 1 | --- 2 | entities: 3 | - 4 | name: house 5 | attributes: 6 | siding: str 7 | construction_year: int 8 | - 9 | name: kid 10 | attributes: 11 | age: int 12 | - 13 | name: insurance_policy 14 | attributes: 15 | deductible: bool 16 | - 17 | name: address 18 | spatial: True 19 | attributes: 20 | street_address: str 21 | - 22 | name: inspection 23 | event: True 24 | attributes: 25 | result: bool 26 | 27 | relationships: 28 | - 29 | name: residency 30 | entity_one: house 31 | entity_two: kid 32 | type: m2m 33 | temporal: True 34 | - 35 | entity_one: house 36 | entity_two: insurance_policy 37 | type: o2m 38 | temporal: True 39 | - 40 | entity_one: house 41 | entity_two: address 42 | type: o2m 43 | - 44 | entity_one: house 45 | entity_two: inspection 46 | type: o2m 47 | primary_entity: kid 48 | outcome_variable: lead_level 49 | -------------------------------------------------------------------------------- /src/tests/postmodeling_tests/test_crosstabs.py: -------------------------------------------------------------------------------- 1 | from triage.component.postmodeling.crosstabs import run_crosstabs 2 | from triage.database_reflection import table_has_data 3 | 4 | 5 | def test_run_crosstabs(finished_experiment, crosstabs_config): 6 | run_crosstabs(finished_experiment.db_engine, crosstabs_config) 7 | expected_table_name = ( 8 | crosstabs_config.output["schema"] + "." + crosstabs_config.output["table"] 9 | ) 10 | table_has_data(expected_table_name, finished_experiment.db_engine) 11 | -------------------------------------------------------------------------------- /src/tests/postmodeling_tests/test_without_predictions.py: -------------------------------------------------------------------------------- 1 | from triage.component.postmodeling.deprecated.model_group_evaluator import ModelGroupEvaluator 2 | from triage.component.postmodeling.deprecated.model_evaluator import ModelEvaluator 3 | import pytest 4 | 5 | 6 | @pytest.fixture(scope="module") 7 | def model_group_evaluator(finished_experiment_without_predictions): 8 | return ModelGroupEvaluator((1, 1), finished_experiment_without_predictions.db_engine) 9 | 10 | 11 | @pytest.fixture(scope="module") 12 | def model_evaluator(finished_experiment_without_predictions): 13 | return ModelEvaluator(1, 1, finished_experiment_without_predictions.db_engine) 14 | 15 | 16 | def test_ModelGroupEvaluator_metadata(model_group_evaluator): 17 | assert all(value for metadata_row in model_group_evaluator.metadata for key, value in metadata_row.items() ) 18 | 19 | 20 | def test_ModelGroupEvaluator_predictions(model_group_evaluator): 21 | with pytest.raises(RuntimeError): 22 | model_group_evaluator.predictions 23 | 24 | 25 | def test_ModelEvaluator_metadata(model_evaluator): 26 | assert all(value for key, value in model_evaluator.metadata.items()) 27 | 28 | 29 | def test_ModelEvaluator_predictions(model_evaluator): 30 | with pytest.raises(RuntimeError): 31 | model_evaluator.predictions 32 | -------------------------------------------------------------------------------- /src/tests/results_tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/src/tests/results_tests/__init__.py -------------------------------------------------------------------------------- /src/tests/results_tests/test_upgrade_if_clean.py: 
-------------------------------------------------------------------------------- 1 | from triage.component import results_schema 2 | from alembic import command, script 3 | import pytest 4 | 5 | 6 | def test_upgrade_if_clean_upgrades_if_clean(db_engine): 7 | results_schema.upgrade_if_clean(db_engine.url) 8 | db_version = db_engine.execute("select version_num from results_schema_versions").scalar() 9 | alembic_cfg = results_schema.alembic_config(db_engine.url) 10 | assert db_version == script.ScriptDirectory.from_config(alembic_cfg).get_current_head() 11 | 12 | 13 | def test_upgrade_if_clean_does_not_upgrade_if_not_clean(db_engine): 14 | command.upgrade(results_schema.alembic_config(dburl=db_engine.url), "head") 15 | command.downgrade(results_schema.alembic_config(dburl=db_engine.url), "-1") 16 | with pytest.raises(ValueError): 17 | results_schema.upgrade_if_clean(db_engine.url) 18 | -------------------------------------------------------------------------------- /src/tests/results_tests/test_valid_schema.py: -------------------------------------------------------------------------------- 1 | import testing.postgresql 2 | from sqlalchemy import create_engine 3 | 4 | from triage.component.results_schema import Base 5 | 6 | 7 | def test_full_schema(): 8 | with testing.postgresql.Postgresql() as postgres: 9 | engine = create_engine(postgres.url()) 10 | Base.metadata.create_all(bind=engine) 11 | -------------------------------------------------------------------------------- /src/tests/test_utils_pandas.py: -------------------------------------------------------------------------------- 1 | from triage.util.pandas import downcast_matrix 2 | from triage.component.catwalk.storage import MatrixStore 3 | from .utils import matrix_creator 4 | 5 | 6 | def test_downcast_matrix(): 7 | df = matrix_creator().set_index(MatrixStore.indices) 8 | downcasted_df = downcast_matrix(df) 9 | 10 | # make sure the contents are equivalent 11 | assert((downcasted_df == df).all().all()) 12 | 13 | # make sure the memory usage is lower because there would be no point of this otherwise 14 | assert downcasted_df.memory_usage().sum() < df.memory_usage().sum() 15 | -------------------------------------------------------------------------------- /src/tests/test_validation.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy import create_engine 2 | import testing.postgresql 3 | from unittest import mock 4 | 5 | from triage.component.catwalk.db import ensure_db 6 | 7 | from tests.utils import sample_config, populate_source_data, open_side_effect 8 | from triage.experiments.validate import ExperimentValidator 9 | 10 | 11 | def test_experiment_validator(): 12 | with testing.postgresql.Postgresql() as postgresql: 13 | db_engine = create_engine(postgresql.url()) 14 | ensure_db(db_engine) 15 | populate_source_data(db_engine) 16 | with mock.patch( 17 | "triage.util.conf.open", side_effect=open_side_effect 18 | ) as mock_file: 19 | ExperimentValidator(db_engine).run(sample_config("query")) 20 | ExperimentValidator(db_engine).run(sample_config("filepath")) 21 | -------------------------------------------------------------------------------- /src/tests/test_validation_primitives.py: -------------------------------------------------------------------------------- 1 | from triage.validation_primitives import string_is_tablesafe 2 | from hypothesis import given, example 3 | from hypothesis.strategies import text, characters 4 | 5 | 6 | # test with a variety of strings based on letters 
and numbers auto-generated by hypothesis 7 | # and also add a hardcoded example that includes underscores because those are fine 8 | @given(text(alphabet=characters(whitelist_categories=('Ll', 'Nd')), min_size=1)) 9 | @example('a_valid_name') 10 | def test_string_is_tablesafe(s): 11 | assert string_is_tablesafe(s) 12 | 13 | 14 | # test with a variety of strings based on unsafe characters auto-generated by hypothesis 15 | # and also add a hardcoded example that should be bad because it has spaces 16 | @given(text(alphabet='/ "A')) 17 | @example('spaces are not valid') 18 | @example('Neither_are_CAPITAL_letters') 19 | def test_string_is_not_tablesafe(s): 20 | assert not string_is_tablesafe(s) 21 | -------------------------------------------------------------------------------- /src/tests/timechop_tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/src/tests/timechop_tests/__init__.py -------------------------------------------------------------------------------- /src/tests/timechop_tests/test_plotting.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import patch 2 | from unittest import TestCase 3 | import yaml 4 | import matplotlib 5 | 6 | matplotlib.use("Agg") 7 | from triage.component.timechop import Timechop # noqa 8 | from triage.component.timechop.plotting import visualize_chops # noqa 9 | 10 | 11 | class VisualizeChopTest(TestCase): 12 | @property 13 | def chopper(self): 14 | # create a valid Timechop chopper 15 | # least brittle current way of doing this is by loading the 16 | # example_experiment_config.yaml file, because that is a 17 | # diligently updated file. If Timechop config changes, the 18 | # example config should change too 19 | with open("example/config/experiment.yaml") as fd: 20 | experiment_config = yaml.full_load(fd) 21 | return Timechop(**(experiment_config["temporal_config"])) 22 | 23 | # hard to make many assertions, but we can make sure it gets to the end 24 | # and shows the contents. 
25 | 26 | # we do one such test case to work out each combination of boolean arguments 27 | def test_default_args(self): 28 | with patch("triage.component.timechop.plotting.plt.show") as show_patch: 29 | visualize_chops(self.chopper) 30 | assert show_patch.called 31 | 32 | def test_no_as_of_times(self): 33 | with patch("triage.component.timechop.plotting.plt.show") as show_patch: 34 | visualize_chops(self.chopper, show_as_of_times=False) 35 | assert show_patch.called 36 | 37 | def test_no_boundaries(self): 38 | with patch("triage.component.timechop.plotting.plt.show") as show_patch: 39 | visualize_chops(self.chopper, show_boundaries=False) 40 | assert show_patch.called 41 | 42 | def test_no_boundaries_or_as_of_times(self): 43 | with patch("triage.component.timechop.plotting.plt.show") as show_patch: 44 | visualize_chops(self.chopper, show_as_of_times=False, show_boundaries=False) 45 | assert show_patch.called 46 | -------------------------------------------------------------------------------- /src/tests/timechop_tests/test_utils.py: -------------------------------------------------------------------------------- 1 | from triage.component.timechop.utils import convert_to_list 2 | 3 | 4 | def test_convert_to_list(): 5 | tests = [ 6 | {"val": "1 day", "expected_result": ["1 day"]}, 7 | {"val": ["1 day"], "expected_result": ["1 day"]}, 8 | {"val": 1, "expected_result": [1]}, 9 | ] 10 | for test in tests: 11 | assert convert_to_list(test["val"]) == test["expected_result"] 12 | -------------------------------------------------------------------------------- /src/triage/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | __author__ = """Center for Data Science and Public Policy""" 4 | __email__ = "datascifellows@gmail.com" 5 | __version__ = '5.5.1' # do not change to double-quotes, it will screw up bumpversion 6 | 7 | import logging 8 | import logging.config 9 | import yaml 10 | import pathlib 11 | 12 | 13 | logging_config = pathlib.Path(__file__).parent / 'config' / 'logging.yaml' 14 | 15 | with open(logging_config, 'r') as f: 16 | config = yaml.safe_load(f.read()) 17 | logging.config.dictConfig(config) 18 | 19 | 20 | from .util.db import create_engine 21 | 22 | 23 | 24 | __all__ = ('create_engine',) 25 | -------------------------------------------------------------------------------- /src/triage/component/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/src/triage/component/__init__.py -------------------------------------------------------------------------------- /src/triage/component/architect/README.md: -------------------------------------------------------------------------------- 1 | # The Architect 2 | 3 | Plan, design, and build train and test matrices 4 | 5 | [![Build Status](https://travis-ci.org/dssg/architect.svg?branch=master)](https://travis-ci.org/dssg/architect) 6 | [![codecov](https://codecov.io/gh/dssg/architect/branch/master/graph/badge.svg)](https://codecov.io/gh/dssg/architect) 7 | [![codeclimate](https://codeclimate.com/github/dssg/architect.png)](https://codeclimate.com/github/dssg/architect) 8 | 9 | In order to run classification algorithms on source data, this data must be properly organized into design matrices. 
Converting cleaned data into these matrices is not a trivial task; the process of creating the needed features and labels for an experiment from source data can be complicated, creating the matrices themselves out of features and labels can be inefficient, and there is opportunity at each step to leak data backwards in time, giving a model trained on a matrix an unfair advantage. 10 | 11 | The Architect addresses these issues with functionality aimed at all tasks between cleaned source data (in a PostgreSQL database) and design matrices. 12 | 13 | ## Components 14 | 15 | - [LabelGenerator](architect/label_generators.py): Create binary labels suitable for a design matrix by querying a database table containing outcome events. 16 | - [FeatureGenerator](architect/feature_generators.py): Create aggregate features suitable for a design matrix from a set of database tables containing events. Uses [collate](https://github.com/dssg/collate/) to build aggregation SQL queries. 17 | - [FeatureGroupCreator](architect/feature_group_creator.py), [FeatureGroupMixer](architect/feature_group_mixer.py): Create groupings of features, and mix them using different strategies (like 'leave one out') to test their effectiveness. 18 | - [Planner](architect/planner.py), [Builder](architect/builders.py): Build all design matrices needed for an experiment, taking into account different labels, state configurations, and feature groups. 19 | 20 | In addition to being usable individually to assist in different aspects of building matrices in your project, the Architect components are integrated into [triage](https://github.com/dssg/triage) as part of an entire modeling experiment that incorporates later tasks like model training and testing. 21 | 22 | ## Distributing, Building & Testing 23 | 24 | The Architect is a Python package distributable via `setuptools`. It may be installed directly using `easy_install` or `pip`, or listed as a dependency of another package (namely `triage`), under the package name `matrix-architect`. 25 | 26 | To build this package for development, its dependencies may be installed using `pip`: 27 | 28 | pip install -r requirements_dev.txt 29 | 30 | (or, without test and development dependencies, using **requirements.txt**). 31 | 32 | And, having built for development, to run tests: 33 | 34 | pytest 35 | -------------------------------------------------------------------------------- /src/triage/component/architect/__init__.py: -------------------------------------------------------------------------------- 1 | """Main application""" 2 | from .planner import Planner 3 | from .
import builders 4 | 5 | __all__ = ("Planner", "builders") 6 | -------------------------------------------------------------------------------- /src/triage/component/architect/feature_dictionary_creator.py: -------------------------------------------------------------------------------- 1 | import verboselogs, logging 2 | logger = verboselogs.VerboseLogger(__name__) 3 | 4 | from triage.component.architect.utils import str_in_sql 5 | from triage.util.structs import FeatureNameList 6 | 7 | 8 | class FeatureDictionaryCreator: 9 | def __init__(self, features_schema_name, db_engine): 10 | self.features_schema_name = features_schema_name 11 | self.db_engine = db_engine 12 | 13 | def _tables_to_include(self, feature_table_names): 14 | return [ 15 | feature_table 16 | for feature_table in feature_table_names 17 | if "aggregation_imputed" in feature_table 18 | ] 19 | 20 | def feature_dictionary(self, feature_table_names, index_column_lookup): 21 | """ Create a dictionary of feature names, where keys are feature tables 22 | and values are lists of feature names. 23 | 24 | :return: feature_dictionary 25 | :rtype: dict 26 | """ 27 | feature_dictionary = {} 28 | 29 | # iterate! store each table name + features names as key-value pair 30 | for feature_table_name in self._tables_to_include(feature_table_names): 31 | feature_names = [ 32 | row[0] 33 | for row in self.db_engine.execute( 34 | self._build_feature_names_query( 35 | feature_table_name, index_column_lookup[feature_table_name] 36 | ) 37 | ) 38 | ] 39 | feature_dictionary[feature_table_name] = FeatureNameList(feature_names) 40 | logger.spam(f"Feature dictionary built: {feature_dictionary}") 41 | return feature_dictionary 42 | 43 | def _build_feature_names_query(self, table_name, index_columns): 44 | """ For a given feature table, get the names of the feature columns. 
45 | 46 | :param table_name: name of the feature table 47 | :type table_name: str 48 | 49 | :return: names of the feature columns in given table 50 | :rtype: list 51 | """ 52 | # format the query that gets column names, 53 | # excluding indices from result 54 | feature_names_query = f""" 55 | SELECT column_name 56 | FROM information_schema.columns 57 | WHERE table_name = '{table_name}' AND 58 | table_schema = '{self.features_schema_name}' AND 59 | column_name NOT IN ({str_in_sql(index_columns)}) 60 | """ 61 | logger.spam( 62 | f"Extracting all possible feature names for table {table_name} with query {feature_names_query}" 63 | ) 64 | 65 | return feature_names_query 66 | -------------------------------------------------------------------------------- /src/triage/component/architect/features.py: -------------------------------------------------------------------------------- 1 | from triage.component.architect.feature_generators import FeatureGenerator 2 | from triage.component.architect.feature_dictionary_creator import ( 3 | FeatureDictionaryCreator, 4 | ) 5 | from triage.component.architect.feature_group_creator import FeatureGroupCreator 6 | from triage.component.architect.feature_group_mixer import FeatureGroupMixer 7 | 8 | __all__ = ( 9 | "FeatureGenerator", 10 | "FeatureDictionaryCreator", 11 | "FeatureGroupCreator", 12 | "FeatureGroupMixer", 13 | ) 14 | -------------------------------------------------------------------------------- /src/triage/component/audition/utils.py: -------------------------------------------------------------------------------- 1 | def make_list(a): 2 | return [a] if not isinstance(a, list) else a 3 | 4 | 5 | def str_in_sql(values): 6 | return ",".join(map(lambda x: "'{}'".format(x), values)) 7 | -------------------------------------------------------------------------------- /src/triage/component/catwalk/README.rst: -------------------------------------------------------------------------------- 1 | ======= 2 | Catwalk 3 | ======= 4 | 5 | Training, testing, and evaluating machine learning classifier models 6 | 7 | At the core of many predictive analytics applications is the need to train classifiers on large sets of design matrices, test and temporally cross-validate them, and generate evaluation metrics about them. 8 | 9 | Python's scikit-learn package provides much of this functionality, but it is not trivial to design large experiments with it in a persistable way. Catwalk builds upon the functionality offered by scikit-learn by implementing: 10 | 11 | - Saving of modeling results and metadata in a Postgres database for later analysis 12 | - Exposure of computationally-intensive tasks as discrete workloads that can be used with different parallelization solutions (e.g. multiprocessing, Celery) 13 | - Different model persistence strategies such as on-filesystem or Amazon S3, that can be easily switched between 14 | - Hashing classifier model configuration to only retrain a model if necessary.
15 | - Various best practices in areas like input scaling for different classifier types and feature importance 16 | - Common scikit-learn model evaluation metrics as well as the ability to bundle custom evaluation metrics 17 | - Custom model wrappers for classifiers 18 | - 'Baseline' classes that generate classifications or predictions based on pre-determined rules, to be used for evaluating predictive models against simple heuristics 19 | -------------------------------------------------------------------------------- /src/triage/component/catwalk/baselines/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/src/triage/component/catwalk/baselines/__init__.py -------------------------------------------------------------------------------- /src/triage/component/catwalk/db.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | from sqlalchemy import create_engine 3 | from sqlalchemy.engine.url import URL 4 | from sqlalchemy.pool import QueuePool 5 | 6 | from triage.component.results_schema import Base 7 | 8 | 9 | def ensure_db(engine): 10 | Base.metadata.create_all(engine) 11 | 12 | 13 | def connect(poolclass=QueuePool): 14 | with open("database.yaml") as fd: 15 | config = yaml.full_load(fd) 16 | dburl = URL( 17 | "postgres", 18 | host=config["host"], 19 | username=config["user"], 20 | database=config["db"], 21 | password=config["pass"], 22 | port=config["port"], 23 | ) 24 | return create_engine(dburl, poolclass=poolclass) 25 | -------------------------------------------------------------------------------- /src/triage/component/catwalk/estimators/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/src/triage/component/catwalk/estimators/__init__.py -------------------------------------------------------------------------------- /src/triage/component/catwalk/estimators/transformers.py: -------------------------------------------------------------------------------- 1 | import verboselogs, logging 2 | logger = verboselogs.VerboseLogger(__name__) 3 | 4 | import numpy as np 5 | from sklearn.base import BaseEstimator, TransformerMixin 6 | from sklearn.utils import check_array 7 | 8 | class CutOff(BaseEstimator, TransformerMixin): 9 | """Transform features by cutting values outside the established range 10 | 11 | Args: 12 | feature_range: Range of allowed values, default=`(0,1)` 13 | 14 | Usage: 15 | The recommended way of using this is:: 16 | 17 | from sklearn.pipeline import Pipeline 18 | 19 | minmax_scaler = preprocessing.MinMaxScaler() 20 | dsapp_cutoff = CutOff() 21 | lr = linear_model.LogisticRegression() 22 | 23 | pipeline = Pipeline([ 24 | ('minmax_scaler',minmax_scaler), 25 | ('dsapp_cutoff', dsapp_cutoff), 26 | ('lr', lr) 27 | ]) 28 | 29 | pipeline.fit(X_train, y_train) 30 | pipeline.predict(X_test) 31 | 32 | """ 33 | 34 | def __init__(self, feature_range=(0, 1), copy=True): 35 | self.feature_range = feature_range 36 | self.copy = copy 37 | 38 | 39 | def fit(self, X, y=None): 40 | return self 41 | 42 | 43 | def transform(self, X): 44 | feature_range = self.feature_range 45 | 46 | X = check_array(X, copy=self.copy, ensure_2d=True) 47 | 48 | if np.any(X > feature_range[1]) or np.any(X < feature_range[0]): 49 | logger.notice( 50 | f"You got feature values that are out of the range:
--------------------------------------------------------------------------------
/src/triage/component/catwalk/exceptions.py:
--------------------------------------------------------------------------------
1 | __all__ = ["BaselineFeatureNotInMatrix"]
2 | 
3 | 
4 | class BaselineFeatureNotInMatrix(KeyError):
5 |     """ This error is used to allow feature mixing and baseline classes to be
6 |     included in the same experiment.
7 | 
8 |     Without error handling, the baseline classes would cause the experiment to
9 |     end prematurely when they received a matrix without the required feature
10 |     (if, for example, leave-one-out feature mixing is enabled). Raising this
11 |     error will cause the model to be skipped gracefully.
12 |     """
13 | 
--------------------------------------------------------------------------------
/src/triage/component/catwalk/feature_importances.py:
--------------------------------------------------------------------------------
1 | import verboselogs, logging
2 | logger = verboselogs.VerboseLogger(__name__)
3 | 
4 | 
5 | import numpy as np
6 | import sklearn.linear_model
7 | from sklearn.svm import SVC
8 | from triage.component.catwalk.estimators.classifiers import ScaledLogisticRegression
9 | 
10 | 
11 | def _ad_hoc_feature_importances(model):
12 |     """
13 |     Get the "ad-hoc feature importances" for scikit-learn's models
14 |     lacking the `feature_importances_` attribute
15 | 
16 |     Args:
17 |         model: A trained model that does not have a `feature_importances_` attribute
18 | 
19 |     Returns:
20 |         At the moment, this method only returns the odds ratios of the
21 |         coefficients given by sklearn's implementation of the
22 |         LogisticRegression (the intercept is deliberately ignored; see the
23 |         commented-out line below), or the raw coefficients of a
24 |         linear-kernel SVC. For any other model it returns None.
25 |     """
26 |     feature_importances = None
27 | 
28 |     if (isinstance(model, (sklearn.linear_model.LogisticRegression)) or
29 |         isinstance(model, (ScaledLogisticRegression))):
30 |         coef_odds_ratio = np.exp(model.coef_)
31 |         # intercept_odds_ratio = np.exp(model.intercept_[:,np.newaxis])
32 |         # We are ignoring the intercept
33 | 
34 |         # NOTE: We need to squeeze this array so it has the correct dimensions
35 |         feature_importances = coef_odds_ratio.squeeze()
36 | 
37 |     elif isinstance(model, (SVC)) and (model.get_params()["kernel"] == "linear"):
38 |         feature_importances = model.coef_.squeeze()
39 | 
40 |     return feature_importances
41 | 
42 | 
43 | def get_feature_importances(model):
44 |     """
45 |     Get feature importances (from scikit-learn) of a trained model.
46 | 
47 |     Args:
48 |         model: Trained model
49 | 
50 |     Returns:
51 |         Feature importances, or failing that, None
52 |     """
53 |     feature_importances = None
54 | 
55 |     if hasattr(model, "feature_importances_"):
56 |         feature_importances = model.feature_importances_
57 | 
58 |     else:
59 |         logger.warning(
60 |             "The selected algorithm doesn't support a standard way "
61 |             "of calculating the importance of each feature used. "
62 |             "Falling back to ad-hoc methods "
63 |             "(e.g. for LogisticRegression we return odds ratios instead of coefficients)"
64 |         )
65 | 
66 |         feature_importances = _ad_hoc_feature_importances(model)
67 | 
68 |     # if we just ended up with a scalar (e.g., single feature logit), ensure we return an array
69 |     if isinstance(feature_importances, np.ndarray) and feature_importances.shape == ():
70 |         feature_importances = feature_importances.reshape((1,))
71 | 
72 |     return feature_importances
73 | 
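A small end-to-end check of the odds-ratio fallback in `get_feature_importances`, using toy data (nothing Triage-specific is assumed; the fallback warning above is logged along the way):

    >>> import numpy as np
    >>> from sklearn.linear_model import LogisticRegression
    >>> X = np.array([[0., 1.], [1., 0.], [1., 1.], [0., 0.]])
    >>> y = np.array([1, 0, 1, 0])
    >>> importances = get_feature_importances(LogisticRegression().fit(X, y))
    >>> importances.shape  # one odds ratio, exp(coef), per feature
    (2,)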
--------------------------------------------------------------------------------
/src/triage/component/collate/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from .collate import available_imputations, Aggregation, Aggregate, Compare, Categorical
3 | from .from_obj import FromObj
4 | from .spacetime import SpacetimeAggregation
5 | 
6 | __all__ = [
7 |     "available_imputations",
8 |     "Aggregation",
9 |     "Aggregate",
10 |     "FromObj",
11 |     "Compare",
12 |     "Categorical",
13 |     "SpacetimeAggregation",
14 | ]
15 | __author__ = """DSaPP Researchers"""
16 | __email__ = "datascifellows@gmail.com"
17 | 
--------------------------------------------------------------------------------
/src/triage/component/collate/sql.py:
--------------------------------------------------------------------------------
1 | import sqlalchemy.sql.expression as ex
2 | from sqlalchemy.ext.compiler import compiles
3 | 
4 | 
5 | def make_sql_clause(s, constructor):
6 |     if not isinstance(s, ex.ClauseElement):
7 |         return constructor(s)
8 |     else:
9 |         return s
10 | 
11 | 
12 | class CreateTableAs(ex.Executable, ex.ClauseElement):
13 |     def __init__(self, name, query):
14 |         self.name = name
15 |         self.query = query
16 | 
17 | 
18 | @compiles(CreateTableAs)
19 | def _create_table_as(element, compiler, **kw):
20 |     return "CREATE TABLE %s AS %s" % (element.name, compiler.process(element.query))
21 | 
22 | 
23 | class InsertFromSelect(ex.Executable, ex.ClauseElement):
24 |     def __init__(self, name, query):
25 |         self.name = name
26 |         self.query = query
27 | 
28 | 
29 | @compiles(InsertFromSelect)
30 | def _insert_from_select(element, compiler, **kw):
31 |     return "INSERT INTO %s (%s)" % (element.name, compiler.process(element.query))
32 | 
33 | 
34 | def to_sql_name(name):
35 |     return name.replace('"', "")
36 | 
--------------------------------------------------------------------------------
/src/triage/component/postmodeling/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/src/triage/component/postmodeling/__init__.py
--------------------------------------------------------------------------------
/src/triage/component/postmodeling/add_predictions_example_config.yaml:
--------------------------------------------------------------------------------
1 | # Path where the models and matrices are stored
2 | project_path: 'path/to/models/and/matrices'
3 | 
4 | # Model group ids we need predictions for
5 | # List of integers
6 | model_group_ids:
7 |   - 1
8 |   - 2
9 | 
10 | # The following parameters are optional
11 | # These will help narrow down the model_ids in the above model groups in case you are not interested in all the models in a group
12 | # If these are not specified, all the models in the group will be scored
13 | # Either (or both) can be specified independently of the other
14 | 
15 | # Narrowing down by the experiment hash(es)
16 | # If this is provided, only the model ids relevant to these experiment hashes will be scored
17 | experiments:
18 |   - 'experiment_hash1'
19 |   - 'experiment_hash2'
20 | 
21 | # Narrowing down by the train_end_time
22 | # Here you can score models that are trained with data from a certain time period
23 | # range_start_date is the start of the period and range_end_date is its end
24 | # All models (in the above model groups) that have a train_end_time falling within this range will be scored
25 | # Range end points are inclusive
26 | # If you only specify one limit, an open ended interval is used.
27 | train_end_times:
28 |   range_start_date: '1970-01-01' # If only this is specified, all train_end_times on and after this date will be included
29 |   range_end_date: '1980-01-01' # If only this is specified, all train_end_times on and before this date will be included
30 | 
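Stepping back to the `CreateTableAs` and `InsertFromSelect` clauses defined in `collate/sql.py` above: since they render through SQLAlchemy's compiler, the generated SQL can be inspected with a plain `print`. A minimal sketch (the table name and query are made up):

    >>> import sqlalchemy as sa
    >>> print(CreateTableAs("features.tmp_agg", sa.text("SELECT 1 AS x")))
    CREATE TABLE features.tmp_agg AS SELECT 1 AS x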
--------------------------------------------------------------------------------
/src/triage/component/postmodeling/deprecated/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/src/triage/component/postmodeling/deprecated/__init__.py
--------------------------------------------------------------------------------
/src/triage/component/postmodeling/deprecated/db_credentials_example.yaml:
--------------------------------------------------------------------------------
1 | host:
2 | dbname:
3 | user:
4 | password:
5 | port:
6 | role: ''
7 | 
--------------------------------------------------------------------------------
/src/triage/component/postmodeling/deprecated/parameters.py:
--------------------------------------------------------------------------------
1 | """
2 | Postmodeling parameters
3 | 
4 | This module contains the parameters class used across all the
5 | postmodeling functions within the ModelEvaluator and ModelGroupEvaluator
6 | classes. The class is initialized from the 'postmodeling_parameters.yaml'
7 | file
8 | 
9 | """
10 | 
11 | import yaml
12 | import json
13 | 
14 | import verboselogs, logging
15 | logger = verboselogs.VerboseLogger(__name__)
16 | 
17 | class PostmodelParameters:
18 |     '''
19 |     PostmodelParameters reads all parameters from a 'yaml' file and stores them
20 |     on this object to be used by other functions. Different metrics can be
21 |     passed to this object; by default it reads from a
22 |     'postmodeling_parameters.yaml' file, but an Audition config file can be
23 |     passed and the needed parameters will be parsed from it
24 |     '''
25 |     def __init__(self, path_params):
26 | 
27 |         with open(path_params) as f:
28 |             params = yaml.full_load(f)
29 | 
30 |         # Assign dict elements to Parameters object and flatten
31 |         # thresholds
32 |         self.__dict__.update(params)
33 |         self.figsize = tuple(self.figsize)
34 | 
35 |         try:
36 |             if self.audition_output_path is not None:
37 |                 with open(self.audition_output_path) as f:
38 |                     json_models = json.load(f)
39 | 
40 |                 list_models = [model for model_list in json_models.values()
41 |                                for model in model_list]
42 |                 self.model_group_id = list_models
43 | 
44 |         except AttributeError:
45 |             logger.exception(
46 |                 f'''No audition output file was defined. I will use the models
47 |                 defined in the {path_params} configuration file.'''
48 |             )
49 | 
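A minimal usage sketch for `PostmodelParameters` (the path follows the docstring's default; the YAML is assumed to contain at least a `figsize` key, since the constructor coerces it to a tuple, and the values shown are hypothetical):

    >>> params = PostmodelParameters('postmodeling_parameters.yaml')
    >>> params.figsize
    (12, 6)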
--------------------------------------------------------------------------------
/src/triage/component/postmodeling/deprecated/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/src/triage/component/postmodeling/deprecated/utils/__init__.py
--------------------------------------------------------------------------------
/src/triage/component/postmodeling/deprecated/utils/aux_funcs.py:
--------------------------------------------------------------------------------
1 | '''
2 | Auxiliary functions and helpers:
3 | 
4 | This set of functions contains helpers to format data
5 | (i.e., prediction matrices, etc.) for plotting. These functions
6 | are called by both the Model class and the ModelGroup class in
7 | evaluation.py.
8 | '''
9 | 
10 | from sqlalchemy import create_engine
11 | from sqlalchemy.sql import text
12 | from collections import namedtuple
13 | import yaml
14 | 
15 | import verboselogs, logging
16 | logger = verboselogs.VerboseLogger(__name__)
17 | 
18 | 
19 | ModelEvaluator = namedtuple('ModelEvaluator',
20 |                             ('model_group_id', 'model_id'))
21 | 
22 | 
23 | def create_pgconn(credentials_yaml):
24 |     '''
25 |     Create a SQLAlchemy engine for a SQL connection, abiding by the new
26 |     dssg/dsapp db user configuration.
27 | 
28 |     Arguments:
29 |         - credentials_yaml: .yaml file with db credentials
30 |     '''
31 |     with open(credentials_yaml) as f:
32 |         configs = yaml.full_load(f)
33 |     try:
34 |         conn = create_engine("postgresql://{user}:{password}@{host}:{port}/{dbname}".format(**configs))
35 |     except Exception:
36 |         logger.error("Error connecting to db.")
37 |         raise
38 |     return conn
39 | 
40 | 
41 | def get_models_ids(audited_model_group_ids, conn):
42 |     '''
43 |     This helper function retrieves the model_id's from a set
44 |     of model_group_ids and instantiates each model as a
45 |     ModelEvaluator namedtuple.
46 | 
47 |     Arguments:
48 |         - audited_model_group_ids: List of model_group_ids
49 |           (ideally from Audition's output)
50 |         - conn: sql engine
51 | 
52 |     This function will return a list of ModelEvaluator objects
53 |     '''
54 | 
55 |     query = conn.execute(text("""
56 |         SELECT model_group_id,
57 |                model_id
58 |         FROM triage_metadata.models
59 |         WHERE model_group_id = ANY(:ids);
60 |     """), ids=audited_model_group_ids)
61 | 
62 |     return [ModelEvaluator._make(row) for row in query]
63 | 
--------------------------------------------------------------------------------
/src/triage/component/postmodeling/fairness/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/src/triage/component/postmodeling/fairness/__init__.py
--------------------------------------------------------------------------------
/src/triage/component/postmodeling/fairness/aequitas_utils.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | 
3 | def get_aequitas_results(engine, parameter, schema="test_results", table="aequitas", model_id=None, subset_hash="", tie_breaker="worst"):
4 |     ''' This function returns the current contents of the aequitas table.
5 | 
6 |     Args:
7 |         - engine: SQLAlchemy engine connected to the database
8 |         - parameter: A string that indicates any parameters for the metric (ex. `100_abs` indicates top-100 entities)
9 |         - schema: Database schema to find the table within
10 |         - table: Database table to select data from
11 |         - model_id: A model_id, to query only for results of that model
12 |         - subset_hash: Identifies the subset for the evaluation
13 |         - tie_breaker: Indicates how ties are broken
14 | 
15 |     Returns: A DataFrame, corresponding to schema.table
16 |     '''
17 | 
18 |     query = f"""SELECT * FROM {schema}.{table}
19 |                 WHERE parameter = '{parameter}'
20 |                 AND subset_hash = '{subset_hash}'
21 |                 AND tie_breaker = '{tie_breaker}'
22 |             """
23 |     if model_id:
24 |         query += f" AND model_id = {model_id}"
25 |     return pd.read_sql(query, con=engine)
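A hedged usage sketch for `get_aequitas_results`; the connection string and model id are placeholders, and a populated `test_results.aequitas` table is assumed:

    >>> from sqlalchemy import create_engine
    >>> engine = create_engine("postgresql://user:pass@host:5432/db")  # hypothetical DSN
    >>> df = get_aequitas_results(engine, parameter="100_abs", model_id=42)

Note that `parameter`, `subset_hash`, and `tie_breaker` are interpolated directly into the query string, so they should come from trusted configuration rather than user input.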
--------------------------------------------------------------------------------
/src/triage/component/postmodeling/postmodeling_config.yaml:
--------------------------------------------------------------------------------
1 | error_analysis:
2 |   # Size of the list.
3 |   k: [100]
4 | 
5 |   # Parameter grid to try on the DecisionTreeClassifier trained for the error analysis.
6 |   model_params:
7 |     max_depth: [5]
8 | 
9 |   # Flag to define if plots are going to be displayed (True) or saved (False)
10 |   view_plots: False
--------------------------------------------------------------------------------
/src/triage/component/postmodeling/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/src/triage/component/postmodeling/utils/__init__.py
--------------------------------------------------------------------------------
/src/triage/component/results_schema/alembic.ini:
--------------------------------------------------------------------------------
1 | [alembic]
2 | script_location = %(here)s/alembic
3 | 
4 | [exclude]
5 | tables = predictions_\d+
6 | 
--------------------------------------------------------------------------------
/src/triage/component/results_schema/alembic/README:
--------------------------------------------------------------------------------
1 | Generic single-database configuration.
--------------------------------------------------------------------------------
/src/triage/component/results_schema/alembic/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/src/triage/component/results_schema/alembic/__init__.py
--------------------------------------------------------------------------------
/src/triage/component/results_schema/alembic/script.py.mako:
--------------------------------------------------------------------------------
1 | """${message}
2 | 
3 | Revision ID: ${up_revision}
4 | Revises: ${down_revision | comma,n}
5 | Create Date: ${create_date}
6 | 
7 | """
8 | from alembic import op
9 | import sqlalchemy as sa
10 | ${imports if imports else ""}
11 | 
12 | # revision identifiers, used by Alembic.
13 | revision = ${repr(up_revision)} 14 | down_revision = ${repr(down_revision)} 15 | branch_labels = ${repr(branch_labels)} 16 | depends_on = ${repr(depends_on)} 17 | 18 | 19 | def upgrade(): 20 | ${upgrades if upgrades else "pass"} 21 | 22 | 23 | def downgrade(): 24 | ${downgrades if downgrades else "pass"} 25 | -------------------------------------------------------------------------------- /src/triage/component/results_schema/alembic/versions/079a74c15e8b_merge_b097e47ba829_with_cdd0dc9d9870.py: -------------------------------------------------------------------------------- 1 | """merge b097e47ba829 with cdd0dc9d9870 2 | 3 | Revision ID: 079a74c15e8b 4 | Revises: b097e47ba829, cdd0dc9d9870 5 | Create Date: 2021-05-30 20:49:19.039280 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = '079a74c15e8b' 14 | down_revision = ('b097e47ba829', 'cdd0dc9d9870') 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade(): 20 | pass 21 | 22 | 23 | def downgrade(): 24 | pass 25 | -------------------------------------------------------------------------------- /src/triage/component/results_schema/alembic/versions/0bca1ba9706e_add_matrix_uuid_to_eval.py: -------------------------------------------------------------------------------- 1 | """add_matrix_uuid_to_eval 2 | 3 | Revision ID: 0bca1ba9706e 4 | Revises: 38f37d013686 5 | Create Date: 2019-02-05 13:19:50.172109 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | from sqlalchemy.dialects import postgresql 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = '0bca1ba9706e' 14 | down_revision = '38f37d013686' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade(): 20 | # ### commands auto generated by Alembic - please adjust! ### 21 | op.add_column('evaluations', sa.Column('matrix_uuid', sa.Text(), nullable=True), schema='test_results') 22 | op.create_foreign_key(None, 'evaluations', 'matrices', ['matrix_uuid'], ['matrix_uuid'], source_schema='test_results', referent_schema='model_metadata') 23 | op.add_column('evaluations', sa.Column('matrix_uuid', sa.Text(), nullable=True), schema='train_results') 24 | op.create_foreign_key(None, 'evaluations', 'matrices', ['matrix_uuid'], ['matrix_uuid'], source_schema='train_results', referent_schema='model_metadata') 25 | # ### end Alembic commands ### 26 | 27 | 28 | def downgrade(): 29 | # ### commands auto generated by Alembic - please adjust! ### 30 | op.drop_constraint(None, 'evaluations', schema='train_results', type_='foreignkey') 31 | op.drop_column('evaluations', 'matrix_uuid', schema='train_results') 32 | op.drop_constraint(None, 'evaluations', schema='test_results', type_='foreignkey') 33 | op.drop_column('evaluations', 'matrix_uuid', schema='test_results') 34 | # ### end Alembic commands ### 35 | -------------------------------------------------------------------------------- /src/triage/component/results_schema/alembic/versions/0d44655e35fd_.py: -------------------------------------------------------------------------------- 1 | """empty message 2 | 3 | Revision ID: 0d44655e35fd 4 | Revises: 8b3f167d0418 5 | Create Date: 2017-08-31 14:14:54.187073 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | 11 | 12 | # revision identifiers, used by Alembic. 
13 | revision = "0d44655e35fd" 14 | down_revision = "8b3f167d0418" 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade(): 20 | # ### commands auto generated by Alembic - please adjust! ### 21 | op.create_table( 22 | "individual_importances", 23 | sa.Column("model_id", sa.Integer(), nullable=False), 24 | sa.Column("entity_id", sa.BigInteger(), nullable=False), 25 | sa.Column("as_of_date", sa.DateTime(), nullable=False), 26 | sa.Column("feature", sa.String(), nullable=False), 27 | sa.Column("method", sa.String(), nullable=False), 28 | sa.Column("importance_score", sa.Text(), nullable=True), 29 | sa.ForeignKeyConstraint(["model_id"], ["results.models.model_id"]), 30 | sa.PrimaryKeyConstraint( 31 | "model_id", "entity_id", "as_of_date", "feature", "method" 32 | ), 33 | schema="results", 34 | ) 35 | op.create_table( 36 | "list_predictions", 37 | sa.Column("model_id", sa.Integer(), nullable=False), 38 | sa.Column("entity_id", sa.BigInteger(), nullable=False), 39 | sa.Column("as_of_date", sa.DateTime(), nullable=False), 40 | sa.Column("score", sa.Numeric(), nullable=True), 41 | sa.Column("rank_abs", sa.Integer(), nullable=True), 42 | sa.Column("rank_pct", sa.Float(), nullable=True), 43 | sa.Column("matrix_uuid", sa.Text(), nullable=True), 44 | sa.Column("test_label_window", sa.Interval(), nullable=True), 45 | sa.ForeignKeyConstraint(["model_id"], ["results.models.model_id"]), 46 | sa.PrimaryKeyConstraint("model_id", "entity_id", "as_of_date"), 47 | schema="results", 48 | ) 49 | # ### end Alembic commands ### 50 | 51 | 52 | def downgrade(): 53 | # ### commands auto generated by Alembic - please adjust! ### 54 | op.drop_table("list_predictions", schema="results") 55 | op.drop_table("individual_importances", schema="results") 56 | # ### end Alembic commands ### 57 | -------------------------------------------------------------------------------- /src/triage/component/results_schema/alembic/versions/1b990cbc04e4_production_schema.py: -------------------------------------------------------------------------------- 1 | """empty message 2 | 3 | Revision ID: 1b990cbc04e4 4 | Revises: 0bca1ba9706e 5 | Create Date: 2019-02-20 16:41:22.810452 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = '1b990cbc04e4' 14 | down_revision = '45219f25072b' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade(): 20 | op.execute("CREATE SCHEMA IF NOT EXISTS production") 21 | op.execute("ALTER TABLE triage_metadata.list_predictions SET SCHEMA production;") 22 | 23 | 24 | def downgrade(): 25 | op.execute("ALTER TABLE production.list_predictions SET SCHEMA triage_metadata;") 26 | op.execute("DROP SCHEMA IF EXISTS production") 27 | -------------------------------------------------------------------------------- /src/triage/component/results_schema/alembic/versions/2446a931de7a_changing_column_names_and_removing_.py: -------------------------------------------------------------------------------- 1 | """Changing column names and removing redundancies in table names 2 | 3 | Revision ID: 2446a931de7a 4 | Revises: 89a8ce240bae 5 | Create Date: 2018-05-24 17:07:20.567789 6 | 7 | """ 8 | from alembic import op 9 | 10 | # revision identifiers, used by Alembic. 11 | revision = "2446a931de7a" 12 | down_revision = "89a8ce240bae" 13 | branch_labels = None 14 | depends_on = None 15 | 16 | 17 | def upgrade(): 18 | # ### commands auto generated by Alembic - please adjust! 
###
19 |     op.execute(
20 |         "ALTER TABLE test_results.test_evaluations RENAME TO evaluations;"
21 |         + "ALTER TABLE test_results.test_predictions RENAME TO predictions;"
22 |         + "ALTER TABLE train_results.train_evaluations RENAME TO evaluations;"
23 |         + "ALTER TABLE train_results.train_predictions RENAME TO predictions;"
24 |     )
25 | 
26 |     op.alter_column(
27 |         "matrices",
28 |         "n_examples",
29 |         new_column_name="num_observations",
30 |         schema="model_metadata",
31 |     )
32 |     op.alter_column(
33 |         "model_groups",
34 |         "model_parameters",
35 |         new_column_name="hyperparameters",
36 |         schema="model_metadata",
37 |     )
38 |     op.alter_column(
39 |         "models",
40 |         "model_parameters",
41 |         new_column_name="hyperparameters",
42 |         schema="model_metadata",
43 |     )
44 |     # ### end Alembic commands ###
45 | 
46 | 
47 | def downgrade():
48 |     # ### commands auto generated by Alembic - please adjust! ###
49 |     op.execute(
50 |         "ALTER TABLE test_results.evaluations RENAME TO test_evaluations;"
51 |         + "ALTER TABLE test_results.predictions RENAME TO test_predictions;"
52 |         + "ALTER TABLE train_results.evaluations RENAME TO train_evaluations;"
53 |         + "ALTER TABLE train_results.predictions RENAME TO train_predictions;"
54 |     )
55 | 
56 |     op.alter_column(
57 |         "matrices",
58 |         "num_observations",
59 |         new_column_name="n_examples",
60 |         schema="model_metadata",
61 |     )
62 |     op.alter_column(
63 |         "model_groups",
64 |         "hyperparameters",
65 |         new_column_name="model_parameters",
66 |         schema="model_metadata",
67 |     )
68 |     op.alter_column(
69 |         "models",
70 |         "hyperparameters",
71 |         new_column_name="model_parameters",
72 |         schema="model_metadata",
73 |     )
74 | 
75 |     # ### end Alembic commands ###
76 | 
--------------------------------------------------------------------------------
/src/triage/component/results_schema/alembic/versions/264245ddfce2_.py:
--------------------------------------------------------------------------------
1 | """empty message
2 | 
3 | Revision ID: 264245ddfce2
4 | Revises: 0d44655e35fd
5 | Create Date: 2017-09-01 14:26:01.107455
6 | 
7 | """
8 | from alembic import op
9 | import sqlalchemy as sa
10 | 
11 | 
12 | # revision identifiers, used by Alembic.
13 | revision = "264245ddfce2"
14 | down_revision = "0d44655e35fd"
15 | branch_labels = None
16 | depends_on = None
17 | 
18 | 
19 | def upgrade():
20 |     # ### commands auto generated by Alembic - please adjust! ###
21 |     op.add_column(
22 |         "individual_importances",
23 |         sa.Column("feature_value", sa.Float(), nullable=True),
24 |         schema="results",
25 |     )
26 |     # ### end Alembic commands ###
27 | 
28 | 
29 | def downgrade():
30 |     # ### commands auto generated by Alembic - please adjust! ###
31 |     op.drop_column("individual_importances", "feature_value", schema="results")
32 |     # ### end Alembic commands ###
33 | 
--------------------------------------------------------------------------------
/src/triage/component/results_schema/alembic/versions/264786a9fe85_add_label_value_to_prodcution_table.py:
--------------------------------------------------------------------------------
1 | """add label_value to production table
2 | 
3 | Revision ID: 264786a9fe85
4 | Revises: 1b990cbc04e4
5 | Create Date: 2019-02-26 13:17:05.365654
6 | 
7 | """
8 | from alembic import op
9 | import sqlalchemy as sa
10 | 
11 | 
12 | # revision identifiers, used by Alembic.
13 | revision = '264786a9fe85' 14 | down_revision = '1b990cbc04e4' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade(): 20 | op.drop_table("list_predictions", schema="production") 21 | op.create_table( 22 | "list_predictions", 23 | sa.Column("model_id", sa.Integer(), nullable=False), 24 | sa.Column("entity_id", sa.BigInteger(), nullable=False), 25 | sa.Column("as_of_date", sa.DateTime(), nullable=False), 26 | sa.Column("score", sa.Numeric(), nullable=True), 27 | sa.Column('label_value', sa.Integer, nullable=True), 28 | sa.Column("rank_abs", sa.Integer(), nullable=True), 29 | sa.Column("rank_pct", sa.Float(), nullable=True), 30 | sa.Column("matrix_uuid", sa.Text(), nullable=True), 31 | sa.Column("test_label_timespan", sa.Interval(), nullable=True), 32 | sa.ForeignKeyConstraint(["model_id"], ["triage_metadata.models.model_id"]), 33 | sa.PrimaryKeyConstraint("model_id", "entity_id", "as_of_date"), 34 | schema="production", 35 | ) 36 | 37 | 38 | def downgrade(): 39 | op.drop_table("list_predictions", schema="production") 40 | op.create_table( 41 | "list_predictions", 42 | sa.Column("model_id", sa.Integer(), nullable=False), 43 | sa.Column("entity_id", sa.BigInteger(), nullable=False), 44 | sa.Column("as_of_date", sa.DateTime(), nullable=False), 45 | sa.Column("score", sa.Numeric(), nullable=True), 46 | sa.Column("rank_abs", sa.Integer(), nullable=True), 47 | sa.Column("rank_pct", sa.Float(), nullable=True), 48 | sa.Column("matrix_uuid", sa.Text(), nullable=True), 49 | sa.Column("test_label_timespan", sa.Interval(), nullable=True), 50 | sa.ForeignKeyConstraint(["model_id"], ["triage_metadata.models.model_id"]), 51 | sa.PrimaryKeyConstraint("model_id", "entity_id", "as_of_date"), 52 | schema="results", 53 | ) 54 | 55 | -------------------------------------------------------------------------------- /src/triage/component/results_schema/alembic/versions/38f37d013686_associate_experiments_with_models_and_.py: -------------------------------------------------------------------------------- 1 | """Associate experiments with models and matrices 2 | 3 | Revision ID: 38f37d013686 4 | Revises: d0ac573eaf1a 5 | Create Date: 2018-10-18 16:31:43.181779 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | from sqlalchemy.dialects import postgresql 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = '38f37d013686' 14 | down_revision = 'd0ac573eaf1a' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade(): 20 | # ### commands auto generated by Alembic - please adjust! 
### 21 | op.create_table('experiment_matrices', 22 | sa.Column('experiment_hash', sa.String(), nullable=False), 23 | sa.Column('matrix_uuid', sa.String(), nullable=False), 24 | sa.ForeignKeyConstraint(['experiment_hash'], ['model_metadata.experiments.experiment_hash'], ), 25 | sa.PrimaryKeyConstraint('experiment_hash', 'matrix_uuid'), 26 | schema='model_metadata' 27 | ) 28 | op.create_table('experiment_models', 29 | sa.Column('experiment_hash', sa.String(), nullable=False), 30 | sa.Column('model_hash', sa.String(), nullable=False), 31 | sa.ForeignKeyConstraint(['experiment_hash'], ['model_metadata.experiments.experiment_hash'], ), 32 | sa.PrimaryKeyConstraint('experiment_hash', 'model_hash'), 33 | schema='model_metadata' 34 | ) 35 | op.add_column('matrices', sa.Column('built_by_experiment', sa.String(), nullable=True), schema='model_metadata') 36 | op.create_foreign_key(None, 'matrices', 'experiments', ['built_by_experiment'], ['experiment_hash'], source_schema='model_metadata', referent_schema='model_metadata') 37 | 38 | op.alter_column('models', 'experiment_hash', new_column_name='built_by_experiment', schema='model_metadata') 39 | # ### end Alembic commands ### 40 | 41 | 42 | def downgrade(): 43 | # ### commands auto generated by Alembic - please adjust! ### 44 | op.alter_column('models', 'built_by_experiment', new_column_name='experiment_hash', schema='model_metadata') 45 | op.drop_constraint(None, 'matrices', schema='model_metadata', type_='foreignkey') 46 | op.drop_column('matrices', 'built_by_experiment', schema='model_metadata') 47 | op.drop_table('experiment_models', schema='model_metadata') 48 | op.drop_table('experiment_matrices', schema='model_metadata') 49 | # ### end Alembic commands ### 50 | -------------------------------------------------------------------------------- /src/triage/component/results_schema/alembic/versions/3ce027594a5c_add_hashes_to_runs.py: -------------------------------------------------------------------------------- 1 | """add hashes to runs 2 | 3 | Revision ID: 3ce027594a5c 4 | Revises: 5dd2ba8222b1 5 | Create Date: 2022-03-25 12:58:38.370271 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | 11 | 12 | # revision identifiers, used by Alembic. 
13 | revision = '3ce027594a5c' 14 | down_revision = '5dd2ba8222b1' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade(): 20 | op.add_column('triage_runs', sa.Column('cohort_table_name', sa.String(), nullable=True), schema='triage_metadata') 21 | op.add_column('triage_runs', sa.Column('labels_table_name', sa.String(), nullable=True), schema='triage_metadata') 22 | op.add_column('triage_runs', sa.Column('bias_hash', sa.String(), nullable=True), schema='triage_metadata') 23 | 24 | 25 | def downgrade(): 26 | op.drop_column('triage_runs', 'bias_hash', schema='triage_metadata') 27 | op.drop_column('triage_runs', 'labels_table_name', schema='triage_metadata') 28 | op.drop_column('triage_runs', 'cohort_table_name', schema='triage_metadata') 29 | -------------------------------------------------------------------------------- /src/triage/component/results_schema/alembic/versions/45219f25072b_hash_partitioning_predictions_tables.py: -------------------------------------------------------------------------------- 1 | """hash-partitioning predictions tables 2 | 3 | Revision ID: 45219f25072b 4 | Revises: a98acf92fd48 5 | Create Date: 2020-08-21 09:29:04.751933 6 | 7 | """ 8 | from alembic import op 9 | import os 10 | 11 | import verboselogs, logging 12 | logger = verboselogs.VerboseLogger(__name__) 13 | 14 | 15 | # revision identifiers, used by Alembic. 16 | revision = '45219f25072b' 17 | down_revision = 'a98acf92fd48' 18 | branch_labels = None 19 | depends_on = None 20 | 21 | 22 | def get_pg_major_version(op): 23 | conn = op.get_bind() 24 | pg_major_version = conn.execute('show server_version').fetchone()[0].split('.')[0] 25 | logger.debug(f'PostgreSQL major version {pg_major_version}') 26 | return int(pg_major_version) 27 | 28 | 29 | def upgrade(): 30 | 31 | pg_major_version = get_pg_major_version(op) 32 | 33 | if pg_major_version >= 11: 34 | logger.info(f'PostgreSQL 11 or greater found (PostgreSQL {pg_major_version}): Using hash partitioning') 35 | hash_partitioning_filename = os.path.join( 36 | os.path.dirname(__file__), "../../sql/predictions_hash_partitioning.sql" 37 | ) 38 | with open(hash_partitioning_filename) as fd: 39 | stmt = fd.read() 40 | op.execute(stmt) 41 | else: 42 | logger.info(f'No hash partitioning implemented because PostgreSQL 11 or greater not found (using: PostgreSQL {pg_major_version})') 43 | 44 | 45 | def downgrade(): 46 | 47 | pg_major_version = get_pg_major_version(op) 48 | 49 | if pg_major_version >= 11: 50 | logger.info(f'PostgreSQL 11 or greater found (PostgreSQL {pg_major_version}): Removing hash partitioning') 51 | undo_hash_partitioning_filename = os.path.join( 52 | os.path.dirname(__file__), "../../sql/undo_predictions_hash_partitioning.sql" 53 | ) 54 | with open(undo_hash_partitioning_filename) as fd: 55 | stmt = fd.read() 56 | op.execute(stmt) 57 | else: 58 | logger.info(f'No hash partitioning implemented because PostgreSQL 11 or greater not found (using: PostgreSQL {pg_major_version})') 59 | -------------------------------------------------------------------------------- /src/triage/component/results_schema/alembic/versions/4ae804cc0977_.py: -------------------------------------------------------------------------------- 1 | """empty message 2 | 3 | Revision ID: 4ae804cc0977 4 | Revises: 9bbfdcf8bab0 5 | Create Date: 2020-07-19 01:35:54.419099 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | from sqlalchemy.dialects import postgresql 11 | 12 | # revision identifiers, used by Alembic. 
13 | revision = '4ae804cc0977' 14 | down_revision = '9bbfdcf8bab0' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade(): 20 | # ### commands auto generated by Alembic - please adjust! ### 21 | op.add_column('models', sa.Column('built_in_experiment_run', sa.Integer(), nullable=True), schema='triage_metadata') 22 | # ### end Alembic commands ### 23 | 24 | 25 | def downgrade(): 26 | # ### commands auto generated by Alembic - please adjust! ### 27 | op.drop_column('models', 'built_in_experiment_run', schema='triage_metadata') 28 | # ### end Alembic commands ### 29 | -------------------------------------------------------------------------------- /src/triage/component/results_schema/alembic/versions/670289044eb2_add_production_prediction_metadata.py: -------------------------------------------------------------------------------- 1 | """Add production prediction metadata 2 | 3 | Revision ID: 670289044eb2 4 | Revises: ce5b50ffa8e2 5 | Create Date: 2021-01-08 22:27:23.433813 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | from sqlalchemy.dialects import postgresql 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = '670289044eb2' 14 | down_revision = 'ce5b50ffa8e2' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade(): 20 | # ### commands auto generated by Alembic - please adjust! ### 21 | op.create_table('prediction_metadata', 22 | sa.Column('model_id', sa.Integer(), nullable=False), 23 | sa.Column('matrix_uuid', sa.Text(), nullable=False), 24 | sa.Column('tiebreaker_ordering', sa.Text(), nullable=True), 25 | sa.Column('random_seed', sa.Integer(), nullable=True), 26 | sa.Column('predictions_saved', sa.Boolean(), nullable=True), 27 | sa.ForeignKeyConstraint(['matrix_uuid'], ['triage_metadata.matrices.matrix_uuid'], ), 28 | sa.ForeignKeyConstraint(['model_id'], ['triage_metadata.models.model_id'], ), 29 | sa.PrimaryKeyConstraint('model_id', 'matrix_uuid'), 30 | schema='production' 31 | ) 32 | # ### end Alembic commands ### 33 | 34 | 35 | def downgrade(): 36 | # ### commands auto generated by Alembic - please adjust! ### 37 | op.drop_table('prediction_metadata', schema='production') 38 | # ### end Alembic commands ### 39 | -------------------------------------------------------------------------------- /src/triage/component/results_schema/alembic/versions/72ac5cbdca05_change_importance_to_float.py: -------------------------------------------------------------------------------- 1 | """Change importance to float 2 | 3 | Revision ID: 72ac5cbdca05 4 | Revises: 264245ddfce2 5 | Create Date: 2017-09-01 14:31:09.302828 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | 11 | 12 | # revision identifiers, used by Alembic. 
13 | revision = "72ac5cbdca05" 14 | down_revision = "264245ddfce2" 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade(): 20 | op.alter_column( 21 | table_name="individual_importances", 22 | column_name="importance_score", 23 | type_=sa.Float(), 24 | schema="results", 25 | postgresql_using="importance_score::double precision", 26 | ) 27 | 28 | 29 | def downgrade(): 30 | op.alter_column( 31 | table_name="individual_importances", 32 | column_name="importance_score", 33 | type_=sa.Text(), 34 | schema="results", 35 | ) 36 | -------------------------------------------------------------------------------- /src/triage/component/results_schema/alembic/versions/7d57d1cf3429_.py: -------------------------------------------------------------------------------- 1 | """empty message 2 | 3 | Revision ID: 7d57d1cf3429 4 | Revises: 72ac5cbdca05 5 | Create Date: 2017-11-06 11:34:23.046005 6 | 7 | """ 8 | from alembic import op 9 | 10 | # revision identifiers, used by Alembic. 11 | revision = "7d57d1cf3429" 12 | down_revision = "72ac5cbdca05" 13 | branch_labels = None 14 | depends_on = None 15 | 16 | 17 | def upgrade(): 18 | op.alter_column( 19 | "evaluations", "example_frequency", new_column_name="as_of_date_frequency" 20 | ) 21 | op.alter_column( 22 | "models", "train_label_window", new_column_name="training_label_timespan" 23 | ) 24 | op.alter_column( 25 | "predictions", "test_label_window", new_column_name="test_label_timespan" 26 | ) 27 | op.alter_column( 28 | "list_predictions", "test_label_window", new_column_name="test_label_timespan" 29 | ) 30 | 31 | 32 | def downgrade(): 33 | op.alter_column( 34 | "evaluations", "as_of_date_frequency", new_column_name="example_frequency" 35 | ) 36 | op.alter_column( 37 | "models", "training_label_timespan", new_column_name="train_label_window" 38 | ) 39 | op.alter_column( 40 | "predictions", "test_label_timespan", new_column_name="test_label_window" 41 | ) 42 | op.alter_column( 43 | "list_predictions", "test_label_timespan", new_column_name="test_label_window" 44 | ) 45 | -------------------------------------------------------------------------------- /src/triage/component/results_schema/alembic/versions/8cef808549dd_.py: -------------------------------------------------------------------------------- 1 | """empty message 2 | 3 | Revision ID: 8cef808549dd 4 | Revises: b4d7569d31cb 5 | Create Date: 2020-06-02 21:26:32.528991 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | from sqlalchemy.dialects import postgresql 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = '8cef808549dd' 14 | down_revision = 'b4d7569d31cb' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade(): 20 | # ### commands auto generated by Alembic - please adjust! ### 21 | op.add_column('experiment_runs', sa.Column('python_version', sa.String(), nullable=True), schema='model_metadata') 22 | op.create_index(op.f('ix_model_metadata_models_model_hash'), 'models', ['model_hash'], unique=True, schema='model_metadata') 23 | op.drop_index('ix_results_models_model_hash', table_name='models', schema='model_metadata') 24 | # ### end Alembic commands ### 25 | 26 | 27 | def downgrade(): 28 | # ### commands auto generated by Alembic - please adjust! 
### 29 | op.create_index('ix_results_models_model_hash', 'models', ['model_hash'], unique=True, schema='model_metadata') 30 | op.drop_index(op.f('ix_model_metadata_models_model_hash'), table_name='models', schema='model_metadata') 31 | op.drop_column('experiment_runs', 'python_version', schema='model_metadata') 32 | # ### end Alembic commands ### 33 | -------------------------------------------------------------------------------- /src/triage/component/results_schema/alembic/versions/9bbfdcf8bab0_.py: -------------------------------------------------------------------------------- 1 | """empty message 2 | 3 | Revision ID: 9bbfdcf8bab0 4 | Revises: fa1760d35710 5 | Create Date: 2020-07-19 01:04:23.442598 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | from sqlalchemy.dialects import postgresql 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = '9bbfdcf8bab0' 14 | down_revision = 'fa1760d35710' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade(): 20 | # ### commands auto generated by Alembic - please adjust! ### 21 | op.add_column('experiment_runs', sa.Column('random_seed', sa.Integer(), nullable=True), schema='triage_metadata') 22 | # ### end Alembic commands ### 23 | 24 | 25 | def downgrade(): 26 | # ### commands auto generated by Alembic - please adjust! ### 27 | op.drop_column('experiment_runs', 'random_seed', schema='triage_metadata') 28 | # ### end Alembic commands ### 29 | -------------------------------------------------------------------------------- /src/triage/component/results_schema/alembic/versions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/src/triage/component/results_schema/alembic/versions/__init__.py -------------------------------------------------------------------------------- /src/triage/component/results_schema/alembic/versions/a20104116533_.py: -------------------------------------------------------------------------------- 1 | """empty message 2 | 3 | Revision ID: a20104116533 4 | Revises: 8cef808549dd 5 | Create Date: 2020-06-11 16:32:41.319128 6 | 7 | """ 8 | import os 9 | from alembic import op 10 | import sqlalchemy as sa 11 | from sqlalchemy.dialects import postgresql 12 | 13 | # revision identifiers, used by Alembic. 14 | revision = 'a20104116533' 15 | down_revision = '8cef808549dd' 16 | branch_labels = None 17 | depends_on = None 18 | 19 | 20 | def upgrade(): 21 | # ### commands auto generated by Alembic - please adjust! 
###
22 |     op.execute("CREATE SCHEMA IF NOT EXISTS triage_metadata")
23 |     op.execute(
24 |         "ALTER TABLE model_metadata.experiment_matrices SET SCHEMA triage_metadata;"
25 |         + " ALTER TABLE model_metadata.experiment_models SET SCHEMA triage_metadata;"
26 |         + " ALTER TABLE model_metadata.experiment_runs SET SCHEMA triage_metadata;"
27 |         + " ALTER TABLE model_metadata.experiments SET SCHEMA triage_metadata;"
28 |         + " ALTER TABLE model_metadata.list_predictions SET SCHEMA triage_metadata;"
29 |         + " ALTER TABLE model_metadata.matrices SET SCHEMA triage_metadata;"
30 |         + " ALTER TABLE model_metadata.model_groups SET SCHEMA triage_metadata;"
31 |         + " ALTER TABLE model_metadata.models SET SCHEMA triage_metadata;"
32 |         + " ALTER TABLE model_metadata.subsets SET SCHEMA triage_metadata;"
33 |     )
34 | 
35 |     op.execute("DROP SCHEMA IF EXISTS model_metadata")
36 | 
37 |     ## We update (replace) the function
38 |     group_proc_filename = os.path.join(
39 |         os.path.dirname(__file__), "../../sql/model_group_stored_procedure.sql"
40 |     )
41 |     with open(group_proc_filename) as fd:
42 |         stmt = fd.read()
43 |     op.execute(stmt)
44 | 
45 |     # ### end Alembic commands ###
46 | 
47 | 
48 | def downgrade():
49 |     # ### commands auto generated by Alembic - please adjust! ###
50 |     op.execute("CREATE SCHEMA IF NOT EXISTS model_metadata")
51 | 
52 |     op.execute(
53 |         "ALTER TABLE triage_metadata.experiment_matrices SET SCHEMA model_metadata;"
54 |         + " ALTER TABLE triage_metadata.experiment_models SET SCHEMA model_metadata;"
55 |         + " ALTER TABLE triage_metadata.experiment_runs SET SCHEMA model_metadata;"
56 |         + " ALTER TABLE triage_metadata.experiments SET SCHEMA model_metadata;"
57 |         + " ALTER TABLE triage_metadata.matrices SET SCHEMA model_metadata;"
58 |         + " ALTER TABLE triage_metadata.model_groups SET SCHEMA model_metadata;"
59 |         + " ALTER TABLE triage_metadata.models SET SCHEMA model_metadata;"
60 |         + " ALTER TABLE triage_metadata.subsets SET SCHEMA model_metadata;"
61 |     )
62 | 
63 |     # ### end Alembic commands ###
64 | 
--------------------------------------------------------------------------------
/src/triage/component/results_schema/alembic/versions/a98acf92fd48_add_nuke_triage_function.py:
--------------------------------------------------------------------------------
1 | """add nuke triage function
2 | 
3 | Revision ID: a98acf92fd48
4 | Revises: 4ae804cc0977
5 | Create Date: 2020-07-19 01:46:02.751987
6 | 
7 | """
8 | from alembic import op
9 | import os
10 | 
11 | # revision identifiers, used by Alembic.
12 | revision = 'a98acf92fd48'
13 | down_revision = '4ae804cc0977'
14 | branch_labels = None
15 | depends_on = None
16 | 
17 | 
18 | def upgrade():
19 |     nuke_triage_filename = os.path.join(
20 |         os.path.dirname(__file__), "../../sql/nuke_triage.sql"
21 |     )
22 |     with open(nuke_triage_filename) as fd:
23 |         stmt = fd.read()
24 |     op.execute(stmt)
25 | 
26 | 
27 | 
28 | def downgrade():
29 |     pass
30 | 
--------------------------------------------------------------------------------
/src/triage/component/results_schema/alembic/versions/cdd0dc9d9870_rename_production_schema_and_prediction_table.py:
--------------------------------------------------------------------------------
1 | """rename production schema and list_predictions to triage_production and predictions
2 | 
3 | Revision ID: cdd0dc9d9870
4 | Revises: 670289044eb2
5 | Create Date: 2021-04-13 00:53:56.098572
6 | 
7 | """
8 | from alembic import op
9 | import sqlalchemy as sa
10 | from sqlalchemy.dialects import postgresql
11 | 
12 | # revision identifiers, used by Alembic.
13 | revision = 'cdd0dc9d9870' 14 | down_revision = '670289044eb2' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade(): 20 | op.execute("CREATE SCHEMA IF NOT EXISTS triage_production") 21 | op.execute("ALTER TABLE production.list_predictions SET SCHEMA triage_production;") 22 | op.execute("ALTER TABLE production.prediction_metadata SET SCHEMA triage_production") 23 | op.execute("ALTER TABLE triage_production.list_predictions RENAME TO predictions") 24 | 25 | 26 | def downgrade(): 27 | op.execute("ALTER TABLE triage_production.predictions SET SCHEMA production;") 28 | op.execute("ALTER TABLE triage_production.prediction_metadata SET SCHEMA production") 29 | op.execute("ALTER TABLE production.predictions RENAME TO list_predictions") 30 | op.execute("DROP SCHEMA IF EXISTS triage_production") 31 | -------------------------------------------------------------------------------- /src/triage/component/results_schema/alembic/versions/ce5b50ffa8e2_break_ties_in_list_predictions.py: -------------------------------------------------------------------------------- 1 | """Break ties in list predictions 2 | 3 | Revision ID: ce5b50ffa8e2 4 | Revises: 264786a9fe85 5 | Create Date: 2021-01-08 21:59:13.403934 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | from sqlalchemy.dialects import postgresql 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = 'ce5b50ffa8e2' 14 | down_revision = '264786a9fe85' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade(): 20 | # ### commands auto generated by Alembic - please adjust! ### 21 | op.add_column('list_predictions', sa.Column('rank_abs_with_ties', sa.Integer(), nullable=True), schema='production') 22 | op.add_column('list_predictions', sa.Column('rank_pct_with_ties', sa.Float(), nullable=True), schema='production') 23 | op.alter_column('list_predictions', 'rank_abs', new_column_name='rank_abs_no_ties', schema='production') 24 | op.alter_column('list_predictions', 'rank_pct', new_column_name='rank_pct_no_ties', schema='production') 25 | # ### end Alembic commands ### 26 | 27 | 28 | def downgrade(): 29 | # ### commands auto generated by Alembic - please adjust! ### 30 | op.alter_column('list_predictions', 'rank_abs_no_ties', new_column_name='rank_abs', schema='production') 31 | op.alter_column('list_predictions', 'rank_pct_no_ties', new_column_name='rank_pct', schema='production') 32 | op.drop_column('list_predictions', 'rank_pct_with_ties', schema='production') 33 | op.drop_column('list_predictions', 'rank_abs_with_ties', schema='production') 34 | # ### end Alembic commands ### 35 | -------------------------------------------------------------------------------- /src/triage/component/results_schema/alembic/versions/d0ac573eaf1a_model_group_stored_procedure.py: -------------------------------------------------------------------------------- 1 | """model_group_stored_procedure 2 | 3 | Revision ID: d0ac573eaf1a 4 | Revises: 2446a931de7a 5 | Create Date: 2018-06-20 17:44:27.162699 6 | 7 | """ 8 | from alembic import op 9 | import os 10 | 11 | 12 | # revision identifiers, used by Alembic. 
13 | revision = "d0ac573eaf1a" 14 | down_revision = "2446a931de7a" 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade(): 20 | group_proc_filename = os.path.join( 21 | os.path.dirname(__file__), "../../sql/model_group_stored_procedure.sql" 22 | ) 23 | with open(group_proc_filename) as fd: 24 | stmt = fd.read() 25 | op.execute(stmt) 26 | 27 | 28 | def downgrade(): 29 | pass 30 | -------------------------------------------------------------------------------- /src/triage/component/results_schema/alembic/versions/fa1760d35710_.py: -------------------------------------------------------------------------------- 1 | """empty message 2 | 3 | Revision ID: fa1760d35710 4 | Revises: a20104116533 5 | Create Date: 2020-07-16 18:07:58.229213 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | from sqlalchemy.dialects import postgresql 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = 'fa1760d35710' 14 | down_revision = 'a20104116533' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade(): 20 | op.add_column('experiments', sa.Column('random_seed', sa.Integer(), nullable=True), schema='triage_metadata') 21 | # ### end Alembic commands ### 22 | 23 | 24 | def downgrade(): 25 | op.drop_column('experiments', 'random_seed', schema='triage_metadata') 26 | # ### end Alembic commands ### 27 | -------------------------------------------------------------------------------- /src/triage/component/results_schema/example_db_config.yaml: -------------------------------------------------------------------------------- 1 | host: localhost 2 | user: results_schema 3 | pass: results_schema 4 | port: 5432 5 | db: results_schema 6 | -------------------------------------------------------------------------------- /src/triage/component/results_schema/sql/model_group_stored_procedure.sql: -------------------------------------------------------------------------------- 1 | /* 2 | Function for using the model group table. 
This function requires a table like
3 | -----------
4 | CREATE TABLE triage_metadata.model_groups
5 | (
6 |   model_group_id SERIAL PRIMARY KEY,
7 |   model_type TEXT,
8 |   hyperparameters JSONB,
9 |   feature_list TEXT [],
10 |   model_config JSONB
11 | );
12 | -----------
13 | populates the table and returns the IDs
14 | */
15 | CREATE OR REPLACE FUNCTION public.get_model_group_id(in_model_type TEXT,
16 |                                                      in_hyperparameters JSONB,
17 |                                                      in_feature_list TEXT [],
18 |                                                      in_model_config JSONB)
19 |   RETURNS INTEGER AS
20 | $BODY$
21 | DECLARE
22 |   model_group_return_id INTEGER;
23 | BEGIN
24 |   --Obtain an advisory lock on the table to avoid double execution
25 |   PERFORM pg_advisory_lock(60637);
26 | 
27 |   -- Check if the model_group_id exists, if not insert the model parameters and return the new value
28 |   SELECT *
29 |   INTO model_group_return_id
30 |   FROM triage_metadata.model_groups
31 |   WHERE
32 |     model_type = in_model_type
33 |     AND hyperparameters = in_hyperparameters
34 |     AND feature_list = ARRAY(Select unnest(in_feature_list) ORDER BY 1)
35 |     AND model_config = in_model_config ;
36 |   IF NOT FOUND
37 |   THEN
38 |     INSERT INTO triage_metadata.model_groups (model_group_id, model_type, hyperparameters, feature_list, model_config)
39 |     VALUES (DEFAULT, in_model_type, in_hyperparameters, ARRAY(Select unnest(in_feature_list) ORDER BY 1), in_model_config)
40 |     RETURNING model_group_id
41 |       INTO model_group_return_id;
42 |   END IF;
43 | 
44 |   -- Release the lock again
45 |   PERFORM pg_advisory_unlock(60637);
46 | 
47 | 
48 |   RETURN model_group_return_id;
49 | END;
50 | 
51 | $BODY$
52 | LANGUAGE plpgsql VOLATILE
53 | COST 100;
54 | 
55 | 
56 | 
57 | comment on function get_model_group_id (text, jsonb, text [], jsonb) is 'Function for using the model group table. This function requires a table like
58 | -----------
59 | CREATE TABLE triage_metadata.model_groups
60 | (
61 |   model_group_id SERIAL PRIMARY KEY,
62 |   model_type TEXT,
63 |   hyperparameters JSONB,
64 |   feature_list TEXT [],
65 |   model_config JSONB
66 | );
67 | -----------
68 | populates the table and returns the IDs';
69 | 
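A sketch of invoking the stored procedure from Python via SQLAlchemy (the model type, hyperparameters, and feature names below are made up; the `triage_metadata.model_groups` table must already exist):

    >>> from sqlalchemy import create_engine, text
    >>> engine = create_engine("postgresql://user:pass@host:5432/db")  # hypothetical DSN
    >>> with engine.connect() as conn:
    ...     model_group_id = conn.execute(text(
    ...         """SELECT get_model_group_id(
    ...                'sklearn.tree.DecisionTreeClassifier',
    ...                '{"max_depth": 5}'::jsonb,
    ...                ARRAY['feature_a', 'feature_b'],
    ...                '{}'::jsonb)"""
    ...     )).scalar()

Because the function takes a `pg_advisory_lock` before the select-or-insert, concurrent callers asking for the same configuration get the same id back instead of creating duplicate rows.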
--------------------------------------------------------------------------------
/src/triage/component/results_schema/sql/nuke_triage.sql:
--------------------------------------------------------------------------------
1 | /*
2 | Function for wiping out all triage tables, schemas, functions and indexes.
3 | It also deletes results_schema_versions (from alembic)
4 | Useful when you try to start clean (again) and want to recover from some
5 | previous errors
6 | */
7 | create or replace function nuke_triage()
8 | returns text as $result$
9 | 
10 | declare
11 |   result text;
12 |   query text;
13 | 
14 | begin
15 | 
16 |   execute 'drop schema if exists triage_metadata cascade';
17 |   raise notice 'triage_metadata deleted';
18 |   execute 'drop schema if exists features cascade';
19 |   raise notice 'features deleted';
20 |   execute 'drop schema if exists train_results cascade';
21 |   raise notice 'train_results deleted';
22 |   execute 'drop schema if exists test_results cascade';
23 |   raise notice 'test_results deleted';
24 | 
25 |   execute 'drop table if exists results_schema_versions';
26 |   raise notice 'results_schema_versions deleted';
27 | 
28 |   execute 'drop function if exists get_model_group_id';
29 |   raise notice 'get_model_group_id deleted';
30 | 
31 |   execute 'drop type if exists experimentrunstatus';
32 |   raise notice 'experimentrunstatus type deleted';
33 | 
34 | 
35 |   select into query
36 |     string_agg(
37 |       format('drop table %I cascade;', tablename), E'\n'
38 |     )
39 |   from pg_tables
40 |   where tablename ~ 'cohort_|labels_|ranks_';
41 | 
42 |   if query is not null then
43 |     raise notice '%', query;
44 |     execute query;
45 |   else
46 |     raise notice 'no labels or states tables from triage found';
47 |   end if;
48 | 
49 |   return 'triage was sent to oblivion. Long live triage!';
50 | end;
51 | $result$ language plpgsql;
52 | 
53 | comment on function nuke_triage () is 'Function for wiping out all triage tables, schemas, functions and indexes.
54 | It also deletes results_schema_versions (from alembic)
55 | Useful when you try to start clean (again) and want to recover from some
56 | previous errors';
57 | 
--------------------------------------------------------------------------------
/src/triage/component/timechop/__init__.py:
--------------------------------------------------------------------------------
1 | from .timechop import Timechop
2 | 
3 | __all__ = ("Timechop",)
4 | 
--------------------------------------------------------------------------------
/src/triage/component/timechop/utils.py:
--------------------------------------------------------------------------------
1 | from six import string_types
2 | 
3 | 
4 | def convert_to_list(x):
5 |     """
6 |     Given an object, if it is not a list, convert it to a list.
--------------------------------------------------------------------------------
/src/triage/component/timechop/__init__.py:
--------------------------------------------------------------------------------
from .timechop import Timechop

__all__ = ("Timechop",)
--------------------------------------------------------------------------------
/src/triage/component/timechop/utils.py:
--------------------------------------------------------------------------------
from six import string_types


def convert_to_list(x):
    """
    Given an object, convert it to a list if it is not one already.

    Strings are treated as scalars (wrapped in a single-element list)
    rather than being iterated character by character.

    Arguments:
        x (object): an object to be converted to a list

    Returns:
        list: x as a list
    """
    if isinstance(x, string_types):
        return [x]

    try:
        iter(x)
    except TypeError:
        return [x]
    else:
        return list(x)
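A short behavioral sketch (added here for illustration, not in the source) of the cases the function distinguishes:

# Assumed, illustrative behavior of convert_to_list:
from triage.component.timechop.utils import convert_to_list

assert convert_to_list("1y") == ["1y"]                        # string: wrapped, not iterated
assert convert_to_list(["1y", "6month"]) == ["1y", "6month"]  # list: passed through
assert convert_to_list(("1y", "6month")) == ["1y", "6month"]  # other iterable: converted
assert convert_to_list(42) == [42]                            # non-iterable: wrapped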
| """JSON serializer for objects not serializable by default json code""" 18 | 19 | if isinstance(obj, date): 20 | return str(obj.isoformat()) 21 | 22 | if isinstance(obj, (DateRange, DateTimeRange)): 23 | return f"[{obj.lower}, {obj.upper}]" 24 | 25 | return obj 26 | 27 | 28 | def json_dumps(d): 29 | return json.dumps(d, default=serialize_to_database) 30 | 31 | 32 | 33 | class SerializableDbEngine(wrapt.ObjectProxy): 34 | """A sqlalchemy engine that can be serialized across process boundaries. 35 | 36 | Works by saving all kwargs used to create the engine and reconstructs them later. As a result, the state won't be saved upon serialization/deserialization. 37 | """ 38 | 39 | __slots__ = ("url", "creator", "kwargs") 40 | 41 | def __init__(self, url, *, creator=sqlalchemy.create_engine, **kwargs): 42 | self.url = make_url(url) 43 | self.creator = creator 44 | self.kwargs = kwargs 45 | 46 | engine = creator(url, **kwargs) 47 | super().__init__(engine) 48 | 49 | def __reduce__(self): 50 | return (self.__reconstruct__, (self.url, self.creator, self.kwargs)) 51 | 52 | def __reduce_ex__(self, protocol): 53 | # wrapt requires reduce_ex to be implemented 54 | return self.__reduce__() 55 | 56 | @classmethod 57 | def __reconstruct__(cls, url, creator, kwargs): 58 | return cls(url, creator=creator, **kwargs) 59 | 60 | 61 | create_engine = functools.partial(SerializableDbEngine, json_serializer=json_dumps) 62 | 63 | @contextmanager 64 | def scoped_session(db_engine): 65 | """Provide a transactional scope around a series of operations.""" 66 | session = Session(bind=db_engine) 67 | try: 68 | yield session 69 | session.commit() 70 | except: 71 | session.rollback() 72 | raise 73 | finally: 74 | session.close() 75 | 76 | 77 | @contextmanager 78 | def get_for_update(db_engine, orm_class, primary_key): 79 | """ Gets object from the database to updated it """ 80 | with scoped_session(db_engine) as session: 81 | obj = session.query(orm_class).get(primary_key) 82 | yield obj 83 | session.merge(obj) 84 | -------------------------------------------------------------------------------- /src/triage/util/defaults.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/src/triage/util/defaults.py -------------------------------------------------------------------------------- /src/triage/util/introspection.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | 3 | 4 | def classpath(klass): 5 | """Return the full class path 6 | 7 | Args: 8 | klass (class): A class 9 | """ 10 | return f"{klass.__module__}.{klass.__name__}" 11 | 12 | 13 | def bind_kwargs(kallable, **kwargs): 14 | """Bind keyword arguments to a callable and return as a dictionary 15 | 16 | Args: 17 | callable (callable): any callable 18 | **kwargs: keyword arguments to bind 19 | 20 | Returns: (dict) 21 | """ 22 | call_signature = inspect.signature(kallable).bind_partial(**kwargs).arguments 23 | if 'kwargs' in call_signature: 24 | passed_kwargs = call_signature['kwargs'] 25 | else: 26 | passed_kwargs = call_signature 27 | return passed_kwargs 28 | -------------------------------------------------------------------------------- /src/triage/util/pandas.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | import pandas as pd 3 | import numpy as np 4 | 5 | import verboselogs, logging 6 | logger = 
--------------------------------------------------------------------------------
/src/triage/util/defaults.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dssg/triage/2c85f5619b4be72c0c0d6a6de01b0253f474a0e9/src/triage/util/defaults.py
--------------------------------------------------------------------------------
/src/triage/util/introspection.py:
--------------------------------------------------------------------------------
import inspect


def classpath(klass):
    """Return the full class path

    Args:
        klass (class): A class
    """
    return f"{klass.__module__}.{klass.__name__}"


def bind_kwargs(kallable, **kwargs):
    """Bind keyword arguments to a callable and return them as a dictionary

    Args:
        kallable (callable): any callable
        **kwargs: keyword arguments to bind

    Returns: (dict)
    """
    call_signature = inspect.signature(kallable).bind_partial(**kwargs).arguments
    if 'kwargs' in call_signature:
        passed_kwargs = call_signature['kwargs']
    else:
        passed_kwargs = call_signature
    return passed_kwargs
--------------------------------------------------------------------------------
/src/triage/util/pandas.py:
--------------------------------------------------------------------------------
from functools import partial
import pandas as pd
import numpy as np

import verboselogs, logging
logger = verboselogs.VerboseLogger(__name__)


def downcast_matrix(df):
    """Downcast the numeric values of a matrix.

    This makes the matrix use less memory by turning every number into a
    float32. Converting int64 columns to int32 costs more time than converting
    the whole matrix to float32, which is still less memory intensive than the
    original matrix.

    Operates on the dataframe as passed, without doing anything to the index.
    Callers may pass an index-less dataframe if they wish to re-add the index
    afterwards and save memory on the index storage.
    """
    logger.spam("Downcasting matrix.")
    logger.spam(f"Starting memory usage: {df.memory_usage(deep=True).sum()/1000000} MB")
    logger.spam(f"Initial types: \n {df.dtypes}")

    df = df.apply(lambda x: x.astype('float32'))

    logger.spam("Downcasting matrix completed.")
    logger.spam(f"Final memory usage: {df.memory_usage(deep=True).sum()/1000000} MB")
    logger.spam(f"Final data types: \n {df.dtypes}")

    return df
--------------------------------------------------------------------------------
/src/triage/util/random.py:
--------------------------------------------------------------------------------
import random


FLOAT_TO_INT_MULTIPLIER = 2000000000


def generate_python_random_seed():
    """Generate a random integer suitable for seeding the Python random generator"""
    return int(random.uniform(0, 1.0) * FLOAT_TO_INT_MULTIPLIER)
--------------------------------------------------------------------------------
/src/triage/util/structs.py:
--------------------------------------------------------------------------------
"""Classes representing simple but deep data structures that we reuse throughout
Triage code and want to display more intelligently in log files
"""


class TruncatedRepresentationList(list):
    def __repr__(self):
        total = len(self)
        if total != 1:
            return f"[{self[0]} ... {self[-1]} (Total: {total})]"
        else:
            return f"[{self[0]}] (Total: {total})"


class AsOfTimeList(TruncatedRepresentationList):
    pass


class FeatureNameList(TruncatedRepresentationList):
    pass
--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
[tox]
envlist = py3

[testenv:py3]
setenv =
    AWS_ACCESS_KEY_ID=fake
    AWS_SECRET_ACCESS_KEY=fake
    BOTO_CONFIG=/dev/null
deps = -r{toxinidir}/requirement/test.txt
commands = py.test --basetemp={envtmpdir} {posargs:-vvv --cov=triage}
extras = rq
--------------------------------------------------------------------------------
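To close, two illustrative sketches (added for this document, not present in the repository). First, the effect of downcast_matrix on a small frame:

# Assumed behavior of downcast_matrix on a toy dataframe.
import pandas as pd

from triage.util.pandas import downcast_matrix

df = pd.DataFrame({"feature_a": [1, 2, 3], "feature_b": [0.5, 1.5, 2.5]})
print(df.dtypes.tolist())                   # [dtype('int64'), dtype('float64')]
print(downcast_matrix(df).dtypes.tolist())  # [dtype('float32'), dtype('float32')]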
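And second, the truncated repr that keeps long as-of-time lists readable in log files:

# Assumed behavior of the truncated list representations.
from triage.util.structs import AsOfTimeList

times = AsOfTimeList(f"2016-{month:02d}-01" for month in range(1, 13))
print(repr(times))  # [2016-01-01 ... 2016-12-01 (Total: 12)]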