├── .gitignore ├── LICENSE ├── Lectures ├── Lecture1-ClassOverview.pptx ├── Lecture10-ModelSelection2.pptx ├── Lecture11-Module1Review.pptx ├── Lecture12-EthicsOverview.pptx ├── Lecture13-InterpretabilityOverview.pptx ├── Lecture14-UnderstandingModels.pptx ├── Lecture19-FairnessOverview.pptx ├── Lecture2-Scoping.pptx ├── Lecture20-FieldValidation.pptx ├── Lecture21-Module3Review.pptx ├── Lecture22-Wrapup.pptx ├── Lecture3-Data.pptx ├── Lecture4-Formulation.pptx ├── Lecture5-ModelSelection.pptx ├── Lecture6-EvaluationMetrics.pptx ├── Lecture7-Features.pptx ├── Lecture8-Machine-Learning-Pipelines.pptx ├── Lecture9-PracticalModeling-Hyperparameters.pptx ├── common ml pitfalls.pptx └── review-module-2.pptx ├── README.md ├── Readings └── PDF │ ├── AmarasingheExplainable.pdf │ ├── BodriaExplainable.pdf │ ├── CaruanaGAM.pdf │ ├── CelisFairConstraint.pdf │ ├── CruzHyperparameter.pdf │ ├── DworkDecoupled.pdf │ ├── FairLearn.pdf │ ├── HardtEqualityOpportunity.pdf │ ├── HuqRacialEquity.pdf │ ├── KamiranPreprocessing.pdf │ ├── KehrenbergBalancingLabels.pdf │ ├── KohInfluenceFunctions.pdf │ ├── LakkarajuDecisionSets.pdf │ ├── LiuTransductiveTopK.pdf │ ├── LundbergHyboxaemia.pdf │ ├── LundbergSHAP.pdf │ ├── MothilalCounterfactual.pdf │ ├── ObermeyerBias.pdf │ ├── PassiFormulation.pdf │ ├── PearlCausality.pdf │ ├── PlumbMAPLE.pdf │ ├── Princeton-AI-Ethics-Case-Study-6.pdf │ ├── RawlsJustice.pdf │ ├── RibeiroLIME.pdf │ ├── RileyPitfalls.pdf │ ├── RobertsCV.pdf │ ├── SalimiDatabaseRepair.pdf │ ├── TanPrototypes.pdf │ ├── UstunRudinINFORMS.pdf │ ├── VermaFairnessDefn.pdf │ ├── ZafarConstraints.pdf │ └── cvsurvey.pdf ├── _config.yml ├── abhishek-parikh_400x400.jpeg ├── img └── dc_img.png ├── kit_rodolfa.png ├── old ├── 01 - Intro and Scoping │ ├── ClassOverview.pptx │ ├── KumarWaterMains.pdf │ ├── README.md │ ├── Scoping.pptx │ └── ScopingGuide.md ├── 02 - Case Studies and Acquiring Data │ ├── PotashLead.pdf │ ├── README.md │ ├── RehmanDengue.pdf │ ├── VergeHCAlgo.pdf │ ├── casestudies.pptx │ ├── data -- discussion.pptx │ └── data.pptx ├── 03 - Data Exploration, Analytical Formulation, and Baselines │ ├── AmeisenBaseline.md │ ├── BruceExploratory.md │ ├── README.md │ ├── data-exploration.pptx │ ├── formulation-and-baselines.pptx │ └── formulation-discussion.pptx ├── 04 - Machine Learning Pipelines │ ├── KoenPipeline.md │ ├── Machine-Learning-Pipelines.pptx │ └── README.md ├── 05 - Features │ ├── 10-1-20-reminders.pptx │ ├── AkinfaderinImputation.md │ ├── README.md │ ├── features-and-imputation.pptx │ └── features-discussion.pptx ├── 06 - Performance and Evaluation Pt 1 │ ├── Model Selection and Validation - Part I.pptx │ ├── README.md │ └── RudinSecrets.pdf ├── 07 - Performance and Evaluation Pt 2 │ ├── Model Selection and Validation - Part II.pptx │ ├── README.md │ └── StaporEvaluating.pdf ├── 08 - Recap and Check-In │ ├── GonfalonieriDeployment.md │ ├── Model Selection and Recap.pptx │ ├── PerlichLeakage.pdf │ └── README.md ├── 09 - Project Update Presentations │ └── README.md ├── 10 - Model Interpretability Pt 1 │ ├── Interpretability part 1.pptx │ ├── README.md │ └── RudinInterpretable.pdf ├── 11 - Model Interpretability Pt 2 │ ├── Interpretability part 2.pptx │ └── README.md ├── 12 - Algorithmic Bias and Fairness Pt 1 │ ├── FALL 20 - ethics bias fairness - part 1.pptx │ ├── README.md │ └── ethics bias fairness - part 1.pptx ├── 13 - Algorithmic Bias and Fairness Pt 2 │ ├── ChouldechovaFosterCare.pdf │ ├── FALL 20 - ethics bias fairness - part 2.pptx │ ├── FALL 20 - other ML ethics issues.pptx │ ├── 
README.md │ ├── ethics bias fairness - part 2.pptx │ └── ethics bias fairness - part 3.pptx ├── 14 - Causality and Field Validation │ ├── README.md │ ├── causal inference.pdf │ └── causal inference.pptx └── README.md ├── project ├── 10718_assignment_1.ipynb ├── final_project_presentation.md ├── final_project_report.md ├── proposal.md └── readme.md ├── riyaz_panjwani.jpeg ├── ryan.jpeg ├── scripts ├── README.md └── vpn-to-cmu.sh └── techhelp ├── 10718_visualize_chops_example.ipynb ├── README.md ├── class_db_pointers.md ├── dbeaver_instructions.pdf ├── handling_secrets.md ├── img ├── jupyter-login.png ├── jupyter-new-nb.png ├── jupyter-shutdown.png └── jupyter-terminal.png ├── jupyter_setup.md ├── pipelines_session.pptx ├── python_sql_tech_session.ipynb ├── remote-workflow ├── img │ ├── 10718-workflow.png │ ├── bash-absolute-path.png │ ├── bash-anatomy.png │ ├── bash-nano-save.png │ ├── bash-nano.png │ ├── bash-pwd.png │ ├── jupyter-notebook-kernel.png │ ├── jupyter-port-selection.png │ ├── jupyter-token.png │ ├── jupyter_kernel.png │ ├── vscode-changed-interpreter.png │ ├── vscode-click-find.png │ ├── vscode-connect-to-host.png │ ├── vscode-enter-login.png │ ├── vscode-enter-venv-path.png │ ├── vscode-file-menu.png │ ├── vscode-open-connect-to-host.png │ ├── vscode-open-folder.png │ ├── vscode-remote-diagram.png │ ├── vscode-remote-ssh-install.png │ ├── vscode-run-python.png │ ├── vscode-select-folder.png │ ├── vscode-select-host.png │ ├── vscode-select-interpreter-path.png │ ├── vscode-select-interpreter.png │ ├── vscode-select-python.png │ ├── vscode-ssh-connected.png │ └── vscode-update-config.png └── remote-workflow.md ├── sklearn.md ├── tech_session_1_initial_setup.pdf ├── tech_session_2_git_sql.pdf └── tech_session_template.sql /.gitignore: -------------------------------------------------------------------------------- 1 | ~$* 2 | .DS_Store 3 | *.xcf 4 | *.drawio 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Data Science for Social Good 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Lectures/Lecture1-ClassOverview.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/Lectures/Lecture1-ClassOverview.pptx -------------------------------------------------------------------------------- /Lectures/Lecture10-ModelSelection2.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/Lectures/Lecture10-ModelSelection2.pptx -------------------------------------------------------------------------------- /Lectures/Lecture11-Module1Review.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/Lectures/Lecture11-Module1Review.pptx -------------------------------------------------------------------------------- /Lectures/Lecture12-EthicsOverview.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/Lectures/Lecture12-EthicsOverview.pptx -------------------------------------------------------------------------------- /Lectures/Lecture13-InterpretabilityOverview.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/Lectures/Lecture13-InterpretabilityOverview.pptx -------------------------------------------------------------------------------- /Lectures/Lecture14-UnderstandingModels.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/Lectures/Lecture14-UnderstandingModels.pptx -------------------------------------------------------------------------------- /Lectures/Lecture19-FairnessOverview.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/Lectures/Lecture19-FairnessOverview.pptx -------------------------------------------------------------------------------- /Lectures/Lecture2-Scoping.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/Lectures/Lecture2-Scoping.pptx -------------------------------------------------------------------------------- /Lectures/Lecture20-FieldValidation.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/Lectures/Lecture20-FieldValidation.pptx -------------------------------------------------------------------------------- /Lectures/Lecture21-Module3Review.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/Lectures/Lecture21-Module3Review.pptx -------------------------------------------------------------------------------- /Lectures/Lecture22-Wrapup.pptx: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/Lectures/Lecture22-Wrapup.pptx -------------------------------------------------------------------------------- /Lectures/Lecture3-Data.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/Lectures/Lecture3-Data.pptx -------------------------------------------------------------------------------- /Lectures/Lecture4-Formulation.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/Lectures/Lecture4-Formulation.pptx -------------------------------------------------------------------------------- /Lectures/Lecture5-ModelSelection.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/Lectures/Lecture5-ModelSelection.pptx -------------------------------------------------------------------------------- /Lectures/Lecture6-EvaluationMetrics.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/Lectures/Lecture6-EvaluationMetrics.pptx -------------------------------------------------------------------------------- /Lectures/Lecture7-Features.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/Lectures/Lecture7-Features.pptx -------------------------------------------------------------------------------- /Lectures/Lecture8-Machine-Learning-Pipelines.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/Lectures/Lecture8-Machine-Learning-Pipelines.pptx -------------------------------------------------------------------------------- /Lectures/Lecture9-PracticalModeling-Hyperparameters.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/Lectures/Lecture9-PracticalModeling-Hyperparameters.pptx -------------------------------------------------------------------------------- /Lectures/common ml pitfalls.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/Lectures/common ml pitfalls.pptx -------------------------------------------------------------------------------- /Lectures/review-module-2.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/Lectures/review-module-2.pptx -------------------------------------------------------------------------------- /Readings/PDF/AmarasingheExplainable.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/Readings/PDF/AmarasingheExplainable.pdf 
-------------------------------------------------------------------------------- /Readings/PDF/BodriaExplainable.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/Readings/PDF/BodriaExplainable.pdf -------------------------------------------------------------------------------- /Readings/PDF/CaruanaGAM.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/Readings/PDF/CaruanaGAM.pdf -------------------------------------------------------------------------------- /Readings/PDF/CelisFairConstraint.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/Readings/PDF/CelisFairConstraint.pdf -------------------------------------------------------------------------------- /Readings/PDF/CruzHyperparameter.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/Readings/PDF/CruzHyperparameter.pdf -------------------------------------------------------------------------------- /Readings/PDF/DworkDecoupled.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/Readings/PDF/DworkDecoupled.pdf -------------------------------------------------------------------------------- /Readings/PDF/FairLearn.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/Readings/PDF/FairLearn.pdf -------------------------------------------------------------------------------- /Readings/PDF/HardtEqualityOpportunity.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/Readings/PDF/HardtEqualityOpportunity.pdf -------------------------------------------------------------------------------- /Readings/PDF/HuqRacialEquity.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/Readings/PDF/HuqRacialEquity.pdf -------------------------------------------------------------------------------- /Readings/PDF/KamiranPreprocessing.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/Readings/PDF/KamiranPreprocessing.pdf -------------------------------------------------------------------------------- /Readings/PDF/KehrenbergBalancingLabels.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/Readings/PDF/KehrenbergBalancingLabels.pdf -------------------------------------------------------------------------------- /Readings/PDF/KohInfluenceFunctions.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/Readings/PDF/KohInfluenceFunctions.pdf -------------------------------------------------------------------------------- /Readings/PDF/LakkarajuDecisionSets.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/Readings/PDF/LakkarajuDecisionSets.pdf -------------------------------------------------------------------------------- /Readings/PDF/LiuTransductiveTopK.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/Readings/PDF/LiuTransductiveTopK.pdf -------------------------------------------------------------------------------- /Readings/PDF/LundbergHyboxaemia.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/Readings/PDF/LundbergHyboxaemia.pdf -------------------------------------------------------------------------------- /Readings/PDF/LundbergSHAP.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/Readings/PDF/LundbergSHAP.pdf -------------------------------------------------------------------------------- /Readings/PDF/MothilalCounterfactual.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/Readings/PDF/MothilalCounterfactual.pdf -------------------------------------------------------------------------------- /Readings/PDF/ObermeyerBias.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/Readings/PDF/ObermeyerBias.pdf -------------------------------------------------------------------------------- /Readings/PDF/PassiFormulation.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/Readings/PDF/PassiFormulation.pdf -------------------------------------------------------------------------------- /Readings/PDF/PearlCausality.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/Readings/PDF/PearlCausality.pdf -------------------------------------------------------------------------------- /Readings/PDF/PlumbMAPLE.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/Readings/PDF/PlumbMAPLE.pdf -------------------------------------------------------------------------------- /Readings/PDF/Princeton-AI-Ethics-Case-Study-6.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/Readings/PDF/Princeton-AI-Ethics-Case-Study-6.pdf -------------------------------------------------------------------------------- /Readings/PDF/RawlsJustice.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/Readings/PDF/RawlsJustice.pdf -------------------------------------------------------------------------------- /Readings/PDF/RibeiroLIME.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/Readings/PDF/RibeiroLIME.pdf -------------------------------------------------------------------------------- /Readings/PDF/RileyPitfalls.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/Readings/PDF/RileyPitfalls.pdf -------------------------------------------------------------------------------- /Readings/PDF/RobertsCV.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/Readings/PDF/RobertsCV.pdf -------------------------------------------------------------------------------- /Readings/PDF/SalimiDatabaseRepair.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/Readings/PDF/SalimiDatabaseRepair.pdf -------------------------------------------------------------------------------- /Readings/PDF/TanPrototypes.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/Readings/PDF/TanPrototypes.pdf -------------------------------------------------------------------------------- /Readings/PDF/UstunRudinINFORMS.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/Readings/PDF/UstunRudinINFORMS.pdf -------------------------------------------------------------------------------- /Readings/PDF/VermaFairnessDefn.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/Readings/PDF/VermaFairnessDefn.pdf -------------------------------------------------------------------------------- /Readings/PDF/ZafarConstraints.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/Readings/PDF/ZafarConstraints.pdf -------------------------------------------------------------------------------- /Readings/PDF/cvsurvey.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/Readings/PDF/cvsurvey.pdf -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-cayman 2 | title: "10-718: Machine Learning in Practice" 3 | 4 | tags: 5 | - ML 6 | - Explainability 7 | - Bias 8 | -------------------------------------------------------------------------------- /abhishek-parikh_400x400.jpeg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/abhishek-parikh_400x400.jpeg -------------------------------------------------------------------------------- /img/dc_img.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/img/dc_img.png -------------------------------------------------------------------------------- /kit_rodolfa.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/kit_rodolfa.png -------------------------------------------------------------------------------- /old/01 - Intro and Scoping/ClassOverview.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/old/01 - Intro and Scoping/ClassOverview.pptx -------------------------------------------------------------------------------- /old/01 - Intro and Scoping/KumarWaterMains.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/old/01 - Intro and Scoping/KumarWaterMains.pdf -------------------------------------------------------------------------------- /old/01 - Intro and Scoping/README.md: -------------------------------------------------------------------------------- 1 | ## Introduction to the goals of the Class 2 | [Slides](ClassOverview.pptx) 3 | 4 | On Tuesday, we’ll provide an introduction to the class, its goals, and an overview of the project options to help you decide what you’re interested in working on for the remainder of the semester. 5 | 6 | Notes for Tuesday: 7 | - Different from previous iterations of the course, this one is project-driven 8 | - Note that there isn't a single lecture on the syllabus about ML methods; the class is about everything that happens before and after building a model, which is where you spend most of your time anyway 9 | - Unfortunately, no single textbook or set of readings exists for this type of course. The readings we've selected for each class speak to elements of the topics, but lectures will be critical to get a more complete understanding 10 | 11 | During the Wednesday session, we'll help ensure everyone is set up to access the class technical resources. 12 | 13 | ## Tech Session 14 | 15 | [Slides](https://docs.google.com/presentation/d/1MEP-UF9dHuEfoIWWKIphFAYI23miVHCovLJgndPLgvE/edit?usp=sharing) 16 | 17 | This tech session will set you up with access to the class infrastructure: the course server via SSH, the database via command line and GUI, and the course github. 18 | 19 | Don't worry if any of these topics are unfamiliar to you - future tech sessions will cover them in greater depth. 20 | 21 | ## ML Project Scoping 22 | 23 | [Slides](Scoping.pptx) and [Scoping Worksheet](https://docs.google.com/document/d/17svWoaRrRCjsROb2UxOVKPY86GyyRuiJMnPgjL2IkKg/edit) 24 | 25 | On Thursday, we’ll talk about scoping, problem definition, and understanding and balancing organizational goals.
Well before the outset of technical work, a decision needs to be made about whether a given policy problem can and should be addressed with machine learning: 26 | - Is the problem significant, feasible to solve with a technical approach, and of sufficient importance to policy makers that they will devote resources to implementing the solution? 27 | - How will success be measured? 28 | - How will (often competing) goals of efficiency, effectiveness, and equity be balanced? 29 | 30 | We'll present and discuss our project scoping methodology that you will be expected to use in the class projects. 31 | 32 | 33 | ### Required Readings: 34 | - [*Data Science Project Scoping Guide*](http://www.datasciencepublicpolicy.org/home/resources/data-science-project-scoping-guide/) 35 | - [*Using Machine Learning to Assess the Risk of and Prevent Water Main Breaks*](KumarWaterMains.pdf) by Kumar, A, Rizvi, SAA, et al. KDD 2018. 36 | 37 | ### Optional Reading: 38 | - *Deconstructing Statistical Questions* by Hand, D.J. J. Royal Stat Soc. A 157(3) 1994. [Available Online](http://stat688.bio5.org/sites/default/files/fall2014/hand-deconstructin.pdf) 39 | -------------------------------------------------------------------------------- /old/01 - Intro and Scoping/Scoping.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/old/01 - Intro and Scoping/Scoping.pptx -------------------------------------------------------------------------------- /old/01 - Intro and Scoping/ScopingGuide.md: -------------------------------------------------------------------------------- 1 | # [Click Here for the Reading](http://www.datasciencepublicpolicy.org/home/resources/data-science-project-scoping-guide/) 2 | -------------------------------------------------------------------------------- /old/02 - Case Studies and Acquiring Data/PotashLead.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/old/02 - Case Studies and Acquiring Data/PotashLead.pdf -------------------------------------------------------------------------------- /old/02 - Case Studies and Acquiring Data/README.md: -------------------------------------------------------------------------------- 1 | ## General plan for the week 2 | This week, we’ll break up into project teams and begin project work. 3 | 4 | 5 | ## Tuesday: Case Studies 6 | Practical examples can provide a great way to gain an understanding of the nuance of applying machine learning to policy problems, so Tuesday will focus on a class discussion of case studies of recent applications that have been deployed in the real world. 7 | 8 | ### Required Reading: 9 | - *Fine-grained dengue forecasting using telephone triage services* by Rehman, NA, et al. Sci. Adv. 2016. [Available Online](https://nyunetworks.github.io/Pubs/rehman-science16.pdf) 10 | 11 | ### Optional Readings: 12 | - *Predictive Modeling for Public Health: Preventing Childhood Lead Poisoning* by Potash, E, et al. KDD 2015. [Available Online](http://www.dssgfellowship.org/wp-content/uploads/2016/01/p2039-potash.pdf), [Short 3 minute video](https://www.youtube.com/watch?v=DbplLXRQquI) 13 | - *What Happens When an Algorithm Cuts Your Health Care* by Lecher, C. 2018.
(The Verge) [Available Online](https://www.theverge.com/2018/3/21/17144260/healthcare-medicaid-algorithm-arkansas-cerebral-palsy) 14 | 15 | ## Wednesday: SQL + Github 16 | During the Wednesday session, the TAs will lead a tutorial on using SQL and github for your class project. 17 | **[Slides for the git and SQL session](https://docs.google.com/presentation/d/11fXNBGCgp26XjLgJOmtyXf8MgAtjrIY2ThA92vzca70/edit#slide=id.g957ac0b37e_0_0)** 18 | 19 | - Useful github tutorials are [here](https://dssg.github.io/hitchhikers-guide/curriculum/setup/git-and-github/) 20 | - [Basic SQL tutorial](https://dssg.github.io/hitchhikers-guide/curriculum/software/basic_sql/) 21 | 22 | ## Thursday: Data Acquisition and Integration 23 | (*[Lecture Slides](data.pptx)*) 24 | 25 | Required: (*[Pre-watch lecture video](https://youtu.be/BCnut05L5OA)*) 26 | 27 | On Thursday, we’ll delve into some of the details of acquiring data, protecting privacy, and linking records across data sources. Acquiring data from an external (or even an internal) organization is often an involved process with a number of legal and technical aspects. Researchers need to understand how the data acquired may and may not be used (typically formalized in a data use agreement as well as underlying law) and ensure that the privacy of individuals in the dataset is protected (potentially both through access restrictions and techniques like anonymization). Once data has been acquired, it often needs to be transformed for ingestion into the system used for analysis, records from multiple data sources linked, and data structured for further analysis.
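To make the record-linkage step concrete, here is a minimal sketch of linking two person-level tables on a normalized name and date-of-birth key. The file names and columns are hypothetical, and a deterministic key like this is only a starting point:

```python
import pandas as pd

def normalized_key(df, name_col, dob_col):
    """Build a crude join key: lowercased, punctuation-free name plus date of birth."""
    name = (
        df[name_col]
        .str.lower()
        .str.replace(r"[^a-z ]", "", regex=True)
        .str.split()
        .str.join(" ")
    )
    dob = pd.to_datetime(df[dob_col], errors="coerce").dt.strftime("%Y-%m-%d")
    return name + "|" + dob

# Hypothetical extracts from two source systems
clients = pd.read_csv("clients.csv")        # client_id, full_name, birth_date, ...
visits = pd.read_csv("service_visits.csv")  # visit_id, person_name, dob, ...

clients["link_key"] = normalized_key(clients, "full_name", "birth_date")
visits["link_key"] = normalized_key(visits, "person_name", "dob")

# Exact match on the normalized key; check how many rows fail to link
linked = clients.merge(visits, on="link_key", how="left", indicator=True)
print(linked["_merge"].value_counts())
```

Exact keys like this are brittle in the face of nicknames, typos, and transposed fields, which is why the Christen reading below covers probabilistic matching in depth.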
28 | 29 | During class on Thursday, we'll also talk a little bit about working together with your project team. 30 | 31 | ### Optional Readings: 32 | - *Broken Promises of Privacy* by Ohm, P. UCLA Law Review. 2009. Introduction and Section 1. [Available Online](https://heinonline.org/HOL/Page?handle=hein.journals/uclalr57&div=48&g_sent=1&casa_token=&collection=journals) 33 | - *Data Matching* by Christen, P. Springer (2012). Chapter 2: The Data Matching Process [Available Online](https://link.springer.com/book/10.1007\%2F978-3-642-31164-2) 34 | - *Big Data and Social Science* edited by Foster, Ghani, et al. Chapter 4: Databases. 35 | - *Challenges in Administrative Data Linkage for Research*. Harron, Katie, et al. Big Data & Society. Dec. 2017 [Available Online](https://journals.sagepub.com/doi/full/10.1177/2053951717745678) 36 | 37 | -------------------------------------------------------------------------------- /old/02 - Case Studies and Acquiring Data/RehmanDengue.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/old/02 - Case Studies and Acquiring Data/RehmanDengue.pdf -------------------------------------------------------------------------------- /old/02 - Case Studies and Acquiring Data/VergeHCAlgo.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/old/02 - Case Studies and Acquiring Data/VergeHCAlgo.pdf -------------------------------------------------------------------------------- /old/02 - Case Studies and Acquiring Data/casestudies.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/old/02 - Case Studies and Acquiring Data/casestudies.pptx -------------------------------------------------------------------------------- /old/02 - Case Studies and Acquiring Data/data -- discussion.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/old/02 - Case Studies and Acquiring Data/data -- discussion.pptx -------------------------------------------------------------------------------- /old/02 - Case Studies and Acquiring Data/data.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/old/02 - Case Studies and Acquiring Data/data.pptx -------------------------------------------------------------------------------- /old/03 - Data Exploration, Analytical Formulation, and Baselines/AmeisenBaseline.md: -------------------------------------------------------------------------------- 1 | # [Click Here for the Reading](https://blog.insightdatascience.com/always-start-with-a-stupid-model-no-exceptions-3a22314b9aaa) 2 | -------------------------------------------------------------------------------- /old/03 - Data Exploration, Analytical Formulation, and Baselines/BruceExploratory.md: -------------------------------------------------------------------------------- 1 | # [Click Here for the Reading](https://learning.oreilly.com/library/view/practical-statistics-for/9781491952955/ch01.html#EDA) 2 | -------------------------------------------------------------------------------- /old/03 - Data Exploration, Analytical Formulation, and Baselines/README.md: -------------------------------------------------------------------------------- 1 | ## Project Work 2 | Work on your project during this week should include continuing to develop and refine your scope as you begin to explore the data. You'll also need to prepare and load some data into a database in order to make use of it in your modeling. 3 | 4 | **Due Friday (Sep 18)**: [Data loading assignment with ACS data.
Assignment and submission instructions are on canvas](https://canvas.cmu.edu/courses/18465/assignments/268647) 5 | 6 | ## Tuesday: Data Exploration 7 | Tuesday of this week ([Lecture Slides](data-exploration.pptx)) will provide a crash course in exploratory data analysis. Data exploration is fundamental to developing an understanding of the nuances of the data and how the policy problem you initially scoped can be specifically formulated as a machine learning problem. This process involves generating and plotting summary statistics, exploring trends over time and understanding rapid changes in distributions, as well as identifying missing data and outliers. Typically, data exploration should involve considerable input from domain experts as you develop an understanding of how the data relates to the underlying generative process, as well as its idiosyncrasies and limitations. 8 | 9 | We'll also dedicate about 30 minutes during class on Tuesday for you to meet with your project teams and discuss your project scope. 10 | 11 | Required Reading for Tuesday: 12 | - *Practical Statistics for Data Scientists* by Bruce, A. and Bruce, P. O'Reilly (2017). Chapter 1: Exploratory Data Analysis [Available Online](https://learning.oreilly.com/library/view/practical-statistics-for/9781491952955/ch01.html#EDA) 13 | 14 | ## Wednesday: Project Work and Check-ins 15 | During the Wednesday session, you'll have time to work together with your project team to refine your scope and explore the data. We will also use this time to meet with every project team to go over project and data details. 16 | 17 | ## Thursday: Analytical Formulation and Baseline 18 | 19 | Required Readings for Thursday: 20 | - [Recorded Lecture Video](https://youtu.be/-whVPGncD9c) 21 | - Dissecting Racial Bias in an Algorithm Used to Manage the Health of Populations by Obermeyer, Z., Powers, B., et al. Science. 2019. [Available Online](https://science.sciencemag.org/content/sci/366/6464/447.full.pdf) 22 | 23 | On Thursday, we'll discuss analytical formulation of policy projects. Distinct from the initial scoping, a true analytical formulation of your policy problem can 24 | only come after you have developed an understanding of the data at hand, which in turn will 25 | often result in a greater understanding of the problem itself. Here, you’ll ask how specifically 26 | your target variable (if relevant) is defined in the data, what types of information are available as predictors, and what baseline you’ll measure performance against. Very rarely is the appropriate baseline as simple as "random choice" or the population prevalence. Rather, 27 | it should reflect what would be expected to happen otherwise: perhaps a simple decision rule 28 | that an expert would come up with or even a pre-existing statistical model that the current 29 | effort is seeking to replace. 30 |
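To make this concrete, here is a minimal sketch, with entirely synthetic data and hypothetical column names, comparing an expert-style decision rule against the weakest sensible reference point:

```python
import numpy as np
import pandas as pd
from sklearn.metrics import precision_score, recall_score

# Synthetic stand-in data: rows are entities, the label is the outcome of interest,
# and prior_violations is a predictor an expert might plausibly threshold on.
rng = np.random.default_rng(0)
df = pd.DataFrame({"prior_violations": rng.poisson(2, 1000)})
y = (rng.random(1000) < 0.1 + 0.1 * df["prior_violations"]).astype(int)

# Baseline 1: an expert-style decision rule -- flag anything with 3+ prior violations
rule_preds = (df["prior_violations"] >= 3).astype(int)

# Baseline 2: population prevalence -- flag everyone; precision equals the base rate
prevalence_preds = np.ones(len(df), dtype=int)

for name, preds in [("expert rule", rule_preds), ("flag everyone", prevalence_preds)]:
    print(name, "precision:", precision_score(y, preds), "recall:", recall_score(y, preds))
```

If your model can't beat the expert rule at the same intervention capacity, the extra complexity isn't yet earning its keep.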
31 | Optional Readings: 32 | - Always Start with a Stupid Model, No Exceptions by 33 | Ameisen, E. Medium. [Available Online](https://blog.insightdatascience.com/always-start-with-a-stupid-model-no-exceptions-3a22314b9aaa) 34 | - Create a Common-Sense Baseline First by Ramakrishnan. Medium. [Available Online](https://towardsdatascience.com/first-create-a-common-sense-baseline-e66dbf8a8a47) 35 | - Data Science for Business by Provost and Fawcett. O’Reilly. 2013. Chapter 2: Business 36 | Problems and Data Science [Available Online](https://learning.oreilly.com/library/view/data-science-for/9781449374273/ch02.html) 37 | 38 | 39 | -------------------------------------------------------------------------------- /old/03 - Data Exploration, Analytical Formulation, and Baselines/data-exploration.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/old/03 - Data Exploration, Analytical Formulation, and Baselines/data-exploration.pptx -------------------------------------------------------------------------------- /old/03 - Data Exploration, Analytical Formulation, and Baselines/formulation-and-baselines.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/old/03 - Data Exploration, Analytical Formulation, and Baselines/formulation-and-baselines.pptx -------------------------------------------------------------------------------- /old/03 - Data Exploration, Analytical Formulation, and Baselines/formulation-discussion.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/old/03 - Data Exploration, Analytical Formulation, and Baselines/formulation-discussion.pptx -------------------------------------------------------------------------------- /old/04 - Machine Learning Pipelines/KoenPipeline.md: -------------------------------------------------------------------------------- 1 | # [Click Here for the Reading](https://towardsdatascience.com/architecting-a-machine-learning-pipeline-a847f094d1c7) 2 | -------------------------------------------------------------------------------- /old/04 - Machine Learning Pipelines/Machine-Learning-Pipelines.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/old/04 - Machine Learning Pipelines/Machine-Learning-Pipelines.pptx -------------------------------------------------------------------------------- /old/04 - Machine Learning Pipelines/README.md: -------------------------------------------------------------------------------- 1 | ## Project Work 2 | At this point in your project work, you should be developing your initial end-to-end pipeline. 3 | 4 | **Due Friday (Sep 25)**: [Project proposal with scope and descriptive statistics.](https://canvas.cmu.edu/courses/18465/assignments/268653) 5 | 6 | ## Tuesday: Machine Learning Pipelines 7 | On Tuesday ([Lecture Slides](Machine-Learning-Pipelines.pptx)), we’ll describe the components of typical machine learning pipelines. End-to-end ML pipelines can quickly become unwieldy, with several moving pieces, and well-structured, modular code is often critical to detecting and fixing bugs in the process. This session will provide an overview of the pipeline, each underlying element, and some best practices for building them.
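To illustrate the modularity point, here is a minimal sketch (not the course's prescribed structure; the column names are hypothetical) of keeping preprocessing and modeling together in a single composable scikit-learn object:

```python
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

numeric_cols = ["age", "prior_events"]   # hypothetical feature names
categorical_cols = ["facility_type"]

# Preprocessing is declared once and applied identically in training and validation
preprocess = ColumnTransformer([
    ("num", Pipeline([("impute", SimpleImputer(strategy="median")),
                      ("scale", StandardScaler())]), numeric_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
])

model = Pipeline([
    ("preprocess", preprocess),
    ("classify", RandomForestClassifier(n_estimators=500, n_jobs=-1)),
])

# Swapping in a different classifier or imputation strategy changes one line,
# not the surrounding glue code:
# model.fit(X_train, y_train)
# scores = model.predict_proba(X_test)[:, 1]
```

Because the fitted pipeline carries its preprocessing with it, the same object can be saved, evaluated on later time periods, and inspected step by step when debugging.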
8 | 9 | Required Readings for Tuesday: 10 | - Review the lecture slides before class: [Online](https://github.com/dssg/mlforpublicpolicylab/blob/master/04%20-%20Machine%20Learning%20Pipelines/Machine-Learning-Pipelines.pptx) 11 | 12 | Useful examples: 13 | - [Triage](http://www.datasciencepublicpolicy.org/triage) 14 | - [Code on github](http://github.com/dssg/triage) 15 | - [Tutorial](https://dssg.github.io/triage/dirtyduck/) 16 | 17 | Optional Reading: 18 | - *Architecting a Machine Learning Pipeline* by Koen, S. (Medium) [Available Online](https://towardsdatascience.com/architecting-a-machine-learning-pipeline-a847f094d1c7) 19 | - *Meet Michelangelo: Uber's Machine Learning Platform* by Hermann, J and Del Balso, M. [Available Online](https://eng.uber.com/michelangelo/) 20 | 21 | ## Wednesday and Thursday: Project Work 22 | During the Wednesday and Thursday sessions, you'll have time to work with your group on the proposal due this week as well as your initial pipeline. 23 | 24 | Optional Readings: 25 | - *Data Analysis, Exploratory* by Brillinger. [Available Online](https://www.stat.berkeley.edu/~brill/Papers/EDASage.pdf) 26 | 27 | 28 | ## Advanced Version 29 | [ML Pipeline (architecture) video](https://www.youtube.com/watch?v=9653dXoqSpI&ab_channel=DataScienceforSocialGood) 30 | -------------------------------------------------------------------------------- /old/05 - Features/10-1-20-reminders.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/old/05 - Features/10-1-20-reminders.pptx -------------------------------------------------------------------------------- /old/05 - Features/AkinfaderinImputation.md: -------------------------------------------------------------------------------- 1 | # [Click Here for the Reading](https://medium.com/ibm-data-science-experience/missing-data-conundrum-exploration-and-imputation-techniques-9f40abe0fd87) 2 | -------------------------------------------------------------------------------- /old/05 - Features/README.md: -------------------------------------------------------------------------------- 1 | ## Project Work 2 | Pipeline development should be continuing in your project, with a focus on producing the 3 | simplest-possible version of the full system. 4 | 5 | **Due Friday, Oct. 2:** Peer reviews of two project proposals. 6 | 7 | ## Tuesday: Feature Engineering and Imputation 8 | In many real-world contexts, expressing domain expertise through thoughtful feature engineering can dramatically improve model performance: understanding which underlying factors are likely to be predictive helps the model find these relationships (see the short imputation sketch at the end of this week's notes). Likewise, 9 | most data sets you’ll encounter in practice are littered with outliers, inconsistencies, and missingness. Handling these data issues in a smart way can be critical to a project’s success. 10 | 11 | Required Readings for Tuesday: 12 | - [Short Video Lecture](https://www.youtube.com/watch?v=kluqz_1GN5c) and corresponding [slides](https://github.com/dssg/mlforpublicpolicylab/blob/master/05%20-%20Features/features-and-imputation.pptx?raw=true) 13 | 14 | 15 | ## Wednesday: Remote Workflows Tech Session 16 | On Wednesday, the TAs will lead tutorials on remote technical workflows and using Python and SQL together. 17 | 18 | ## Thursday: Project Work 19 | On Thursday, you'll have time to work together with your project team.
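As promised above, a minimal sketch (with a hypothetical column) of one practice from the features lecture: imputing a missing value while keeping the fact of missingness as its own feature, since missingness is often informative in itself:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"income": [42000.0, np.nan, 58000.0, np.nan]})  # hypothetical feature

# Keep the fact of missingness as its own indicator before filling the value in
df["income_missing"] = df["income"].isna().astype(int)
df["income_imputed"] = df["income"].fillna(df["income"].median())

print(df)
```

In a real pipeline, the median should be computed on the training split only and then applied to later validation data, to avoid leaking information across time.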
20 | 21 | Optional Readings: 22 | - Feature Engineering for Machine Learning by Zheng, A. and Casari, A. O’Reilly. 2018. Chapter 2: Fancy Tricks with Simple Numbers [Available Online](https://learning.oreilly.com/library/view/feature-engineering-for/9781491953235/) 23 | - Missing-data imputation by Gelman, A. [Available Online](http://www.stat.columbia.edu/~gelman/arm/missing.pdf) 24 | - Missing Data Conundrum by Akinfaderin, W. Medium. [Available Online](https://medium.com/ibm-data-science-experience/missing-data-conundrum-exploration-and-imputation-techniques-9f40abe0fd87) 25 | 26 | -------------------------------------------------------------------------------- /old/05 - Features/features-and-imputation.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/old/05 - Features/features-and-imputation.pptx -------------------------------------------------------------------------------- /old/05 - Features/features-discussion.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/old/05 - Features/features-discussion.pptx -------------------------------------------------------------------------------- /old/06 - Performance and Evaluation Pt 1/Model Selection and Validation - Part I.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/old/06 - Performance and Evaluation Pt 1/Model Selection and Validation - Part I.pptx -------------------------------------------------------------------------------- /old/06 - Performance and Evaluation Pt 1/README.md: -------------------------------------------------------------------------------- 1 | ## Project Work 2 | By this week, your group should have a very simple version of an end-to-end pipeline that can generate 3 | preliminary results for a single model specification. 4 | 5 | **Due Friday, Oct. 9:** [Skeleton pipeline code, analytical formulation, and baselines](https://canvas.cmu.edu/courses/18465/assignments/297595) 6 | 7 | ## Tuesday: Performance Metrics and Evaluation, Part I ([Zoom video](https://cmu.zoom.us/rec/play/SnGljUp1WBKvDXbaiOOeVFxHmYpj2-CYxH0eZVIQmyJd73VBTAVyfI8HSES8qHMoF5K7k5gsS2O7b5Sj.9rgpGOFjOwyRLLD0) - restricted to students in class for now) 8 | In most cases, a vast array of methods — each with a number of tunable hyperparameters — 9 | can be brought to bear on your modeling question. How do you decide which models are better than others and how can you be confident this decision will generalize into the future when the model is deployed? How should you balance considerations of performance (``accuracy``) and 10 | fairness when making these decisions? Are models that are performing similarly well giving 11 | similar predictions? What should you do if they are not? This week, we’ll begin to answer 12 | these questions, focusing on cross-validation strategies (beyond the often used k-fold) and choosing performance metrics.
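To preview the idea at the heart of this week's required reading, a tiny sketch of temporal validation using scikit-learn's `TimeSeriesSplit` (a stand-in for the more flexible split definitions we'll discuss): training data always strictly precedes validation data.

```python
import numpy as np
from sklearn.model_selection import TimeSeriesSplit

# Hypothetical: 8 time-ordered observations (e.g., one cohort per quarter)
X = np.arange(8).reshape(-1, 1)

for train_idx, val_idx in TimeSeriesSplit(n_splits=3).split(X):
    print("train:", train_idx, "-> validate:", val_idx)
# train: [0 1] -> validate: [2 3]
# train: [0 1 2 3] -> validate: [4 5]
# train: [0 1 2 3 4 5] -> validate: [6 7]
# Unlike shuffled k-fold, no split ever validates on data older than its training set.
```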
13 | 14 | Required Readings for Tuesday: 15 | 16 | - Cross-validation strategies for data with temporal, spatial, hierarchical, or phylogenetic 17 | structure by Roberts, DR, Bahn, V, et al. Ecography 40:2017. [Available Online](https://onlinelibrary.wiley.com/doi/pdf/10.1111/ecog.02881) 18 | 19 | Optional Readings: 20 | 21 | - Time Series Nested Cross-Validation by Cochrane, C. Medium. [Available Online](https://towardsdatascience.com/time-series-nested-cross-validation-76adba623eb9) 22 | - Section 7 of Chapter 7 in Big Data and Social Science (2nd edition) edited by Foster, Ghani, et al. Chapter 7: Machine Learning. [Available Online](https://textbook.coleridgeinitiative.org/chap-ml.html) 23 | - The Secrets of Machine Learning by Rudin, C. and Carlson, D. arXiv preprint: 1906.01998. 2019. [Available Online](https://arxiv.org/abs/1906.01998) 24 | 25 | 26 | ## Wednesday Group Check-Ins 27 | On Wednesday, we’ll start our regular group check-ins to provide feedback on your project progress. 28 | 29 | ## Thursday: Temporal Validation Deep Dive 30 | Thursday, we’ll meet together as a class to do a deep dive on temporal validation through a few class project examples. 31 | -------------------------------------------------------------------------------- /old/06 - Performance and Evaluation Pt 1/RudinSecrets.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/old/06 - Performance and Evaluation Pt 1/RudinSecrets.pdf -------------------------------------------------------------------------------- /old/07 - Performance and Evaluation Pt 2/Model Selection and Validation - Part II.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/old/07 - Performance and Evaluation Pt 2/Model Selection and Validation - Part II.pptx -------------------------------------------------------------------------------- /old/07 - Performance and Evaluation Pt 2/README.md: -------------------------------------------------------------------------------- 1 | ## Project Work 2 | During this week, your pipeline development and refinement should continue with a widening set of model specifications and features to explore. 3 | 4 | Sometime this week, we'll have a tech session on sklearn and modeling parameters. 5 | 6 | ## Tuesday: Performance Metrics and Evaluation, Pt II 7 | On Tuesday, we’ll continue our discussion from the previous week, delving into the details of winnowing down a large number of model specifications to one or a handful that perform "best" for some definition of "best". In particular, we’ll focus on the common case of machine learning problems with a strong time series component and the desire to balance performance and stability in model selection.
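One simple way to operationalize that balance (a sketch with made-up numbers, similar in spirit to the selection rules in Triage's audition module; the Liu et al. reading below digs into the top-k metric itself) is to compare specifications on both average validation performance and worst-case shortfall from each split's best performer:

```python
import numpy as np

# Hypothetical: precision@top-k for three model specs across four temporal validation splits
results = {
    "logistic_l1":   np.array([0.61, 0.58, 0.63, 0.60]),
    "random_forest": np.array([0.70, 0.52, 0.70, 0.68]),
    "boosted_trees": np.array([0.64, 0.62, 0.66, 0.64]),
}

best_per_split = np.max(np.stack(list(results.values())), axis=0)
for name, scores in results.items():
    regret = np.max(best_per_split - scores)  # worst distance from the split's best model
    print(f"{name}: mean={scores.mean():.3f}, max regret={regret:.3f}")
# Here boosted_trees trades a slightly lower mean than random_forest
# for much better worst-case behavior -- often the more stable choice to deploy.
```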
8 | 9 | Required Readings for Tuesday: 10 | - [Short Video on Evaluation Metrics](https://youtu.be/JsicCiQiq7E) 11 | - *Transductive Optimization of Top k Precision* by Liu, LP, Dietterich, TG, et al. IJCAI 2016. [Available Online](https://arxiv.org/abs/1510.05976) 12 | 13 | Highly Recommended: 14 | - Section 7 of Chapter 7 in Big Data and Social Science (2nd edition) edited by Foster, Ghani, et al. Chapter 7: Machine Learning. [Available Online](https://textbook.coleridgeinitiative.org/chap-ml.html) 15 | 16 | Optional Readings: 17 | - *Evaluating and Comparing Classifiers* by Stapor, K. CORES 2017. [Available Online](https://link.springer.com/chapter/10.1007/978-3-319-59162-9_2) 18 | 19 | ## Wednesday: Group Check-Ins 20 | 21 | Make sure to go over the slides submitted by the other teams in your project group; we'll use the Wednesday sessions for each team and instructor to give feedback. 22 | 23 | ## Thursday: Project Work 24 | 25 | -------------------------------------------------------------------------------- /old/07 - Performance and Evaluation Pt 2/StaporEvaluating.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/old/07 - Performance and Evaluation Pt 2/StaporEvaluating.pdf -------------------------------------------------------------------------------- /old/08 - Recap and Check-In/GonfalonieriDeployment.md: -------------------------------------------------------------------------------- 1 | # [Click Here for the Reading](https://towardsdatascience.com/why-is-machine-learning-deployment-hard-443af67493cd) 2 | -------------------------------------------------------------------------------- /old/08 - Recap and Check-In/Model Selection and Recap.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/old/08 - Recap and Check-In/Model Selection and Recap.pptx -------------------------------------------------------------------------------- /old/08 - Recap and Check-In/PerlichLeakage.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/old/08 - Recap and Check-In/PerlichLeakage.pdf -------------------------------------------------------------------------------- /old/08 - Recap and Check-In/README.md: -------------------------------------------------------------------------------- 1 | ## Project Work 2 | During this week, your pipeline development and refinement should continue with a widening set of model specifications and features to explore. 3 | 4 | **Due Monday, Oct 19**: Technical modeling plan and detailed feature list. 5 | 6 | ## Tuesday: Recap and Check-In 7 | We'll dedicate our time this Tuesday to a recap of the class so far and check-in on progress on your projects. 8 | 9 | Required Readings for Tuesday: 10 | - *Three Pitfalls to Avoid in Machine Learning* by Riley, P. Nature. 527. 2019 (Comment) [Available Online](https://www.nature.com/magazine-assets/d41586-019-02307-y/d41586-019-02307-y.pdf) 11 | - *Top 10 ways your Machine Learning models may have leakage* by Ghani, R. et al. DSSG Blog. [Available Online](http://www.dssgfellowship.org/2020/01/23/top-10-ways-your-machine-learning-models-may-have-leakage/) 12 | 13 | ## Wednesday: Group Check-Ins 14 | 15 | ## Thursday: Project Work 16 | 17 | Optional Readings: 18 | - *Data Science for Business* by Provost and Fawcett. O’Reilly. 2013. Chapter 5: Overfitting and Its Avoidance [Available Online](https://learning.oreilly.com/library/view/data-science-for/9781449374273/ch05.html) 19 | - *Overview of Different Approaches to Deploying Machine Learning Models in Production* by Kervizic, J. KDnuggets. [Available Online](https://www.kdnuggets.com/2019/06/approaches-deploying-machine-learning-production.html) 20 | - *Why is Machine Learning Deployment Hard?* by Gonfalonieri, A. Medium.
[Available Online](https://towardsdatascience.com/why-is-machine-learning-deployment-hard-443af67493cd) 21 | - *Leakage in Data Mining* by Kaufman, S., Rosset, S., et al. TKDD. 2011. [Available Online](https://www.researchgate.net/profile/Claudia_Perlich/publication/221653692_Leakage_in_Data_Mining_Formulation_Detection_and_Avoidance/links/54418bb80cf2a6a049a5a0ca/Leakage-in-Data-Mining-Formulation-Detection-and-Avoidance.pdf) 22 | 23 | 24 | -------------------------------------------------------------------------------- /old/09 - Project Update Presentations/README.md: -------------------------------------------------------------------------------- 1 | ## Project Update Presentations 2 | By this week, your group should have a preliminary set of “correct but crappy” results reflecting a relatively simple model grid and the features you prioritized to build as a first pass. 3 | 4 | **Due Monday, Oct 26:** Weekly project update with validation splits, matrices, and “version 0” results. 5 | 6 | This week, each group will give a presentation about the current status of their project, covering the problem setting, approach, and initial results. The presentations will be split between the Tuesday and Thursday sessions. On Wednesday, we’ll continue our group check-ins, and sometime this week we’ll hold an additional tech session with a deep dive on building modeling pipelines. 7 | 8 | The mid-term progress presentation should provide a status update on your work so far as well as your plans for the next steps in your project. The presentation should be clear, well-organized, and at an appropriate level of depth for the decision-makers relevant to your project (as opposed to ML experts). 9 | 10 | 11 | ### Guidelines for Presentations: 12 | 13 | Please keep your presentation to 10 minutes and provide a link to the recording in the spreadsheet listed on canvas for your classmates to access. 14 | 15 | #### Content and timing: 16 | 17 | - What problem are you solving and why is it important? Be specific about goals and scope and available data (at a high level) for addressing it (1 minute) 18 | 19 | - Analytical formulation (3 minutes): 20 | - what is the overall formulation? 21 | - what are rows, cohort, labels, and features 22 | - what model evaluation/selection metric are you focused on? 23 | - highlight any data exploration that informed these choices 24 | - Pipeline structure/overview 25 | 26 | - Initial results (4 minutes): 27 | - V0 results for initial models (including PR-k curve) 28 | - results using all the baselines (also describe the baselines) 29 | 30 | - Next steps (2 minutes): 31 | - Planned features 32 | - Model grid & selection 33 | -------------------------------------------------------------------------------- /old/10 - Model Interpretability Pt 1/Interpretability part 1.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/old/10 - Model Interpretability Pt 1/Interpretability part 1.pptx -------------------------------------------------------------------------------- /old/10 - Model Interpretability Pt 1/README.md: -------------------------------------------------------------------------------- 1 | ## Project Work 2 | At this point, your group should be continuing to refine and expand on your preliminary modeling results. 3 | 4 | **Due Monday, Nov.
2**: Weekly project update with fixed "version 0" results and a list of models and hyperparameters you'll be running. 5 | 6 | ## Tuesday: Model Interpretability, Part I 7 | Model interpretability can be thought of at two levels: global (how 8 | the model works in aggregate) and local (why an individual 9 | prediction came out as it did). This week, we’ll focus on the bigger 10 | picture: understanding how a model is performing globally and what it 11 | means to compare this performance across model specifications. 12 | 13 | Required Readings for Tuesday: 14 | - *Optimized Scoring Systems: Toward Trust in Machine Learning for Healthcare and Criminal Justice* by Rudin, C, and Ustun, B. INFORMS Journal on Applied Analytics. 2018. [Available Online](https://pubsonline.informs.org/doi/pdf/10.1287/inte.2018.0957) 15 | - *Intelligible Models for HealthCare: Predicting Pneumonia Risk and Hospital 30-day Readmission* by Caruana, R, et al. KDD 2015. [Available Online](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.704.9327&rep=rep1&type=pdf) 16 | 17 | ## Wednesday: Group Check-Ins 18 | 19 | ## Thursday: Project Work 20 | 21 | Optional Readings: 22 | - *Interpretable Classification Models for Recidivism Prediction* by Zeng, J, Ustun, B, and Rudin, C. J. Royal Stat. Soc. A. 2016. [Available Online](https://rss.onlinelibrary.wiley.com/doi/pdf/10.1111/rssa.12227) 23 | -------------------------------------------------------------------------------- /old/10 - Model Interpretability Pt 1/RudinInterpretable.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/old/10 - Model Interpretability Pt 1/RudinInterpretable.pdf -------------------------------------------------------------------------------- /old/11 - Model Interpretability Pt 2/Interpretability part 2.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/old/11 - Model Interpretability Pt 2/Interpretability part 2.pptx -------------------------------------------------------------------------------- /old/11 - Model Interpretability Pt 2/README.md: -------------------------------------------------------------------------------- 1 | ## Project Work 2 | By this week, project work should be beginning to focus more heavily on evaluation, model selection, and interpretation. 3 | 4 | **Due Monday, Nov 9**: Weekly project update. 5 | 6 | ## Tuesday: Model Interpretability, Part II 7 | This week, we’ll look at the other side of interpretability, with a 8 | focus on several practical applications of local explanations: they 9 | can help researchers debug and improve their models, build trust 10 | among stakeholders (including a growing legal movement towards a 11 | "right to explanation"), help those acting on model predictions 12 | understand when they should override the model with their judgement, 13 | and, importantly, help those actors decide not only whom to intervene on 14 | but also what sort of intervention to take. 15 | 16 | Required Readings for Tuesday: 17 | - *Why Should I Trust You? Explaining the Predictions of any Classifier* by Ribeiro, MT, Singh, S, and Guestrin, C. KDD 2016. [Available Online](https://dl.acm.org/doi/abs/10.1145/2939672.2939778) 18 | - *Explainable machine-learning predictions for the prevention of hypoxaemia during surgery* by Lundberg, SM, Nair, B, et al.
Nature Biomed. Eng. 2018. [Available Online](https://www.nature.com/articles/s41551-018-0304-0.pdf) 19 | 20 | ## Wednesday: Group Check-Ins 21 | 22 | ## Thursday: Project Work 23 | 24 | Optional Readings: 25 | - *Model Agnostic Supervised Local Explanations* by Plumb, G, Molitor, D, and Talwalkar, AS. NIPS 2018. [Available Online](http://papers.nips.cc/paper/7518-model-agnostic-supervised-local-explanations) 26 | - *A Unified Approach to Interpreting Model Predictions* by Lundberg, SM and Lee, S. NIPS 2017. [Available Online](http://papers.nips.cc/paper/7062-a-unified-approach-to-interpreting-model-predicti) 27 | - *Explainable AI for Trees* by Lundberg, SM, Erion, G, et al. arXiv preprint: arxiv/1905.04610. [Available Online](https://arxiv.org/pdf/1905.04610.pdf) 28 | -------------------------------------------------------------------------------- /old/12 - Algorithmic Bias and Fairness Pt 1/FALL 20 - ethics bias fairness - part 1.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/old/12 - Algorithmic Bias and Fairness Pt 1/FALL 20 - ethics bias fairness - part 1.pptx -------------------------------------------------------------------------------- /old/12 - Algorithmic Bias and Fairness Pt 1/README.md: -------------------------------------------------------------------------------- 1 | ## Project Work 2 | By this week, you should be finalizing your modeling results and 3 | beginning to look at bias and disparities in your models. 4 | 5 | **Due Monday, Nov 16**: Weekly project update. 6 | 7 | ## Tuesday: Algorithmic Bias and Fairness, Part I 8 | Just as important as assessing whether your model is making accurate 9 | predictions is determining whether it is doing so in a fair 10 | manner. But, what do we mean by fairness? How can you measure it and 11 | what can you do to mitigate any disparities you might find? Where in 12 | your pipeline can bias be introduced? (spoiler: everywhere). This week 13 | will provide a very brief introduction to the expansive field of 14 | algorithmic fairness. 15 | 16 | Required Readings for Tuesday: 17 | - *Bias and Fairness in ML* [Available Online](https://textbook.coleridgeinitiative.org/chap-bias.html) 18 | - *Fairness Definitions Explained* by Verma, S and Rubin, J. [Available Online](https://dl.acm.org/citation.cfm?doid=3194770.3194776) 19 | - *A Theory of Justice* by Rawls, J. 1971. Chapter 1: Justice as Fairness, pp. 1-19. [Available Online](https://blogs.baruch.cuny.edu/eng2100kmwd/files/2015/12/A-Theory-of-Justice-Excerpts.pdf) 20 | - *Racial Equity in Algorithmic Criminal Justice* by Huq, A. Duke Law Journal. 2018. [Available Online](https://heinonline.org/HOL/Page?handle=hein.journals/duklr68&id=1067&div=33&collection=journals) [Focus on sections: I.B.2, all of section II, III introduction, III.B, and III.D.3] 21 | 22 | ## Wednesday: Group Check-Ins 23 | 24 | ## Thursday: Project Work 25 | 26 | Optional Readings: 27 | - *Is Algorithmic Affirmative Action Legal?* by Bent, JR. Georgetown Law Journal. 2019. [Available Online](https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3372690) 28 | - *Does Mitigating ML’s Impact Disparity Require Treatment Disparity?* by Lipton, Z, McAuley, J, and Chouldechova, A. NIPS 2018. [Available Online](http://papers.nips.cc/paper/8035-does-mitigating-mls-impact-disparity-require-treatment-disparity) 29 | - *Equality of Opportunity* by Roemer, JE and Trannoy, A. 2013. 
[Available Online](http://cowles.yale.edu/sites/default/files/files/pub/d19/d1921.pdf) 30 | -------------------------------------------------------------------------------- /old/12 - Algorithmic Bias and Fairness Pt 1/ethics bias fairness - part 1.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/old/12 - Algorithmic Bias and Fairness Pt 1/ethics bias fairness - part 1.pptx -------------------------------------------------------------------------------- /old/13 - Algorithmic Bias and Fairness Pt 2/ChouldechovaFosterCare.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/old/13 - Algorithmic Bias and Fairness Pt 2/ChouldechovaFosterCare.pdf -------------------------------------------------------------------------------- /old/13 - Algorithmic Bias and Fairness Pt 2/FALL 20 - ethics bias fairness - part 2.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/old/13 - Algorithmic Bias and Fairness Pt 2/FALL 20 - ethics bias fairness - part 2.pptx -------------------------------------------------------------------------------- /old/13 - Algorithmic Bias and Fairness Pt 2/FALL 20 - other ML ethics issues.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/old/13 - Algorithmic Bias and Fairness Pt 2/FALL 20 - other ML ethics issues.pptx -------------------------------------------------------------------------------- /old/13 - Algorithmic Bias and Fairness Pt 2/README.md: -------------------------------------------------------------------------------- 1 | **Note:** No classes on Wednesday, Nov 25, or Thursday, Nov 26, for Thanksgiving. 2 | 3 | ## Project Work 4 | During this week, your group should be continuing to investigate any 5 | disparities in your model results as well as performing any other 6 | necessary post-modeling analyses. 7 | 8 | **Due Monday, Nov 23**: Weekly project update. 9 | 10 | ## Tuesday: Algorithmic Bias and Fairness, Part II 11 | This week, we’ll continue our discussion of bias and fairness with a 12 | very brief survey of practical considerations and open research 13 | questions in the rapidly-developing field. 14 | 15 | Link to [slides for this week](FALL%2020%20-%20ethics%20bias%20fairness%20-%20part%202.pptx). 16 | 17 | Although we won't have time to cover these in class, you may want to check out these [additional slides covering some other important topics in data & ML/AI ethics](FALL%2020%20-%20other%20ML%20ethics%20issues.pptx). 18 | 19 | Required Readings for Tuesday: 20 | - *A case study of algorithm-assisted decision making in child maltreatment hotline screening decisions* by Chouldechova, A, Putnam-Hornstein, E, et al. PMLR. 2018. [Available Online](http://proceedings.mlr.press/v81/chouldechova18a/chouldechova18a.pdf) 21 | - *Equality of Opportunity in Supervised Learning* by Hardt, M. and Price, E. NIPS 2016. [Available Online](https://papers.nips.cc/paper/2016/file/9d2682367c3935defcb1f9e247a97c0d-Paper.pdf) 22 | 23 | Optional Readings: 24 | - *Predictive Fairness to Reduce Misdemeanor Recidivism Through Social Service Interventions* by Rodolfa, K, Salomon, E, Haynes, L, et al. ACM FAT\* 2020. [Available Online](https://arxiv.org/abs/2001.09233) 25 | - *Classification with fairness constraints: A meta-algorithm with provable guarantees* by Celis, E, Huang, L, et al. FAT\* 2019. [Available Online](https://dl.acm.org/citation.cfm?doid=3287560.3287586) 26 | - *Fairness Through Awareness* by Dwork, C, Hardt, M, et al. ITCS 2012. [Available Online](https://dl.acm.org/citation.cfm?id=2090255) 27 | - *Fairness Constraints: Mechanisms for Fair Classification* by Zafar, M, 28 | Valera, I, et al. PMLR 2017. [Available Online](http://proceedings.mlr.press/v54/zafar17a.html) 29 | - *Fair Prediction with Disparate Impact: A Study of Bias in Recidivism Prediction Instruments* by Chouldechova, A. Big Data. 2017. [Available Online](https://www.liebertpub.com/doi/10.1089/big.2016.0047) 30 | -------------------------------------------------------------------------------- /old/13 - Algorithmic Bias and Fairness Pt 2/ethics bias fairness - part 2.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/old/13 - Algorithmic Bias and Fairness Pt 2/ethics bias fairness - part 2.pptx -------------------------------------------------------------------------------- /old/13 - Algorithmic Bias and Fairness Pt 2/ethics bias fairness - part 3.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/old/13 - Algorithmic Bias and Fairness Pt 2/ethics bias fairness - part 3.pptx -------------------------------------------------------------------------------- /old/14 - Causality and Field Validation/README.md: -------------------------------------------------------------------------------- 1 | ## Project Work 2 | By this week, you should be wrapping up any remaining 3 | technical work and beginning to put together results for your 4 | group’s final report and presentation. 5 | 6 | **Due Monday, Nov 30**: Weekly project update. 7 | 8 | ## Tuesday: Causality and Field Validation 9 | Even with careful planning and handling of the data, the only way to 10 | truly understand how well your model works is by testing it in the 11 | field. Generally, you’re concerned not only with its predictiveness, 12 | but the actual ability of the model to help the program achieve its 13 | policy goals, such as improving outcomes among the population it 14 | serves. Typically, this involves working closely with policy makers to 15 | develop a field trial using either randomization or non-experimental 16 | methods depending on the constraints of the setting. 17 | 18 | Required Readings for Tuesday: 19 | - *The seven tools of causal inference, with reflections on machine learning* by Pearl, J. Comm ACM. 2019. [Available Online](https://ftp.cs.ucla.edu/pub/stat_ser/r481.pdf) 20 | 21 | ## Wednesday: Group Check-Ins 22 | 23 | ## Thursday: Project Work 24 | 25 | Optional Readings: 26 | - *Elements of Causal Inference* by Peters et al. MIT Press. Chapters 1 and 2.
[Available Online (Open Access Link)](https://mitpress.mit.edu/books/elements-causal-inference) 27 | -------------------------------------------------------------------------------- /old/14 - Causality and Field Validation/causal inference.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/old/14 - Causality and Field Validation/causal inference.pdf -------------------------------------------------------------------------------- /old/14 - Causality and Field Validation/causal inference.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/old/14 - Causality and Field Validation/causal inference.pptx -------------------------------------------------------------------------------- /old/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /project/10718_assignment_1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "10718_assignment_1.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [], 9 | "include_colab_link": true 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | }, 15 | "language_info": { 16 | "name": "python" 17 | } 18 | }, 19 | "cells": [ 20 | { 21 | "cell_type": "markdown", 22 | "metadata": { 23 | "id": "view-in-github", 24 | "colab_type": "text" 25 | }, 26 | "source": [ 27 | "\"Open" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "source": [ 33 | "# 10718 Assignment 1: Getting to know the class project\n", 34 | "\n", 35 | "## Overview\n", 36 | "\n", 37 | "The purpose of this exercise is to provide a quick introduction to the data and modeling problem we'll be using as an illustrative example throughout Machine Learning in Practice (10-718). Note that this colab notebook **only uses a small sample of the data**, but we'll be working with the full dataset as you get into your group projects for the rest of the semester. \n", 38 | "\n", 39 | "**Completing this assignment should only take you 2-3 hours.**\n", 40 | "\n", 41 | "## Problem Background\n", 42 | "\n", 43 | "This notebook makes use of a sample of the data provided by [DonorsChoose](https://www.donorschoose.org/) to the [2014 KDD Cup](https://www.kaggle.com/c/kdd-cup-2014-predicting-excitement-at-donors-choose/data). Public schools in the United States face large disparities in funding, often resulting in teachers and staff members filling these gaps by purchasing classroom supplies out of their own pockets. DonorsChoose is an online crowdfunding platform that tries to help alleviate this financial burden on teachers by allowing them to seek funding for projects and resources from the community (projects can include classroom basics like books and markers, larger items like lab equipment or musical instruments, specific experiences like field trips or guest speakers). \n", 44 | "\n", 45 | "Projects on DonorsChoose expire after 4 months, and if the target funding level isn't reached, the project receives no funding. 
Since its launch in 2000, the platform has helped fund over 2 million projects at schools across the US, but about 1/3 of the projects that are posted nevertheless fail to meet their goal and go unfunded.\n", 46 | "\n", 47 | "### The Modeling Problem\n", 48 | "\n", 49 | "For the purposes of this exercise, let's assume that DonorsChoose has hired a digital content expert who will review projects and help teachers improve their postings and increase their chances of reaching their funding threshold. Because this individualized review is a labor-intensive process, the digital content expert has **time to review and support only 10% of the projects posted to the platform on a given day**. \n", 50 | "\n", 51 | "You are a data scientist working with DonorsChoose, and your task is to help this content expert focus their limited resources on projects that most need the help. As such, you want to build a model to **identify projects that are least likely to be fully funded before they expire** and pass them off to the digital content expert for review.\n" 52 | ], 53 | "metadata": { 54 | "id": "Zw1i3ELeKDLD" 55 | } 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "source": [ 60 | "# Getting Set Up\n", 61 | "\n", 62 | "Running the code below will create a local postgres 11 database for you and import the sampled DonorsChoose data. Don't worry about the details here; you shouldn't need to touch any of this code aside from running it. Below, we'll talk about how to access the database from within the notebook to run queries." 63 | ], 64 | "metadata": { 65 | "id": "BjRBddip6lPI" 66 | } 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": { 72 | "id": "iY8dwqamIIQc" 73 | }, 74 | "outputs": [], 75 | "source": [ 76 | "# Install and start postgresql-11 server\n", 77 | "!sudo apt-get -y -qq update\n", 78 | "!wget --quiet -O - https://www.postgresql.org/media/keys/ACCC4CF8.asc | sudo apt-key add -\n", 79 | "!echo \"deb http://apt.postgresql.org/pub/repos/apt/ `lsb_release -cs`-pgdg main\" |sudo tee /etc/apt/sources.list.d/pgdg.list\n", 80 | "!sudo apt-get -y -qq update\n", 81 | "!sudo apt-get -y -qq install postgresql-11 postgresql-client-11\n", 82 | "!sudo service postgresql start\n", 83 | "\n", 84 | "# Setup a password `postgres` for username `postgres`\n", 85 | "!sudo -u postgres psql -U postgres -c \"ALTER USER postgres PASSWORD 'postgres';\"\n", 86 | "\n", 87 | "# Setup a database with name `donors_choose` to be used\n", 88 | "!sudo -u postgres psql -U postgres -c 'DROP DATABASE IF EXISTS donors_choose;'\n", 89 | "\n", 90 | "!sudo -u postgres psql -U postgres -c 'CREATE DATABASE donors_choose;'\n", 91 | "\n", 92 | "# Environment variables for connecting to the database\n", 93 | "%env DEMO_DATABASE_NAME=donors_choose\n", 94 | "%env DEMO_DATABASE_HOST=localhost\n", 95 | "%env DEMO_DATABASE_PORT=5432\n", 96 | "%env DEMO_DATABASE_USER=postgres\n", 97 | "%env DEMO_DATABASE_PASS=postgres" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "source": [ 103 | "# Download sampled DonorsChoose data and load it into our postgres server\n", 104 | "!curl -s -OL https://dsapp-public-data-migrated.s3.us-west-2.amazonaws.com/donors_sampled_20210920_v3.dmp\n", 105 | "!PGPASSWORD=$DEMO_DATABASE_PASS pg_restore -h $DEMO_DATABASE_HOST -p $DEMO_DATABASE_PORT -d $DEMO_DATABASE_NAME -U $DEMO_DATABASE_USER -O -j 8 donors_sampled_20210920_v3.dmp" 106 | ], 107 | "metadata": { 108 | "id": "ImdiugfVIXcq" 109 | }, 110 | "execution_count": null, 111 | "outputs": [] 112 | }, 113 | {
114 | "cell_type": "code", 115 | "source": [ 116 | "!pip install SQLAlchemy==1.3.18 PyYAML==6.0 psycopg2-binary==2.9.3" 117 | ], 118 | "metadata": { 119 | "id": "Uj114AFLIpug" 120 | }, 121 | "execution_count": null, 122 | "outputs": [] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "source": [ 127 | "import pandas as pd\n", 128 | "pd.set_option('display.max_columns', None)" 129 | ], 130 | "metadata": { 131 | "id": "ZIZEHiMpANsU" 132 | }, 133 | "execution_count": null, 134 | "outputs": [] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "source": [ 139 | "from sqlalchemy.engine.url import URL\n", 140 | "from sqlalchemy import create_engine\n", 141 | "\n", 142 | "db_url = URL(\n", 143 | " 'postgres',\n", 144 | " host='localhost',\n", 145 | " username='postgres',\n", 146 | " database='donors_choose',\n", 147 | " password='postgres',\n", 148 | " port=5432,\n", 149 | " )\n", 150 | "\n", 151 | "db_engine = create_engine(db_url)" 152 | ], 153 | "metadata": { 154 | "id": "JRYwIn-UJI6t" 155 | }, 156 | "execution_count": null, 157 | "outputs": [] 158 | }, 159 | { 160 | "cell_type": "markdown", 161 | "source": [ 162 | "# Querying the Database\n", 163 | "\n", 164 | "The code block above used the `sqlalchemy` module to create a connection to the database called `db_engine`. An easy way to run SQL queries against this database is to use the `read_sql` command provided by `pandas`. For instance, if you run the example below, it should return the number of projects in the sampled dataset (16,480):" 165 | ], 166 | "metadata": { 167 | "id": "9t7vS9VfKJm_" 168 | } 169 | }, 170 | { 171 | "cell_type": "code", 172 | "source": [ 173 | "pd.read_sql(\"SELECT COUNT(*) FROM data.projects\", db_engine)" 174 | ], 175 | "metadata": { 176 | "id": "fEpuSoSdJUN2" 177 | }, 178 | "execution_count": null, 179 | "outputs": [] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "source": [ 184 | "You can find some more details about the dataset on the [KDD Cup page](https://www.kaggle.com/c/kdd-cup-2014-predicting-excitement-at-donors-choose/data), but here is a quick description of the four main source tables:\n", 185 | "- `data.projects` contains information about each project that was posted on the site, including IDs for the project, school, and teacher, as well as the total amount being requested (note that projects can also request additional \"optional support\" but don't need to reach this higher bar to be funded)\n", 186 | "- `data.essays` has project titles and descriptions\n", 187 | "- `data.resources` has information about the specific resources being requested\n", 188 | "- `data.donations` contains details about each donation that was received by a project (when it came in, the amount, whether it was from another teacher, etc.)\n", 189 | "\n", 190 | "** Note that if you prefer to work in python to explore and manipulate the data, you can download a full table (for example, `data.projects`) into a pandas dataframe via**:\n", 191 | "```\n", 192 | "projects_df = pd.read_sql(\"SELECT * FROM data.projects\", db_engine)\n", 193 | "```" 194 | ], 195 | "metadata": { 196 | "id": "UECOSNF-8pTs" 197 | } 198 | }, 199 | { 200 | "cell_type": "markdown", 201 | "source": [ 202 | "## Want other packages?\n", 203 | "If you need to install any other python modules for your analysis, you can easily do so from a code block by prefixing your `pip install` command with an `!` character. 
For instance:\n", 204 | "```\n", 205 | "!pip install PyYAML\n", 206 | "```" 207 | ], 208 | "metadata": { 209 | "id": "7bLEVeYa8IGY" 210 | } 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "source": [ 215 | "# QUESTION 1\n", 216 | "\n", 217 | "**(A)** Create a scatter plot of the number of resources requested vs the fraction of the total ask amount that was funded across all projects in New Hampshire. (Hint: You'll need to join a couple of tables here to figure out the amount donated to a project)" 218 | ], 219 | "metadata": { 220 | "id": "IIRe2r2tKNJI" 221 | } 222 | }, 223 | { 224 | "cell_type": "code", 225 | "source": [], 226 | "metadata": { 227 | "id": "108ogvgOJrpF" 228 | }, 229 | "execution_count": null, 230 | "outputs": [] 231 | }, 232 | { 233 | "cell_type": "markdown", 234 | "source": [ 235 | "**(B)** Spend a little time exploring the data, with a focus on the outcome of whether or not a project is fully funded after 4 months as well as features you think might be predictive of this outcome." 236 | ], 237 | "metadata": { 238 | "id": "OozrlP8dN7zG" 239 | } 240 | }, 241 | { 242 | "cell_type": "code", 243 | "source": [], 244 | "metadata": { 245 | "id": "n4LFpsrLMePc" 246 | }, 247 | "execution_count": null, 248 | "outputs": [] 249 | }, 250 | { 251 | "cell_type": "markdown", 252 | "source": [ 253 | "# QUESTION 2\n", 254 | "\n", 255 | "For this question, you'll develop a model to help DonorsChoose **identify 10% of projects with the highest risk of failing to meet their funding goal** for their digital content expert to review and provide suggestions on. In order to intervene early in the process, DonorsChoose wants to identify these projects to help **immediately upon being posted** to the site.\n", 256 | "\n", 257 | "Build and evaluate the performance of several machine learning models for this task. **Be sure to use comments or text blocks to discuss the choices and assumptions you're making along the way.** Feel free to use any python packages available (such as sklearn) for this." 258 | ], 259 | "metadata": { 260 | "id": "6y7NI6XAS-96" 261 | } 262 | }, 263 | { 264 | "cell_type": "code", 265 | "source": [], 266 | "metadata": { 267 | "id": "sJDDm0HjpcXy" 268 | }, 269 | "execution_count": null, 270 | "outputs": [] 271 | }, 272 | { 273 | "cell_type": "markdown", 274 | "source": [ 275 | "# QUESTION 3\n", 276 | "\n", 277 | "Briefly discuss (in at most 1-2 paragraphs) the results of your analysis above: Which model would you select to deploy and why? Are there future analyses or improvements you would suggest? Are there other data sources you would ideally like to be able to incorporate into your models?" 278 | ], 279 | "metadata": { 280 | "id": "CSOIZcMFESpn" 281 | } 282 | }, 283 | { 284 | "cell_type": "markdown", 285 | "source": [], 286 | "metadata": { 287 | "id": "VIedu1vjE8PR" 288 | } 289 | }, 290 | { 291 | "cell_type": "markdown", 292 | "source": [ 293 | "# Submission\n", 294 | "\n", 295 | "To submit your exercise, please **save a copy** of this notebook containing your code and outputs (you can save it either to google drive or github, but make sure the course staff will have read permissions to access it).\n", 296 | "\n", 297 | "Provide a link to your copy of the notebook when you submit the assignment in canvas.\n", 298 | "\n", 299 | "Thank you and we're looking forward to a great semester in 10718!"
300 | ], 301 | "metadata": { 302 | "id": "XP2k3Z--DNju" 303 | } 304 | } 305 | ] 306 | } -------------------------------------------------------------------------------- /project/final_project_presentation.md: -------------------------------------------------------------------------------- 1 | # Final Project Presentation 2 | 3 | **NOTE: Presentations will take place during class on December 8 and 10 (we'll decide which group for each day when it gets closer). Please upload your slides on canvas before the presentation.** 4 | 5 | Each team will have 8 minutes for their presentation. We will need to be strict on the timing to make sure everyone has enough time to present. 6 | 7 | ## Timing 8 | Practice your timing beforehand and have a plan for the last 30 seconds. If you're out of time, what do you want to say in the last 30 seconds? 9 | 10 | ## Content 11 | The presentation should be clear, well-organized, and **at an appropriate level of depth for the decision-makers relevant to your project (as opposed to ML experts)** 12 | 13 | ## Suggested Structure 14 | 15 | 1. What problem are you solving and why is it important? Be specific about goals, potential policy impact, and efficiency/equity/effectiveness trade-offs (1 minute) 16 | 1. What data did you use? (1 minute) 17 | 1. Machine Learning Formulation, Analysis, and Evaluation - described in a way that makes sense to decision-makers (4 minutes) 18 | - formulation of the problem 19 | - what are your rows, what are your labels, what are your features 20 | - how did you validate - training/validation splits, evaluation metrics, sensible baseline 21 | - results – performance, important features, and bias audit 22 | 1. Caveats: based on the limitations of your data or analysis (1 minute) 23 | 1. Policy Recommendations: concrete recommendations based on your analysis (1 minute) 24 | 1. Future Work (< 1 minute) 25 | 26 | # Evaluation Criteria 27 | 28 | - Goals/Context: 29 | - The project has clear and actionable policy goals. 30 | - The use of this project in a policy setting is well described. 31 | - The project is well motivated and achieves the policy goals described. 32 | - Thoughtful consideration of balancing equity, efficiency, and effectiveness, as well as other potential ethical issues and mitigation strategies. 33 | - Data: 34 | - The data description and data exploration shows that the data used is relevant and sufficient for the problem. 35 | - Analysis: The analysis is done correctly and is evaluated appropriately 36 | - The machine learning models used are appropriate for the task and well-justified. All of the methods appropriate for the task and covered in class should be used. 37 | - The evaluation methodology is appropriate for the task and matches the operational use of this analysis/models. 38 | - Training and validation set (and process) is well described. 39 | - The correct metrics are being optimized for, and optimizing for those metrics achieves the policy goals described. 40 | - Results: 41 | - Evaluation results are described for every train/validate set, metric, and models used 42 | - Performance is compared against a sensible baseline that reflects what a decision-maker might do in the absence of a machine learning model. 43 | - The selection of the final model recommended for use is well described 44 | - The model interpretation is done well. 45 | - Models are audited for bias and fairness (motivated by the correct bias and fairness metrics and groups of interest) and results provided.
46 | - Policy Recommendations: 47 | - Concrete and actionable policy recommendations are provided based on the results of the analysis 48 | - Caveats: 49 | - Caveats of the project and recommendations are provided to a policy audience based on the limitations of the data and/or the analysis. 50 | - Future recommendations on how to improve the analysis are provided 51 | 52 | -------------------------------------------------------------------------------- /project/final_project_report.md: -------------------------------------------------------------------------------- 1 | # Final Project Report 2 | 3 | **Due: December 17, 11:59pm EST** 4 | 5 | The final project report should be approximately 10-15 pages in length (excluding appendix and references) and cover the following topics: 6 | 7 | 1. Executive Summary: Succinctly describe the project, results, and recommendations. The executive summary should not exceed 1 page in length. 8 | 1. Background and Introduction: This section motivates the problem, explains why it's important, why we should care, and the potential impact if it's solved. 9 | 1. Related work: What's been done before in this area (both using ML and without) and why your work is different or better. 10 | 1. Problem formulation and Overview of your solution 11 | 1. Data Description, including briefly highlighting any data exploration that informed important formulation/modeling choices. 12 | 1. Details of your solution: methods, tools, analysis you did, model types and hyperparameters used, features. This section of the report should also include a link to well-documented code in your group’s course github repository. 13 | 1. Evaluation: results, plots (for example, precision-recall-at-k curves and other types of results), important features, and bias audit of the models you built. 14 | 1. Discussion of the results: what did you learn from looking at the results about the data, problem, and solution. 15 | 1. Brief (1-2 paragraph) design of a field trial to evaluate the accuracy of the model you built and selected in practice, as well as its ability to help the organization achieve its goals. It's not enough to say we'll do an A/B test or a randomized trial. 16 | 1. Policy Recommendations based on your analysis/models 17 | 1. Limitations, caveats, future work to improve on what you've done. 18 | 1. Optionally, you may also wish to include a proposal for future avenues of research beyond the scope of this work, for instance on novel machine learning methods to improve on the current work, new policy interventions to evaluate or explore, or other related research opportunities. 19 | 20 | ## Appendix 21 | Please include the following details in an appendix so we can better evaluate the work you've done and not just evaluate the outputs: 22 | 23 | - Exact definition of label: 1) how did you decide from the database what was a positive example and negative example. 2) over what time period it was defined 24 | - List of *all* features generated 25 | - Model grid used: models and hyper-parameters.
You can give the grid or a list of all model-hyperparameter combinations 26 | - List of train/validation sets (table with the dates) 27 | - The temporal graph of your primary evaluation metric (precision at k) for each validation set for all the models in the grid (line color by model type) - it's the slide from Update 5 28 | - Criteria used to select top models (mean precision at k for example) 29 | - For those top 5 models + smart baseline, please provide: 30 | - What they are 31 | - PR_k graphs of the top models and the smart baseline 32 | - List of feature importances for *all* features 33 | - Cross-tabs for ~10 most different features 34 | - Bias metrics that are relevant to your problem scope 35 | 36 | ## Github Repo 37 | The Github repo should be documented in two ways: 38 | 39 | 1. Instructions on the structure of the repository, what files are there, and how one should run your code (installing any packages for example) 40 | 1. The relevant code files should be documented. 41 | 42 | 43 | # Evaluation Criteria 44 | 45 | ## Final Project Report 46 | 47 | - Goals/Context: 48 | - The project has clear and actionable policy goals. 49 | - The use of this project in a policy setting is well described. 50 | - The project is well motivated and achieves the policy goals described. 51 | - Thoughtful consideration of balancing equity, efficiency, and effectiveness, as well as other potential ethical issues and mitigation strategies. 52 | - Previous work in this area is described and covered well. 53 | - Data: 54 | - The data used is relevant for the problem, over a long enough period to solve this problem 55 | - Data exploration is described well 56 | - Analysis: The analysis is done correctly and is evaluated appropriately 57 | - The machine learning models used are appropriate for the task and well-justified. All of the methods appropriate for the task and covered in class should be used. 58 | - The evaluation methodology is appropriate for the task and matches the operational use of this analysis/models. 59 | - Each training and validation set (and the generation process) is well described. 60 | - The correct metrics are being optimized for, and optimizing for those metrics achieves the policy goals described. 61 | - Results: 62 | - Evaluation results are described in detail for every train/validate set, metric, and models used 63 | - Performance is compared against a sensible baseline that reflects what a decision maker might do in the absence of a machine learning model. 64 | - The selection of the final model recommended for use is well described 65 | - The model interpretation is done well. 66 | - Models are audited for bias and fairness (motivated by the correct bias and fairness metrics and groups of interest) and results provided. 67 | - Policy Recommendations and Field Trial Design: 68 | - Suggested field trial design is appropriate to assess both the performance of the model and impact of program outcomes, as well as accounting for potential nuances of feasibility or ethical constraints (e.g., withholding services, etc.) 69 | - Concrete and actionable policy recommendations are provided based on the results of the analysis 70 | - Caveats: 71 | - Caveats of the project and recommendations are provided to a policy audience based on the limitations of the data and/or the analysis.
72 | - Future recommendations on how to improve the analysis are provided 73 | - Appendix: 74 | - The additional information in the appendix is correct 75 | - The additional information in the appendix supports and justifies the results provided in the report 76 | 77 | ## Code and Repo 78 | 79 | - The repository is well-structured and well-documented. 80 | - Usage and installation instructions are clear. 81 | - Code is well-organized and documented. 82 | - Code is reproducible, extensible, and modular. 83 | -------------------------------------------------------------------------------- /project/proposal.md: -------------------------------------------------------------------------------- 1 | ## Project Proposal 2 | The proposal should be 4-5 pages (pdf) and should contain: 3 | 4 | - Background and Goals 5 | - what is the problem you're solving? 6 | - why is it important? 7 | - what impact will your solution have? 8 | - who cares about this problem? 9 | - who will take action based on your work? 10 | - what are the policy goals you care about (efficiency, equity, effectiveness,…)? How will you decide on tradeoffs across these goals? 11 | - How this problem is solved today/previously 12 | - What interventions exist/will exist 13 | - What data do you have and what additional data will you need? 14 | - **Important: You should do data exploration and provide descriptive stats to show that you have enough relevant data to solve this problem** 15 | - What analysis are you proposing to do? 16 | - What is the ML problem? What are some possible outcome variables (labels) that you might use? 17 | - How will you validate it in the class project? What metrics will you use? Why will those metrics achieve the goal you described above? 18 | - What additional validation will need to be done later? 19 | - What are some ethical considerations here around privacy, equity, transparency, and accountability? How do you plan on dealing with them? 20 | - Caveats (due to data limitations, analysis limitations, time limitations, etc.) 21 | - Policy recommendations: what kind of recommendations do you hope to give to policymakers based on this analysis/project. How will you validate whether what you are proposing will have the desired impact?
22 | 23 | -------------------------------------------------------------------------------- /project/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /riyaz_panjwani.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/riyaz_panjwani.jpeg -------------------------------------------------------------------------------- /ryan.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/ryan.jpeg -------------------------------------------------------------------------------- /scripts/README.md: -------------------------------------------------------------------------------- 1 | # Some useful scripts 2 | 3 | [connect to vpn script](vpn-to-cmu.sh): If you're on a mac and want to connect to vpn (lazily) through the command line 4 | -------------------------------------------------------------------------------- /scripts/vpn-to-cmu.sh: -------------------------------------------------------------------------------- 1 | /opt/cisco/anyconnect/bin/vpn -s connect vpn.cmu.edu << "EOF" 2 | 1 3 | YOUR_ANDREW_ID 4 | YOUR_ANDREW_ID_PASSWORD 5 | EOF -------------------------------------------------------------------------------- /techhelp/README.md: -------------------------------------------------------------------------------- 1 | # Tech Setup 2 | 3 | 1. Make sure you are on cmu vpn (Full VPN group) 4 | 2. Connect to class server: mlpolicylab.dssg.io (command line/terminal/putty): type `ssh your_andrew_id@mlpolicylab.dssg.io` 5 | 3. Connect to database server: mlpolicylab.db.dssg.io. If you're on the server, type `psql -h mlpolicylab.db.dssg.io -U YOUR_ANDREW_ID group_students_database` 6 | 4. Set up dbeaver (a visual IDE for the database) [instructions are here](https://github.com/dssg/mlforpublicpolicylab/raw/master/techhelp/dbeaver_instructions.pdf) 7 | 8 | **Detailed instructions** are in [slack post](https://mlppfall2020.slack.com/files/T01A8J5N01E/F01A4RF49U4?origin_team=T01A8J5N01E&origin_channel=C019AFXU9NK) 9 | 10 | **[Slides from week 1 tech session](https://docs.google.com/presentation/d/1MEP-UF9dHuEfoIWWKIphFAYI23miVHCovLJgndPLgvE/edit#slide=id)** 11 | 12 | ## ssh 13 | `ssh your_andrew_id@mlpolicylab.dssg.io` 14 | 15 | ssh is what you'll use to connect to the class server, which is where you will do all the work. You will need to give us your ssh key, using the instructions we sent, and then you'll be good to go. Based on which operating system you're using, you can google for which tool is the best (command line, terminal, putty, etc.) 16 | 17 | ## Linux Command Line (Bash) 18 | If you're not too familiar with working at the command line, we have a quick overview and intro [here](https://dssg.github.io/hitchhikers-guide/curriculum/setup/command-line-tools/) 19 | 20 | A couple of quick pointers that might be helpful: 21 | - One of the most useful linux utilities is `screen`, which allows you to create sessions that persist even when you disconnect from ssh. This can be handy for things like long-running jobs, notebook servers, or even just to guard against your internet connection dropping and losing your work.
Here's a quick [video intro](https://www.youtube.com/watch?v=3txYaF_IVZQ) with the basics and a more [in-depth tutorial](https://linuxize.com/post/how-to-use-linux-screen/) (note that screen is already installed, so you can ignore those details). 22 | - Everyone is sharing the resources of the course server and it can be a good idea to keep an eye on memory and processor usage (both to know if you're hogging resources with your processes and understand how the load looks before starting a job). A good way to do so is with the utility [htop](https://www.deonsworld.co.za/2012/12/20/understanding-and-using-htop-monitor-system-resources/), which provides a visual representation of this information (to open htop just type `htop` at the command prompt and to exit, you can simply hit the `q` key) 23 | - Each group should have their own folder on the server, in `/data/groups/{group name}`. For example, `/data/groups/bills1` 24 | - We've set up a shared python virtual environment for each group. This will automatically activate when you navigate to `/data/groups/{group_name}`. Or, manually activate it with `source /data/groups/{group_name}/dssg_env/bin/activate`. 25 | - When you first navigate to `/data/groups/{group_name}` you'll get a message prompting you to run `direnv allow`. Run this command to allow the automatic virtual environment switching. 26 | 27 | ## github 28 | We'll use github to collaborate on the code all semester. You will have a project repository based on your project assignment. 29 | 30 | #### common (extremely simple) workflow 31 | 32 | - When you start working: 33 | - The first time, clone an existing repo: `git clone` 34 | - Every time, get changes since last time: `git pull` 35 | - Add new files: `git add` or make changes to existing files 36 | - Make a local checkpoint: `git commit` 37 | - Push to the remote repository: `git push` 38 | 39 | A [more advanced cheatsheet](https://gist.github.com/jedmao/5053440). Other useful tutorials are [here](https://dssg.github.io/hitchhikers-guide/curriculum/setup/git-and-github/basic_git_tutorial/) 40 | 41 | ## PostgreSQL 42 | If you're not too familiar with SQL or would like a quick review, we have an overview and intro [here](https://dssg.github.io/hitchhikers-guide/curriculum/software/basic_sql/). 43 | 44 | Additionally, check out these [notes and tips about using the course database](class_db_pointers.md). 45 | 46 | ## psql 47 | psql is a command line tool to connect to the postgresql database server we're using for class. You will need to be on the server through ssh first and then type `psql -h mlpolicylab.db.dssg.io -U YOUR_ANDREW_ID databasename` where `databasename` is the database for your project that you will receive after your project assignment.
To test it you can use `psql -h mlpolicylab.db.dssg.io -U YOUR_ANDREW_ID group_students_database` - make sure to change `YOUR_ANDREW_ID`. 48 | 49 | A couple quick usage pointers: 50 | - `\dn` will list the schemas in the database you're connected to 51 | - `\dt {schema_name}.*` will list the tables in schema `{schema_name}` 52 | - `\d {schema_name}.{table_name}` will list the columns of table `{schema_name}.{table_name}` 53 | - `\x` can be used to enter "extended display mode" to view results in a tall, key-value format 54 | - For cleaner display of wide tables, you can launch `psql` using: `PAGER='less -S' psql -h mlpolicylab.db.dssg.io -U YOUR_ANDREW_ID databasename` (then use the left and right arrows to navigate columns of wide results) 55 | - `\?` will show help about psql meta-commands 56 | - `\q` will exit 57 | 58 | ## dbeaver 59 | dbeaver is a free tool that gives you a slightly nicer and visual interface to the database. [Instructions for installing and setting up are here](https://github.com/dssg/mlforpublicpolicylab/raw/master/techhelp/dbeaver_instructions.pdf) 60 | 61 | 62 | ## Connecting to the database from python 63 | The `sqlalchemy` module provides an interface to connect to a postgres database from python (you'll also need to install `psycopg2` in order to talk to postgres specifically). You can install it in your virtualenv with: 64 | ``` 65 | pip install psycopg2-binary sqlalchemy 66 | ``` 67 | (Note that `psycopg2-binary` comes packaged with its dependencies, so you should install it rather than the base `psycopg2` module) 68 | 69 | A simple usage pattern might look like: 70 | ```python 71 | from sqlalchemy import create_engine 72 | 73 | # read parameters from a secrets file, don't hard-code them! 74 | db_params = get_secrets('db') 75 | engine = create_engine('postgres://{user}:{password}@{host}:{port}/{dbname}'.format( 76 | host=db_params['host'], 77 | port=db_params['port'], 78 | dbname=db_params['dbname'], 79 | user=db_params['user'], 80 | password=db_params['password'] 81 | )) 82 | result_set = engine.execute("SELECT * FROM your_table LIMIT 100;") 83 | for record in result_set: 84 | process_record(record) 85 | 86 | # Close communication with the database 87 | engine.dispose() 88 | ``` 89 | 90 | If you're changing data in the database, note that you may need to use `engine.execute("COMMIT")` to ensure that changes persist. 91 | 92 | Note that the engine object can also be used with other utilities that interact with the database, such as ohio or pandas (though the latter can be very inefficient/slow) 93 | 94 | **For a more detailed walk-through of using python and postgresql together, check out the [Python+SQL tech session notebook](python_sql_tech_session.ipynb)** 95 | 96 | ## Jupyter Notebooks 97 | Although not a good environment for running your ML pipeline and models, jupyter notebooks can be useful for exploratory data analysis as well as visualizing modeling results. Since the data needs to stay in the AWS environment, you'll need to do so by running a notebook server on the remote machine and creating an SSH tunnel (because the course server can only be accessed via the SSH protocol) so you can access it via your local browser. 98 | 99 | One important note: **be sure to explicitly shut down the kernels when you're done working with a notebook** as "zombie" notebook sessions can end up using up a lot of resources!
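A quick way to track down sessions you may have forgotten about is jupyter's own server listing (a minimal sketch; note that `jupyter notebook stop` assumes a reasonably recent notebook release):

```bash
# list any notebook servers still running under your account,
# along with their ports and tokens
jupyter notebook list

# shut down the server listening on a given port (e.g., 8888)
jupyter notebook stop 8888
```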
You can find some details about using jupyter with the class server [here](jupyter_setup.md) 102 | 103 | ## Handling Secrets 104 | You'll need access to various secrets (such as database credentials) in your code, but keeping these secrets out of the code itself is an important part of keeping your infrastructure and data secure. You can find a few tips about different ways to do so [here](handling_secrets.md) 105 | 106 | 107 | 108 | -------------------------------------------------------------------------------- /techhelp/class_db_pointers.md: -------------------------------------------------------------------------------- 1 | # Some pointers for using the course database 2 | 3 | Each group has their own database, named `{group_name}_database`. For example, team bills1 has `bills1_database`. Log in the same way you log into group_students_database: 4 | ```bash 5 | psql -h mlpolicylab.db.dssg.io -U {andrewid} -d {group_name}_database 6 | ``` 7 | Or, if using DBeaver, simply update the Database field to your group's database name. 8 | 9 | ## Access and permissions 10 | Within your group database, you'll find several schemas (depending on your particular project). Most of these schemas are read-only in order to avoid accidentally modifying or overwriting the raw data for the project, but you should be able to write to the `sketch` schema as well as create new schemas to help organize your project work. You can run the following query to get more information on permissions: 11 | ```sql 12 | SELECT * 13 | FROM information_schema.role_table_grants; 14 | ``` 15 | 16 | ### Creating new schemas 17 | When you create a new schema, you'll want to be sure to grant permissions to everyone in your group, which can be done by granting privileges to your group name, for instance: 18 | ```sql 19 | CREATE SCHEMA my_new_schema; 20 | GRANT ALL ON SCHEMA my_new_schema TO {group_name}; 21 | ``` 22 | (replacing `{group_name}` with your group name, such as `bills1`) 23 | 24 | ### Creating new tables 25 | Likewise, when you create a new table, you'll want to grant permissions to everyone in your group: 26 | ```sql 27 | CREATE TABLE my_schema.my_new_table ( 28 | some_column_name INT, 29 | some_other_column VARCHAR 30 | ); 31 | GRANT ALL ON my_schema.my_new_table TO {group_name}; 32 | ``` 33 | (replacing `{group_name}` with your group name, such as `bills1`) 34 | 35 | ## Query Performance 36 | Most of these projects use moderately large data. While postgres can work with this type of structured data very efficiently if your queries and tables are properly optimized, if they aren't, some queries can be painfully slow. A few pointers: 37 | - Especially when creating relatively large tables, using [appropriate indices](https://www.postgresqltutorial.com/postgresql-indexes/postgresql-create-index/) will vastly improve accessing data and joining to the table 38 | - For large, complex queries, subqueries are typically less performant than [CTEs](http://www.craigkerstiens.com/2013/11/18/best-postgres-feature-youre-not-using/) or building up pieces with temporary tables (which, in turn, can be indexed as well) 39 | - Be sure you're making use of the relational nature of the database; often, if you find yourself doing a large number of small queries in a loop to do the same thing to different slices of the data, you could likely optimize by reworking this into a single query that works on everything at once.
- Pandas is very, very bad at moving large amounts of data into databases from python -- take a look at [Ohio](https://github.com/dssg/ohio) for a more efficient option. 41 | 42 | ### Killing hung or run-away queries 43 | If you think one of your queries has hung (or is taking far longer or too many resources than it should), you can run the following query to confirm that it is still running: 44 | ```sql 45 | SELECT * FROM pg_stat_activity; 46 | ``` 47 | If you need to kill your query, you can note down the PID from that result and then use: 48 | ```sql 49 | SELECT pg_cancel_backend({PID}); 50 | ``` 51 | To kill it (it's a good idea to check `pg_stat_activity` again to ensure it's been killed). Sometimes that may not work, and you need to use the more aggressive: 52 | ```sql 53 | SELECT pg_terminate_backend({PID}); 54 | ``` 55 | 56 | ### Remember to close your database connections 57 | It's always a good practice to close out your database connections explicitly, both for database software (such as dbeaver) as well as `psycopg2` connections from python (e.g., make sure you run `cursor.close()` as well as `connection.close()` after running all your queries). 58 | -------------------------------------------------------------------------------- /techhelp/dbeaver_instructions.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/techhelp/dbeaver_instructions.pdf -------------------------------------------------------------------------------- /techhelp/handling_secrets.md: -------------------------------------------------------------------------------- 1 | # Some Tips for Handling Secrets 2 | Keeping secrets (such as database passwords, API credentials, etc) out of your code is important to ensure the security of your systems and data. While there are many approaches to doing so, two simple options are making use of environment variables and using secret config files. 3 | 4 | ## Option 1: Environment Variables 5 | Environment variables you set at the bash command line are available to your code running in that environment and a good option for keeping secrets out of your code itself. You can set environment variables at the command line by assigning them with an `=` sign (avoid any spaces around the `=`, and use `export` so the variable is visible to programs you run) and check their value using `echo` and placing a `$` before the variable name: 6 | 7 | ```bash 8 | you@server:~$ export FOO="HELLO WORLD" 9 | you@server:~$ echo $FOO 10 | HELLO WORLD 11 | ``` 12 | 13 | In python, you can access these using the built-in `os` module, for instance if you had your database password stored in the `PGPASSWORD` environment variable: 14 | 15 | ```python 16 | import os 17 | 18 | db_pass = os.getenv('PGPASSWORD') 19 | ``` 20 | 21 | If you don't want to set the environment variables by hand every time you start a new terminal session, you could also store them in a shell script that would load them up when run, for instance, you might have a file called `environment.sh` with contents: 22 | 23 | ```bash 24 | export FOO="HELLO WORLD" 25 | export BAR="BAZ" 26 | ``` 27 | 28 | Importantly, you'll need to **restrict the access to this file**: store it somewhere only you can access (e.g., your home directory), avoid committing it to a git repository, and change the permissions so only you can view it using `chmod 600 {filename}`.
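For example, locking down `environment.sh` and double-checking the result might look like this (a quick sketch; the `-rw-------` permission string is what you want to see):

```bash
# make the file readable/writable by your user only
chmod 600 environment.sh

# verify: the permissions column should read -rw-------
ls -l environment.sh
```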
29 | 30 | Once you've created that file, any time you want to load the environment variables, you can simply run its contents as a shell script using `source`. For instance, if the file was named `environment.sh`: 31 | 32 | ```bash 33 | you@server:~$ source environment.sh 34 | ``` 35 | 36 | ## Option 2: Secrets Config File 37 | 38 | A second option involves storing your secrets in a config file that can be read by your programs (any number of formats will work: YAML, JSON, even plain text). For instance, you might create a file called `secrets.yaml` with contents such as: 39 | 40 | ```yaml 41 | db: 42 | host: mlpolicylab.db.dssg.io 43 | port: 5432 44 | dbname: group_students_database 45 | user: andrewid 46 | password: 12345 47 | web_resource: 48 | api_key: 23b53ca9845f70424ad08f958c94b275 49 | ``` 50 | 51 | Then, you can access your secrets within your code with the appropriate loading utility, such as (here, the `yaml` module is not built-in, but comes from the package `PyYAML`): 52 | 53 | ```python 54 | import yaml 55 | 56 | with open('path/to/secrets.yaml', 'r') as f: 57 | # loads contents of secrets.yaml into a python dictionary 58 | secret_config = yaml.safe_load(f.read()) 59 | ``` 60 | 61 | This can be an easy way to feed secrets into your programs, but you'll need to **ensure these secrets don't accidentally get committed to GitHub**. You could either provide the path to the config file as an input parameter to your program (in which case, you could keep the secrets file somewhere entirely outside of the git repo, such as your home directory) or have it live in some expected location within the structure of the GitHub repo, but use a `.gitignore` file to avoid committing the secrets file itself. 62 | 63 | To do so, edit (or create) your `.gitignore` file at the top level of your repo to add (in the example where the secrets are contained in `secrets.yaml`): 64 | 65 | ``` 66 | # ignore secrets config 67 | secrets.yaml 68 | ``` 69 | 70 | Make sure you've added and committed the `.gitignore` file to your repo, and then you should be able to confirm that your secrets file isn't being tracked with `git status`.
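As an extra sanity check (assuming the file is named `secrets.yaml` as above; the repo path in the prompt is just illustrative), `git check-ignore -v` will report which ignore rule is excluding the file, with output that looks something like:

```bash
you@server:~/my-repo$ git check-ignore -v secrets.yaml
.gitignore:2:secrets.yaml	secrets.yaml
```

If this prints nothing (and exits with a non-zero status), the file is not being ignored, and you should double-check your `.gitignore`.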
71 | -------------------------------------------------------------------------------- /techhelp/img/jupyter-login.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/techhelp/img/jupyter-login.png -------------------------------------------------------------------------------- /techhelp/img/jupyter-new-nb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/techhelp/img/jupyter-new-nb.png -------------------------------------------------------------------------------- /techhelp/img/jupyter-shutdown.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/techhelp/img/jupyter-shutdown.png -------------------------------------------------------------------------------- /techhelp/img/jupyter-terminal.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/techhelp/img/jupyter-terminal.png -------------------------------------------------------------------------------- /techhelp/jupyter_setup.md: -------------------------------------------------------------------------------- 1 | # Using Jupyter Notebooks 2 | 3 | ## tl;dr 4 | 1. Start the jupyter server on the class server from your project directory and select a port number **between 1024 and 65535** (more details below if you get an error) 5 | ```bash 6 | cd /data/groups/{group-name}/ 7 | jupyter notebook --no-browser --port {YOUR_PORT} 8 | ``` 9 | 10 | Take note of the token once the command finishes running. It will appear in a string similar to ``[I 04:14:21.181 NotebookApp] http://localhost:8888/?token=65d0e5010af61874004ddeea962cd727992a593b82bc4e1b`` 11 | 12 | 2. Set up an SSH tunnel to connect to the server from your laptop: 13 | 14 | ```bash 15 | ssh -N -L localhost:8888:localhost:{YOUR_PORT} {YOUR_ANDREW_ID}@mlpolicylab.dssg.io 16 | ``` 17 | 18 | 3. Open a browser on your laptop, navigate to http://localhost:8888/, and enter the token from step 1. Make sure to select the kernel with your group name when creating a notebook 19 | 20 | ## Before you get started 21 | 22 | Although jupyter notebooks aren't a good environment for running your ML pipeline and models, they can be useful for exploratory data analysis as well as for visualizing modeling results. Since the data needs to stay in the AWS environment, you'll need to do so by running a notebook server on the remote machine and creating an SSH tunnel (because the course server can only be accessed via the SSH protocol) so you can access it via your local browser. 23 | 24 | One important note: **be sure to explicitly shut down the kernels when you're done working with a notebook** (you can do this from the notebook directory listing: see the figure below) as "zombie" notebook sessions can end up using a lot of processing power! 25 | 26 | ![notebook shutdown](/techhelp/img/jupyter-shutdown.png) 27 | 28 | ## Starting up the server 29 | On the course server, you'll want to choose an open port for your notebook server (so you can consistently access it in the same place). You can see the ports currently in use with: 30 | ```bash 31 | ss -lntu 32 | ``` 33 | Choose a port number **between 1024 and 65535** that is **NOT** on that list.
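For example (the port number here is purely illustrative), you can quickly check whether a specific candidate port is free by filtering that listing -- no output means nothing is currently listening on it:

```bash
# no output means port 8891 is free to use for your notebook server
ss -lntu | grep ':8891 '
```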
34 | 35 | Next, you'll actually start the notebook server -- you may want to consider doing this in a `screen` session to keep the server persistent (see the [linux command line section of the tech setup readme](https://github.com/dssg/mlforpublicpolicylab/tree/master/techhelp#linux-command-line-bash) for details). Make sure you're in your group's python virtualenv and start your notebook server, for instance using: 36 | ```bash 37 | cd /data/groups/{group-name}/ 38 | jupyter notebook --no-browser --port {YOUR_PORT} 39 | ``` 40 | or 41 | ```bash 42 | source /data/groups/{group-name}/dssg_env/bin/activate 43 | jupyter notebook --no-browser --port {YOUR_PORT} 44 | ``` 45 | Your group name is bills1, schools1, etc. 46 | 47 | Note that whatever directory you're in when you start the server is where your notebooks will be stored. Starting the server will print out a message indicating that the server is starting and giving you a token you can use to access it, which looks something like this: 48 | 49 | ![notebook server startup](/techhelp/img/jupyter-terminal.png) 50 | 51 | Take note of the token (outlined with the red box in the image), as you'll need this to log in. 52 | 53 | ## Accessing the server from your browser 54 | Now, on your local machine, you'll need to set up an SSH tunnel to connect to the server: 55 | 56 | ```bash 57 | ssh -N -L localhost:8888:localhost:{YOUR_PORT} {YOUR_ANDREW_ID}@mlpolicylab.dssg.io 58 | ``` 59 | 60 | Note that if you already have a local notebook server running, you may need to choose a different port than 8888 to map to, but we'll assume this is open here. Also, you may need to specify the `-i` parameter to provide the path to your private key file. If you're on Windows, you may need to do this using PuTTY -- [see the instructions here](https://docs.bitnami.com/bch/faq/get-started/access-ssh-tunnel/) 61 | 62 | Running this command won't look like it did anything because it's just opening a connection between your machine and the course server to route traffic from the local port (here, 8888) to the port you chose for your notebook server on the class server. **You'll need to keep this terminal/PuTTY session open to maintain the tunnel.** 63 | 64 | Finally, open a browser of your choice on your local machine and navigate to http://localhost:8888/ and you should get a jupyter notebook login page asking for the token that was generated when you started the server (if this doesn't work, you might also try http://0.0.0.0:8888/ or http://127.0.0.1:8888/ ): 65 | 66 | ![notebook browser login](/techhelp/img/jupyter-login.png) 67 | 68 | If you successfully log in, you should see a directory listing for wherever you started the notebook server on the remote server, allowing you to create new python files. 69 | 70 | To make sure packages you install in your group's python virtualenv are available, we have created a kernel with each group's name that uses this virtualenv. **Be sure you're selecting the kernel with your group name when creating a notebook** (we'll eventually be deleting the "Python 3" kernel to avoid confusion as well): 71 | 72 | ![creating a new notebook](/techhelp/img/jupyter-new-nb.png) 73 | 74 | ## Shutting down 75 | You'll need to do two things to shut down your notebook server: 76 | 1. Kill the notebook server on the remote machine (return to the terminal/screen window where the server is running and type control-C then `y` when prompted if you really want to shut down) 77 | 1.
Close the SSH tunnel on your local machine: on Linux/macOS, you can do so by running `ps aux | grep {YOUR_PORT}` to find the process id (PID) then using `kill {PID}`, or alternatively closing the terminal session you used to start it. With PuTTY on Windows, you should simply be able to close the PuTTY session. 78 | -------------------------------------------------------------------------------- /techhelp/pipelines_session.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/techhelp/pipelines_session.pptx -------------------------------------------------------------------------------- /techhelp/python_sql_tech_session.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Python + SQL Tech Session\n", 8 | "\n", 9 | "Today we'll be covering:\n", 10 | "1. Connecting to the database from python\n", 11 | "1. Using templated SQL in python\n", 12 | "1. Getting data into and out of postgres efficiently\n", 13 | "1. Advanced SQL\n", 14 | " - CTEs (WITH clauses)\n", 15 | " - window functions\n", 16 | " - indices / check plan\n", 17 | " - temp tables\n", 18 | "\n", 19 | "### Some initial setup\n", 20 | "Downloading the materials we'll need:\n", 21 | "1. SSH to the class server\n", 22 | "1. Make sure you're in your home directory: `cd ~`\n", 23 | "1. Download the notebook: `wget https://raw.githubusercontent.com/dssg/mlforpublicpolicylab/master/techhelp/python_sql_tech_session.ipynb`\n", 24 | "1. Download the sql template example: `wget https://raw.githubusercontent.com/dssg/mlforpublicpolicylab/master/techhelp/tech_session_template.sql`\n", 25 | "1. Take a look at the sql template: `less tech_session_template.sql` (Type `q` to exit)\n", 26 | "\n", 27 | "Install some packages in your group virtualenv (only one person should need to do this):\n", 28 | "1. SSH to the class server (if you're not already there)\n", 29 | "1. Activate virtualenv: `source /data/groups/{your_group}/dssg_env/bin/activate`\n", 30 | "1. Install pandas and matplotlib: `pip install pandas matplotlib`\n", 31 | "1. Install psycopg2 and sqlalchemy (to connect to postgres): `pip install psycopg2-binary sqlalchemy`\n", 32 | "1. Install ohio (tool for moving data to/from postgres): `pip install ohio`\n", 33 | "1. Install PyYAML (to read YAML format): `pip install PyYAML`\n", 34 | "\n", 35 | "Create a secrets file:\n", 36 | "1. SSH to the class server (if you're not already there)\n", 37 | "1. Make sure you're in your home directory: `cd ~`\n", 38 | "1. Create the secrets file: `touch secrets.yaml`\n", 39 | "1. Restrict access to the file: `chmod 600 secrets.yaml`\n", 40 | "1. Edit the file: `nano secrets.yaml`\n", 41 | "1. Fill it in with contents (remember, your password can be found in your `.pgpass` file):\n", 42 | " ```\n", 43 | " db:\n", 44 | " host: mlpolicylab.db.dssg.io\n", 45 | " port: 5432\n", 46 | " dbname: db_donorschoose_example\n", 47 | " user: {your_andrewid}\n", 48 | " password: {your_db_password}\n", 49 | " ```\n", 50 | "\n", 51 | "Start up your jupyter server (detailed instructions [here](https://github.com/dssg/mlforpublicpolicylab/blob/master/techhelp/jupyter_setup.md)):\n", 52 | "1. SSH to the class server (if you're not already there)\n", 53 | "1. Start a screen session: `screen`\n", 54 | "1. 
Choose a port (if you haven't already): `ss -lntu` (pick a port between 1024 and 65535 that is NOT on that list)\n", 55 | "1. Make sure you're in your home directory: `cd ~`\n", 56 | "1. Activate virtualenv: `source /data/groups/{your_group}/dssg_env/bin/activate`\n", 57 | "1. Start your server: `jupyter notebook --port {port_from_above} --no-browser` (make note of the token here)\n", 58 | "1. ON YOUR LOCAL MACHINE, create an SSH tunnel: `ssh -N -L localhost:8888:localhost:{YOUR_PORT} {YOUR_ANDREW_ID}@mlpolicylab.dssg.io` (or [using PuTTY on Windows](https://docs.bitnami.com/bch/faq/get-started/access-ssh-tunnel/))\n", 59 | "1. ON YOUR LOCAL MACHINE, open a browser and navigate to: `http://localhost:8888/`\n", 60 | "1. Fill in the token from the jupyter server\n", 61 | "1. Open this notebook\n", 62 | "1. **Be sure to choose your group kernel from the \"Kernel\" menu**\n" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "## Import packages" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "import matplotlib.pyplot as plt\n", 79 | "import pandas as pd\n", 80 | "from sqlalchemy import create_engine\n", 81 | "import yaml\n", 82 | "\n", 83 | "import ohio.ext.pandas" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": {}, 89 | "source": [ 90 | "## TOPIC 1: Connect to the database from python" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "with open('secrets.yaml', 'r') as f:\n", 100 | " secrets = yaml.safe_load(f)\n", 101 | "\n", 102 | "db_params = secrets['db']\n", 103 | "engine = create_engine('postgresql://{user}:{password}@{host}:{port}/{dbname}'.format(\n", 104 | " host=db_params['host'],\n", 105 | " port=db_params['port'],\n", 106 | " dbname=db_params['dbname'],\n", 107 | " user=db_params['user'],\n", 108 | " password=db_params['password'] \n", 109 | "))\n" 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": {}, 115 | "source": [ 116 | "We're connected to a database with data from the DonorsChoose organization. 
It has a few useful tables:\n", 117 | "- `public.projects` -- general information about projects\n", 118 | "- `public.resources` -- detailed information about requested resources\n", 119 | "- `public.essays` -- project titles and descriptions\n", 120 | "- `public.donations` -- separate record for each donation to a project\n", 121 | "\n", 122 | "There's also a `sketch` schema you can use to create tables in" 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "metadata": {}, 128 | "source": [ 129 | "### Simple select statement with sqlalchemy engine" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "sql = \"SELECT projectid, schoolid, resource_type FROM public.projects LIMIT 3\"\n", 139 | "\n", 140 | "result_set = engine.execute(sql)\n", 141 | "for rec in result_set:\n", 142 | " print(rec)" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": {}, 148 | "source": [ 149 | "### Pandas will give a little cleaner output" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "sql = \"SELECT projectid, schoolid, resource_type FROM public.projects LIMIT 3\"\n", 159 | "\n", 160 | "pd.read_sql(sql, engine)" 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "metadata": {}, 166 | "source": [ 167 | "## Simple Table Manipulation with sqlalchemy (we'll do something more efficient below)\n", 168 | "\n", 169 | "Let's create a little table to track your stocks of halloween candy (fill in your andrew id below)" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "andrew_id = # FILL IN YOUR andrew_id HERE!\n", 179 | "candy_table = '{}_candy'.format(andrew_id)\n", 180 | "table_schema = 'sketch'" 181 | ] 182 | }, 183 | { 184 | "cell_type": "markdown", 185 | "metadata": {}, 186 | "source": [ 187 | "Execute an appropriate CREATE statement" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 193 | "metadata": {}, 194 | "outputs": [], 195 | "source": [ 196 | "create_sql = '''CREATE TABLE IF NOT EXISTS {}.{} (\n", 197 | " candy_type varchar NULL,\n", 198 | " amount int,\n", 199 | " units varchar\n", 200 | ");'''.format(table_schema, candy_table)\n", 201 | "\n", 202 | "engine.execute(create_sql)" 203 | ] 204 | }, 205 | { 206 | "cell_type": "markdown", 207 | "metadata": {}, 208 | "source": [ 209 | "**IMPORTANT NOTE**: Statements that modify the state of the database will not be physically reflected until we tell the connection to commit these changes. If you went into DBeaver now, you still wouldn't see this new table!" 
210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": null, 215 | "metadata": {}, 216 | "outputs": [], 217 | "source": [ 218 | "engine.execute(\"COMMIT\")" 219 | ] 220 | }, 221 | { 222 | "cell_type": "markdown", 223 | "metadata": {}, 224 | "source": [ 225 | "Now let's insert a few records (again note that we have to **commit** for the records to show up):" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": null, 231 | "metadata": {}, 232 | "outputs": [], 233 | "source": [ 234 | "insert_sql = '''INSERT INTO {}.{}\n", 235 | " (candy_type, amount, units)\n", 236 | " VALUES(%s, %s, %s);\n", 237 | "'''.format(table_schema, candy_table)\n", 238 | "\n", 239 | "records_to_insert = [('snickers', 10, 'bars'), ('candy corn', 5, 'bags'), ('peanut butter cups', 15, 'cups')]\n", 240 | "\n", 241 | "for record in records_to_insert:\n", 242 | " engine.execute(insert_sql, record)\n", 243 | "\n", 244 | "engine.execute(\"COMMIT\")" 245 | ] 246 | }, 247 | { 248 | "cell_type": "markdown", 249 | "metadata": {}, 250 | "source": [ 251 | "Let's look at the results:" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": null, 257 | "metadata": {}, 258 | "outputs": [], 259 | "source": [ 260 | "sql = \"SELECT * FROM {}.{}\".format(table_schema, candy_table)\n", 261 | "\n", 262 | "pd.read_sql(sql, engine)" 263 | ] 264 | }, 265 | { 266 | "cell_type": "markdown", 267 | "metadata": {}, 268 | "source": [ 269 | "Clean up: drop the table and commit:" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": null, 275 | "metadata": {}, 276 | "outputs": [], 277 | "source": [ 278 | "drop_sql = \"DROP TABLE {}.{}\".format(table_schema, candy_table)\n", 279 | "\n", 280 | "engine.execute(drop_sql)\n", 281 | "engine.execute(\"COMMIT\")" 282 | ] 283 | }, 284 | { 285 | "cell_type": "markdown", 286 | "metadata": {}, 287 | "source": [ 288 | "## TOPIC 2: Using Templated SQL\n", 289 | "\n", 290 | "Templating SQL statements and filling them in dynamically with python can be very helpful as you're transforming data for your projects, for instance, creating features, labels, and matrices for different temporal validation splits in your data.\n", 291 | "\n", 292 | "We've actually been doing a little bit of this already (e.g., filling in table names and insert values above), but let's look at a couple of examples in more detail with the donors choose data. Suppose we wanted to look at the sets of projects posted on a few given days:" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": null, 298 | "metadata": {}, 299 | "outputs": [], 300 | "source": [ 301 | "sql_template = \"\"\"\n", 302 | "SELECT projectid, resource_type, poverty_level, date_posted\n", 303 | "FROM public.projects\n", 304 | "WHERE date_posted = '{}'::DATE\n", 305 | "\"\"\"\n", 306 | "\n", 307 | "results = []\n", 308 | "for dt in ['2014-05-01', '2014-04-15', '2014-04-01']:\n", 309 | " sql = sql_template.format(dt)\n", 310 | " results.append(pd.read_sql(sql, engine))\n" 311 | ] 312 | }, 313 | { 314 | "cell_type": "markdown", 315 | "metadata": {}, 316 | "source": [ 317 | "Do some quick checks:\n", 318 | "1. How many result sets did we get back?\n", 319 | "1. Look at the first few results of one of the sets, are they all on the right date?\n", 320 | "1. How many projects were posted on each of these days?" 
321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": null, 326 | "metadata": {}, 327 | "outputs": [], 328 | "source": [ 329 | "# Number of result sets" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": null, 335 | "metadata": {}, 336 | "outputs": [], 337 | "source": [ 338 | "# First few records of one set" 339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": null, 344 | "metadata": {}, 345 | "outputs": [], 346 | "source": [ 347 | "# Number of projects on each date" 348 | ] 349 | }, 350 | { 351 | "cell_type": "markdown", 352 | "metadata": {}, 353 | "source": [ 354 | "#### Some simple data visualization\n", 355 | "\n", 356 | "We won't go into detail here, but here's a quick example. See the matplotlib (or seaborn) documentation for more plot types and examples." 357 | ] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": null, 362 | "metadata": {}, 363 | "outputs": [], 364 | "source": [ 365 | "ix = 0\n", 366 | "df = results[ix].groupby('resource_type')['projectid'].count().reset_index()\n", 367 | "dt = results[ix]['date_posted'].max()\n", 368 | "\n", 369 | "fig, ax = plt.subplots()\n", 370 | "ax.bar('resource_type', 'projectid', data=df)\n", 371 | "ax.set_title('Counts by resource type for %s' % dt)\n", 372 | "ax.set_ylabel('Number of Projects')\n", 373 | "plt.show()" 374 | ] 375 | }, 376 | { 377 | "cell_type": "markdown", 378 | "metadata": {}, 379 | "source": [ 380 | "### Templated SQL stored in a file\n", 381 | "\n", 382 | "If your queries get long or complex, you might want to move them out to separate files to keep your code a bit cleaner. We've provided an example to work with in `tech_session_template.sql` -- let's read that in here.\n", 383 | "\n", 384 | "Note that we're just making use of basic python templating here, but if you want to use more complex logic in your templates, check out packages like [Jinja2](https://jinja.palletsprojects.com/en/2.11.x/)" 385 | ] 386 | }, 387 | { 388 | "cell_type": "code", 389 | "execution_count": null, 390 | "metadata": {}, 391 | "outputs": [], 392 | "source": [ 393 | "# Read the template file\n", 394 | "with open('tech_session_template.sql', 'r') as f:\n", 395 | " sql_template = f.read()\n", 396 | "\n", 397 | "# Look at the contents:\n", 398 | "print(sql_template)" 399 | ] 400 | }, 401 | { 402 | "cell_type": "markdown", 403 | "metadata": {}, 404 | "source": [ 405 | "**Looks like we'll need a few parameters:**\n", 406 | "- table_schema\n", 407 | "- table_name\n", 408 | "- state_list\n", 409 | "- start_dt\n", 410 | "- end_dt\n", 411 | "\n", 412 | "Notice as well that we've explicitly encoded all of these columns by hand, but you might want to think about how you might construct the sets of columns for one-hot encoded categoricals programmatically from the data, as well as the other types of features we've discussed (like aggregations in different time windows)..."
413 | ] 414 | }, 415 | { 416 | "cell_type": "code", 417 | "execution_count": null, 418 | "metadata": {}, 419 | "outputs": [], 420 | "source": [ 421 | "table_schema = 'public'\n", 422 | "table_name = 'projects'\n", 423 | "state_list = ['CA', 'NY', 'PA']\n", 424 | "start_dt = '2014-03-14'\n", 425 | "end_dt = '2014-04-30'\n", 426 | "\n", 427 | "sql = sql_template.format(\n", 428 | " table_schema=table_schema,\n", 429 | " table_name=table_name,\n", 430 | " state_list=state_list,\n", 431 | " start_dt=start_dt,\n", 432 | " end_dt=end_dt\n", 433 | ")\n", 434 | "\n", 435 | "# Let's take a look...\n", 436 | "print(sql)" 437 | ] 438 | }, 439 | { 440 | "cell_type": "markdown", 441 | "metadata": {}, 442 | "source": [ 443 | "**Looks like the square brackets in that state list will generate an error!**\n", 444 | "\n", 445 | "Let's try formatting it before doing the templating:" 446 | ] 447 | }, 448 | { 449 | "cell_type": "code", 450 | "execution_count": null, 451 | "metadata": {}, 452 | "outputs": [], 453 | "source": [ 454 | "def list_to_string(l, dtype='string'):\n", 455 | " if dtype=='string':\n", 456 | " return ','.join([\"'%s'\" % elm for elm in l])\n", 457 | " else:\n", 458 | " return ','.join([\"%s\" % elm for elm in l])\n", 459 | "\n", 460 | "\n", 461 | "state_list = list_to_string(['CA', 'NY', 'PA'])\n", 462 | "\n", 463 | "print(state_list)" 464 | ] 465 | }, 466 | { 467 | "cell_type": "code", 468 | "execution_count": null, 469 | "metadata": {}, 470 | "outputs": [], 471 | "source": [ 472 | "sql = sql_template.format(\n", 473 | " table_schema=table_schema,\n", 474 | " table_name=table_name,\n", 475 | " state_list=state_list,\n", 476 | " start_dt=start_dt,\n", 477 | " end_dt=end_dt\n", 478 | ")\n", 479 | "\n", 480 | "# Let's take a look...\n", 481 | "print(sql)" 482 | ] 483 | }, 484 | { 485 | "cell_type": "markdown", 486 | "metadata": {}, 487 | "source": [ 488 | "**Looks better!** Let's try running it now..." 489 | ] 490 | }, 491 | { 492 | "cell_type": "code", 493 | "execution_count": null, 494 | "metadata": {}, 495 | "outputs": [], 496 | "source": [ 497 | "df = pd.read_sql(sql, engine)\n", 498 | "\n", 499 | "df.head(10)" 500 | ] 501 | }, 502 | { 503 | "cell_type": "markdown", 504 | "metadata": {}, 505 | "source": [ 506 | "## TOPIC 3: Getting data into and out of postgres efficiently\n", 507 | "\n", 508 | "At the command line, one very efficient way of getting data into postgres is to stream it to a `COPY` statement on `STDIN`; this might look something like:\n", 509 | "```\n", 510 | "cat my_file.csv | psql -h mlpolicylab.db.dssg.io {group_database} -c \"COPY {schema}.{table} FROM STDIN CSV HEADER\"\n", 511 | "```\n", 512 | "(more details in the [postgres documentation](https://www.postgresql.org/docs/11/sql-copy.html))\n", 513 | "\n", 514 | "Similarly, you can use the `\\copy` command from within `psql` itself -- you can find [documentation here](https://www.postgresql.org/docs/11/app-psql.html) (search for \"\\copy\").\n", 515 | "\n", 516 | "For today, we'll focus on a package called `ohio` that provides efficient tools for moving data between postgres and python. `ohio` provides interfaces for both `pandas` dataframes and `numpy` arrays, but we'll focus on the `pandas` tools here, which are provided via `import ohio.ext.pandas` (see the [docs for the numpy examples](https://github.com/dssg/ohio#extensions-for-numpy))\n", 517 | "\n", 518 | "Note that `ohio` is dramatically more efficient than the built-in `df.to_sql()` (see the benchmarking graph below). 
The pandas function tries to be agnostic about SQL flavor by inserting data row-by-row, while `ohio` uses postgres-specific copy functionality to move the data much more quickly (and with lower memory overhead as well):\n", 519 | "\n", 520 | "![ohio benchmarking](https://raw.githubusercontent.com/dssg/ohio/0.5.0/doc/img/profile-copy-from-dataframe-to-databas-1555458507.svg?sanitize=true)\n", 521 | "\n", 522 | "Let's try it out by re-creating our halloween candy table." 523 | ] 524 | }, 525 | { 526 | "cell_type": "code", 527 | "execution_count": null, 528 | "metadata": {}, 529 | "outputs": [], 530 | "source": [ 531 | "andrew_id = # FILL IN YOUR andrew_id HERE!\n", 532 | "candy_table = '{}_candy'.format(andrew_id)\n", 533 | "table_schema = 'sketch'" 534 | ] 535 | }, 536 | { 537 | "cell_type": "code", 538 | "execution_count": null, 539 | "metadata": {}, 540 | "outputs": [], 541 | "source": [ 542 | "create_sql = '''CREATE TABLE IF NOT EXISTS {}.{} (\n", 543 | " candy_type varchar NULL,\n", 544 | " amount int,\n", 545 | " units varchar\n", 546 | ");'''.format(table_schema, candy_table)\n", 547 | "\n", 548 | "engine.execute(create_sql)\n", 549 | "engine.execute(\"COMMIT\")" 550 | ] 551 | }, 552 | { 553 | "cell_type": "markdown", 554 | "metadata": {}, 555 | "source": [ 556 | "### Inserting data with df.pg_copy_to()" 557 | ] 558 | }, 559 | { 560 | "cell_type": "code", 561 | "execution_count": null, 562 | "metadata": {}, 563 | "outputs": [], 564 | "source": [ 565 | "df = pd.DataFrame({\n", 566 | " 'candy_type': ['snickers', 'cookies', 'candy apples', 'peanut butter cups', 'candy corn'],\n", 567 | " 'amount': [1,1,2,3,5],\n", 568 | " 'units': ['bars', 'cookies', 'apples', 'cups', 'bags']\n", 569 | "})\n", 570 | "\n", 571 | "# The ohio package adds a `pg_copy_to` method to your dataframes...\n", 572 | "df.pg_copy_to(candy_table, engine, schema=table_schema, index=False, if_exists='append')" 573 | ] 574 | }, 575 | { 576 | "cell_type": "markdown", 577 | "metadata": {}, 578 | "source": [ 579 | "### Reading data with pd.DataFrame.pg_copy_from()\n", 580 | "\n", 581 | "We can read the data from the table we just created using `pg_copy_from`:" 582 | ] 583 | }, 584 | { 585 | "cell_type": "code", 586 | "execution_count": null, 587 | "metadata": {}, 588 | "outputs": [], 589 | "source": [ 590 | "result_df = pd.DataFrame.pg_copy_from(candy_table, engine, schema=table_schema)\n", 591 | "\n", 592 | "result_df" 593 | ] 594 | }, 595 | { 596 | "cell_type": "markdown", 597 | "metadata": {}, 598 | "source": [ 599 | "Note that `pg_copy_from` can accept a query as well:" 600 | ] 601 | }, 602 | { 603 | "cell_type": "code", 604 | "execution_count": null, 605 | "metadata": {}, 606 | "outputs": [], 607 | "source": [ 608 | "sql = \"\"\"\n", 609 | "SELECT\n", 610 | " CASE WHEN candy_type IN ('snickers', 'cookies', 'peanut butter cups') THEN 'has chocolate' ELSE 'non-chocolate' END AS chocolate_flag,\n", 611 | " SUM(amount) AS total_number\n", 612 | "FROM {}.{}\n", 613 | "GROUP BY 1\n", 614 | "\"\"\".format(table_schema, candy_table)\n", 615 | "\n", 616 | "result_df = pd.DataFrame.pg_copy_from(sql, engine)\n", 617 | "\n", 618 | "result_df" 619 | ] 620 | }, 621 | { 622 | "cell_type": "markdown", 623 | "metadata": {}, 624 | "source": [ 625 | "## TOPIC 4: Advanced SQL\n", 626 | "\n", 627 | "Finally for today, we want to talk about a few more advanced SQL functions that will likely be helpful as you're starting to prepare your features and training/test matrices. 
We **strongly encourage** you to do as much of that data manipulation as you can in the database, as postgres is well-optimized for this sort of work. The functions here should help make that work a bit easier as well.\n", 628 | "\n", 629 | "The idea here is to give you an overview of some of the things that are possible, which you might want to explore further. You can find a more in-depth [tutorial here](https://dssg.github.io/hitchhikers-guide/curriculum/2_data_exploration_and_analysis/advanced_sql/), with links out to additional documentation as well." 630 | ] 631 | }, 632 | { 633 | "cell_type": "markdown", 634 | "metadata": {}, 635 | "source": [ 636 | "### CTEs (WITH clauses)\n", 637 | "\n", 638 | "Common table expressions (CTEs), also known as WITH clauses, are a better alternative to subqueries both in terms of code readability and (in some cases) performance. They can allow you to break up a complex query into constituent parts, making the logic of your code a little easier to follow.\n", 639 | "\n", 640 | "By way of example, suppose we wanted to calculate the fraction of different types of projects (based on their requested type of resource) that were fully funded in MD in January 2013. Here's how we might do that with CTEs:" 641 | ] 642 | }, 643 | { 644 | "cell_type": "code", 645 | "execution_count": null, 646 | "metadata": {}, 647 | "outputs": [], 648 | "source": [ 649 | "sql = \"\"\"\n", 650 | "WITH md_projects AS (\n", 651 | " SELECT *\n", 652 | " FROM public.projects\n", 653 | " WHERE school_state='MD'\n", 654 | " AND date_posted BETWEEN '2013-01-01'::DATE AND '2013-01-31'::DATE\n", 655 | ")\n", 656 | ", total_donations AS (\n", 657 | " SELECT p.projectid, COALESCE(SUM(d.donation_total), 0) AS total_amount\n", 658 | " FROM md_projects p\n", 659 | " LEFT JOIN public.donations d USING(projectid)\n", 660 | " GROUP BY 1\n", 661 | ")\n", 662 | ", fully_funded AS (\n", 663 | " SELECT p.*, td.total_amount,\n", 664 | " CASE WHEN td.total_amount > p.total_price_excluding_optional_support THEN 1 ELSE 0 END AS funded_flag\n", 665 | " FROM md_projects p\n", 666 | " LEFT JOIN total_donations td USING(projectid)\n", 667 | ")\n", 668 | "SELECT resource_type, COUNT(*) AS num_projects, AVG(funded_flag) AS frac_funded\n", 669 | "FROM fully_funded\n", 670 | "GROUP BY 1\n", 671 | "ORDER BY 3 DESC\n", 672 | "\"\"\"\n", 673 | "\n", 674 | "pd.read_sql(sql, engine)" 675 | ] 676 | }, 677 | { 678 | "cell_type": "code", 679 | "execution_count": null, 680 | "metadata": {}, 681 | "outputs": [], 682 | "source": [ 683 | "### HANDS-ON: For all the MD projects posted in January 2013 that received any donations\n", 684 | "### what is the average fraction of donations coming from teachers by resource type?\n", 685 | "### (note: the donations table has a boolean `is_teacher_acct` column that will be useful)\n", 686 | "\n", 687 | "sql = \"\"\"\n", 688 | "\n", 689 | "\"\"\"\n", 690 | "\n", 691 | "pd.read_sql(sql, engine)" 692 | ] 693 | }, 694 | { 695 | "cell_type": "markdown", 696 | "metadata": {}, 697 | "source": [ 698 | "### Analytic (Window) Functions\n", 699 | "\n", 700 | "Postgres provides powerful functionality for calculating complex metrics such as within-group aggregates, running averages, etc., called \"window functions\" (because they operate over a defined window of the data relative to a given row):\n", 701 | "- They are similar to aggregate functions, but instead of operating on groups of rows to produce a single row, they act on rows related to the current row to produce the same 
number of rows.\n", 702 | "- There are several window functions like `row_number`, `rank`, `ntile`, `lag`, `lead`, `first_value`, `last_value`, `nth_value`.\n", 703 | "- And you can use any aggregation functions: `sum`, `count`, `avg`, `json_agg`, `array_agg`, etc.\n", 704 | "\n", 705 | "Suppose we want to answer a couple of questions:\n", 706 | "- What fraction of all projects in MD are posted by each schoolid?\n", 707 | "- What is the most recently posted project for each school in MD?\n", 708 | "- Calculate a running average of the total ask amount of the 4 most recent projects at a given school (say, `schoolid='ff2695b8b7f3ade678358f6e5c621c1e'`)" 709 | ] 710 | }, 711 | { 712 | "cell_type": "code", 713 | "execution_count": null, 714 | "metadata": {}, 715 | "outputs": [], 716 | "source": [ 717 | "## HANDS-ON: Try answering those questions with SELECT, GROUP BY, HAVING, and WHERE alone" 718 | ] 719 | }, 720 | { 721 | "cell_type": "markdown", 722 | "metadata": {}, 723 | "source": [ 724 | "Now let's look at how we'd answer these questions with window functions...\n", 725 | "\n", 726 | "**Fraction of projects by school**\n", 727 | "\n", 728 | "Here, we'll group by schools but calculate the number of projects across all schools in MD using:\n", 729 | "\n", 730 | "`SUM(COUNT(*)) OVER ()`\n", 731 | "\n", 732 | "In that statement, `COUNT(*)` is the number of projects at the given school, then we're summing that count across all the aggregated rows with `SUM(.) OVER ()`. There, the `OVER ()` indicates the window across which to take the sum -- in this case, an empty window (that is, `()`) indicates using all records in the table." 733 | ] 734 | }, 735 | { 736 | "cell_type": "code", 737 | "execution_count": null, 738 | "metadata": {}, 739 | "outputs": [], 740 | "source": [ 741 | "result_df = pd.read_sql(\"\"\"\n", 742 | "SELECT schoolid, \n", 743 | " COUNT(*) AS num_projects, \n", 744 | " 1.000*COUNT(*)/SUM(COUNT(*)) OVER () AS frac_at_school\n", 745 | "FROM public.projects\n", 746 | "WHERE school_state = 'MD'\n", 747 | "GROUP BY 1\n", 748 | "ORDER BY 3 DESC\n", 749 | "\"\"\", engine)\n", 750 | "\n", 751 | "result_df.head()" 752 | ] 753 | }, 754 | { 755 | "cell_type": "markdown", 756 | "metadata": {}, 757 | "source": [ 758 | "**Most recent project by school**\n", 759 | "\n", 760 | "Here, we'll use `row_number` to rank the projects (without ties) within school and by posting date. 
Note that the window here, `(PARTITION BY schoolid ORDER BY date_posted DESC)` means: within each school id, calculate a row number ordered by the posting date in descending order (so the most recent project by a given school will have `rn=1`, the second most recent will have `rn=2`, and so on).\n", 761 | "\n", 762 | "We do this row number calculation in a CTE, allowing us to pick out the most recent project for each school simply by looking for those with `rn=1` in a subsequent step:" 763 | ] 764 | }, 765 | { 766 | "cell_type": "code", 767 | "execution_count": null, 768 | "metadata": {}, 769 | "outputs": [], 770 | "source": [ 771 | "result_df = pd.read_sql(\"\"\"\n", 772 | "WITH school_rns AS (\n", 773 | " SELECT *, row_number() OVER (PARTITION BY schoolid ORDER BY date_posted DESC) AS rn\n", 774 | " FROM public.projects\n", 775 | " WHERE school_state = 'MD'\n", 776 | ")\n", 777 | "SELECT *\n", 778 | "FROM school_rns\n", 779 | "WHERE rn=1\n", 780 | ";\n", 781 | "\"\"\", engine)\n", 782 | "\n", 783 | "result_df.head()" 784 | ] 785 | }, 786 | { 787 | "cell_type": "markdown", 788 | "metadata": {}, 789 | "source": [ 790 | "**Running average of ask from last four projects**\n", 791 | "\n", 792 | "Here, we use postgres's functionality to restrict a window to certain rows relative to the given row. Our window is:\n", 793 | "```\n", 794 | "(PARTITION BY schoolid ORDER BY date_posted ASC ROWS BETWEEN 3 PRECEDING AND CURRENT ROW)\n", 795 | "```\n", 796 | "That is,\n", 797 | "- `PARTITION BY schoolid`: Do the calculation among records at the same school\n", 798 | "- `ORDER BY date_posted ASC`: Order the records by posting date (earliest first)\n", 799 | "- `ROWS BETWEEN 3 PRECEDING AND CURRENT ROW`: Given this ordering, calculate the average across the four most recent rows (including the current row)" 800 | ] 801 | }, 802 | { 803 | "cell_type": "code", 804 | "execution_count": null, 805 | "metadata": {}, 806 | "outputs": [], 807 | "source": [ 808 | "result_df = pd.read_sql(\"\"\"\n", 809 | "SELECT date_posted, projectid, schoolid, total_price_excluding_optional_support AS current_ask,\n", 810 | " AVG(total_price_excluding_optional_support) OVER (\n", 811 | " PARTITION BY schoolid ORDER BY date_posted ASC\n", 812 | " ROWS BETWEEN 3 PRECEDING AND CURRENT ROW\n", 813 | " ) AS running_avg_ask\n", 814 | "FROM public.projects\n", 815 | "WHERE schoolid = 'ff2695b8b7f3ade678358f6e5c621c1e'\n", 816 | "ORDER BY date_posted DESC\n", 817 | ";\n", 818 | "\"\"\", engine)\n", 819 | "\n", 820 | "result_df.head(10)" 821 | ] 822 | }, 823 | { 824 | "cell_type": "markdown", 825 | "metadata": {}, 826 | "source": [ 827 | "**Days since last project was posted**\n", 828 | "\n", 829 | "We can use the `lag()` window function to get the date of the most recent previously-posted project (see also `last_value` for more flexibility):" 830 | ] 831 | }, 832 | { 833 | "cell_type": "code", 834 | "execution_count": null, 835 | "metadata": {}, 836 | "outputs": [], 837 | "source": [ 838 | "result_df = pd.read_sql(\"\"\"\n", 839 | "SELECT date_posted, projectid, schoolid, total_price_excluding_optional_support AS current_ask,\n", 840 | " date_posted::DATE - (lag(date_posted) OVER (PARTITION BY schoolid ORDER BY date_posted ASC))::DATE AS days_since_last_proj\n", 841 | "FROM public.projects\n", 842 | "WHERE schoolid = 'ff2695b8b7f3ade678358f6e5c621c1e'\n", 843 | "ORDER BY date_posted DESC\n", 844 | ";\n", 845 | "\"\"\", engine)\n", 846 | "\n", 847 | "result_df.head(5)" 848 | ] 849 | }, 850 | { 851 | "cell_type": "code", 852 | 
"execution_count": null, 853 | "metadata": {}, 854 | "outputs": [], 855 | "source": [ 856 | "# What happens when we hit the end of the series?\n", 857 | "result_df.tail(5)" 858 | ] 859 | }, 860 | { 861 | "cell_type": "markdown", 862 | "metadata": {}, 863 | "source": [ 864 | "Notice the `NaN` (will be `NULL` in postgres) for the first record that doesn't have any previously-posted project, so you'd have to think about how you wanted to handle these edge cases in your feature development." 865 | ] 866 | }, 867 | { 868 | "cell_type": "markdown", 869 | "metadata": {}, 870 | "source": [ 871 | "### Indices / Checking the Query Plan\n", 872 | "\n", 873 | "Indices are particularly critical to the performance of postgres queries, especially as the data gets larger. You should think about adding indices to tables based on columns that will frequently be used for joins or filtering rows with `WHERE` clauses.\n", 874 | "\n", 875 | "A useful tool for understanding how the database will treat a given query is checking the query plan by using the `EXPLAIN` keyword before a `SELECT` statement:" 876 | ] 877 | }, 878 | { 879 | "cell_type": "code", 880 | "execution_count": null, 881 | "metadata": {}, 882 | "outputs": [], 883 | "source": [ 884 | "# Eliminate column width truncating\n", 885 | "pd.set_option('display.max_colwidth', None)" 886 | ] 887 | }, 888 | { 889 | "cell_type": "code", 890 | "execution_count": null, 891 | "metadata": {}, 892 | "outputs": [], 893 | "source": [ 894 | "pd.read_sql(\"\"\"\n", 895 | "EXPLAIN SELECT * FROM public.projects WHERE projectid = '32943bb1063267de6ed19fc0ceb4b9a7'\n", 896 | "\"\"\", engine)" 897 | ] 898 | }, 899 | { 900 | "cell_type": "markdown", 901 | "metadata": {}, 902 | "source": [ 903 | "Notice that picking out a specific project is making use of the index via `Index Scan`.\n", 904 | "\n", 905 | "By contrast, if we select projects for a given school:" 906 | ] 907 | }, 908 | { 909 | "cell_type": "code", 910 | "execution_count": null, 911 | "metadata": {}, 912 | "outputs": [], 913 | "source": [ 914 | "pd.read_sql(\"\"\"\n", 915 | "EXPLAIN SELECT * FROM public.projects WHERE schoolid = 'ff2695b8b7f3ade678358f6e5c621c1e'\n", 916 | "\"\"\", engine)" 917 | ] 918 | }, 919 | { 920 | "cell_type": "markdown", 921 | "metadata": {}, 922 | "source": [ 923 | "Here, `Seq Scan` tells us that postgres has to scan the entire table to find the right projects, which can be very expensive (especially with joins!). 
Also note how much higher the overall estimated cost is for this query in the first row here than for the query above.\n", 924 | "\n", 925 | "Likewise for joins, compare the two query plans below:" 926 | ] 927 | }, 928 | { 929 | "cell_type": "code", 930 | "execution_count": null, 931 | "metadata": {}, 932 | "outputs": [], 933 | "source": [ 934 | "pd.read_sql(\"\"\"\n", 935 | "EXPLAIN SELECT * FROM public.projects JOIN public.donations USING(projectid)\n", 936 | "\"\"\", engine)" 937 | ] 938 | }, 939 | { 940 | "cell_type": "code", 941 | "execution_count": null, 942 | "metadata": {}, 943 | "outputs": [], 944 | "source": [ 945 | "## NOTE: Please don't actually run this query without the EXPLAIN!!!\n", 946 | "\n", 947 | "pd.read_sql(\"\"\"\n", 948 | "EXPLAIN SELECT * FROM public.projects p JOIN public.donations d ON d.donation_timestamp > p.date_posted\n", 949 | "\"\"\", engine)" 950 | ] 951 | }, 952 | { 953 | "cell_type": "markdown", 954 | "metadata": {}, 955 | "source": [ 956 | "**CREATING INDICES**\n", 957 | "\n", 958 | "When you need to create indices as you build tables for your project, you can use this syntax:\n", 959 | "\n", 960 | "```\n", 961 | "CREATE INDEX ON {schema}.{table}({column});\n", 962 | "```\n", 963 | "\n", 964 | "Note that you can also specify a list of columns. If the given column (or set of columns) is a unique key for the table, you can get additional gains by declaring it as a primary key instead of simply creating an index:\n", 965 | "\n", 966 | "```\n", 967 | "ALTER TABLE {schema}.{table} ADD PRIMARY KEY ({column});\n", 968 | "```\n", 969 | "\n", 970 | "You can also find a little more documentation of postgres indices [here](https://www.postgresqltutorial.com/postgresql-indexes/postgresql-create-index/)" 971 | ] 972 | }, 973 | { 974 | "cell_type": "markdown", 975 | "metadata": {}, 976 | "source": [ 977 | "### Temporary Tables\n", 978 | "\n", 979 | "Breaking up complex queries with CTEs can make your code much more readable and may provide some performance gains, but further gains can often be realized by creating and indexing temporary tables. \n", 980 | "\n", 981 | "Let's rework one of the CTE examples from above using temporary tables: For all the MD projects posted in January 2013 that received any donations, what is the average fraction of donations coming from teachers by resource type?"
982 | ] 983 | }, 984 | { 985 | "cell_type": "code", 986 | "execution_count": null, 987 | "metadata": {}, 988 | "outputs": [], 989 | "source": [ 990 | "andrew_id = # FILL IN YOUR andrew_id HERE!\n", 991 | "\n", 992 | "# Temporary table and index for projects posted by MD schools in Jan 2013\n", 993 | "engine.execute(\"\"\"\n", 994 | "CREATE LOCAL TEMPORARY TABLE tmp_{}_md_projects\n", 995 | " ON COMMIT PRESERVE ROWS\n", 996 | " AS\n", 997 | " SELECT *\n", 998 | " FROM public.projects\n", 999 | " WHERE school_state='MD'\n", 1000 | " AND date_posted BETWEEN '2013-01-01'::DATE AND '2013-01-31'::DATE\n", 1001 | ";\n", 1002 | "\"\"\".format(andrew_id))\n", 1003 | "engine.execute(\"\"\"CREATE INDEX ON tmp_{}_md_projects(projectid);\"\"\".format(andrew_id))\n", 1004 | "engine.execute(\"COMMIT;\")\n", 1005 | "\n", 1006 | "# Temporary table and index for donations by teachers\n", 1007 | "engine.execute(\"\"\"\n", 1008 | "CREATE LOCAL TEMPORARY TABLE tmp_{}_teacher_donations\n", 1009 | " ON COMMIT PRESERVE ROWS\n", 1010 | " AS\n", 1011 | " SELECT d.projectid, SUM(CASE WHEN is_teacher_acct THEN d.donation_total ELSE 0 END)/SUM(d.donation_total) AS teacher_frac\n", 1012 | " FROM tmp_{}_md_projects p\n", 1013 | " JOIN public.donations d USING(projectid)\n", 1014 | " GROUP BY 1\n", 1015 | ";\n", 1016 | "\"\"\".format(andrew_id, andrew_id))\n", 1017 | "engine.execute(\"\"\"CREATE INDEX ON tmp_{}_teacher_donations(projectid);\"\"\".format(andrew_id))\n", 1018 | "engine.execute(\"COMMIT;\")\n", 1019 | "\n", 1020 | "# Join these two temporary tables to get our result\n", 1021 | "pd.read_sql(\"\"\"\n", 1022 | "SELECT p.resource_type, AVG(td.teacher_frac) AS avg_teacher_frac\n", 1023 | "FROM tmp_{}_md_projects p\n", 1024 | "JOIN tmp_{}_teacher_donations td USING(projectid)\n", 1025 | "GROUP BY 1\n", 1026 | "ORDER BY 2 DESC\n", 1027 | "\"\"\".format(andrew_id, andrew_id), engine)\n", 1028 | "\n" 1029 | ] 1030 | }, 1031 | { 1032 | "cell_type": "markdown", 1033 | "metadata": {}, 1034 | "source": [ 1035 | "## Clean Up\n", 1036 | "\n", 1037 | "drop the candy table and commit; dispose of the sqlalchemy engine" 1038 | ] 1039 | }, 1040 | { 1041 | "cell_type": "code", 1042 | "execution_count": null, 1043 | "metadata": {}, 1044 | "outputs": [], 1045 | "source": [ 1046 | "drop_sql = \"DROP TABLE {}.{}\".format(table_schema, candy_table)\n", 1047 | "\n", 1048 | "engine.execute(drop_sql)\n", 1049 | "engine.execute(\"COMMIT\")\n", 1050 | "\n", 1051 | "engine.execute(\"DROP TABLE IF EXISTS tmp_{}_md_projects\".format(andrew_id))\n", 1052 | "engine.execute(\"COMMIT\")\n", 1053 | "\n", 1054 | "engine.execute(\"DROP TABLE IF EXISTS tmp_{}_teacher_donations\".format(andrew_id))\n", 1055 | "engine.execute(\"COMMIT\")\n", 1056 | "\n", 1057 | "engine.dispose()" 1058 | ] 1059 | } 1060 | ], 1061 | "metadata": { 1062 | "kernelspec": { 1063 | "display_name": "kit_kernel", 1064 | "language": "python", 1065 | "name": "kit_kernel" 1066 | }, 1067 | "language_info": { 1068 | "codemirror_mode": { 1069 | "name": "ipython", 1070 | "version": 3 1071 | }, 1072 | "file_extension": ".py", 1073 | "mimetype": "text/x-python", 1074 | "name": "python", 1075 | "nbconvert_exporter": "python", 1076 | "pygments_lexer": "ipython3", 1077 | "version": "3.6.9" 1078 | } 1079 | }, 1080 | "nbformat": 4, 1081 | "nbformat_minor": 4 1082 | } 1083 | -------------------------------------------------------------------------------- /techhelp/remote-workflow/img/10718-workflow.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/techhelp/remote-workflow/img/10718-workflow.png -------------------------------------------------------------------------------- /techhelp/remote-workflow/img/bash-absolute-path.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/techhelp/remote-workflow/img/bash-absolute-path.png -------------------------------------------------------------------------------- /techhelp/remote-workflow/img/bash-anatomy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/techhelp/remote-workflow/img/bash-anatomy.png -------------------------------------------------------------------------------- /techhelp/remote-workflow/img/bash-nano-save.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/techhelp/remote-workflow/img/bash-nano-save.png -------------------------------------------------------------------------------- /techhelp/remote-workflow/img/bash-nano.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/techhelp/remote-workflow/img/bash-nano.png -------------------------------------------------------------------------------- /techhelp/remote-workflow/img/bash-pwd.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/techhelp/remote-workflow/img/bash-pwd.png -------------------------------------------------------------------------------- /techhelp/remote-workflow/img/jupyter-notebook-kernel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/techhelp/remote-workflow/img/jupyter-notebook-kernel.png -------------------------------------------------------------------------------- /techhelp/remote-workflow/img/jupyter-port-selection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/techhelp/remote-workflow/img/jupyter-port-selection.png -------------------------------------------------------------------------------- /techhelp/remote-workflow/img/jupyter-token.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/techhelp/remote-workflow/img/jupyter-token.png -------------------------------------------------------------------------------- /techhelp/remote-workflow/img/jupyter_kernel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/techhelp/remote-workflow/img/jupyter_kernel.png -------------------------------------------------------------------------------- /techhelp/remote-workflow/img/vscode-changed-interpreter.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/techhelp/remote-workflow/img/vscode-changed-interpreter.png -------------------------------------------------------------------------------- /techhelp/remote-workflow/img/vscode-click-find.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/techhelp/remote-workflow/img/vscode-click-find.png -------------------------------------------------------------------------------- /techhelp/remote-workflow/img/vscode-connect-to-host.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/techhelp/remote-workflow/img/vscode-connect-to-host.png -------------------------------------------------------------------------------- /techhelp/remote-workflow/img/vscode-enter-login.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/techhelp/remote-workflow/img/vscode-enter-login.png -------------------------------------------------------------------------------- /techhelp/remote-workflow/img/vscode-enter-venv-path.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/techhelp/remote-workflow/img/vscode-enter-venv-path.png -------------------------------------------------------------------------------- /techhelp/remote-workflow/img/vscode-file-menu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/techhelp/remote-workflow/img/vscode-file-menu.png -------------------------------------------------------------------------------- /techhelp/remote-workflow/img/vscode-open-connect-to-host.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/techhelp/remote-workflow/img/vscode-open-connect-to-host.png -------------------------------------------------------------------------------- /techhelp/remote-workflow/img/vscode-open-folder.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/techhelp/remote-workflow/img/vscode-open-folder.png -------------------------------------------------------------------------------- /techhelp/remote-workflow/img/vscode-remote-diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/techhelp/remote-workflow/img/vscode-remote-diagram.png -------------------------------------------------------------------------------- /techhelp/remote-workflow/img/vscode-remote-ssh-install.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/techhelp/remote-workflow/img/vscode-remote-ssh-install.png 
-------------------------------------------------------------------------------- /techhelp/remote-workflow/img/vscode-run-python.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/techhelp/remote-workflow/img/vscode-run-python.png -------------------------------------------------------------------------------- /techhelp/remote-workflow/img/vscode-select-folder.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/techhelp/remote-workflow/img/vscode-select-folder.png -------------------------------------------------------------------------------- /techhelp/remote-workflow/img/vscode-select-host.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/techhelp/remote-workflow/img/vscode-select-host.png -------------------------------------------------------------------------------- /techhelp/remote-workflow/img/vscode-select-interpreter-path.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/techhelp/remote-workflow/img/vscode-select-interpreter-path.png -------------------------------------------------------------------------------- /techhelp/remote-workflow/img/vscode-select-interpreter.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/techhelp/remote-workflow/img/vscode-select-interpreter.png -------------------------------------------------------------------------------- /techhelp/remote-workflow/img/vscode-select-python.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/techhelp/remote-workflow/img/vscode-select-python.png -------------------------------------------------------------------------------- /techhelp/remote-workflow/img/vscode-ssh-connected.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/techhelp/remote-workflow/img/vscode-ssh-connected.png -------------------------------------------------------------------------------- /techhelp/remote-workflow/img/vscode-update-config.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/techhelp/remote-workflow/img/vscode-update-config.png -------------------------------------------------------------------------------- /techhelp/remote-workflow/remote-workflow.md: -------------------------------------------------------------------------------- 1 | ## Intro to: Remote Workflow 2 | 3 | This document will provide you with tools for comfortably using our remote environment (the course server) to develop and test your team's pipeline. 
4 | 5 | ### Basic tools for common tasks 6 | 7 | We're providing setup instructions and support for "good enough" tools for each of the common tasks in the workflow for this class, but if you're comfortable with other tools, feel free to use them. 8 | 9 | 1. Writing code: 10 | - Python: This tutorial introduces ``VSCode``, an editor with good Python support, and some tools that make remote development easy. 11 | - However, feel free to use any editor you want (vim, emacs, Sublime, PyCharm). 12 | - SQL: In other tutorials, we've introduced psql (for writing SQL on the server) and DBeaver (on your laptop). 13 | 2. Jupyter notebooks: 14 | - In this tutorial, we will show how to set up ``Jupyter`` through a browser on your local machine. 15 | - Many Python IDEs (such as VSCode and PyCharm) have good Jupyter support - feel free to use one of these! 16 | 3. Share code with your team: 17 | - Use the git command line interface to push to your team's GitHub repository. 18 | - Many IDEs (including VSCode) have git integration. 19 | 4. Run code: 20 | - Run Python code manually in an SSH terminal, either by pasting code into a Python REPL or running a Python script. 21 | - Some IDEs (such as VSCode) support remote interpreters, allowing you to run scripts in a Python instance on a remote machine (here, the course server). 22 | 23 | 24 | **Agenda:** 25 | 1. Using VSCode for remote development 26 | 2. Using Jupyter remotely, with SSH tunneling 27 | 3. Navigating the course server using the command line 28 | 4. Remote development concepts - how exactly does all of this work? 29 | 30 | ## Remote development with VSCode 31 | 32 | ### Why VSCode over SSH? 33 | 34 | For the first assignment, many people had trouble running their code on the course server. We heard a lot of questions, like "how do I run code saved on my laptop on the course server?" 35 | 36 | This section will introduce one convenient workflow for developing code on the remote server. 37 | 38 | VSCode is an IDE that provides a lot of useful tools for developing Python, including autocomplete, syntax highlighting, support for virtual environments, and shortcuts to run Python files. 39 | 40 | With the VSCode SSH extension, VSCode can access code and other files stored on a remote computer. Furthermore, it can run any code stored on the remote machine. 41 | 42 | ![](img/vscode-remote-diagram.png) 43 | 44 | This has several advantages: 45 | - You don't have to keep any code stored on your local computer - you only need one copy, stored on the course server. 46 | - You don't have to copy code between your computer and the course server. Instead, VSCode lets you edit files where they're stored on the course server. 47 | - VSCode makes it convenient to run code stored on the course server. When you're developing this way, you'll always have access to the database and your group's virtual environment. 48 | 49 | **Note**: This workflow isn't required - it's just one "good enough" approach that we think many of you will find convenient. Please feel free to use other workflows if you're already set up and comfortable with them. 50 | 51 | ### Configuring VSCode SSH 52 | 1. [Download and install](https://code.visualstudio.com/Download) VSCode 53 | 2. Install the `Remote - SSH` extension: 54 | 1. Press `ctrl+shift+x` (Linux/Windows) or `⌘+shift+x` (MacOS) to open the extensions menu 55 | 2. Search for and install `Remote - SSH` 56 | 57 | ![](img/vscode-remote-ssh-install.png) 58 | 59 | 3. 
At this time, also search for and install the Microsoft `Python` extension. 60 | 3. Configure our course server as an SSH host: 61 | 62 | With the SSH plugin installed, we can tell VSCode how to log into the server. In this step, we'll enter our connection string and save it in a file, making it easy to connect in the future. 63 | 64 | 1. Press `ctrl+shift+p` (Linux/Windows) or `⌘+shift+p` (MacOS) to open the command palette, and select `Remote-SSH: Connect to Host` 65 | 66 | ![](img/vscode-open-connect-to-host.png) 67 | 68 | 2. Select `Add New SSH Host...` 69 | 70 | ![](img/vscode-connect-to-host.png) 71 | 72 | 3. Enter `ssh -i {path to your private key} {andrewid}@mlpolicylab.dssg.io` 73 | 74 | ![](img/vscode-enter-login.png) 75 | 76 | 4. Select the first option to store your login config: 77 | 78 | ![](img/vscode-update-config.png) 79 | 80 | 4. Connect VSCode to the course server: 81 | 1. Connect to the CMU Full VPN 82 | 2. Press `ctrl+shift+p` (Linux/Windows) or `⌘+shift+p` (MacOS) to open the command palette, and select `Remote-SSH: Connect to Host` 83 | 84 | ![](img/vscode-open-connect-to-host.png) 85 | 86 | 3. Select the SSH config we just created: `mlpolicylab.dssg.io` 87 | 88 | ![](img/vscode-select-host.png) 89 | 90 | 4. Enter your private key passcode if VSCode prompts you to (it will open a box at the top of the screen). 91 | 92 | 5. You should be connected to the course server. This should be indicated at the bottom of your VSCode window: 93 | ![](img/vscode-ssh-connected.png) 94 | 95 | 5. Open a workspace folder: 96 | 97 | Now that VSCode is connected via SSH, you can browse all of the files and folders on the course server. In this step, we select a folder containing some code to edit and test. 98 | 99 | 1. Select the folder menu button 100 | 101 | ![](img/vscode-file-menu.png) 102 | 103 | 2. Select `Open Folder` 104 | 105 | ![](img/vscode-open-folder.png) 106 | 107 | 3. Select a folder to work in 108 | 109 | ![](img/vscode-select-folder.png) 110 | 111 | 6. Select your Python virtual environment: 112 | 113 | VSCode can be configured to automatically run Python code in a virtual environment. Here, we'll select and activate your group's virtual environment. 114 | 115 | 1. Press `ctrl+shift+p` (Linux/Windows) or `⌘+shift+p` (MacOS) to open the command palette, and select `Python: Select Interpreter` 116 | 117 | ![](img/vscode-select-interpreter.png) 118 | 119 | 2. Select `Enter interpreter path` 120 | 121 | ![](img/vscode-select-interpreter-path.png) 122 | 123 | 3. Select `Find...` 124 | 125 | ![](img/vscode-click-find.png) 126 | 127 | 4. Enter the path to the Python executable in your virtual environment: `/path/to/your/environment/bin/python`. 128 | 129 | If you're using your group's virtual environment, the path will be `/data/groups/{group_name}/dssg_env/bin/python` 130 | 131 | ![](img/vscode-enter-venv-path.png) 132 | 133 | 5. After a moment, your selected Python interpreter should be activated. This should be indicated at the bottom of your VSCode window: 134 | 135 | ![](img/vscode-changed-interpreter.png) 136 | 137 | 7. Run Python! 138 | 1. Open the folder menu and select a Python file (or press `ctrl+n` (Linux/Windows) or `⌘+n` (MacOS) to create a new one) 139 | 140 | ![](img/vscode-select-python.png) 141 | 142 | 2. Click the green "play" button at the top of your window. This starts a new terminal session, activates your virtual environment, and runs your Python code.
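Under the hood, this is roughly equivalent to invoking the interpreter you selected in step 6 on the open file. A minimal sketch of the manual equivalent, assuming your group's environment path from step 6 and an illustrative script name:

```bash
# Roughly what the "play" button runs: your selected interpreter on the open file
# ({group_name} and my_script.py are illustrative placeholders)
/data/groups/{group_name}/dssg_env/bin/python my_script.py
```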
143 | 144 | ![](img/vscode-run-python.png) 145 | 146 | ## Remote development with Jupyter 147 | 148 | ### How's it work? 149 | 150 | Conceptually, this is similar to how VSCode works over SSH: 151 | - The remote machine (our course server) hosts a Jupyter notebook server that handles things like loading files, running Python, and activating virtual environments 152 | - Your web browser connects to that server and presents a frontend interface for opening, editing, and running notebooks 153 | - These connect using SSH (inside the CMU VPN) 154 | 155 | ### Setting it up 156 | 1. Connect to the CMU VPN 157 | 2. Connect to the course server using SSH 158 | 3. Find an open port on the course server to send your Jupyter traffic through: 159 | 1. In the terminal (on the course server), type `ss -lntu`. This will list all ports currently in use 160 | 2. Pick a port number between 1024 and 65535 that is NOT on that list. 161 | 162 | ![](img/jupyter-port-selection.png) 163 | (numbers in this box are ports currently in use) 164 | 165 | 4. On the course server, start your notebook server: 166 | 1. In the server terminal (inside SSH), run `jupyter notebook --no-browser --port {your port from step 3}` 167 | 2. When the server starts, take note of the token printed in the server terminal output: 168 | 169 | ![](img/jupyter-token.png) 170 | (the token is printed multiple times) 171 | 5. On your local machine, set up an SSH tunnel. This will allow your web browser (on your local computer) to reach your Jupyter notebook server (on the course server): 172 | 1. In a **local** terminal (not via SSH), type `ssh -i {path to your private key} -N -L localhost:8888:localhost:{your port from step 3} {andrewid}@mlpolicylab.dssg.io` 173 | 2. If you use PuTTY, you'll need to follow a different set of steps. [Here's a tutorial for that](https://docs.bitnami.com/bch/faq/get-started/access-ssh-tunnel/). Enter `8888` in the `Source port` field. In `Destination`, enter `localhost:{your port from step 3}` 174 | 6. Open the notebook on your local machine: 175 | 1. Open a web browser and navigate to http://localhost:8888. If that doesn't work, try: 176 | - http://0.0.0.0:8888/ 177 | - http://127.0.0.1:8888/ 178 | 2. If this is your first time opening Jupyter, this should take you to a login page asking you to enter the token generated in step 4.2. Enter that token to proceed. 179 | 3. In the next screen (which should be a view of the folders and files in your working directory): 180 | - To create a new notebook, click the `New` dropdown, and select your group's name. This will create a new notebook using your group's virtual environment. 181 | 182 | ![](img/jupyter_kernel.png) 183 | - Double-click an existing notebook to open it. Inside, navigate to `Kernel` -> `Change kernel` -> select your group's name. This will ensure the open notebook runs with your group's virtual environment 184 | 185 | ![](img/jupyter-notebook-kernel.png) 186 | 187 | ## Living in the command line 188 | 189 | ### Some key Linux concepts 190 | 191 | #### Linux Paths 192 | 193 | **Absolute paths**: 194 | 195 | An absolute path is a path that starts at a system's root directory. 196 | 197 | 198 | For example, the command `pwd` will print the absolute path to your current directory: 199 | 200 | ![](img/bash-pwd.png) 201 | 202 | To refer to a location using an absolute path, specify your path starting with a `/` 203 | 204 | Absolute paths are as unambiguous as possible. However, they're not as convenient as... 
205 | 206 | **Relative paths** 207 | 208 | A relative path specifies the path to some folder or file, *relative to* the current location. 209 | 210 | To use a relative path, specify a path *not* starting with a `/` 211 | 212 | An example: 213 | - I start in `/home/adunmore/mlforpublicpolicylab` 214 | - I use `cd project` (note: doesn't start with `/`) 215 | - I've changed directories to `/home/adunmore/mlforpublicpolicylab/project` 216 | 217 | ![](img/bash-absolute-path.png) 218 | 219 | **The home directory** 220 | 221 | In Linux, each user has a "home directory". This is the default directory a user enters upon login. 222 | 223 | You can access your home directory with the command `cd ~`. 224 | 225 | You can also specify absolute paths in a similar way: 226 | - My home directory is `/home/adunmore` 227 | - I can access the folder `mlforpublicpolicylab` stored in my home directory with `cd ~/mlforpublicpolicylab` 228 | 229 | #### Anatomy of a Linux command 230 | 231 | Linux commands share a basic syntax. Let's take a look at one to see how it works: 232 | 233 | ``` 234 | ls -l --human-readable ./mlforpublicpolicylab 235 | ``` 236 | 237 | This command contains four parts: 238 | 239 | `ls`: This is the name of the command we're running. `ls` is a utility that lists the files and folders present in a directory. The command name is always the part that comes first. 240 | 241 | `-l` & `--human-readable`: Both of these are options. Options are used to change the behavior of a command. Options usually start with one or two dashes (one dash for single-character options, two for longer options). 242 | 243 | `-l` tells `ls` to give detailed descriptions of all the files it lists (including size and permissions). `--human-readable` tells `ls` to print file sizes in easy-to-read units (like `1.5K` or `2.3G`) rather than raw byte counts. 244 | 245 | `./mlforpublicpolicylab`: This is the argument. Here, it's a relative path to the folder that we're telling `ls` to list the contents of. Most Linux commands take an argument - often text, or a file or folder to operate on. 246 | 247 | ![](img/bash-anatomy.png) 248 | 249 | #### Getting help 250 | 251 | Linux makes it easy to get help with a command: 252 | 253 | ``` 254 | man {command} 255 | ``` 256 | 257 | Opens the manual page for the command in question. Many commands also offer a help menu accessible with `{command} --help` 258 | 259 | ### Some key command line tools 260 | 261 | At first, it can be tough to do basic things like browsing folders or editing text in the command line. But Linux includes a lot of helpful tools for these kinds of tasks. In this section, we'll show how to use some of these tools to get around the terminal. 262 | 263 | Follow along by executing the commands on the numbered lines. 264 | 265 | 1. Connect to the course server with SSH (if you aren't already) 266 | 267 | 268 | **Getting oriented:** 269 | 270 | Let's start by getting our bearings inside of the filesystem. 271 | 272 | First, let's figure out where we are, with `pwd`: 273 | 274 | `pwd` prints the **absolute path** of the current working directory. 275 | 276 | 2. Print your current working directory: `pwd` 277 | 278 | Next, let's find out what's in our current directory, with `ls`: 279 | 280 | ```bash 281 | ls {some_folder (by default, the working directory)} 282 | ``` 283 | lists the files in a directory. 284 | 285 | 3. List the files in your home directory: `ls` 286 | 287 | **Making files** 288 | 289 | Let's start doing some work. 
Start by using `mkdir` to make a new directory: 290 | 291 | ```bash 292 | mkdir {folder_name} 293 | ``` 294 | Creates a new folder 295 | 296 | 4. Make a new directory: `mkdir my_test_dir` 297 | 298 | Now, let's change into our new directory to do some work, with `cd`: 299 | 300 | ```bash 301 | cd {some path} 302 | ``` 303 | Changes the working directory 304 | 305 | 5. Move to your new directory: `cd my_test_dir` 306 | 307 | Make a new empty file with `touch`: 308 | 309 | ```bash 310 | touch {file_name} 311 | ``` 312 | Creates a new file 313 | 314 | 6. Make a new (empty) file: `touch a_test_file` 315 | 316 | **Editing text in the command line** 317 | 318 | Nano is a barebones text editor available on most Linux computers. While it's not as nice to use as something like VSCode, it's still quite convenient for making quick edits from the command line. 319 | 320 | Start Nano like any other command line tool: 321 | 322 | ```bash 323 | nano filename 324 | ``` 325 | 326 | ![](img/bash-nano.png) 327 | 328 | You should see something like this. The options along the bottom are keyboard shortcuts for controlling Nano. Here, `^` means `ctrl`. For example, `ctrl+x` exits Nano, and `ctrl+w` searches the open file. 329 | 330 | The top part of the screen is the editor. You can move your flashing cursor with your arrow keys. 331 | 332 | If you make changes and exit, Nano will display the following message, asking if you'd like to save. Press `y` to save, `n` to exit without saving, or `ctrl+c` to cancel and continue editing. 333 | 334 | ![](img/bash-nano-save.png) 335 | 336 | 337 | **Let's try it out:** 338 | 339 | 7. Open the file you created in step 6 with `nano`, and put some text in it: 340 | 1. `nano a_test_file` 341 | 2. Type something you learned in this tech session 342 | 3. Press `ctrl+x`, then `y` to save and exit 343 | 344 | Let's use `cat` to make sure our changes worked: 345 | 346 | ```bash 347 | cat {filename} 348 | ``` 349 | 350 | Prints the contents of a file (works best with text-based files) 351 | 352 | 8. Print the contents: `cat a_test_file` 353 | 354 | **Moving files** 355 | 356 | Let's learn some tools for manipulating existing files. 357 | 358 | Let's start by copying our text file, with `cp`: 359 | 360 | ```bash 361 | cp {source} {destination} 362 | ``` 363 | Copies the file at source to destination. 364 | 365 | 9. Make a copy of your file, named "another_one": `cp a_test_file another_one` 366 | 367 | Now, let's move that new file, with `mv`: 368 | 369 | ```bash 370 | mv {source} {destination} 371 | ``` 372 | Moves the file or folder at source to destination. 373 | 374 | 10. Move the copy to your home directory: `mv another_one ~/` 375 | 376 | Finally, let's delete that file with `rm` (turns out we didn't need it after all): 377 | 378 | ```bash 379 | rm {file} 380 | ``` 381 | Removes (deletes!) a file 382 | 383 | 11. Remove the copied file: `rm ~/another_one` 384 | 385 | ### Background tasks with screen 386 | 387 | In this class, you'll often want to run long-running jobs in the terminal. However, by default, any tasks left running when you log out of SSH will be closed. 388 | 389 | We can get around this with a Linux utility called `screen`. Screen is a "terminal multiplexer": it allows you to run multiple terminal sessions and keep them active even after you've logged off. 390 | 391 | Screen allows us to start a process (like a long-running Python script), put it in the background, and log off without cancelling the script. 392 | 393 | **Running `screen`** 394 | 395 | 1. 
Log into the course server with SSH 396 | 2. Open a new screen session: 397 | 398 | ``` 399 | $ screen 400 | ``` 401 | 402 | You should see a screen with information about `screen` (licensing, a plea for free beer, etc.). Press enter to bypass this. This will open a fresh terminal session, with your terminal history cleared out. 403 | 404 | 3. Verify that you're in a screen session by listing the open sessions owned by your account: 405 | 406 | ``` 407 | $ screen -ls 408 | >There is a screen on: 409 | > 18855.pts-44.ip-10-0-1-213 (09/30/20 18:32:05) (Attached) 410 | >1 Socket in /run/screen/S-adunmore. 411 | ``` 412 | 413 | One session is listed. It's labeled as `(Attached)`, which means you're logged into it. 414 | 415 | 4. Let's give our system some work to do. Run the following command, which will start a useless but friendly infinite loop: 416 | 417 | ``` 418 | $ while :; do echo "howdy do!"; sleep 1; done 419 | ``` 420 | 421 | Note that at this point, you could safely log off of `ssh`. Your loop would still be here when you logged back on. 422 | 423 | 5. Now that your screen session is busy, let's go back to our default session to get some work done. 424 | 425 | Press `ctrl+a`, release those keys, and press `d`. 426 | 427 | You should return to your original terminal prompt. 428 | 429 | 6. Check that your screen session is still there: run `screen -ls` to list open sessions again. This time, the single open session should be labeled as `(Detached)`, which means that you're not viewing it. 430 | 431 | Note the 5-digit number printed at the beginning of the line referring to your screen session. We'll use that number to log back into that session. 432 | 433 | 7. Let's return to our session and kill that loop - we don't need it anymore. 434 | 435 | We'll use `screen -r`. This reattaches the named screen. Use the 5-digit number from step 6 to refer to that session: 436 | 437 | ``` 438 | screen -r {screen session number} 439 | ``` 440 | 441 | You should now be back in your old terminal session, where that loop has been "howdy"-ing away. 442 | 443 | Press `ctrl+c` to close that loop. 444 | 445 | 8. Now we can close this screen session. Simply type `exit` in the command line. 446 | 447 | This should kill our session and return us to the command prompt. If you'd like, confirm that your session is closed with `screen -ls`. 448 | 449 | **Some notes:** 450 | 451 | - You can name your session with the `-S` flag: 452 | 453 | ``` 454 | $ screen -S some_name 455 | ``` 456 | 457 | Once you've assigned a name, you can use it to reattach your screen sessions, which is easier than remembering/looking up a number. 458 | 459 | - You can use `screen` (and any of the utilities introduced here) in your VSCode terminal. Just press `ctrl+c` to exit your Python session (if you're in one), and you'll be able to enter these commands just like a regular terminal session. 460 | 461 | ## Understanding the 10718 remote workflow 462 | 463 | ### Your machine is a client 464 | 465 | You can think of your machine as a "client" in our system. This is because it doesn't do much of the computational heavy lifting. Rather, it views data stored in the database, uses utilities running on the server, and edits and runs code in the server's environment. 466 | 467 | ### SSH tunnelling 468 | 469 | Since our projects involve sensitive, personal data, we keep the course server and database inside of a secure network hosted by Amazon Web Services. The course database and server are the only computers on the network. 
They cannot talk to computers outside of the network, with two exceptions: 470 | - The course server can access the web (e.g., to download files from a website or query the census.gov API) 471 | - The course server accepts SSH connections through the CMU VPN 472 | 473 | External computers cannot otherwise connect directly to the course server. 474 | 475 | We can use SSH to get inside this network. We use SSH in two main ways: 476 | - We use SSH to access the course server terminal. We can use this to access files stored on the server, and run programs like `python`, `psql`, `nano`, etc. 477 | - We use SSH to open tunnels through the course server, to the course database. An SSH tunnel allows a client computer (e.g., your laptop) to connect securely to any application accessible from a remote server (e.g., our course server). For example: 478 | - We run Jupyter notebook servers on the course server. We can use an SSH tunnel to open hosted notebooks on our local computers 479 | - The course server can connect to the course database. We can use an SSH tunnel to allow local applications like DBeaver to connect to the course database, via the course server. 480 | 481 | Interested in a deeper dive? Here's an article on [SSH tunneling](https://www.ssh.com/ssh/tunneling/). 482 | 483 | 484 | 485 | ![](img/10718-workflow.png) 486 | 487 | **A diagram illustrating the class architecture.** 488 | -------------------------------------------------------------------------------- /techhelp/sklearn.md: -------------------------------------------------------------------------------- 1 | # scikit-learn 2 | 3 | ### [Video: Quick intro to sklearn](https://youtu.be/QQvoSyqy3G4) 4 | ### [Video: Models and hyperparameters in sklearn](https://youtu.be/t-_0yjjDre4) -------------------------------------------------------------------------------- /techhelp/tech_session_1_initial_setup.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/techhelp/tech_session_1_initial_setup.pdf -------------------------------------------------------------------------------- /techhelp/tech_session_2_git_sql.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/MLinPractice/ac54d312971ae78147a2dcc76d549f666d3fc60b/techhelp/tech_session_2_git_sql.pdf -------------------------------------------------------------------------------- /techhelp/tech_session_template.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | projectid, 3 | 4 | total_price_excluding_optional_support, 5 | students_reached, 6 | 7 | school_charter::INT AS charter, 8 | school_magnet::INT AS magnet, 9 | school_year_round::INT AS year_round, 10 | school_nlns::INT AS nlns, 11 | school_kipp::INT AS kipp, 12 | school_charter_ready_promise::INT AS charter_ready, 13 | 14 | CASE WHEN resource_type = 'Books' THEN 1 ELSE 0 END AS resource_books, 15 | CASE WHEN resource_type = 'Technology' THEN 1 ELSE 0 END AS resource_tech, 16 | CASE WHEN resource_type = 'Supplies' THEN 1 ELSE 0 END AS resource_supplies, 17 | 18 | CASE WHEN poverty_level = 'highest poverty' THEN 1 ELSE 0 END AS poverty_highest, 19 | CASE WHEN poverty_level = 'high poverty' THEN 1 ELSE 0 END AS poverty_high, 20 | CASE WHEN poverty_level IN ('moderate poverty', 'low poverty') THEN 1 ELSE 0 END AS poverty_lower 21 | 22 | FROM {table_schema}.{table_name} 23 | WHERE 24 | school_state IN 
({state_list}) 25 | AND 26 | date_posted BETWEEN '{start_dt}'::DATE AND '{end_dt}'::DATE 27 | ; 28 | --------------------------------------------------------------------------------