├── .all-contributorsrc ├── .github └── workflows │ └── book.yml ├── .gitignore ├── .vscode ├── ltex.hiddenFalsePositives.en-GB.txt └── settings.json ├── README.md ├── coursebook ├── README.md ├── __init__.py ├── _config.yml ├── _toc.yml ├── figures │ ├── 4.1_1 │ ├── 4.1_1.svg │ ├── 4.1_2 │ ├── 4.1_2.svg │ ├── logo.png │ ├── m1 │ │ ├── actions.jpg │ │ ├── agile.jpg │ │ ├── ai.png │ │ ├── atlas_ai.png │ │ ├── big_data.png │ │ ├── branch_names.png │ │ ├── chicken_egg.jpeg │ │ ├── cleaning.jpeg │ │ ├── cloud.jpeg │ │ ├── collection.png │ │ ├── cs.png │ │ ├── data_oil.jpeg │ │ ├── ddl.jpeg │ │ ├── digitisation.jpeg │ │ ├── ds_lifecycle.png │ │ ├── ds_selling.png │ │ ├── dsg.jpeg │ │ ├── engage.jpeg │ │ ├── fourth_para.jpg │ │ ├── free_lunch.jpg │ │ ├── funnel.png │ │ ├── gantt.jpeg │ │ ├── gitflow.svg │ │ ├── grenfell.jpeg │ │ ├── imagenet.jpeg │ │ ├── impact.png │ │ ├── kanban_example.png │ │ ├── lock.jpeg │ │ ├── market_cap.jpeg │ │ ├── matrix.png │ │ ├── missing_data.jpeg │ │ ├── monitor.png │ │ ├── mvp.jpeg │ │ ├── newspapers.jpeg │ │ ├── odi.png │ │ ├── power.jpeg │ │ ├── pr_add_comments.png │ │ ├── pr_commit_id.png │ │ ├── pr_example.png │ │ ├── problem.png │ │ ├── propublica.png │ │ ├── pyramid_of_needs.png │ │ ├── question.jpeg │ │ ├── reg.png │ │ ├── review.png │ │ ├── roc.png │ │ ├── scope_creep_meaning.png │ │ ├── scrum.png │ │ ├── task.png │ │ ├── theory.jpg │ │ └── twitter.jpeg │ ├── m2 │ │ ├── black_and_white_smiley.png │ │ ├── graph_data.png │ │ └── grayscale_smiley.png │ └── m4 │ │ ├── fig_data_process.png │ │ └── fig_generative_model.png ├── index.md └── modules │ ├── m1 │ ├── 1.1-WhatIsDataScience.ipynb │ ├── 1.2-DataScienceProjectLifecycle.ipynb │ ├── 1.3-EDIForDataScience.ipynb │ ├── 1.4-CollaborationAndReproducibility.ipynb │ ├── hands-on.ipynb │ └── overview.ipynb │ ├── m2 │ ├── 2.1-GettingLoading.ipynb │ ├── 2.1.1-WhereToFindData.ipynb │ ├── 2.1.2-LegalityAndEthics.ipynb │ ├── 2.1.3-PandasIntro.ipynb │ ├── 2.1.4-DataSourcesAndFormats.ipynb │ ├── 
2.1.5-ControllingAccess.ipynb │ ├── 2.2-DataWrangling.ipynb │ ├── 2.2.1-DataConsistency.ipynb │ ├── 2.2.2-ModifyingColumnsAndIndices.ipynb │ ├── 2.2.3-FeatureEngineering.ipynb │ ├── 2.2.4-DataManipulation.ipynb │ ├── 2.2.4.1-TimeAndDateData.ipynb │ ├── 2.2.4.2-TextData.ipynb │ ├── 2.2.4.3-CategoricalData.ipynb │ ├── 2.2.4.4-ImageData.ipynb │ ├── 2.2.5-PrivacyAndAnonymisation.ipynb │ ├── 2.2.6-LinkingDatasets.ipynb │ ├── 2.2.7-MissingData.ipynb │ ├── data │ │ ├── ANSUR_II_FEMALE_Public.csv │ │ ├── README.md │ │ ├── flower.png │ │ ├── lter_penguins.png │ │ ├── pcs_2017.csv │ │ ├── pcs_download.py │ │ ├── penguins.csv │ │ ├── penguins_make_messy.py │ │ ├── penguins_original.csv │ │ ├── smiley_16.png │ │ ├── urban_population.csv │ │ └── urban_population_header.csv │ ├── hands-on-complete.ipynb │ ├── hands-on.ipynb │ ├── handson_utils.py │ └── overview.ipynb │ ├── m3 │ ├── 3.1-FiguresGoneWrong.ipynb │ ├── 3.2-RulesOfTheGame.ipynb │ ├── 3.3-Atlas0fVisualisations.ipynb │ ├── 3.4-StoryTelling.ipynb │ ├── 3.5-DataVisForExploration.ipynb │ ├── data │ │ ├── 2020-president.csv │ │ ├── US_state_regions_code.csv │ │ ├── data_2021-Aug-01.csv │ │ └── us-covid-share-fully-vaccinated.csv │ ├── figures │ │ └── proportions_table.png │ ├── hands-on.ipynb │ └── overview.ipynb │ └── m4 │ ├── 4.1-WhatAndWhy.ipynb │ ├── 4.2-ModelFitting.ipynb │ ├── 4.3-ModelBuilding.ipynb │ ├── 4.4-ModelEvaluation.ipynb │ ├── data │ └── UKDA-7724-csv │ │ ├── csv │ │ ├── eqls_2007.csv │ │ ├── eqls_2007and2011.csv │ │ └── eqls_2011.csv │ │ ├── mrdoc │ │ ├── UKDA │ │ │ └── UKDA_Study_7724_Information.htm │ │ ├── allissue │ │ │ ├── eqls_2007_ukda_data_dictionary.rtf │ │ │ ├── eqls_2007and2011_ukda_data_dictionary.rtf │ │ │ └── eqls_2011_ukda_data_dictionary.rtf │ │ ├── excel │ │ │ ├── 7724_file_information_csv.csv │ │ │ ├── eqls_api_map.csv │ │ │ └── eqls_concordance_grid.xlsx │ │ └── pdf │ │ │ └── 7724_eqls_2007-2011_user_guide_v2.pdf │ │ └── read7724.htm │ ├── hands-on.ipynb │ └── overview.ipynb ├── 
documentation ├── delivery_tips.md └── developer_instructions.md ├── poetry.lock ├── pyproject.toml └── requirements.txt /.all-contributorsrc: -------------------------------------------------------------------------------- 1 | { 2 | "projectName": "rds-course", 3 | "projectOwner": "alan-turing-institute", 4 | "repoType": "github", 5 | "repoHost": "https://github.com", 6 | "files": [ 7 | "README.md" 8 | ], 9 | "imageSize": 100, 10 | "commit": false, 11 | "contributors": [ 12 | { 13 | "login": "jack89roberts", 14 | "name": "Jack Roberts", 15 | "avatar_url": "https://avatars.githubusercontent.com/u/16308271?v=4", 16 | "profile": "https://github.com/jack89roberts", 17 | "contributions": [ 18 | "code", 19 | "content", 20 | "ideas", 21 | "question", 22 | "talk", 23 | "bug", 24 | "review", 25 | "infra", 26 | "projectManagement", 27 | "tutorial", 28 | "eventOrganizing", 29 | "research", 30 | "maintenance" 31 | ] 32 | }, 33 | { 34 | "login": "AoifeHughes", 35 | "name": "Aoife Hughes", 36 | "avatar_url": "https://avatars.githubusercontent.com/u/10923695?v=4", 37 | "profile": "https://aoifehughes.github.io", 38 | "contributions": [ 39 | "code", 40 | "content", 41 | "ideas", 42 | "question", 43 | "bug", 44 | "review", 45 | "tutorial", 46 | "maintenance" 47 | ] 48 | }, 49 | { 50 | "login": "callummole", 51 | "name": "Callum Mole", 52 | "avatar_url": "https://avatars.githubusercontent.com/u/22677759?v=4", 53 | "profile": "https://github.com/callummole", 54 | "contributions": [ 55 | "code", 56 | "content", 57 | "ideas", 58 | "question", 59 | "talk", 60 | "bug", 61 | "review", 62 | "infra", 63 | "projectManagement", 64 | "tutorial", 65 | "eventOrganizing", 66 | "research", 67 | "doc", 68 | "maintenance" 69 | ] 70 | }, 71 | { 72 | "login": "crangelsmith", 73 | "name": "Camila Rangel Smith", 74 | "avatar_url": "https://avatars.githubusercontent.com/u/11162074?v=4", 75 | "profile": "https://github.com/crangelsmith", 76 | "contributions": [ 77 | "code", 78 | "content", 79 | "ideas", 
80 | "question", 81 | "talk", 82 | "bug", 83 | "review", 84 | "projectManagement", 85 | "tutorial", 86 | "eventOrganizing", 87 | "fundingFinding", 88 | "research" 89 | ] 90 | }, 91 | { 92 | "login": "lannelin", 93 | "name": "James Bishop", 94 | "avatar_url": "https://avatars.githubusercontent.com/u/26149456?v=4", 95 | "profile": "https://github.com/lannelin", 96 | "contributions": [ 97 | "code", 98 | "content", 99 | "ideas", 100 | "talk", 101 | "review", 102 | "tutorial", 103 | "fundingFinding", 104 | "question", 105 | "infra", 106 | "research" 107 | ] 108 | }, 109 | { 110 | "login": "fedenanni", 111 | "name": "Federico Nanni", 112 | "avatar_url": "https://avatars.githubusercontent.com/u/8415204?v=4", 113 | "profile": "https://github.com/fedenanni", 114 | "contributions": [ 115 | "code", 116 | "content", 117 | "ideas", 118 | "question", 119 | "talk", 120 | "bug", 121 | "review", 122 | "projectManagement", 123 | "tutorial", 124 | "eventOrganizing", 125 | "fundingFinding", 126 | "research" 127 | ] 128 | }, 129 | { 130 | "login": "gmingas", 131 | "name": "Greg Mingas", 132 | "avatar_url": "https://avatars.githubusercontent.com/u/1958852?v=4", 133 | "profile": "https://github.com/gmingas", 134 | "contributions": [ 135 | "code", 136 | "content", 137 | "ideas", 138 | "question", 139 | "talk", 140 | "bug", 141 | "review", 142 | "projectManagement", 143 | "tutorial", 144 | "eventOrganizing", 145 | "fundingFinding", 146 | "research" 147 | ] 148 | }, 149 | { 150 | "login": "helendduncan", 151 | "name": "Helen Duncan", 152 | "avatar_url": "https://avatars.githubusercontent.com/u/46891265?v=4", 153 | "profile": "https://github.com/helendduncan", 154 | "contributions": [ 155 | "code", 156 | "content", 157 | "bug", 158 | "review", 159 | "maintenance" 160 | ] 161 | }, 162 | { 163 | "login": "edwardchalstrey1", 164 | "name": "Ed Chalstrey", 165 | "avatar_url": "https://avatars.githubusercontent.com/u/5486164?v=4", 166 | "profile": "http://edchalstrey.com/", 167 | "contributions": 
[ 168 | "code", 169 | "question", 170 | "bug", 171 | "talk", 172 | "tutorial" 173 | ] 174 | }, 175 | { 176 | "login": "ChristinaLast", 177 | "name": "Christina Last", 178 | "avatar_url": "https://avatars.githubusercontent.com/u/36204574?v=4", 179 | "profile": "https://christinalast.com/", 180 | "contributions": [ 181 | "code", 182 | "tutorial", 183 | "infra", 184 | "content", 185 | "ideas" 186 | ] 187 | }, 188 | { 189 | "login": "radka-j", 190 | "name": "Radka Jersakova", 191 | "avatar_url": "https://avatars.githubusercontent.com/u/29207091?v=4", 192 | "profile": "https://github.com/radka-j", 193 | "contributions": [ 194 | "review", 195 | "bug", 196 | "code" 197 | ] 198 | }, 199 | { 200 | "login": "ots22", 201 | "name": "ots22", 202 | "avatar_url": "https://avatars.githubusercontent.com/u/5434836?v=4", 203 | "profile": "https://github.com/ots22", 204 | "contributions": [ 205 | "review", 206 | "bug", 207 | "code", 208 | "tutorial", 209 | "question" 210 | ] 211 | }, 212 | { 213 | "login": "triangle-man", 214 | "name": "James Geddes", 215 | "avatar_url": "https://avatars.githubusercontent.com/u/1172905?v=4", 216 | "profile": "https://github.com/triangle-man", 217 | "contributions": [ 218 | "tutorial", 219 | "question" 220 | ] 221 | }, 222 | { 223 | "login": "pwochner", 224 | "name": "pwochner", 225 | "avatar_url": "https://avatars.githubusercontent.com/u/78024695?v=4", 226 | "profile": "https://github.com/pwochner", 227 | "contributions": [ 228 | "tutorial", 229 | "question" 230 | ] 231 | }, 232 | { 233 | "login": "pafoster", 234 | "name": "Peter Foster", 235 | "avatar_url": "https://avatars.githubusercontent.com/u/9533784?v=4", 236 | "profile": "https://github.com/pafoster", 237 | "contributions": [ 238 | "review" 239 | ] 240 | }, 241 | { 242 | "login": "nbarlowATI", 243 | "name": "nbarlowATI", 244 | "avatar_url": "https://avatars.githubusercontent.com/u/33832774?v=4", 245 | "profile": "https://github.com/nbarlowATI", 246 | "contributions": [ 247 | "review", 248 | 
"question", 249 | "bug", 250 | "talk", 251 | "tutorial" 252 | ] 253 | }, 254 | { 255 | "login": "kasra-hosseini", 256 | "name": "Kasra Hosseini", 257 | "avatar_url": "https://avatars.githubusercontent.com/u/1899856?v=4", 258 | "profile": "https://github.com/kasra-hosseini", 259 | "contributions": [ 260 | "ideas", 261 | "question", 262 | "review", 263 | "projectManagement", 264 | "tutorial", 265 | "eventOrganizing", 266 | "fundingFinding", 267 | "research" 268 | ] 269 | }, 270 | { 271 | "login": "jemrobinson", 272 | "name": "James Robinson", 273 | "avatar_url": "https://avatars.githubusercontent.com/u/3502751?v=4", 274 | "profile": "https://github.com/jemrobinson", 275 | "contributions": [ 276 | "question", 277 | "talk", 278 | "tutorial", 279 | "projectManagement", 280 | "eventOrganizing" 281 | ] 282 | }, 283 | { 284 | "login": "LouiseABowler", 285 | "name": "Louise Bowler", 286 | "avatar_url": "https://avatars.githubusercontent.com/u/25640708?v=4", 287 | "profile": "https://github.com/LouiseABowler", 288 | "contributions": [ 289 | "ideas", 290 | "projectManagement", 291 | "fundingFinding", 292 | "research" 293 | ] 294 | }, 295 | { 296 | "login": "LydiaFrance", 297 | "name": "Lydia France", 298 | "avatar_url": "https://avatars.githubusercontent.com/u/85945427?v=4", 299 | "profile": "https://github.com/LydiaFrance", 300 | "contributions": [ 301 | "review", 302 | "question", 303 | "bug", 304 | "talk", 305 | "tutorial", 306 | "userTesting" 307 | ] 308 | }, 309 | { 310 | "login": "dsj976", 311 | "name": "David Salvador Jasin", 312 | "avatar_url": "https://avatars.githubusercontent.com/u/57944311?v=4", 313 | "profile": "https://github.com/dsj976", 314 | "contributions": [ 315 | "question", 316 | "tutorial" 317 | ] 318 | }, 319 | { 320 | "login": "KatrionaGoldmann", 321 | "name": "Katriona Goldmann", 322 | "avatar_url": "https://avatars.githubusercontent.com/u/25952322?v=4", 323 | "profile": "https://katrionagoldmann.github.io/", 324 | "contributions": [ 325 | "question", 
326 | "tutorial" 327 | ] 328 | }, 329 | { 330 | "login": "craddm", 331 | "name": "Matt Craddock", 332 | "avatar_url": "https://avatars.githubusercontent.com/u/5796417?v=4", 333 | "profile": "https://github.com/craddm", 334 | "contributions": [ 335 | "question", 336 | "bug", 337 | "review", 338 | "projectManagement", 339 | "tutorial", 340 | "eventOrganizing" 341 | ] 342 | }, 343 | { 344 | "login": "mhauru", 345 | "name": "Markus Hauru", 346 | "avatar_url": "https://avatars.githubusercontent.com/u/5229876?v=4", 347 | "profile": "https://github.com/mhauru", 348 | "contributions": [ 349 | "question", 350 | "tutorial" 351 | ] 352 | }, 353 | { 354 | "login": "rchan26", 355 | "name": "Ryan Chan", 356 | "avatar_url": "https://avatars.githubusercontent.com/u/44200705?v=4", 357 | "profile": "https://rchan26.github.io/", 358 | "contributions": [ 359 | "question", 360 | "tutorial", 361 | "userTesting" 362 | ] 363 | }, 364 | { 365 | "login": "rwood-97", 366 | "name": "Rosie Wood", 367 | "avatar_url": "https://avatars.githubusercontent.com/u/72076688?v=4", 368 | "profile": "https://github.com/rwood-97", 369 | "contributions": [ 370 | "question", 371 | "tutorial" 372 | ] 373 | }, 374 | { 375 | "login": "mastoffel", 376 | "name": "martin", 377 | "avatar_url": "https://avatars.githubusercontent.com/u/7348440?v=4", 378 | "profile": "https://scholar.google.de/citations?user=58uLFHoAAAAJ&hl=en&oi=ao", 379 | "contributions": [ 380 | "question", 381 | "tutorial" 382 | ] 383 | }, 384 | { 385 | "login": "lbokeria", 386 | "name": "Levan Bokeria", 387 | "avatar_url": "https://avatars.githubusercontent.com/u/7816766?v=4", 388 | "profile": "https://github.com/lbokeria", 389 | "contributions": [ 390 | "question", 391 | "talk" 392 | ] 393 | }, 394 | { 395 | "login": "dlpbc", 396 | "name": "Ese", 397 | "avatar_url": "https://avatars.githubusercontent.com/u/17943860?v=4", 398 | "profile": "https://github.com/dlpbc", 399 | "contributions": [ 400 | "question", 401 | "tutorial" 402 | ] 403 | } 404 | 
], 405 | "contributorsPerLine": 7, 406 | "skipCi": true, 407 | "commitConvention": "angular" 408 | } 409 | -------------------------------------------------------------------------------- /.github/workflows/book.yml: -------------------------------------------------------------------------------- 1 | name: deploy-book 2 | 3 | # Only run this when the master branch changes 4 | on: 5 | push: 6 | branches: 7 | - develop 8 | 9 | # This job installs dependencies, build the book, and pushes it to `gh-pages` 10 | jobs: 11 | deploy-book: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v2 15 | 16 | # Install dependencies 17 | - name: Set up Python 3.9 18 | uses: actions/setup-python@v1 19 | with: 20 | python-version: 3.9 21 | 22 | - name: Install dependencies 23 | run: | 24 | sudo apt-get install graphviz 25 | pip install . 26 | # Build the book 27 | - name: Build the book 28 | run: | 29 | jupyter-book build coursebook 30 | # Push the book's HTML to github-pages 31 | - name: GitHub Pages action 32 | uses: peaceiris/actions-gh-pages@v3.5.9 33 | with: 34 | github_token: ${{ secrets.GITHUB_TOKEN }} 35 | publish_dir: coursebook/_build/html 36 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | coursebook/_build/* 2 | *checkpoints 3 | data/* 4 | .DS_Store 5 | node_modules/ 6 | package-lock.json 7 | package.json 8 | *.sh 9 | -------------------------------------------------------------------------------- /.vscode/ltex.hiddenFalsePositives.en-GB.txt: -------------------------------------------------------------------------------- 1 | {"rule":"PASSIVE_VOICE","sentence":"^\\QThere can be an increased risk of identification of a person/entity when two datasets are linked.\\E$"} 2 | -------------------------------------------------------------------------------- /.vscode/settings.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "editor.fontSize": 20, 3 | "editor.rulers": {}, 4 | "editor.renderLineHighlight": "line", 5 | "workbench.colorCustomizations": { 6 | "editor.lineHighlightBorder": "#fff" 7 | } 8 | } -------------------------------------------------------------------------------- /coursebook/README.md: -------------------------------------------------------------------------------- 1 | # Coursebook 2 | 3 | We are using [jupyter-book](https://jupyterbook.org/intro.html) to develop our course. We will be using mix of markdown and jupyter notebook files, with binder providing a reproducible interactive computing environment. 4 | 5 | Course file structure: 6 | 7 | ``` 8 | coursebook/ 9 | - figures/ 10 | - modules/ 11 | - m1/ 12 | - files... 13 | - m2/ 14 | - files... 15 | - m3/ 16 | - files... 17 | - m4/ 18 | - files... 19 | - _config.yml 20 | - _toc.yml 21 | requirement.txt 22 | welcome.md 23 | ``` 24 | 25 | ## Adding to the book: 26 | 27 | - Add a markdown or jupyter notebook file to the corresponding module. You must have one top-level header (`#`), this will be the page link text in the book. 28 | ``` 29 | coursebook/ 30 | - figures/ 31 | - modules/ 32 | - m1/ 33 | - ethics_and_datascience.md 34 | ... 35 | ``` 36 | 37 | - Add the file to the table of contents `_toc.yml`. Do not include the file extension. 38 | ``` 39 | format: jb-book 40 | root: welcome 41 | parts: 42 | - caption: "Module 1: Introduction" 43 | chapters: 44 | - file: modules/m1/ethics_and_datascience 45 | - caption: "Module 2: Handling data and deployment" 46 | ... 47 | 48 | ``` 49 | 50 | - Test the build locally. 51 | - install `jupyter-book`. Recommended to use a fresh environment to avoid errors related to [package versions](https://github.com/executablebooks/jupyter-book/issues/1394). 52 | - `pip install jupyter-book`; `conda install -c conda-forge jupyter-book`. 
53 | - Build the book with `jupyter-book build coursebook`, this will generate html files in `coursebook/_build/html/`. 54 | - Inspect the book by opening `coursebook/_build/html/index.html` in your browser. 55 | - If you want to rebuild after changes, use `jupyter-book clean coursebook` to empty the `_build` directory. 56 | 57 | - You don't need to commit the html files to github. When you push to `develop` the book will automatically be rebuilt, with the html pages deployed to `gh-pages`. 58 | -------------------------------------------------------------------------------- /coursebook/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/rds-course/057949109330f3b33e0077208bd1fdcd76cbdb79/coursebook/__init__.py -------------------------------------------------------------------------------- /coursebook/_config.yml: -------------------------------------------------------------------------------- 1 | # Book settings 2 | # Learn more at https://jupyterbook.org/customize/config.html 3 | 4 | title: Research Data Science 5 | author: Research Engineering Group, The Alan Turing Institute 6 | logo: "./figures/logo.png" 7 | 8 | # Force re-execution of notebooks on each build. 
9 | # See https://jupyterbook.org/content/execute.html 10 | execute: 11 | execute_notebooks: force 12 | exclude_patterns: 13 | - '3.5-*' 14 | - '4.3-*' 15 | - '4.4-*' 16 | - '*hands-on*' 17 | 18 | exclude_patterns: [ 19 | '*README.md' 20 | ] 21 | 22 | only_build_toc_files: true 23 | 24 | 25 | # Define the name of the latex output file for PDF builds 26 | latex: 27 | latex_documents: 28 | targetname: rds-course.tex 29 | 30 | # added for plotly: https://jupyterbook.org/interactive/interactive.html?highlight=plotly 31 | sphinx: 32 | config: 33 | html_js_files: 34 | - https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.4/require.min.js 35 | 36 | # Information about where the book exists on the web 37 | repository: 38 | url: https://github.com/alan-turing-institute/rds-course # Online location of your book 39 | path_to_book: "coursebook" # Optional path to your book, relative to the repository root 40 | branch: develop # Which branch of the repository should be used when creating links (optional) 41 | 42 | # Add GitHub buttons to your book 43 | # See https://jupyterbook.org/customize/config.html#add-a-link-to-your-repository 44 | html: 45 | use_issues_button: true 46 | use_repository_button: true 47 | 48 | ############################################################### 49 | # Launch button settings 50 | launch_buttons: 51 | notebook_interface : "classic" # The interface interactive links will activate ["classic", "jupyterlab"] 52 | binderhub_url : "https://mybinder.org" # The URL of the BinderHub (for example, https://mybinder.org) 53 | jupyterhub_url : "" # The URL of the JupyterHub (for example, https://datahub.berkeley.edu) 54 | thebe : true # Add a thebelab button to pages (requires the repository to run on Binder) -------------------------------------------------------------------------------- /coursebook/_toc.yml: -------------------------------------------------------------------------------- 1 | # Table of contents 2 | # Learn more at 
https://jupyterbook.org/customize/toc.html 3 | 4 | 5 | # Each entry has the following schema: 6 | # 7 | # - title: mytitle # Title of chapter or subchapter 8 | # file: myfile # filename relative to the folder. 9 | # sections: # Contains a list of more entries that make up the chapter's subchapters 10 | # 11 | # Below are some special values that trigger specific behavior: 12 | # - header: My Header # Will insert a header with no link in the sidebar 13 | 14 | 15 | format: jb-book 16 | root: index 17 | parts: 18 | - caption: "Module 1: Introduction to Data Science" 19 | chapters: 20 | - file: modules/m1/overview 21 | - file: modules/m1/1.1-WhatIsDataScience 22 | - file: modules/m1/1.2-DataScienceProjectLifecycle 23 | - file: modules/m1/1.3-EDIForDataScience 24 | - file: modules/m1/1.4-CollaborationAndReproducibility 25 | - file: modules/m1/hands-on 26 | - caption: "Module 2: Handling data" 27 | chapters: 28 | - file: modules/m2/overview 29 | # getting data 30 | - file: modules/m2/2.1-GettingLoading 31 | sections: 32 | - file: modules/m2/2.1.1-WhereToFindData 33 | - file: modules/m2/2.1.2-LegalityAndEthics 34 | - file: modules/m2/2.1.3-PandasIntro 35 | - file: modules/m2/2.1.4-DataSourcesAndFormats 36 | - file: modules/m2/2.1.5-ControllingAccess 37 | # cleaning and wrangling 38 | - file: modules/m2/2.2-DataWrangling 39 | sections: 40 | - file: modules/m2/2.2.1-DataConsistency 41 | - file: modules/m2/2.2.2-ModifyingColumnsAndIndices 42 | - file: modules/m2/2.2.3-FeatureEngineering 43 | - file: modules/m2/2.2.4-DataManipulation 44 | sections: 45 | - file: modules/m2/2.2.4.1-TimeAndDateData 46 | - file: modules/m2/2.2.4.2-TextData 47 | - file: modules/m2/2.2.4.3-CategoricalData 48 | - file: modules/m2/2.2.4.4-ImageData 49 | - file: modules/m2/2.2.5-PrivacyAndAnonymisation 50 | - file: modules/m2/2.2.6-LinkingDatasets 51 | - file: modules/m2/2.2.7-MissingData 52 | # hands on 53 | - file: modules/m2/hands-on 54 | - file: modules/m2/hands-on-complete 55 | - caption: "Module 
3: Data Visualisation & Exploration" 56 | chapters: 57 | - file: modules/m3/overview 58 | - file: modules/m3/3.1-FiguresGoneWrong 59 | - file: modules/m3/3.2-RulesOfTheGame 60 | - file: modules/m3/3.3-Atlas0fVisualisations 61 | - file: modules/m3/3.4-StoryTelling 62 | - file: modules/m3/3.5-DataVisForExploration 63 | - file: modules/m3/hands-on 64 | - caption: "Module 4: Introduction to Modelling" 65 | chapters: 66 | - file: modules/m4/overview 67 | - file: modules/m4/4.1-WhatAndWhy 68 | - file: modules/m4/4.2-ModelFitting 69 | - file: modules/m4/4.3-ModelBuilding 70 | - file: modules/m4/4.4-ModelEvaluation 71 | - file: modules/m4/hands-on 72 | - caption: "Appendix" 73 | chapters: 74 | - file: modules/appendix/A.1-Glossary 75 | - file: modules/appendix/A.2-SettingUp 76 | - file: modules/appendix/A.3-ContactUs 77 | 78 | -------------------------------------------------------------------------------- /coursebook/figures/4.1_1: -------------------------------------------------------------------------------- 1 | digraph { 2 | graph [size="6,5"] 3 | rankdir=LR 4 | node [shape=doublecircle] 5 | Phenomenon 6 | node [shape=oval] 7 | Measurement 8 | "Sampling Process" 9 | "Data Generating Process" 10 | Phenomenon -> "Data Generating Process" 11 | Measurement -> "Data Generating Process" 12 | "Data Generating Process" -> "Sampling Process" 13 | "Sampling Process" -> Data 14 | Data [shape=star] 15 | } 16 | -------------------------------------------------------------------------------- /coursebook/figures/4.1_1.svg: -------------------------------------------------------------------------------- 1 | 2 | 4 | 6 | 7 | 9 | 10 | 11 | 12 | 13 | Phenomenon 14 | 15 | 16 | Phenomenon 17 | 18 | 19 | 20 | Data Generating Process 21 | 22 | Data Generating Process 23 | 24 | 25 | 26 | Phenomenon->Data Generating Process 27 | 28 | 29 | 30 | 31 | 32 | Measurement 33 | 34 | Measurement 35 | 36 | 37 | 38 | Measurement->Data Generating Process 39 | 40 | 41 | 42 | 43 | 44 | Sampling Process 45 | 
46 | Sampling Process 47 | 48 | 49 | 50 | Data 51 | 52 | Data 53 | 54 | 55 | 56 | Sampling Process->Data 57 | 58 | 59 | 60 | 61 | 62 | Data Generating Process->Sampling Process 63 | 64 | 65 | 66 | 67 | 68 | -------------------------------------------------------------------------------- /coursebook/figures/4.1_2: -------------------------------------------------------------------------------- 1 | digraph { 2 | graph [size="6,5"] 3 | rankdir=LR 4 | Data [shape=star] 5 | "Generative Model" [shape=oval] 6 | Inference [shape=oval] 7 | Prediction [shape=oval] 8 | Data -> "Generative Model" 9 | "Generative Model" -> Inference 10 | "Generative Model" -> Prediction 11 | } 12 | -------------------------------------------------------------------------------- /coursebook/figures/4.1_2.svg: -------------------------------------------------------------------------------- 1 | 2 | 4 | 6 | 7 | 9 | 10 | 11 | 12 | 13 | Data 14 | 15 | Data 16 | 17 | 18 | 19 | Generative Model 20 | 21 | Generative Model 22 | 23 | 24 | 25 | Data->Generative Model 26 | 27 | 28 | 29 | 30 | 31 | Inference 32 | 33 | Inference 34 | 35 | 36 | 37 | Generative Model->Inference 38 | 39 | 40 | 41 | 42 | 43 | Prediction 44 | 45 | Prediction 46 | 47 | 48 | 49 | Generative Model->Prediction 50 | 51 | 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /coursebook/figures/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/rds-course/057949109330f3b33e0077208bd1fdcd76cbdb79/coursebook/figures/logo.png -------------------------------------------------------------------------------- /coursebook/figures/m1/actions.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/rds-course/057949109330f3b33e0077208bd1fdcd76cbdb79/coursebook/figures/m1/actions.jpg 
-------------------------------------------------------------------------------- /coursebook/figures/m1/agile.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/rds-course/057949109330f3b33e0077208bd1fdcd76cbdb79/coursebook/figures/m1/agile.jpg -------------------------------------------------------------------------------- /coursebook/figures/m1/ai.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/rds-course/057949109330f3b33e0077208bd1fdcd76cbdb79/coursebook/figures/m1/ai.png -------------------------------------------------------------------------------- /coursebook/figures/m1/atlas_ai.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/rds-course/057949109330f3b33e0077208bd1fdcd76cbdb79/coursebook/figures/m1/atlas_ai.png -------------------------------------------------------------------------------- /coursebook/figures/m1/big_data.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/rds-course/057949109330f3b33e0077208bd1fdcd76cbdb79/coursebook/figures/m1/big_data.png -------------------------------------------------------------------------------- /coursebook/figures/m1/branch_names.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/rds-course/057949109330f3b33e0077208bd1fdcd76cbdb79/coursebook/figures/m1/branch_names.png -------------------------------------------------------------------------------- /coursebook/figures/m1/chicken_egg.jpeg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/alan-turing-institute/rds-course/057949109330f3b33e0077208bd1fdcd76cbdb79/coursebook/figures/m1/chicken_egg.jpeg -------------------------------------------------------------------------------- /coursebook/figures/m1/cleaning.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/rds-course/057949109330f3b33e0077208bd1fdcd76cbdb79/coursebook/figures/m1/cleaning.jpeg -------------------------------------------------------------------------------- /coursebook/figures/m1/cloud.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/rds-course/057949109330f3b33e0077208bd1fdcd76cbdb79/coursebook/figures/m1/cloud.jpeg -------------------------------------------------------------------------------- /coursebook/figures/m1/collection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/rds-course/057949109330f3b33e0077208bd1fdcd76cbdb79/coursebook/figures/m1/collection.png -------------------------------------------------------------------------------- /coursebook/figures/m1/cs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/rds-course/057949109330f3b33e0077208bd1fdcd76cbdb79/coursebook/figures/m1/cs.png -------------------------------------------------------------------------------- /coursebook/figures/m1/data_oil.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/rds-course/057949109330f3b33e0077208bd1fdcd76cbdb79/coursebook/figures/m1/data_oil.jpeg -------------------------------------------------------------------------------- /coursebook/figures/m1/ddl.jpeg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/rds-course/057949109330f3b33e0077208bd1fdcd76cbdb79/coursebook/figures/m1/ddl.jpeg -------------------------------------------------------------------------------- /coursebook/figures/m1/digitisation.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/rds-course/057949109330f3b33e0077208bd1fdcd76cbdb79/coursebook/figures/m1/digitisation.jpeg -------------------------------------------------------------------------------- /coursebook/figures/m1/ds_lifecycle.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/rds-course/057949109330f3b33e0077208bd1fdcd76cbdb79/coursebook/figures/m1/ds_lifecycle.png -------------------------------------------------------------------------------- /coursebook/figures/m1/ds_selling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/rds-course/057949109330f3b33e0077208bd1fdcd76cbdb79/coursebook/figures/m1/ds_selling.png -------------------------------------------------------------------------------- /coursebook/figures/m1/dsg.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/rds-course/057949109330f3b33e0077208bd1fdcd76cbdb79/coursebook/figures/m1/dsg.jpeg -------------------------------------------------------------------------------- /coursebook/figures/m1/engage.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/rds-course/057949109330f3b33e0077208bd1fdcd76cbdb79/coursebook/figures/m1/engage.jpeg 
-------------------------------------------------------------------------------- /coursebook/figures/m1/fourth_para.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/rds-course/057949109330f3b33e0077208bd1fdcd76cbdb79/coursebook/figures/m1/fourth_para.jpg -------------------------------------------------------------------------------- /coursebook/figures/m1/free_lunch.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/rds-course/057949109330f3b33e0077208bd1fdcd76cbdb79/coursebook/figures/m1/free_lunch.jpg -------------------------------------------------------------------------------- /coursebook/figures/m1/funnel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/rds-course/057949109330f3b33e0077208bd1fdcd76cbdb79/coursebook/figures/m1/funnel.png -------------------------------------------------------------------------------- /coursebook/figures/m1/gantt.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/rds-course/057949109330f3b33e0077208bd1fdcd76cbdb79/coursebook/figures/m1/gantt.jpeg -------------------------------------------------------------------------------- /coursebook/figures/m1/grenfell.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/rds-course/057949109330f3b33e0077208bd1fdcd76cbdb79/coursebook/figures/m1/grenfell.jpeg -------------------------------------------------------------------------------- /coursebook/figures/m1/imagenet.jpeg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/alan-turing-institute/rds-course/057949109330f3b33e0077208bd1fdcd76cbdb79/coursebook/figures/m1/imagenet.jpeg -------------------------------------------------------------------------------- /coursebook/figures/m1/impact.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/rds-course/057949109330f3b33e0077208bd1fdcd76cbdb79/coursebook/figures/m1/impact.png -------------------------------------------------------------------------------- /coursebook/figures/m1/kanban_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/rds-course/057949109330f3b33e0077208bd1fdcd76cbdb79/coursebook/figures/m1/kanban_example.png -------------------------------------------------------------------------------- /coursebook/figures/m1/lock.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/rds-course/057949109330f3b33e0077208bd1fdcd76cbdb79/coursebook/figures/m1/lock.jpeg -------------------------------------------------------------------------------- /coursebook/figures/m1/market_cap.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/rds-course/057949109330f3b33e0077208bd1fdcd76cbdb79/coursebook/figures/m1/market_cap.jpeg -------------------------------------------------------------------------------- /coursebook/figures/m1/matrix.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/rds-course/057949109330f3b33e0077208bd1fdcd76cbdb79/coursebook/figures/m1/matrix.png -------------------------------------------------------------------------------- /coursebook/figures/m1/missing_data.jpeg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/rds-course/057949109330f3b33e0077208bd1fdcd76cbdb79/coursebook/figures/m1/missing_data.jpeg -------------------------------------------------------------------------------- /coursebook/figures/m1/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/rds-course/057949109330f3b33e0077208bd1fdcd76cbdb79/coursebook/figures/m1/monitor.png -------------------------------------------------------------------------------- /coursebook/figures/m1/mvp.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/rds-course/057949109330f3b33e0077208bd1fdcd76cbdb79/coursebook/figures/m1/mvp.jpeg -------------------------------------------------------------------------------- /coursebook/figures/m1/newspapers.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/rds-course/057949109330f3b33e0077208bd1fdcd76cbdb79/coursebook/figures/m1/newspapers.jpeg -------------------------------------------------------------------------------- /coursebook/figures/m1/odi.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/rds-course/057949109330f3b33e0077208bd1fdcd76cbdb79/coursebook/figures/m1/odi.png -------------------------------------------------------------------------------- /coursebook/figures/m1/power.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/rds-course/057949109330f3b33e0077208bd1fdcd76cbdb79/coursebook/figures/m1/power.jpeg 
-------------------------------------------------------------------------------- /coursebook/figures/m1/pr_add_comments.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/rds-course/057949109330f3b33e0077208bd1fdcd76cbdb79/coursebook/figures/m1/pr_add_comments.png -------------------------------------------------------------------------------- /coursebook/figures/m1/pr_commit_id.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/rds-course/057949109330f3b33e0077208bd1fdcd76cbdb79/coursebook/figures/m1/pr_commit_id.png -------------------------------------------------------------------------------- /coursebook/figures/m1/pr_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/rds-course/057949109330f3b33e0077208bd1fdcd76cbdb79/coursebook/figures/m1/pr_example.png -------------------------------------------------------------------------------- /coursebook/figures/m1/problem.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/rds-course/057949109330f3b33e0077208bd1fdcd76cbdb79/coursebook/figures/m1/problem.png -------------------------------------------------------------------------------- /coursebook/figures/m1/propublica.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/rds-course/057949109330f3b33e0077208bd1fdcd76cbdb79/coursebook/figures/m1/propublica.png -------------------------------------------------------------------------------- /coursebook/figures/m1/pyramid_of_needs.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/alan-turing-institute/rds-course/057949109330f3b33e0077208bd1fdcd76cbdb79/coursebook/figures/m1/pyramid_of_needs.png -------------------------------------------------------------------------------- /coursebook/figures/m1/question.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/rds-course/057949109330f3b33e0077208bd1fdcd76cbdb79/coursebook/figures/m1/question.jpeg -------------------------------------------------------------------------------- /coursebook/figures/m1/reg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/rds-course/057949109330f3b33e0077208bd1fdcd76cbdb79/coursebook/figures/m1/reg.png -------------------------------------------------------------------------------- /coursebook/figures/m1/review.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/rds-course/057949109330f3b33e0077208bd1fdcd76cbdb79/coursebook/figures/m1/review.png -------------------------------------------------------------------------------- /coursebook/figures/m1/roc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/rds-course/057949109330f3b33e0077208bd1fdcd76cbdb79/coursebook/figures/m1/roc.png -------------------------------------------------------------------------------- /coursebook/figures/m1/scope_creep_meaning.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/rds-course/057949109330f3b33e0077208bd1fdcd76cbdb79/coursebook/figures/m1/scope_creep_meaning.png -------------------------------------------------------------------------------- /coursebook/figures/m1/scrum.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/rds-course/057949109330f3b33e0077208bd1fdcd76cbdb79/coursebook/figures/m1/scrum.png -------------------------------------------------------------------------------- /coursebook/figures/m1/task.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/rds-course/057949109330f3b33e0077208bd1fdcd76cbdb79/coursebook/figures/m1/task.png -------------------------------------------------------------------------------- /coursebook/figures/m1/theory.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/rds-course/057949109330f3b33e0077208bd1fdcd76cbdb79/coursebook/figures/m1/theory.jpg -------------------------------------------------------------------------------- /coursebook/figures/m1/twitter.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/rds-course/057949109330f3b33e0077208bd1fdcd76cbdb79/coursebook/figures/m1/twitter.jpeg -------------------------------------------------------------------------------- /coursebook/figures/m2/black_and_white_smiley.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/rds-course/057949109330f3b33e0077208bd1fdcd76cbdb79/coursebook/figures/m2/black_and_white_smiley.png -------------------------------------------------------------------------------- /coursebook/figures/m2/graph_data.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/rds-course/057949109330f3b33e0077208bd1fdcd76cbdb79/coursebook/figures/m2/graph_data.png 
-------------------------------------------------------------------------------- /coursebook/figures/m2/grayscale_smiley.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/rds-course/057949109330f3b33e0077208bd1fdcd76cbdb79/coursebook/figures/m2/grayscale_smiley.png -------------------------------------------------------------------------------- /coursebook/figures/m4/fig_data_process.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/rds-course/057949109330f3b33e0077208bd1fdcd76cbdb79/coursebook/figures/m4/fig_data_process.png -------------------------------------------------------------------------------- /coursebook/figures/m4/fig_generative_model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/rds-course/057949109330f3b33e0077208bd1fdcd76cbdb79/coursebook/figures/m4/fig_generative_model.png -------------------------------------------------------------------------------- /coursebook/index.md: -------------------------------------------------------------------------------- 1 | # Welcome! 2 | 3 | Welcome to an **Introduction to Research Data Science**, developed by [The Alan Turing Institute's](https://www.turing.ac.uk/) 4 | [Research Engineering Group](https://www.turing.ac.uk/research-engineering). 5 | 6 | 7 | ## Introduction 8 | 9 | Data science methods and tools have become commonplace in research projects across academia, government and industry. Researchers increasingly need to collaborate with multi-disciplinary teams of data scientists, software engineers and other stakeholders. 10 | 11 | The goal of this course is to introduce how you can use data science principles to tackle real, complex, and sometimes vaguely defined research data science projects. 
The course is not a handbook of data science methods. Rather, the focus is how to begin using these methods on collaborative research projects, with an emphasis an awareness of ethics and diversity issues. 12 | 13 | 14 | ## Who? 15 | 16 | **We are** a group of data scientists and software engineers that work on a wide range of research problems. 17 | 18 | **You are** someone interested in learning about, or using, data science methods in research. To completely follow along with the course some basic programming is needed, see [Prerequisites](#prerequisites) for more information. 19 | 20 | 21 | 22 | ## Course materials 23 | 24 | This free and open course is primarily the jupyter book you're reading. You can work through the material by yourself. See the [Syllabus](#syllabus). 25 | 26 | 27 | Some tips on **how to use this course**: 28 | 29 | - You will get a lot out of simply reading the online course book. However, the 30 | course is built by executable jupyter notebooks that you can run yourself, and 31 | we encourage learners to try the hands-on sections where we tackle a real 32 | research data science problem. Visit the [the readme](https://github.com/alan-turing-institute/rds-course/tree/develop/coursebook) page to setup your computer to 33 | follow along. 34 | 35 | - There are some benefits to reading the course chronologically. The same dataset is used throughout the modules, especially on the hands-on sessions. However, much of the material is self-contained and can be consumed independently. 36 | 37 | 38 | - If you are a self-learner and have questions, comments, ideas or issues please 39 | use: [RDS-Course Issues](https://github.com/alan-turing-institute/rds-course/issues) 40 | 41 | 42 | - There is also a synchronous, taught, version of the course, where modules are spread over a half-day taught session and a half-day hands-on session. 
43 | 44 | 45 | ## Syllabus 46 | 47 | **Module 1: Intro to Data Science** 48 | 49 | Taught session: 50 | - What data science and research data science are, overview of the variety of cultures within them. 51 | - Stages in a data science project and common issues when scoping a project. 52 | - Intro to EDI for data science. 53 | - How to work collaboratively in data science projects. 54 | 55 | Hands-on session: 56 | - Scope a research data science project using a real-world survey individual-level dataset, including discussion of research question and EDI issues and setting up a collaborative GitHub repo. 57 | 58 | **Module 2: Handling data** 59 | 60 | Taught session: 61 | - Data wrangling, cleaning and provenance. 62 | - Handling missing data. 63 | - Data access: SQL, APIs. 64 | - Data privacy and security. 65 | 66 | Hands-on session: 67 | - Explore, pre-process and clean the dataset from Module 1. Discuss and decode various complexities (e.g. missing/ambiguous values, bias in data collection, data privacy and sensitivity). 68 | 69 | **Module 3: Data visualisation** 70 | 71 | Taught session: 72 | - Figures gone wrong. 73 | - Rules of the data visualisation game. 74 | - Atlas of visualisations. 75 | - Storytelling with data visualisation. 76 | - Data visualisation for data exploration. 77 | 78 | Hands-on session: 79 | - Build visualisations to understand the dataset from Module 1 and 2 using material from the taught sessions, explore the relationships and importance of variables. 80 | 81 | **Module 4: Modeling** 82 | 83 | Taught session: 84 | - The what and why of Statistical Modeling 85 | - Inside a Model. 86 | - Building a Model. 87 | - Evaluating and Validating Models. 88 | 89 | Hands-on session: 90 | - Build your own model based on the knowledge acquired so far about the dataset and the techniques taught in this module. Improve upon baseline, interpret results to answer research questions and discuss limitations and alternative approaches. 
91 | 92 | 93 | ## Prerequisites 94 | 95 | There is no code in Module 1. Students will get more out of Modules 2-4 if they: 96 | 97 | - Are comfortable with basic Python, as presented in: 98 | - The [Introduction to Python](https://alan-turing-institute.github.io/rse-course/html/module01_introduction_to_python/index.html) module from the Turing's Research Software Engineering. 99 | - Software Carpentry's [Programming with Python](https://swcarpentry.github.io/python-novice-inflammation/). 100 | - Have some basic knowledge of using Git for version control, for example the Software Carpentry’s [Version Control with Git](https://swcarpentry.github.io/git-novice/) (Sections 1 to 4 and 7 to 9). 101 | - Have basic knowledge of using Github for collaboration. See the first two sections of the Turing Way’s Guide for Collaboration ([Getting Started in GitHub](https://the-turing-way.netlify.app/collaboration/github-novice.html) and [Maintainers and Reviewers in GitHub](https://the-turing-way.netlify.app/collaboration/maintain-review.html)). 102 | 103 | 104 | This course complements the [Turing’s Research Software Engineering with Python](https://alan-turing-institute.github.io/rse-course/) course. 105 | 106 | 107 | ## Disclaimer 108 | The work and materials here are developed by a group of \[research\] data 109 | scientists and software engineers from a diverse background. Many of the topics, 110 | examples and discussed work here is biased against our own experiences. As such, 111 | our definitions and understandings of certain words, phrases, or methodologies used 112 | may differ from others'. We do not claim to be a definitive authority, and 113 | welcome open discussion and feedback. 114 | 115 | 116 | ## Acknowledgement 117 | This work was supported by Wave 1 of The UKRI Strategic Priorities Fund under the EPSRC Grant EP/W006022/1 & The Alan Turing Institute. 
118 | 119 | -------------------------------------------------------------------------------- /coursebook/modules/m1/1.1-WhatIsDataScience.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"markdown","id":"f36c6e2a","metadata":{},"source":["# 1.1 What is \\[research\\] data science?\n","\n","We use the term **Data Science** to refer to a varied ensemble of practices, methodologies, and tools that may be used to learn from or about data. \n","\n","**Research Data Science** is a broader term, which we sometimes use to describe\n","the work done in the [research engineering group at Turing](https://www.turing.ac.uk/research/research-engineering/), it includes the additional challenges encountered when applying data\n","science in a collaborative research environment.\n","\n","**Data** is critical to data science. In this module we discuss:\n","- key considerations when faced with a dataset\n","- the main components of data science and the role that data science plays in research. \n","\n","\n","## Data\n","\n","The availability of large-scale datasets, for example for training face recognition algorithms or language models, is a fairly new phenomenon. Creating, refining and, making data consumable takes a substantial effort, and acquiring data remains complex (e.g., [digitisation](https://www.bl.uk/help/initiate-a-new-digitisation-project-with-the-british-library)). \n","\n","These processes should not be taken for granted. Two important considerations to keep in mind when faced with a dataset are how the data was **created**, and how **representative** the data is.\n","\n","### Data Creation\n","\n","Data is not a natural resource, it is always the product of human decisions and actions. Data is often the outcome of an enormous amount of labour, resources, infrastructure, and time. 
\n","\n","Example data creation tasks are:\n","- collecting information or the tracking of historical information\n","- organising information in specific categories\n","- measuring and storing information as data on digital infrastructure\n","\n","Whenever a dataset is enriched with metadata information or specific labels, keep in mind someone has provided those. Even if the labels you see are automatically assigned, the tool would have been trained on manual labels.\n","\n","For example, the famous ImageNet dataset, central component for the development of many well known image recognition pipelines relies on two pillars:\n","- a taxonomy developed since 1985 as part of the lexical database WordNet, which provides a top-down hierarchical structure of concepts (\"chair\" is under artifact->furnishing->furniture->seat->chair)\n","- an enormous amount of cheap workforce provided by Amazon Mechanical Turk.\n","\n","![imagenet](../../figures/m1/imagenet.jpeg)\n","
The above ImageNet image, taken from Crawford and Paglen's essay Excavating AI, is the result of a gigantic effort and the specific representation of the World of: 1) the people who have designed WordNet, 2) the researchers who have decided which WordNet categories are included and which are not in ImageNet, and 3) the many, many annotators who have selected which images to associate with concepts like \"brother\", \"boss\", \"debtor\", \"drug-addict\" and \"call girl\", all included both in WordNet and ImageNet (at least until 2019).
\n","\n","\n","\n","### Data Representativeness\n","\n","The data we have available is just a sample and not the complete story. The question we should ask ourselves is:\n"," - \"What can these data points tell us about the wider phenomenon that we're\n"," really interested in?\"\n","\n","In an article by [Anna Rogers](https://aclanthology.org/2021.acl-long.170.pdf),the author considers the following argument: \n","> “the size of the data is so large that, in fact, our training sets are not a sample at all, they are the entire data universe”. \n","\n","Rogers replies to it by saying that this argument would stand if the “data universe\" that we use for training for instance a speech recognition system was the same as “the totality of human speech/writing\". It is not, and will hopefully never be, because collecting all speech is problematic for ethical, legal, and practical reasons (e.g., [not all tweets are archived](https://www.businessinsider.com/library-of-congress-twitter-wont-archive-every-public-tweet-anymore-2017-12?r=US&IR=T)). \n","\n","Anything less than the whole data universe is a sample. Given the existing social structures, no matter how big that sample is, it is not representative due to (amongst other things) unequal access to technology, unequal possibility to defend one’s privacy and copyright, and limited access to the huge volumes of speech produced in the “walled garden\" platforms like Facebook. \n","\n"," \n","## Data Science\n","\n","Data science allows us to learn about data. However, terms commonly associated with data science - e.g., AI, deep learning - are only the tip of the iceberg. They rely on many steps that are briefly introduced here, and built upon throughout the course.\n","\n","![The data science hierarchy of needs](../../figures/m1/pyramid_of_needs.png)\n","
Monica Rogati's representation of the \"Data Science hierarchy of needs\".
\n"," \n","\n","### Collection and Storage\n","\n","Part of the work of a data scientist is knowing the challenges and hurdles involved in data collection and storage (e.g., our [Living with Machines](https://blogs.bl.uk/thenewsroom/2019/07/moving-from-a-newspaper-collection-to-a-news-collection.html) project).\n","\n","\n","It is essential that we know who owns the data, what restrictions apply, how a resource should be stored for long-term preservation and made available to collaborators (e.g., Turing's [Data Safe Haven](https://www.turing.ac.uk/research/research-projects/data-safe-havens-cloud)).\n","\n","In small team contexts' data scientists often take care directly of such responsibilities. These topics are explored more in [Module 2: Getting and Loading Data](../m2/2.1-GettingLoading.ipynb).\n"," \n"," \n","\n","### Wrangling \n","\n","Commonly, we need to alter a freshly obtained dataset to a more analysis-ready state. We might need to remove inconsistencies and inaccuracies (a process called [data cleaning](https://en.wikipedia.org/wiki/Data_cleansing)), combine data sources, or otherwise change the data into our desired format. \n","\n","These tasks can be collectively called [data wrangling](https://online.hbs.edu/blog/post/data-wrangling). Although often this task is the largest part of a data scientist's work it is often undervalued. We will explore this topic more in [Module 2: Exploring and Wrangling](../m2/2.2-DataWrangling.ipynb).\n","\n","### Exploring and Visualisation\n","\n","For many disciplines the availability of large datasets is unprecedented. This creates opportunity, but defining new research questions or business goals is complex. 
\n","\n","Understanding a dataset well can unlock opportunities for new insights and help specific research questions.\n","- For example, a dataset combining aggregate GPS activity and traffic monitoring data has been successfully and effectively used by two Turing projects: the [London Air Quality](https://www.turing.ac.uk/research/research-projects/london-air-quality) project, and [Project Odysseus](https://www.turing.ac.uk/research/research-projects/project-odysseus-understanding-london-busyness-and-exiting-lockdown), which monitored the level of activity during the pandemic.\n","\n","A powerful tool to develop understanding is data visualisation. This will be the focus of [Module 3](../m3/overview.ipynb). \n","\n","\n","### Modelling\n","\n","Modelling is the core activity of data scientists. \n","\n","Though, as we have seen, we might spend more time on other activities, the goals of modelling are present throughout. We build models with a specific goal in mind (more in [Module 4](../m4/overview.ipynb)). This begins with the project scoping process (see [1.2 Research Data Science Project Lifecycle](../m1/1.2-DataScienceProjectLifecycle.ipynb)), specifying a research question, task, and measures of success.\n","\n","When modelling we are often comparing what works \"best\" (which can also mean most reliably) in a given setting, rather than improving over a given start-of-the-art (which may be the job of an expert researcher).\n"]},{"cell_type":"markdown","id":"e16ede84","metadata":{},"source":["\n","## Data-Driven Science \n","\n","The availability of data and expanding toolbox of data science techniques is changing science. \n","\n","Even as far back as 2009 it was suggested a **fourth scientific paradigm** was emerging. \n","\n","\n","![fourth_para](../../figures/m1/fourth_para.jpg) \n","
As Bell, Hey and Szalay (2009) said in a famous article in Science, for a long\n","time
\"scientists have recognized experimental and theoretical science as the\n","basic research paradigms for understanding nature. In recent decades, computer\n","simulations have become an essential third paradigm...\" They continue to say that \"a fourth\n","paradigm is emerging, consisting of the techniques and technologies needed to\n","perform data-intensive science\".


\n","\n","\n","And one author even claimed the _end of theory_!\n","\n","![theory](../../figures/m1/theory.jpg) \n","
Chris Anderson's 2008 Wired article contained provocative statements like \"Petabytes allow us to say: ‘Correlation is enough. We can analyse the data without hypotheses about what it might show. We can throw the numbers into the biggest computing clusters the world has ever seen and let statistical algorithms find patterns where science cannot. Correlation supersedes causation, and science can advance even without coherent models, unified theories, or really any mechanistic explanation at all.\"

\n","\n","\n","Here, and in similar articles (e.g., [Kitchen, 2014](https://journals.sagepub.com/doi/10.1177/2053951714528481)), we see the following ideas:\n","- Big Data can capture a whole domain and provide full resolution.\n","- There is little need for a priori theory, models, or hypotheses.\n","- Through the application of agnostic data analytics the data can speak for themselves free of human bias or framing, and any patterns and relationships within Big Data are inherently meaningful and truthful;\n","- meaning transcends context or domain-specific knowledge, thus can be interpreted by anyone who can decode a statistic or data visualization.\n","\n"," \n","\n","### Developing a Critical Mindset\n","\n","In the intervening decade there has been a lot of discussion on the shortcomings on Big Data. Our perception of data science in society and research has changed. Our understanding of biases embedded within data has increased -- biases that are passed on to trained models. \n","\n","In this course we learn how to approach data, methods, and research questions in with a critical mindset. We aim to produce findings that are reliable, reproducible, and context-aware.\n","\n","\n","![fourth_para](../../figures/m1/atlas_ai.png) \n","
This Turing 'fireside chat' talks to Professor Kate Crawford about her book, Atlas of AI, which aims to expose how artificial intelligence is shaping society.
"]},{"cell_type":"markdown","id":"029513e1","metadata":{},"source":["## Research data scientist\n","\n","We have seen that there are many components to data science. As a data scientist, we will often:\n","- be the central element in the projects we are involved with, connecting data providers, domain experts and final users.\n","- ask \"why\" people want to use data science approaches. We need to be aware of limitations and the societal context of the research. \n","- contribute to shaping research directions and guaranteeing reproducibility.\n","\n","\n","![reg](../../figures/m1/reg.png) \n","\n","For a brief deep-dive on the role of a data scientist, consider our large-scale [Living with Machines](https://www.turing.ac.uk/research/research-projects/living-machines) project. A five-year study on the Industrial Revolution using data-driven approach with over twenty members, and currently five Turing Research Engineering Group members. Our responsibilities have involved:\n","\n","- Acquiring data, organising data for easy access, and classifying the level of sensitivity. \n","- Storage in a secure environment (Turing's [Data Safe Haven](https://www.turing.ac.uk/research/research-projects/data-safe-havens-cloud)) for work on copyright-protected collections\n","- Design and develop software to help deal with different data sources, for example:\n"," - [historical language models](https://github.com/Living-with-machines/histLM)\n"," - [fuzzy string matching](https://github.com/Living-with-machines/DeezyMatch)\n"," - [sampling resources](https://github.com/Living-with-machines/PressPicker_public)\n","- Contribute to research papers based on data science methods. 
Here we contribute both to shaping the research direction and to evaluating method performance:\n"," - [Living Machines: A study of atypical animacy](https://arxiv.org/pdf/2005.11140.pdf)\n"," - [Maps of a Nation?](https://academic.oup.com/jvc/article/26/2/284/6232245)\n","- Management, for example: planning tasks, leading subprojects, mentoring researchers, handling stake-holder expectations.\n"]},{"cell_type":"markdown","id":"f713458c","metadata":{},"source":["## References\n","\n","Rogers, A. (2021). Changing the World by Changing the Data. arXiv preprint arXiv:2105.13947.\n","\n","Anderson, C. (2008). The end of theory: The data deluge makes the scientific method obsolete. Wired magazine, 16(7), 16-07.\n","\n","Bell, G., Hey, T., & Szalay, A. (2009). Beyond the data deluge. Science, 323(5919), 1297-1298.\n","\n","Crawford, K. (2021). The Atlas of AI. Yale University Press.\n","\n","D'ignazio, C., & Klein, L. F. (2020). Data feminism. MIT press.\n","\n","Kitchin, R. (2014). Big Data, new epistemologies and paradigm shifts. Big data & society, 1(1).\n","\n","Wickham, H. (2014). Tidy data. 
Journal of statistical software, 59(1), 1-23.\n"]}],"metadata":{"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.10.4"},"vscode":{"interpreter":{"hash":"aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49"}}},"nbformat":4,"nbformat_minor":5} 2 | -------------------------------------------------------------------------------- /coursebook/modules/m1/1.4-CollaborationAndReproducibility.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"attachments":{},"cell_type":"markdown","id":"915e04fd","metadata":{},"source":["# 1.4 Collaboration and reproducibility\n","As a final part of this first module, we will discuss a series of best practices for collaborative coding in a research project. We will not focus on the different aspects of version control as these topics are extensively covered in other resources offered at the Turing, for instance as part of the [RSE Course](https://alan-turing-institute.github.io/rse-course/html/module04_version_control_with_git/index.html) and [The Turing Way](https://the-turing-way.netlify.app/reproducible-research/vcs/vcs-github.html). Instead we will go through a few topics from the software development community, which are important when doing data science collaboratively.\n","## Git Flow and Pull Requests\n","\n","When setting up a new project, and especially when you are working collaboratively, you might want to consider following a branching model such as Git Flow, where different branches have specific roles and it is clearly defined when and how they should interact. This will facilitate collaboration, testing and code reviews. 
Note that this is not the only type of workflow and [GitHub workflow](https://guides.github.com/introduction/flow/) is another established option.\n","\n","\n","![GitFlow](../../figures/m1/gitflow.svg)\n","\n"," \n","\n","### Main Branches\n","\n","\n","In Git Flow two branches will record the entire history of the project and have an infinite lifetime:\n","- **Main**: which contains the release history\n","- **Develop**: which is the integration branch for features.\n","\n","When creating a new project we suggest to create the Develop branch (Main will be there from the beginning) and to change the default branch to Develop.\n","\n"," \n","\n","### Feature Branch\n","\n","Every time a new feature is planned or a bug / problem needs to be addressed we suggest to first document it in a specific issue, which specifies the problem and the type of expected outcome. Remember that it is very important that issues are focused on a specific and contained task and clearly described. \n","\n","![GitFlow](../../figures/m1/branch_names.png)\n","\n"," \n","\n","If a collaborator would like to address this, they should open a dedicated feature branch (from Develop). On GitHub each issue is identified with a number and it is important to be consistent with naming conventions of branches to keep a consistent overview. For instance see the following example.\n","\n","A few important things for a person addressing an issue:\n","\n","1. Take your time to study the code and design an idea for addressing the problem, before starting the implementation\n","2. Some issues may request several steps. You should address each step in a different commit in your branch (with a clear descriptive comment). This is useful so that you can later point to a specific commit during the code review process\n","3. 
When you feel you are in a good stage and would like to receive feedback, you can open a pull request (even in draft mode, to signal that is still a work in progress)\n"," \n","\n","\n","### Pull Requests\n","\n","A Pull Request (PR) allows the possibility of merging the code back to the Develop branch. You can open it from the GitHub web interface as in the example below. Once a pull request is opened by a collaborator, you can discuss and review the potential changes with them and they can add follow-up commits if necessary before the changes are merged into Develop.\n","\n","![Pull Request example](../../figures/m1/pr_example.png)\n","\n"," \n","\n","Some important points for the person addressing an issue and opening the PR:\n","\n","1. Remember that your work doesn't need to be completely finished (especially for complex PRs), but you might have an initial workflow for a solution and now it's the time that someone else could look at your code.\n","2. Introduce your PR with a clear description, pointing to the issue this PR would close, the main component of your solution, the tests you have done, etc.\n","3. Try to put yourself in the reviewer's shoes: the solution you would like to review should be simple (not clever!), not too long and especially comprehensible. For instance, if you have added new code, spend time renaming variables so that they are consistent with the overall style.\n","\n","Some important points for the person reviewing the PR:\n","\n","1. First of all, remember to be nice and polite with your collaborator, it is stressful for anyone to have their code reviewed.\n","2. Read the issue the PR is planning to address\n","3. Go through the proposed solution, you will be able to see an overview of all modifications from the `Files changed` tab\n","4. When you have understood the proposed solution, checkout the branch of the PR and test the code\n","5. Ask questions if something is not clear. 
You can do this by adding the comment exactly at the line in the code you are referring to, from the `Files changed` tab. You could either add this as a single comment or `Start a review`. Start a review will hold all your comments, until you have finished your review and then show these to the Collaborator together with the overall feedback.\n","![Add specific comments](../../figures/m1/pr_add_comments.png)\n","6. Both the reviewer and the contributor will be able to point to specific commits by writing its SHA-hash, and it'll automatically get turned into a link (see the example below). \n","![Reference to a specific commit](../../figures/m1/pr_commit_id.png)\n","7. When you are satisfied with a change or a reply of a comment you can resolve it, so you will keep open only comments that still need to be resolved.\n","8. When you have finished you can add an overall feedback and your decision. Unless there are clear bugs that need a `Request changes`, most of the time a general `Comment` will give your collaborator the chance to discuss and address the feedback in conversation with you.\n","9. When all things have been addressed, if you are satisfied with the PR you can proceed by merging it and closing the dedicated branch.\n","\n","### Other types of branches\n","\n","Apart from dedicated feature branches and the long-running Develop and Main branch, there are other important branches in the Git Flow model:\n","\n","- **Release**: Once you are approaching a point when you would like to do a first public release of your code (for instance, when you are submitting a paper and you want to accompany it with the code to reproduce the experiments), you can create a dedicated release branch. Creating this branch starts a \"release cycle\", so no new features can be added after this point, only bug fixes, documentation generation, and other release-oriented tasks should go in this branch. 
Once it's ready, the release branch gets merged into main and tagged with a version number. In addition, it should be merged back into develop, which may have progressed since the release was initiated. Using a specific, dedicated branch to prepare releases makes it possible to polish the current release while others continue working on features for a future release. It also creates well-defined phases of development.\n","\n","- **Hotfix**: Maintenance or “hotfix” branches are used to quickly fix issues directly in the main branch. They are very similar to release branches and feature branches except they're based on main instead of develop. As soon as the fix is complete, it should be merged into both main and develop, and main should be tagged with an updated version number. \n","\n"," \n","## Agile Working\n","\n","- In software development \"Agile\" is a set of practices designed for improving the effectiveness of collaborative working, pivoting around flexibility. \n","- [Its principles](http://agilemanifesto.org/) include prioritizing individuals and interactions over processes and tools, and responding to change instead of blindly following preset plans. \n","- While we are not covering Agile principles as part of our course, we will discuss a few strategies that are important as part of a collaborative research data science work. \n","\n","![agile](../../figures/m1/agile.jpg)\n","[Image link](https://www.steelcase.com/research/articles/topics/agile/agile-working-new-ways/)\n","\n","### Scrum\n","In software development, Scrum is a lightweight framework built following Agile principles. 
It requires a \"Scrum Master\" to foster an environment where:\n","\n","- A person (the \"product owner\") orders a specific product\n","\n","- A team focuses on a specific aspect of the work during a Sprint.\n","\n","- The team and its stakeholders inspect the results and adjust for the next Sprint.\n","\n","- Repeat\n","\n","![scrum](../../figures/m1/scrum.png)\n","[Image link](https://www.wrike.com/scrum-guide/scrum-sprints/)\n","\n","\n"," \n","\n","The cadence of each iteration is very short, with Sprints of two to max four weeks, with clear start and finish dates. Sprints are punctuated by the sprint planning, sprint review, and retrospective meetings and peppered with daily scrum(stand-up) meetings. Such steps are lightweight and run on a continuous basis. \n","\n","When employing such concepts in a Research Data Science project, the focus will be on:\n","\n","1. Splitting the complex work that you want to achieve into small-scale specific tasks with a clear outcome. \n","2. Sprint planning with the researchers and domain experts (the equivalent of the \"product owner\") in short focused meetings (max 30 mins), to defining the goal of each sprint (which specific task you want to address) and the duration. Everyone needs to agree, so that expectations are aligned. \n","3. During the sprint it is important to have regular, short and effective checks with collaborators (as stand-ups meetings) and regular touch-base with the researchers to highlight: what has been done, what we are planing to do, which blockers we have found.\n","4. At the end of each sprint it is essential to discuss results, difficulties and to improve the planning of the next sprint.\n"," \n","\n","### Kanban board\n","- Kanban helps visualise your work, limit the amount of work-in-progress (WIP) and rapidly moving tasks from \"Doing\" to \"Done\". 
\n","- The core is a Kanban board, which is for instance available in GitHub under the tab \"Projects\" and you could use it to organise the issues and workflow related to a project and highlight the stage of a specific Sprint. \n","\n","![kanban](../../figures/m1/kanban_example.png)\n","\n"," \n","Kanban boards have three main components: Visual signals (the issues), columns (the basic structure is \"To Do\", \"In Progress\" and \"Done\") and work-in-progress limits. Depending on the type of project, input and output of the board will be defined (who creates the issues for instance and who tests that the contribution is what was expected). The most important aspect of a Kanban board is that is shows clearly how your team works and the stage of the project.\n"," \n","\n","## Continuous Integration\n","\n","- Having multiple people contributing to the same project makes detecting errors and debugging more complex\n","- You can automatize various types of check with Continuous Integration (CI)\n","- Since the end of 2018, GitHub offers \"Github actions\" for this\n","\n","![actions](../../figures/m1/actions.jpg)\n","\n"," \n","\n","Continuous integration (CI) is a software practice that requires frequently committing code, which you can continuously build and test to make sure that the commit doesn't introduce errors. Your tests can include code linters (which check style formatting), security checks, code coverage, functional tests, and other custom checks.\n","\n","However, building and testing your code requires a server. You can of course build and test updates locally before pushing code to a repository, but to be sure that your implementation works on different systems you can use a CI server that checks for new code commits in a repository. 
Since the end of 2018, GitHub has been offering what they call \"GitHub actions\", which are specific workflows that will be ran automatically on certain events (for instance at each commit or before merging a PR into Develop). It is important to understand that GitHub with these workflows (which are composed by multiple \"jobs\", such as building the code and then running a series of tests) is not providing you with the equivalent of a Virtual Machine where you can run your experiments. Instead, these actions are specific checks that your code works as expected, based on a series of predefined tests. To know more, check [the dedicated chapter](https://the-turing-way.netlify.app/reproducible-research/ci/ci-github-actions.html) in The Turing Way.\n","\n","\n"," \n","\n","\n","\n","## Bringing it all together\n","\n","In this final sub-module we have discussed three main components of collaborative coding: \n","- git-flow (and in particular the role of pull requests)\n","- agile working (focusing on Scrum and Kanban boards)\n","- continuous integration\n","\n","Such concepts are backbones of many Research Data Science and Research Software Engineering projects. When we set up a repository for a specific project (which has a clear outcome and defined milestones) we create a corresponding Kanban project board to monitor the overall development and the focus of each Sprint. This will help keep each collaborator (researchers, domain experts) in the loop during the Sprint. Every aspect of the project that we plan to address is clearly described in a specific issue and only a small number of issues are addressed, reviewed and integrated in parallel during each iteration. \n","\n","\n","## References\n","\n","Pirro, L. (2019). How agile project management can work for your research. 
NATURE.\n","\n","http://agilemanifesto.org/\n","https://docs.github.com/en/actions/guides/about-continuous-integration\n","https://www.atlassian.com/agile/kanban/kanban-vs-scrum\n","https://the-turing-way.netlify.app/reproducible-research/ci/ci-options.html\n","\n"]}],"metadata":{"interpreter":{"hash":"31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"},"kernelspec":{"display_name":"Python 3.6.9 64-bit","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.9.6"}},"nbformat":4,"nbformat_minor":5} 2 | -------------------------------------------------------------------------------- /coursebook/modules/m1/hands-on.ipynb: -------------------------------------------------------------------------------- 1 | {"cells": [{"cell_type": "markdown", "metadata": {}, "source": ["# Module 1: Hands-on session\n", "## Introduction\n", "In this hands-on session we will simulate a real-world scoping scenario:\n", "- Participants are divided in groups.\n", "- Each group will be a small data science team for the rest of the session. \n", "- Each team is paired with a helper who will act as a Principal Investigator (PI), i.e., a lead researcher who is seeking funding for a new project and wants the data science team to participate in it. \n", "- Each group will receive a short research proposal from the PI. \n", "- Teams need to go through the provided materials to conduct the scoping, including technical and ethical aspects. \n", "\n", "The activity consists of three phases:\n", "\n", "## Phase 1: Setup, initial contact and discussion \n", "\n", "### **Schedule**\n", "- 20 mins setup (in groups)\n", "- 35 mins collaborative activity (exploration of materials and discussion, in groups)\n", "\n", "****\n", "\n", "### **Teams should follow these steps during Phase 1:**\n", "1. 
Setup a GitHub repo for each group following the guidance in Lesson 1.4.\n", "\n", "2. Ensure that all participants have access to it.\n", "\n", "3. Prepare a scoping project board (decide the flow that better captures the scoping process you aim to conduct).\n", "\n", "4. Go through all the received materials (you can follow up with the PI if something is not clear from the beginning).\n", "\n", "5. To get a better understanding of the project, it might be necessary to:\n", "\n", " a. explore the dataset. \n", "\n", " b. examine the dataset's documentation. \n", "\n", " c. do an initial general literature search. To simplify this step, you can just look at [the paper](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3678208/) we used as an inspiration for the hands-on activities.\n", "\n", " We suggest to split your team into sub-groups to address these three points in parallel.\n", " \n", "6. Create issues in the repo covering the scoping questions you want to discuss with the PI (see Module 1.2). You should ideally have at least one issue dedicated to each of the questions in your project scoping board. Start filling them in based on your initial exploration and the information you received from the PI. You will have a chance to speak with the PI in the second phase.\n", "\n", "7. Additionally, if you notice anything interesting in the dataset that should be discussed during scoping you should make others in the group aware of it. You can:\n", "\n", " a. Create a new issue in your repo describing your observation or idea.\n", " \n", " b. or (if you wrote some code as part of your exploration) make a pull request to the main branch of the repo higlighting what you have found. \n", "\n", "****\n", "\n", "### **The initial PI contact**\n", "\n", ">15th of November 2021\n", ">\n", ">Subject: Request for collaboration\n", ">\n", ">\n", "> Dear Research Engineering Group,\n", "I am reaching out for scoping a potential collaboration. 
Social inequalities in health have been described across a range of European countries. While it is well-known in the literature that the higher the social class, the lower the prevalence/incidence of health problems, no study has attempted to explain social inequalities in health for Europe as a whole. To address this, I am setting up a project proposal for a large-scale study using promptly available data (European Quality of Life Time Series, freely available [online](https://ecommons.cornell.edu/handle/1813/87445)) and deep learning techniques. I envision a 2-year project answering the call \u201cPersonal Stories, Healing Nations\u201d employing 1 full-time Post-Doctoral researcher covering the social science parts of the study and (potentially) in collaboration with your team for the technological parts. We are hoping to submit by Dec 1st, so we would be keen to establish the costs for this digital component by Nov 28. \n", ">\n", "> While I am fairly new to big-data (and, I have to admit, I have my reservations), I believe a well-designed project with these sources might be able to rewrite our understanding of social inequalities in health in Europe (and even beyond) during the last two decaces, a period involving a series of major socio-economical and political events, \u00e7a va sans dire. Its impact will be relevant for the general public and could potentially even suggest actions to governments.\n", ">\n", ">\n", ">Your Sincerely,\n", ">\n", ">Professor J. Doe\n"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## Phase 2: Technical Questions (40 mins in group + 20 mins together)\n", "\n", "## **Schedule**\n", "- 40 mins collaborative activity (scoping, in groups)\n", "- 20 mins presentation (all together)\n", "\n", "****\n", "\n", "## **Teams should follow these steps during Phase 2:**\n", "1. 
After having explored the materials and prepared some initial questions in Phase 1, you can start an iterative conversation with the PI to reach an answer to the scoping questions questions listed below. This should consist of a series of discussions and further exploration of the data, any documentation you found and other literature. The PI will be available for you whenever you want during Phase 2.\n", "2. You should document your discussions in the GitHub project board, trying to answer the questions as clearly as possible. You can do this all together or by splitting your team into sub-groups. Use the issues you created in Phase 1 and create new ones when new ideas come up.\n", "3. After documenting the converstations and answers, one representative of the team will be asked to present your main conclusions in the common room.\n", "\n", "We list here some main scoping questions we presented in Lesson 1.2\n", "\n", "### The goal\n", "\n", "1. What is the broad challenge we are trying to solve?\n", "\n", "2. What is the specific research question? How does it translate to a data science problem?\n", "\n", "### The data\n", "\n", "3. Is data available? Can I legally use it? Is it appropriate for the research question?\n", "\n", "### The expectations\n", "\n", "4. What are the stakeholders' expectations?\n", "\n", "5. What is in-scope and out-of-scope?\n", "\n", "6. How does the output of the project look like and how is it going to be used?\n", "\n", "### Success\n", "\n", "7. What metrics do we use to measure the success of the project?\n", "\n", "8. What is the expected impact? Is it realistic?\n", "\n", "### Other questions\n", "\n", "9. 
What about computational resources and timelines?\n"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## Phase 3: Ethical Discussion (30 mins in groups, then 30 mins together)\n", "\n", "### **Schedule**\n", "- 40 mins collaborative activity (EDI discussion, in groups)\n", "- 20 mins presentation (all together)\n", "\n", "****\n", "\n", "### **Teams should follow these steps during Phase 3:**\n", "As a final step in the scoping activity, we want to examine questions relating to equality diversity and inclusion with the PI. Follow these steps:\n", "1. Discuss with the PI any EDI questions you might have about the project goals, the data, the impact or other topics you consider important or controversial. The PI will be available throughout Phase 3.\n", "2. You should document your discussions in the GitHub project board, similarly to Phase 2. You can do this all together or by splitting your team into sub-groups. \n", "3. After documenting the converstations and conclusions, one representative of the team will be asked to present your them in the common room.\n", "\n", "Think about the following questions as a starting point but please approach this discussion with an open and investigative spirit. We would love to hear your insights!\n", "- In this project we focus on the goal of studying health outcomes. Do the data contain the right type of information to do that (explore the dataset, the paper and the documentation)? \n", "- To what extent can we understand the relationship between health and social factors with this dataset? 
What claims can we make?\n", "- Given the importance of understanding and explaining health outcomes, what does the PI plan to do to share the information publicly and achive positive societal impact?\n", "- Is it important to make the work reproducible?"]}], "metadata": {"kernelspec": {"display_name": "Python 3.10.4 64-bit", "language": "python", "name": "python3"}, "language_info": {"codemirror_mode": {"name": "ipython", "version": 3}, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.4"}, "vscode": {"interpreter": {"hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49"}}, "widgets": {"application/vnd.jupyter.widget-state+json": {"state": {}, "version_major": 2, "version_minor": 0}}}, "nbformat": 4, "nbformat_minor": 4} -------------------------------------------------------------------------------- /coursebook/modules/m1/overview.ipynb: -------------------------------------------------------------------------------- 1 | {"cells": [{"cell_type": "markdown", "id": "0f06592c-5c2b-4579-89da-1cd8304da6f4", "metadata": {}, "source": ["# Overview\n", "\n", "Research Data Science (RDS) is an emerging term. It refers to the unique challenge of applying data science tools and techniques within a research setting. In this module you will learn what an RDS project is, and crucial considerations for inclusivity and collaboration.\n", "\n", "Module structure:\n", "\n", "- **What is Research Data Science**: A context-setting introduction.\n", "- **Research Data Science project lifecycle**: What makes a RDS project? \n", "- **Equality, Diversity and Inclusion in data science**: Data science is powerful, here we discuss how to be mindful of our applications.\n", "- **Collaboration and Reproducibility**: Tips for collaborating on code. 
\n", "- **Hands-on session**: Scope a research data science project using a real dataset.\n", "\n", "## References \n", "\n", "Parts of this chapter\n", "have been strongly influenced by the book [Data\n", "Feminism](https://data-feminism.mitpress.mit.edu/).\n"]}], "metadata": {"kernelspec": {"display_name": "Python 3.10.4 64-bit", "language": "python", "name": "python3"}, "language_info": {"name": "python", "version": "3.10.4"}, "vscode": {"interpreter": {"hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49"}}}, "nbformat": 4, "nbformat_minor": 5} -------------------------------------------------------------------------------- /coursebook/modules/m2/2.1-GettingLoading.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "a68fb716-ff6b-4aed-b914-439fa5061dc5", 6 | "metadata": {}, 7 | "source": [ 8 | "# 2.1 Getting and Loading Data\n", 9 | "\n", 10 | "- Open data and data sources\n", 11 | "- Licensing, ethics, security.\n", 12 | "- Pandas intro/primer\n", 13 | "- Data formats (CSV, database, API, image, …)\n", 14 | " - how to load them into Python with Pandas (mostly)." 
15 | ] 16 | } 17 | ], 18 | "metadata": { 19 | "kernelspec": { 20 | "display_name": "", 21 | "name": "" 22 | }, 23 | "language_info": { 24 | "name": "" 25 | } 26 | }, 27 | "nbformat": 4, 28 | "nbformat_minor": 5 29 | } 30 | -------------------------------------------------------------------------------- /coursebook/modules/m2/2.1.1-WhereToFindData.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "9fdbc4b2-c1e7-4bab-a8e9-063921026aac", 6 | "metadata": {}, 7 | "source": [ 8 | "# 2.1.1 Where to find data?\n", 9 | "\n", 10 | "## What is Open Data?\n", 11 | "\n", 12 | "We can use the definition from [The Turing\n", 13 | "Way](https://the-turing-way.netlify.app/reproducible-research/open/open-data.html):\n", 14 | "\n", 15 | "> Open data is freely available on the internet. Any user is permitted\n", 16 | "to download, copy, analyse, re-process, and re-use it for any other\n", 17 | "purpose with minimal financial, legal, and technical barriers.\n", 18 | "\n", 19 | "The benefits of using an open dataset include (but are not limited to):\n", 20 | "\n", 21 | "- Access to the data is free and usually fast (without requiring a\n", 22 | " registration/approval process).\n", 23 | "- Other researchers may have used the same dataset and published\n", 24 | " guidelines, code and other details about the dataset that are also\n", 25 | " in the public domain. 
So we can benefit and draw inspiration from\n", 26 | " their prior work.\n", 27 | "- Transparency & reproducibility: If the dataset we’re using is open\n", 28 | " we can make all our research (data, code & publications) open as\n", 29 | " well, so others can easily reproduce and contribute to our work.\n", 30 | "\n", 31 | "[The Open Data\n", 32 | "Handbook](https://opendatahandbook.org/guide/en/why-open-data/) has a\n", 33 | "longer discussion of the benefits of open data.\n", 34 | "\n", 35 | "Even if a dataset is public, we must still evaluate whether it is\n", 36 | "ethical to use it and its licensing/legal requirements - not all public\n", 37 | "datasets are completely “open”!\n", 38 | "\n", 39 | "## Sources of Open Data\n", 40 | "\n", 41 | "There are many sources of data online, below are a few ideas for places\n", 42 | "to look.\n", 43 | "\n", 44 | "### Tailored for Data Science\n", 45 | "\n", 46 | "Some datasets have been heavily used for data science applications, and\n", 47 | "are available in easy-to-use formats that may already be\n", 48 | "cleaned/pre-processed for you. Many popular machine learning libraries\n", 49 | "like PyTorch have datasets built-in, for example. 
These are excellent\n", 50 | "sources for finding datasets to learn, test and benchmark algorithms\n", 51 | "or other analysis techniques, but are less likely to be rich sources\n", 52 | "for novel research projects.\n", 53 | "\n", 54 | "- [Kaggle](https://www.kaggle.com/datasets)\n", 55 | "- [UCI Machine Learning\n", 56 | " Repository](https://archive.ics.uci.edu/ml/index.php)\n", 57 | "- [Google research](https://research.google/tools/datasets/)\n", 58 | "- [scikit-learn](https://scikit-learn.org/stable/datasets/toy_dataset.html),\n", 59 | " [Tensorflow](https://www.tensorflow.org/datasets),\n", 60 | " [PyTorch](https://pytorch.org/vision/stable/datasets.html)\n", 61 | "\n", 62 | "### Countries/Governments\n", 63 | "\n", 64 | "Many governments commit to publishing data in the open for public\n", 65 | "interest and transparency. The datasets might be less “data science\n", 66 | "ready”, but they cover a broad range of topics.\n", 67 | "\n", 68 | "- UK:\n", 69 | " - [data.gov.uk](https://data.gov.uk/)\n", 70 | " - [UK Data Service](https://www.ukdataservice.ac.uk/)\n", 71 | " - [Office for National Statistics (ONS)](https://www.ons.gov.uk/)\n", 72 | "- Other countries/regions:\n", 73 | " - [data.gov](https://www.data.gov/) (USA)\n", 74 | " - [data.europa.eu](https://data.europa.eu/en) (EU)\n", 75 | " - [africaopendata.org](https://africaopendata.org/) (Africa)\n", 76 | " - [dataportal.asia](https://dataportal.asia/home) (Asia)\n", 77 | "\n", 78 | "### Organisations\n", 79 | "\n", 80 | "Large humanitarian organisations often make data available, such as:\n", 81 | "\n", 82 | "- [World Bank](https://microdata.worldbank.org/index.php/home)\n", 83 | "- [World Health Organisation](https://www.who.int/data/collections)\n", 84 | "- [International Monetary Fund](https://www.imf.org/en/Data)\n", 85 | "\n", 86 | "### General\n", 87 | "\n", 88 | "General tools and repositories that contain data across many different\n", 89 | "domains:\n", 90 | "\n", 91 | "- [Google Dataset 
Search](https://datasetsearch.research.google.com/)\n", 92 | "- GitHub: Although large datasets can’t be stored on GitHub there are\n", 93 | " many smaller datasets to be found in GitHub repositories. There are\n", 94 | " also community-maintained lists of interesting datasets, e.g.,\n", 95 | " [awesome-public-datasets](https://github.com/awesomedata/awesome-public-datasets).\n", 96 | "- [Zenodo](https://zenodo.org/): Combined repository for open data,\n", 97 | " papers, and code.\n", 98 | "- [FAIRsharing](https://fairsharing.org/): A catalogue of databases\n", 99 | " across many different domains.\n", 100 | "\n", 101 | "## When Open Data Isn’t Available\n", 102 | "\n", 103 | "If you can start your project with an open dataset that’s always\n", 104 | "preferable. Even if the perfect dataset is not openly available it may\n", 105 | "be worth first prototyping with related data that is open. For example,\n", 106 | "it may be that an older version of the data you’re interested in has\n", 107 | "been made public. You can continue to explore options for getting the\n", 108 | "ideal data in parallel, but gaining data access is frequently an\n", 109 | "expensive or time-consuming process.\n", 110 | "\n", 111 | "Two common reasons open data may not be available or appropriate are:\n", 112 | "\n", 113 | "1. The data is commercially sensitive or valuable.\n", 114 | "2. The data presents a privacy or security risk.\n", 115 | "\n", 116 | "Access to detailed healthcare records, for example, is often heavily\n", 117 | "restricted even if personal identifiers have been removed due to the\n", 118 | "risk of re-identification. 
In August 2016 the Australian government\n", 119 | "openly published a de-identified medical billing dataset, but one month\n", 120 | "later [researchers at the University of\n", 121 | "Melbourne](https://arxiv.org/pdf/1712.05627.pdf) demonstrated it was\n", 122 | "possible to re-identify individuals and the data was taken offline.\n", 123 | "\n", 124 | "Options for finding a non-open dataset include:\n", 125 | "\n", 126 | "### Ask!\n", 127 | "\n", 128 | "Although open data may not be available for your project, a\n", 129 | "collaborator, someone else at your institute, company, or the wider\n", 130 | "community could have something relevant. However, even if they’re\n", 131 | "willing to share it, you must check what the conditions are for access\n", 132 | "and usage of the data and get advice where necessary. Always err on the\n", 133 | "side of caution, especially if any of the data relates to living\n", 134 | "individuals. Alternatively, you may find someone else that’s interested\n", 135 | "in the same data, and you could join forces (or budgets) to get it.\n", 136 | "\n", 137 | "### Paywalled/Restricted Access\n", 138 | "\n", 139 | "Getting access to a dataset that’s behind closed doors is likely to\n", 140 | "involve a registration or application process and may include a fee.\n", 141 | "Bear in mind that data can be *expensive*, and could easily cost\n", 142 | "thousands of pounds. 
If the application is approved, the resulting\n", 143 | "contract/ research agreement may specify precisely which data you can\n", 144 | "have access to (down to the level of individual fields), who will have\n", 145 | "access, the duration of access, and exactly what you’re allowed to do\n", 146 | "with it.\n", 147 | "\n", 148 | "As an example, [this\n", 149 | "website](https://www.ukbiobank.ac.uk/enable-your-research) describes the\n", 150 | "process for accessing the UK Biobank, a large biomedical database for\n", 151 | "health research.\n", 152 | "\n", 153 | "### Creating Your Own Dataset\n", 154 | "\n", 155 | "Ultimately the data you need might not be available anywhere, in which\n", 156 | "case the only option could be to collect it yourself. Designing datasets\n", 157 | "is not the focus of this course, but if you’re making your own remember\n", 158 | "you’ll be the one analysing it! Investing time in thinking about how\n", 159 | "your data will be structured and how you’ll deal with missing values and\n", 160 | "the many other issues common in datasets will save a lot of time later.\n", 161 | "You must also carefully consider whether it is ethical to collect the\n", 162 | "data and have approval from your organisation to do so.\n", 163 | "\n", 164 | "## Assessing Dataset Quality and Suitability\n", 165 | "\n", 166 | "In Module 1 ([Section 1.2](../m1/1.2-DataScienceProjectLifecycle),\n", 167 | "question 3 for scoping projects) we gave these overarching questions for\n", 168 | "evaluating a dataset:\n", 169 | "\n", 170 | "- Does the dataset contain what’s needed to solve the research question\n", 171 | " available?\n", 172 | "- Can I legally and ethically use the data?\n", 173 | "- Is the data easily accessible? 
- Is the dataset well-understood and\n", 174 | " tested?\n", 175 | "- Is data quality and quantity appropriate?\n", 176 | "\n", 177 | "Another useful concept here is [data readiness\n", 178 | "levels](https://arxiv.org/pdf/1705.02245.pdf):\n", 179 | "\n", 180 | "- **Band C: Accessibility**\n", 181 | " - C4: You believe the data may exist, but haven’t seen/verified that\n", 182 | " it does.\n", 183 | " - C1: The data is ready to be loaded into an analysis: You have access,\n", 184 | " it’s in an appropriate format, and you have (both ethical and legal)\n", 185 | " permission to use it.\n", 186 | "\n", 187 | "- **Band B: Faithfulness & Representation**\n", 188 | " - B1: The data has been used in an exploratory analysis, and you have\n", 189 | " verified the contents match what you expected. You understand any\n", 190 | " limitations (e.g. how missing values were treated).\n", 191 | "\n", 192 | "- **Band A: Data in Context**\n", 193 | " - A1: The data has been prepared and is suitable to answer a\n", 194 | " *specific* research question.\n", 195 | "\n", 196 | "In this module we cover many of the steps needed to take data from Band\n", 197 | "C to Band B: we start with data we know exists but don’t know how to\n", 198 | "analyse, and end with beginning an exploratory analysis.\n", 199 | "\n", 200 | "For evaluating all of these, **data documentation** is essential. See\n", 201 | "[The Turing\n", 202 | "Way](https://the-turing-way.netlify.app/reproducible-research/rdm/rdm-metadata.html)\n", 203 | "and this [Radboud University\n", 204 | "article](https://www.ru.nl/rdm/processing-data/documenting-data/) for\n", 205 | "details of what good documentation should contain.\n", 206 | "\n", 207 | "## Sharing Data\n", 208 | "\n", 209 | "If your project is working with, or has generated, a dataset consider\n", 210 | "whether you can publish it with an open license. The community can then\n", 211 | "benefit from all the advantages of open data we’ve talked about! 
It’s\n", 212 | "also becoming more common for funders and journals to *require* code and\n", 213 | "data to be published with papers.\n", 214 | "\n", 215 | "We won’t discuss this here but the [Sharing and Archiving\n", 216 | "Data](https://the-turing-way.netlify.app/reproducible-research/rdm/rdm-sharing.html)\n", 217 | "chapter in the Turing Way is a great place to start." 218 | ] 219 | } 220 | ], 221 | "metadata": { 222 | "kernelspec": { 223 | "display_name": "Python 3 (ipykernel)", 224 | "language": "python", 225 | "name": "python3" 226 | }, 227 | "language_info": { 228 | "codemirror_mode": { 229 | "name": "ipython", 230 | "version": 3 231 | }, 232 | "file_extension": ".py", 233 | "mimetype": "text/x-python", 234 | "name": "python", 235 | "nbconvert_exporter": "python", 236 | "pygments_lexer": "ipython3", 237 | "version": "3.9.15" 238 | }, 239 | "vscode": { 240 | "interpreter": { 241 | "hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49" 242 | } 243 | } 244 | }, 245 | "nbformat": 4, 246 | "nbformat_minor": 5 247 | } 248 | -------------------------------------------------------------------------------- /coursebook/modules/m2/2.1.5-ControllingAccess.ipynb: -------------------------------------------------------------------------------- 1 | {"cells": [{"cell_type": "markdown", "id": "84183ea9-a6f4-4f9a-89a7-07fd499465fc", "metadata": {}, "source": ["# 2.1.5 Controlling access\n", "\n", "Once you\u2019ve created a dataset, or taken data from a third party, it\n", "becomes your responsibility to ensure that access to this data copy is\n", "controlled appropriately.\n", "\n", "Your responsibilities will vary according to licensing conditions and\n", "any commercial agreements in place.\n", "\n", "In cases of particular data-sensitivity you may wish to consider use of\n", "a secure research environment, for example, a [data safe\n", "haven](https://www.turing.ac.uk/research/research-projects/data-safe-havens-cloud)."]}], "metadata": {"kernelspec": 
{"display_name": "Python 3.10.4 64-bit", "language": "python", "name": "python3"}, "language_info": {"name": "python", "version": "3.10.4"}, "vscode": {"interpreter": {"hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49"}}}, "nbformat": 4, "nbformat_minor": 5} -------------------------------------------------------------------------------- /coursebook/modules/m2/2.2-DataWrangling.ipynb: -------------------------------------------------------------------------------- 1 | {"cells": [{"cell_type": "markdown", "id": "92183946-74d4-4588-ae66-b40ff2ff343d", "metadata": {}, "source": ["# 2.2 Data Wrangling\n", "\n", "- Loading a dataset for the first time (sanity checks, data parsing\n", " issues, \u2026)\n", "- Manipulating different types of data (text, dates, categorical,\n", " images)\n", "- Feature engineering\n", "- Missing data\n", "- Privacy and anonymisation"]}], "metadata": {"kernelspec": {"display_name": "Python 3.10.6 ('.venv': poetry)", "language": "python", "name": "python3"}, "language_info": {"name": "python", "version": "3.10.6"}, "vscode": {"interpreter": {"hash": "b9e8921bd18fbd36d3a09ae9691fc21c58beec206524d0083259030e87e84f05"}}}, "nbformat": 4, "nbformat_minor": 5} -------------------------------------------------------------------------------- /coursebook/modules/m2/2.2.4-DataManipulation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "55cdd9f6-1890-43c8-940d-141623a07cc9", 6 | "metadata": {}, 7 | "source": [ 8 | "# 2.2.4 Data Manipulation\n", 9 | "\n", 10 | "We’ll briefly cover some manipulation of the following data types:\n", 11 | "\n", 12 | "- Time and Date\n", 13 | "- Text\n", 14 | "- Categorical\n", 15 | "- Image" 16 | ] 17 | } 18 | ], 19 | "metadata": { 20 | "kernelspec": { 21 | "display_name": "Python 3 (ipykernel)", 22 | "language": "python", 23 | "name": "python3" 24 | }, 25 | "language_info": { 26 | 
"codemirror_mode": { 27 | "name": "ipython", 28 | "version": 3 29 | }, 30 | "file_extension": ".py", 31 | "mimetype": "text/x-python", 32 | "name": "python", 33 | "nbconvert_exporter": "python", 34 | "pygments_lexer": "ipython3", 35 | "version": "3.9.15" 36 | } 37 | }, 38 | "nbformat": 4, 39 | "nbformat_minor": 5 40 | } 41 | -------------------------------------------------------------------------------- /coursebook/modules/m2/2.2.4.1-TimeAndDateData.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "dad696b9-6493-4f39-8e10-c3cc14bd1938", 6 | "metadata": {}, 7 | "source": [ 8 | "# 2.2.4.1 Time and Date Data\n", 9 | "\n", 10 | "Dealing with time and date data can be tricky. String-formatted data is\n", 11 | "hard to compare and represent for modelling.\n" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "id": "11d33204", 18 | "metadata": {}, 19 | "outputs": [ 20 | { 21 | "name": "stdout", 22 | "output_type": "stream", 23 | "text": [ 24 | "True\n" 25 | ] 26 | } 27 | ], 28 | "source": [ 29 | "print(\"02/03/1900\" > \"01/01/2020\")" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "id": "5d69e1a1", 35 | "metadata": {}, 36 | "source": [ 37 | "\n", 38 | "We use the `datetime.datetime` object in examples below. 
However, you\n", 39 | "can also use `datetime.date` and `datetime.time` as appropriate.\n", 40 | "\n", 41 | "## datetime\n", 42 | "\n", 43 | "We need to represent date data in a format that will allow us to compare\n", 44 | "items and perform operations such as addition and subtraction.\n", 45 | "\n", 46 | "Python’s standard library includes the\n", 47 | "[`datetime`](https://docs.python.org/3/library/datetime.html) module.\n", 48 | "This allows us to represent dates and times as structured objects.\n" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 2, 54 | "id": "05a53444", 55 | "metadata": {}, 56 | "outputs": [ 57 | { 58 | "name": "stdout", 59 | "output_type": "stream", 60 | "text": [ 61 | "2022-09-26 11:56:14.348543\n" 62 | ] 63 | } 64 | ], 65 | "source": [ 66 | "import datetime\n", 67 | "\n", 68 | "# create a datetime object with value set to now \n", 69 | "now = datetime.datetime.now()\n", 70 | "print(now)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "id": "3ecd00ec", 76 | "metadata": {}, 77 | "source": [ 78 | "\n", 79 | "This object has structure. We can, for example, extract the year\n", 80 | "property from this object.\n" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 3, 86 | "id": "bb091dad", 87 | "metadata": {}, 88 | "outputs": [ 89 | { 90 | "name": "stdout", 91 | "output_type": "stream", 92 | "text": [ 93 | "2022\n" 94 | ] 95 | } 96 | ], 97 | "source": [ 98 | "print(now.year)" 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "id": "dc0bcf79", 104 | "metadata": {}, 105 | "source": [ 106 | "\n", 107 | "We can also compare this datetime to others, as well as perform date\n", 108 | "arithmetic.\n" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 4, 114 | "id": "8d734e26", 115 | "metadata": {}, 116 | "outputs": [ 117 | { 118 | "name": "stdout", 119 | "output_type": "stream", 120 | "text": [ 121 | "d gt now: False\n", 122 | "now - d: 643 days, 11:56:14.348543. 
Type: \n" 123 | ] 124 | } 125 | ], 126 | "source": [ 127 | "past = datetime.datetime.fromisoformat(\"2020-12-22\") \n", 128 | "is_gt_now = past > now\n", 129 | "print(f\"d gt now: {is_gt_now}\")\n", 130 | "\n", 131 | "# subtract past from now\n", 132 | "difference = now - past \n", 133 | "print(f\"now - d: {difference}. Type: {type(difference)}\")" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "id": "5216b4e4", 139 | "metadata": {}, 140 | "source": [ 141 | "\n", 142 | "We can get a numeric, [POSIX timestamp](https://en.wikipedia.org/wiki/Unix_time), representation of these dates with `datetime.datetime.timestamp()`.\n" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 5, 148 | "id": "412c2118", 149 | "metadata": {}, 150 | "outputs": [ 151 | { 152 | "name": "stdout", 153 | "output_type": "stream", 154 | "text": [ 155 | "now timestamp: 1664189774.348543\n", 156 | "past timestamp: 1608595200.0\n" 157 | ] 158 | } 159 | ], 160 | "source": [ 161 | "print(f\"now timestamp: {now.timestamp()}\")\n", 162 | "print(f\"past timestamp: {past.timestamp()}\")" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "id": "f129a763", 168 | "metadata": {}, 169 | "source": [ 170 | "\n", 171 | "Note some UTC vs local time complications detailed\n", 172 | "[here](https://docs.python.org/3/library/datetime.html#datetime.datetime.timestamp).\n", 173 | "\n", 174 | "## Converting From/To String\n", 175 | "\n", 176 | "For converting *from* a string, we can use the\n", 177 | "`datetime.datetime.strptime(date_string, format)` function. Format codes\n", 178 | "are detailed\n", 179 | "[here](https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes).\n" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 6, 185 | "id": "29f8508d", 186 | "metadata": {}, 187 | "outputs": [ 188 | { 189 | "name": "stdout", 190 | "output_type": "stream", 191 | "text": [ 192 | "1999-03-30 16:30:00. 
Type: \n" 193 | ] 194 | } 195 | ], 196 | "source": [ 197 | "dt = datetime.datetime.strptime(\"30/03/99 16:30\", \"%d/%m/%y %H:%M\")\n", 198 | "print(f\"{dt}. Type: {type(dt)}\")" 199 | ] 200 | }, 201 | { 202 | "cell_type": "markdown", 203 | "id": "8540da18", 204 | "metadata": {}, 205 | "source": [ 206 | "\n", 207 | "And to convert a date *to* string we can use\n", 208 | "`datetime.datetime.strftime(format)`.\n" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": 7, 214 | "id": "c1857e2d", 215 | "metadata": {}, 216 | "outputs": [ 217 | { 218 | "name": "stdout", 219 | "output_type": "stream", 220 | "text": [ 221 | "26/09/22 11:56. Type: \n" 222 | ] 223 | } 224 | ], 225 | "source": [ 226 | "s = now.strftime(\"%d/%m/%y %H:%M\") \n", 227 | "print(f\"{s}. Type: {type(s)}\")" 228 | ] 229 | }, 230 | { 231 | "cell_type": "markdown", 232 | "id": "8f16b84b", 233 | "metadata": {}, 234 | "source": [ 235 | "\n", 236 | "## Pandas Datetime Accessor (`Series.dt`)\n", 237 | "\n", 238 | "Pandas provides an accessor object for datetime-like properties of\n", 239 | "Series values. See\n", 240 | "[here](https://pandas.pydata.org/docs/reference/api/pandas.Series.dt.html).\n", 241 | "\n", 242 | "E.g. 
(taken almost directly from Pandas docs, linked above)\n" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": 1, 248 | "id": "509536c6", 249 | "metadata": {}, 250 | "outputs": [ 251 | { 252 | "data": { 253 | "text/plain": [ 254 | "0 2000-01-01 00:00:00\n", 255 | "1 2000-01-01 00:00:01\n", 256 | "2 2000-01-01 00:00:02\n", 257 | "dtype: datetime64[ns]" 258 | ] 259 | }, 260 | "metadata": {}, 261 | "output_type": "display_data" 262 | }, 263 | { 264 | "data": { 265 | "text/plain": [ 266 | "0 0\n", 267 | "1 1\n", 268 | "2 2\n", 269 | "dtype: int64" 270 | ] 271 | }, 272 | "metadata": {}, 273 | "output_type": "display_data" 274 | } 275 | ], 276 | "source": [ 277 | "import pandas as pd\n", 278 | "\n", 279 | "seconds_series = pd.Series(pd.date_range(\"2000-01-01\", periods=3, freq=\"s\"))\n", 280 | "display(seconds_series)\n", 281 | "\n", 282 | "# access seconds property of values in series\n", 283 | "display(seconds_series.dt.second)" 284 | ] 285 | } 286 | ], 287 | "metadata": { 288 | "kernelspec": { 289 | "display_name": "Python 3 (ipykernel)", 290 | "language": "python", 291 | "name": "python3" 292 | }, 293 | "language_info": { 294 | "codemirror_mode": { 295 | "name": "ipython", 296 | "version": 3 297 | }, 298 | "file_extension": ".py", 299 | "mimetype": "text/x-python", 300 | "name": "python", 301 | "nbconvert_exporter": "python", 302 | "pygments_lexer": "ipython3", 303 | "version": "3.9.15" 304 | }, 305 | "vscode": { 306 | "interpreter": { 307 | "hash": "bfb6be6a1e2928ff2d2ab7904db3c9037c4b5ca4f399bd5e7064365a54a3f091" 308 | } 309 | } 310 | }, 311 | "nbformat": 4, 312 | "nbformat_minor": 5 313 | } 314 | -------------------------------------------------------------------------------- /coursebook/modules/m2/2.2.4.2-TextData.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "23243b6a-c84f-4eb3-95cf-04c095f77ccb", 6 | "metadata": {}, 7 | "source": 
[ 8 | "# 2.2.4.2 Text data\n", 9 | "\n", 10 | "We’ll often want to manipulate text data (strings) in Python. There are\n", 11 | "many handy libraries for helping us do this, some of which allow some\n", 12 | "pretty complicated operations. Here, we’ll show some basic processing.\n", 13 | "\n", 14 | "## Inconsistencies\n", 15 | "\n", 16 | "Strings have their own particular array of consistency issues, such as\n", 17 | "inconsistent capitalisation and extraneous whitespace.\n", 18 | "\n", 19 | "Fortunately, python gives us some handy [built-in\n", 20 | "functionality](https://docs.python.org/3/library/stdtypes.html#string-methods)\n", 21 | "for dealing with somese issues.\n", 22 | "\n", 23 | "We’ll make note of a few of these methods, below.\n", 24 | "\n", 25 | "### `str.upper()` and `str.lower()`\n", 26 | "\n", 27 | "The `str.upper()` and `str.lower()` methods will take a given string and\n", 28 | "return a copy as a solely uppercase or lowercase string. E.g.\n" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 1, 34 | "id": "8380309e", 35 | "metadata": {}, 36 | "outputs": [ 37 | { 38 | "name": "stdout", 39 | "output_type": "stream", 40 | "text": [ 41 | "upper: FOO\n", 42 | "lower: foo\n" 43 | ] 44 | } 45 | ], 46 | "source": [ 47 | "print(f\"upper: {'Foo'.upper()}\")\n", 48 | "print(f\"lower: {'Foo'.lower()}\")" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "id": "e6cdd323", 54 | "metadata": {}, 55 | "source": [ 56 | "\n", 57 | "These methods can be useful for ensuring consistency when casing is not\n", 58 | "important in your data.\n", 59 | "\n", 60 | "### `str.strip()`\n", 61 | "\n", 62 | "The `str.strip()` method (and its companions, `str.lstrip` and\n", 63 | "`str.rstrip()`) return a copy, stripping leading and trailing characters\n", 64 | "(default to whitespace) from a string. 
E.g.\n" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 2, 70 | "id": "0be2a942", 71 | "metadata": {}, 72 | "outputs": [ 73 | { 74 | "name": "stdout", 75 | "output_type": "stream", 76 | "text": [ 77 | "stripped: 'foo bar'\n", 78 | "left stripped: 'foo bar '\n", 79 | "right stripped:' foo bar'\n" 80 | ] 81 | } 82 | ], 83 | "source": [ 84 | "stripped = \" foo bar \".strip()\n", 85 | "\n", 86 | "print(f\"stripped: '{stripped}'\") \n", 87 | "lstripped =\" foo bar \".lstrip()\n", 88 | "print(f\"left stripped: '{lstripped}'\")\n", 89 | "\n", 90 | "rstripped = \" foo bar \".rstrip() # strip right\n", 91 | "print(f\"right stripped:'{rstripped}'\")" 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "id": "0cd367f8", 97 | "metadata": {}, 98 | "source": [ 99 | "\n", 100 | "### Spelling is tricky\n", 101 | "\n", 102 | "The methods we've talked about so far don't address things like misspelling/typos (a common data input concern).\n", 103 | "\n", 104 | "In a relatively simple scenario, with categorical data encoded as strings, you might be able to spot these by checking for all unique values in your data. E.g.\n" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 3, 110 | "id": "dd7edf44", 111 | "metadata": {}, 112 | "outputs": [ 113 | { 114 | "name": "stdout", 115 | "output_type": "stream", 116 | "text": [ 117 | "{'aple', 'apple', 'orange', 'grapefruit', 'pear'}\n" 118 | ] 119 | } 120 | ], 121 | "source": [ 122 | "my_favourite_fruit_data = [\"apple\", \"apple\", \"pear\", \"orange\", \"aple\", \"orange\", \"grapefruit\"]\n", 123 | "print(set(my_favourite_fruit_data))" 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "id": "c664f96c", 129 | "metadata": {}, 130 | "source": [ 131 | "\n", 132 | "## Splitting\n", 133 | "\n", 134 | "We’ll also commonly want to split a string based on a particular\n", 135 | "delimiter or separator. 
For example, we may wish to split a string of\n", 136 | "text into individual words, using any whitespace separator.\n", 137 | "\n", 138 | "We can use the `str.split(sep=None, maxsplit=-1)` method.\n" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 4, 144 | "id": "49147d9f", 145 | "metadata": {}, 146 | "outputs": [ 147 | { 148 | "name": "stdout", 149 | "output_type": "stream", 150 | "text": [ 151 | "['this', 'is', 'some', 'text']\n", 152 | "['this,', 'another', 'example,', 'is', 'some', 'more', 'text']\n" 153 | ] 154 | } 155 | ], 156 | "source": [ 157 | "s = \"this is some text\".split() # whitespace is the default \n", 158 | "print(s)\n", 159 | "\n", 160 | "# however, be careful of punctuation\n", 161 | "s2 = \"this, another example, is some more text\".split()\n", 162 | "print(s2)" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "id": "a267c5da", 168 | "metadata": {}, 169 | "source": [ 170 | "\n", 171 | "We may also wish to split by separators other than whitespace.\n" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": 5, 177 | "id": "59894052", 178 | "metadata": {}, 179 | "outputs": [ 180 | { 181 | "name": "stdout", 182 | "output_type": "stream", 183 | "text": [ 184 | "['apple', 'banana', 'pear', 'peach']\n" 185 | ] 186 | } 187 | ], 188 | "source": [ 189 | "s = \"apple#banana#pear#peach\".split(\"#\")\n", 190 | "print(s)" 191 | ] 192 | }, 193 | { 194 | "cell_type": "markdown", 195 | "id": "9c3a51cb", 196 | "metadata": {}, 197 | "source": [ 198 | "\n", 199 | "With this in mind, we could also use `str.split` for dealing with csv\n", 200 | "data. However, we’d need to be careful about commas inside quotes. 
It’s\n", 201 | "generally more convenient to use libraries that already deal with this\n", 202 | "kind of thing, like pandas!\n", 203 | "\n", 204 | "## Joining\n", 205 | "\n", 206 | "A bit like `str.split(sep=None, maxsplit=-1)` in reverse,\n", 207 | "`str.join(iterable)` allows us to join a list of strings together with a\n", 208 | "given separator.\n" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": 6, 214 | "id": "67b8ecf0", 215 | "metadata": {}, 216 | "outputs": [ 217 | { 218 | "name": "stdout", 219 | "output_type": "stream", 220 | "text": [ 221 | "a list of words\n" 222 | ] 223 | } 224 | ], 225 | "source": [ 226 | "my_list = [\"a\", \"list\", \"of\", \"words\"] # join with a space separator \n", 227 | "s = \" \".join(my_list) \n", 228 | "print(s)" 229 | ] 230 | }, 231 | { 232 | "cell_type": "markdown", 233 | "id": "8cb291c9", 234 | "metadata": {}, 235 | "source": [ 236 | "\n", 237 | "## Regular expressions\n", 238 | "\n", 239 | "[Regular expressions (regexps,\n", 240 | "regex)](https://en.wikipedia.org/wiki/Regular_expression) are character\n", 241 | "sequences that specify a search pattern, usually for a find and/or\n", 242 | "replace task on text data.\n", 243 | "\n", 244 | "Python’s [regular expression](https://docs.python.org/3/library/re.html)\n", 245 | "module provides functionality similar to that offered in Perl.\n", 246 | "\n", 247 | "Regex can give us powerful string matching, beyond that of a simple\n", 248 | "exact string match. E.g.\n" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": 7, 254 | "id": "c51fa13e", 255 | "metadata": {}, 256 | "outputs": [ 257 | { 258 | "name": "stdout", 259 | "output_type": "stream", 260 | "text": [ 261 | "['Spain', 'the', 'So', 'they']\n" 262 | ] 263 | } 264 | ], 265 | "source": [ 266 | "import re \n", 267 | "txt = \"The rain in Spain falls mainly on the plains. 
So they say, anyway.\" # find all words starting with upper case S or lower case t \n", 268 | "print(re.findall(r\"\\b[St]\\w+\", txt))" 269 | ] 270 | }, 271 | { 272 | "cell_type": "markdown", 273 | "id": "5288fcf0", 274 | "metadata": {}, 275 | "source": [ 276 | "\n", 277 | "## NLP Preprocessing\n", 278 | "\n", 279 | "In Natural Language Processing (NLP) tasks we often see some slightly\n", 280 | "more complicated preprocessing such as:\n", 281 | "\n", 282 | "- [Stemming and\n", 283 | " Lemmatisation](https://nlp.stanford.edu/IR-book/html/htmledition/stemming-and-lemmatization-1.html) -\n", 284 | " reducing words to common base forms\n", 285 | "- Stop-word Removal - removing common words that carry little\n", 286 | " information\n", 287 | "- \"Vectorization\" - convert text to a meaningful numeric vector\n", 288 | " representation (e.g. [term frequency\n", 289 | " encoding](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html#sklearn.feature_extraction.text.CountVectorizer))\n", 290 | "\n", 291 | "There are some commonly used libraries for the above tasks, we recommend\n", 292 | "[NLTK](https://www.nltk.org/) and\n", 293 | "[scikit-learn](https://scikit-learn.org/stable/).\n", 294 | "\n", 295 | "## Pandas String Operations (`Series.str`)\n", 296 | "\n", 297 | "Pandas provides vectorized string functions for Series. Unless\n", 298 | "explicitly handled, NAs will stay as NA. See\n", 299 | "[here](https://pandas.pydata.org/docs/reference/api/pandas.Series.str.html).\n", 300 | "E.g." 
301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": 8, 306 | "id": "0feddf9f", 307 | "metadata": {}, 308 | "outputs": [ 309 | { 310 | "data": { 311 | "text/plain": [ 312 | "0 AAA\n", 313 | "1 AAb\n", 314 | "2 AbA\n", 315 | "dtype: object" 316 | ] 317 | }, 318 | "execution_count": 8, 319 | "metadata": {}, 320 | "output_type": "execute_result" 321 | } 322 | ], 323 | "source": [ 324 | "import pandas as pd\n", 325 | "\n", 326 | "s = pd.Series([\"aaa\", \"aab\", \"aba\"]) # replace \"a\" with \"A\"\n", 327 | "s.str.replace(\"a\", \"A\")" 328 | ] 329 | } 330 | ], 331 | "metadata": { 332 | "kernelspec": { 333 | "display_name": "Python 3 (ipykernel)", 334 | "language": "python", 335 | "name": "python3" 336 | }, 337 | "language_info": { 338 | "codemirror_mode": { 339 | "name": "ipython", 340 | "version": 3 341 | }, 342 | "file_extension": ".py", 343 | "mimetype": "text/x-python", 344 | "name": "python", 345 | "nbconvert_exporter": "python", 346 | "pygments_lexer": "ipython3", 347 | "version": "3.9.15" 348 | }, 349 | "vscode": { 350 | "interpreter": { 351 | "hash": "bfb6be6a1e2928ff2d2ab7904db3c9037c4b5ca4f399bd5e7064365a54a3f091" 352 | } 353 | } 354 | }, 355 | "nbformat": 4, 356 | "nbformat_minor": 5 357 | } 358 | -------------------------------------------------------------------------------- /coursebook/modules/m2/2.2.5-PrivacyAndAnonymisation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "2b3604b4-2d1a-4a22-a16e-cbd32494c19c", 6 | "metadata": {}, 7 | "source": [ 8 | "# 2.2.5 Privacy and Anonymisation\n", 9 | "\n", 10 | "This section touches, again, on UK GDPR. 
A comprehensive guide to UK\n", 11 | "GDPR can be found on the [ICO\n", 12 | "website](https://ico.org.uk/for-organisations/guide-to-data-protection/guide-to-the-general-data-protection-regulation-gdpr/).\n", 13 | "\n", 14 | "## Anonymisation or Pseudonymisation\n", 15 | "\n", 16 | "Pseudonymisation and anonymisation are common approaches to meet GDPR\n", 17 | "principles of [“data minimisation” and “storage\n", 18 | "limitation”](https://www.privacy-regulation.eu/en/article-5-principles-relating-to-processing-of-personal-data-GDPR.htm).\n", 19 | "\n", 20 | "Many tasks do not require the use of personal identifiers that can often\n", 21 | "be part of a dataset. In these cases, we should aim to remove this\n", 22 | "unnecessary, yet sensitive, data.\n", 23 | "\n", 24 | "### Anonymisation\n", 25 | "\n", 26 | "In a GDPR context, Recital 26 defines anonymous information as:\n", 27 | "\n", 28 | "> …information which does not relate to an identified or identifiable\n", 29 | "natural person or to personal data rendered anonymous in such a manner\n", 30 | "that the data subject is not or no longer identifiable.\n", 31 | "\n", 32 | "— *[Recital\n", 33 | "26](https://www.privacy-regulation.eu/en/recital-26-GDPR.htm)*\n", 34 | "\n", 35 | "Anonymisation is the process of removing direct and indirect personal\n", 36 | "identifiers. Crucially, post-anonymisation, subjects will not be able to\n", 37 | "be identified in the data, even given additional information.\n", 38 | "\n", 39 | "GDPR does not apply to anonymised information. 
However, note that when\n", 40 | "**you** anonymise personal data, **you** are still processing the data\n", 41 | "at that point.\n", 42 | "\n", 43 | "### Pseudonymisation\n", 44 | "\n", 45 | "> …the processing of personal data in such a manner that the personal\n", 46 | "> data can no longer be attributed to a specific data subject without\n", 47 | "> the use of additional information, provided that such additional\n", 48 | "> information is kept separately and is subject to technical and\n", 49 | "> organisational measures to ensure that the personal data are not\n", 50 | "> attributed to an identified or identifiable natural person.\n", 51 | "\n", 52 | "— *excerpt from [ICO Guide to Data\n", 53 | "Protection](https://ico.org.uk/for-organisations/guide-to-data-protection/guide-to-the-general-data-protection-regulation-gdpr/what-is-personal-data/what-is-personal-data/#pd4)*\n", 54 | "\n", 55 | "This is subtly different to anonymisation. Here, the subject can be\n", 56 | "re-identified, given additional information.\n", 57 | "\n", 58 | "GDPR does still apply to pseudonymised information. However,\n", 59 | "pseudonymisation helps meet the “data minimisation” and “storage\n", 60 | "limitation” principles.\n", 61 | "\n", 62 | "## Differential Privacy\n", 63 | "\n", 64 | "Separate to concerns over data use in developing algorithms, there is\n", 65 | "also a concern over what may be inferred about the data by learning the\n", 66 | "result of some randomised algorithm. Here, the concern shifts from\n", 67 | "intrusion by the data scientist developing the algorithm to the\n", 68 | "intrusion by the many consumers of this algorithm.\n", 69 | "\n", 70 | "The below image shows a **toy version** of a membership inference\n", 71 | "attack. Here, an adversary attempts to infer whether certain images were\n", 72 | "used in the training of a cat vs dog classifier.\n", 73 | "\n", 74 | "A data scientist:\n", 75 | "\n", 76 | "1. Trains a model on the first set of images\n", 77 | "2. 
Publishes the model to the public internet\n", 78 | "\n", 79 | "An adversary then:\n", 80 | "\n", 81 | "1. Makes predictions with the model through some public API\n", 82 | "2. Based on the scores in the model’s predictions, loosely\n", 83 | " interpretable as confidence, infers:\n", 84 | " - first image was part of the original training set\n", 85 | " - second image was not part of the training set\n", 86 | "\n", 87 | "![toy membership inference figure](https://i.imgur.com/ujb8iPk.jpg)\n", 88 | "\n", 89 | "Broadly, differential privacy provides a mechanism for learning nothing\n", 90 | "about an individual while learning useful information about the general\n", 91 | "population.\n", 92 | "\n", 93 | "> “Differential privacy” describes a promise, made by a data holder, or\n", 94 | "> curator, to a data subject: “You will not be affected, adversely or\n", 95 | "> otherwise, by allowing your data to be used in any study or analysis,\n", 96 | "> no matter what other studies, data sets, or information sources, are\n", 97 | "> available.”\n", 98 | "\n", 99 | "— *[The Algorithmic Foundations of Differential\n", 100 | "Privacy](https://www.tau.ac.il/~saharon/BigData2018/privacybook.pdf)* -\n", 101 | "Dwork & Roth (2014)\n", 102 | "\n", 103 | "Use of differential privacy in data science and machine learning is an\n", 104 | "ongoing area of research. We recommend the 2018 blog post, [Privacy and\n", 105 | "machine learning: two unexpected\n", 106 | "allies?](http://www.cleverhans.io/privacy/2018/04/29/privacy-and-machine-learning.html),\n", 107 | "as further reading.\n", 108 | "\n", 109 | "## References\n", 110 | "\n", 111 | "Dwork, C., & Roth, A. (2014). The algorithmic foundations of\n", 112 | "differential privacy. Found. Trends Theor. Comput. Sci., 9(3-4),\n", 113 | "211-407." 
114 | ] 115 | } 116 | ], 117 | "metadata": { 118 | "kernelspec": { 119 | "display_name": "Python 3 (ipykernel)", 120 | "language": "python", 121 | "name": "python3" 122 | }, 123 | "language_info": { 124 | "codemirror_mode": { 125 | "name": "ipython", 126 | "version": 3 127 | }, 128 | "file_extension": ".py", 129 | "mimetype": "text/x-python", 130 | "name": "python", 131 | "nbconvert_exporter": "python", 132 | "pygments_lexer": "ipython3", 133 | "version": "3.9.15" 134 | } 135 | }, 136 | "nbformat": 4, 137 | "nbformat_minor": 5 138 | } 139 | -------------------------------------------------------------------------------- /coursebook/modules/m2/2.2.6-LinkingDatasets.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "e2ccdc55-63ab-429c-b6e4-7126f75df160", 6 | "metadata": {}, 7 | "source": [ 8 | "# 2.2.6 Linking Datasets\n", 9 | "\n", 10 | "Data linking is the process of joining datasets together. Datasets can\n", 11 | "be useful on their own but bringing them together can unlock new\n", 12 | "insights.\n", 13 | "\n", 14 | "Discussion\n", 15 | "\n", 16 | "- When might this be helpful? Can you think of any examples where joining datasets might help us unlock additional insights?\n", 17 | "\n", 18 | "## Impact on Privacy\n", 19 | "\n", 20 | "Before we link datasets together, we should consider the impact on\n", 21 | "privacy of doing so. There can be an increased risk of identification of\n", 22 | "a person/entity when two datasets are linked.\n", 23 | "\n", 24 | "## How to Link\n", 25 | "\n", 26 | "The most straightforward way to link datasets is by a deterministic,\n", 27 | "rules-based linkage, where records are linked if a specific **set of\n", 28 | "identifiers** match. 
When working with tabular datasets we will often\n", 29 | "see this accomplished with a *join* operation.\n", 30 | "\n", 31 | "### Joining in Pandas\n", 32 | "\n", 33 | "Pandas allows for database-style joins. If you have experience of SQL,\n", 34 | "you'll be familiar with the terminology of left/right/inner/outer etc.\n", 35 | "joins. Pandas makes these join types (or similar) available in the\n", 36 | "Pandas API via `pd.merge` and provides a comprehensive summary of joins\n", 37 | "[in the\n", 38 | "docs](https://pandas.pydata.org/docs/user_guide/merging.html#database-style-dataframe-or-named-series-joining-merging).\n", 39 | "Here, for demonstration, we will give an example using a simple left\n", 40 | "join, similar to a `LEFT OUTER` join in SQL.\n" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 1, 46 | "id": "a0c4e4be", 47 | "metadata": {}, 48 | "outputs": [ 49 | { 50 | "name": "stdout", 51 | "output_type": "stream", 52 | "text": [ 53 | "df1\n" 54 | ] 55 | }, 56 | { 57 | "data": { 58 | "text/html": [ 59 | "
\n", 60 | "\n", 73 | "\n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | "
some_ida
0foo0
1bar1
2zop2
\n", 99 | "
" 100 | ], 101 | "text/plain": [ 102 | " some_id a\n", 103 | "0 foo 0\n", 104 | "1 bar 1\n", 105 | "2 zop 2" 106 | ] 107 | }, 108 | "metadata": {}, 109 | "output_type": "display_data" 110 | }, 111 | { 112 | "name": "stdout", 113 | "output_type": "stream", 114 | "text": [ 115 | "df2\n" 116 | ] 117 | }, 118 | { 119 | "data": { 120 | "text/html": [ 121 | "
\n", 122 | "\n", 135 | "\n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | "
some_idb
0foo3
1bar4
2baz5
\n", 161 | "
" 162 | ], 163 | "text/plain": [ 164 | " some_id b\n", 165 | "0 foo 3\n", 166 | "1 bar 4\n", 167 | "2 baz 5" 168 | ] 169 | }, 170 | "metadata": {}, 171 | "output_type": "display_data" 172 | } 173 | ], 174 | "source": [ 175 | "import pandas as pd\n", 176 | "\n", 177 | "df1 = pd.DataFrame({'some_id': ['foo', 'bar', 'zop'], 'a': [0,1,2]})\n", 178 | "df2 = pd.DataFrame({'some_id': ['foo', 'bar', 'baz'], 'b': [3, 4,5]})\n", 179 | "\n", 180 | "print(\"df1\")\n", 181 | "display(df1)\n", 182 | "\n", 183 | "print(\"df2\")\n", 184 | "display(df2)" 185 | ] 186 | }, 187 | { 188 | "cell_type": "markdown", 189 | "id": "54962111", 190 | "metadata": {}, 191 | "source": [ 192 | "\n", 193 | "\n", 194 | "note that `df1` contains the key `'zop'` that is not contained in `df2` and that `df2` contains the key `'baz'` that is not contained in `df1`.\n", 195 | "The other two keys (`'foo', 'bar'`) are contained in both DataFrames.\n" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": 2, 201 | "id": "da981ec1", 202 | "metadata": {}, 203 | "outputs": [ 204 | { 205 | "data": { 206 | "text/html": [ 207 | "
\n", 208 | "\n", 221 | "\n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | "
some_idab
0foo03.0
1bar14.0
2zop2NaN
\n", 251 | "
" 252 | ], 253 | "text/plain": [ 254 | " some_id a b\n", 255 | "0 foo 0 3.0\n", 256 | "1 bar 1 4.0\n", 257 | "2 zop 2 NaN" 258 | ] 259 | }, 260 | "metadata": {}, 261 | "output_type": "display_data" 262 | } 263 | ], 264 | "source": [ 265 | "result = pd.merge(left=df1, right=df2, how=\"left\", on=\"some_id\")\n", 266 | "display(result)" 267 | ] 268 | }, 269 | { 270 | "cell_type": "markdown", 271 | "id": "54348e72", 272 | "metadata": {}, 273 | "source": [ 274 | "\n", 275 | "the data for `'baz'` has been dropped after this join as it only\n", 276 | "appeared on the 'right' side. The `how='left'` join only uses keys from\n", 277 | "the left frame. Meanwhile, the entry for column `'b'` is null for key\n", 278 | "`'zop'`.\n", 279 | "\n", 280 | "Different scenarios will require different join types, read the docs\n", 281 | "carefully!\n", 282 | "\n", 283 | "### Probabilistic Matching\n", 284 | "\n", 285 | "If we don't have a set of common identifiers, we may wish to use\n", 286 | "probabilistic matching.\n", 287 | "\n", 288 | "Probabilistic matching calculates a matching score between two records.\n", 289 | "Typically, this is done by comparing several field values and assigning\n", 290 | "a weight to each depending on how closely they match.\n", 291 | "\n", 292 | "Details of probabilistic matching approaches are, unfortunately, beyond\n", 293 | "the scope of this course." 
294 | ] 295 | } 296 | ], 297 | "metadata": { 298 | "kernelspec": { 299 | "display_name": "Python 3 (ipykernel)", 300 | "language": "python", 301 | "name": "python3" 302 | }, 303 | "language_info": { 304 | "codemirror_mode": { 305 | "name": "ipython", 306 | "version": 3 307 | }, 308 | "file_extension": ".py", 309 | "mimetype": "text/x-python", 310 | "name": "python", 311 | "nbconvert_exporter": "python", 312 | "pygments_lexer": "ipython3", 313 | "version": "3.9.15" 314 | }, 315 | "vscode": { 316 | "interpreter": { 317 | "hash": "bfb6be6a1e2928ff2d2ab7904db3c9037c4b5ca4f399bd5e7064365a54a3f091" 318 | } 319 | } 320 | }, 321 | "nbformat": 4, 322 | "nbformat_minor": 5 323 | } 324 | -------------------------------------------------------------------------------- /coursebook/modules/m2/2.2.7-MissingData.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "fd14238c-16c8-4c16-92d5-5b0214210488", 6 | "metadata": {}, 7 | "source": [ 8 | "# 2.2.7 Missing Data\n", 9 | "\n", 10 | "Missing data often refers to missing values in data, where no data value\n", 11 | "is stored for a variable or observation. 
However, we may also regard\n", 12 | "missing datasets or data with insufficient disaggregation as a form of\n", 13 | "missing data.\n", 14 | "\n", 15 | "## Missing Datasets and Insufficient Disaggregation\n", 16 | "\n", 17 | "> “Missing data sets” are my term for the blank spots that exist in\n", 18 | "> spaces that are otherwise data-saturated.\n", 19 | "\n", 20 | "— Mimi Onuoha (2021)\n", 21 | "\n", 22 | "Datasets may not exist, where we might expect them to, for a variety of\n", 23 | "reasons; these may be political or cultural and can be expressive of\n", 24 | "biases, systematic failures, and/or oppression.\n", 25 | "\n", 26 | "We may also find that, where datasets *do* exist, that the data may not\n", 27 | "be sufficiently disaggregated to allow for analysis that can make issues\n", 28 | "of inequality apparent. Data may not be broken down by categories such\n", 29 | "as gender, age, or race, preventing the exploration of the impact of\n", 30 | "these factors on other variables.\n", 31 | "\n", 32 | "Discussion\n", 33 | "\n", 34 | "- What considerations would we want to make around publishing disaggregated data for gender, age, and race? How does this disaggregation fit with [previous discussion](./2-02-LegalityAndEthics.html#should-a-variable-be-used) around use of variables such as race in data analysis?\n", 35 | "\n", 36 | "## Missing Data Values\n", 37 | "\n", 38 | "Real world data often has missing values. 
These may represent gaps in\n", 39 | "data collection, be expected responses to questions, or be the result of\n", 40 | "failures or data corruption.\n", 41 | "\n", 42 | "Consider the following question in a survey: “How many days has it been\n", 43 | "since your flu jab?”\n", 44 | "\n", 45 | "What answer would you expect from a participant who had not had a flu\n", 46 | "jab?\n", 47 | "\n", 48 | "### Representing Missing Values\n", 49 | "\n", 50 | "Pandas uses `NaN` (Not a Number) internally to denote missing data, for\n", 51 | "reasons of computational speed and convenience (beyond our scope!).\n", 52 | "However, Python’s `None` is also considered a “missing” value and we can\n", 53 | "check for this with `isna()` or `notna()` methods.\n", 54 | "\n", 55 | "- `.isnull()` vs `.isna()`: These do the same thing!\n" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 1, 61 | "id": "b8d32943", 62 | "metadata": {}, 63 | "outputs": [ 64 | { 65 | "data": { 66 | "text/plain": [ 67 | "0 True\n", 68 | "1 False\n", 69 | "2 False\n", 70 | "dtype: bool" 71 | ] 72 | }, 73 | "execution_count": 1, 74 | "metadata": {}, 75 | "output_type": "execute_result" 76 | } 77 | ], 78 | "source": [ 79 | "import pandas as pd \n", 80 | "pd.Series([None,\"foo\",\"bar\"]).isnull()" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "id": "0cf20428", 86 | "metadata": {}, 87 | "source": [ 88 | "\n", 89 | "Pandas has some other ways of denoting missing values, but we won’t\n", 90 | "detail them here. A more complete guide to missing data in Pandas can be\n", 91 | "found in the\n", 92 | "[docs](https://pandas.pydata.org/pandas-docs/stable/user_guide/missing_data.html).\n", 93 | "\n", 94 | "### Dealing with Missing Values\n", 95 | "\n", 96 | "How do we deal with missing values? 
There are many approaches, we\n", 97 | "outline two simple and popular varieties below:\n", 98 | "\n", 99 | "#### Listwise Deletion\n", 100 | "\n", 101 | "Simply drop any rows that contain missing values. This is the simplest\n", 102 | "method of dealing with missing data!\n", 103 | "\n", 104 | "However, this can introduce bias. We should consider *why* the data is\n", 105 | "missing. Listwise deletion will introduce bias when the remaining data\n", 106 | "can no longer be reflective of the original data.\n", 107 | "\n", 108 | "#### Single Imputation\n", 109 | "\n", 110 | "Apply a rule to replace a missing value. For example, replace with the\n", 111 | "mean or the value of the last observation (last observation carried\n", 112 | "forward - LOCF).\n", 113 | "\n", 114 | "Again, single imputation methods can easily introduce bias. Use with\n", 115 | "caution.\n", 116 | "\n", 117 | "### Missingness Terminology\n", 118 | "\n", 119 | "Here, we introduce some terminology to help us with the scenarios in\n", 120 | "which values are missing.\n", 121 | "\n", 122 | "[Sterne et al. 
(2009)](https://www.bmj.com/content/338/bmj.b2393/)\n", 123 | "provide similar definitions with biomedical examples and discussion of\n", 124 | "the use of multiple imputation.\n", 125 | "\n", 126 | "#### Missing Completely At Random (MCAR)\n", 127 | "\n", 128 | "The missingness is unrelated to both missing and observed values, e.g.,\n", 129 | "a study participant’s weekly questionnaire is lost in the post.\n", 130 | "\n", 131 | "If we were to drop rows with MCAR data, we would reduce our population\n", 132 | "size but would not introduce any bias.\n", 133 | "\n", 134 | "Unfortunately, missing values are rarely MCAR.\n", 135 | "\n", 136 | "Note that, whilst we can falsify the hypothesis that data are MCAR, we\n", 137 | "cannot confirm it.\n", 138 | "\n", 139 | "#### Missing At Random (MAR)\n", 140 | "\n", 141 | "Slightly confusingly named, in MAR data missingness is related to\n", 142 | "observed data but not the missing values themselves, e.g., a person does\n", 143 | "not attend an academic exam because they are too unwell to travel\n", 144 | "**and** we also have their health records.\n", 145 | "\n", 146 | "MAR data allows the prediction of missing values based on complete rows.\n", 147 | "However, as ever, imputation should be approached with caution!\n", 148 | "\n", 149 | "Classification of missing data as MAR should be done with care and will\n", 150 | "rely on domain knowledge.\n", 151 | "\n", 152 | "#### Missing Not At Random (MNAR)\n", 153 | "\n", 154 | "The missingness is related to what is missing, e.g., a person does not\n", 155 | "attend a health assessment because they are too unwell to travel.\n", 156 | "\n", 157 | "When data is MNAR no methods exist to handle this missingness\n", 158 | "appropriately (Sterne et al. 2009).\n", 159 | "\n", 160 | "**We cannot test whether data is MAR vs MNAR as the data required is\n", 161 | "missing**.\n", 162 | "\n", 163 | "## References\n", 164 | "\n", 165 | "Mimi Onuoha. (2021). 
On Missing Data Sets.\n", 166 | "https://github.com/MimiOnuoha/missing-datasets (Original work published\n", 167 | "2016)\n", 168 | "\n", 169 | "Sterne, J. A., White, I. R., Carlin, J. B., Spratt, M., Royston, P.,\n", 170 | "Kenward, M. G., … & Carpenter, J. R. (2009). Multiple imputation for\n", 171 | "missing data in epidemiological and clinical research: potential and\n", 172 | "pitfalls. Bmj, 338." 173 | ] 174 | } 175 | ], 176 | "metadata": { 177 | "kernelspec": { 178 | "display_name": "Python 3 (ipykernel)", 179 | "language": "python", 180 | "name": "python3" 181 | }, 182 | "language_info": { 183 | "codemirror_mode": { 184 | "name": "ipython", 185 | "version": 3 186 | }, 187 | "file_extension": ".py", 188 | "mimetype": "text/x-python", 189 | "name": "python", 190 | "nbconvert_exporter": "python", 191 | "pygments_lexer": "ipython3", 192 | "version": "3.9.15" 193 | }, 194 | "vscode": { 195 | "interpreter": { 196 | "hash": "bfb6be6a1e2928ff2d2ab7904db3c9037c4b5ca4f399bd5e7064365a54a3f091" 197 | } 198 | } 199 | }, 200 | "nbformat": 4, 201 | "nbformat_minor": 5 202 | } 203 | -------------------------------------------------------------------------------- /coursebook/modules/m2/data/README.md: -------------------------------------------------------------------------------- 1 | # Module 2 Data Sources 2 | 3 | - `ANSUR_II_FEMALE_Public.csv` 4 | - https://www.openlab.psu.edu/ansur2/ 5 | - The Anthropometric Survey of US Army Personnel, female data. 6 | - Paquette, S., Parham, J.L. et al., 2012 7 | - Public Domain 8 | 9 | - `penguins_original.csv`, `penguins.csv` 10 | - Data are available by[CC-0](https://creativecommons.org/share-your-work/public-domain/cc0/) license in accordance with the [Palmer Station LTER Data Policy](http://pal.lternet.edu/data/policies) and the [LTER Data Access Policy for Type I data](https://lternet.edu/data-access-policy/). 11 | - https://allisonhorst.github.io/palmerpenguins/ 12 | - Horst AM, Hill AP, Gorman KB (2020). 
palmerpenguins: Palmer Archipelago (Antarctica) penguin data. R package version 0.1.0.https://allisonhorst.github.io/palmerpenguins/. doi: 10.5281/zenodo.3960218. 13 | - `penguins.csv` file modified here to include a few messy rows using the `penguins_make_messy.py` script, so is only appropriate for teaching purposes. 14 | 15 | - `lter_penguins.png` 16 | - Artwork by [@allison_horst](https://twitter.com/allison_horst) 17 | - https://github.com/allisonhorst/palmerpenguins 18 | 19 | - `pcs_2017.csv` 20 | - https://data.ny.gov/Human-Services/Patient-Characteristics-Survey-PCS-2017/8itk-gcdy 21 | - Patient Characteristics Survey (PCS): 2017 22 | - The New York State Office of Mental Health, Office of Performance Measurement and Evaluation (OPME) 23 | - [OPEN-NY Terms Of Use](https://data.ny.gov/dataset/OPEN-NY-Terms-Of-Use/77gx-ii52) 24 | - Only a random subset of 5000 patients saved here. 25 | 26 | - `urban_population.csv`, `urban_population_header.csv`: 27 | - The World Bank 28 | - United Nations Population Division. World Urbanization Prospects: 2018 Revision. 29 | - License: [CC BY-4.0](https://datacatalog.worldbank.org/public-licenses#cc-by) 30 | - https://data.worldbank.org/indicator/SP.URB.TOTL.IN.ZS 31 | - Source files modified to only include a small number of years for teaching purposes, and to remove the header in the CSV. 
32 | -------------------------------------------------------------------------------- /coursebook/modules/m2/data/flower.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/rds-course/057949109330f3b33e0077208bd1fdcd76cbdb79/coursebook/modules/m2/data/flower.png -------------------------------------------------------------------------------- /coursebook/modules/m2/data/lter_penguins.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/rds-course/057949109330f3b33e0077208bd1fdcd76cbdb79/coursebook/modules/m2/data/lter_penguins.png -------------------------------------------------------------------------------- /coursebook/modules/m2/data/pcs_download.py: -------------------------------------------------------------------------------- 1 | """ 2 | Download New York State 2017 Patient Characteristics Survey (PCS) and save a subset of 3 | it. 
4 | """ 5 | import pandas as pd 6 | 7 | df = pd.read_csv( 8 | "https://data.ny.gov/api/views/8itk-gcdy/rows.csv?accessType=DOWNLOAD&sorting=true" 9 | ) 10 | 11 | # only include these columns 12 | select_columns = [ 13 | "Program Category", 14 | "Region Served", 15 | "Age Group", 16 | "Sex", 17 | "Living Situation", 18 | "Household Composition", 19 | "Preferred Language", 20 | "Veteran Status", 21 | "Employment Status", 22 | "Number Of Hours Worked Each Week", 23 | "Education Status", 24 | "Special Education Services", 25 | "Mental Illness", 26 | "Intellectual Disability", 27 | "Autism Spectrum", 28 | "Alcohol Related Disorder", 29 | "Drug Substance Disorder", 30 | "Mobility Impairment Disorder", 31 | "Alzheimer or Dementia", 32 | "Neurological Condition", 33 | "Traumatic Brain Injury", 34 | "Cancer", 35 | "Smokes", 36 | "Received Smoking Medication", 37 | "Received Smoking Counseling", 38 | "Serious Mental Illness", 39 | "Principal Diagnosis Class", 40 | "SSI Cash Assistance", 41 | "SSDI Cash Assistance", 42 | "Public Assistance Cash Program", 43 | "Other Cash Benefits", 44 | "Three Digit Residence Zip Code", 45 | ] 46 | df = df[select_columns] 47 | 48 | # save a subset of 5000 patients only (full dataset is >80 MB) 49 | df = df.sample(n=5000, random_state=123) 50 | df.to_csv("pcs_2017.csv", index=False) 51 | -------------------------------------------------------------------------------- /coursebook/modules/m2/data/penguins_make_messy.py: -------------------------------------------------------------------------------- 1 | """ 2 | Loads the Palmer penguins data and adds some rows to represent issues that can 3 | arise with data consistency. 
4 | """ 5 | import pandas as pd 6 | import numpy as np 7 | 8 | df = pd.read_csv("penguins_original.csv") 9 | print("\nOriginal data frame info:\n") 10 | df.info() 11 | 12 | new = df.copy() 13 | 14 | # create a dummy ID column of format P-000 15 | new["Id"] = "P-" + new.index.astype(str).str.zfill(3) 16 | 17 | # make ID the first column 18 | new = new[ 19 | [ 20 | "Id", 21 | "species", 22 | "island", 23 | "bill_length_mm", 24 | "bill_depth_mm", 25 | "flipper_length_mm", 26 | "body_mass_g", 27 | "sex", 28 | "year", 29 | ] 30 | ] 31 | 32 | # Correctly treated missing values (NaN bill_length_mm) 33 | new = new.append( 34 | { 35 | "Id": "P-344", 36 | "species": "Chinstrap", 37 | "island": "Dream", 38 | "bill_length_mm": np.nan, 39 | "bill_depth_mm": 19.2, 40 | "flipper_length_mm": 197, 41 | "body_mass_g": 4000, 42 | "sex": "male", 43 | "year": 2008, 44 | }, 45 | ignore_index=True, 46 | ) 47 | 48 | new = new.append( 49 | { 50 | "Id": "P-345", 51 | "species": "Adelie", 52 | "island": "Torgersen", 53 | "bill_length_mm": np.nan, 54 | "bill_depth_mm": 18.0, 55 | "flipper_length_mm": 193, 56 | "body_mass_g": 43400, 57 | "sex": "female", 58 | "year": 2009, 59 | }, 60 | ignore_index=True, 61 | ) 62 | 63 | # comma instead of decimal point (bill_depth_mm 14,2 instead of 14.2) 64 | new = new.append( 65 | { 66 | "Id": "P-346", 67 | "species": "Gentoo", 68 | "island": "Biscoe", 69 | "bill_length_mm": 45.2, 70 | "bill_depth_mm": "14,2", 71 | "flipper_length_mm": 224, 72 | "body_mass_g": 5600, 73 | "sex": "male", 74 | "year": 2007, 75 | }, 76 | ignore_index=True, 77 | ) 78 | 79 | # missing value encoded as negative number (-99 for flipper_length_mm) 80 | new = new.append( 81 | { 82 | "Id": "P-347", 83 | "species": "Adelie", 84 | "island": "Dream", 85 | "bill_length_mm": 36.0, 86 | "bill_depth_mm": 17.3, 87 | "flipper_length_mm": -99, 88 | "body_mass_g": 3475, 89 | "sex": "female", 90 | "year": 2007, 91 | }, 92 | ignore_index=True, 93 | ) 94 | 95 | # suspiciously low value, perhaps a 
typo (body_mass_g = 285 instead of 2850) 96 | new = new.append( 97 | { 98 | "Id": "P-348", 99 | "species": "Adelie", 100 | "island": "Biscoe", 101 | "bill_length_mm": 36.4, 102 | "bill_depth_mm": 18.1, 103 | "flipper_length_mm": 193, 104 | "body_mass_g": 285, 105 | "sex": "female", 106 | "year": 2007, 107 | }, 108 | ignore_index=True, 109 | ) 110 | 111 | # UNKNOWN Speciees 112 | new = new.append( 113 | { 114 | "Id": "P-349", 115 | "species": "UNKNOWN", 116 | "island": "Biscoe", 117 | "bill_length_mm": 55.9, 118 | "bill_depth_mm": 15.9, 119 | "flipper_length_mm": 218, 120 | "body_mass_g": 5300, 121 | "sex": "male", 122 | "year": 2009, 123 | }, 124 | ignore_index=True, 125 | ) 126 | 127 | # duplicate row 128 | new = new.append( 129 | { 130 | "Id": "P-276", 131 | "species": "Chinstrap", 132 | "island": "Dream", 133 | "bill_length_mm": 46.5, 134 | "bill_depth_mm": 17.9, 135 | "flipper_length_mm": 192, 136 | "body_mass_g": 3500, 137 | "sex": "female", 138 | "year": 2007, 139 | }, 140 | ignore_index=True, 141 | ) 142 | 143 | # shuffle data frame order so added strange values not so obvious 144 | new = new.sample(frac=1, random_state=123) 145 | 146 | print("\nUpdated data frame info:\n") 147 | new.info() 148 | 149 | new.to_csv("penguins.csv", index=False) 150 | -------------------------------------------------------------------------------- /coursebook/modules/m2/data/smiley_16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/rds-course/057949109330f3b33e0077208bd1fdcd76cbdb79/coursebook/modules/m2/data/smiley_16.png -------------------------------------------------------------------------------- /coursebook/modules/m2/overview.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "5ff3b789-c0b7-443b-92f3-53a0cd645f93", 6 | "metadata": {}, 7 | "source": [ 8 | "# Overview\n", 9 | 
"\n", 10 | "![](https://pandas.pydata.org/static/img/pandas.svg)\n", 11 | "\n", 12 | "This module covers the basics of working with data. We cover the challenges and ethical issues of working with open data (as well as the benefits!), the different types of data available, and the practical experience of loading and manipulating data (with [`pandas`](https://pypi.org/project/pandas/)) in preparation for analysis.\n", 13 | "\n", 14 | "The module is structured into two parts:\n", 15 | "\n", 16 | "- Part 1: Getting and loading data\n", 17 | " - Open data and data sources\n", 18 | " - Licensing, ethics, security.\n", 19 | " - Pandas intro/primer\n", 20 | " - Data formats (CSV, database, API, image, …)\n", 21 | " - How to load them into Python with Pandas (mostly).\n", 22 | "- Part 2: Exploring and wrangling data\n", 23 | " - Loading a dataset for the first time (sanity checks, data\n", 24 | " parsing issues, …)\n", 25 | " - Manipulating different types of data (text, dates, categorical,\n", 26 | " images)\n", 27 | " - Feature engineering\n", 28 | " - Missing data\n", 29 | " - Privacy and anonymisation\n", 30 | "\n", 31 | "References are given at the end of each subsection.\n" 32 | ] 33 | } 34 | ], 35 | "metadata": { 36 | "kernelspec": { 37 | "display_name": "Python 3 (ipykernel)", 38 | "language": "python", 39 | "name": "python3" 40 | }, 41 | "language_info": { 42 | "codemirror_mode": { 43 | "name": "ipython", 44 | "version": 3 45 | }, 46 | "file_extension": ".py", 47 | "mimetype": "text/x-python", 48 | "name": "python", 49 | "nbconvert_exporter": "python", 50 | "pygments_lexer": "ipython3", 51 | "version": "3.9.15" 52 | }, 53 | "vscode": { 54 | "interpreter": { 55 | "hash": "b9e8921bd18fbd36d3a09ae9691fc21c58beec206524d0083259030e87e84f05" 56 | } 57 | } 58 | }, 59 | "nbformat": 4, 60 | "nbformat_minor": 5 61 | } 62 | -------------------------------------------------------------------------------- /coursebook/modules/m3/3.1-FiguresGoneWrong.ipynb: 
-------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"markdown","id":"7ade671a-c2da-4d4b-8ac2-1c11bf57adf3","metadata":{},"source":["# 3.1 Figures gone wrong\n","\n","The most common way for data scientists to convey and present their findings and\n","ideas is through graphs and figures. Modern software libraries have made figure\n","generation as easy as it has ever been. However, this has made it all together\n","too easy for data scientists to:\n","\n","1. Make misleading figures\n","2. Choose a wrong presentation format \n","3. Use overly complex, but attractive, designs which muddle the message\n","\n","and so, creating **good** plots is hard.\n","\n","Here we’ve collated examples of figures that do a poor job of\n","communicating the data. As we go through these examples we will have a\n","brief discussion about each figure (you can find many more examples at\n","[@GraphCrimes](https://twitter.com/GraphCrimes?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor) on Twitter).\n","\n","## Example 1\n","\n","![](https://i.imgur.com/u16vTqy.png)\n","[Source](https://www.callingbullshit.org/tools/tools_misleading_axes.html)\n","\n"," \n","\n","Example 1 notes\n","\n","This figure is a classic example of misleading information. The axes\n","does not start at zero so the effect is visually exaggerated, and we are\n","not given the uncertainty around these averages. The reader is therefore\n","unable to interpret the practical significance of any differences\n","between groups.\n"," \n","\n","## Example 2\n","\n","![](https://i.imgur.com/K0MWcgq.jpg)\n","[Source](https://www.visualisingdata.com/2015/10/if-your-visuals-deceive-your-message-deceives/)\n","\n"," \n","\n","Example 2 notes\n","\n","\n","Here data has been selectively plotted to exaggerate an idea. 
There are\n","a number of issues with this plot:\n","\n","- We only have two data points for each trend, and it is practically\n"," impossible that the trend would be a perfect straight line. Rather,\n"," any trend would fluctuate year-on-year, this proposed difference\n"," between 2008 and 2013 may be within the bounds of a noisy signal.\n","- If you look at the numbers there are clearly two axes at play, or at\n"," least no attempt has been made to represent the magnitude of the\n"," change realistically.\n","- Plotting these two data sources together here suggests there is a\n"," relationship between them (i.e., that the company *Planned\n"," Parenthood* has redistributed resources from cancer screening to\n"," abortions). It is an ethical responsibility for any figure creator\n"," to ensure that your figure is not implying a conclusion that isn’t\n"," supported by your data.\n","\n","To illustrate how misleading this original figure is, here is an example\n","of the same data visualised in a more responsible manner.\n","\n","![](https://i.imgur.com/IPWnF0k.jpg)\n","[Source](https://twitter.com/emschuch/status/649690759453646848)\n"," \n","\n","## Example 3\n","\n","![](https://i.imgur.com/eHM3JzA.png)\n"," \n","\n","Example 3 notes\n","\n","Example 3 is very confusing because the scale of the x-axis is altered\n","mid-plot. Even after reading the caption it is very difficult to get a\n","feel for the data because the grid-lines give a powerful signal of\n","uniformity.\n"," \n","\n","## Example 4\n","\n","![](https://i.imgur.com/9lu7jVP.png)\n"," \n","\n","Example 4 notes\n","\n","The phenomenon that this xkcd comic is getting at is Normalisation.\n","Normalisation is where you alter a scale to be between zero and one\n","(usually). Failure to normalise is when different measurements that\n","themselves have different scales are plotted on the same scale. 
The\n","comic shows the classic failure to normalise measurements by population.\n","A variable that was consistently 10% of the population would appear to\n","vary across the map.\n"," \n","\n","## Example 5\n","\n","![](https://i.imgur.com/Scf4Qlw.png)\n"," \n","\n","Example 5 notes\n","\n","\n","Both these plots are examples from Factfulness by Hans Rosling, they show that using only averages means that\n","the figure does not communicate all the information, such as the\n","spread of the distribution. The kernel density estimates (more on those\n","in Section 3.3) below show a slice of one particular year, offering\n","complementary information that allows us to interpret the averages in\n","context.\n","\n","![](https://i.imgur.com/LYySCdh.png)\n"," \n","\n","## Example 6\n","\n","![](https://i.imgur.com/adT9Snr.jpg)\n","[Source](https://badvisualisations.tumblr.com/post/184827953341/this-is-not-all-you-need-to-click-through-to-this)\n"," \n","\n","Example 6 notes\n","\n","This figure does not appear to have been created to clearly communicate\n","data. There is far too much information on this figure to be able to\n","intuitively grasp the message. A reader should be able to understand a\n","figure quickly.\n"," \n","\n","## Example 7\n","\n","![](https://i.imgur.com/n72HXxI.jpg)\n","[Source](https://twitter.com/10DowningStreet/status/1322614557181960195)\n"," \n","\n","Example 7 notes\n","\n","This figure was presented quickly in a UK Government Covid briefing. 
In\n","such a situation rapid and clear comprehension of figures is paramount.\n","Here this figure is guilty of over-plotting, which is attempting to\n","squeeze too much information into one figure, ultimately rendering it\n","difficult to comprehend.\n"," \n","\n","## Example 8\n","\n","![](https://i.imgur.com/FsnOPNO.png)\n","[Source](https://twitter.com/Rodpac/status/1250764503861600256?s=20)\n"," \n","\n","Example 8 notes\n","\n","Although you can see the relative pattern of the two signals, there is\n","no y-axis! One can therefore not assess the magnitude of the difference.\n"," \n","\n","## Example 9\n","\n","![](https://i.imgur.com/q9r9GpE.png)\n","[Source](https://www.visualisingdata.com/)\n"," \n","\n","Example 9 notes\n","\n","Though this pie chart represents the data well there are confusing\n","aesthetic choices. We have redundant information: there is really no\n","need for the legend since all the slices of the pie chart are labelled\n","anyway.\n"," \n","Additionally, pie charts are almost always the\n","[worst](https://scc.ms.unimelb.edu.au/resources/data-visualisation-and-exploration/no_pie-charts#:~:text=Pies%20and%20doughnuts%20fail%20because,important)%20are%20tricky%20to%20show.)\n","form of presenting data and should be avoided at all costs. \n","\n","## Example 10\n","\n","![](https://i.imgur.com/4OUGwbR.png)\n","[Source](https://badvisualisations.tumblr.com/post/184827953341/this-is-not-all-you-need-to-click-through-to-this)\n"," \n","\n","Example 10 notes\n","\n","Similarly to Example 6, the aesthetics of this graph detract from\n","communicating the data. The y axis could be represented on a scale of\n","millions (i.e., from 0 to \$500), and there is no need to note all the\n","data points. 
If one wants to write out the data, use a table!\n"," \n"]}],"metadata":{"kernelspec":{"display_name":"Python 3.10.4 64-bit","language":"python","name":"python3"},"language_info":{"name":"python","version":"3.10.4"},"vscode":{"interpreter":{"hash":"aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49"}}},"nbformat":4,"nbformat_minor":5} 2 | -------------------------------------------------------------------------------- /coursebook/modules/m3/3.4-StoryTelling.ipynb: -------------------------------------------------------------------------------- 1 | {"cells": [{"cell_type": "markdown", "id": "2efa3c85-8b83-4760-96ba-59d6a6da7069", "metadata": {}, "source": ["# 3.4 Storytelling with data visualisation\n", "\n", "## **What is a story**\n", "\n", "A story is a connected series of events presented in a specific order\n", "such that they create an emotional reaction for a given audience.\n", "\n", "In our case, a story is told through visuals of statistical information.\n", "Ideally, these create a compelling narrative that convinces the audience\n", "of the importance of your data insight.\n", "\n", "### Audience\n", "\n", "Telling a story starts with the audience. The starting point of your\n", "visualisation\u2019s story is the audience\u2019s domain-relevant knowledge. The\n", "audience\u2019s context should frame your story.\n", "\n", "However, even experts in a field take time to process complex\n", "visualisations. Assume that the reader needs as much help as possible\n", "from you to understand the key trends and relationships shown. **Keep\n", "your figures simple, avoid irrelevant or tangential information**, and\n", "augment your visuals with narrative text that will help the readers\n", "infer the intended conclusions about your data.\n", "\n", "### Tell one story\n", "\n", "Although it is possible (and far too easy with modern visualisation\n", "packages!) 
to tell many stories using a single chart, **it is easy to\n", "overload your audience with information**. Every addition to a figure\n", "increases the required mental effort of your audience. Too much\n", "information is confusing - the reader does not know what to focus\n", "on, and your figure is diminished. Aim for *ink parsimony*, be careful to\n", "**include only the visual elements that increase understanding**, and\n", "embrace white space. Remember you are competing for the viewer\u2019s time\n", "and attention!\n", "\n", "Having said all this, sometimes a single static visualisation is not\n", "enough to tell an entire story. You may want to design a sequence of\n", "simple figures that jointly create a convincing story arc. When\n", "connecting a story over multiple figures consider using different types\n", "of visualisation for each distinct analysis to help the reader\u2019s focus.\n", "Furthermore, keep semantic indicators (such as colour scheme) consistent\n", "across figures. 
If you have many similar types of analysis an\n", "alternative is to use interactive figures and animations that enable the\n", "audience to navigate through the story.\n", "\n", "A great example of data journalism following the above principles comes\n", "from the Financial Times [Covid-19: The global crisis \u2014 in\n", "data](https://ig.ft.com/coronavirus-global-data/).\n", "\n", "## Titles and annotations\n", "\n", "The first thing that your reader will see is the title of your\n", "visualisation, so it is important at this point that the title not only\n", "describes what is being measured, but also why the reader should care\n", "and how to read the chart.\n", "\n", "Depending on the audience and the story you want to tell, the title can\n", "adopt different styles (from\n", "[here](https://www.eea.europa.eu/data-and-maps/daviz/learn-more/chart-dos-and-donts/#message)):\n", "\n", "- **Descriptive title**: This is more appropriate for technical audiences\n", "with a background knowledge of the topic. It gives a neutral overview of\n", "the analysis being presented.\n", "- **Definitive statement**: Suitable for\n", "general audiences, it communicates clearly your message and the\n", "insights.\n", "- **Question**: Posing a clear question in your title and\n", "providing an answer to it with the data will transmit a clear message to\n", "your audience (suitable for non-technical audiences).\n", "\n", "You can also use subtitles to strengthen your message and provide more\n", "depth on the insight obtained from the data. 
If your audience has no\n", "prior knowledge on the data, annotations can improve the figure\u2019s\n", "readability and provide additional detail.\n", "\n", "## Hierarchy of the information\n", "\n", "Consider how your audience will scan and read the visualisation.\n", "\n", "Without any visual cues, when we\u2019re confronted with a block of text our\n", "only option is to read it.\n", "\n", "We can employ preattentive (subconscious processing) attributes to create a visual hierarchy in\n", "our communications. These attributes can be colour, contrast, font,\n", "size, etc.\n", "\n", "For example, look at the figure from the [European Environment\n", "Agency](https://www.eea.europa.eu/) shown below. If many data are\n", "displayed, we can make use of white space to separate and group the\n", "information. Text formatting can allow us to give the desired hierarchy\n", "to the content and colours help us to highlight the focus of the data\n", "visualisation.\n", "\n", "![](https://i.imgur.com/aCd1T2W.png)\n", "\n", "## Data-ink ratio\n", "\n", "The `data-ink` ratio is a concept introduced by Edward Tufte, an expert\n", "on designing effective data visualisations.\n", "\n", "Best practices used in data visualisation, story-telling and information design are highly influenced by the renowned statistical graphics expert Edward Tufte. An excellent visualization, according to Tufte, expresses \u201ccomplex ideas communicated with clarity, precision and efficiency.\u201d In 1980s, Tufte proposed a metric for measuring the amount of superfluous information included in a chart. He called it the data-ink ratio, saying \u201cthe larger the share of a graphic\u2019s ink devoted to data, the better.\u201d \n", "\n", "The term, \u2018data-ink\u2019, is defined as the total ink on a graph that\n", "represents data. 
And the [data-ink\n", "ratio](https://infovis-wiki.net/w/images/5/55/DIR.jpg) is the proportion\n", "of a graphic\u2019s ink devoted to the non-redundant display of\n", "data-information.\n", "\n", "Tufte claims that good graphical representations are designed with a\n", "data-ink ratio as close to 1.0 as possible, without\n", "eliminating something that is necessary for effective communication.\n", "\n", "Tufte puts forward the following 5 principles related to data ink when\n", "designing a data visualisation:\n", "\n", "- Above all else show data.\n", "- Maximize the data-ink ratio.\n", "- Erase non-data-ink.\n", "- Erase redundant data-ink.\n", "- Revise and edit.\n", "\n", "## Emotion and context in story telling\n", "\n", "### Emotion\n", "\n", "Discussing data-ink ratios (see section above) and information can feel\n", "cold and calculating, contradictory to the idea that a good story takes\n", "the reader through an emotional journey. The emotional component of the\n", "story is what grabs your attention and makes the story memorable.\n", "\n", "So how can you invoke emotion and maximize the data-ink ratio? In this\n", "course we do not aim to provide a clear answer for this, but we want to\n", "use an example to get you to think about it.\n", "\n", "This comparison comes from the [Data Feminism\n", "book](https://data-feminism.mitpress.mit.edu/pub/5evfe9yd/release/3).\n", "They compare two different ways of visualising the same problem: gun\n", "violence in the US. The first figure is a typical bar chart from the\n", "Washington Post showing the number of active shooter\n", "incidents annually. The second figure comes from Periscopic, a design\n", "firm with the tag line \u201cDo good with data\u201d, that took a different\n", "approach. 
Quoting the Data Feminism book (D\u2019Ignazio and Klein, 2020):\n", "\n", "> \u201c\u2026 what makes Periscopic\u2019s visualization so very different from a more\n", "> conventional bar chart of similar information, such as \u201cThe Era of\n", "> \u2018Active Shooters\u2019\u201d from the Washington Post? The projects share the\n", "> proposition that gun deaths present a serious threat. But unlike the\n", "> Washington Post bar chart, Periscopic\u2019s work is framed around an\n", "> emotion: loss. People are dying; their remaining time on earth has\n", "> been stolen from them. These people have names and ages. They have\n", "> parents and partners and children who suffer from that loss as well.\u201d\n", "\n", "![](https://i.imgur.com/XcdZGKf.png) A bar chart with the number of\n", "\u201cactive shooters incidents\u201d from the United States between 2000 and\n", "2015.\n", "\n", "![](https://i.imgur.com/KbnOEry.png) Visualisation of the \u201cstolen years\u201d\n", "of people killed by guns in the United States in 2013. Figure by\n", "[Periscope](https://guns.periscopic.com/).\n", "\n", "### Context\n", "\n", "In the previous sections we have emphasised how to use data storytelling\n", "to convey a message through emotion. It is clear that tailoring a story\n", "in this way can never be neutral or objective. However, this is not only\n", "limited to a story or message. 
**Data is never neutral or objective and\n", "there is no such thing as \u201craw data\u201d** (discussed in *Section 1.1*)\n", "\n", "Instead of taking data at face value and looking toward future insights,\n", "data scientists must work with domain experts to first **interrogate the\n", "context, limitations, and validity of the data being used**.\n", "Furthermore, this should not limit itself to the stages of data\n", "acquisition or data analysis, context also comes into play in the\n", "framing and communication of results.\n", "\n", "The following example from the Data Feminism book shows data resulting\n", "from a research study called \u201cDisparities in Mental Health Referral and\n", "Diagnosis in the New York City Jail Mental Health Service.\u201d This study\n", "looked at the medical records of tens of thousands of first-time\n", "incarcerated people and finds that some groups (white people) are more\n", "likely to receive treatment, while others (Black and Latinos) are more\n", "likely to receive punishment such as solitary confinement.\n", "\n", "The figures below shows two examples of how varying the titles can frame\n", "the figure in a different context. The figure in the left seems neutral\n", "and free of bias however does not succeed in communicating the research\n", "findings. The figure in the right attempts to fix that by changing the\n", "annotations of the figure to add necessary context.\n", "\n", "![](https://i.imgur.com/ThduUju.png) \n", "Two portrayals of the same data analysis. Data from Fatos Kaba et\n", "al.\u00a0\u201cDisparities in Mental Health Referral and Diagnosis in the New York\n", "City Jail Mental Health Service\u201d. 
Graphics by Catherine D\u2019Ignazio,\n", "extracted from Data Feminism (D\u2019Ignazio and Klein, 2020).\n", "\n", "## Extra resources\n", "\n", "Check out the page of [Turing Data\n", "Stories](https://alan-turing-institute.github.io/TuringDataStories-fastpages/).\n", "A Turing project that experiments with different ways of doing\n", "story-telling in a pedagogical, ethical and reproducible fashion.\n", "\n", "\n", "**Group Discussion** We have said that data is never completely\n", "objective. Is this true? How would we visualise data that is potentially\n", "emotive but you want to keep as objective as possible?\n", "\n", "\n", "# References\n", "\n", "[Fundamentals of data visualisation by Claus O\u2019Wilke](https://clauswilke.com/dataviz/)\n", "\n", "[Storytelling with data](http://www.bdbanalytics.ir/media/1123/storytelling-with-data-cole-nussbaumer-knaflic.pdf)\n", "\n", "[chart-dos-and-donts](https://www.eea.europa.eu/data-and-maps/daviz/learn-more/chart-dos-and-donts/#message)\n", "\n", "[Chapter about data-ink\n", "ratio](https://jtr13.github.io/cc19/tuftes-principles-of-data-ink.html)\n", "\n", "[Data Feminism](https://data-feminism.mitpress.mit.edu/)\n", "\n", ""]}], "metadata": {"kernelspec": {"display_name": "Python 3.10.4 64-bit", "language": "python", "name": "python3"}, "language_info": {"name": "python", "version": "3.10.4"}, "vscode": {"interpreter": {"hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49"}}}, "nbformat": 4, "nbformat_minor": 5} -------------------------------------------------------------------------------- /coursebook/modules/m3/data/US_state_regions_code.csv: -------------------------------------------------------------------------------- 1 | State,State Code,Region,Division 2 | Alaska,AK,West,Pacific 3 | Alabama,AL,South,East South Central 4 | Arkansas,AR,South,West South Central 5 | Arizona,AZ,West,Mountain 6 | California,CA,West,Pacific 7 | Colorado,CO,West,Mountain 8 | Connecticut,CT,Northeast,New 
England 9 | District of Columbia,DC,South,South Atlantic 10 | Delaware,DE,South,South Atlantic 11 | Florida,FL,South,South Atlantic 12 | Georgia,GA,South,South Atlantic 13 | Hawaii,HI,West,Pacific 14 | Iowa,IA,Midwest,West North Central 15 | Idaho,ID,West,Mountain 16 | Illinois,IL,Midwest,East North Central 17 | Indiana,IN,Midwest,East North Central 18 | Kansas,KS,Midwest,West North Central 19 | Kentucky,KY,South,East South Central 20 | Louisiana,LA,South,West South Central 21 | Massachusetts,MA,Northeast,New England 22 | Maryland,MD,South,South Atlantic 23 | Maine,ME,Northeast,New England 24 | Michigan,MI,Midwest,East North Central 25 | Minnesota,MN,Midwest,West North Central 26 | Missouri,MO,Midwest,West North Central 27 | Mississippi,MS,South,East South Central 28 | Montana,MT,West,Mountain 29 | North Carolina,NC,South,South Atlantic 30 | North Dakota,ND,Midwest,West North Central 31 | Nebraska,NE,Midwest,West North Central 32 | New Hampshire,NH,Northeast,New England 33 | New Jersey,NJ,Northeast,Middle Atlantic 34 | New Mexico,NM,West,Mountain 35 | Nevada,NV,West,Mountain 36 | New York,NY,Northeast,Middle Atlantic 37 | Ohio,OH,Midwest,East North Central 38 | Oklahoma,OK,South,West South Central 39 | Oregon,OR,West,Pacific 40 | Pennsylvania,PA,Northeast,Middle Atlantic 41 | Rhode Island,RI,Northeast,New England 42 | South Carolina,SC,South,South Atlantic 43 | South Dakota,SD,Midwest,West North Central 44 | Tennessee,TN,South,East South Central 45 | Texas,TX,South,West South Central 46 | Utah,UT,West,Mountain 47 | Virginia,VA,South,South Atlantic 48 | Vermont,VT,Northeast,New England 49 | Washington,WA,West,Pacific 50 | Wisconsin,WI,Midwest,East North Central 51 | West Virginia,WV,South,South Atlantic 52 | Wyoming,WY,West,Mountain -------------------------------------------------------------------------------- /coursebook/modules/m3/figures/proportions_table.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/alan-turing-institute/rds-course/057949109330f3b33e0077208bd1fdcd76cbdb79/coursebook/modules/m3/figures/proportions_table.png -------------------------------------------------------------------------------- /coursebook/modules/m3/hands-on.ipynb: -------------------------------------------------------------------------------- 1 | {"cells": [{"cell_type": "markdown", "id": "042b31b0-0186-4f23-8278-38eddc8ad748", "metadata": {}, "source": ["# Module 3 hands-on session\n", "\n", "## Description\n", "\n", "In this hands-on session participants are divided in groups of 4 to 5\n", "participants and paired with a helper. Each group, should choose one of\n", "the proposed tasks (or propose a new one if keen!) and work together on\n", "it. At the end of the session the students will present their work to\n", "the class. The timeline for this session is the following: - Phase 1:\n", "Presentation of tasks and goals of the hands-on session (15 minutes), -\n", "Phase 2: Groups are formed and students choose task to develop (15\n", "minutes) \n", "- Phase 3: Work on the tasks (2.5 hours + with two 15 m breaks) - Phase\n", "4: Students present their findings and discussion (1 hour)\n", "\n", "Students are encouraged to use code developed in the hands-on session\n", "for module 2 and/or the data exploration notebook in [*Section\n", "3.5*](section3.5). They can use data from the UK or choose another\n", "country. Presentations should last between 5 and 10 minutes.\n", "\n", "## Proposed tasks\n", "\n", "1. **Use data visualisation to further explore relationship between\n", " variables**: In [*Section 3.5*](section3.5) began exploring the\n", " relationships between some variables, focusing particularly\n", " on deprivation index and accommodation and self-reported health. Our\n", " research questions requires examining variables beyond these two,\n", " and in Module 4 we will be using more variables. 
It is important to\n", " further understand the relationships between our chosen variables\n", " with the self-reported health and each other. For example:\n", " - Other material variables (e.g.\u00a0`Household Size`, or `Children`)\n", " - Education variables (`ISCED` or `Y11_Education`)\n", " - Mental well-being variables (`MentalWellbeingIndex`)\n", " - Any other psychosocial variables you find interesting.\n", "\n", " Think of ways of visualising these relationships that would convince\n", " a PI (and yourself!) that a given variable should (or should not) be\n", " part of the model.\n", "\n", "In Module 4's hands-on you will be building your own models with variables of your choosing. We have selected a subset of variables to make the task manageable, but feel free to explore further and assess any variables in the EQLS dataset that you think are interesting. You will be able to use these variables in Module 4's hands-on.\n", "\n", "1. **Use data visualisation to explore data missingness**: In [*Section\n", " 3.5*](section3.5) we looked at the missingness of some variables\n", " rather superficially. For simplicity, we decided to drop rows with\n", " missing values. This is a dangerous approach, particularly if we\n", " have data that is not missing completely at random. For example, if\n", " for some unknown factor people living in the most deprived areas are\n", " less likely to answer to some survey question, that group would end\n", " up being unrepresented in our dataset. 
In this task you are\n", " invited to:\n", " - Further explore the missingness in our variables of interest\n", " related to Module 4 (`AgeCategory`,`DeprIndex`,`ISCED`,\n", " `Children`,`MentalWellbeingIndex`,`AccomProblems`,`SRH`) and\n", " advise if it was a sensible approach to drop the rows as done in\n", " [*Section 3.5*](section3.5).\n", " - Explore other methods of dealing with missing data, such as\n", " imputation or prediction of the missing values (some\n", " are mentioned in Module 2).\n", "2. **Be a data journalist for a day**: After doing some exploration on\n", " the dataset use data visualisation to tell a story from the data.\n", " You are free to decide which story you want to tell!"]}], "metadata": {"kernelspec": {"display_name": "Python 3.10.4 64-bit", "language": "python", "name": "python3"}, "language_info": {"name": "python", "version": "3.10.4"}, "vscode": {"interpreter": {"hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49"}}}, "nbformat": 4, "nbformat_minor": 5} -------------------------------------------------------------------------------- /coursebook/modules/m3/overview.ipynb: -------------------------------------------------------------------------------- 1 | {"cells": [{"cell_type": "markdown", "id": "dc5a316d-d059-48cf-a80e-63e4039accc7", "metadata": {}, "source": ["# Overview\n", "\n", "Great figures bring data to life, making communication\n", "clearer and more compelling. Bad figures can mislead and confuse, at worst they tell stories that do not exist.\n", "\n", "Here we give tips to tell good\n", "figures from bad ones, helping improve the way\n", "you use visualisation to effectively and accurately communicate insights\n", "from data.\n", "\n", "This module is structured as follows: \n", "- **Figures Gone Wrong**. Real-world examples of bad figures, showing how easily mistakes are (knowingly or unknowingly) made. \n", "- **Rules of the data visualisation game**. A whip-round of the basics. 
\n", "- **Atlas of Visualisations**. How visualisations are commonly adapted for different scenarios and data types. \n", "- **Storytelling with data visualisation**. How to ensure your figure is delivering the intended message. \n", "- **Visualisation for Data Exploration**. Exploring the dataset introduced in Modules 1 & 2 through visualisations.\n", "\n", "**References**:\n", "\n", "We are professional research data scientists who have spent some time\n", "plotting, but we are not experts in data visualisation. Fortunately,\n", "there are plenty of experts out there who offer excellent pedagogical\n", "material. This course is mostly based on these resources, with a\n", "sprinkling of hard-won lessons of our own.\n", "\n", "The resources that have influenced this chapter the most is the\n", "[Fundamentals of Data Visualization](https://clauswilke.com/dataviz/)\n", "book by Claus O. Wilke. We also draw frequently from Andy Kirk\u2019s\n", "[visualising data site](https://www.visualisingdata.com/), as well as\n", "numerous others that we reference as we go."]}], "metadata": {"kernelspec": {"display_name": "Python 3.10.6 ('.venv': poetry)", "language": "python", "name": "python3"}, "language_info": {"name": "python", "version": "3.10.6"}, "vscode": {"interpreter": {"hash": "b9e8921bd18fbd36d3a09ae9691fc21c58beec206524d0083259030e87e84f05"}}}, "nbformat": 4, "nbformat_minor": 5} -------------------------------------------------------------------------------- /coursebook/modules/m4/data/UKDA-7724-csv/mrdoc/UKDA/UKDA_Study_7724_Information.htm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/rds-course/057949109330f3b33e0077208bd1fdcd76cbdb79/coursebook/modules/m4/data/UKDA-7724-csv/mrdoc/UKDA/UKDA_Study_7724_Information.htm -------------------------------------------------------------------------------- 
/coursebook/modules/m4/data/UKDA-7724-csv/mrdoc/allissue/eqls_2007_ukda_data_dictionary.rtf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/rds-course/057949109330f3b33e0077208bd1fdcd76cbdb79/coursebook/modules/m4/data/UKDA-7724-csv/mrdoc/allissue/eqls_2007_ukda_data_dictionary.rtf -------------------------------------------------------------------------------- /coursebook/modules/m4/data/UKDA-7724-csv/mrdoc/allissue/eqls_2007and2011_ukda_data_dictionary.rtf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/rds-course/057949109330f3b33e0077208bd1fdcd76cbdb79/coursebook/modules/m4/data/UKDA-7724-csv/mrdoc/allissue/eqls_2007and2011_ukda_data_dictionary.rtf -------------------------------------------------------------------------------- /coursebook/modules/m4/data/UKDA-7724-csv/mrdoc/allissue/eqls_2011_ukda_data_dictionary.rtf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/rds-course/057949109330f3b33e0077208bd1fdcd76cbdb79/coursebook/modules/m4/data/UKDA-7724-csv/mrdoc/allissue/eqls_2011_ukda_data_dictionary.rtf -------------------------------------------------------------------------------- /coursebook/modules/m4/data/UKDA-7724-csv/mrdoc/excel/7724_file_information_csv.csv: -------------------------------------------------------------------------------- 1 | FILEPATH,FILENAME,SIZE UNZIPPED (BYTES),FILE DESCRIPTION 2 | UKDA-7724-csv\7724_file_information_csv.csv,7724_file_information_csv.csv,1764,File manifest for all files in zip bundle 7724csv_.zip 3 | UKDA-7724-csv\read7724.htm,read7724.htm,9923,READ File (legal notes and data processing notes) 4 | UKDA-7724-csv\mrdoc\allissue\eqls_2007_ukda_data_dictionary.rtf,eqls_2007_ukda_data_dictionary.rtf,701888,EQLS 2007 Wave Data Dictionary and variable 
description for collection in RTF format 5 | UKDA-7724-csv\mrdoc\allissue\eqls_2011_ukda_data_dictionary.rtf,eqls_2011_ukda_data_dictionary.rtf,701888,EQLS 2011 Wave Data Dictionary and variable description for collection in RTF format 6 | UKDA-7724-csv\mrdoc\allissue\eqls_2007and2011_ukda_data_dictionary.rtf,eqls_2007and2011_ukda_data_dictionary.rtf,701888,EQLS 2007 and 2011 Waves Combined Data Dictionary and variable description for collection in RTF format 7 | UKDA-7724-csv\mrdoc\csv\eqls_api_map.csv,eqls_api_map.csv,42623,EQLS 2007 and 2011 API to Variable Mappings 8 | UKDA-7724-csv\mrdoc\excel\eqls_concordance_grid.xlsx,eqls_concordance_grid.xlsx,44440,EQLS 2007 and 2011 Variable and Value mappings 9 | UKDA-7724-csv\mrdoc\pdf\7724_eqls_userguide.pdf,7724_eqls_userguide.pdf,932557,User guide for study 7724 in PDF format 10 | UKDA-7724-csv\mrdoc\UKDA\UKDA_Study_7724_Information.htm,UKDA_Study_7724_Information.htm,6020,Study metadata (based on DDI codebook) in HTML format 11 | UKDA-7724-csv\csv\eqls_2007.csv,eqls_2007.csv,16089615,EQLS 2007 Wave Data File in CSV format 12 | UKDA-7724-csv\csv\eqls_2011.csv,eqls_2011.csv,23556114,EQLS 2011 Wave Data File in CSV format 13 | UKDA-7724-csv\csv\eqls_2007and2011.csv,eqls_2007and2011.csv,39643426,EQLS 2007 and 2011 Waves Combined Data File in CSV format 14 | -------------------------------------------------------------------------------- /coursebook/modules/m4/data/UKDA-7724-csv/mrdoc/excel/eqls_api_map.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/rds-course/057949109330f3b33e0077208bd1fdcd76cbdb79/coursebook/modules/m4/data/UKDA-7724-csv/mrdoc/excel/eqls_api_map.csv -------------------------------------------------------------------------------- /coursebook/modules/m4/data/UKDA-7724-csv/mrdoc/excel/eqls_concordance_grid.xlsx: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/alan-turing-institute/rds-course/057949109330f3b33e0077208bd1fdcd76cbdb79/coursebook/modules/m4/data/UKDA-7724-csv/mrdoc/excel/eqls_concordance_grid.xlsx -------------------------------------------------------------------------------- /coursebook/modules/m4/data/UKDA-7724-csv/mrdoc/pdf/7724_eqls_2007-2011_user_guide_v2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/rds-course/057949109330f3b33e0077208bd1fdcd76cbdb79/coursebook/modules/m4/data/UKDA-7724-csv/mrdoc/pdf/7724_eqls_2007-2011_user_guide_v2.pdf -------------------------------------------------------------------------------- /coursebook/modules/m4/data/UKDA-7724-csv/read7724.htm: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | read7724 6 | 7 | 8 | 9 | 10 | 30 | 31 | 32 | 33 | 34 |

UK DATA ARCHIVE: IMPORTANT STUDY INFORMATION

35 |
36 |

Study Number 7724 - European Quality of Life Time Series, 2007 and 2011: Open Access

37 | 38 |
39 | 40 | 41 | 42 | 43 |

DATA PROCESSING NOTES

44 |
45 | Data Archive Processing Standards 46 | 47 |

The data were processed to the UK Data Archive's A* standard. This is the 48 | Archive's highest standard, and means that an extremely rigorous and 49 | comprehensive series of checks was carried out to ensure the quality of the data 50 | and documentation. Firstly, checks were made that the number of cases and 51 | variables matched the depositor's records. Secondly, checks were made that all 52 | variables had comprehensible variable labels and all nominal (categorical) 53 | variables had comprehensible value labels. Where possible, either with reference 54 | to the documentation and/or in communication with the depositor, labels were 55 | accordingly edited or created. Thirdly, logical checks were performed to ensure 56 | that nominal (categorical) variables had values within the range defined (either 57 | by value labels or in the depositor's documentation). Lastly, any data or 58 | documentation that breached confidentiality rules were altered or suppressed to 59 | preserve anonymity. 60 | 61 |

All notable and/or outstanding problems discovered are detailed under the 'Data 62 | and documentation problems' heading below. 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 |

Data and documentation problems 72 |

None encountered. 73 | 74 | 75 | 76 |

Data conversion information 77 | 78 | 79 |

From January 2003 onwards, almost all data conversions have been performed using software developed by the UK Data Archive. This enables standardisation of the conversion methods and ensures optimal data quality. In addition to its own data processing/conversion code, this software uses the SPSS and StatTransfer command processors to perform certain format translations. Although data conversion is automated, all data files are also subject to visual inspection by a member of the Archive's Data Services team. 80 |

With some format conversions, data, and more especially internal metadata (i.e. variable labels, value labels, missing value definitions, data type information), will inevitably be lost or truncated owing to the differential limits of the proprietary formats. A UK Data Archive Data Dictionary file (generally in Rich Text Format (RTF)) is usually provided for each data file, enabling viewing and searching of the internal metadata as it existed in the originating format. These 81 | files are called: 82 | 83 | [data file name]_UKDA_Data_Dictionary.rtf 84 | 85 |

Important information about the data format supplied 86 | 87 |

The links below provide important information about the Archive's data 88 | supply formats. Some of this information is specific to the ingest 89 | format of the data, i.e. the format in which the Archive received the data 90 | from the depositor. The ingest format for this study was 91 | SPSS 92 |

93 | Please follow the appropriate link below to see information on your chosen supply (download) 94 | format. 95 |

96 |

SPSS (*.sav) 97 |

102 | 103 | STATA (*.dta) 104 | 112 | 113 |
Tab-delimited text (*.tab) 114 | 122 | 123 | 124 |
MS Excel (*.xls/*.xlsx) 125 | 129 | 130 |
SAS (*.sas7bdat and *.sas) 131 | 146 | 147 |
MS Access (*.mdb/*.mdbx) 148 | 152 | 153 | 154 |

Conversion of documentation formats 155 |

The documentation supplied with Archive studies is usually converted to Adobe Portable Document Format (PDF), with documents bookmarked to aid navigation. The vast majority of PDF files are generated from MS Word, RTF, Excel or plain text (.txt) source files, though PDF documentation for older studies in the collection may have been created from scanned paper documents. Occasionally, some documentation cannot be usefully converted to PDF (e.g. MS Excel files with wide worksheets) and this is usually supplied in the original or a more appropriate format. 156 |

157 | 158 | 159 | 160 | -------------------------------------------------------------------------------- /coursebook/modules/m4/hands-on.ipynb: -------------------------------------------------------------------------------- 1 | {"cells": [{"cell_type": "markdown", "metadata": {}, "source": ["# Module 4 hands-on session\n", "\n", "## Description\n", "\n", "In this hands-on session participants are divided in groups of 4 or 5\n", "participants and paired with a helper. Each group should choose one of\n", "the proposed tasks (or propose a new one if keen!) and work together on\n", "it. At the end of the session the students will present their work to\n", "the class.\n", "\n", "The timeline for this session is the following:\n", "\n", "- Phase 1: Presentation of tasks and goals of the hands-on session (15\n", " minutes)\n", "- Phase 2: Groups are formed and students choose task to develop (15\n", " minutes)\n", "- Phase 3: Work on the tasks (2.5 hours + with two 15 m break)\n", "- Phase 4: Group discussion of findings (30 minutes)\n", "- Phase 5: Course wrap-up (30 minutes)\n", "\n", "Students are encouraged to use code developed in the hands-on session\n", "for module 3 and/or the data exploration notebook in Sections 4.3 and\n", "4.4. They can use data from the UK or choose another country. This is\n", "your chance to draw mathematical conclusions from the dataset.\n", "\n", "## Proposed tasks\n", "\n", "1. **Improving the models**: We invite you to try to improve/modify the\n", " models discussed in *Section 4.4*. Some suggestions:\n", "\n", " - Adding new variables and/or [interactions\n", " variables](https://en.wikipedia.org/wiki/Interaction_(statistics))\n", " to the model. Does this new model improve your knowledge?\n", "\n", " - The dataset is very imbalanced (the majority is \u2018good health\u2019).\n", " We have addressed this in sections 4.3/4.4 by changing the\n", " threshold of our p(x) classifier. 
But there are other ways of\n", "    dealing with an imbalanced dataset (for some ideas see\n", "    [here](https://towardsdatascience.com/how-to-deal-with-imbalanced-data-34ab7db9b100)).\n", "    Investigate how some of these change your modelled conclusions.\n", "\n", "2.  **Prediction & Simulation**.\n", "\n", "    -   Logistic regression predicts the mean of a Bernoulli\n", "        distribution. Essentially, you get a generative model for each\n", "        combination of predictor variable values. Have a play with\n", "        simulating data from this Bernoulli distribution to generate a\n", "        new dataset of N people (we do this a bit in 4.1 and 4.3). Does\n", "        our simulated dataset look anything like our real dataset?\n", "    -   Can you visualise how p(x) changes when you change specific\n", "        variables while keeping the others constant?\n", "    -   In the above point we have assumed a single point estimate of\n", "        p(x). But there is uncertainty in our coefficients, and\n", "        therefore uncertainty in our p(x). What if we sample from this\n", "        uncertainty when generating p(x)?\n", "\n", "3.  **Comparative analysis with another country**: Up to now we have\n", "    only looked at the UK, but what happens in other countries? How good\n", "    is the performance if you use a model trained with UK data in\n", "    another country? How different is the model (coefficients,\n", "    performance, etc) trained with data from another country (e.g. Poland\n", "    vs the model of the UK shown in *Section 4.4*). Can you conclude\n", "    that the same factors have different impact between the countries?\n", "    Feel free to compare between any country you\u2019d like.\n", "\n", "We expect you to compare and discuss in detail what you have learned\n", "from these new models and think what would be the answer to the research\n", "question.\n", "\n", "4.  **Imputation**: In module 3 we explored missingness in the data, and\n", "    touched on different ways of dealing with this. 
Here we could\n", "    explore the effect of different methods of imputation. For any\n", "    method of imputation, the critical thing is to compare model output\n", "    on the imputed data with the model output on the unimputed data\n", "    to assess how it changes the conclusions. Some suggestions of\n", "    increasing complexity:\n", "\n", "-   Replacing missing rows with the average of the missing variables.\n", "\n", "-   Sample from a variable\u2019s distribution to fill out the missing rows.\n", "    You could:\n", "\n", "    -   sample with replacement from the empirical values\n", "    -   create a probability estimate of the distribution (e.g.\u00a0kde) and\n", "        sample from that.\n", "    -   something else\u2026\n", "\n", "-   Model the missing variable as dependent on present variables. You\n", "    could apply our generalised regression framework: pick potential\n", "    predictors, select your distribution for the residuals, see if you\n", "    want a link function other than the identity function.\n", "\n", "## Final discussion session\n", "\n", "Group discussion with the following points. We don\u2019t have \u201cright\n", "answers\u201d for this discussion.\n", "\n", "1.  If the research question asks for an overall assessment of all\n", "    Europe, how do we appropriately combine the models?\n", "\n", "2.  Is there a better way of modelling the dataset given the research\n", "    question?\n", "\n", "3.  After everything you\u2019ve done in all the hands-on sessions, what\n", "    would be your answer to the research question? 
What else has to be\n", " done?"], "id": "37f0d8de-f7e5-435d-a8ec-43c49eeeb6cd"}], "nbformat": 4, "nbformat_minor": 5, "metadata": {}} -------------------------------------------------------------------------------- /coursebook/modules/m4/overview.ipynb: -------------------------------------------------------------------------------- 1 | {"cells": [{"cell_type": "markdown", "id": "f7ab275c-1750-46cf-9740-10162e827152", "metadata": {}, "source": ["# Overview\n", "\n", "The key goal of research data science is to learn from data. One of the\n", "most powerful methods of learning from data is **statistical\n", "modelling**.\n", "\n", "We demystify the key concepts involved through applying simple models (linear and logistic regression). The intended take-homes can be applied to any modelling problem.\n", "\n", "The module is structured as follows:\n", "\n", "- **The what and why of statistical modelling**. We begin by defining\n", " what modelling is and motivating the power of modelling.\n", "- **Fitting models**. Here we go through the components of a model,\n", " including describing how to fit one to data.\n", "- **Building a simple model**. We then carefully build a model based\n", " on the understanding of our data, taking care to understand the\n", " model.\n", "- **Evaluation a model**. It is not enough to have a model that is\n", " fitted to your data. The model has to be useful. 
The final section\n", " will cover how to evaluate your model and iteratively improve upon\n", " your model.\n", "\n", "**References:**\n", "\n", "We will include more specific references as we move through the module.\n", "But useful accessible introductions to modelling that has inspired much\n", "of this module\u2019s content are Poldrack\u2019s [Statistical Thinking for the\n", "21st\n", "Century](https://web.stanford.edu/group/poldracklab/statsthinking21/index.html),\n", "Holmes and Huber\u2019s [Modern Statistics for Modern\n", "Biology](https://web.stanford.edu/class/bios221/book/Chap-Models.html),\n", "as well as the introductory sections of Richard McElreath\u2019s wonderfully\n", "readable [Statistical\n", "Rethinking](https://xcelab.net/rm/statistical-rethinking/) and Bishop\u2019s\n", "classic [Machine Learning for Pattern\n", "Recognition](http://users.isr.ist.utl.pt/~wurmd/Livros/school/Bishop%20-%20Pattern%20Recognition%20And%20Machine%20Learning%20-%20Springer%20%202006.pdf)\n", "textbook."]}], "metadata": {"kernelspec": {"display_name": "Python 3.10.4 64-bit", "language": "python", "name": "python3"}, "language_info": {"name": "python", "version": "3.10.6"}, "vscode": {"interpreter": {"hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49"}}}, "nbformat": 4, "nbformat_minor": 5} -------------------------------------------------------------------------------- /documentation/delivery_tips.md: -------------------------------------------------------------------------------- 1 | # TODO -------------------------------------------------------------------------------- /documentation/developer_instructions.md: -------------------------------------------------------------------------------- 1 | # TODO -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "rds-course" 3 | packages = [ 4 | 
{ include = "coursebook" }, 5 | ] 6 | version = "0.1.0" 7 | description = "" 8 | authors = ["ChristinaLast ", "James Bishop ", "Jack Roberts "] 9 | 10 | [tool.poetry.dependencies] 11 | python = ">=3.7.1,<3.11" 12 | numpy = "^1.21.6" 13 | pandas = "^1.3.5" 14 | scikit-learn = "^1.0.2" 15 | matplotlib = "^3.5.2" 16 | seaborn = "^0.11.2" 17 | jupyterlab = "^3.4.2" 18 | missingno = "^0.5.1" 19 | opencv-python = "^4.5.5" 20 | requests = "^2.27.1" 21 | statsmodels = "^0.13.2" 22 | plotly = "^5.8.0" 23 | graphviz = "^0.20" 24 | scipy = "^1.7.3" 25 | jupyter-book = "^0.12.3" 26 | openpyxl = "^3.0.9" 27 | jsonschema = "^3.2.0" 28 | 29 | [build-system] 30 | requires = ["poetry-core>=1.0.0"] 31 | build-backend = "poetry.core.masonry.api" 32 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | alabaster==0.7.12; python_version >= "3.7" 2 | anyio==3.6.1; python_full_version >= "3.6.2" and python_version >= "3.7" 3 | appnope==0.1.3; python_version >= "3.7" 4 | argon2-cffi-bindings==21.2.0; python_version >= "3.7" 5 | argon2-cffi==21.3.0; python_version >= "3.7" 6 | attrs==21.4.0; python_version >= "3.7" and python_full_version < "3.0.0" and python_version < "4.0" or python_full_version >= "3.5.0" and python_version >= "3.7" and python_version < "4.0" 7 | babel==2.10.1; python_version >= "3.7" 8 | backcall==0.2.0; python_version >= "3.7" 9 | beautifulsoup4==4.11.1; python_full_version >= "3.6.0" and python_version >= "3.7" 10 | bleach==5.0.0; python_version >= "3.7" 11 | certifi==2021.10.8; python_version >= "3.7" and python_full_version < "3.0.0" or python_full_version >= "3.6.0" and python_version >= "3.7" 12 | cffi==1.15.0; implementation_name == "pypy" and python_version >= "3.7" 13 | charset-normalizer==2.0.12; python_full_version >= "3.6.0" and python_version >= "3.7" 14 | click==8.1.3; python_version >= "3.7" and python_version < "4.0" 15 | 
colorama==0.4.4; python_version >= "3.7" and python_full_version < "3.0.0" and sys_platform == "win32" and platform_system == "Windows" or sys_platform == "win32" and python_version >= "3.7" and python_full_version >= "3.5.0" and platform_system == "Windows" 16 | cycler==0.11.0; python_version >= "3.7" 17 | debugpy==1.6.0; python_version >= "3.7" 18 | decorator==5.1.1; python_version >= "3.7" 19 | defusedxml==0.7.1; python_version >= "3.7" and python_full_version < "3.0.0" or python_full_version >= "3.5.0" and python_version >= "3.7" 20 | docutils==0.16; python_version >= "3.7" and python_full_version < "3.0.0" or python_full_version >= "3.5.0" and python_version >= "3.7" 21 | entrypoints==0.4; python_version >= "3.7" 22 | et-xmlfile==1.1.0; python_version >= "3.6" 23 | fastjsonschema==2.15.3; python_version >= "3.7" 24 | fonttools==4.33.3; python_version >= "3.7" 25 | gitdb==4.0.9; python_version >= "3.7" 26 | gitpython==3.1.27; python_version >= "3.7" 27 | graphviz==0.20; python_version >= "3.7" 28 | greenlet==1.1.2; python_version >= "3" and python_full_version < "3.0.0" and (platform_machine == "aarch64" or platform_machine == "ppc64le" or platform_machine == "x86_64" or platform_machine == "amd64" or platform_machine == "AMD64" or platform_machine == "win32" or platform_machine == "WIN32") and (python_version >= "3.7" and python_full_version < "3.0.0" or python_full_version >= "3.6.0" and python_version >= "3.7") or python_version >= "3" and (platform_machine == "aarch64" or platform_machine == "ppc64le" or platform_machine == "x86_64" or platform_machine == "amd64" or platform_machine == "AMD64" or platform_machine == "win32" or platform_machine == "WIN32") and (python_version >= "3.7" and python_full_version < "3.0.0" or python_full_version >= "3.6.0" and python_version >= "3.7") and python_full_version >= "3.5.0" 29 | idna==3.3; python_full_version >= "3.6.2" and python_version >= "3.7" and (python_version >= "3.7" and python_full_version < "3.0.0" or 
python_full_version >= "3.6.0" and python_version >= "3.7") 30 | imagesize==1.3.0; python_version >= "3.7" and python_full_version < "3.0.0" or python_full_version >= "3.4.0" and python_version >= "3.7" 31 | importlib-metadata==4.11.3; python_version < "3.8" and python_version >= "3.7" and (python_version >= "3.7" and python_full_version < "3.0.0" and python_version < "3.8" or python_full_version >= "3.6.0" and python_version >= "3.7" and python_version < "3.8") 32 | importlib-resources==5.7.1; python_version < "3.9" and python_version >= "3.7" 33 | ipykernel==6.13.0; python_version >= "3.7" 34 | ipython-genutils==0.2.0; python_version >= "3.7" 35 | ipython==7.33.0; python_version >= "3.7" 36 | ipywidgets==7.7.0; python_version >= "3.7" 37 | jedi==0.18.1; python_version >= "3.7" 38 | jinja2==3.0.3; python_version >= "3.7" 39 | joblib==1.1.0; python_version >= "3.7" 40 | json5==0.9.8; python_version >= "3.7" 41 | jsonschema==3.2.0; python_version >= "3.7" 42 | jupyter-book==0.12.3; python_version >= "3.7" 43 | jupyter-cache==0.4.3; python_version >= "3.7" 44 | jupyter-client==7.3.1; python_full_version >= "3.7.0" and python_version >= "3.7" 45 | jupyter-core==4.10.0; python_version >= "3.7" 46 | jupyter-server-mathjax==0.2.5; python_version >= "3.7" 47 | jupyter-server==1.17.0; python_version >= "3.7" 48 | jupyter-sphinx==0.3.2; python_version >= "3.7" 49 | jupyterlab-pygments==0.2.2; python_version >= "3.7" 50 | jupyterlab-server==2.13.0; python_version >= "3.7" 51 | jupyterlab-widgets==1.1.0; python_version >= "3.7" 52 | jupyterlab==3.4.2; python_version >= "3.7" 53 | kiwisolver==1.4.2; python_version >= "3.7" 54 | latexcodec==2.0.1; python_version >= "3.7" and python_full_version < "3.0.0" or python_full_version >= "3.4.0" and python_version >= "3.7" 55 | linkify-it-py==1.0.3; python_version >= "3.7" 56 | markdown-it-py==1.1.0; python_version >= "3.7" and python_version < "4.0" 57 | markupsafe==2.1.1; python_version >= "3.7" 58 | matplotlib-inline==0.1.3; 
python_version >= "3.7" 59 | matplotlib==3.5.2; python_version >= "3.7" 60 | mdit-py-plugins==0.2.8; python_version >= "3.7" and python_version < "4.0" 61 | missingno==0.5.1 62 | mistune==0.8.4; python_version >= "3.7" 63 | myst-nb==0.13.2; python_version >= "3.7" 64 | myst-parser==0.15.2; python_version >= "3.7" 65 | nbclassic==0.3.7; python_version >= "3.7" 66 | nbclient==0.5.13; python_full_version >= "3.7.0" and python_version >= "3.7" 67 | nbconvert==6.5.0; python_version >= "3.7" 68 | nbdime==3.1.1; python_version >= "3.7" 69 | nbformat==5.4.0; python_full_version >= "3.7.0" and python_version >= "3.7" 70 | nest-asyncio==1.5.5; python_full_version >= "3.7.0" and python_version >= "3.7" 71 | notebook-shim==0.1.0; python_version >= "3.7" 72 | notebook==6.4.11; python_version >= "3.7" 73 | numpy==1.21.6; python_version >= "3.7" and python_version < "3.11" 74 | opencv-python==4.5.5.64; python_version >= "3.6" 75 | openpyxl==3.0.9; python_version >= "3.6" 76 | packaging==21.3; python_version >= "3.7" 77 | pandas==1.3.5; python_full_version >= "3.7.1" 78 | pandocfilters==1.5.0; python_version >= "3.7" and python_full_version < "3.0.0" or python_full_version >= "3.4.0" and python_version >= "3.7" 79 | parso==0.8.3; python_version >= "3.7" 80 | patsy==0.5.2; python_version >= "3.7" 81 | pexpect==4.8.0; sys_platform != "win32" and python_version >= "3.7" 82 | pickleshare==0.7.5; python_version >= "3.7" 83 | pillow==9.1.1; python_version >= "3.7" 84 | plotly==5.8.0; python_version >= "3.6" 85 | prometheus-client==0.14.1; python_version >= "3.7" 86 | prompt-toolkit==3.0.29; python_full_version >= "3.6.2" and python_version >= "3.7" 87 | psutil==5.9.0; python_version >= "3.7" and python_full_version < "3.0.0" or python_full_version >= "3.4.0" and python_version >= "3.7" 88 | ptyprocess==0.7.0; sys_platform != "win32" and python_version >= "3.7" and os_name != "nt" 89 | py==1.11.0; python_version >= "3.7" and python_full_version < "3.0.0" and implementation_name == "pypy" 
or implementation_name == "pypy" and python_version >= "3.7" and python_full_version >= "3.5.0" 90 | pybtex-docutils==1.0.1; python_version >= "3.7" 91 | pybtex==0.24.0; python_version >= "3.7" and python_full_version < "3.0.0" or python_full_version >= "3.3.0" and python_version >= "3.7" 92 | pycparser==2.21; python_version >= "3.7" and python_full_version < "3.0.0" and implementation_name == "pypy" or implementation_name == "pypy" and python_version >= "3.7" and python_full_version >= "3.4.0" 93 | pydata-sphinx-theme==0.7.2; python_version >= "3.7" 94 | pygments==2.12.0; python_version >= "3.7" 95 | pyparsing==3.0.9; python_full_version >= "3.6.8" and python_version >= "3.7" 96 | pyrsistent==0.18.1; python_version >= "3.7" 97 | python-dateutil==2.8.2; python_full_version >= "3.7.1" and python_version >= "3.7" and (python_version >= "3.7" and python_full_version < "3.0.0" or python_full_version >= "3.3.0" and python_version >= "3.7") 98 | pytz==2022.1; python_full_version >= "3.7.1" and python_version >= "3.7" 99 | pywin32==304; sys_platform == "win32" and platform_python_implementation != "PyPy" and python_version >= "3.7" 100 | pywinpty==2.0.5; os_name == "nt" and python_version >= "3.7" 101 | pyyaml==6.0; python_version >= "3.7" and python_version < "4.0" and (python_version >= "3.7" and python_full_version < "3.0.0" or python_full_version >= "3.3.0" and python_version >= "3.7") 102 | pyzmq==22.3.0; python_version >= "3.7" 103 | requests==2.27.1; (python_version >= "2.7" and python_full_version < "3.0.0") or (python_full_version >= "3.6.0") 104 | scikit-learn==1.0.2; python_version >= "3.7" 105 | scipy==1.7.3; python_version >= "3.7" and python_version < "3.11" 106 | seaborn==0.11.2; python_version >= "3.6" 107 | send2trash==1.8.0; python_version >= "3.7" 108 | setuptools-scm==6.4.2; python_version >= "3.7" 109 | six==1.16.0; python_full_version >= "3.7.1" and python_version >= "3.7" and (python_version >= "3.7" and python_full_version < "3.0.0" or 
python_full_version >= "3.3.0" and python_version >= "3.7") and (python_version >= "3.7" and python_full_version < "3.0.0" or python_full_version >= "3.4.0" and python_version >= "3.7") 110 | smmap==5.0.0; python_version >= "3.7" 111 | sniffio==1.2.0; python_full_version >= "3.6.2" and python_version >= "3.7" 112 | snowballstemmer==2.2.0; python_version >= "3.7" 113 | soupsieve==2.3.2.post1; python_full_version >= "3.6.0" and python_version >= "3.7" 114 | sphinx-book-theme==0.1.10; python_version >= "3.7" 115 | sphinx-comments==0.0.3; python_version >= "3.7" 116 | sphinx-copybutton==0.5.0; python_version >= "3.7" 117 | sphinx-external-toc==0.2.4; python_version >= "3.7" and python_version < "4.0" 118 | sphinx-jupyterbook-latex==0.4.6; python_version >= "3.7" 119 | sphinx-multitoc-numbering==0.1.3; python_version >= "3.7" 120 | sphinx-panels==0.6.0; python_version >= "3.7" 121 | sphinx-thebe==0.1.2; python_version >= "3.7" 122 | sphinx-togglebutton==0.3.1; python_version >= "3.7" 123 | sphinx==4.5.0; python_version >= "3.7" and python_version < "4.0" 124 | sphinxcontrib-applehelp==1.0.2; python_version >= "3.7" 125 | sphinxcontrib-bibtex==2.4.2; python_version >= "3.7" 126 | sphinxcontrib-devhelp==1.0.2; python_version >= "3.7" 127 | sphinxcontrib-htmlhelp==2.0.0; python_version >= "3.7" 128 | sphinxcontrib-jsmath==1.0.1; python_version >= "3.7" 129 | sphinxcontrib-qthelp==1.0.3; python_version >= "3.7" 130 | sphinxcontrib-serializinghtml==1.1.5; python_version >= "3.7" 131 | sqlalchemy==1.4.36; python_version >= "3.7" and python_full_version < "3.0.0" or python_full_version >= "3.6.0" and python_version >= "3.7" 132 | statsmodels==0.13.2; python_version >= "3.7" 133 | tenacity==8.0.1; python_version >= "3.6" 134 | terminado==0.15.0; python_version >= "3.7" 135 | threadpoolctl==3.1.0; python_version >= "3.7" 136 | tinycss2==1.1.1; python_version >= "3.7" 137 | tomli==2.0.1; python_version >= "3.7" 138 | tornado==6.1; python_version >= "3.7" 139 | 
traitlets==5.2.1.post0; python_full_version >= "3.7.0" and python_version >= "3.7" 140 | typing-extensions==4.2.0; python_version < "3.8" and python_version >= "3.7" and python_full_version >= "3.6.2" 141 | uc-micro-py==1.0.1; python_version >= "3.7" 142 | urllib3==1.26.9; python_version >= "3.7" and python_full_version < "3.0.0" or python_full_version >= "3.6.0" and python_version < "4" and python_version >= "3.7" 143 | wcwidth==0.2.5; python_full_version >= "3.6.2" and python_version >= "3.7" 144 | webencodings==0.5.1; python_version >= "3.7" 145 | websocket-client==1.3.2; python_version >= "3.7" 146 | widgetsnbextension==3.6.0; python_version >= "3.7" 147 | zipp==3.8.0; python_version < "3.8" and python_version >= "3.7" 148 | --------------------------------------------------------------------------------