├── .gitignore ├── CHANGELOG.md ├── CITATION.cff ├── Dockerfile ├── LICENSE ├── README.md ├── _config.yml ├── _toc.yml ├── binder └── environment.yml ├── content ├── 001_setup │ ├── conda.md │ ├── contributing.md │ ├── docker.md │ ├── git.md │ ├── install.md │ ├── prereq.ipynb │ └── prereq │ │ ├── 01_list_comp.ipynb │ │ ├── 02_dicts.ipynb │ │ ├── 04_scope.ipynb │ │ └── 05_pep8.ipynb ├── 01_algorithms │ ├── 01_design.ipynb │ ├── 01_design │ │ ├── 01_primes.ipynb │ │ ├── 02_better_design.ipynb │ │ ├── 03_micro_opt.ipynb │ │ └── 04_summing_up.ipynb │ ├── 02_oop.ipynb │ ├── 02_oop │ │ ├── 01_python_classes.ipynb │ │ ├── 02_oop_sim.ipynb │ │ ├── 03_oop_cv.ipynb │ │ ├── numpy_cv.py │ │ └── text_adventure │ │ │ ├── __init__.py │ │ │ ├── advanced_game.py │ │ │ └── basic_game.py │ ├── 03_numpy.ipynb │ ├── 03_numpy │ │ ├── 01_performance.ipynb │ │ ├── 02_vectors.ipynb │ │ ├── 03_slicing.ipynb │ │ ├── 04_algebra.ipynb │ │ ├── 05_statistics.ipynb │ │ ├── 05a_regression.ipynb │ │ ├── 06_sampling.ipynb │ │ ├── 07_advanced_iter.ipynb │ │ ├── 08_cs1.ipynb │ │ ├── 09_cs2.ipynb │ │ ├── 10_cs3.ipynb │ │ └── data │ │ │ ├── lysis.csv │ │ │ ├── minor_illness_ed_attends.csv │ │ │ └── wisconsin.csv │ ├── 04_exercises.ipynb │ ├── 04_exercises │ │ ├── 01_science_funcs.ipynb │ │ ├── 02_array.ipynb │ │ ├── 02_basic_oop.ipynb │ │ ├── 04_numpy_stats.ipynb │ │ ├── big_special_str.txt │ │ ├── breach.csv │ │ ├── data │ │ │ ├── bank_arrivals.csv │ │ │ ├── breach.csv │ │ │ ├── dtocs.csv │ │ │ ├── lysis.csv │ │ │ ├── moviedb.csv │ │ │ └── pieces │ │ │ │ ├── p1.csv │ │ │ │ ├── p10.csv │ │ │ │ ├── p2.csv │ │ │ │ ├── p3.csv │ │ │ │ ├── p4.csv │ │ │ │ ├── p5.csv │ │ │ │ ├── p6.csv │ │ │ │ ├── p7.csv │ │ │ │ ├── p8.csv │ │ │ │ └── p9.csv │ │ ├── dtocs.csv │ │ ├── ex_templates │ │ │ ├── ex1_quickstart.py │ │ │ ├── ex2_quickstart.py │ │ │ └── lab4_debug_challenge.py │ │ └── im │ │ │ ├── all_overlap.png │ │ │ ├── brb_sol.png │ │ │ ├── one_piece.PNG │ │ │ ├── only_one_piece.png │ │ │ ├── outline_pane.PNG │ │ 
│ └── valid_layout.png │ ├── 05_debug.md │ ├── 05_debug │ │ ├── 00_debug_cv.py │ │ ├── 01_debug_numpy.md │ │ └── debug_numpy_py.py │ ├── 06_solutions.md │ ├── 06_solutions │ │ ├── 01_science_funcs.ipynb │ │ ├── 02_array.ipynb │ │ ├── 02_basic_numpy.ipynb │ │ ├── 02_basic_oop.ipynb │ │ ├── 04_numpy_stats.ipynb │ │ ├── Untitled.ipynb │ │ ├── big_special_str.txt │ │ └── data │ │ │ ├── bank_arrivals.csv │ │ │ ├── breach.csv │ │ │ ├── dtocs.csv │ │ │ ├── lysis.csv │ │ │ └── moviedb.csv │ ├── data │ │ ├── hist.csv │ │ ├── minor_illness_ed_attends.csv │ │ ├── salaries.csv │ │ └── salaries_extended.csv │ └── im │ │ ├── gsearch.PNG │ │ ├── salaries.PNG │ │ └── salaries_extended.PNG ├── 02_stat_prog │ ├── 01_pandas │ │ ├── 01_intro_pandas.ipynb │ │ ├── 02_files.ipynb │ │ ├── 03_non_standard_download.ipynb │ │ ├── 04_datetimes.ipynb │ │ ├── 05_analysing.ipynb │ │ └── 06_cs_combining.ipynb │ ├── 01_pandas_front_page.md │ ├── 02_matplotlib │ │ ├── 01_matplotlib.ipynb │ │ ├── 02_matplotlib2.ipynb │ │ ├── 02_plotting_time_series.ipynb │ │ ├── 03_cs_hm.ipynb │ │ ├── explore.png │ │ └── stacked.png │ ├── 02_visual_front_page.md │ ├── 03_exercises │ │ ├── 00_dataframes.ipynb │ │ ├── 01_data_wrangling_matplotlib.ipynb │ │ ├── 02_stroke_data_wrangling.ipynb │ │ ├── 03_visualise_ts.ipynb │ │ ├── data │ │ │ ├── di_counts.csv │ │ │ ├── di_rq_to_test.csv │ │ │ ├── di_test_to_report.csv │ │ │ ├── sw_imaging.csv │ │ │ ├── synth_lysis.csv │ │ │ └── total_referrals.csv │ │ └── hosp_1_ed.png │ ├── 03_exercises_front_page.md │ ├── 04_solutions │ │ ├── 00_dataframes.ipynb │ │ ├── 01_data_wrangling_matplotlib_solutions.ipynb │ │ ├── 02_stroke_data_wrangling_solutions.ipynb │ │ ├── 03_visualise_ts_SOLUTIONS.ipynb │ │ └── total_referrals.csv │ └── 04_solutions_front_page.md ├── 03_mgt │ ├── 01_git │ │ ├── 01_why.md │ │ ├── 02_git.md │ │ ├── 03_cs_1.md │ │ ├── 04_cs_2.md │ │ └── 05_cs_3.md │ ├── 02_packaging │ │ ├── 01_python_packages.ipynb │ │ ├── example.ipynb │ │ ├── my_package_name │ │ │ ├── 
__init__.py │ │ │ ├── datasets.py │ │ │ ├── package_data │ │ │ │ ├── example_datset_1.csv │ │ │ │ └── example_datset_2.csv │ │ │ └── plotting.py │ │ └── ts_emergency │ │ │ ├── __init__.py │ │ │ ├── data │ │ │ ├── .~lock.ts_ed.csv# │ │ │ ├── syn_ts_ed_long.csv │ │ │ └── syn_ts_ed_wide.csv │ │ │ ├── datasets.py │ │ │ └── plotting.py │ ├── 03_mgt_front_page.md │ ├── 03_pypi │ │ ├── 01_local.md │ │ ├── 02_github.md │ │ ├── 03_pypi.md │ │ ├── 04_automation.md │ │ ├── LICENSE │ │ ├── MANIFEST.in │ │ ├── environment.yml │ │ ├── requirements.txt │ │ ├── setup.py │ │ └── test_package │ │ │ ├── __init__.py │ │ │ ├── data │ │ │ └── test_data.csv │ │ │ └── test.py │ ├── 03_vc_front_page.md │ ├── 04_binder │ │ └── 01_binder.md │ ├── 04_exercises │ │ ├── 01_python_packages.ipynb │ │ ├── 02_conda.ipynb │ │ ├── 02_use_conda.md │ │ ├── 03_binder.md │ │ └── im │ │ │ ├── detrended.jpg │ │ │ └── diag.jpg │ ├── 04_exercises_front_page.md │ ├── 05_solutions │ │ ├── 01_python_packages_solutions.ipynb │ │ ├── im │ │ │ ├── detrended.jpg │ │ │ └── diag.jpg │ │ └── ts_emergency │ │ │ ├── __init__.py │ │ │ ├── data │ │ │ ├── .~lock.ts_ed.csv# │ │ │ ├── syn_ts_ed_long.csv │ │ │ └── syn_ts_ed_wide.csv │ │ │ ├── datasets.py │ │ │ └── plotting │ │ │ ├── __init__.py │ │ │ ├── tsa.py │ │ │ └── view.py │ └── 05_solutions_front_page.md ├── appendix │ ├── acknowledge.md │ ├── fp_lectures.md │ ├── fp_practicals.md │ ├── labs │ │ ├── 01_basics.ipynb │ │ ├── 02_basics.ipynb │ │ ├── debug1.md │ │ ├── debug2.md │ │ └── src │ │ │ ├── cinema_exercise.py │ │ │ ├── list_comprehensions.py │ │ │ ├── moviedb.csv │ │ │ ├── py_finance.py │ │ │ ├── string_manipulation.py │ │ │ ├── test_finance.py │ │ │ ├── week1_debug_challenge1.py │ │ │ └── wk2_debug_challenge.py │ └── lectures │ │ ├── Lecture1.ipynb │ │ └── Lecture2.ipynb ├── front_page.md └── imgs │ ├── logo_v1.png │ ├── package_versus_project.drawio │ ├── package_versus_project.png │ ├── small_logo.png │ ├── title.odg │ ├── title_cropped.png │ ├── 
title_cropped.png~ │ └── title_logo.png └── images ├── binder_1.png ├── binder_2.png ├── book_small_logo.svg ├── book_title_page_log.svg ├── detrended.jpg ├── diag.jpg ├── release.png ├── test_pypi2.png ├── test_pypi3.png └── testpypi.png /.gitignore: -------------------------------------------------------------------------------- 1 | #Jupyter book build 2 | _build/ 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | pip-wheel-metadata/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | db.sqlite3 65 | db.sqlite3-journal 66 | 67 | # Flask stuff: 68 | instance/ 69 | .webassets-cache 70 | 71 | # Scrapy stuff: 72 | .scrapy 73 | 74 | # Sphinx documentation 75 | docs/_build/ 76 | 77 | # PyBuilder 78 | target/ 79 | 80 | # Jupyter Notebook 81 | .ipynb_checkpoints 82 | 83 | # IPython 84 | profile_default/ 85 | ipython_config.py 86 | 87 | # pyenv 88 | .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All notable changes to this project will be documented in this file. 4 | 5 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), 6 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). Dates formatted as YYYY-MM-DD as per [ISO standard](https://www.iso.org/iso-8601-date-and-time-format.html). 7 | 8 | Consistent identifier (represents all versions, resolves to latest): [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.10026326.svg)](https://doi.org/10.5281/zenodo.10026326) 9 | 10 | ## [v3.0.0]() UNRELEASED 11 | 12 | ### Changed 13 | 14 | * ENV: updated to python 3.11 and added linting packages and `hatch` for packaging. 15 | * ENV: upgraded packages to latest as of Jul 2024 including numpy > 2.0. Tested all numpy notebooks for compatibility. 
16 | * Coding Scientific functions exercises: Added new exercise 3 that implements $W_q$ and $P_n$ from a M/M/1 queue. 17 | * Updated python packaging section. Retired `setuptools` approach in favour of `hatch`. Split sections on installable packages, github install, PyPI install and automation. 18 | * Removed redundant numpy notebooks from old intro python course. 19 | * Removed `seaborn` dependency from visualise time series exercise. 20 | * Minor typo and sentence fixes. 21 | 22 | ### 23 | 24 | ## 2.0.1 (2023-09-25) 25 | 26 | ### Release Highlights 27 | 28 | * Patched conda install instructions 29 | * Added contributors 30 | * Added `CHANGELOG.md` 31 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | # This CITATION.cff file was generated with cffinit. 2 | # Visit https://bit.ly/cffinit to generate yours today! 3 | 4 | cff-version: 1.2.0 5 | title: 'Python for health data science: a hands-on introduction' 6 | message: >- 7 | If you use this software, please cite it using the 8 | metadata from this file. 
9 | type: software 10 | authors: 11 | - given-names: Thomas 12 | family-names: Monks 13 | affiliation: 'University of Exeter ' 14 | orcid: 'https://orcid.org/0000-0003-2631-4481' 15 | identifiers: 16 | - type: doi 17 | value: 10.5281/zenodo.7107920 18 | description: v2.0.0 19 | repository-code: 'https://github.com/health-data-science-OR/coding-for-ml' 20 | url: 'https://www.pythonhealthdatascience.com' 21 | keywords: 22 | - Python 23 | - Health 24 | - Data Science 25 | license: MIT 26 | version: 2.0.0 27 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:20.04 2 | 3 | #install wget 4 | RUN apt-get update \ 5 | && apt-get install -y wget \ 6 | && rm -rf /var/lib/apt/lists/* 7 | 8 | #get the latest version of miniconda 9 | RUN wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh 10 | 11 | # install in batch (silent) mode, does not edit PATH or .bashrc or .bash_profile 12 | # -p path 13 | # -f force 14 | RUN bash Miniconda3-latest-Linux-x86_64.sh -b 15 | 16 | ENV PATH=/root/miniconda3/bin:${PATH} 17 | 18 | # cleanup 19 | RUN rm Miniconda3-latest-Linux-x86_64.sh 20 | 21 | #update conda 22 | RUN conda update -y conda 23 | 24 | #create directory for code. 25 | RUN mkdir /home/code 26 | 27 | #set working directory. 28 | WORKDIR /home/code 29 | 30 | # Copy all files across to container 31 | COPY . /home/code 32 | 33 | # Install anaconda, conda-forge and pip dependencies the clean up. 
34 | RUN conda env create -f binder/environment.yml && conda clean -afy 35 | 36 | #open a port 37 | EXPOSE 80 38 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 health-data-science-OR 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.8377497.svg)](https://doi.org/10.5281/zenodo.8377497) 2 | 3 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/health-data-science-OR/coding-for-ml/HEAD) 4 | 5 | # Python for health data science: a hands-on introduction 6 | 7 | Learning materials for Coding for Machine Learning and Data Science 8 | 9 | This material is also available as a [Jupyter Book](https://health-data-science-or.github.io/coding-for-ml) 10 | 11 | ## Installing the virtual environment 12 | 13 | Details of a conda virtual environment is available in `binder/environment.yml` 14 | 15 | 1. Clone the repo 16 | 2. Navigate to the repo in a terminal (Mac/Linux) or anaconda prompt (Windows) 17 | 3. Create the virtual environment 18 | * `conda env create -f binder/environment.yml` 19 | 20 | 4. Activate the environment 21 | * `conda activate hds_code` 22 | 23 | 5. Launch Jupyter-lab to edit and run code 24 | * `jupyter-lab` 25 | 26 | ## Citation 27 | 28 | Please cite using the zenodo link. LaTex is: 29 | 30 | ``` 31 | @software{monks_thomas_2023_8377497, 32 | author = {Monks, Thomas}, 33 | title = {{Python for health data science: a hands-on 34 | introduction}}, 35 | month = sep, 36 | year = 2023, 37 | note = {{If you use this software, please cite it using the 38 | metadata from this file.}}, 39 | publisher = {Zenodo}, 40 | version = {v2.0.1}, 41 | doi = {10.5281/zenodo.8377497}, 42 | url = {https://doi.org/10.5281/zenodo.8377497} 43 | } 44 | ``` -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | # book settings 2 | title: Python for health data science. 
3 | author: Thomas Monks 4 | email: t.m.w.monks@exeter.ac.uk 5 | description: >- # this means to ignore newlines until "baseurl:" 6 | Learning materials for Coding for Machine Learning and Data Science. 7 | logo: content/imgs/small_logo.png 8 | #logo: content/imgs/logo_v1.png 9 | 10 | # only build files specified in table of contents file 11 | only_build_toc_files: true 12 | 13 | execute: 14 | execute_notebooks: off #cache output from .ipynb files for faster build 15 | timeout: -1 #no time restriction on notebook execution 16 | 17 | repository: 18 | url: https://github.com/health-data-science-OR/coding-for-ml 19 | branch: main 20 | 21 | html: 22 | use_repository_button: true 23 | use_issues_button: true 24 | 25 | # Configure your Binder links, such as the URL of the BinderHub. 26 | launch_buttons: 27 | binderhub_url: "https://mybinder.org" 28 | colab_url: "https://colab.research.google.com" 29 | notebook_interface: "jupyterlab" 30 | thebe: true 31 | -------------------------------------------------------------------------------- /_toc.yml: -------------------------------------------------------------------------------- 1 | format: jb-book 2 | root: content/front_page 3 | parts: 4 | - caption: Introduction 5 | chapters: 6 | - file: content/001_setup/contributing 7 | - file: content/001_setup/install 8 | - file: content/001_setup/git 9 | - file: content/001_setup/conda 10 | - file: content/001_setup/prereq 11 | sections: 12 | - file: content/001_setup/prereq/01_list_comp 13 | - file: content/001_setup/prereq/02_dicts 14 | - file: content/001_setup/prereq/04_scope 15 | - file: content/001_setup/prereq/05_pep8 16 | - caption: Algorithms and computational modelling 17 | chapters: 18 | - file: content/01_algorithms/01_design 19 | sections: 20 | - file: content/01_algorithms/01_design/01_primes 21 | - file: content/01_algorithms/01_design/02_better_design 22 | - file: content/01_algorithms/01_design/03_micro_opt 23 | - file: content/01_algorithms/01_design/04_summing_up 
24 | - file: content/01_algorithms/02_oop 25 | sections: 26 | - file: content/01_algorithms/02_oop/01_python_classes 27 | - file: content/01_algorithms/02_oop/02_oop_sim 28 | - file: content/01_algorithms/02_oop/03_oop_cv 29 | - file: content/01_algorithms/03_numpy 30 | sections: 31 | - file: content/01_algorithms/03_numpy/01_performance 32 | - file: content/01_algorithms/03_numpy/02_vectors 33 | - file: content/01_algorithms/03_numpy/03_slicing 34 | - file: content/01_algorithms/03_numpy/04_algebra 35 | - file: content/01_algorithms/03_numpy/05_statistics 36 | - file: content/01_algorithms/03_numpy/05a_regression 37 | - file: content/01_algorithms/03_numpy/06_sampling 38 | - file: content/01_algorithms/03_numpy/07_advanced_iter 39 | - file: content/01_algorithms/03_numpy/08_cs1 40 | - file: content/01_algorithms/03_numpy/09_cs2 41 | - file: content/01_algorithms/03_numpy/10_cs3 42 | - file: content/01_algorithms/04_exercises 43 | sections: 44 | - file: content/01_algorithms/04_exercises/01_science_funcs 45 | - file: content/01_algorithms/04_exercises/02_basic_oop 46 | - file: content/01_algorithms/04_exercises/02_array 47 | - file: content/01_algorithms/04_exercises/04_numpy_stats 48 | - file: content/01_algorithms/05_debug 49 | sections: 50 | - file: content/01_algorithms/05_debug/01_debug_numpy 51 | - file: content/01_algorithms/06_solutions 52 | sections: 53 | - file: content/01_algorithms/06_solutions/01_science_funcs 54 | - file: content/01_algorithms/06_solutions/02_basic_oop 55 | - file: content/01_algorithms/06_solutions/02_array 56 | - file: content/01_algorithms/06_solutions/04_numpy_stats 57 | - caption: Statistical Programming 58 | chapters: 59 | - file: content/02_stat_prog/01_pandas_front_page 60 | sections: 61 | - file: content/02_stat_prog/01_pandas/01_intro_pandas 62 | - file: content/02_stat_prog/01_pandas/02_files 63 | - file: content/02_stat_prog/01_pandas/03_non_standard_download 64 | - file: content/02_stat_prog/01_pandas/04_datetimes 65 | - 
file: content/02_stat_prog/01_pandas/05_analysing 66 | - file: content/02_stat_prog/01_pandas/06_cs_combining 67 | - file: content/02_stat_prog/02_visual_front_page 68 | sections: 69 | - file: content/02_stat_prog/02_matplotlib/01_matplotlib 70 | - file: content/02_stat_prog/02_matplotlib/02_matplotlib2 71 | - file: content/02_stat_prog/02_matplotlib/03_cs_hm 72 | - file: content/02_stat_prog/02_matplotlib/02_plotting_time_series 73 | - file: content/02_stat_prog/03_exercises_front_page 74 | sections: 75 | - file: content/02_stat_prog/03_exercises/00_dataframes 76 | - file: content/02_stat_prog/03_exercises/01_data_wrangling_matplotlib 77 | - file: content/02_stat_prog/03_exercises/02_stroke_data_wrangling 78 | - file: content/02_stat_prog/03_exercises/03_visualise_ts 79 | - file: content/02_stat_prog/04_solutions_front_page 80 | sections: 81 | - file: content/02_stat_prog/04_solutions/00_dataframes 82 | - file: content/02_stat_prog/04_solutions/01_data_wrangling_matplotlib_solutions 83 | - file: content/02_stat_prog/04_solutions/02_stroke_data_wrangling_solutions 84 | - file: content/02_stat_prog/04_solutions/03_visualise_ts_SOLUTIONS 85 | - caption: Managing Data Science Projects 86 | chapters: 87 | - file: content/03_mgt/03_vc_front_page 88 | sections: 89 | - file: content/03_mgt/01_git/02_git 90 | - file: content/03_mgt/01_git/03_cs_1 91 | - file: content/03_mgt/01_git/04_cs_2 92 | - file: content/03_mgt/01_git/05_cs_3 93 | - file: content/03_mgt/03_mgt_front_page 94 | sections: 95 | - file: content/03_mgt/02_packaging/01_python_packages 96 | - file: content/03_mgt/03_pypi/01_local 97 | - file: content/03_mgt/03_pypi/02_github 98 | - file: content/03_mgt/03_pypi/03_pypi 99 | - file: content/03_mgt/03_pypi/04_automation 100 | - file: content/03_mgt/04_binder/01_binder 101 | - file: content/03_mgt/04_exercises_front_page 102 | sections: 103 | - file: content/03_mgt/04_exercises/01_python_packages 104 | - file: content/03_mgt/04_exercises/02_conda 105 | - file: 
content/03_mgt/04_exercises/03_binder 106 | - file: content/03_mgt/05_solutions_front_page 107 | sections: 108 | - file: content/03_mgt/05_solutions/01_python_packages_solutions 109 | - caption: Appendix 110 | chapters: 111 | - file: content/appendix/acknowledge 112 | - caption: Appendix Basic Python 113 | chapters: 114 | - file: content/appendix/fp_lectures 115 | sections: 116 | - file: content/appendix/lectures/Lecture1 117 | - file: content/appendix/lectures/Lecture2 118 | - file: content/appendix/fp_practicals 119 | sections: 120 | - file: content/appendix/labs/01_basics 121 | - file: content/appendix/labs/debug1 122 | - file: content/appendix/labs/02_basics 123 | - file: content/appendix/labs/debug2 124 | -------------------------------------------------------------------------------- /binder/environment.yml: -------------------------------------------------------------------------------- 1 | name: hds_code 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - black 6 | - flake8 7 | - hatch 8 | - jupyterlab=4.2.4 9 | - jupyterlab-spellchecker=0.8.4 10 | - matplotlib=3.9.1 11 | - nbqa 12 | - nodejs=22.5.1 13 | - numpy=2.0.1 14 | - pandas=2.2.2 15 | - pip=24.0 16 | - python=3.11 17 | - pytest=8.3.2 18 | - py7zr=0.21.1 19 | - rich=13.7.1 20 | - scikit-learn=1.5.1 21 | - scipy==1.14.0 22 | - statsmodels=0.14.2 23 | -------------------------------------------------------------------------------- /content/001_setup/conda.md: -------------------------------------------------------------------------------- 1 | # Conda virtual environment 2 | 3 | The code examples in this module have been created in using the conda virtual environment `hds_code` 4 | 5 | To create the conda environment, enter the following commands into the terminal: 6 | 7 | ```console 8 | git clone https://github.com/health-data-science-OR/coding-for-ml 9 | cd coding-for-ml 10 | conda update conda 11 | conda env create -f binder/environment.yml 12 | conda activate hds_code 13 | ``` 14 | 15 | The 
dependencies `hds_code` will install via conda are: 16 | 17 | ```yaml 18 | name: hds_code 19 | channels: 20 | - conda-forge 21 | dependencies: 22 | - jupyterlab=3.4.6 23 | - matplotlib=3.5.3 24 | - nodejs=18.8.0 25 | - numpy=1.23.2 26 | - pandas=1.4.4 27 | - pip=22.2.2 28 | - python=3.8.12 29 | - scipy==1.9.1 30 | - statsmodels=0.13.2 31 | - pip: 32 | - rich==12.5.1 33 | - scikit-learn==1.1.2 34 | - py7zr==0.20.0 35 | ``` 36 | -------------------------------------------------------------------------------- /content/001_setup/contributing.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | I welcome contributions to the book. In each case credit will be given. Please do not feel you need to be an expert to contribute all views and feedback are greatly appreciated. You can contribute in the following ways: 4 | 5 | ## Typographical and grammatical errors 6 | 7 | If you spot a typo, spelling mistake, poor grammar (no matter how minor), or just feel a sentence/paragraph could be rewritten to improve clarity, I would greatly appreciate you reporting it. You can do that by raising a Github issue via the repository. The url is [https://github.com/health-data-science-OR/coding-for-ml/issues](https://github.com/health-data-science-OR/coding-for-ml/issues) 8 | 9 | When you submit an issue please provide: 10 | 11 | * link to page of book with the error 12 | * Original sentence(s) that contain the error. 13 | * Suggested fix 14 | * Label the problem as `documentation`. 15 | 16 | ## Reporting bugs 17 | 18 | If you find a bug in any of the code examples including solutions to exercises please report via [Github issues](https://github.com/health-data-science-OR/coding-for-ml/issues). Please provide the following: 19 | 20 | * The operating system and version you are using. 21 | * Details of your python dependencies or virtual environment if you are using one. 
(if using conda please provide a environment.yml file - see the conda exercises for help ) 22 | * Steps to reproduce the problem including the url. 23 | * Please label the problem as `bug` 24 | * **Optional**: recommended fix. 25 | 26 | > Note: Before reporting a bug please check that the Jupyter notebook cells have been been run in **order**. I recommend selecting 'Reset Kernel and Clear All Outputs' from the Kernel menu and rerunning the notebook to confirm. 27 | 28 | ## Submit general feedback or request new content 29 | 30 | The book will evolve over time. I'd greatly welcome feedback, via Github issues, on the book including requests for new new topics, chapters, or expanded sections on current content such as cases studies or package functionality. 31 | 32 | ### Requests 33 | 34 | For new content requests please 35 | 36 | * Detail the requested content including description and lists of any relevant packages 37 | * Explain why this content is relevant to health data scientists 38 | * Optional: provide an example 39 | * Label the issue as an `enhancement` 40 | 41 | ### General feedback 42 | 43 | For anything I've not covered here please submit a Github issue labelled as `feedback` 44 | 45 | ## Code of conduct 46 | 47 | If you wish to contribute in any of the above ways, including responding to feedback from others, then I ask you to follow the contribution code of conduct for this book. 
48 | 49 | * Demonstrating empathy and kindness toward other people 50 | * Being respectful of differing opinions, viewpoints, and experiences 51 | * Giving and gracefully accepting constructive feedback 52 | * Accepting responsibility and apologizing to those affected by our mistakes, and learning from the experience 53 | * Focusing on what is best not just for us as individuals, but for the overall community 54 | 55 | These guidelines are adapted from the [Contributor Covenant](https://www.contributor-covenant.org/) 56 | 57 | -------------------------------------------------------------------------------- /content/001_setup/docker.md: -------------------------------------------------------------------------------- 1 | # Docker 2 | 3 | **COMING SOON!** 4 | 5 | If desired an Ubuntu 20.04 image is available from DockerHub. The image contains the conda environment `hds_code` and ships with the GitHub Repo. 6 | 7 | 8 | -------------------------------------------------------------------------------- /content/001_setup/install.md: -------------------------------------------------------------------------------- 1 | # Install python and an IDE 2 | 3 | The code in this book is written in using python 3.8.8 and a list of dependencies. 4 | 5 | ## Local installation 6 | 7 | For beginners it is is recommended that users first install 'Anaconda'. This bundles python along with data science centric IDEs called `Spyder` and `Jupyter Notebook` (I recommend the more modern Jupyter-Lab over basic notebook, but there is no requirement to use it.) 8 | 9 | https://www.anaconda.com/download/ 10 | 11 | ```{admonition} See also 12 | :class: tip 13 | Anaconda includes 'conda' (a package manager). 
An optional step is to follow our notes to use [conda](conda.md) to create a virtual environment that includes python 3.8.12 and Jupyter-Lab 3.x 14 | ``` 15 | 16 | ```{admonition} My personal preferences 17 | :class: tip 18 | Alternatively (and my preference) you can install substantially smaller [Miniconda](https://docs.conda.io/en/latest/miniconda.html) and install the packages you need using the provided conda environment or by selecting them yourself. I tend to use packages installed from [conda-forge](https://conda-forge.org/), but the packages in the Anaconda channel (defaults) are equally good. 19 | ``` 20 | 21 | 22 | ## Run our code via Binderhub or Google Colab 23 | 24 | ```{note} 25 | You have the option of running our code in the cloud without the need to install on your local machine. 26 | ``` 27 | 28 | When you navigate to books pages and exercises that contain code cells you will see a image of a 'rocketship' in the top right hand corner of the screen. Move you mouse over the image and you can choose to either open the notebook in BinderHub (will take ~1 minute to open) or [Google Colab](https://colab.research.google.com). You will need a Google account to use Google Colab. -------------------------------------------------------------------------------- /content/001_setup/prereq.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "fb54ccc4-e555-414a-95b1-2f65f30718eb", 6 | "metadata": {}, 7 | "source": [ 8 | "# Python Prerequisites\n", 9 | "\n", 10 | "This book isn't intended to teach basic python or use of python Integrated Development Environments such as Pycharm, or Visual Studio Code. There are many excellent books you can purchase (or borrow from a good library), and free courses on YouTube that you can make use of to get you started. 
If you are an absolute beginner you can also investigate some [online preparatory material](https://health-data-science-or.github.io/basic-python/content/front_page.html) I provide for the MSc in Health Data Science at Exeter, but this it no intended to be exhaustive, and you would be daft to not investigate the plethora of material available for free online.\n", 11 | "\n", 12 | "Nevertheless there are a few areas of basic python knowledge that are essential to get the most of out of the book. As such I have included a brief \n", 13 | "summary of the following topics that you may wish to revise before tackling the main material.\n", 14 | "\n", 15 | "\n", 16 | "* [List comprehensions](./prereq/01_list_comp)\n", 17 | "* [Dictionaries](./prereq/02_dicts)\n", 18 | "* [Variable scope](./prereq/04_scope)\n", 19 | "* [Coding standards](./prereq/05_pep8)" 20 | ] 21 | } 22 | ], 23 | "metadata": { 24 | "kernelspec": { 25 | "display_name": "Python 3 (ipykernel)", 26 | "language": "python", 27 | "name": "python3" 28 | }, 29 | "language_info": { 30 | "codemirror_mode": { 31 | "name": "ipython", 32 | "version": 3 33 | }, 34 | "file_extension": ".py", 35 | "mimetype": "text/x-python", 36 | "name": "python", 37 | "nbconvert_exporter": "python", 38 | "pygments_lexer": "ipython3", 39 | "version": "3.8.12" 40 | } 41 | }, 42 | "nbformat": 4, 43 | "nbformat_minor": 5 44 | } 45 | -------------------------------------------------------------------------------- /content/01_algorithms/01_design.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "54b219c6", 6 | "metadata": {}, 7 | "source": [ 8 | "# The importance of design\n", 9 | "\n", 10 | "As your experience in data science grows I have no doubt that one lesson you will repeatedly learn is the importance of a good algorithm. 
A good algorithm can turn a seven day model run time into 4 hours (and a 14 day run time due to finding a mistake in the first run into an 8 hour run). A related lesson is that how you design your code to implement an algorithm also matters. Indeed the decisions you make in implementation can also have an order of magnitude impact on run time. In this section we will explore a simple problem - computing (small) prime numbers and how the choice of algorithm and its implementation can affect run time." 11 | ] 12 | } 13 | ], 14 | "metadata": { 15 | "kernelspec": { 16 | "display_name": "Python 3 (ipykernel)", 17 | "language": "python", 18 | "name": "python3" 19 | }, 20 | "language_info": { 21 | "codemirror_mode": { 22 | "name": "ipython", 23 | "version": 3 24 | }, 25 | "file_extension": ".py", 26 | "mimetype": "text/x-python", 27 | "name": "python", 28 | "nbconvert_exporter": "python", 29 | "pygments_lexer": "ipython3", 30 | "version": "3.8.8" 31 | } 32 | }, 33 | "nbformat": 4, 34 | "nbformat_minor": 5 35 | } 36 | -------------------------------------------------------------------------------- /content/01_algorithms/01_design/04_summing_up.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "b7d58aef", 6 | "metadata": {}, 7 | "source": [ 8 | "# Summing up\n", 9 | "\n", 10 | "## What else might you do?\n", 11 | "\n", 12 | "We chose a particular algorithm to improve on trial and error in the computation of primes. So the big questions are:\n", 13 | "\n", 14 | "* are there other algorithms and options that might reduce runtime? \n", 15 | "* is this good performance good enough or a problem for our study?\n", 16 | "* are we missing any obvious big ticket design changes?\n", 17 | "\n", 18 | "There are in fact other algorithms which are more efficient at computing larger primes than our sieve, but our sieve is reasonably good for the small primes we have computed. 
Could we make further improvements in its basic design? Yes we can. For instance, we know that all even numbers above two cannot be primes. We therefore don't even need to consider these in our algorithm. This not only reduces computation, but we can also halve the size of our data structure (you might try this as an exercise).\n", 19 | "\n", 20 | "Another option that we will explore in a later section is `numpy`. This provides some very fast optimised data structures and procedures that we will compare to standard python." 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "id": "79ab5782", 26 | "metadata": {}, 27 | "source": [ 28 | "## Conclusions\n", 29 | "\n", 30 | "We've seen that a good algorithm makes a huge difference to what is feasible to compute. A trial and error approach to computing primes prevented us from even finding relatively small prime numbers in a reasonable time frame. The **Sieve of Eratosthenes** made the unfeasible suddenly feasible. We've also seen that the design of code can also affect execution time by an order of magnitude. But not all optimisations will have a huge impact on performance and some optimisation may scale with our problem size. 
" 31 | ] 32 | } 33 | ], 34 | "metadata": { 35 | "kernelspec": { 36 | "display_name": "Python 3 (ipykernel)", 37 | "language": "python", 38 | "name": "python3" 39 | }, 40 | "language_info": { 41 | "codemirror_mode": { 42 | "name": "ipython", 43 | "version": 3 44 | }, 45 | "file_extension": ".py", 46 | "mimetype": "text/x-python", 47 | "name": "python", 48 | "nbconvert_exporter": "python", 49 | "pygments_lexer": "ipython3", 50 | "version": "3.8.8" 51 | } 52 | }, 53 | "nbformat": 4, 54 | "nbformat_minor": 5 55 | } 56 | -------------------------------------------------------------------------------- /content/01_algorithms/02_oop.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "communist-courage", 6 | "metadata": {}, 7 | "source": [ 8 | "# Designing using objects" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "regulated-coral", 14 | "metadata": {}, 15 | "source": [ 16 | "> For a more detailed treatment of classes see Part IV of *Lutz. (2013). Learning Python. 5th Ed. O'Reilly.*\n", 17 | "\n", 18 | "To get the most out of python when coding algorithms and models it is essential that you understand the basics of python classes and object orientated programming (OOP). **The key takeaways from this section are that class aid code reuse and design (although when used unwisely they can overcomplicate designs!)**. We will try and do this in a fun way so you can see the benefits.\n", 19 | "\n", 20 | "**In this section you will learn:**\n", 21 | "\n", 22 | "* How to instantiate multiple instances of a class.\n", 23 | "* How to declare a class and define a class constructor method.\n", 24 | "* What is meant by class attributes and methods.\n", 25 | "\n", 26 | "> It is worth noting that in python you don't have to use classes you can actually achieve everything with functions. 
However, the abstraction benefits of OO are really important for the design and organisation of complex projects." 27 | ] 28 | } 29 | ], 30 | "metadata": { 31 | "kernelspec": { 32 | "display_name": "Python 3 (ipykernel)", 33 | "language": "python", 34 | "name": "python3" 35 | }, 36 | "language_info": { 37 | "codemirror_mode": { 38 | "name": "ipython", 39 | "version": 3 40 | }, 41 | "file_extension": ".py", 42 | "mimetype": "text/x-python", 43 | "name": "python", 44 | "nbconvert_exporter": "python", 45 | "pygments_lexer": "ipython3", 46 | "version": "3.8.8" 47 | } 48 | }, 49 | "nbformat": 4, 50 | "nbformat_minor": 5 51 | } 52 | -------------------------------------------------------------------------------- /content/01_algorithms/02_oop/text_adventure/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/content/01_algorithms/02_oop/text_adventure/__init__.py -------------------------------------------------------------------------------- /content/01_algorithms/03_numpy.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "db806001", 6 | "metadata": {}, 7 | "source": [ 8 | "# Scientific coding in `numpy`" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "1e0fd607", 14 | "metadata": {}, 15 | "source": [ 16 | "So far we have only covered standard python for health data science. Its very important to develop your skills in standard python as you will undoubtedly use them the most. Beyond the standard library you will learn why python is an exciting and popular language in data science: there is a whole ecosystem of scientific libraries built around algorithms, modelling data manipulating, processing and visualision. The fundamental and in my view most important of these is [numpy](https://numpy.org/). 
The most important contribution of the `numpy` library is the concept of an efficient n-dimensional **array**. \n", 17 | "\n", 18 | "**I know what you are thinking**: 'why does python need an array when we already have lists and other similar data structures?' The answer is simply speed of computation: `numpy` arrays are lightning fast relative to standard python. As you will learn, underneath the hood a `list` is very different from a `numpy` array. \n", 19 | "\n", 20 | "---\n", 21 | "\n", 22 | "```{admonition} \"This is too much\"\n", 23 | "It's worth saying that in my view this (enormous) efficiency benefit does come with a trade-off. `numpy` does have a higher learning curve than standard python for new coders. When I first taught numpy it was part of a course in analytics introducing mathematics and business students to data science in python. After our first `numpy` computer lab my favourite quote from a group of distressed students was **\"this is too much... TOO MUCH\"** (it was actually shouted at me). `numpy` is less pythonic and it can at times be difficult to get the elegance of design you want matched with `numpy` code. However, for many mathematical operations `numpy` code can be more readable due to a concept called 'broadcasting' that we shall cover in a later section. Do persevere if you find it difficult at first. To be clear if you are doing substantive computational work in python you should be using `numpy` and you need to know how to use it to be employable in health data science.\n", 24 | "\n", 25 | "The material in the following sections has been designed to try and avoid my \"too much, too much\" problem. I hope you enjoy it. 
If you have any suggestions on what you need explaining do let me know.\n", 26 | "```\n", 27 | "\n", 28 | "\n", 29 | "\n" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "id": "dd1e9495", 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [] 39 | } 40 | ], 41 | "metadata": { 42 | "kernelspec": { 43 | "display_name": "Python 3 (ipykernel)", 44 | "language": "python", 45 | "name": "python3" 46 | }, 47 | "language_info": { 48 | "codemirror_mode": { 49 | "name": "ipython", 50 | "version": 3 51 | }, 52 | "file_extension": ".py", 53 | "mimetype": "text/x-python", 54 | "name": "python", 55 | "nbconvert_exporter": "python", 56 | "pygments_lexer": "ipython3", 57 | "version": "3.8.12" 58 | } 59 | }, 60 | "nbformat": 4, 61 | "nbformat_minor": 5 62 | } 63 | -------------------------------------------------------------------------------- /content/01_algorithms/03_numpy/07_advanced_iter.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "id": "e3650b93-d8bc-40fc-9740-c78e07617b0a", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import numpy as np" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "id": "a3fae3c4-a320-4328-99cc-69fffb471c12", 16 | "metadata": {}, 17 | "source": [ 18 | "# Advanced Array Iteration\n", 19 | "\n", 20 | "For most of the `numpy` code that I write in data science applications I make use of slicing, indexing and standard operations. However, occationally there is a need to use a `numpy` iteration function called `nditer`. This might be useful in an instance where I need to iterate over each element of an 2, 3 or 4D array without including multiple for loops. There is extensive documentation for this on the `numpy` docs. 
Here we will consider some basic functionality that I have found useful in applied work.\n", 21 | "\n", 22 | "## A matrix example\n", 23 | "\n", 24 | "We will consider how to iterate over each element in a 2 dimensional array. You obviously easily do this in standard python. Here's a simple example:" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 166, 30 | "id": "807cb26b-1d55-45a0-ab39-9d4c8045a0b5", 31 | "metadata": {}, 32 | "outputs": [ 33 | { 34 | "name": "stdout", 35 | "output_type": "stream", 36 | "text": [ 37 | "[[0 1 2]\n", 38 | " [3 4 5]]\n" 39 | ] 40 | } 41 | ], 42 | "source": [ 43 | "a = np.arange(6).reshape(2, 3)\n", 44 | "print(a)" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "id": "9761376f-4d23-4115-9cd9-015539e211f8", 50 | "metadata": {}, 51 | "source": [ 52 | "A standard python implementation to iterate over all combinations is as follows. Note the requirement of an inner loop." 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 167, 58 | "id": "6d52334c-ff03-4956-993d-57865b8f3a94", 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "def standard_all_element_iteration(a, print_out=True):\n", 63 | " for row in a:\n", 64 | " for col in row:\n", 65 | " if print_out: print(col, end= ' ')" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 168, 71 | "id": "9218afa6-9bdb-4525-97e9-85b6fa54c789", 72 | "metadata": {}, 73 | "outputs": [ 74 | { 75 | "name": "stdout", 76 | "output_type": "stream", 77 | "text": [ 78 | "0 1 2 3 4 5 " 79 | ] 80 | } 81 | ], 82 | "source": [ 83 | "standard_all_element_iteration(a)" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "id": "a3f023c9-7b22-41c2-93cd-201df8937cdb", 89 | "metadata": {}, 90 | "source": [ 91 | "When we need to iterate over all elements of an array then we can use nditer to eliminate the inner loop." 
92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 169, 97 | "id": "6be1a102-5e0f-4842-b121-847bb8246e9c", 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "def nditer_all_element_iteration(a, print_out=True):\n", 102 | " for element in np.nditer(a):\n", 103 | " if print_out: print(element, end=' ')" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 170, 109 | "id": "46f5566c-ce9d-4708-8ce8-4e0516eb5a62", 110 | "metadata": {}, 111 | "outputs": [ 112 | { 113 | "name": "stdout", 114 | "output_type": "stream", 115 | "text": [ 116 | "0 1 2 3 4 5 " 117 | ] 118 | } 119 | ], 120 | "source": [ 121 | "nditer_all_element_iteration(a)" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "id": "f8fb9b51-5b7f-40e1-bb3e-08b3693523c6", 127 | "metadata": {}, 128 | "source": [ 129 | "The result is that we have considerably faster iteration because the inner loop executes in C." 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 171, 135 | "id": "ebdf3b65-91c1-4323-b676-89db3407d044", 136 | "metadata": {}, 137 | "outputs": [ 138 | { 139 | "name": "stdout", 140 | "output_type": "stream", 141 | "text": [ 142 | "1.29 µs ± 5.3 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)\n", 143 | "640 ns ± 5.14 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)\n" 144 | ] 145 | } 146 | ], 147 | "source": [ 148 | "%timeit standard_all_element_iteration(a, print_out=False)\n", 149 | "%timeit nditer_all_element_iteration(a, print_out=False)" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "id": "9b885b6d-7917-413d-8ef5-56a64606f8ca", 155 | "metadata": {}, 156 | "source": [ 157 | "Note that the iteration took place in across the rows our the array `a`. 
To iterate across the all elements column-wise you can use 'Fortran' ordering by passing the parameter `order='F'` to `np.nditer`" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 174, 163 | "id": "d6086e14-5018-44cc-9adb-b045b3fd0a9f", 164 | "metadata": {}, 165 | "outputs": [ 166 | { 167 | "name": "stdout", 168 | "output_type": "stream", 169 | "text": [ 170 | "[[0 1 2]\n", 171 | " [3 4 5]]\n" 172 | ] 173 | } 174 | ], 175 | "source": [ 176 | "print(a)" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": 173, 182 | "id": "45bfde47-bd52-44db-a0ef-8d91de997b9e", 183 | "metadata": {}, 184 | "outputs": [ 185 | { 186 | "name": "stdout", 187 | "output_type": "stream", 188 | "text": [ 189 | "0 3 1 4 2 5 " 190 | ] 191 | } 192 | ], 193 | "source": [ 194 | "for element in np.nditer(a, order='F'):\n", 195 | " print(element, end=' ')" 196 | ] 197 | } 198 | ], 199 | "metadata": { 200 | "kernelspec": { 201 | "display_name": "Python 3", 202 | "language": "python", 203 | "name": "python3" 204 | }, 205 | "language_info": { 206 | "codemirror_mode": { 207 | "name": "ipython", 208 | "version": 3 209 | }, 210 | "file_extension": ".py", 211 | "mimetype": "text/x-python", 212 | "name": "python", 213 | "nbconvert_exporter": "python", 214 | "pygments_lexer": "ipython3", 215 | "version": "3.7.3" 216 | } 217 | }, 218 | "nbformat": 4, 219 | "nbformat_minor": 5 220 | } 221 | -------------------------------------------------------------------------------- /content/01_algorithms/03_numpy/09_cs2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "d01ee12f", 7 | "metadata": { 8 | "tags": [ 9 | "hide-input" 10 | ] 11 | }, 12 | "outputs": [], 13 | "source": [ 14 | "import numpy as np\n", 15 | "import math" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "id": "991310e4", 21 | "metadata": {}, 22 | "source": [ 23 | "# 
Case study 2: prime sieve\n", 24 | "\n", 25 | "This chapter opened by exploring the importance of good algorithm and code design. We spent a fair bit of time redesigning and micro-optimising a function in standard python that implemented a prime sieve. For large n, for example greater than 10 million, the function `prime_sieve_best` was our fastest option." 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 2, 31 | "id": "80ad6468", 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "def prime_sieve_best(n):\n", 36 | " '''\n", 37 | " Our fastest prime sieve in standard python\n", 38 | " Fastest for large n e.g. > 10m.\n", 39 | " '''\n", 40 | " candidates = bytearray(b\"\\x01\") * (n + 1)\n", 41 | " candidates[0] = 0\n", 42 | " candidates[1] = 0\n", 43 | " limit = int(math.sqrt(n)) + 1 \n", 44 | " \n", 45 | " for i in range(2, limit): \n", 46 | " if candidates[i]:\n", 47 | " candidates[i+i::i] = [0] * ((n - i) // i)\n", 48 | " \n", 49 | " return [i for i in range(n+1) if candidates[i]] " 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "id": "517eef0e", 55 | "metadata": {}, 56 | "source": [ 57 | "The function `prime_sieve_np` again reimplements the algorithm, but this time using `numpy` optimised arrays and functions." 
58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 3, 63 | "id": "d266ffac", 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "def prime_sieve_np(n):\n", 68 | " '''\n", 69 | " Prime sieve reimplemented in NumPy.\n", 70 | " '''\n", 71 | " candidates = np.ones(n, dtype=bool)\n", 72 | " limit = int(np.sqrt(n)) + 1\n", 73 | " candidates[0:2] = False\n", 74 | " \n", 75 | " for i in range(2, limit):\n", 76 | " if candidates[i]:\n", 77 | " candidates[i+i::i] = False\n", 78 | " return np.flatnonzero(candidates)" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "id": "1681fee5", 84 | "metadata": {}, 85 | "source": [ 86 | "You should see a reasonable speed up, for free, using `numpy`. Let's compare it for an even larger n." 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 4, 92 | "id": "3015a9cd", 93 | "metadata": {}, 94 | "outputs": [ 95 | { 96 | "name": "stdout", 97 | "output_type": "stream", 98 | "text": [ 99 | "11.4 s ± 601 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" 100 | ] 101 | } 102 | ], 103 | "source": [ 104 | "HUNDRED_MILLION = 100_000_000\n", 105 | "%timeit len(prime_sieve_best(HUNDRED_MILLION))" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 5, 111 | "id": "a87aea88", 112 | "metadata": {}, 113 | "outputs": [ 114 | { 115 | "name": "stdout", 116 | "output_type": "stream", 117 | "text": [ 118 | "1.21 s ± 42.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" 119 | ] 120 | } 121 | ], 122 | "source": [ 123 | "%timeit len(prime_sieve_np(HUNDRED_MILLION))" 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "id": "53518015", 129 | "metadata": {}, 130 | "source": [ 131 | "That's should provide around a factor of 10 speed up. On my machine runtime dropped from around 1 seconds on average to 1.1 seconds on average.\n", 132 | "\n", 133 | "This is also a nice example where, in my opinion, the numpy code is more readable than the standard python. 
This is partly because `numpy` broadcasting means we can the elements in a slice cleanly. i.e.\n", 134 | "\n", 135 | "```python\n", 136 | "candidates[i+i::i] = False\n", 137 | "```\n", 138 | "verus standard python\n", 139 | "\n", 140 | "```python\n", 141 | "candidates[i+i::i] = [0] * ((n - i) // i)\n", 142 | "```" 143 | ] 144 | } 145 | ], 146 | "metadata": { 147 | "kernelspec": { 148 | "display_name": "Python 3", 149 | "language": "python", 150 | "name": "python3" 151 | }, 152 | "language_info": { 153 | "codemirror_mode": { 154 | "name": "ipython", 155 | "version": 3 156 | }, 157 | "file_extension": ".py", 158 | "mimetype": "text/x-python", 159 | "name": "python", 160 | "nbconvert_exporter": "python", 161 | "pygments_lexer": "ipython3", 162 | "version": "3.7.3" 163 | } 164 | }, 165 | "nbformat": 4, 166 | "nbformat_minor": 5 167 | } 168 | -------------------------------------------------------------------------------- /content/01_algorithms/03_numpy/data/lysis.csv: -------------------------------------------------------------------------------- 1 | per_treated 2 | 0.018867925 3 | 0.03030303 4 | 0.018867925 5 | 0.018867925 6 | 0.060606061 7 | 0.018867925 8 | 0.045454545 9 | 0.045454545 10 | 0.018867925 11 | 0.075757576 12 | 0.045454545 13 | 0.015151515 14 | 0.03030303 15 | 0.018867925 16 | 0.015151515 17 | 0.060606061 18 | 0.060606061 19 | 0.03030303 20 | 0.015151515 21 | 0.045454545 22 | 0.015151515 23 | 0.03030303 24 | 0.03030303 25 | 0.075757576 26 | 0.060606061 27 | 0.03030303 28 | 0.060606061 29 | 0.03030303 30 | 0.060606061 31 | 0.03030303 32 | 0.060606061 33 | 0.060606061 34 | 0.075757576 35 | 0.045454545 36 | 0.075757576 37 | 0.03030303 38 | 0.045454545 39 | 0.015151515 40 | 0.121212121 41 | 0.03030303 42 | 0.045454545 43 | 0.045454545 44 | 0.075757576 45 | 0.166666667 46 | 0.03030303 47 | 0.090909091 48 | 0.090909091 49 | 0.090909091 50 | 0.25 51 | 0.147058824 52 | 0.161764706 53 | 0.117647059 54 | 0.191176471 55 | 0.132352941 56 | 
-------------------------------------------------------------------------------- /content/01_algorithms/03_numpy/data/minor_illness_ed_attends.csv: -------------------------------------------------------------------------------- 1 | attends_rate_per_10k_pop 2 | 2.11927795 3 | 3.490575446 4 | 3.989229081 5 | 2.368604767 6 | 3.241248629 7 | 2.867258402 8 | 3.11658522 9 | 2.742594994 10 | 3.615238855 11 | 3.615238855 12 | 4.363219308 13 | 3.11658522 14 | 3.739902264 15 | 2.243941358 16 | 3.241248629 17 | 1.620624314 18 | 2.243941358 19 | 2.991921811 20 | 2.368604767 21 | 2.368604767 22 | 2.368604767 23 | 3.11658522 24 | 2.493268176 25 | 2.368604767 26 | 3.11658522 27 | 2.742594994 28 | 3.739902264 29 | 1.994614541 30 | 2.867258402 31 | 1.620624314 32 | 2.991921811 33 | 3.365912037 34 | 1.620624314 35 | 3.739902264 36 | 2.742594994 37 | 3.11658522 38 | 2.493268176 39 | 2.368604767 40 | 3.739902264 41 | 3.864565673 42 | 1.745287723 43 | 4.238555899 44 | 2.368604767 45 | 3.615238855 46 | 1.994614541 47 | 2.11927795 48 | 2.991921811 49 | 2.617931585 50 | 2.243941358 51 | 2.368604767 52 | 3.490575446 53 | 2.368604767 54 | 2.867258402 55 | 2.991921811 56 | 2.867258402 57 | 2.867258402 58 | 2.991921811 59 | 3.241248629 60 | 2.617931585 61 | 3.11658522 62 | 3.11658522 63 | 2.11927795 64 | 3.864565673 65 | 2.867258402 66 | 3.989229081 67 | 5.111199761 68 | 2.867258402 69 | 2.867258402 70 | 2.493268176 71 | 3.739902264 72 | 3.739902264 73 | 1.869951132 74 | 2.11927795 75 | 3.615238855 76 | -------------------------------------------------------------------------------- /content/01_algorithms/04_exercises.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "strange-apollo", 6 | "metadata": {}, 7 | "source": [ 8 | "# Exercises" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "id": "north-destination", 15 | "metadata": {}, 16 | "outputs": [], 17 | 
"source": [] 18 | } 19 | ], 20 | "metadata": { 21 | "kernelspec": { 22 | "display_name": "Python 3 (ipykernel)", 23 | "language": "python", 24 | "name": "python3" 25 | }, 26 | "language_info": { 27 | "codemirror_mode": { 28 | "name": "ipython", 29 | "version": 3 30 | }, 31 | "file_extension": ".py", 32 | "mimetype": "text/x-python", 33 | "name": "python", 34 | "nbconvert_exporter": "python", 35 | "pygments_lexer": "ipython3", 36 | "version": "3.8.8" 37 | } 38 | }, 39 | "nbformat": 4, 40 | "nbformat_minor": 5 41 | } 42 | -------------------------------------------------------------------------------- /content/01_algorithms/04_exercises/02_basic_oop.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "concerned-excess", 6 | "metadata": {}, 7 | "source": [ 8 | "# Basic Object Orientated Methods\n", 9 | "\n", 10 | "Follow these simple OOP exercises to practice and gain confidence in coding classes. " 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "id": "listed-equilibrium", 16 | "metadata": {}, 17 | "source": [ 18 | "## Exercise 1\n", 19 | "\n", 20 | "**Task:**\n", 21 | "* Create a class called `Patient`. \n", 22 | "* The class should contain a constructor that accepts the following parameters. The parameters should be stored in appropriately named attributes.\n", 23 | " * patient_id: int\n", 24 | " * age: int" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "id": "behind-choice", 30 | "metadata": {}, 31 | "source": [ 32 | "**Hints**\n", 33 | "* Don't forget to include the `self` parameter!\n", 34 | "* Make sure you use correct case for the class name. `Patient` follows PEP8 guidelines while `patient` does not!" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "id": "valued-detroit", 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "# your code here ..." 
45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "id": "upset-stanford", 50 | "metadata": {}, 51 | "source": [ 52 | "## Exercise 2:\n", 53 | "\n", 54 | "**Task:**\n", 55 | "* Create a class called `Ward`\n", 56 | "* Code a constructor method. \n", 57 | " * It should accept `ward_id` (int) as parameter and assign it to an attribute\n", 58 | " * It should create a new empty list attribute that will hold patients staying on the ward. \n", 59 | "* Create a method called `add_patient`. It should accept a parameter called `patient` (that is a patient class).\n", 60 | "* Create a method or property called `n_patients`. It should return the number of patients on the ward.\n", 61 | "\n", 62 | "**Hints:**\n", 63 | "* Don't forget the `self` parameter in the method!\n" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 21, 69 | "id": "offensive-devices", 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "# your code here ..." 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "id": "widespread-dubai", 79 | "metadata": {}, 80 | "source": [ 81 | "## Exercise 3:\n", 82 | "\n", 83 | "You will now test the `Ward` class by generating a number of patients and adding them to a ward object.\n", 84 | "\n", 85 | "**Task:**\n", 86 | "* Code a function that first creates a `Ward` object and then adds a user specified number of `Patient` instances via the `add_patient` function.\n", 87 | "* The function must return the ward object.\n", 88 | "* Test your function with 5 patients.\n", 89 | "\n", 90 | "**Hints**:\n", 91 | "* You will need to design the function so that it allocates a patient an age. One option is to randomly generate an age in a given range. You could achieve this using the `random.randint()` function. 
E.g.\n", 92 | "\n", 93 | "```python\n", 94 | "from random import randint\n", 95 | "lower, upper = 60, 95\n", 96 | "age = randint(lower, upper)\n", 97 | "\n", 98 | "```" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 9, 104 | "id": "threatened-great", 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "# your code here ..." 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "id": "naughty-rugby", 114 | "metadata": {}, 115 | "source": [ 116 | "## Exercise 4:\n", 117 | "\n", 118 | "**Task:**\n", 119 | "* Now create a `Hospital` class\n", 120 | "* The class should allow the creation of new wards as well as adding a patient to a user specified ward.\n", 121 | "* The class must provide a `n_patients` method or property that returns the uptodate total of patients in the hospital.\n", 122 | "* Create some test data and create a `Hospital` object. Return the total number of patients in the hospital." 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "id": "bored-injection", 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "# your code here ..." 133 | ] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "id": "neutral-friendship", 138 | "metadata": {}, 139 | "source": [ 140 | "## Exercise 5\n", 141 | "\n", 142 | "**Task:**\n", 143 | "\n", 144 | "Let's create a new type of patient specific to those with respiratory conditions. \n", 145 | "\n", 146 | "The new class will also accept `patient_id` and `age`. You will need to create two new parameters as well: `pack_yrs` and `fev1`. Fyi:\n", 147 | "\n", 148 | "* A pack year is defined as twenty cigarettes smoked everyday for one year \n", 149 | "* FEV1 stands for Forced Expiratory Volumne and is a percentage measured out of 100%. 
Lower values are worse.\n", 150 | "\n", 151 | "\n", 152 | "Call the class `RespiratoryPatient`\n", 153 | "\n", 154 | "**Hints**:\n", 155 | "* You can solve this exercise by either using inheritance or composition. Compositin is a bit harder (and more code), but its more flexible and safer in practice." 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 38, 161 | "id": "returning-status", 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [ 165 | "# your code here ..." 166 | ] 167 | } 168 | ], 169 | "metadata": { 170 | "kernelspec": { 171 | "display_name": "Python 3 (ipykernel)", 172 | "language": "python", 173 | "name": "python3" 174 | }, 175 | "language_info": { 176 | "codemirror_mode": { 177 | "name": "ipython", 178 | "version": 3 179 | }, 180 | "file_extension": ".py", 181 | "mimetype": "text/x-python", 182 | "name": "python", 183 | "nbconvert_exporter": "python", 184 | "pygments_lexer": "ipython3", 185 | "version": "3.8.8" 186 | } 187 | }, 188 | "nbformat": 4, 189 | "nbformat_minor": 5 190 | } 191 | -------------------------------------------------------------------------------- /content/01_algorithms/04_exercises/breach.csv: -------------------------------------------------------------------------------- 1 | breaches 2 | 33184 3 | 41151 4 | 47414 5 | 46436.42857 6 | 89917.28571 7 | 72889.28571 8 | 46942.85714 9 | 59172.71429 10 | 57379.85714 11 | 54805.42857 12 | 51701.71429 13 | 44885.42857 14 | 45076.57143 15 | 55392.42857 16 | 62452.71429 17 | 59131.85714 18 | 72832 19 | 73746.57143 20 | 85427.14286 21 | 66947 22 | 64561 23 | 61520.71429 24 | 61439.28571 25 | 56433.57143 26 | 54607 27 | 59110.14286 28 | 68868.42857 29 | 70014.28571 30 | 97186.57143 31 | 94561.42857 32 | 94112.28571 33 | 120995 34 | 120121.7143 35 | 64823.28571 36 | 56340 37 | 65588.71429 38 | 68396.14286 39 | 74543.71429 40 | 79250 41 | 70781.14286 42 | 84049.28571 43 | 83995.71429 44 | 89101.28571 45 | 85828.71429 46 | 88957.14286 47 | 102189.7143 48 | 
92863.71429 49 | 96272 50 | 88879.71429 51 | 99849.71429 52 | 118978.1429 53 | 119731.1429 54 | 192155 55 | 153156.4286 56 | 132867.8571 57 | 141194 58 | 125799.2857 59 | 110284.4286 60 | 99324 61 | 97475 62 | 106475 63 | 123111 64 | 147554 65 | 163472 66 | 168604 67 | 216286 68 | 229081 69 | 265834 70 | 186122 71 | 201329 72 | 184912 73 | 201973 74 | 174419 75 | 182597 76 | 219137 77 | 221713 78 | 268818 79 | 281612 80 | 216416 81 | 201392 82 | -------------------------------------------------------------------------------- /content/01_algorithms/04_exercises/data/breach.csv: -------------------------------------------------------------------------------- 1 | breaches 2 | 33184 3 | 41151 4 | 47414 5 | 46436.42857 6 | 89917.28571 7 | 72889.28571 8 | 46942.85714 9 | 59172.71429 10 | 57379.85714 11 | 54805.42857 12 | 51701.71429 13 | 44885.42857 14 | 45076.57143 15 | 55392.42857 16 | 62452.71429 17 | 59131.85714 18 | 72832 19 | 73746.57143 20 | 85427.14286 21 | 66947 22 | 64561 23 | 61520.71429 24 | 61439.28571 25 | 56433.57143 26 | 54607 27 | 59110.14286 28 | 68868.42857 29 | 70014.28571 30 | 97186.57143 31 | 94561.42857 32 | 94112.28571 33 | 120995 34 | 120121.7143 35 | 64823.28571 36 | 56340 37 | 65588.71429 38 | 68396.14286 39 | 74543.71429 40 | 79250 41 | 70781.14286 42 | 84049.28571 43 | 83995.71429 44 | 89101.28571 45 | 85828.71429 46 | 88957.14286 47 | 102189.7143 48 | 92863.71429 49 | 96272 50 | 88879.71429 51 | 99849.71429 52 | 118978.1429 53 | 119731.1429 54 | 192155 55 | 153156.4286 56 | 132867.8571 57 | 141194 58 | 125799.2857 59 | 110284.4286 60 | 99324 61 | 97475 62 | 106475 63 | 123111 64 | 147554 65 | 163472 66 | 168604 67 | 216286 68 | 229081 69 | 265834 70 | 186122 71 | 201329 72 | 184912 73 | 201973 74 | 174419 75 | 182597 76 | 219137 77 | 221713 78 | 268818 79 | 281612 80 | 216416 81 | 201392 82 | -------------------------------------------------------------------------------- /content/01_algorithms/04_exercises/data/dtocs.csv: 
-------------------------------------------------------------------------------- 1 | dtoc 2 | 4940 3 | 5004 4 | 4588 5 | 4409 6 | 3861 7 | 4597 8 | 4404 9 | 4170 10 | 3910 11 | 4056 12 | 4137 13 | 4228 14 | 4144 15 | 4165 16 | 4150 17 | 4165 18 | 3617 19 | 4094 20 | 4007 21 | 4028 22 | 3954 23 | 3857 24 | 4086 25 | 4031 26 | 3961 27 | 4102 28 | 4115 29 | 3894 30 | 3448 31 | 4188 32 | 4007 33 | 4053 34 | 4046 35 | 4184 36 | 3888 37 | 3961 38 | 4084 39 | 4231 40 | 4147 41 | 4200 42 | 3649 43 | 4221 44 | 4276 45 | 4327 46 | 4207 47 | 4516 48 | 4363 49 | 4612 50 | 4704 51 | 4960 52 | 4930 53 | 5063 54 | 4475 55 | 5221 56 | 4942 57 | 4948 58 | 4739 59 | 4972 60 | 4996 61 | 4888 62 | 5114 63 | 5247 64 | 5330 65 | 5573 66 | 5004 67 | 5777 68 | 5714 69 | 5601 70 | 5852 71 | 5996 72 | 6152 73 | 6361 74 | 6387 75 | 6759 76 | 6777 77 | 6771 78 | 6167 79 | 7118 80 | 6855 81 | 6648 82 | -------------------------------------------------------------------------------- /content/01_algorithms/04_exercises/data/lysis.csv: -------------------------------------------------------------------------------- 1 | per_treated 2 | 0.018867925 3 | 0.03030303 4 | 0.018867925 5 | 0.018867925 6 | 0.060606061 7 | 0.018867925 8 | 0.045454545 9 | 0.045454545 10 | 0.018867925 11 | 0.075757576 12 | 0.045454545 13 | 0.015151515 14 | 0.03030303 15 | 0.018867925 16 | 0.015151515 17 | 0.060606061 18 | 0.060606061 19 | 0.03030303 20 | 0.015151515 21 | 0.045454545 22 | 0.015151515 23 | 0.03030303 24 | 0.03030303 25 | 0.075757576 26 | 0.060606061 27 | 0.03030303 28 | 0.060606061 29 | 0.03030303 30 | 0.060606061 31 | 0.03030303 32 | 0.060606061 33 | 0.060606061 34 | 0.075757576 35 | 0.045454545 36 | 0.075757576 37 | 0.03030303 38 | 0.045454545 39 | 0.015151515 40 | 0.121212121 41 | 0.03030303 42 | 0.045454545 43 | 0.045454545 44 | 0.075757576 45 | 0.166666667 46 | 0.03030303 47 | 0.090909091 48 | 0.090909091 49 | 0.090909091 50 | 0.25 51 | 0.147058824 52 | 0.161764706 53 | 0.117647059 54 | 0.191176471 55 | 
0.132352941 56 | -------------------------------------------------------------------------------- /content/01_algorithms/04_exercises/data/moviedb.csv: -------------------------------------------------------------------------------- 1 | ID,Title,Budget,Box_office,Year,Meta_Critic 2 | 1,Amazing spiderman,230,757.9,2012,66 3 | 2,Ironman,140,585.2,2008,57 4 | 3,Thor,150,449.3,2011,54 5 | 4,Captain America: the first avenger,140,370.6,2011,66 6 | 5,Antman,130,519.3,2015,64 7 | 6,Guardians of the Galaxy,232.3,773,2014,76 8 | -------------------------------------------------------------------------------- /content/01_algorithms/04_exercises/data/pieces/p1.csv: -------------------------------------------------------------------------------- 1 | 0,1,0 2 | 1,1,1 3 | 0,1,0 4 | 0,1,0 5 | -------------------------------------------------------------------------------- /content/01_algorithms/04_exercises/data/pieces/p10.csv: -------------------------------------------------------------------------------- 1 | 1,1,1,1,1 2 | 1,0,1,0,1 3 | 1,0,1,0,1 -------------------------------------------------------------------------------- /content/01_algorithms/04_exercises/data/pieces/p2.csv: -------------------------------------------------------------------------------- 1 | 1,1 2 | 1,1 3 | 1,1 4 | -------------------------------------------------------------------------------- /content/01_algorithms/04_exercises/data/pieces/p3.csv: -------------------------------------------------------------------------------- 1 | 1,0,0 2 | 1,1,0 3 | 0,1,0 4 | -------------------------------------------------------------------------------- /content/01_algorithms/04_exercises/data/pieces/p4.csv: -------------------------------------------------------------------------------- 1 | 1 2 | 1 3 | 1 4 | 1 5 | 1 6 | -------------------------------------------------------------------------------- /content/01_algorithms/04_exercises/data/pieces/p5.csv: 
-------------------------------------------------------------------------------- 1 | 0,1,0 2 | 1,1,1 3 | -------------------------------------------------------------------------------- /content/01_algorithms/04_exercises/data/pieces/p6.csv: -------------------------------------------------------------------------------- 1 | 0,1,1 2 | 1,1,1 3 | 1,1,1 4 | 1,1,0 -------------------------------------------------------------------------------- /content/01_algorithms/04_exercises/data/pieces/p7.csv: -------------------------------------------------------------------------------- 1 | 0,1 2 | 0,1 3 | 1,1 -------------------------------------------------------------------------------- /content/01_algorithms/04_exercises/data/pieces/p8.csv: -------------------------------------------------------------------------------- 1 | 1,1,1,1 -------------------------------------------------------------------------------- /content/01_algorithms/04_exercises/data/pieces/p9.csv: -------------------------------------------------------------------------------- 1 | 1,1 2 | 0,1 -------------------------------------------------------------------------------- /content/01_algorithms/04_exercises/dtocs.csv: -------------------------------------------------------------------------------- 1 | dtoc 2 | 4940 3 | 5004 4 | 4588 5 | 4409 6 | 3861 7 | 4597 8 | 4404 9 | 4170 10 | 3910 11 | 4056 12 | 4137 13 | 4228 14 | 4144 15 | 4165 16 | 4150 17 | 4165 18 | 3617 19 | 4094 20 | 4007 21 | 4028 22 | 3954 23 | 3857 24 | 4086 25 | 4031 26 | 3961 27 | 4102 28 | 4115 29 | 3894 30 | 3448 31 | 4188 32 | 4007 33 | 4053 34 | 4046 35 | 4184 36 | 3888 37 | 3961 38 | 4084 39 | 4231 40 | 4147 41 | 4200 42 | 3649 43 | 4221 44 | 4276 45 | 4327 46 | 4207 47 | 4516 48 | 4363 49 | 4612 50 | 4704 51 | 4960 52 | 4930 53 | 5063 54 | 4475 55 | 5221 56 | 4942 57 | 4948 58 | 4739 59 | 4972 60 | 4996 61 | 4888 62 | 5114 63 | 5247 64 | 5330 65 | 5573 66 | 5004 67 | 5777 68 | 5714 69 | 5601 70 | 5852 71 | 5996 72 | 6152 73 | 6361 
74 | 6387 75 | 6759 76 | 6777 77 | 6771 78 | 6167 79 | 7118 80 | 6855 81 | 6648 82 | -------------------------------------------------------------------------------- /content/01_algorithms/04_exercises/ex_templates/ex1_quickstart.py: -------------------------------------------------------------------------------- 1 | #### Define the function heatmap_of_minima(trials, max_rand) here ### 2 | 3 | 4 | def main(): 5 | ''' This is the function that runs your code 6 | 7 | Feel free to try different numbers and see the results 8 | ''' 9 | 10 | # Parameters: 11 | trials = 10000 12 | max_rand = 99999 13 | 14 | # You need to provide this function! 15 | heatmap_of_minima(trials, max_rand) 16 | 17 | 18 | if __name__ == "__main__": 19 | main() 20 | -------------------------------------------------------------------------------- /content/01_algorithms/04_exercises/ex_templates/ex2_quickstart.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | MATH6005 - Lab 4, exercise 2 4 | Solution. 5 | 6 | @author: Carlos Lamas-Fernandez 7 | """ 8 | import os 9 | import numpy as np 10 | import matplotlib.pyplot as plt 11 | 12 | 13 | def main(): 14 | ''' Defines the main parameters and runs the algorithm 15 | ''' 16 | 17 | directory = '../data/pieces' 18 | piece_names = ['p1.csv', 'p2.csv', 'p3.csv', 'p4.csv', 'p5.csv', 19 | 'p6.csv', 'p7.csv', 'p8.csv', 'p9.csv', 'p10.csv'] 20 | 21 | container_width = 6 22 | container_height = 12 23 | 24 | piece_list = read_pieces(directory, piece_names) 25 | 26 | container, packed_pieces = top_left_corner(piece_list, 27 | container_width, 28 | container_height) 29 | # Show final packing 30 | plot_container(container) 31 | 32 | # See if it was possible to pack all pieces: 33 | if packed_pieces < len(piece_list): 34 | print('WARNING: only {0} pieces were packed, out of {1}'. 
35 | format(packed_pieces, len(piece_list))) 36 | 37 | 38 | def top_left_corner(piece_list, container_width, container_height): 39 | ''' Top left corner algorithm 40 | 41 | Keyword arguments: 42 | piece_list -- list of pieces (float numpy arrays) 43 | container_width -- size of the container along the x axis (int) 44 | container_height -- size of the container along the y axis (int) 45 | ''' 46 | 47 | packed_pieces = 0 48 | container = np.zeros((container_height, container_width)) 49 | 50 | for piece in piece_list: 51 | 52 | 53 | # Remove if statement!!! 54 | if (packed_pieces > 0): 55 | break 56 | ### 57 | 58 | 59 | piece_placed = False 60 | 61 | for c_row in range(container_height): 62 | for c_col in range(container_width): 63 | if not pieces_overlap(container, piece, c_row, c_col): 64 | place_piece(container, piece, c_row, c_col) 65 | piece_placed = True 66 | break 67 | else: 68 | continue 69 | 70 | if piece_placed: 71 | break 72 | 73 | if piece_placed: 74 | packed_pieces += 1 75 | 76 | return(container, packed_pieces) 77 | 78 | 79 | def pieces_overlap(container, piece, x_coord, y_coord): 80 | '''Returns true if the piece overlaps with others, false if not 81 | 82 | Keyword arguments: 83 | container -- binary representation of the container (numpy array) 84 | piece -- binary representation of the piece tested (numpy array) 85 | x_coord -- x position in the container (int) 86 | y_coord -- y position in the container (int) 87 | ''' 88 | 89 | # Check for overlap, if found, return True 90 | 91 | return False 92 | 93 | 94 | def place_piece(container, piece, x_coord, y_coord): 95 | ''' Modifies the container to have piece placed at position (x,y) 96 | 97 | Keyword arguments: 98 | container -- binary representation of the container (numpy array) 99 | piece -- binary representation of the piece placed (numpy array) 100 | x_coord -- x position in the container (int) 101 | y_coord -- y position in the container (int) 102 | ''' 103 | 104 | p_x, p_y = piece.shape 105 | 106 
| container_bit = container[x_coord:x_coord + p_x, y_coord:y_coord + p_y] 107 | container_bit += piece 108 | 109 | 110 | def read_pieces(directory, piece_names): 111 | ''' Returns a list of pieces composed of numpy arrays for each of them 112 | 113 | The pieces should be in the directory "directory" and their names listed 114 | in the list "piece_names". It is expected that pieces are CSV files 115 | containing only 0 and 1 values. The function adds a small random number 116 | (i.e. <= 0.4) in order to achieve different colours when plotting the 117 | resulting container. 118 | 119 | Keyword arguments: 120 | directory -- string containing the name of directory of the piece files 121 | piece_names -- list with the names of the piece CSV files 122 | x -- x position in the container (int) 123 | y -- y position in the container (int) 124 | ''' 125 | n_pieces = len(piece_names) 126 | piece_list = [] 127 | for pname in piece_names: 128 | piece = np.loadtxt(directory + os.sep + pname, 129 | delimiter=',', 130 | dtype=np.float64, 131 | ndmin=2) 132 | 133 | # Add a small random number to have different colours in the plot 134 | piece = piece*(1 + 0.1 + (1 + len(piece_list))/n_pieces*0.4) 135 | piece_list.append(piece) 136 | 137 | return piece_list 138 | 139 | 140 | def plot_container(container): 141 | ''' Plots the contents of the container 142 | 143 | To have different colours, loops through the pieces and changes their 144 | value from 1.xx to their index + 2 145 | 146 | Keyword arguments: 147 | container -- binary representation of the container (numpy array) 148 | ''' 149 | 150 | container_for_plot = container 151 | 152 | n_pieces = 0 153 | while True: 154 | min_val = np.min(container_for_plot[container_for_plot > 0]) 155 | if min_val > 2: 156 | break 157 | 158 | container_for_plot[container_for_plot == min_val] = n_pieces + 2 159 | 160 | n_pieces += 1 161 | 162 | plt.imshow(container_for_plot, cmap='tab20b') 163 | 164 | if __name__ == "__main__": 165 | main() 166 | 
-------------------------------------------------------------------------------- /content/01_algorithms/04_exercises/ex_templates/lab4_debug_challenge.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Lab 4 - Debug Challenge 4 | 5 | @author: Carlos Lamas-Fernandez 6 | """ 7 | import numpy as np 8 | import matplotlib.pyplot as plt 9 | 10 | def main(): 11 | ''' Defines the main parameters and runs the algorithm 12 | ''' 13 | 14 | directory = '../data/pieces' 15 | piece_names = ['p1.csv', 'p2.csv', 'p3.csv', 'p4.csv', 'p5.csv', 16 | 'p6.csv', 'p7.csv', 'p8.csv', 'p9.csv', 'p10.csv'] 17 | 18 | piece_list = read_pieces(directory, piece_names) 19 | 20 | 21 | # Print information for the pieces: 22 | for p_i in piece_list: 23 | # Print the piece first: 24 | print('===== PIECE {0}=====') 25 | print_piece(p_i) 26 | 27 | print('Piece rotations:') 28 | for rot in range(4): 29 | print('Rotated {0} degrees:'.format(90*rot) 30 | p_i = rotate_right(p_i) 31 | print_piece(p_i) 32 | 33 | 34 | def rotate_right(piece) 35 | ''' Rotate a piece to the right. 36 | We first transpose it and then flip it horizontally 37 | ''' 38 | rotated_piece = np.transpose(piece) 39 | rotated_piece = np.fliplr(piece) 40 | return(rotated_piece) 41 | 42 | def print_piece(piece): 43 | ''' Shows the piece on screen 44 | ''' 45 | plt.imshow(-1*piece, cmap='gray') 46 | plt.show() 47 | 48 | 49 | 50 | def read_pieces(directory, piece_names): 51 | ''' Returns a list of pieces composed of numpy arrays for each of them 52 | 53 | The pieces should be in the directory "directory" and their names listed 54 | in the list "piece_names". It is expected that pieces are CSV files 55 | containing only 0 and 1 values. The function adds a small random number 56 | (i.e. <= 0.4) in order to achieve different colours when plotting the 57 | resulting container. 
58 | 59 | Keyword arguments: 60 | directory -- string containing the name of directory of the piece files 61 | piece_names -- list with the names of the piece CSV files 62 | x -- x position in the container (int) 63 | y -- y position in the container (int) 64 | ''' 65 | piece_list = [] 66 | for pname in piece_names: 67 | piece = np.loadtxt(directory + os.sep + pname, 68 | delimiter=',', 69 | dtype=np.float64, 70 | ndmin=2) 71 | 72 | piece_list.append(piece) 73 | 74 | return piece_list 75 | 76 | if __name__ == "__main__": 77 | main() 78 | -------------------------------------------------------------------------------- /content/01_algorithms/04_exercises/im/all_overlap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/content/01_algorithms/04_exercises/im/all_overlap.png -------------------------------------------------------------------------------- /content/01_algorithms/04_exercises/im/brb_sol.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/content/01_algorithms/04_exercises/im/brb_sol.png -------------------------------------------------------------------------------- /content/01_algorithms/04_exercises/im/one_piece.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/content/01_algorithms/04_exercises/im/one_piece.PNG -------------------------------------------------------------------------------- /content/01_algorithms/04_exercises/im/only_one_piece.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/content/01_algorithms/04_exercises/im/only_one_piece.png -------------------------------------------------------------------------------- /content/01_algorithms/04_exercises/im/outline_pane.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/content/01_algorithms/04_exercises/im/outline_pane.PNG -------------------------------------------------------------------------------- /content/01_algorithms/04_exercises/im/valid_layout.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/content/01_algorithms/04_exercises/im/valid_layout.png -------------------------------------------------------------------------------- /content/01_algorithms/05_debug.md: -------------------------------------------------------------------------------- 1 | # Debug challenges 2 | 3 | To test your `numpy` skill there are two debug challenges 4 | 5 | 1. Debug a monte-carlo simulation of an acute stroke pathway. -------------------------------------------------------------------------------- /content/01_algorithms/05_debug/00_debug_cv.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | 4 | def synthetic_classification(n_samples=10, n_features=1, shuffle=False, 5 | random_seed=None): 6 | ''' 7 | Generates a simple random synthetic dataset in a given shape. 8 | Used for testing of generator classes. 9 | 10 | X: each feature is a sequence i to i + n_samples where i is the feature no. 11 | y data is 0 or 1 weighted very roughly 50/50. 12 | 13 | These sequences are randomised if shuffle is set to True. 
14 | 15 | No error checking. Assumes all inputs are valid. 16 | 17 | Params: 18 | ------ 19 | n_samples: int, optional (default=10) 20 | The number of samples 21 | 22 | n_features: int, optional (default=1) 23 | The number of features in the classification problem 24 | 25 | shuffle: bool, optional (default=False) 26 | If true then sequences are randomly shuffled 27 | 28 | random_seed: int or None, optional (default=None) 29 | If shuffle then controls the ordering of the sequences generated. 30 | 31 | Returns: 32 | -------- 33 | X, y 34 | Where X and y are np.ndarrays and X will have shape 35 | (n_samples, n_features) 36 | 37 | ''' 38 | X = [[(col * (n_samples)) + row for col in range(n_features)] 39 | for row in range(n_samples)] 40 | y = ([1] * (n_samples // 2)) + ([0] * ((n_samples // 2) + (n_samples % 2))) 41 | 42 | if shuffle: 43 | for lst in [X, y]: 44 | random.seed(random_seed) 45 | random.shuffle(lst) 46 | return np.array(X), np.array(y) 47 | 48 | 49 | class KFold: 50 | ''' 51 | K-fold cross validation of a X, y formatted dataset. 52 | Optional random shuffling of input data. 53 | Note that original data is not shuffled, but a copy of a shuffled 54 | array is created. 55 | ''' 56 | def __init__(self, k=5, shuffle=False, random_seed=None): 57 | ''' 58 | Params: 59 | ------- 60 | k: int 61 | The number of folds 62 | 63 | shuffle: bool, optional (default=False) 64 | When True the data are randomly shuffled 65 | 66 | random_seed: int or None, optional (default=None) 67 | When shuffle set to true and random_seed is an integer the shuffling 68 | of the dataset is controlled prior to folding. 69 | ''' 70 | self.k = k 71 | self.shuffle = shuffle 72 | self.rng = np.random.default_rng(random_seed) 73 | 74 | def __repr__(self): 75 | rep = f'KFoldCV(k={self.k}, shuffle={self.shuffle},' \ 76 | + f'random_seed={self.random_seed})' 77 | 78 | 79 | def get_n_splits(self, X): 80 | ''' 81 | Return an integer representing the number of splits that 82 | will be generated. 
83 | 84 | ''' 85 | return self.k 86 | 87 | def split(self, X, y): 88 | ''' 89 | Generator method. Returns incremental splits of the dataset 90 | on each call. 91 | 92 | Params: 93 | ------ 94 | X: array-like 95 | python list or numpy.ndarray containing X data. For multiple features 96 | shape should be (n_samples, n_features) 97 | 98 | y: array-like 99 | python list or numpy.ndarray containing y target data. For multiple 100 | targets shape should be (n_samples, n_targets) 101 | 102 | Returns: 103 | -------- 104 | train_X, test_X, train_y, test_y 105 | 106 | Where each is a np.ndarray 107 | ''' 108 | # convert lists to numpy arrays 109 | X, y = np.asarray(X), np.asarray(y) 110 | 111 | # store the indexes of each element - its these that get shuffled. 112 | if self.shuffle: 113 | idx = self.rng.integers(0, len(X), size=len(X)) 114 | else: 115 | idx = np.arange(len(X), dtype=np.int16) 116 | 117 | # length of k - 1 splits... final split continues to end. 118 | split_len = int(len(X) / (self.k)) 119 | 120 | for test_idx in range(0, len(X), split_len): 121 | 122 | # create k - 1 training folds for X 123 | train_X = self._fold_training_data(X, idx, test_idx, split_len) 124 | # X test data for fold 125 | test_X = X[idx[test_idx: test_idx+split_len]] 126 | 127 | # create k - 1 training segments for y 128 | train_y = self._fold_training_data(y, idx, test_idx, split_len) 129 | # y test data fold 130 | test_y = y[idx[test_idx: test_idx+split_len]] 131 | 132 | yield train_X, test_X, train_y, test_y 133 | 134 | 135 | def _fold_training_data(self, data, idx, test_idx, split_len): 136 | ''' 137 | create training segments for X or y 138 | ''' 139 | train_seg1 = data[idx[:test_idx]] 140 | train_seg2 = data[idx[test_idx + split_len: ]] 141 | return np.concatenate([train_seg1, train_seg2]) 142 | 143 | 144 | if __name__ == '__main__': 145 | # generate test dataset 146 | X, y = synthetic_classification(n_samples=10, n_features=1, shuffle=False) 147 | 148 | # create an instance of 
LeaveNOut 149 | cv = KFold(k=5) 150 | 151 | # basic cross validation loop. 152 | # I've zipped together a range and the splits into order to get fold no. 153 | for i, split_data in zip(range(cv.get_n_splits(X)), cv.split(X, y)): 154 | train_X, train_y, test_X, test_y = split_data 155 | print(f'Fold {i+1}:\nTrain:\tX:{train_X}, y:{train_y}') 156 | print(f'Test:\tX:{test_X}, y:{test_y}') 157 | 158 | 159 | 160 | -------------------------------------------------------------------------------- /content/01_algorithms/06_solutions.md: -------------------------------------------------------------------------------- 1 | # Solutions -------------------------------------------------------------------------------- /content/01_algorithms/06_solutions/data/breach.csv: -------------------------------------------------------------------------------- 1 | breaches 2 | 33184 3 | 41151 4 | 47414 5 | 46436.42857 6 | 89917.28571 7 | 72889.28571 8 | 46942.85714 9 | 59172.71429 10 | 57379.85714 11 | 54805.42857 12 | 51701.71429 13 | 44885.42857 14 | 45076.57143 15 | 55392.42857 16 | 62452.71429 17 | 59131.85714 18 | 72832 19 | 73746.57143 20 | 85427.14286 21 | 66947 22 | 64561 23 | 61520.71429 24 | 61439.28571 25 | 56433.57143 26 | 54607 27 | 59110.14286 28 | 68868.42857 29 | 70014.28571 30 | 97186.57143 31 | 94561.42857 32 | 94112.28571 33 | 120995 34 | 120121.7143 35 | 64823.28571 36 | 56340 37 | 65588.71429 38 | 68396.14286 39 | 74543.71429 40 | 79250 41 | 70781.14286 42 | 84049.28571 43 | 83995.71429 44 | 89101.28571 45 | 85828.71429 46 | 88957.14286 47 | 102189.7143 48 | 92863.71429 49 | 96272 50 | 88879.71429 51 | 99849.71429 52 | 118978.1429 53 | 119731.1429 54 | 192155 55 | 153156.4286 56 | 132867.8571 57 | 141194 58 | 125799.2857 59 | 110284.4286 60 | 99324 61 | 97475 62 | 106475 63 | 123111 64 | 147554 65 | 163472 66 | 168604 67 | 216286 68 | 229081 69 | 265834 70 | 186122 71 | 201329 72 | 184912 73 | 201973 74 | 174419 75 | 182597 76 | 219137 77 | 221713 78 | 268818 79 | 281612 
80 | 216416 81 | 201392 82 | -------------------------------------------------------------------------------- /content/01_algorithms/06_solutions/data/dtocs.csv: -------------------------------------------------------------------------------- 1 | dtoc 2 | 4940 3 | 5004 4 | 4588 5 | 4409 6 | 3861 7 | 4597 8 | 4404 9 | 4170 10 | 3910 11 | 4056 12 | 4137 13 | 4228 14 | 4144 15 | 4165 16 | 4150 17 | 4165 18 | 3617 19 | 4094 20 | 4007 21 | 4028 22 | 3954 23 | 3857 24 | 4086 25 | 4031 26 | 3961 27 | 4102 28 | 4115 29 | 3894 30 | 3448 31 | 4188 32 | 4007 33 | 4053 34 | 4046 35 | 4184 36 | 3888 37 | 3961 38 | 4084 39 | 4231 40 | 4147 41 | 4200 42 | 3649 43 | 4221 44 | 4276 45 | 4327 46 | 4207 47 | 4516 48 | 4363 49 | 4612 50 | 4704 51 | 4960 52 | 4930 53 | 5063 54 | 4475 55 | 5221 56 | 4942 57 | 4948 58 | 4739 59 | 4972 60 | 4996 61 | 4888 62 | 5114 63 | 5247 64 | 5330 65 | 5573 66 | 5004 67 | 5777 68 | 5714 69 | 5601 70 | 5852 71 | 5996 72 | 6152 73 | 6361 74 | 6387 75 | 6759 76 | 6777 77 | 6771 78 | 6167 79 | 7118 80 | 6855 81 | 6648 82 | -------------------------------------------------------------------------------- /content/01_algorithms/06_solutions/data/lysis.csv: -------------------------------------------------------------------------------- 1 | per_treated 2 | 0.018867925 3 | 0.03030303 4 | 0.018867925 5 | 0.018867925 6 | 0.060606061 7 | 0.018867925 8 | 0.045454545 9 | 0.045454545 10 | 0.018867925 11 | 0.075757576 12 | 0.045454545 13 | 0.015151515 14 | 0.03030303 15 | 0.018867925 16 | 0.015151515 17 | 0.060606061 18 | 0.060606061 19 | 0.03030303 20 | 0.015151515 21 | 0.045454545 22 | 0.015151515 23 | 0.03030303 24 | 0.03030303 25 | 0.075757576 26 | 0.060606061 27 | 0.03030303 28 | 0.060606061 29 | 0.03030303 30 | 0.060606061 31 | 0.03030303 32 | 0.060606061 33 | 0.060606061 34 | 0.075757576 35 | 0.045454545 36 | 0.075757576 37 | 0.03030303 38 | 0.045454545 39 | 0.015151515 40 | 0.121212121 41 | 0.03030303 42 | 0.045454545 43 | 0.045454545 44 | 0.075757576 45 | 
0.166666667 46 | 0.03030303 47 | 0.090909091 48 | 0.090909091 49 | 0.090909091 50 | 0.25 51 | 0.147058824 52 | 0.161764706 53 | 0.117647059 54 | 0.191176471 55 | 0.132352941 56 | -------------------------------------------------------------------------------- /content/01_algorithms/06_solutions/data/moviedb.csv: -------------------------------------------------------------------------------- 1 | ID,Title,Budget,Box_office,Year,Meta_Critic 2 | 1,Amazing spiderman,230,757.9,2012,66 3 | 2,Ironman,140,585.2,2008,57 4 | 3,Thor,150,449.3,2011,54 5 | 4,Captain America: the first avenger,140,370.6,2011,66 6 | 5,Antman,130,519.3,2015,64 7 | 6,Guardians of the Galaxy,232.3,773,2014,76 8 | -------------------------------------------------------------------------------- /content/01_algorithms/data/hist.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/content/01_algorithms/data/hist.csv -------------------------------------------------------------------------------- /content/01_algorithms/data/minor_illness_ed_attends.csv: -------------------------------------------------------------------------------- 1 | attends_rate_per_10k_pop 2 | 2.11927795 3 | 3.490575446 4 | 3.989229081 5 | 2.368604767 6 | 3.241248629 7 | 2.867258402 8 | 3.11658522 9 | 2.742594994 10 | 3.615238855 11 | 3.615238855 12 | 4.363219308 13 | 3.11658522 14 | 3.739902264 15 | 2.243941358 16 | 3.241248629 17 | 1.620624314 18 | 2.243941358 19 | 2.991921811 20 | 2.368604767 21 | 2.368604767 22 | 2.368604767 23 | 3.11658522 24 | 2.493268176 25 | 2.368604767 26 | 3.11658522 27 | 2.742594994 28 | 3.739902264 29 | 1.994614541 30 | 2.867258402 31 | 1.620624314 32 | 2.991921811 33 | 3.365912037 34 | 1.620624314 35 | 3.739902264 36 | 2.742594994 37 | 3.11658522 38 | 2.493268176 39 | 2.368604767 40 | 3.739902264 41 | 3.864565673 42 | 1.745287723 43 | 4.238555899 44 | 
2.368604767 45 | 3.615238855 46 | 1.994614541 47 | 2.11927795 48 | 2.991921811 49 | 2.617931585 50 | 2.243941358 51 | 2.368604767 52 | 3.490575446 53 | 2.368604767 54 | 2.867258402 55 | 2.991921811 56 | 2.867258402 57 | 2.867258402 58 | 2.991921811 59 | 3.241248629 60 | 2.617931585 61 | 3.11658522 62 | 3.11658522 63 | 2.11927795 64 | 3.864565673 65 | 2.867258402 66 | 3.989229081 67 | 5.111199761 68 | 2.867258402 69 | 2.867258402 70 | 2.493268176 71 | 3.739902264 72 | 3.739902264 73 | 1.869951132 74 | 2.11927795 75 | 3.615238855 76 | -------------------------------------------------------------------------------- /content/01_algorithms/data/salaries.csv: -------------------------------------------------------------------------------- 1 | ID,Age,"Gender (0=Female, 1=Male)",Salary 2 | 0,22,0,72000 3 | 1,47,0,27000 4 | 2,35,0,36000 5 | 3,33,1,19000 6 | 4,34,1,104000 7 | 5,65,1,86000 8 | 6,54,1,104000 9 | 7,19,0,21000 10 | 8,62,1,52000 11 | -------------------------------------------------------------------------------- /content/01_algorithms/data/salaries_extended.csv: -------------------------------------------------------------------------------- 1 | Name,ID,Age,Department,Salary 2 | John,0,22,Marketing,72000 3 | Anna,1,47,Marketing,27000 4 | Joseph,2,35,Sales,36000 5 | Mary,3,33,Logistics,19000 6 | Anthony,4,34,Logistics,104000 7 | Claire,5,65,Logistics,86000 8 | Bernard,6,54,Logistics,104000 9 | Sarah,7,19,Sales,21000 10 | Nick,8,62,Sales,52000 11 | -------------------------------------------------------------------------------- /content/01_algorithms/im/gsearch.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/content/01_algorithms/im/gsearch.PNG -------------------------------------------------------------------------------- /content/01_algorithms/im/salaries.PNG: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/content/01_algorithms/im/salaries.PNG -------------------------------------------------------------------------------- /content/01_algorithms/im/salaries_extended.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/content/01_algorithms/im/salaries_extended.PNG -------------------------------------------------------------------------------- /content/02_stat_prog/01_pandas_front_page.md: -------------------------------------------------------------------------------- 1 | # Data wrangling 2 | 3 | `pandas` is a data science package orignally developed by Wes McKinney. It builds on top of `numpy` to provide a higher level API for wrangling, analysing and visualising data. It is also closely coupled to matplotlib with a number of shorthand methods to create plots of data. 4 | 5 | Our labs on `pandas` will cover beginner and intermediate techniques in data wrangling, manipulation and visualisation. There is also an exercise on creating a reproducible pipeline for downloading and efficiently storing a large data file in memory. 
6 | 7 | 8 | -------------------------------------------------------------------------------- /content/02_stat_prog/02_matplotlib/explore.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/content/02_stat_prog/02_matplotlib/explore.png -------------------------------------------------------------------------------- /content/02_stat_prog/02_matplotlib/stacked.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/content/02_stat_prog/02_matplotlib/stacked.png -------------------------------------------------------------------------------- /content/02_stat_prog/02_visual_front_page.md: -------------------------------------------------------------------------------- 1 | # Visualising data -------------------------------------------------------------------------------- /content/02_stat_prog/03_exercises/03_visualise_ts.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Visualising time series data" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import pandas as pd\n", 17 | "import numpy as np\n", 18 | "import matplotlib.pyplot as plt" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "**Step 1: Import emergency department reattendance data.** \n", 26 | "\n", 27 | "This is a time series from a hospital that measures the number of patients per month that have reattended an ED within 7 days of a previous attendance.\n", 28 | "\n", 29 | "This can be found in **\"data/ed_reattend.csv\"**\n", 30 | "or \n", 31 | 
"'https://raw.githubusercontent.com/hsma-master/hsma/master/12_forecasting/data/ed_reattend.csv'\n", 32 | "\n", 33 | "* **Hint 1**: look back at the lecture notes and see how `pd.read_csv()` was used. \n", 34 | "\n", 35 | "* **Hint 2**: The format of the 'date' column is in UK standard dd/mm/yyyy. You will need to set the `dayfirst=True` of `pd.read_csv()` to make sure pandas interprets the dates correctly.\n", 36 | "\n", 37 | "* **Hint 3**: The data is monthly and the dates are all the first day of the month. This is called monthly start and its shorthand is 'MS'" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 1, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "#your code here\n", 47 | "url = 'https://raw.githubusercontent.com/hsma-master/hsma/master/' \\\n", 48 | " + '12_forecasting/data/ed_reattend.csv'" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "**Step 2: Check the shape of the `DataFrame` and print out the first 5 observations**" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "#your code here" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "**Step 3: Check the minimum and maximum date of the series**\n", 72 | "\n" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "#your code here" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "**Step 4: Create a basic plot of the time series**" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "#your code here" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "**Step 5: Improve the appearance of your chart**\n", 105 | " 
\n", 106 | "Try the following:\n", 107 | " \n", 108 | "* Add a y-axis label\n", 109 | "* Add gridlines to the plot\n", 110 | "* Add markers to block\n", 111 | "* Change the colour of the line\n", 112 | "* Experiment with using seaborn" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "#your code here" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "**Step 6: Perform a calender adjustment**\n", 129 | "\n", 130 | "The data is at the monthly level. Therefore some of the noise in the time series is due to the differing number of days per month. Perform a calender adjust and plot the daily rate of reattendance." 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "metadata": {}, 137 | "outputs": [], 138 | "source": [ 139 | "#your code here" 140 | ] 141 | }, 142 | { 143 | "cell_type": "markdown", 144 | "metadata": {}, 145 | "source": [ 146 | "**Step 7: Run a smoother through the series to assess trend**\n", 147 | "\n", 148 | "Hint: Try using the `.rolling` method of dataframe with a `window=12` and `center=True` to create a 12 month centred moving average \n", 149 | "\n", 150 | "Is there any benefit from switchoing to a 6 month MA? Why does the 6-MA look different to the 12-MA.\n", 151 | "\n", 152 | "Use the calender adjusted data." 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [ 161 | "#your code here" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "**Step 8: Perform a seasonal decomposition on the time series**\n", 169 | "\n", 170 | "Plot the trend, seasonal and remainder components of the decomposition.\n", 171 | "\n", 172 | "Try both an additive and multiplicative model. 
What is the difference between the two models?\n", 173 | "\n", 174 | "* Hint: Look back at the lecture for a function to help you.\n", 175 | "\n", 176 | "\n" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": null, 182 | "metadata": {}, 183 | "outputs": [], 184 | "source": [ 185 | "#your code here" 186 | ] 187 | } 188 | ], 189 | "metadata": { 190 | "kernelspec": { 191 | "display_name": "Python 3 (ipykernel)", 192 | "language": "python", 193 | "name": "python3" 194 | }, 195 | "language_info": { 196 | "codemirror_mode": { 197 | "name": "ipython", 198 | "version": 3 199 | }, 200 | "file_extension": ".py", 201 | "mimetype": "text/x-python", 202 | "name": "python", 203 | "nbconvert_exporter": "python", 204 | "pygments_lexer": "ipython3", 205 | "version": "3.11.9" 206 | } 207 | }, 208 | "nbformat": 4, 209 | "nbformat_minor": 4 210 | } 211 | -------------------------------------------------------------------------------- /content/02_stat_prog/03_exercises/data/sw_imaging.csv: -------------------------------------------------------------------------------- 1 | region,org_code,provider,imaging_type,n_referrals,mdn_days_rtt,mdn_days_ttr 2 | Y58,REF,Royal Cornwall Hospitals NHS Trust,Computerized Axial Tomography,46160,3.0,0.0 3 | Y58,REF,Royal Cornwall Hospitals NHS Trust,Diagnostic Ultrasonography,72985,14.0,0.0 4 | Y58,REF,Royal Cornwall Hospitals NHS Trust,Fluoroscopy,12320,0.0,0.0 5 | Y58,REF,Royal Cornwall Hospitals NHS Trust,Magnetic Resonance Imaging,27535,40.0,5.0 6 | Y58,REF,Royal Cornwall Hospitals NHS Trust,Nuclear Medicine Procedure,4245,28.0,1.0 7 | Y58,REF,Royal Cornwall Hospitals NHS Trust,Plain Radiography,198295,0.0,1.0 8 | Y58,REF,Royal Cornwall Hospitals NHS Trust,Single Photon Emission Computerized Tomography,425,28.0,0.0 9 | Y58,RH8,Royal Devon and Exeter NHS Foundation Trust,Computerized Axial Tomography,40870,3.0,0.0 10 | Y58,RH8,Royal Devon and Exeter NHS Foundation Trust,Diagnostic Ultrasonography,40070,14.0,0.0 11 | 
Y58,RH8,Royal Devon and Exeter NHS Foundation Trust,Fluoroscopy,4745,16.0,0.0 12 | Y58,RH8,Royal Devon and Exeter NHS Foundation Trust,Magnetic Resonance Imaging,22380,23.0,7.0 13 | Y58,RH8,Royal Devon and Exeter NHS Foundation Trust,Nuclear Medicine Procedure,3465,14.0,1.0 14 | Y58,RH8,Royal Devon and Exeter NHS Foundation Trust,Plain Radiography,161850,0.0,1.0 15 | Y58,RH8,Royal Devon and Exeter NHS Foundation Trust,Positron Emission Tomography,1000,9.0,4.0 16 | Y58,RH8,Royal Devon and Exeter NHS Foundation Trust,Single Photon Emission Computerized Tomography,545,17.0,2.0 17 | Y58,RA9,Torbay and South Devon NHS Foundation Trust,Computerized Axial Tomography,31165,3.0,0.0 18 | Y58,RA9,Torbay and South Devon NHS Foundation Trust,Diagnostic Ultrasonography,42690,13.0,0.0 19 | Y58,RA9,Torbay and South Devon NHS Foundation Trust,Fluoroscopy,8230,0.0,0.0 20 | Y58,RA9,Torbay and South Devon NHS Foundation Trust,Magnetic Resonance Imaging,14730,14.0,2.0 21 | Y58,RA9,Torbay and South Devon NHS Foundation Trust,Nuclear Medicine Procedure,1975,20.0,1.0 22 | Y58,RA9,Torbay and South Devon NHS Foundation Trust,Plain Radiography,110505,0.0,0.0 23 | Y58,RA9,Torbay and South Devon NHS Foundation Trust,Positron Emission Tomography,810,14.0,2.0 24 | Y58,RA9,Torbay and South Devon NHS Foundation Trust,Single Photon Emission Computerized Tomography,90,33.0,5.5 25 | Y58,RK9,University Hospitals Plymouth NHS Trust,Computerized Axial Tomography,54575,2.0,0.0 26 | Y58,RK9,University Hospitals Plymouth NHS Trust,Diagnostic Ultrasonography,89160,2.0,0.0 27 | Y58,RK9,University Hospitals Plymouth NHS Trust,Fluoroscopy,11500,0.0,0.0 28 | Y58,RK9,University Hospitals Plymouth NHS Trust,Magnetic Resonance Imaging,27210,30.0,3.0 29 | Y58,RK9,University Hospitals Plymouth NHS Trust,Nuclear Medicine Procedure,5030,23.0,0.0 30 | Y58,RK9,University Hospitals Plymouth NHS Trust,Plain Radiography,188830,0.0,3.0 31 | Y58,RK9,University Hospitals Plymouth NHS Trust,Positron Emission 
Tomography,270,14.0,1.0 32 | Y58,RK9,University Hospitals Plymouth NHS Trust,Single Photon Emission Computerized Tomography,1285,0.0,0.0 33 | -------------------------------------------------------------------------------- /content/02_stat_prog/03_exercises/data/total_referrals.csv: -------------------------------------------------------------------------------- 1 | provider,n_referrals 2 | Royal Cornwall Hospitals NHS Trust,361965 3 | Royal Devon and Exeter NHS Foundation Trust,274925 4 | Torbay and South Devon NHS Foundation Trust,210195 5 | University Hospitals Plymouth NHS Trust,377860 6 | -------------------------------------------------------------------------------- /content/02_stat_prog/03_exercises/hosp_1_ed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/content/02_stat_prog/03_exercises/hosp_1_ed.png -------------------------------------------------------------------------------- /content/02_stat_prog/03_exercises_front_page.md: -------------------------------------------------------------------------------- 1 | # Exercises 2 | 3 | `pandas` is a data science package orignally developed by Wes McKinney. It builds on top of `numpy` to provide a higher level API for wrangling, analysing and visualising data. It is also closely coupled to matplotlib with a number of shorthand methods to create plots of data. 4 | 5 | Our lectures on `pandas` will cover beginner and intermediate techniques in data wrangling, manipulation and visualisation. 
6 | -------------------------------------------------------------------------------- /content/02_stat_prog/04_solutions/total_referrals.csv: -------------------------------------------------------------------------------- 1 | provider,n_referrals 2 | Royal Cornwall Hospitals NHS Trust,361965 3 | Royal Devon and Exeter NHS Foundation Trust,274925 4 | Torbay and South Devon NHS Foundation Trust,210195 5 | University Hospitals Plymouth NHS Trust,377860 6 | -------------------------------------------------------------------------------- /content/02_stat_prog/04_solutions_front_page.md: -------------------------------------------------------------------------------- 1 | # Solutions 2 | 3 | The following sections provide example solutions to the statistical programming exercises. There are often many ways to solve data wrangling and visualisation problems. The solutions provided should be taken as guides only. If you feel you have a better way feel free to raise an issue and suggest your solution is adopted instead! -------------------------------------------------------------------------------- /content/03_mgt/01_git/01_why.md: -------------------------------------------------------------------------------- 1 | # The case for version control 2 | 3 | ```{admonition} "Wait.. you DON'T use version control for your code?!" 4 | I'm going to be very honest and say that I find it odd that I still need to have a conversation with people who code about basic version control. I can't really understand why it isn't taught routinely and second nature to everyone. Its actually best not to admit you don't use it when I'm in the room. 5 | 6 | I'm old, and in my professional career, I can't remember a time I didn't use version control software. I think its so important for data science that I'm going to admit a few embarrassing (in a geeky sort of way) things about my history with version control. 
My first admission is that I first ventured into the world of version control in the summer of 2001 for a summer VB6(!) coding job. At the time I was doing my undergrad in Computer Science and let me tell you - I made a lot of mistakes in my coding! My second admission is that I used to be an avid MS Windows user (so sorry Stallman) and from around 2003-2008 I used TortoiseSVN. This is a GUI extension of the excellent subversion software for version control. I switched to equally excellent Git after I returned to university to study for a PhD (introduced courtesy of the Warwick Computing Society) where I used it to control R, S-PLUS(!), and C# code. It was not until June 28th 2011 that I pushed my first commit to the now famous GitHub (some dodgy C# code to automate a commercial simulation package via the Windows Common Object Model - yuk!). 7 | 8 | You don't need to remember any of that, just take home the message, that I'm pro version control for one single reason. Part of any data science study is carefully controlling and managing your code. If you don't then you will fail to get it producing the same results or perhaps even working again in 6 months time! **You should view your code as a first class citizen in data science. Do your code, yourself and others a favour - use version control.** 9 | ``` 10 | 11 | ## Why use version control? 12 | 13 | ### Scenario 1 14 | 15 | Consider a scenario where you take up a position as a data scientist in a government organisation. On your first day you are told that your predecessor has left already, but all code needed for your job is saved to the server. 
You log in and have a look in the directory: 16 | 17 | ``` 18 | uber_import_gov_proj 19 | ├── 20190320_main_v2.py 20 | ├── archive 21 | │   ├── 20190504_v3_main_not_final.py 22 | │   ├── tests_before_fix.py 23 | │   ├── v1_main.py 24 | │   └── v3_main_final.py 25 | ├── v2_main_20190320.py 26 | ├── v3_main_final.py 27 | ├── v2v3_main_final_TM_MP_MA_DC(MA_conflicted_copy).py 28 | └── v3_main_final_TMonks_Conflicted_Copy.py 29 | ``` 30 | Take a moment to take in the mess of this project. Perhaps you can laugh about it. The questions you should be asking yourself are: 31 | 32 | * have you ever ended up in a mess like this even though you have had the most noble of intentions at the start of a project? 33 | * have you ever worked with someone who has managed work in this way? 34 | 35 | In my experience this sort of structure turns up surprisingly often, for all sorts of data science and non-data science projects. It is certainly more common than a cleanly organised data science project. This is a totally unnecessary situation. With version control we actually only need this structure: 36 | 37 | ``` 38 | uber_import_gov_proj 39 | ├── main.py 40 | ``` 41 | 42 | ### Scenario 2 43 | 44 | Even though the code is a complete mess, you are still working for that government organisation several months later. It is a Monday morning and you stroll into work with the intention of trying again to work out if you should run the analysis code in `v3_main_final.py` or `archive/v3_main_final.py`. But alas your plans are interrupted! Some organisation critical code originally written years ago, by an analyst long since departed, failed to run over the weekend. It's your job to fix it! You open up the code and after the initial horror of finding its a single 'god function' with a repeating verbose code, begin to try and make sense of the problem. 
Your initial findings are: 45 | 46 | * Its clear from comments in the code that it has been modified by several people over the years, but it is not clear how many times, who the coders were, what the changes were made and in what order. 47 | * There's no 'archive' folder listing older versions of the code and no documentation. So there's no way to roll back changes. 48 | * There's no code to test if the main analysis code runs as expected. 49 | 50 | Before you laugh again this is actually a situation I found myself in many years ago. It wasn't fun (at all - especially as I had lots of people checking if "I'd fixed it yet?" quite frequently). It did turn out that a change had introduced the bug under a given set of conditions. So, after quite a while, I fixed what turned out to be an extremely important piece of code for the organisation. There was no version control system in place so I carefully documented the changes both in the code via comments and in external documentation. 51 | 52 | Can you think of any software that's open source and free that would have made this a bit easier? 53 | -------------------------------------------------------------------------------- /content/03_mgt/01_git/02_git.md: -------------------------------------------------------------------------------- 1 | # Introducing Git 2 | 3 | Congratulations, you have reached **a very important topic** in your data science studies! Before we get into **Git** I want to acknowledge there other high quality version control tools available for your python or code in any other language; for example, subversion. 4 | 5 | Git is a distributed version control system for files. Git was originally developed by Linus Torvolds (who famously created the **Linux kernel**). 6 | 7 | > The origin of the name Git is quite amusing. I will leave you to look these stories up and enjoy! 
I'm going to teach you about Git not because it's the best, but because **it is software I use on a daily basis** and also it's the software I see most researchers using (which isn't that many!).
For example, with this project git has a log that reports: 54 | 55 | ```bash 56 | commit 4be943efd265dd58020d64af770ff63d229fd8d8 (HEAD -> master) 57 | Author: Tom Monks 58 | Date: Mon Aug 2 16:13:24 2021 +0100 59 | 60 | MAIN: added do_something_else() 61 | 62 | commit 2e09f233e392448fdcf82b3c8ed45cd8a72c3e0e 63 | Author: Tom Monks 64 | Date: Mon Aug 2 16:11:58 2021 +0100 65 | 66 | MAIN: main.py -> do_something() 67 | 68 | ``` 69 | 70 | In the output above the first line ends with `(HEAD -> master)`. This the the latest commit (the **head**). 71 | 72 | Git commits track the changes to files between commits (or the history of changes to a file). We can view changes between specific commits. This is called the **difference** or **diff**. For example for our two simple commits Git outputs: 73 | 74 | ```shell 75 | diff --git a/main.py b/main.py 76 | index 38056d8..431d04e 100644 77 | --- a/main.py 78 | +++ b/main.py 79 | @@ -6,5 +6,9 @@ main module 80 | def do_something(): 81 | pass 82 | 83 | +def do_something_else(): 84 | + pass 85 | + 86 | if __name__ == '__main__': 87 | do_something() 88 | + do_something_else() 89 | 90 | ``` 91 | 92 | This output is designed to be fairly intuitive. The `+` at the start of a line indicates that this is new code in the second commit. This is incredibly helpful when you need to understand what has changed and how this might affect an analysis (or introduce bugs). If we had removed a line of code the it would have been prefixed with a `-`. 93 | 94 | > Side note: commits become harder to followed the more changes are included. So try to avoid huge commits where many many files and lines of code have been changed. Commit often and thoughtfully. 95 | 96 | Now imagine a scenario where you arrive at work the next day, re-run your analysis code and realise that you have made a mistake in the modified `main.py`: do_something_else() is not needed and in fact the new code has broken the original analysis. 
You need to roll back to the first iteration of the code that you know works. This is called a **rollback**. After the rollback the git log looks like: 97 | 98 | ```bash 99 | commit 2e09f233e392448fdcf82b3c8ed45cd8a72c3e0e (HEAD -> master) 100 | Author: Tom Monks 101 | Date: Mon Aug 2 16:11:58 2021 +0100 102 | ``` 103 | Referring back to our previous log we can see that the `HEAD` is now the original commit. Indeed `main.py` has reverted to: 104 | 105 | ```python 106 | ''' 107 | main module 108 | 109 | ''' 110 | 111 | def do_something(): 112 | pass 113 | 114 | if __name__ = '__main__': 115 | do_something() 116 | ``` 117 | 118 | > There are various ways to rollback. In this case I've demoed what is called a a **hard** reset. In practice, for a bug, you may want to do a **soft** reset or even safer a **revert + restore**. 119 | 120 | ## Distributed? 121 | 122 | A defining feature of Git is that it is **distributed**. This means that each git **repository** is a complete history of a project and that multiple users are required to merge their changes together. 123 | 124 | ## Other Git resources 125 | 126 | In addition to the resources in this book I very much recommend exploring the Git material provided by [Software Carpentry](https://swcarpentry.github.io/git-novice/). This is wonderful novice friendly material that is open and free to use. There's also the main [git website](https://git-scm.com/) which includes a free copy of an excellent git book. 127 | 128 | 129 | -------------------------------------------------------------------------------- /content/03_mgt/01_git/04_cs_2.md: -------------------------------------------------------------------------------- 1 | # Case study: roll back and bug fix 2 | 3 | This case study assumes you have worked through case study 1 and have a the following project that is a git repo. 
4 | 5 | ```bash 6 | analysis_code 7 | ├── main.py 8 | ├── readme.md 9 | ├── run_1.log 10 | └── run_2.log 11 | ``` 12 | 13 | Issuing the `git log --oneline` command you should get the following history. 14 | 15 | > Remember that your commit hash values will be unique to your repo. 16 | 17 | ```bash 18 | 6475df6 DOCS: run instructions 19 | 186d91b SETUP: .gitignore + *.log 20 | d404939 INIT: add main.py 21 | ``` 22 | 23 | ## Scenario 24 | 25 | You have written an analysis program. This code executes each night on critical patient level data. At a recent meeting a senior researcher requested a new subgroup analysis in the code. You know how to do this and quickly make the change to the data. Unfortunately your change introduces an unexpected bug into the main analysis. Given the importance of the code, it is necessary to roll back to the previous version that works with no problems while a fix is found. 26 | 27 | To simulate this, you are going to commit a change to an existing code base that results in a bug. We will explore using git to undo the change by rolling back to a previous commit and then fixing the bug. 28 | 29 | ## Step 1: Modify and commit changes to`main.py` 30 | 31 | Let's make a change to the `main.py` and commit it to the repo. The twist is that our new modifications are going to contain a hidden and sneaky bug! 32 | 33 | ```{admonition} This would never happen in real life 34 | You may be reading this and thinking - I will never commit a bug to the main branch of my git repo because all of my code is tested and triple checked beforehand. If so good for you. It never happens to me either... ahem. 
35 | ``` 36 | 37 | ```python 38 | ''' 39 | main module 40 | 41 | ''' 42 | 43 | def do_something(): 44 | print('Friendly code') 45 | 46 | def do_something_else(): 47 | print('This is a major bug!') 48 | 49 | if __name__ == '__main__': 50 | do_something() 51 | do_something_else() 52 | 53 | ``` 54 | 55 | The git command to commit is 56 | 57 | ```bash 58 | git status 59 | git add main.py 60 | git status 61 | git commit -m "MAIN:+do_something_else() extends analysis" 62 | git log -2 --oneline 63 | ``` 64 | 65 | > Note that I checked the status of my repo before and after staging a file. 66 | 67 | The final git command requests a one line summary of the last two commits: 68 | 69 | ```bash 70 | e1c4fd3 MAIN:+do_something_else() extends analysis 71 | 6475df6 DOCS: run instructions 72 | ``` 73 | ## Step 2: Finding the bug 74 | 75 | When you run the analysis code: 76 | 77 | > This assumes you are in the `analysis_code` directory that contains `main.py` 78 | 79 | ```bash 80 | python3 main.py 81 | ``` 82 | You are shocked to receive the following output: 83 | 84 | ```shell 85 | Friendly code 86 | This is a major bug! 87 | ``` 88 | This is a major problem for your project. This code runs every night and needs to be rolled back to a previous working version. Luckily this is simple because it is only 1 commit previous. 89 | 90 | ## Step 3: Reviewing changes 91 | 92 | Before we undo the last commit to the repo let's have quick look at what changes were actually made to `main.py`. We can do that by using commit id or by using `HEAD~1` where `~1` refers to 1 commit previous. 93 | 94 | ```bash 95 | git diff HEAD~1 main.py 96 | ``` 97 | In English, this asks git for the difference in main.py one commit previous. 
This results in: 98 | 99 | ```shell 100 | diff --git a/main.py b/main.py 101 | index 38056d8..d1c50df 100644 102 | --- a/main.py 103 | +++ b/main.py 104 | @@ -4,7 +4,11 @@ main module 105 | 106 | 107 | def do_something(): 108 | - pass 109 | + print('Friendly code') 110 | 111 | -if __name__ = '__main__': 112 | +def do_something_else(): 113 | + print('This is a major bug!') 114 | + 115 | +if __name__ == '__main__': 116 | do_something() 117 | + do_something_else() 118 | 119 | ``` 120 | 121 | ## Step 3: Rolling back 122 | 123 | In my view "undo" operations in git can be some of the most confusing because there is more than one way to do it. 124 | 125 | ### `git revert` 126 | 127 | Here we will take a safe option and **git revert** a commit. This command creates a **new commit** and reverses changes made in a previous commit. It is safe because **you don't lose any history**. The old buggy commit remains and you can access the code within it. 128 | 129 | The commit we want to revert is the last one. To be clear its is the commit that contains the code that introduced the bug. We first look up its commit hash: 130 | 131 | ```bash 132 | git log -2 --oneline 133 | ``` 134 | 135 | ```bash 136 | e1c4fd3 (HEAD -> master) MAIN:+do_something_else() extends analysis 137 | 6475df6 DOCS: run instructions 138 | ``` 139 | 140 | and then issue the revert command referencing e1c4fd3 141 | 142 | > a reminder again that this is the commit that introduced the bug! 143 | 144 | ```bash 145 | git revert e1c4fd3 146 | ``` 147 | 148 | When you do this you will be prompted to add a commit message. One is provided for you by git. I'm just going to accept it as is. 149 | 150 | ```nano 151 | Revert "MAIN:+do_something_else() extends analysis" 152 | 153 | This reverts commit e1c4fd3ce836f6fe1f7df3a6d1fb805209a790d8. 154 | ``` 155 | 156 | After reverting git we can check `main.py` and find that it has returned to a bug free state! 
157 | 158 | ```python 159 | ''' 160 | main module 161 | 162 | ''' 163 | 164 | def do_something(): 165 | pass 166 | 167 | if __name__ = '__main__': 168 | do_something() 169 | ``` 170 | 171 | We can also confirm that our history is intact and `git revert` has created a new commit by `git log -3 --oneline` 172 | 173 | ```bash 174 | 73fcaa5 (HEAD -> master) Revert "MAIN:+do_something_else() extends analysis" 175 | e1c4fd3 MAIN:+do_something_else() extends analysis 176 | 6475df6 DOCS: run instructions 177 | 178 | ``` 179 | 180 | 181 | 182 | 183 | -------------------------------------------------------------------------------- /content/03_mgt/01_git/05_cs_3.md: -------------------------------------------------------------------------------- 1 | # Case study: Branching and merging 2 | 3 | > This case study assumes you have worked through case studies 1 and 2. 4 | 5 | A key feature of version control is the ability to create a **branch** seperate from the main code that are used for development. Once the code within a branch is complete it is **merged** into the repo's master branch. 6 | 7 | ```{admonition} You should always use branches even when working alone. 8 | Strictly speaking it is best practice to use a branch to create new features and conduct testing. We have ignored branching so far to keep our introductory material as simple as possible. Ideally you always have a clean (and as best you can bug free) main (master) branch that you or your users/colleagues use for production. Obviously if you are working on your own then simplicity is key. A minimum safe option is to create a `dev` branch where you work towards a new version of your code (perhaps a group of new features) and then merge when everything is complete and tested. 9 | ``` 10 | 11 | ## Scenario 12 | Now that you reverted the master branch to the last working version the code will run correctly every night. In parrallel you will create a new branch and fix the bug in the modified code. 
Once you are satisfied it works
58 | 59 | ```bash 60 | $ git restore --source e1c4fd3 main.py 61 | $ git add main.py 62 | $ git commit -m "FIX: restore main.py to e1c4fd3" 63 | ``` 64 | The file `main.py`, including the bug, has now been restored, but it only in the `dev` branch. 65 | 66 | ```python 67 | ''' 68 | main module 69 | 70 | ''' 71 | 72 | def do_something(): 73 | print('Friendly code') 74 | 75 | def do_something_else(): 76 | print('This is a major bug!') 77 | 78 | if __name__ == '__main__': 79 | do_something() 80 | do_something_else() 81 | ``` 82 | 83 | ## Step 3: Fix the bug 84 | 85 | The fixed `main.py` now looks like 86 | 87 | ```python 88 | ''' 89 | main module 90 | 91 | Fix issued by TM. 92 | ''' 93 | 94 | def do_something(): 95 | print('Friendly code') 96 | 97 | def do_something_else(): 98 | print('Expected value') 99 | 100 | if __name__ == '__main__': 101 | do_something() 102 | do_something_else() 103 | ``` 104 | 105 | It's the end of the working day. You haven't had time to test your fix, so you won't risk merging it with master. But you stage and commit it to `dev` before you leave. 106 | 107 | ```bash 108 | $ git add main.py 109 | $ git commit -m "FIX: do_something_else() patched." 110 | ``` 111 | 112 | You can also quickly switch back to master (`git switch master`) we can confirm the code in `main.py` that will run overnight is the original version. 113 | 114 | ## Step 4: Test and merge the code. 115 | 116 | The next day you have plenty of time to test the new code works. Switching to the `dev` branch we can run 117 | 118 | ```bash 119 | $ python3 main.py 120 | 121 | Friendly code 122 | Expected value 123 | ``` 124 | This confirms everything has worked as expected so we can **merge** the new version of the code into master. 125 | 126 | To complete the merge we need to switch to the `master` branch and use the `merge` command. 
127 | 128 | ```bash 129 | $ git switch master 130 | $ git merge dev 131 | 132 | Updating 73fcaa5..6fcbf0d 133 | Fast-forward 134 | main.py | 10 ++++++++-- 135 | 1 file changed, 8 insertions(+), 2 deletions(-) 136 | ``` 137 | 138 | Just to prove to ourselves that the merge has worked we can check the log. 139 | 140 | ```bash 141 | $ git log -4 --oneline 142 | 143 | 6fcbf0d (HEAD -> master, dev) FIX: do_something_else() patched. 144 | 2e9cfcd FIX: restore main.py to e1c4fd3 145 | 73fcaa5 Revert "MAIN:+do_something_else() extends analysis" 146 | e1c4fd3 MAIN:+do_something_else() extends analysis 147 | ``` 148 | and that's it you now have updated your code base. 149 | -------------------------------------------------------------------------------- /content/03_mgt/02_packaging/my_package_name/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = '0.1.0' 2 | __author__ = 'Tom Monks' -------------------------------------------------------------------------------- /content/03_mgt/02_packaging/my_package_name/datasets.py: -------------------------------------------------------------------------------- 1 | ''' 2 | The datasets module. 
3 | 4 | Dummy module for illustration 5 | ''' -------------------------------------------------------------------------------- /content/03_mgt/02_packaging/my_package_name/package_data/example_datset_1.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/content/03_mgt/02_packaging/my_package_name/package_data/example_datset_1.csv -------------------------------------------------------------------------------- /content/03_mgt/02_packaging/my_package_name/package_data/example_datset_2.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/content/03_mgt/02_packaging/my_package_name/package_data/example_datset_2.csv -------------------------------------------------------------------------------- /content/03_mgt/02_packaging/my_package_name/plotting.py: -------------------------------------------------------------------------------- 1 | ''' 2 | The plotting module. 
3 | 4 | Dummy module for illustration 5 | ''' 6 | 7 | def plt_diagostics(): 8 | '''dummy function just for illustration''' 9 | pass -------------------------------------------------------------------------------- /content/03_mgt/02_packaging/ts_emergency/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = '0.1.0' 2 | __author__ = 'Tom Monks' -------------------------------------------------------------------------------- /content/03_mgt/02_packaging/ts_emergency/data/.~lock.ts_ed.csv#: -------------------------------------------------------------------------------- 1 | ,tom,pop-os.localdomain,16.07.2021 15:37,file:///home/tom/.config/libreoffice/4; -------------------------------------------------------------------------------- /content/03_mgt/02_packaging/ts_emergency/datasets.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Functions to load built in datasets for ts_emergency. 3 | Datasets are downloaded from an external github repo. 4 | 5 | The key loading function is load_ed_ts 6 | ''' 7 | 8 | import pandas as pd 9 | import numpy as np 10 | 11 | LONG_URL = 'https://raw.githubusercontent.com/health-data-science-OR/' \ 12 | + 'hpdm139-datasets/main/syn_ts_ed_long.csv' 13 | 14 | WIDE_URL = 'https://raw.githubusercontent.com/health-data-science-OR/' \ 15 | + 'hpdm139-datasets/main/syn_ts_ed_wide.csv' 16 | 17 | def load_ed_ts(data_format='wide', as_pandas=True): 18 | ''' 19 | Load the built-in ED dataset 20 | 21 | Params: 22 | ------ 23 | data_format: str 24 | 'Wide' or 'long' format. Wide format provides hospital columns. 25 | Long format provides a categorical hospital column and single attends 26 | column. 27 | 28 | as_pandas: bool, optional (default = True) 29 | Return as `pandas.Dataframe`. 
If False then `numpy.ndarray` 30 | 31 | Returns: 32 | ------- 33 | pandas.Dataframe or if `as_pandas=False` then returns `numpy.ndarray` 34 | 35 | ''' 36 | valid_formats = ['wide', 'w', 'long', 'l'] 37 | 38 | if data_format.lower() not in valid_formats: 39 | raise ArgumentError(f'data format should be one of {valid_formats}') 40 | 41 | if data_format == 'wide' or data_format == 'w': 42 | df = _ed_data_to_wide(LONG_URL) 43 | else: 44 | df = _ed_data_to_long(WIDE_URL) 45 | 46 | if as_pandas: 47 | return df 48 | else: 49 | return df.to_numpy() 50 | 51 | 52 | 53 | def _ed_data_to_wide(file_path): 54 | ''' 55 | Return the ED data in wide format. 56 | 57 | 1. Pivot table 58 | 2. Transpose and drop the ('attends', hosp_i) multi-index 59 | 3. Rename columns [0, 1, 2, 4] tp ['hosp_1', 'hosp_2', 'hosp_3', 'hosp_4'] 60 | 4. Index to DateTimeIndex 61 | 5. Drop the additional uneeded series 'date' (as stored in index as well) 62 | 6. Convert attendence numbers from int64 to int16 63 | 64 | Params: 65 | ------ 66 | file_path: str 67 | Path to wide format file 68 | 69 | Returns: 70 | ------- 71 | pandas.DataFrame 72 | ''' 73 | # column name transfers 74 | translated_names = {0:'hosp_1', 75 | 1:'hosp_2', 76 | 2:'hosp_3', 77 | 3:'hosp_4'} 78 | 79 | data_types = {'hosp_1':np.int16, 80 | 'hosp_2':np.int16, 81 | 'hosp_3':np.int16, 82 | 'hosp_4':np.int16} 83 | 84 | df = (pd.read_csv(file_path) 85 | .pivot_table(values=['attends'], index=['date'], columns=['hosp']) 86 | .T.reset_index(drop=True) 87 | .T.rename(columns=translated_names) 88 | .assign(date=lambda x: pd.to_datetime(x.index)) 89 | .set_index('date') 90 | .astype(data_types) 91 | ) 92 | 93 | return df 94 | 95 | 96 | 97 | def _ed_data_to_long(file_path): 98 | ''' 99 | Return the ED data in long format. Uses pd.wide_to_long() 100 | Assume wide format file is used. 101 | 102 | 1. pd.wide_to_long() 103 | 2. reset_index() to remove multi-index 104 | 3. rename col 'hosp_' to 'attends' 105 | 4. date to datetime 106 | 5. 
Convert attendence numbers from int64 to int16 amd hosp_id to int8. 107 | (could also be a categorical field.) 108 | 109 | Params: 110 | ------ 111 | file_path: str 112 | Path to wide format file 113 | 114 | Returns: 115 | ------- 116 | pandas.DataFrame 117 | ''' 118 | 119 | translated_names = {'hosp_':'attends'} 120 | data_types = {'hosp': np.int8, 'attends':np.int16} 121 | 122 | long_df = ( 123 | pd.wide_to_long(pd.read_csv(file_path), stubnames='hosp_', 124 | i=['date'], j='hosp') 125 | .reset_index() 126 | .rename(columns=translated_names) 127 | .assign(date=lambda x: pd.to_datetime(x['date'])) 128 | .astype(data_types) 129 | ) 130 | 131 | return long_df 132 | -------------------------------------------------------------------------------- /content/03_mgt/02_packaging/ts_emergency/plotting.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | 3 | DEFAULT_LABEL_FS = 12 4 | DEFAULT_AXIS_FS = 12 5 | DEFAULT_FIGSIZE = (12,8) 6 | 7 | def plot_single_ed(wide_df, hosp_id, figsize=(12,3), 8 | fontsize=DEFAULT_LABEL_FS, line_width=2): 9 | ''' 10 | Plot a single ED's data 11 | Assumes data are passed in wide format. 12 | 13 | Params: 14 | ------- 15 | wide_df: pandas.Dataframe 16 | ED time series data in wide format 17 | 18 | hosp_id: str 19 | name of hospital column to plot e.g. 
'hosp_1' 20 | 21 | figsize: tuple(int, int), optional (default=(12,3)) 22 | `matplotlib` figure size 23 | 24 | fontsize: int, optional (default=DEFAULT_LABEL_FS) 25 | Size of label font 26 | 27 | line_width: int 28 | Width of the line plot 29 | 30 | Returns: 31 | ------- 32 | matplotlib fig, ax 33 | 34 | ''' 35 | fig = plt.figure(figsize=figsize) 36 | ax = fig.add_subplot() 37 | ax.set_xlabel("Date", fontsize=fontsize) 38 | ax.set_ylabel("Attendances", fontsize=fontsize) 39 | 40 | _ = ax.plot(wide_df[hosp_id], lw=line_width) 41 | # include x, y grid 42 | _ = ax.grid(ls='--') 43 | 44 | # set size of x, y ticks 45 | _ = ax.tick_params(axis='both', labelsize=fontsize) 46 | 47 | # return the figure 48 | return fig, ax 49 | 50 | 51 | def plot_eds(wide_df, figsize=DEFAULT_FIGSIZE, label_font_size=DEFAULT_LABEL_FS, 52 | axis_font_size=DEFAULT_AXIS_FS): 53 | ''' 54 | Plot all ED's attendances in a 1x4 grid layout. 55 | 56 | Params: 57 | ------ 58 | wide_df: pandas.Dataframe 59 | ED time series data in wide format 60 | 61 | figsize: tuple(int, int), optional (default=(12,3)) 62 | `matplotlib` figure size 63 | 64 | label_font_size: int, optional (default=DEFAULT_LABEL_FS) 65 | Size of label font 66 | 67 | axis_font_size: int, optional (default=DEFAULT_AXIS_FS) 68 | Size of axis tick font 69 | 70 | Returns: 71 | -------- 72 | matplotlib fig 73 | ''' 74 | 75 | fig, axs = plt.subplots(nrows=4, ncols=1, tight_layout=True, figsize=(12,8), 76 | sharex=True) 77 | 78 | # note that axs is a 2D array 79 | for hosp_idx in range(0, 4): 80 | _ = axs[hosp_idx].plot(wide_df[f'hosp_{hosp_idx+1}']) 81 | _ = axs[hosp_idx].set_title(f'Hospital {hosp_idx+1}', 82 | fontsize=label_font_size) 83 | _ = axs[hosp_idx].grid(ls='--') 84 | 85 | # axis labels matplotlib >=3.4 86 | AXIS_LABEL_SIZE = 12 87 | _ = fig.supylabel('ED Attendances', fontsize=axis_font_size) 88 | _ = fig.supxlabel('Date', fontsize=axis_font_size) 89 | 90 | return fig 
-------------------------------------------------------------------------------- /content/03_mgt/03_mgt_front_page.md: -------------------------------------------------------------------------------- 1 | # Deployment 2 | 3 | So you now know how to code in Python and a few of its data science and machine learning extensions. Congratualations! The truth is, however, that this is only the beginning of your journey in health data science. When you start your first job, perhaps in genomics or building machine learning models for a big company, you need to know how to manage and deploy code. 4 | 5 | In the final part of the book we will look at: 6 | 7 | 1. Local and remote version control 8 | 2. Setting up and orgaising local python packages 9 | 3. Deploying python packages on the Python Package Index (PyPi) 10 | 11 | 12 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /content/03_mgt/03_pypi/02_github.md: -------------------------------------------------------------------------------- 1 | # Install from GitHub 2 | 3 | The steps we followed in the [last section](./01_local.md) have led to an installable package. So far we have installed the package from a local repository. To make installation even easier for our users we will now set things up so they can install the package on GitHub. This is a good option for early development of the package. You can have an remotely installable package, without the need to make things more complicated by using PyPI. 4 | 5 | As a reminder we have the following basic package structure: 6 | 7 | ``` 8 | analysis-package 9 | ├── analysis_package 10 | │ ├── __init__.py 11 | │ ├── model.py 12 | │ ├── data 13 | │ | ├── model_data.csv 14 | ├── tests 15 | │ ├── test_model.py 16 | ├── LICENSE 17 | ├── environment.yml 18 | ├── README.md 19 | └── pyproject.toml 20 | ``` 21 | 22 | > As a reminder that `pyproject.toml` is the key to allowing our package to be installed via `pip`. 
23 | 24 | To make this package installable on GitHub we need to create a GitHub repository and **push** our repository to it. The example repository for `package-template` is available [here](https://github.com/health-data-science-OR/analysis-package). 25 | 26 | To install from GitHub we need to activate the Python environment that we wish to install into e.g. 27 | 28 | ```bash 29 | conda activate hds_code 30 | ``` 31 | 32 | and then we issue the following command: 33 | 34 | ```bash 35 | pip install git+https://github.com/health-data-science-OR/analysis-package@main 36 | ``` 37 | 38 | The below is an exert from the output generated by the modified pip install. It reveals how the process differs from the local install! In summary, the **main branch** of the repository is **cloned** to your local machine (stored in a temporary directory). Once the repository has been downloaded the normal pip install process proceeds along with dependency installation. 39 | 40 | ```bash 41 | Collecting git+https://github.com/health-data-science-OR/package-template@main 42 | Cloning https://github.com/health-data-science-OR/package-template (to revision main) to /tmp/pip-req-build-raw3ilfx 43 | Running command git clone --filter=blob:none --quiet https://github.com/health-data-science-OR/package-template /tmp/pip-req-build-raw3ilfx 44 | Resolved https://github.com/health-data-science-OR/package-template to commit cc91d307285b9f10f9cab8cc8290525d84637352 45 | Installing build dependencies ... done 46 | Getting requirements to build wheel ... done 47 | Preparing metadata (pyproject.toml) ... 
done 48 | Collecting matplotlib>=3.1.3 (from analysis_package==0.1.0) 49 | ``` 50 | 51 | In general to use GitHub for installations we issue a modification of the following command: 52 | 53 | ```bash 54 | pip install git+https://github.com/user/repo.git@branch_or_tag 55 | ``` 56 | 57 | 58 | 59 | 60 | 61 | -------------------------------------------------------------------------------- /content/03_mgt/03_pypi/03_pypi.md: -------------------------------------------------------------------------------- 1 | # Publish a package on PyPI 2 | 3 | The first thing to say is that **there is a PyPI test site!** It is called [TestPyPI](https://testpypi.python.org) and is incredibly helpful and I recommend you make use of it while learning instead of the main PyPI site. I found that I made several mistakes the first time I attempted publication (and still do with new packages!). The great thing about TestPyPI is that once your package is published you can install it, just like you would if it were on the production PyPI site. 4 | 5 | ```{admonition} pyOpenSci.org: An (excellent) alternative guide to PyPI 6 | :class: information, dropdown 7 | While I was updating version 3 of this book I came across an excellent easy to follow guide to python package publishing on PyPI [available on pyopensci.org](https://www.pyopensci.org/python-package-guide/tutorials/publish-pypi.html). Its another good source of information if you want a different perspective. 8 | ``` 9 | ## IMPORTANT: Use a unique package name 10 | 11 | Before you proceed any further I recommend visiting PyPI and TestPyPI searching for your package name. You might find a package with an identical name. If an identically named package exists then you need to **rename your package, before attempting any of the steps outlined in this section**. 12 | 13 | In the past I have also regretted using a similar name to existing packages. Don't make the same mistakes as me! 
Keep things simple for yourself: make your package name distinctive. 14 | 15 | If you do rename your package then make sure you must update the following: 16 | 17 | 1. The `pyproject.toml` meta-data 18 | 2. The package directory 19 | 3. The GitHub repository 20 | 21 | ## Setting up TestPyPI 22 | 23 | ### Get a TestPyPI account 24 | 25 | You need to go to https://test.pypi.org and create an account. You should be greeted by a webpage similar to the below. Note that the banner that is making clear you are on the test site. As part of the account creation process you will be required to setup two factor authentication. 26 | 27 | ![testpypi](../../../images/testpypi.png) 28 | 29 | ### API Tokens 30 | 31 | Rather than use your username and password to upload to TestPyPI you need to use an **API token**. Tokens come in two levels of scope: 32 | 33 | * **account wide**: an API token scoped to your entire account will have upload permissions for all of your current and future projects. 34 | * **project specific**: this is self explanatory an API token that allows uploads of a specific package. 35 | 36 | There's a catch to this framework! You can only create a project specific token for an existing project. This means that for a new package you need to use an account wide token. Once the package is uploaded you can then create the project specific token. 37 | 38 | To create a token head to account settings and select **Create API token**. You should be presented with a page similar to the below. I've selected account wide token and chosen the name "new_package_uploads" so that its use is clear. Click on **Create Token** 39 | 40 | ![testpypi](../../../images/test_pypi2.png) 41 | 42 | You will then be shown the generated token. **IMPORTANT** - you need to save this token to a very safe place. You won't be shown it again and you don't want to share it with others as it can access all projects in your account. 
The token will take the following form: `pypi-[random string]` 43 | 44 | ### Using hatch to publish to and TestPyPI 45 | 46 | To publish on PyPI you need to upload a source tarball and wheel distribution. If you need a reminder of what a wheel is head over to the [introduction to installable packages](./01_local.md). To generate these files issue the following command in the top level of repo directory: 47 | 48 | ```bash 49 | hatch build 50 | ``` 51 | 52 | This will create a new directory `dist/` containing the source and wheel files. 53 | 54 | ```bash 55 | ├── dist 56 | │   ├── analysis_package-0.1.0-py3-none-any.whl 57 | │   └── analysis_package-0.1.0.tar.gz 58 | ``` 59 | 60 | You are not ready to upload! Have your account wide API token to hand. To publish to `TestPyPI` we simply run 61 | 62 | ```bash 63 | hatch publish -r test 64 | ``` 65 | 66 | * You will be prompted for a username enter `__token__` 67 | * You will then be prompted for the API token. Paste in your API token (this won't be displayed). 68 | 69 | The package will then be uploaded to TestPyPI and `hatch` will inform you if this has been successful. If it has you will be prompted with a URL to the TestPyPI page for your new package. For example, https://test.pypi.org/project/analysis_package/0.1.0/. Go take a look at your page! 70 | 71 | ### Install your package from TestPyPI 72 | 73 | On your web page will be a special TestPyPI link to install your package. This looks a bit different from production PyPI, but has the same result. For example, for `analysis_package` we install it as follows: 74 | 75 | ```bash 76 | # let's intall into the hds_code env 77 | conda activate hds_code 78 | 79 | # pip install analysis_package from TestPyPI 80 | pip install -i https://test.pypi.org/simple/analysis_package==0.1.0 81 | ``` 82 | 83 | ### Recommended: create a package specific API token. 
84 | 85 | Now that you have created your package, I recommend logging back into your TestPyPI account and creating a package specific API token. It is more secure to work with API tokens that are specific to packages (especially if working in a team or group when developing the work). This avoids accidental uploads to different packages. To do this select **projects**, your project (e.g. `analysis_package`) and then **settings**. 86 | 87 | ![testpypi](../../../images/test_pypi3.png) 88 | 89 | ### Publish on PyPI production 90 | 91 | First I just want to say that you should not publish on the main production PyPI platform unless it is needed. Use PyPI when necessary to help your own research, work or colleagues, but not for testing purposes: use TestPyPI instead. **You will need a separate account for PyPI.**. If you intend to publish to PyPI then you need to follow all of the same steps we used for setting up TestPyPI. 92 | 93 | When you are ready to upload there is a different `hatch` command to publish: 94 | 95 | ```bash 96 | hatch publish 97 | ``` -------------------------------------------------------------------------------- /content/03_mgt/03_pypi/04_automation.md: -------------------------------------------------------------------------------- 1 | # Automation 2 | 3 | The manual upload steps I've outlined in the [PyPI section](./03_pypi.md) are somewhat historical. We know that most modern projects make use of version control in the cloud such as GitLab or GitHub. These tools include ways to automatically publish updates to PyPI. 4 | 5 | ## GitHub Actions 6 | 7 | One option to automate publication of updates to a PyPI package is a GitHub action. An action can be described as a job or a workflow that runs when certain events are triggered. For example, when code is pushed to a remote repository or when a new release is created. To be clear 8 | actions aren't part of the package - they are instead a tools for continuous integration of code. 
They help the package managers do repetitive tasks needed for maintenance and publishing efficiently and consistently. 9 | 10 | Actions are specified in YAML (Yet Another Markup Language). And are actually quite straightforward to read. GitHub has a large number of templates available you can use and adapt. 11 | 12 | You can read more about GitHub actions [here](https://docs.github.com/en/actions) 13 | 14 | ## Automating package publication to PyPI 15 | 16 | The YAML below is an action that is used to automate the updating of a package on PyPI. It is triggered when a new **release** of the code is made on the main branch. 17 | 18 | ```yaml 19 | name: Upload Python Package 20 | 21 | on: 22 | release: 23 | types: [published] 24 | 25 | permissions: 26 | contents: read 27 | 28 | jobs: 29 | deploy: 30 | 31 | runs-on: ubuntu-latest 32 | 33 | steps: 34 | - uses: actions/checkout@v3 35 | - name: Set up Python 36 | uses: actions/setup-python@v3 37 | with: 38 | python-version: '3.x' 39 | - name: Install dependencies 40 | run: | 41 | python -m pip install --upgrade pip 42 | pip install hatch 43 | - name: Build package 44 | run: hatch build 45 | - name: Publish package 46 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 47 | with: 48 | user: __token__ 49 | password: ${{ secrets.PYPI_TOKEN }} 50 | ``` 51 | 52 | To set this up you will need to supply GitHub with the API token for your package. Its stored securely in GitHub in what is called a **Secret**. In the YAML above you see the final line uses `${{ secrets.PYPI_TOKEN }}`. This means I have named my secret that stores the API project token as `PYPI_TOKEN`. It is essential to create and use a package specific API token for PyPI (or TestPyPI). Do not use an account wide token. 53 | 54 | This action runs on a new release of the code. A release is version of the package that follows the {major}.{minor}.{patch} (e.g. 
v1.1.2) naming convention we introduced when first learning how to [structure a local python package](./01_local.md). For simplicity I recommend ensuring release numbering matching the package version you have in `__init__.py`. For any package on GitHub you can see the current version on the landing page. For example, for a package I am developing called `sim-tools` you can see the current version highlighted in the screenshot below. 55 | 56 | ![release](../../../images/release.png) 57 | 58 | To create a new release is simple. Click on the **Releases** link highlighted above followed by **Draft new release**. You can read more about releases [here](https://docs.github.com/en/repositories/releasing-projects-on-github/managing-releases-in-a-repository) 59 | 60 | ## Where are actions stored? 61 | 62 | When we have added added an action to GitHub our repo looks slightly different. We now have a new directory called `.github` that contains the YAML file describing the action. 63 | 64 | ``` 65 | analysis-package 66 | ├── .github 67 | │ ├── workflows 68 | │ | ├── publish_package.yml 69 | ├── analysis_package 70 | │ ├── __init__.py 71 | │ ├── model.py 72 | │ ├── data 73 | │ | ├── model_data.csv 74 | ├── tests 75 | │ ├── test_model.py 76 | ├── LICENSE 77 | ├── environment.yml 78 | ├── README.md 79 | └── pyproject.toml 80 | ``` 81 | 82 | -------------------------------------------------------------------------------- /content/03_mgt/03_pypi/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Tom Monks 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 
10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /content/03_mgt/03_pypi/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.txt 2 | recursive-include test_package/data *.csv 3 | -------------------------------------------------------------------------------- /content/03_mgt/03_pypi/environment.yml: -------------------------------------------------------------------------------- 1 | name: pypi_package_dev 2 | channels: 3 | - defaults 4 | dependencies: 5 | - jupyterlab=1.2.6 6 | - matplotlib=3.1.3 7 | - numpy=1.18.1 8 | - pandas=1.0.1 9 | - pip=20.0.2 10 | - pytest=5.3.5 11 | - python=3.8.1 12 | - scipy=1.4.1 13 | - seaborn=0.10.0 14 | - pip: 15 | - pytest-cov==2.10.0 16 | - setuptools>=51.1.2 17 | - twine>=3.3.0 18 | - wheel>0.36.2 19 | -------------------------------------------------------------------------------- /content/03_mgt/03_pypi/requirements.txt: -------------------------------------------------------------------------------- 1 | matplotlib>=3.1.3 2 | numpy>=1.18.1 3 | pandas>=1.0.1 4 | scipy>=1.4.1 5 | seaborn>=0.10.0 6 | -------------------------------------------------------------------------------- /content/03_mgt/03_pypi/setup.py: 
-------------------------------------------------------------------------------- 1 | import setuptools 2 | from test_package import __version__ 3 | 4 | # Read in the requirements.txt file 5 | with open("requirements.txt") as f: 6 | requirements = [] 7 | for library in f.read().splitlines(): 8 | requirements.append(library) 9 | 10 | with open("README.md", "r") as fh: 11 | long_description = fh.read() 12 | 13 | setuptools.setup( 14 | name="pypi-template_2222", 15 | #there must be an way to auto tick up the version number... 16 | version=__version__, 17 | author="Thomas Monks", 18 | #I've created a specific email account before and forwarded to my own. 19 | author_email="generic@genericemail.com", 20 | license="The MIT License (MIT)", 21 | description="A short, but useful description to appear on pypi", 22 | #read in from readme.md and will appear on PyPi 23 | long_description=long_description, 24 | long_description_content_type="text/markdown", 25 | url="https://github.com/TomMonks/pypi-template", 26 | packages=setuptools.find_packages(), 27 | #if true look in MANIFEST.in for data files to include 28 | include_package_data=True, 29 | #2nd approach to include data is include_package_data=False 30 | package_data={"test_package": ["data/*.csv"]}, 31 | #these are for documentation 32 | classifiers=[ 33 | "Programming Language :: Python :: 3.6", 34 | "Programming Language :: Python :: 3.7", 35 | "Programming Language :: Python :: 3.8", 36 | "License :: OSI Approved :: MIT License", 37 | "Operating System :: OS Independent", 38 | ], 39 | python_requires='>=3.6.9', 40 | install_requires=requirements, 41 | ) 42 | -------------------------------------------------------------------------------- /content/03_mgt/03_pypi/test_package/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | test_package 3 | 4 | Part of a repo containing boilerplate code for publishing 5 | on PyPi. 
6 | 7 | """ 8 | __version__ = '0.1.0' 9 | __author__ = 'Thomas Monks' -------------------------------------------------------------------------------- /content/03_mgt/03_pypi/test_package/data/test_data.csv: -------------------------------------------------------------------------------- 1 | "10", "20", "30" 2 | "40", "50", "60" -------------------------------------------------------------------------------- /content/03_mgt/03_pypi/test_package/test.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/content/03_mgt/03_pypi/test_package/test.py -------------------------------------------------------------------------------- /content/03_mgt/03_vc_front_page.md: -------------------------------------------------------------------------------- 1 | # Version control 2 | 3 | So you now know how to code in Python and a few of its data science and machine learning extensions. Congratualations! The truth is, however, that this is only the beginning of your journey in health data science. When you start your first job, perhaps in genomics or building machine learning models for a big company, you need to know how to manage and deploy code. 4 | 5 | In the first part of this topic we will look at version control for source code. 6 | 7 | ## The case for version control 8 | 9 | ```{admonition} "Wait.. you DON'T use version control for your code?!" 10 | I'm going to be very honest and say that I find it odd that I still need to have a conversation with people who code about basic version control. I can't really understand why it isn't taught routinely and second nature to everyone. Its actually best not to admit you don't use it when I'm in the room. 11 | 12 | I'm old, and in my professional career, I can't remember a time I didn't use version control software. 
I think its so important for data science that I'm going to admit a few embarrassing (in a geeky sort of way) things about my history with version control. My first admission is that I first ventured into the world of version control in the summer of 2001 for a summer VB6(!) coding job. At the time I was doing my undergrad in Computer Science and let me tell you - I made a lot of mistakes in my coding! My second admission is that I used to be an avid MS Windows user (so sorry Stallman) and from around 2003-2008 I used TortoiseSVN. This is a GUI extension of the excellent subversion software for version control. I switched to equally excellent Git after I returned to university to study for a PhD (introduced courtesy of the Warwick Computing Society) where I used it to control R, S-PLUS(!), and C# code. It was not until June 28th 2011 that I pushed my first commit to the now famous GitHub (some dodgy C# code to automate a commercial simulation package via the Windows Common Object Model - yuk!). 13 | 14 | You don't need to remember any of that, just take home the message, that I'm pro version control for one single reason. Part of any data science study is carefully controlling and managing your code. If you don't then you will fail to get it producing the same results or perhaps even working again in 6 months time! **You should view your code as a first class citizen in data science. Do your code, yourself and others a favour - use version control.** 15 | ``` 16 | 17 | ## Why use version control? 18 | 19 | ### Scenario 1 20 | 21 | Consider a scenario where you take up a position as a data scientist in a government organisation. On your first day you are told that your predecessor has left already, but all code needed for your job is saved to the server. 
You log in and have a look in the directory: 22 | 23 | ``` 24 | uber_import_gov_proj 25 | ├── 20190320_main_v2.py 26 | ├── archive 27 | │ ├── 20190504_v3_main_not_final.py 28 | │ ├── tests_before_fix.py 29 | │ ├── v1_main.py 30 | │ └── v3_main_final.py 31 | ├── v2_main_20190320.py 32 | ├── v3_main_final.py 33 | ├── v2v3_main_final_TM_MP_MA_DC(MA_conflicted_copy).py 34 | └── v3_main_final_TMonks_Conflicted_Copy.py 35 | ``` 36 | Take a moment to take in the mess of this project. Perhaps you can laugh about it. The questions you should be asking yourself are: 37 | 38 | * have you ever ended up in a mess like this even though you have had the most noble of intentions at the start of a project? 39 | * have you ever worked with someone who has managed work in this way? 40 | 41 | In my experience this sort of structure turns up surprisingly often, for all sorts of data science and non-data science projects. It is certainly more common than a cleanly organised data science project. This is a totally unnecessary situation. With version control we actually only need this structure: 42 | 43 | ``` 44 | uber_import_gov_proj 45 | ├── main.py 46 | ``` 47 | 48 | ### Scenario 2 49 | 50 | Even though the code is a complete mess, you are still working for that government organisation several months later. It is a Monday morning and you stroll into work with the intention of trying again to work out if you should run the analysis code in `v3_main_final.py` or `archive/v3_main_final.py`. But alas your plans are interrupted! Some organisation-critical code originally written years ago, by an analyst long since departed, failed to run over the weekend. It's your job to fix it! You open up the code and after the initial horror of finding it's a single 'god function' with repeating, verbose code, begin to try and make sense of the problem. 
Your initial findings are: 51 | 52 | * It's clear from comments in the code that it has been modified by several people over the years, but it is not clear how many times, who the coders were, what changes were made and in what order. 53 | * There's no 'archive' folder listing older versions of the code and no documentation. So there's no way to roll back changes. 54 | * There's no code to test if the main analysis code runs as expected. 55 | 56 | Before you laugh again this is actually a situation I found myself in many years ago. It wasn't fun (at all - especially as I had lots of people checking if "I'd fixed it yet?" quite frequently). It did turn out that a change had introduced the bug under a given set of conditions. So, after quite a while, I fixed what turned out to be an extremely important piece of code for the organisation. There was no version control system in place so I carefully documented the changes both in the code via comments and in external documentation. 57 | 58 | Can you think of any software that's open source and free that would have made this a bit easier? 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | -------------------------------------------------------------------------------- /content/03_mgt/04_exercises/02_conda.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "ad63482d-4247-4589-a572-68345afa9ae5", 6 | "metadata": {}, 7 | "source": [ 8 | "# Using `conda`\n", 9 | "\n", 10 | "To help you with your deployment either via PyPi, Binder or handing over a local python package it is a good idea to improve your `conda` package manager skills and working with conda virtual environments. \n", 11 | "\n", 12 | "> If you are working on a Windows OS I recommend running these commands from Anaconda prompt. 
If you are working on a Mac or a Linux machine then use a terminal.\n", 13 | "\n", 14 | "## Exercise 1\n", 15 | "\n", 16 | "* List the `conda` environments on your computer.\n", 17 | "\n", 18 | "```\n", 19 | "conda env list\n", 20 | "```\n", 21 | "\n", 22 | "## Exercise 2\n", 23 | "\n", 24 | "* By default you will be in the `base` conda environment. List the packages installed.\n", 25 | "\n", 26 | "```bash\n", 27 | "conda list\n", 28 | "```\n", 29 | "\n", 30 | "## Exercise 3\n", 31 | "\n", 32 | "Let's practice creating an empty environment, activating it, checking it is empty and then remove it.\n", 33 | "\n", 34 | "\n", 35 | "* Create an empty conda environment `empty_env`\n", 36 | "\n", 37 | "```bash\n", 38 | "conda create --name empty_env\n", 39 | "```\n", 40 | "\n", 41 | "> You will be prompted if you want to proceed. Answer Yes!\n", 42 | "\n", 43 | "* Activate the environment\n", 44 | "\n", 45 | "```bash\n", 46 | "$ conda activate empty_env\n", 47 | "```\n", 48 | "\n", 49 | "* List the packages installed\n", 50 | "\n", 51 | "```bash\n", 52 | "conda list\n", 53 | "```\n", 54 | "\n", 55 | "> There should be no packages! If they are then you are probably in the wrong environment. Check this with `conda env list`. 
The active env is marked with `*`\n", 56 | "\n", 57 | "* Deactivate the env to return to `base`\n", 58 | "\n", 59 | "```bash\n", 60 | "conda deactivate\n", 61 | "```\n", 62 | "\n", 63 | "* Remove the environment\n", 64 | "\n", 65 | "```bash\n", 66 | "conda env remove --name empty_env\n", 67 | "```\n", 68 | "\n", 69 | "* Verify the environment is removed using list\n", 70 | "\n", 71 | "```bash\n", 72 | "conda env list\n", 73 | "```\n", 74 | "\n", 75 | "\n", 76 | "## Exercise 4\n", 77 | "\n", 78 | "Now let's create an environment and install a few packages from the command line.\n", 79 | "\n", 80 | "* Create an environment called `test_env` \n", 81 | "* Activate `test_env`\n", 82 | "* Install `python` version 3.8.8 and `numpy` 1.20.3 \n", 83 | "\n", 84 | "```bash\n", 85 | "conda install python=3.8.8 numpy=1.20.3\n", 86 | "```\n", 87 | "\n", 88 | "> Conda will report what dependencies are going to be installed. This might vary depending on what operating system you use. You will also be prompted if you are happy to proceed. It will take a few seconds to install.\n", 89 | "\n", 90 | "\n", 91 | "\n", 92 | "List all the packages installed in `test_env`. Check that the `python` and `numpy` versions match those you used.\n", 93 | "\n", 94 | "\n", 95 | "## Exercise 5\n", 96 | "\n", 97 | "Staying with `test_env `create a `environment.yml` file that contains **only** the packages you installed from the command line.\n", 98 | "\n", 99 | "* Issue the following export command. Make sure you include the `--from-history` option or you will get a full list of everything in the environment. 
The output you should see is displayed below as well.\n", 100 | "\n", 101 | "```bash\n", 102 | "$ conda env export --from-history\n", 103 | "\n", 104 | "name: test_env\n", 105 | "channels:\n", 106 | " - defaults\n", 107 | "dependencies:\n", 108 | " - numpy=1.20.3\n", 109 | " - python=3.8.8\n", 110 | "```\n", 111 | "\n", 112 | "* It is also possible to export this to a named file (typically `environment.yml`)\n", 113 | "\n", 114 | "```bash\n", 115 | "$ conda env export --from-history -f environment.yml\n", 116 | "```\n", 117 | "\n", 118 | "> Remember this will export the file to the current working directory. For simplicity I recommend working in the same directory as your code. This makes even more sense for example if you have a git repo.\n", 119 | "\n", 120 | "## Exercise 6\n", 121 | "\n", 122 | "Now let's practice creating a conda env from file. I recommend working in the same directory as exercise 5.\n", 123 | "\n", 124 | "* Deactivate the `test_env` environment\n", 125 | "* Remove the `test_env` \n", 126 | "* Create the conda environment from file\n", 127 | "\n", 128 | "```bash\n", 129 | "$ conda env create -f environment.yml\n", 130 | "```\n", 131 | "\n", 132 | "> This (re)creates `test_env`.\n", 133 | "\n", 134 | "* Activate `test_env`\n", 135 | "* Check what packages are installed."
136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "id": "35d74941-b5e2-49b9-bced-0e0444026108", 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [] 145 | } 146 | ], 147 | "metadata": { 148 | "kernelspec": { 149 | "display_name": "Python 3 (ipykernel)", 150 | "language": "python", 151 | "name": "python3" 152 | }, 153 | "language_info": { 154 | "codemirror_mode": { 155 | "name": "ipython", 156 | "version": 3 157 | }, 158 | "file_extension": ".py", 159 | "mimetype": "text/x-python", 160 | "name": "python", 161 | "nbconvert_exporter": "python", 162 | "pygments_lexer": "ipython3", 163 | "version": "3.8.8" 164 | } 165 | }, 166 | "nbformat": 4, 167 | "nbformat_minor": 5 168 | } 169 | -------------------------------------------------------------------------------- /content/03_mgt/04_exercises/02_use_conda.md: -------------------------------------------------------------------------------- 1 | # Using `conda` 2 | 3 | To help you with your deployment either via PyPi, Binder or handing over a local python package it is a good idea to improve your `conda` package manager skills and working with conda virtual environments. 4 | 5 | > If you are working on a Windows OS I recommend running these commands from Anaconda prompt. If you are working on a Mac or a Linux machine then use a terminal. 6 | 7 | > For more detail on conda check out the [docs](https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html) 8 | 9 | ## Exercise 1 10 | 11 | * List the `conda` environments on your computer. 12 | 13 | ```bash 14 | $ conda env list 15 | ``` 16 | 17 | ## Exercise 2 18 | 19 | * By default you will be in the `base` conda environment. List the packages installed. 20 | 21 | ```bash 22 | $ conda list 23 | ``` 24 | 25 | ## Exercise 3 26 | 27 | Let's practice creating an empty environment, activating it, checking it is empty and then remove it. 
28 | 29 | 30 | * Create an empty conda environment `empty_env` 31 | 32 | ```bash 33 | $ conda create --name empty_env 34 | ``` 35 | 36 | > You will be prompted if you want to proceed. Answer Yes! 37 | 38 | * Activate the environment 39 | 40 | ```bash 41 | $ conda activate empty_env 42 | ``` 43 | 44 | * List the packages installed 45 | 46 | ```bash 47 | $ conda list 48 | ``` 49 | 50 | > There should be no packages! If there are then you are probably in the wrong environment. Check this with `conda env list`. The active env is marked with `*` 51 | 52 | * Deactivate the env to return to `base` 53 | 54 | ```bash 55 | $ conda deactivate 56 | ``` 57 | 58 | * Remove the environment 59 | 60 | ```bash 61 | $ conda env remove --name empty_env 62 | ``` 63 | 64 | * Verify the environment is removed using list 65 | 66 | ```bash 67 | $ conda env list 68 | ``` 69 | 70 | ## Exercise 4 71 | 72 | Now let's create an environment and install a few packages from the command line. 73 | 74 | * Create an environment called `test_env` 75 | * Activate `test_env` 76 | * Install `python` version 3.8.8 and `numpy` 1.20.3 77 | 78 | ```bash 79 | $ conda install python=3.8.8 numpy=1.20.3 80 | ``` 81 | 82 | > Conda will report what dependencies are going to be installed. This might vary depending on what operating system you use. You will also be prompted if you are happy to proceed. It will take a few seconds to install. 83 | 84 | * List all the packages installed in `test_env`. Check that the `python` and `numpy` versions match those you used. 85 | 86 | 87 | ## Exercise 5 88 | 89 | Staying with `test_env` create an `environment.yml` file that contains **only** the packages you installed from the command line. 90 | 91 | * Issue the following export command. Make sure you include the `--from-history` option or you will get a full list of everything in the environment. The output you should see is displayed below as well. 
92 | 93 | ```bash 94 | $ conda env export --from-history 95 | 96 | name: test_env 97 | channels: 98 | - defaults 99 | dependencies: 100 | - numpy=1.20.3 101 | - python=3.8.8 102 | ``` 103 | 104 | * It is also possible to export this to a named file (typically `environment.yml`) 105 | 106 | ```bash 107 | $ conda env export --from-history -f environment.yml 108 | ``` 109 | 110 | > Remember this will export the file to the current working directory. For simplicity I recommend working in the same directory as your code. This makes even more sense for example if you have a git repo. 111 | 112 | ## Exercise 6 113 | 114 | Now let's practice creating a conda env from file. I recommend working in the same directory as exercise 5. 115 | 116 | * Deactivate the `test_env` environment 117 | * Remove the `test_env` 118 | * Create the conda environment from file 119 | 120 | ```bash 121 | $ conda env create -f environment.yml 122 | ``` 123 | 124 | > This (re)creates `test_env`. 125 | 126 | * Activate `test_env` 127 | * Check what packages are installed. 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | -------------------------------------------------------------------------------- /content/03_mgt/04_exercises/03_binder.md: -------------------------------------------------------------------------------- 1 | # Binderhub exercises 2 | 3 | > For this exercise you will need a Github account. Sign-up via [https://github.com/](https://github.com/). 4 | 5 | In exercises 1 to 4 you are going to upload a Jupyter notebook to Github and share it via Binderhub. 6 | 7 | ## Exercise 1: 8 | 9 | First create the Github repo and insert a notebook file. 10 | 11 | **Task**: 12 | 13 | * Create a Github repo. You can use any repo name you choose. If you cannot decide a suggestion is 'binder_exercise' 14 | * Make a local copy of the notebook that contains the solutions to ED data wrangling exercise. 15 | * Push the notebook to the repo. 
16 | 17 | **Hints** 18 | * If you do not know how to use GitHub you can create the repository and then click on the green **upload** button. This will allow you to select the notebook and add a commit message. 19 | * If you prefer to do this via git then I recommend creating the remote repo first, cloning locally, add (and stage), commit the notebook. Finally push using `git push`. Depending on your authentication method you may be asked for your GitHub username and password. 20 | 21 | ## Exercise 2: 22 | 23 | You now need to create a conda environment file so that binderhub knows what version of python and data science packages to install. 24 | 25 | **Task**: 26 | * Create a directory in the repo called `binder` 27 | * Create a conda environment file in `binder/environment.yml` with the appropriate libraries. A suggestion is: 28 | 29 | ```YAML 30 | name: binder_ex 31 | channels: 32 | - defaults 33 | - conda-forge 34 | dependencies: 35 | - matplotlib=3.4.2 36 | - numpy=1.20.3 37 | - pandas=1.3.1 38 | - python=3.8.8 39 | ``` 40 | 41 | * Commit the changes and push to github using your preferred method. 42 | 43 | 44 | ## Exercise 3: 45 | 46 | You are now ready to share your notebook via binder. 47 | 48 | **Task** 49 | * Copy the URL of your GitHub repo's main page. 50 | * Using your browser navigate to [https://mybinder.org](https://mybinder.org) 51 | * Paste the URL of your Github repo and click on 'launch' (the build will take several minutes) 52 | 53 | ## Exercise 4: 54 | 55 | Let's add a 'launch binder badge' to a `README.md` file in your repo. 56 | 57 | **Task**: 58 | * From the BinderHub setup page copy the markdown text that you will use to create the badge. 59 | * If required (i.e. you don't already have one). Create a `README.md` file and add it to your GitHub repo. 60 | * Open `README.md` for editing. At the top paste in the copied launch binderhub markdown. 61 | * Push the update to your GitHub repo. 
62 | * Navigate to your GitHub repo and click on the badge to launch binderhub! 63 | 64 | 65 | ## Exercise 5: 66 | 67 | **Task** 68 | * Use BinderHub to share the more advanced `ts_emergency` package you created in the exercises. -------------------------------------------------------------------------------- /content/03_mgt/04_exercises/im/detrended.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/content/03_mgt/04_exercises/im/detrended.jpg -------------------------------------------------------------------------------- /content/03_mgt/04_exercises/im/diag.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/content/03_mgt/04_exercises/im/diag.jpg -------------------------------------------------------------------------------- /content/03_mgt/04_exercises_front_page.md: -------------------------------------------------------------------------------- 1 | # Exercises 2 | 3 | Managing python data science projects - exercises. 
4 | -------------------------------------------------------------------------------- /content/03_mgt/05_solutions/im/detrended.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/content/03_mgt/05_solutions/im/detrended.jpg -------------------------------------------------------------------------------- /content/03_mgt/05_solutions/im/diag.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/content/03_mgt/05_solutions/im/diag.jpg -------------------------------------------------------------------------------- /content/03_mgt/05_solutions/ts_emergency/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = '0.1.0' 2 | __author__ = 'Tom Monks' -------------------------------------------------------------------------------- /content/03_mgt/05_solutions/ts_emergency/data/.~lock.ts_ed.csv#: -------------------------------------------------------------------------------- 1 | ,tom,pop-os.localdomain,16.07.2021 15:37,file:///home/tom/.config/libreoffice/4; -------------------------------------------------------------------------------- /content/03_mgt/05_solutions/ts_emergency/datasets.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Functions to load built in datasets for ts_emergency. 3 | Datasets are downloaded from an external github repo. 
4 | 5 | The key loading function is load_ed_ts 6 | ''' 7 | 8 | import pandas as pd 9 | import numpy as np 10 | 11 | LONG_URL = 'https://raw.githubusercontent.com/health-data-science-OR/' \ 12 | + 'hpdm139-datasets/main/syn_ts_ed_long.csv' 13 | 14 | WIDE_URL = 'https://raw.githubusercontent.com/health-data-science-OR/' \ 15 | + 'hpdm139-datasets/main/syn_ts_ed_wide.csv' 16 | 17 | def load_ed_ts(data_format='wide', as_pandas=True): 18 | ''' 19 | Load the built-in ED dataset 20 | 21 | Params: 22 | ------ 23 | data_format: str 24 | 'Wide' or 'long' format. Wide format provides hospital columns. 25 | Long format provides a categorical hospital column and single attends 26 | column. 27 | 28 | as_pandas: bool, optional (default = True) 29 | Return as `pandas.Dataframe`. If False then `numpy.ndarray` 30 | 31 | Returns: 32 | ------- 33 | pandas.Dataframe or if `as_pandas=False` then returns `numpy.ndarray` 34 | 35 | ''' 36 | valid_formats = ['wide', 'w', 'long', 'l'] 37 | 38 | if data_format.lower() not in valid_formats: 39 | raise ArgumentError(f'data format should be one of {valid_formats}') 40 | 41 | if data_format == 'wide' or data_format == 'w': 42 | df = _ed_data_to_wide(LONG_URL) 43 | else: 44 | df = _ed_data_to_long(WIDE_URL) 45 | 46 | if as_pandas: 47 | return df 48 | else: 49 | return df.to_numpy() 50 | 51 | 52 | 53 | def _ed_data_to_wide(file_path): 54 | ''' 55 | Return the ED data in wide format. 56 | 57 | 1. Pivot table 58 | 2. Transpose and drop the ('attends', hosp_i) multi-index 59 | 3. Rename columns [0, 1, 2, 4] tp ['hosp_1', 'hosp_2', 'hosp_3', 'hosp_4'] 60 | 4. Index to DateTimeIndex 61 | 5. Drop the additional uneeded series 'date' (as stored in index as well) 62 | 6. 
Convert attendence numbers from int64 to int16 63 | 64 | Params: 65 | ------ 66 | file_path: str 67 | Path to wide format file 68 | 69 | Returns: 70 | ------- 71 | pandas.DataFrame 72 | ''' 73 | # column name transfers 74 | translated_names = {0:'hosp_1', 75 | 1:'hosp_2', 76 | 2:'hosp_3', 77 | 3:'hosp_4'} 78 | 79 | data_types = {'hosp_1':np.int16, 80 | 'hosp_2':np.int16, 81 | 'hosp_3':np.int16, 82 | 'hosp_4':np.int16} 83 | 84 | df = (pd.read_csv(file_path) 85 | .pivot_table(values=['attends'], index=['date'], columns=['hosp']) 86 | .T.reset_index(drop=True) 87 | .T.rename(columns=translated_names) 88 | .assign(date=lambda x: pd.to_datetime(x.index)) 89 | .set_index('date') 90 | .astype(data_types) 91 | ) 92 | 93 | return df 94 | 95 | 96 | 97 | def _ed_data_to_long(file_path): 98 | ''' 99 | Return the ED data in long format. Uses pd.wide_to_long() 100 | Assume wide format file is used. 101 | 102 | 1. pd.wide_to_long() 103 | 2. reset_index() to remove multi-index 104 | 3. rename col 'hosp_' to 'attends' 105 | 4. date to datetime 106 | 5. Convert attendence numbers from int64 to int16 amd hosp_id to int8. 107 | (could also be a categorical field.) 
108 | 109 | Params: 110 | ------ 111 | file_path: str 112 | Path to wide format file 113 | 114 | Returns: 115 | ------- 116 | pandas.DataFrame 117 | ''' 118 | 119 | translated_names = {'hosp_':'attends'} 120 | data_types = {'hosp': np.int8, 'attends':np.int16} 121 | 122 | long_df = ( 123 | pd.wide_to_long(pd.read_csv(file_path), stubnames='hosp_', 124 | i=['date'], j='hosp') 125 | .reset_index() 126 | .rename(columns=translated_names) 127 | .assign(date=lambda x: pd.to_datetime(x['date'])) 128 | .astype(data_types) 129 | ) 130 | 131 | return long_df 132 | -------------------------------------------------------------------------------- /content/03_mgt/05_solutions/ts_emergency/plotting/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/content/03_mgt/05_solutions/ts_emergency/plotting/__init__.py -------------------------------------------------------------------------------- /content/03_mgt/05_solutions/ts_emergency/plotting/tsa.py: -------------------------------------------------------------------------------- 1 | ''' 2 | tsa - time series analysis module 3 | 4 | plotting functions for time series analysis 5 | ''' 6 | 7 | # standard imports 8 | import matplotlib.pyplot as plt 9 | import numpy as np 10 | from statsmodels.graphics.tsaplots import plot_acf, plot_pacf 11 | 12 | # cross package imports 13 | from ts_emergency.plotting.view import plot_single_ed 14 | 15 | def plot_detrended(wide_df, hosp_id, ax=None): 16 | ''' 17 | Plot the first difference of the ED time series 18 | ''' 19 | 20 | # create differenced dataframe 21 | diff_df = wide_df.diff(periods=1) 22 | 23 | fig, ax = plot_single_ed(diff_df, hosp_id, ax) 24 | ax.set_title('Detrended') 25 | 26 | return fig, ax 27 | 28 | 29 | def diagnostic_plot(wide_df, hosp_id, figsize=(9, 6), maxlags=56, 30 | include_zero=False): 31 | ''' 32 | Basic plot of 
diagnostics for ED time series. 33 | 34 | 1. Detrended series 35 | 2. ACF 36 | 3. PACF 37 | 38 | Params: 39 | ------ 40 | wide_df: pandas.Dataframe 41 | ED data in wide format 42 | 43 | hosp_id: str 44 | column name for hospital 45 | 46 | figsize: (int, int), optional (default=(9,6)) 47 | size of figure 48 | 49 | maxlags: int, optional (default=56) 50 | The number of lags to include int the ACF and PACF 51 | 52 | include_zero: bool, optional (default=False) 53 | Include ACF and PACF of observation with itself in plot (=1.0) 54 | 55 | Returns: 56 | ------- 57 | fig, np.ndarray 58 | ''' 59 | fig = plt.figure(figsize=figsize, tight_layout=True) 60 | 61 | # add gridspec 62 | gs = fig.add_gridspec(3, 2) 63 | 64 | # detrended axis spans two columns 65 | ax1 = fig.add_subplot(gs[0, :]) 66 | # acf axis spans 2 rows in column idx 0 67 | ax2 = fig.add_subplot(gs[1:,0]) 68 | # pacf axis spans 2 rows in column idx 1 69 | ax3 = fig.add_subplot(gs[1:, 1]) 70 | 71 | # plot detrended on axis 1 72 | _ = plot_detrended(wide_df, hosp_id, ax=ax1) 73 | 74 | # plot acf on axis 2 75 | _ = plot_acf(wide_df[hosp_id], lags=maxlags, ax=ax2, zero=include_zero) 76 | # plot pacf on axi 77 | _ = plot_pacf(wide_df[hosp_id], lags=maxlags, ax=ax3, zero=include_zero) 78 | 79 | axs = np.array([ax1, ax2, ax3]) 80 | return fig, axs 81 | 82 | -------------------------------------------------------------------------------- /content/03_mgt/05_solutions/ts_emergency/plotting/view.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | 3 | DEFAULT_LABEL_FS = 12 4 | DEFAULT_AXIS_FS = 12 5 | DEFAULT_FIGSIZE = (12,8) 6 | 7 | def plot_single_ed(wide_df, hosp_id, ax=None, figsize=(12,3), 8 | fontsize=DEFAULT_LABEL_FS, line_width=2): 9 | ''' 10 | Plot a single ED's data 11 | Assumes data are passed in wide format. 
12 | 13 | Params: 14 | ------- 15 | wide_df: pandas.Dataframe 16 | ED time series data in wide format 17 | 18 | hosp_id: str 19 | name of hospital column to plot e.g. 'hosp_1' 20 | 21 | figsize: tuple(int, int), optional (default=(12,3)) 22 | `matplotlib` figure size 23 | 24 | fontsize: int, optional (default=DEFAULT_LABEL_FS) 25 | Size of label font 26 | 27 | line_width: int 28 | Width of the line plot 29 | 30 | Returns: 31 | ------- 32 | matplotlib fig, ax 33 | 34 | ''' 35 | 36 | if ax is None: 37 | fig = plt.figure(figsize=figsize) 38 | ax = fig.add_subplot() 39 | 40 | ax.set_xlabel("Date", fontsize=fontsize) 41 | ax.set_ylabel("Attendances", fontsize=fontsize) 42 | 43 | _ = ax.plot(wide_df[hosp_id], lw=line_width) 44 | # include x, y grid 45 | _ = ax.grid(ls='--') 46 | 47 | # set size of x, y ticks 48 | _ = ax.tick_params(axis='both', labelsize=fontsize) 49 | 50 | # return the figure 51 | return ax.figure, ax 52 | 53 | 54 | def plot_eds(wide_df, figsize=DEFAULT_FIGSIZE, label_font_size=DEFAULT_LABEL_FS, 55 | axis_font_size=DEFAULT_AXIS_FS): 56 | ''' 57 | Plot all ED's attendances in a 1x4 grid layout. 
58 | 59 | Params: 60 | ------ 61 | wide_df: pandas.Dataframe 62 | ED time series data in wide format 63 | 64 | figsize: tuple(int, int), optional (default=(12,3)) 65 | `matplotlib` figure size 66 | 67 | label_font_size: int, optional (default=DEFAULT_LABEL_FS) 68 | Size of label font 69 | 70 | axis_font_size: int, optional (default=DEFAULT_AXIS_FS) 71 | Size of axis tick font 72 | 73 | Returns: 74 | -------- 75 | matplotlib fig 76 | ''' 77 | 78 | fig, axs = plt.subplots(nrows=4, ncols=1, tight_layout=True, figsize=(12,8), 79 | sharex=True) 80 | 81 | # note that axs is a 2D array 82 | for hosp_idx in range(0, 4): 83 | _ = axs[hosp_idx].plot(wide_df[f'hosp_{hosp_idx+1}']) 84 | _ = axs[hosp_idx].set_title(f'Hospital {hosp_idx+1}', 85 | fontsize=label_font_size) 86 | _ = axs[hosp_idx].grid(ls='--') 87 | 88 | # axis labels matplotlib >=3.4 89 | AXIS_LABEL_SIZE = 12 90 | _ = fig.supylabel('ED Attendances', fontsize=axis_font_size) 91 | _ = fig.supxlabel('Date', fontsize=axis_font_size) 92 | 93 | return fig -------------------------------------------------------------------------------- /content/03_mgt/05_solutions_front_page.md: -------------------------------------------------------------------------------- 1 | # Solutions 2 | 3 | The following sections provide example solutions to the managing python project exercises. There are often many ways to solve these problems. The solutions provided should be taken as guides only. If you feel you have a better way feel free to raise an issue and suggest your solution is adopted instead! 4 | -------------------------------------------------------------------------------- /content/appendix/acknowledge.md: -------------------------------------------------------------------------------- 1 | # Acknowledgements 2 | 3 | I'd like to extend my thanks to the following people for their contributions to the book. All contributions no matter the size are welcome. 
4 | 5 | * [agh208](https://github.com/agh208): Amy Heather (MSc Health Data Science, University of Exeter. 2021/22) 6 | * [SubaruSpirit](https://github.com/SubaruSpirit). 7 | * [tristar82](https://github.com/tristar82). Elliott Coyne (MSc Health Data Science, University of Exeter. 2021/22) 8 | * [reevesglobal](https://github.com/reevesglobal) 9 | * [trptaylor](https://github.com/trptaylor) 10 | * [kaungmyatwaiyan](https://github.com/kaungmyatwaiyan) MSc Health Data Science 2021/22. 11 | * [JeffAkkerman](https://github.com/JeffAkkerman) 12 | * [ploginovic](https://github.com/ploginovic) Pavel Loginovic (MSc Health Data Science, University of Exeter. 2023/24) -------------------------------------------------------------------------------- /content/appendix/fp_lectures.md: -------------------------------------------------------------------------------- 1 | # Lectures -------------------------------------------------------------------------------- /content/appendix/fp_practicals.md: -------------------------------------------------------------------------------- 1 | # Practicals -------------------------------------------------------------------------------- /content/appendix/labs/debug1.md: -------------------------------------------------------------------------------- 1 | # Debug challenge 1 2 | 3 | ```{admonition} Challenge 4 | 5 | The simple code listing below contains a number of bugs. 6 | Can you fix the code and help it to run? 7 | ``` 8 | **Hints:** 9 | * Use a Python IDE such as`spyder` or `Visual Studio Code` it will help you debug. 10 | * Read the Python interpreter output. 11 | * The errors reported can look confusing at first, but read them carefully and they will point you to the lines of code with problems. 12 | * The `Spyder` IDE may give you some hints about formatting errors 13 | * It can be useful to use `print()` to display intermediate calculations and variable values. 
14 | * Remember that `Spyder` has a variable viewer where you can look at the value of all variables created. 15 | * There might be multiple bugs! When you fix one and try to run the code you might find another! 16 | 17 | Have a go **yourself** and then watch our approach: 18 | 19 | * https://www.youtube.com/watch?v=XCuD59bYKx0 20 | 21 | 22 | ```python 23 | 24 | 25 | def split_word_in_two(to_split): 26 | """ 27 | Returns string split into two parts 28 | 29 | If the word's length is even the two parts have 30 | equal number of characters 31 | 32 | Params: 33 | ------- 34 | to_split: str 35 | the string to split int o 36 | """ 37 | length = len(to_spit) 38 | half_length = length / 2 39 | 40 | part1 = to_split[:half] 41 | part2 = to_split[half:] 42 | 43 | return part1, part2 44 | 45 | 46 | def main(): 47 | """ 48 | Tests the split_word_in_two function. 49 | Input word = 'faster' 50 | Expected output = ('fas', 'ter') 51 | """ 52 | word_to_split = 'faster' 53 | result = split_word_in_two(word_to_split) 54 | print('Part 1 = {0}; Part 2 = {1}'.format(result[0], result[1])) 55 | 56 | 57 | if __name__ == "__main__": 58 | main() 59 | 60 | ``` -------------------------------------------------------------------------------- /content/appendix/labs/debug2.md: -------------------------------------------------------------------------------- 1 | # Debug challenge 2 2 | 3 | **This weeks debug challenges loops and nested loops.** 4 | 5 | A classic task in programming is to sort a list. Python makes this 6 | very simple by including a number of ways to sort a list. 7 | 8 | Under the hood the sorting routines are all variations on loops 9 | where values in the array are swapped until it is in ascending order. 10 | 11 | ```{admonition} Challenge 12 | 13 | The function below implements **insertion sort**. Insertion sort is an 14 | efficient algorithm for sorting a list made of two loops. An outer loop 15 | that iterates forward through the list and an inner list that iterates backwards. 
16 | 17 | The code below is not running. Can you debug it? 18 | ``` 19 | 20 | ```python 21 | 22 | def insertion_sort(to_sort): 23 | """ 24 | Sort a list of numbers using the insertion sort algorithm 25 | Return a list of sorted numbers 26 | 27 | Keyword arguments: 28 | to_sort -- an unsorted python list of numbers 29 | """ 30 | 31 | #This is the outer loop. 32 | for i in range(1, to_sort): 33 | 34 | j = i 35 | 36 | #This inner while loop. Note the backwards iteration. 37 | #The while loop terminates when either: 38 | #1. j == 0 i.e. the first element in the list is reached 39 | #2. there is no need to do any sorting i.e. to_sort[j-1] < to_sort[j] 40 | while j > 0 and to_sort[j-1] > to_sort[j] 41 | 42 | #to swap the values we need a 3rd variables (temp) 43 | temp = to_sort[j] 44 | to_sort[j] = to_sort[j-1] 45 | to_sort[j-1] = temp 46 | j -= 1 47 | 48 | return to_sort 49 | 50 | 51 | if __name__ == "__main__": 52 | list_to_sort = [14,33,27,10,35,19,42,44] 53 | sorted_list = insertion_sort(list_to_sort) 54 | print(sorted_list) 55 | 56 | ``` -------------------------------------------------------------------------------- /content/appendix/labs/src/cinema_exercise.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ Exercise: Booking Cinema Tickets and Refreshments 4 | 5 | You are going to the cinema and want to know the cost. 6 | 7 | Create 3 functions. 8 | 9 | tickets. returns the costs of tickets (i.e one or more) purchased. 10 | Normal tickets cost 10.99. Wednesdays reduce the cost by 2.00. 11 | Premium seating adds an extra 1.50 regardless of the day 12 | 13 | refreshments. returns the cost of refreshments. A user could buy 'popcorn' for 2.00 or 'fizzy pop' for 3.50 14 | 15 | cinema_trip. Adds the cost of tickets and refreshments together. 16 | 17 | """ 18 | 19 | 20 | def tickets(number, day, premium_seating): 21 | """ 22 | The cost of the cinema ticket. 
23 | Normal ticket cost is $10.99 24 | Wednesdays reduce the cost by $2.00 25 | Premium seating adds an extra $1.50 regardless of the day 26 | 27 | Parameters: 28 | ---------- 29 | number: int 30 | integer value representing the number of seats to book 31 | 32 | day: int 33 | day of the week to book (1 = Monday ... 7 = Sunday) 34 | 35 | premium_seating: bool 36 | boolean True/False. Are premium seats required. 37 | 38 | Returns: 39 | ------- 40 | float 41 | """ 42 | #fill in your code here. 43 | return 0.0 44 | 45 | 46 | def refreshment(choice ='popcorn'): 47 | """ 48 | The cost of refrehments. Choices are popcorn or fizzy pop 49 | 50 | Parameters: 51 | ---------- 52 | choice The users choice of refreshment (default = 'popcorn') 53 | 54 | Returns: 55 | ------- 56 | float 57 | """ 58 | 59 | #fill in your code here 60 | return 0.0 61 | 62 | 63 | def cinema_trip(persons, day, premium_seating, treat): 64 | """ 65 | The total cost of going to the cinema 66 | 67 | Parameters: 68 | ---------- 69 | persons: int 70 | number of people who need a ticket 71 | 72 | day: int 73 | day of the week to book (1 = Monday, 7 = Sunday) 74 | 75 | preimum_seating: bool 76 | boolean True/False if premium seats are required 77 | 78 | treat: str 79 | string value representing a choice of refreshment 80 | 81 | Returns: 82 | ------- 83 | float 84 | """ 85 | #fill in your code here 86 | return tickets(persons, day, premium_seating) + refreshment(treat) 87 | 88 | 89 | if __name__ == '__main__': 90 | persons = 2 91 | day = 1 92 | premium_seating = True 93 | treat = "popcorn" 94 | 95 | total_cost = cinema_trip(persons, day, premium_seating, treat) 96 | 97 | msg = f'today a trip to the cineman will cost you £{total_cost:.2f}' 98 | print(msg) 99 | #expected answer = £26.98 100 | 101 | persons = 3 102 | day = 3 103 | premium_seating = True 104 | treat = "fizzy pop" 105 | 106 | total_cost = cinema_trip(persons, day, premium_seating, treat) 107 | 108 | msg = f'today a trip to the cineman will cost you 
£{total_cost:.2f}' 109 | print(msg) 110 | #expected answer = £34.97 111 | -------------------------------------------------------------------------------- /content/appendix/labs/src/list_comprehensions.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | List comprehension examples 5 | 6 | List comprehensions are an alternative to for loops 7 | They work specifically with Python Lists. 8 | 9 | The code examples below give an introduction to using them. 10 | 11 | 1. Double a list of numbers 12 | 2. Call a function from a list comprehension 13 | 3. Using zip within a list function to iterate multiple lists 14 | 4. Using If statments within a list comprehension 15 | 5. Creating a list of lists using a nested list comprehension 16 | 6. Looping through a list of lists using a list comprehension 17 | 18 | @author: tom 19 | 20 | """ 21 | 22 | #%% 23 | # ============================================================================= 24 | # Example 1 - double the numbers 25 | # ============================================================================= 26 | 27 | foo = [1, 2, 3, 4] 28 | bar = [] 29 | 30 | for x in foo: 31 | bar.append(x * 2) 32 | 33 | print(bar) 34 | 35 | #%% 36 | 37 | # ============================================================================= 38 | # list comprehension approach for the same result... 39 | # ============================================================================= 40 | 41 | foo = [1, 2, 3, 4] 42 | bar = [x * 2 for x in foo] 43 | print(bar) 44 | 45 | #%% 46 | 47 | # ============================================================================= 48 | # Example 2 - convert celsius to fahrenheit 49 | # This example calls a function from within the list comprehension. 
50 | # ============================================================================= 51 | 52 | def convert_celsius_to_fahrenheit(deg_celsius): 53 | """ 54 | Convert degress celsius to fahrenheit 55 | Returns float value - temp in fahrenheit 56 | Keyword arguments: 57 | def_celcius -- temp in degrees celsius 58 | """ 59 | return (9/5) * deg_celsius + 32 60 | 61 | #list of temps in degree celsius to convert to fahrenheit 62 | celsius = [39.2, 36.5, 37.3, 41.0] 63 | 64 | #standard for loop approach 65 | fahrenheit = [] 66 | for x in celsius: 67 | fahrenheit.append(convert_celsius_to_fahrenheit(x)) 68 | 69 | 70 | print('using standard for loop: {}'.format(fahrenheit)) 71 | 72 | #implementation using a list comprehension 73 | fahrenheit = [convert_celsius_to_fahrenheit(x) for x in celsius] 74 | print('using list comprehension: {}'.format(fahrenheit)) 75 | 76 | #%% 77 | # ============================================================================= 78 | # Example 3 - convert the strings to different data types 79 | # This example also make ue of the zip function 80 | # Zip allow you to iterate through two lists at the same time 81 | # ============================================================================= 82 | 83 | inputs = ["1", "3.142", "True", "spam"] 84 | converters = [int, float, bool, str] 85 | 86 | values_with_correct_data_types = [t(s) for (s, t) in zip(inputs, converters)] 87 | print(values_with_correct_data_types) 88 | 89 | #%% 90 | # ============================================================================= 91 | # Example 4 - Using if statements within a list comprehension 92 | # The example filters a list of file names to the python files only 93 | # ============================================================================= 94 | 95 | unfiltered_files = ['test.py', 'names.csv', 'fun_module.py', 'prog.config'] 96 | 97 | python_files = [] 98 | 99 | # filter the files using a standard for loop 100 | for file in unfiltered_files: 101 | if file[-2:] == 
'py': 102 | python_files.append(file) 103 | 104 | print('using standard for loop: {}'.format(python_files)) 105 | 106 | #list comprehension 107 | python_files = [file for file in unfiltered_files if file[-2:] == 'py'] 108 | 109 | print('using list comprehension {}'.format(python_files)) 110 | 111 | 112 | #%% 113 | # ============================================================================= 114 | # Example 5 - List comprehension to create a list of lists 115 | # List comprehensions can greatly reduce the complexity of code 116 | # needed to create a list of lists. 117 | # ============================================================================= 118 | 119 | list_of_lists = [] 120 | 121 | for i in range(5): 122 | sub_list = [] 123 | for j in range(3): 124 | sub_list.append(i * j) 125 | list_of_lists.append(sub_list) 126 | 127 | print(list_of_lists) 128 | 129 | #a lists comprehension reduces 6 lines of code to 1 130 | list_of_lists = [[i * j for j in range(3)] for i in range(5)] 131 | 132 | print(list_of_lists) 133 | 134 | 135 | #%% 136 | # ============================================================================= 137 | # Example 6: Iterate over all items in a list of lists 138 | # using a list comprehension 139 | # The code converts a list of lists to a list of items 140 | # We call this flattening the list. 
141 | # ============================================================================= 142 | 143 | list_of_lists = [[8, 2, 1], [9, 1, 2], [4, 5, 100]] 144 | 145 | flat_list = [] 146 | for row in list_of_lists: 147 | for col in row: 148 | flat_list.append(col) 149 | 150 | print(flat_list) 151 | 152 | #implementation as list comprehension 153 | flat_list = [item for sublist in list_of_lists for item in sublist] 154 | 155 | print(flat_list) 156 | 157 | 158 | #%% -------------------------------------------------------------------------------- /content/appendix/labs/src/moviedb.csv: -------------------------------------------------------------------------------- 1 | "ID","Title","Budget","Box_office","Year","Meta_Critic" 2 | 1,"Amazing Spiderman",230,757.9,2012,66 3 | 2,"Ironman",140,585.2,2008,57 4 | 3,"Thor",150,449.3,2011,54 5 | 4,"Captain America: the first avenger",140,370.6,2011,66 6 | 5,"Antman",130,519.3,2015,64 7 | 6,"Guardians of the Galaxy",232.3,773,2014,76 8 | -------------------------------------------------------------------------------- /content/appendix/labs/src/py_finance.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | This module is used to demonstrate how to avoid code running on 5 | import. 6 | 7 | It contains some example functions. 8 | 9 | Example usage: 10 | 11 | import py_finance 12 | """ 13 | 14 | def pv(future_value, rate, n): 15 | 16 | ''' 17 | Discount a value at defined rate n time periods into the future. 
18 | 19 | Forumula: 20 | PV = FV / (1 + r)^n 21 | Where 22 | FV = future value 23 | r = the comparator (interest) rate 24 | n = number of years in the future 25 | 26 | Keyword arguments: 27 | future value -- the value to discount 28 | rate -- the rate at which to do the discounting 29 | n -- the number of time periods into the future 30 | ''' 31 | return future_value / (1 + rate)**n 32 | 33 | 34 | def print_pv(future_value, rate, n, present_value): 35 | ''' 36 | Prints a sentence reporting the present value of a 37 | future_value assuming a rate in n time units 38 | 39 | Keyword arguments: 40 | future value -- the value to discount 41 | rate -- the rate at which to do the discounting 42 | n -- the number of time periods into the future 43 | present_value -- the present value of the transaction 44 | ''' 45 | msg = 'Using an interest rate of {0}, ' + \ 46 | 'a payment of £{1:.2f} in {2} years time is worth £{3:.2f} today' 47 | 48 | print(msg.format(rate, future_value, n, present_value)) 49 | 50 | 51 | def test_case1(): 52 | #Test case 1 53 | future_value = 2000 54 | rate = 0.035 55 | years = 5 56 | result = pv(future_value, rate, years) 57 | 58 | print_pv(future_value, rate, years, result) 59 | 60 | def test_case2(): 61 | #Test case 2 62 | future_value = 350 63 | rate = 0.01 64 | years = 10 65 | result = pv(future_value, rate, years) 66 | 67 | print_pv(future_value, rate, years, result) 68 | 69 | def main(): 70 | test_case1() 71 | test_case2() 72 | 73 | 74 | 75 | if __name__ == '__main__': 76 | main() 77 | 78 | 79 | 80 | 81 | -------------------------------------------------------------------------------- /content/appendix/labs/src/string_manipulation.py: -------------------------------------------------------------------------------- 1 | ####################################################################### 2 | # Example: Illustrates how to use, format and manipulate strings 3 | # Author: T.Monks 4 | 
###################################################################### 5 | 6 | 7 | #print a string to console 8 | print("foo") 9 | 10 | #access a specific letter 11 | print("Spam"[0]) #access the first character in string 12 | print("Spam"[2]) #access the third character in string 13 | 14 | 15 | mystr = "Spam" 16 | print(mystr[-1]) #alternative count from end to access the last character 17 | print(mystr[-4]) #alternative count from end to access the first character 18 | 19 | #string slicing (substrings) 20 | 21 | print(mystr[:2]) #this will print the first two chars. Same as writing mystr[0:2] 22 | 23 | print(mystr[1:4]) #print chars 1 to 4 i.e. pam 24 | 25 | print(mystr[-2:]) #last two chars i.e. am 26 | print(mystr[-3:]) #last three chars i.e. pam 27 | 28 | print(mystr[1:]) #print string starting from char pos 1 i.e pam 29 | print(mystr[2:]) #print string starting from char pos 2 i.e. am 30 | 31 | mystr = "123456789" 32 | print(mystr[1:8:2]) #starting from index 1 return every other char up to index 8 = "2468" 33 | print(mystr[0:8:2]) #1357 34 | 35 | #concatenation 36 | myvar1 = "foo" 37 | myvar2 = "bar" 38 | 39 | print(myvar1 + myvar2) 40 | 41 | #string case 42 | print(myvar1.lower()) 43 | print("LoWeRCAse".lower()) 44 | print(myvar1.upper()) 45 | 46 | #string length 47 | print(len(myvar1)) 48 | 49 | #convert numeric values to strings 50 | my_num= 2 51 | print("my daughter is " + str(my_num)) 52 | 53 | #advanced formatting of output 54 | language = "python" 55 | skill = "productivity" 56 | 57 | print("%s increases your programming %s" %(language, skill)) 58 | 59 | #modern alternative output formatting using string.format 60 | print("{} increases your programming {}".format(language, skill)) 61 | 62 | #use an optional index to easily rearrange the order of output. 
63 | print("{0} increases your programming {1}".format(language, skill)) 64 | print("{1} increases your programming {0}".format(language, skill)) 65 | 66 | #reuse the same variable multiple times 67 | print("{0} increases {0} your {0} programming {1}".format(language, skill)) 68 | 69 | #formatting output to n decimal places 70 | print("{:.2f}".format(0.123456789)) 71 | print("{:.3f}".format(0.123456789)) 72 | 73 | #if you need {} in your output - double up. 74 | print("{{}}".format("double")) 75 | 76 | #splitting strings 77 | 78 | sentence = "we are the knights who say ni!" 79 | print(sentence.split()) #default is to split by space. 80 | 81 | sentence = "we|are|the|knights|who|say|ni!" 82 | print(sentence.split("|")) 83 | print(sentence.split("|", 1)) #the 2nd parameter limits the number of splits 84 | print(sentence.split("|", 2)) 85 | 86 | #the reverse of split is the join command 87 | split_data = ["we", "are", "the", "knights", "that", "say", "ni!"] 88 | sentence = " ".join(split_data) #join words with " " as between each word. 89 | print(sentence) 90 | 91 | sentence = "-".join(split_data) #join words with ", " as between each word. 
92 | print(sentence) 93 | 94 | 95 | #Strings are iterable 96 | mystr = "123456789" 97 | for c in mystr: 98 | print(c) -------------------------------------------------------------------------------- /content/appendix/labs/src/test_finance.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Fri Jan 11 16:03:41 2019 5 | 6 | @author: tom 7 | """ 8 | 9 | from py_finance import pv, print_pv 10 | 11 | future_value = 1000 12 | rate = 0.05 13 | years = 10 14 | result = pv(future_value, rate, years) 15 | 16 | print_pv(future_value, rate, years, result) 17 | -------------------------------------------------------------------------------- /content/appendix/labs/src/week1_debug_challenge1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Debug Exercise Python Lab 1 5 | 6 | The code below has a number of bugs. 7 | 8 | Can you fix the code and help it to run? 9 | 10 | 11 | """ 12 | 13 | def split_word_in_two(to_split): 14 | """ 15 | Returns string split into two parts 16 | 17 | If the word's length is even the two parts have 18 | equal number of characters 19 | 20 | Params: 21 | ------- 22 | to_split: str 23 | the string to split int o 24 | """ 25 | length = len(to_spit) 26 | half_length = length / 2 27 | 28 | part1 = to_split[:half] 29 | part2 = to_split[half:] 30 | 31 | return part1, part2 32 | 33 | 34 | def main(): 35 | """ 36 | Tests the split_word_in_two function. 
37 | Input word = 'faster' 38 | Expected output = ('fas', 'ter') 39 | """ 40 | word_to_split = 'faster' 41 | result = split_word_in_two(word_to_split) 42 | print('Part 1 = {0}; Part 2 = {1}'.format(result[0], result[1])) 43 | 44 | 45 | if __name__ == "__main__": 46 | main() 47 | 48 | -------------------------------------------------------------------------------- /content/appendix/labs/src/wk2_debug_challenge.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | 6 | Week 2: Debug Challenge. 7 | 8 | This weeks debug challenges loops and nested loops. 9 | 10 | A classic task in programming is to sort a list. Python makes this 11 | very simple by including a number of ways to sort a list. 12 | 13 | Under the hood the sorting routines are all variations on loops 14 | where values in the array are swapped until it is in ascending order. 15 | 16 | The function below implements insertion sort. Insertion sort is an 17 | efficient algorithm for sorting a list made of two loops. An outer loop 18 | that iterates forward through the list and an inner list that iterates backwards. 19 | 20 | The code below is not running. Can you debug it? 21 | 22 | Good luck! 23 | 24 | 25 | """ 26 | 27 | def insertion_sort(to_sort): 28 | """ 29 | Sort a list of numbers using the insertion sort algorithm 30 | Return a list of sorted numbers 31 | 32 | Keyword arguments: 33 | to_sort -- an unsorted python list of numbers 34 | """ 35 | 36 | #This is the outer loop. 37 | for i in range(1, to_sort): 38 | 39 | j = i 40 | 41 | #This inner while loop. Note the backwards iteration. 42 | #The while loop terminates when either: 43 | #1. j == 0 i.e. the first element in the list is reached 44 | #2. there is no need to do any sorting i.e. 
to_sort[j-1] < to_sort[j] 45 | while j > 0 and to_sort[j-1] > to_sort[j] 46 | 47 | #to swap the values we need a 3rd variables (temp) 48 | temp = to_sort[j] 49 | to_sort[j] = to_sort[j-1] 50 | to_sort[j-1] = temp 51 | j -= 1 52 | 53 | return to_sort 54 | 55 | 56 | if __name__ == "__main__": 57 | list_to_sort = [14,33,27,10,35,19,42,44] 58 | sorted_list = insertion_sort(list_to_sort) 59 | print(sorted_list) -------------------------------------------------------------------------------- /content/front_page.md: -------------------------------------------------------------------------------- 1 | ![title image](imgs/title_logo.png) 2 | 3 | # Preface 4 | 5 | Welcome to my online textbook for learning enough python to be a credible data scientist. 6 | 7 | The book was written with three audiences in mind 8 | 9 | 1. Post-graduate students studying or researching a health data science related topic 10 | 2. My MSc students in [health data science](https://www.exeter.ac.uk/postgraduate/courses/medicine/healthdatasciencemsc/) at the University of Exeter. Indeed I use this book in the module coding for ML and data science. 11 | 3. Health service analysts (particularly in the UK's NHS) who are looking to boost their python skills to be a more rounded data scientist. 12 | 13 | The book aims to support these groups because data science is a rapidly evolving discipline that offers huge potential for the future of health care, medicine and wider areas of science. I'm very exited about health data science using python and you should be too. We now have wonderful python machine learning packages such as `sklearn`, `keras` + `tensorflow` and `pytorch`. These packages are very easy to use and scripting in them can be learnt using more online tutorials than you can count. For that reason I am not going to write about these popular machine learning packages in this book. I don't really care if you can write an `sklearn` script. 
Instead I'm going to focus on making you a more rounded data scientist that can write code that's going to stand the test of time. My aim is that, by the end of the book, you will be able to write clean code that can be **confidently** published alongside your research, can be run by others and can be returned to by yourself in 5-10 years and still understood (the person most likely to reuse your work is you!). By the end of the book you will be a health data scientist and a "coder" (or if you are old like me a "programmer") and view code as a first class citizen in your data science projects. You will be able to focus on the data science as opposed to getting bogged down in the frequent coding problems you will face in real studies. This means we are going to focus a bit on code design, a bit on scientific problems, a bit on statistical programming and a bit on the management and deployment of data science code projects. 14 | 15 | This all sounds a bit pretentious doesn't it? Well perhaps a bit, but it comes from a good place. Through my work I regularly meet people and students within the data science discipline who can use a package such as `sklearn`, but can't implement a very basic algorithm in python (or anything else), control code dependencies and versioning, and (to my horror) have manual (or semi-manual) pipelines for wrangling their data in shape. These data scientists would be far more employable and useful to an organisation if they took their coding to the next level. To be more blunt kids there's money, kudos, better science, and real benefits for society that are up for grabs if people were willing to put in the effort. 16 | 17 | The book is powered by [Jupyter Book](https://jupyterbook.org/intro.html). This means that the parts of the book containing code (with a few minor caveats) are executable online using [BinderHub](https://binderhub.readthedocs.io/en/latest/index.html#) or the free version of Google Colabratory. 
To use Google Colab you will need to login to a Google account. BinderHub does not require a login. You can also download sections of the book as Jupyter notebooks (.ipynb) that you can run locally and with higher performance in Jupyter-Lab or Notebook. 18 | 19 | My decision to make the textbook entirely open and free is influenced by three factors. The first is Dr Michael Allen's - relentless and subversive - crusade to make science more open and democratic in our discipline. Over time it appears some of his philosophy has rubbed off on me as well; or he has shamelessly brainwashed me. Either way I agree that data science and the knowledge that underpins it should be free to all. The second influence was Rob J Hyndman's and George Athanasopoulos' fantastic and hugely successful online textbook [forecasting: principles and practice](https://otexts.com/fpp3/). After reading an early edition of this many years ago I was immediately convinced of the benefit of sharing and updating knowledge in this way. I've partly modelled the book on these ideas and content will be continually updated to remove errors and expanded WITHOUT needing to purchase a new edition. I'll archive all editions permanently at [Cern via Zenodo](https://zenodo.org/) (I'm confident this is a good place and will make it to the heat death of the universe). Finally, when I saw the wonderful [Jupyter Book project](https://jupyterbook.org/intro.html) I knew immediately that this was the right tool to create a book of my own. The team that develop Jupyter book are so fantastic I even forgive them for changing the way Juypter Book table of content pages work between versions 0.10 and 0.11. As I write this the book is served up for free via Microsoft's [GitHub pages](https://pages.github.com/) and provides interactive code for you to run in the cloud. The book title image and python symbol were created by me in [Inkscape](https://inkscape.org/). 20 | 21 | I don't have a copy editor...sorry. 
Instructions for reporting mistakes and any garbage are [here](001_setup/contributing.md).
42 | 43 | License info: https://creativecommons.org/licenses/by/4.0/ 44 | 45 | ## Code: MIT Licensed 46 | 47 | All code in this book is licensed under a [MIT permissive license](https://github.com/health-data-science-OR/coding-for-ml/blob/main/LICENSE) -------------------------------------------------------------------------------- /content/imgs/logo_v1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/content/imgs/logo_v1.png -------------------------------------------------------------------------------- /content/imgs/package_versus_project.drawio: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | -------------------------------------------------------------------------------- /content/imgs/package_versus_project.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/content/imgs/package_versus_project.png -------------------------------------------------------------------------------- /content/imgs/small_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/content/imgs/small_logo.png -------------------------------------------------------------------------------- /content/imgs/title.odg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/content/imgs/title.odg -------------------------------------------------------------------------------- /content/imgs/title_cropped.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/content/imgs/title_cropped.png -------------------------------------------------------------------------------- /content/imgs/title_cropped.png~: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/content/imgs/title_cropped.png~ -------------------------------------------------------------------------------- /content/imgs/title_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/content/imgs/title_logo.png -------------------------------------------------------------------------------- /images/binder_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/images/binder_1.png -------------------------------------------------------------------------------- /images/binder_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/images/binder_2.png -------------------------------------------------------------------------------- /images/book_title_page_log.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 17 | 38 | 
41 | 42 | 44 | 47 | 51 | 55 | 56 | 59 | 63 | 67 | 68 | 77 | 86 | 87 | 91 | 103 | PYTHON FOR HEALTH DATA SCIENCE 118 | 124 | 129 | 134 | 141 | 148 | 149 | 150 | 151 | -------------------------------------------------------------------------------- /images/detrended.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/images/detrended.jpg -------------------------------------------------------------------------------- /images/diag.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/images/diag.jpg -------------------------------------------------------------------------------- /images/release.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/images/release.png -------------------------------------------------------------------------------- /images/test_pypi2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/images/test_pypi2.png -------------------------------------------------------------------------------- /images/test_pypi3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/images/test_pypi3.png -------------------------------------------------------------------------------- /images/testpypi.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/images/testpypi.png --------------------------------------------------------------------------------