├── .gitignore ├── CHANGELOG.md ├── CITATION.cff ├── Dockerfile ├── LICENSE ├── README.md ├── _config.yml ├── _toc.yml ├── binder └── environment.yml ├── content ├── 001_setup │ ├── conda.md │ ├── contributing.md │ ├── docker.md │ ├── git.md │ ├── install.md │ ├── prereq.ipynb │ └── prereq │ │ ├── 01_list_comp.ipynb │ │ ├── 02_dicts.ipynb │ │ ├── 04_scope.ipynb │ │ └── 05_pep8.ipynb ├── 01_algorithms │ ├── 01_design.ipynb │ ├── 01_design │ │ ├── 01_primes.ipynb │ │ ├── 02_better_design.ipynb │ │ ├── 03_micro_opt.ipynb │ │ └── 04_summing_up.ipynb │ ├── 02_oop.ipynb │ ├── 02_oop │ │ ├── 01_python_classes.ipynb │ │ ├── 02_oop_sim.ipynb │ │ ├── 03_oop_cv.ipynb │ │ ├── numpy_cv.py │ │ └── text_adventure │ │ │ ├── __init__.py │ │ │ ├── advanced_game.py │ │ │ └── basic_game.py │ ├── 03_numpy.ipynb │ ├── 03_numpy │ │ ├── 01_performance.ipynb │ │ ├── 02_vectors.ipynb │ │ ├── 03_slicing.ipynb │ │ ├── 04_algebra.ipynb │ │ ├── 05_statistics.ipynb │ │ ├── 05a_regression.ipynb │ │ ├── 06_sampling.ipynb │ │ ├── 07_advanced_iter.ipynb │ │ ├── 08_cs1.ipynb │ │ ├── 09_cs2.ipynb │ │ ├── 10_cs3.ipynb │ │ └── data │ │ │ ├── lysis.csv │ │ │ ├── minor_illness_ed_attends.csv │ │ │ └── wisconsin.csv │ ├── 04_exercises.ipynb │ ├── 04_exercises │ │ ├── 01_science_funcs.ipynb │ │ ├── 02_array.ipynb │ │ ├── 02_basic_oop.ipynb │ │ ├── 04_numpy_stats.ipynb │ │ ├── big_special_str.txt │ │ ├── breach.csv │ │ ├── data │ │ │ ├── bank_arrivals.csv │ │ │ ├── breach.csv │ │ │ ├── dtocs.csv │ │ │ ├── lysis.csv │ │ │ ├── moviedb.csv │ │ │ └── pieces │ │ │ │ ├── p1.csv │ │ │ │ ├── p10.csv │ │ │ │ ├── p2.csv │ │ │ │ ├── p3.csv │ │ │ │ ├── p4.csv │ │ │ │ ├── p5.csv │ │ │ │ ├── p6.csv │ │ │ │ ├── p7.csv │ │ │ │ ├── p8.csv │ │ │ │ └── p9.csv │ │ ├── dtocs.csv │ │ ├── ex_templates │ │ │ ├── ex1_quickstart.py │ │ │ ├── ex2_quickstart.py │ │ │ └── lab4_debug_challenge.py │ │ └── im │ │ │ ├── all_overlap.png │ │ │ ├── brb_sol.png │ │ │ ├── one_piece.PNG │ │ │ ├── only_one_piece.png │ │ │ ├── outline_pane.PNG │ │ 
│ └── valid_layout.png │ ├── 05_debug.md │ ├── 05_debug │ │ ├── 00_debug_cv.py │ │ ├── 01_debug_numpy.md │ │ └── debug_numpy_py.py │ ├── 06_solutions.md │ ├── 06_solutions │ │ ├── 01_science_funcs.ipynb │ │ ├── 02_array.ipynb │ │ ├── 02_basic_numpy.ipynb │ │ ├── 02_basic_oop.ipynb │ │ ├── 04_numpy_stats.ipynb │ │ ├── Untitled.ipynb │ │ ├── big_special_str.txt │ │ └── data │ │ │ ├── bank_arrivals.csv │ │ │ ├── breach.csv │ │ │ ├── dtocs.csv │ │ │ ├── lysis.csv │ │ │ └── moviedb.csv │ ├── data │ │ ├── hist.csv │ │ ├── minor_illness_ed_attends.csv │ │ ├── salaries.csv │ │ └── salaries_extended.csv │ └── im │ │ ├── gsearch.PNG │ │ ├── salaries.PNG │ │ └── salaries_extended.PNG ├── 02_stat_prog │ ├── 01_pandas │ │ ├── 01_intro_pandas.ipynb │ │ ├── 02_files.ipynb │ │ ├── 03_non_standard_download.ipynb │ │ ├── 04_datetimes.ipynb │ │ ├── 05_analysing.ipynb │ │ └── 06_cs_combining.ipynb │ ├── 01_pandas_front_page.md │ ├── 02_matplotlib │ │ ├── 01_matplotlib.ipynb │ │ ├── 02_matplotlib2.ipynb │ │ ├── 02_plotting_time_series.ipynb │ │ ├── 03_cs_hm.ipynb │ │ ├── explore.png │ │ └── stacked.png │ ├── 02_visual_front_page.md │ ├── 03_exercises │ │ ├── 00_dataframes.ipynb │ │ ├── 01_data_wrangling_matplotlib.ipynb │ │ ├── 02_stroke_data_wrangling.ipynb │ │ ├── 03_visualise_ts.ipynb │ │ ├── data │ │ │ ├── di_counts.csv │ │ │ ├── di_rq_to_test.csv │ │ │ ├── di_test_to_report.csv │ │ │ ├── sw_imaging.csv │ │ │ ├── synth_lysis.csv │ │ │ └── total_referrals.csv │ │ └── hosp_1_ed.png │ ├── 03_exercises_front_page.md │ ├── 04_solutions │ │ ├── 00_dataframes.ipynb │ │ ├── 01_data_wrangling_matplotlib_solutions.ipynb │ │ ├── 02_stroke_data_wrangling_solutions.ipynb │ │ ├── 03_visualise_ts_SOLUTIONS.ipynb │ │ └── total_referrals.csv │ └── 04_solutions_front_page.md ├── 03_mgt │ ├── 01_git │ │ ├── 01_why.md │ │ ├── 02_git.md │ │ ├── 03_cs_1.md │ │ ├── 04_cs_2.md │ │ └── 05_cs_3.md │ ├── 02_packaging │ │ ├── 01_python_packages.ipynb │ │ ├── example.ipynb │ │ ├── my_package_name │ │ │ ├── 
__init__.py │ │ │ ├── datasets.py │ │ │ ├── package_data │ │ │ │ ├── example_datset_1.csv │ │ │ │ └── example_datset_2.csv │ │ │ └── plotting.py │ │ └── ts_emergency │ │ │ ├── __init__.py │ │ │ ├── data │ │ │ ├── .~lock.ts_ed.csv# │ │ │ ├── syn_ts_ed_long.csv │ │ │ └── syn_ts_ed_wide.csv │ │ │ ├── datasets.py │ │ │ └── plotting.py │ ├── 03_mgt_front_page.md │ ├── 03_pypi │ │ ├── 01_local.md │ │ ├── 02_github.md │ │ ├── 03_pypi.md │ │ ├── 04_automation.md │ │ ├── LICENSE │ │ ├── MANIFEST.in │ │ ├── environment.yml │ │ ├── requirements.txt │ │ ├── setup.py │ │ └── test_package │ │ │ ├── __init__.py │ │ │ ├── data │ │ │ └── test_data.csv │ │ │ └── test.py │ ├── 03_vc_front_page.md │ ├── 04_binder │ │ └── 01_binder.md │ ├── 04_exercises │ │ ├── 01_python_packages.ipynb │ │ ├── 02_conda.ipynb │ │ ├── 02_use_conda.md │ │ ├── 03_binder.md │ │ └── im │ │ │ ├── detrended.jpg │ │ │ └── diag.jpg │ ├── 04_exercises_front_page.md │ ├── 05_solutions │ │ ├── 01_python_packages_solutions.ipynb │ │ ├── im │ │ │ ├── detrended.jpg │ │ │ └── diag.jpg │ │ └── ts_emergency │ │ │ ├── __init__.py │ │ │ ├── data │ │ │ ├── .~lock.ts_ed.csv# │ │ │ ├── syn_ts_ed_long.csv │ │ │ └── syn_ts_ed_wide.csv │ │ │ ├── datasets.py │ │ │ └── plotting │ │ │ ├── __init__.py │ │ │ ├── tsa.py │ │ │ └── view.py │ └── 05_solutions_front_page.md ├── appendix │ ├── acknowledge.md │ ├── fp_lectures.md │ ├── fp_practicals.md │ ├── labs │ │ ├── 01_basics.ipynb │ │ ├── 02_basics.ipynb │ │ ├── debug1.md │ │ ├── debug2.md │ │ └── src │ │ │ ├── cinema_exercise.py │ │ │ ├── list_comprehensions.py │ │ │ ├── moviedb.csv │ │ │ ├── py_finance.py │ │ │ ├── string_manipulation.py │ │ │ ├── test_finance.py │ │ │ ├── week1_debug_challenge1.py │ │ │ └── wk2_debug_challenge.py │ └── lectures │ │ ├── Lecture1.ipynb │ │ └── Lecture2.ipynb ├── front_page.md └── imgs │ ├── logo_v1.png │ ├── package_versus_project.drawio │ ├── package_versus_project.png │ ├── small_logo.png │ ├── title.odg │ ├── title_cropped.png │ ├── 
title_cropped.png~ │ └── title_logo.png └── images ├── binder_1.png ├── binder_2.png ├── book_small_logo.svg ├── book_title_page_log.svg ├── detrended.jpg ├── diag.jpg ├── release.png ├── test_pypi2.png ├── test_pypi3.png └── testpypi.png /.gitignore: -------------------------------------------------------------------------------- 1 | #Jupyter book build 2 | _build/ 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | pip-wheel-metadata/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | db.sqlite3 65 | db.sqlite3-journal 66 | 67 | # Flask stuff: 68 | instance/ 69 | .webassets-cache 70 | 71 | # Scrapy stuff: 72 | .scrapy 73 | 74 | # Sphinx documentation 75 | docs/_build/ 76 | 77 | # PyBuilder 78 | target/ 79 | 80 | # Jupyter Notebook 81 | .ipynb_checkpoints 82 | 83 | # IPython 84 | profile_default/ 85 | ipython_config.py 86 | 87 | # pyenv 88 | .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All notable changes to this project will be documented in this file. 4 | 5 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), 6 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). Dates formatted as YYYY-MM-DD as per [ISO standard](https://www.iso.org/iso-8601-date-and-time-format.html). 7 | 8 | Consistent identifier (represents all versions, resolves to latest): [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.10026326.svg)](https://doi.org/10.5281/zenodo.10026326) 9 | 10 | ## [v3.0.0]() UNRELEASED 11 | 12 | ### Changed 13 | 14 | * ENV: updated to python 3.11 and added linting packages and `hatch` for packaging. 15 | * ENV: upgraded packages to latest as of Jul 2024 including numpy > 2.0. Tested all numpy notebooks for compatibility. 
16 | * Coding Scientific functions exercises: Added new exercise 3 that implements $W_q$ and $P_n$ from a M/M/1 queue. 17 | * Updated python packaging section. Retired `setuptools` approach in favour of `hatch`. Split sections on installable packages, github install, PyPI install and automation. 18 | * Removed redundant numpy notebooks from old intro python course. 19 | * Removed `seaborn` dependency from visualise time series exercise. 20 | * Minor typo and sentence fixes. 21 | 22 | ### 23 | 24 | ## 2.0.1 (2023-09-25) 25 | 26 | ### Release Highlights 27 | 28 | * Patched conda install instructions 29 | * Added contributors 30 | * Added `CHANGELOG.md` 31 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | # This CITATION.cff file was generated with cffinit. 2 | # Visit https://bit.ly/cffinit to generate yours today! 3 | 4 | cff-version: 1.2.0 5 | title: 'Python for health data science: a hands-on introduction' 6 | message: >- 7 | If you use this software, please cite it using the 8 | metadata from this file. 
9 | type: software 10 | authors: 11 | - given-names: Thomas 12 | family-names: Monks 13 | affiliation: 'University of Exeter ' 14 | orcid: 'https://orcid.org/0000-0003-2631-4481' 15 | identifiers: 16 | - type: doi 17 | value: 10.5281/zenodo.7107920 18 | description: v2.0.0 19 | repository-code: 'https://github.com/health-data-science-OR/coding-for-ml' 20 | url: 'https://www.pythonhealthdatascience.com' 21 | keywords: 22 | - Python 23 | - Health 24 | - Data Science 25 | license: MIT 26 | version: 2.0.0 27 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:20.04 2 | 3 | #install wget 4 | RUN apt-get update \ 5 | && apt-get install -y wget \ 6 | && rm -rf /var/lib/apt/lists/* 7 | 8 | #get the latest version of miniconda 9 | RUN wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh 10 | 11 | # install in batch (silent) mode, does not edit PATH or .bashrc or .bash_profile 12 | # -p path 13 | # -f force 14 | RUN bash Miniconda3-latest-Linux-x86_64.sh -b 15 | 16 | ENV PATH=/root/miniconda3/bin:${PATH} 17 | 18 | # cleanup 19 | RUN rm Miniconda3-latest-Linux-x86_64.sh 20 | 21 | #update conda 22 | RUN conda update -y conda 23 | 24 | #create directory for code. 25 | RUN mkdir /home/code 26 | 27 | #set working directory. 28 | WORKDIR /home/code 29 | 30 | # Copy all files across to container 31 | COPY . /home/code 32 | 33 | # Install anaconda, conda-forge and pip dependencies the clean up. 
34 | RUN conda env create -f binder/environment.yml && conda clean -afy 35 | 36 | #open a port 37 | EXPOSE 80 38 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 health-data-science-OR 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.8377497.svg)](https://doi.org/10.5281/zenodo.8377497) 2 | 3 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/health-data-science-OR/coding-for-ml/HEAD) 4 | 5 | # Python for health data science: a hands-on introduction 6 | 7 | Learning materials for Coding for Machine Learning and Data Science 8 | 9 | This material is also available as a [Jupyter Book](https://health-data-science-or.github.io/coding-for-ml) 10 | 11 | ## Installing the virtual environment 12 | 13 | Details of a conda virtual environment is available in `binder/environment.yml` 14 | 15 | 1. Clone the repo 16 | 2. Navigate to the repo in a terminal (Mac/Linux) or anaconda prompt (Windows) 17 | 3. Create the virtual environment 18 | * `conda env create -f binder/environment.yml` 19 | 20 | 4. Activate the environment 21 | * `conda activate hds_code` 22 | 23 | 5. Launch Jupyter-lab to edit and run code 24 | * `jupyter-lab` 25 | 26 | ## Citation 27 | 28 | Please cite using the zenodo link. LaTex is: 29 | 30 | ``` 31 | @software{monks_thomas_2023_8377497, 32 | author = {Monks, Thomas}, 33 | title = {{Python for health data science: a hands-on 34 | introduction}}, 35 | month = sep, 36 | year = 2023, 37 | note = {{If you use this software, please cite it using the 38 | metadata from this file.}}, 39 | publisher = {Zenodo}, 40 | version = {v2.0.1}, 41 | doi = {10.5281/zenodo.8377497}, 42 | url = {https://doi.org/10.5281/zenodo.8377497} 43 | } 44 | ``` -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | # book settings 2 | title: Python for health data science. 
3 | author: Thomas Monks 4 | email: t.m.w.monks@exeter.ac.uk 5 | description: >- # this means to ignore newlines until "baseurl:" 6 | Learning materials for Coding for Machine Learning and Data Science. 7 | logo: content/imgs/small_logo.png 8 | #logo: content/imgs/logo_v1.png 9 | 10 | # only build files specified in table of contents file 11 | only_build_toc_files: true 12 | 13 | execute: 14 | execute_notebooks: off #cache output from .ipynb files for faster build 15 | timeout: -1 #no time restriction on notebook execution 16 | 17 | repository: 18 | url: https://github.com/health-data-science-OR/coding-for-ml 19 | branch: main 20 | 21 | html: 22 | use_repository_button: true 23 | use_issues_button: true 24 | 25 | # Configure your Binder links, such as the URL of the BinderHub. 26 | launch_buttons: 27 | binderhub_url: "https://mybinder.org" 28 | colab_url: "https://colab.research.google.com" 29 | notebook_interface: "jupyterlab" 30 | thebe: true 31 | -------------------------------------------------------------------------------- /_toc.yml: -------------------------------------------------------------------------------- 1 | format: jb-book 2 | root: content/front_page 3 | parts: 4 | - caption: Introduction 5 | chapters: 6 | - file: content/001_setup/contributing 7 | - file: content/001_setup/install 8 | - file: content/001_setup/git 9 | - file: content/001_setup/conda 10 | - file: content/001_setup/prereq 11 | sections: 12 | - file: content/001_setup/prereq/01_list_comp 13 | - file: content/001_setup/prereq/02_dicts 14 | - file: content/001_setup/prereq/04_scope 15 | - file: content/001_setup/prereq/05_pep8 16 | - caption: Algorithms and computational modelling 17 | chapters: 18 | - file: content/01_algorithms/01_design 19 | sections: 20 | - file: content/01_algorithms/01_design/01_primes 21 | - file: content/01_algorithms/01_design/02_better_design 22 | - file: content/01_algorithms/01_design/03_micro_opt 23 | - file: content/01_algorithms/01_design/04_summing_up 
24 | - file: content/01_algorithms/02_oop 25 | sections: 26 | - file: content/01_algorithms/02_oop/01_python_classes 27 | - file: content/01_algorithms/02_oop/02_oop_sim 28 | - file: content/01_algorithms/02_oop/03_oop_cv 29 | - file: content/01_algorithms/03_numpy 30 | sections: 31 | - file: content/01_algorithms/03_numpy/01_performance 32 | - file: content/01_algorithms/03_numpy/02_vectors 33 | - file: content/01_algorithms/03_numpy/03_slicing 34 | - file: content/01_algorithms/03_numpy/04_algebra 35 | - file: content/01_algorithms/03_numpy/05_statistics 36 | - file: content/01_algorithms/03_numpy/05a_regression 37 | - file: content/01_algorithms/03_numpy/06_sampling 38 | - file: content/01_algorithms/03_numpy/07_advanced_iter 39 | - file: content/01_algorithms/03_numpy/08_cs1 40 | - file: content/01_algorithms/03_numpy/09_cs2 41 | - file: content/01_algorithms/03_numpy/10_cs3 42 | - file: content/01_algorithms/04_exercises 43 | sections: 44 | - file: content/01_algorithms/04_exercises/01_science_funcs 45 | - file: content/01_algorithms/04_exercises/02_basic_oop 46 | - file: content/01_algorithms/04_exercises/02_array 47 | - file: content/01_algorithms/04_exercises/04_numpy_stats 48 | - file: content/01_algorithms/05_debug 49 | sections: 50 | - file: content/01_algorithms/05_debug/01_debug_numpy 51 | - file: content/01_algorithms/06_solutions 52 | sections: 53 | - file: content/01_algorithms/06_solutions/01_science_funcs 54 | - file: content/01_algorithms/06_solutions/02_basic_oop 55 | - file: content/01_algorithms/06_solutions/02_array 56 | - file: content/01_algorithms/06_solutions/04_numpy_stats 57 | - caption: Statistical Programming 58 | chapters: 59 | - file: content/02_stat_prog/01_pandas_front_page 60 | sections: 61 | - file: content/02_stat_prog/01_pandas/01_intro_pandas 62 | - file: content/02_stat_prog/01_pandas/02_files 63 | - file: content/02_stat_prog/01_pandas/03_non_standard_download 64 | - file: content/02_stat_prog/01_pandas/04_datetimes 65 | - 
file: content/02_stat_prog/01_pandas/05_analysing 66 | - file: content/02_stat_prog/01_pandas/06_cs_combining 67 | - file: content/02_stat_prog/02_visual_front_page 68 | sections: 69 | - file: content/02_stat_prog/02_matplotlib/01_matplotlib 70 | - file: content/02_stat_prog/02_matplotlib/02_matplotlib2 71 | - file: content/02_stat_prog/02_matplotlib/03_cs_hm 72 | - file: content/02_stat_prog/02_matplotlib/02_plotting_time_series 73 | - file: content/02_stat_prog/03_exercises_front_page 74 | sections: 75 | - file: content/02_stat_prog/03_exercises/00_dataframes 76 | - file: content/02_stat_prog/03_exercises/01_data_wrangling_matplotlib 77 | - file: content/02_stat_prog/03_exercises/02_stroke_data_wrangling 78 | - file: content/02_stat_prog/03_exercises/03_visualise_ts 79 | - file: content/02_stat_prog/04_solutions_front_page 80 | sections: 81 | - file: content/02_stat_prog/04_solutions/00_dataframes 82 | - file: content/02_stat_prog/04_solutions/01_data_wrangling_matplotlib_solutions 83 | - file: content/02_stat_prog/04_solutions/02_stroke_data_wrangling_solutions 84 | - file: content/02_stat_prog/04_solutions/03_visualise_ts_SOLUTIONS 85 | - caption: Managing Data Science Projects 86 | chapters: 87 | - file: content/03_mgt/03_vc_front_page 88 | sections: 89 | - file: content/03_mgt/01_git/02_git 90 | - file: content/03_mgt/01_git/03_cs_1 91 | - file: content/03_mgt/01_git/04_cs_2 92 | - file: content/03_mgt/01_git/05_cs_3 93 | - file: content/03_mgt/03_mgt_front_page 94 | sections: 95 | - file: content/03_mgt/02_packaging/01_python_packages 96 | - file: content/03_mgt/03_pypi/01_local 97 | - file: content/03_mgt/03_pypi/02_github 98 | - file: content/03_mgt/03_pypi/03_pypi 99 | - file: content/03_mgt/03_pypi/04_automation 100 | - file: content/03_mgt/04_binder/01_binder 101 | - file: content/03_mgt/04_exercises_front_page 102 | sections: 103 | - file: content/03_mgt/04_exercises/01_python_packages 104 | - file: content/03_mgt/04_exercises/02_conda 105 | - file: 
content/03_mgt/04_exercises/03_binder 106 | - file: content/03_mgt/05_solutions_front_page 107 | sections: 108 | - file: content/03_mgt/05_solutions/01_python_packages_solutions 109 | - caption: Appendix 110 | chapters: 111 | - file: content/appendix/acknowledge 112 | - caption: Appendix Basic Python 113 | chapters: 114 | - file: content/appendix/fp_lectures 115 | sections: 116 | - file: content/appendix/lectures/Lecture1 117 | - file: content/appendix/lectures/Lecture2 118 | - file: content/appendix/fp_practicals 119 | sections: 120 | - file: content/appendix/labs/01_basics 121 | - file: content/appendix/labs/debug1 122 | - file: content/appendix/labs/02_basics 123 | - file: content/appendix/labs/debug2 124 | -------------------------------------------------------------------------------- /binder/environment.yml: -------------------------------------------------------------------------------- 1 | name: hds_code 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - black 6 | - flake8 7 | - hatch 8 | - jupyterlab=4.2.4 9 | - jupyterlab-spellchecker=0.8.4 10 | - matplotlib=3.9.1 11 | - nbqa 12 | - nodejs=22.5.1 13 | - numpy=2.0.1 14 | - pandas=2.2.2 15 | - pip=24.0 16 | - python=3.11 17 | - pytest=8.3.2 18 | - py7zr=0.21.1 19 | - rich=13.7.1 20 | - scikit-learn=1.5.1 21 | - scipy==1.14.0 22 | - statsmodels=0.14.2 23 | -------------------------------------------------------------------------------- /content/001_setup/conda.md: -------------------------------------------------------------------------------- 1 | # Conda virtual environment 2 | 3 | The code examples in this module have been created in using the conda virtual environment `hds_code` 4 | 5 | To create the conda environment, enter the following commands into the terminal: 6 | 7 | ```console 8 | git clone https://github.com/health-data-science-OR/coding-for-ml 9 | cd coding-for-ml 10 | conda update conda 11 | conda env create -f binder/environment.yml 12 | conda activate hds_code 13 | ``` 14 | 15 | The 
dependencies `hds_code` will install via conda are: 16 | 17 | ```yaml 18 | name: hds_code 19 | channels: 20 | - conda-forge 21 | dependencies: 22 | - jupyterlab=3.4.6 23 | - matplotlib=3.5.3 24 | - nodejs=18.8.0 25 | - numpy=1.23.2 26 | - pandas=1.4.4 27 | - pip=22.2.2 28 | - python=3.8.12 29 | - scipy==1.9.1 30 | - statsmodels=0.13.2 31 | - pip: 32 | - rich==12.5.1 33 | - scikit-learn==1.1.2 34 | - py7zr==0.20.0 35 | ``` 36 | -------------------------------------------------------------------------------- /content/001_setup/contributing.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | I welcome contributions to the book. In each case credit will be given. Please do not feel you need to be an expert to contribute all views and feedback are greatly appreciated. You can contribute in the following ways: 4 | 5 | ## Typographical and grammatical errors 6 | 7 | If you spot a typo, spelling mistake, poor grammar (no matter how minor), or just feel a sentence/paragraph could be rewritten to improve clarity, I would greatly appreciate you reporting it. You can do that by raising a Github issue via the repository. The url is [https://github.com/health-data-science-OR/coding-for-ml/issues](https://github.com/health-data-science-OR/coding-for-ml/issues) 8 | 9 | When you submit an issue please provide: 10 | 11 | * link to page of book with the error 12 | * Original sentence(s) that contain the error. 13 | * Suggested fix 14 | * Label the problem as `documentation`. 15 | 16 | ## Reporting bugs 17 | 18 | If you find a bug in any of the code examples including solutions to exercises please report via [Github issues](https://github.com/health-data-science-OR/coding-for-ml/issues). Please provide the following: 19 | 20 | * The operating system and version you are using. 21 | * Details of your python dependencies or virtual environment if you are using one. 
(if using conda please provide a environment.yml file - see the conda exercises for help ) 22 | * Steps to reproduce the problem including the url. 23 | * Please label the problem as `bug` 24 | * **Optional**: recommended fix. 25 | 26 | > Note: Before reporting a bug please check that the Jupyter notebook cells have been been run in **order**. I recommend selecting 'Reset Kernel and Clear All Outputs' from the Kernel menu and rerunning the notebook to confirm. 27 | 28 | ## Submit general feedback or request new content 29 | 30 | The book will evolve over time. I'd greatly welcome feedback, via Github issues, on the book including requests for new new topics, chapters, or expanded sections on current content such as cases studies or package functionality. 31 | 32 | ### Requests 33 | 34 | For new content requests please 35 | 36 | * Detail the requested content including description and lists of any relevant packages 37 | * Explain why this content is relevant to health data scientists 38 | * Optional: provide an example 39 | * Label the issue as an `enhancement` 40 | 41 | ### General feedback 42 | 43 | For anything I've not covered here please submit a Github issue labelled as `feedback` 44 | 45 | ## Code of conduct 46 | 47 | If you wish to contribute in any of the above ways, including responding to feedback from others, then I ask you to follow the contribution code of conduct for this book. 
48 | 49 | * Demonstrating empathy and kindness toward other people 50 | * Being respectful of differing opinions, viewpoints, and experiences 51 | * Giving and gracefully accepting constructive feedback 52 | * Accepting responsibility and apologizing to those affected by our mistakes, and learning from the experience 53 | * Focusing on what is best not just for us as individuals, but for the overall community 54 | 55 | These guidelines are adapted from the [Contributor Covenant](https://www.contributor-covenant.org/) 56 | 57 | -------------------------------------------------------------------------------- /content/001_setup/docker.md: -------------------------------------------------------------------------------- 1 | # Docker 2 | 3 | **COMING SOON!** 4 | 5 | If desired an Ubuntu 20.04 image is available from DockerHub. The image contains the conda environment `hds_code` and ships with the GitHub Repo. 6 | 7 | 8 | -------------------------------------------------------------------------------- /content/001_setup/install.md: -------------------------------------------------------------------------------- 1 | # Install python and an IDE 2 | 3 | The code in this book is written in using python 3.8.8 and a list of dependencies. 4 | 5 | ## Local installation 6 | 7 | For beginners it is is recommended that users first install 'Anaconda'. This bundles python along with data science centric IDEs called `Spyder` and `Jupyter Notebook` (I recommend the more modern Jupyter-Lab over basic notebook, but there is no requirement to use it.) 8 | 9 | https://www.anaconda.com/download/ 10 | 11 | ```{admonition} See also 12 | :class: tip 13 | Anaconda includes 'conda' (a package manager). 
An optional step is to follow our notes to use [conda](conda.md) to create a virtual environment that includes python 3.8.12 and Jupyter-Lab 3.x 14 | ``` 15 | 16 | ```{admonition} My personal preferences 17 | :class: tip 18 | Alternatively (and my preference) you can install substantially smaller [Miniconda](https://docs.conda.io/en/latest/miniconda.html) and install the packages you need using the provided conda environment or by selecting them yourself. I tend to use packages installed from [conda-forge](https://conda-forge.org/), but the packages in the Anaconda channel (defaults) are equally good. 19 | ``` 20 | 21 | 22 | ## Run our code via Binderhub or Google Colab 23 | 24 | ```{note} 25 | You have the option of running our code in the cloud without the need to install on your local machine. 26 | ``` 27 | 28 | When you navigate to books pages and exercises that contain code cells you will see a image of a 'rocketship' in the top right hand corner of the screen. Move you mouse over the image and you can choose to either open the notebook in BinderHub (will take ~1 minute to open) or [Google Colab](https://colab.research.google.com). You will need a Google account to use Google Colab. -------------------------------------------------------------------------------- /content/001_setup/prereq.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "fb54ccc4-e555-414a-95b1-2f65f30718eb", 6 | "metadata": {}, 7 | "source": [ 8 | "# Python Prerequisites\n", 9 | "\n", 10 | "This book isn't intended to teach basic python or use of python Integrated Development Environments such as Pycharm, or Visual Studio Code. There are many excellent books you can purchase (or borrow from a good library), and free courses on YouTube that you can make use of to get you started. 
If you are an absolute beginner you can also investigate some [online preparatory material](https://health-data-science-or.github.io/basic-python/content/front_page.html) I provide for the MSc in Health Data Science at Exeter, but this it no intended to be exhaustive, and you would be daft to not investigate the plethora of material available for free online.\n", 11 | "\n", 12 | "Nevertheless there are a few areas of basic python knowledge that are essential to get the most of out of the book. As such I have included a brief \n", 13 | "summary of the following topics that you may wish to revise before tackling the main material.\n", 14 | "\n", 15 | "\n", 16 | "* [List comprehensions](./prereq/01_list_comp)\n", 17 | "* [Dictionaries](./prereq/02_dicts)\n", 18 | "* [Variable scope](./prereq/04_scope)\n", 19 | "* [Coding standards](./prereq/05_pep8)" 20 | ] 21 | } 22 | ], 23 | "metadata": { 24 | "kernelspec": { 25 | "display_name": "Python 3 (ipykernel)", 26 | "language": "python", 27 | "name": "python3" 28 | }, 29 | "language_info": { 30 | "codemirror_mode": { 31 | "name": "ipython", 32 | "version": 3 33 | }, 34 | "file_extension": ".py", 35 | "mimetype": "text/x-python", 36 | "name": "python", 37 | "nbconvert_exporter": "python", 38 | "pygments_lexer": "ipython3", 39 | "version": "3.8.12" 40 | } 41 | }, 42 | "nbformat": 4, 43 | "nbformat_minor": 5 44 | } 45 | -------------------------------------------------------------------------------- /content/01_algorithms/01_design.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "54b219c6", 6 | "metadata": {}, 7 | "source": [ 8 | "# The importance of design\n", 9 | "\n", 10 | "As your experience in data science grows I have no doubt that one lesson you will repeatedly learn is the importance of a good algorithm. 
A good algorithm can turn a seven day model run time into 4 hours (and a 14 day run time due to finding a mistake in the first run into an 8 hour run). A related lesson is that how you design your code to implement an algorithm also matters. Indeed the decisions you make in implementation can also have an order of magnitude impact on run time. In this section we will explore a simple problem - computing (small) prime numbers and how the choice of algorithm and its implementation can affect run time." 11 | ] 12 | } 13 | ], 14 | "metadata": { 15 | "kernelspec": { 16 | "display_name": "Python 3 (ipykernel)", 17 | "language": "python", 18 | "name": "python3" 19 | }, 20 | "language_info": { 21 | "codemirror_mode": { 22 | "name": "ipython", 23 | "version": 3 24 | }, 25 | "file_extension": ".py", 26 | "mimetype": "text/x-python", 27 | "name": "python", 28 | "nbconvert_exporter": "python", 29 | "pygments_lexer": "ipython3", 30 | "version": "3.8.8" 31 | } 32 | }, 33 | "nbformat": 4, 34 | "nbformat_minor": 5 35 | } 36 | -------------------------------------------------------------------------------- /content/01_algorithms/01_design/04_summing_up.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "b7d58aef", 6 | "metadata": {}, 7 | "source": [ 8 | "# Summing up\n", 9 | "\n", 10 | "## What else might you do?\n", 11 | "\n", 12 | "We chose a particular algorithm to improve on trial and error in the computation of primes. So the big questions are:\n", 13 | "\n", 14 | "* are there other algorithms and options that might reduce runtime? \n", 15 | "* is this good performance good enough or a problem for our study?\n", 16 | "* are we missing any obvious big ticket design changes?\n", 17 | "\n", 18 | "There are in fact other algorithms which are more efficient at computing larger primes than our sieve, but our sieve is reasonably good for the small primes we have computed. 
Could we make further improvements in its basic design? Yes we can. For instance, we know that all even numbers above two cannot be primes. We therefore don't even need to consider these in our algorithm. This not only reduces computation, but we can also halve the size of our data structure (you might try this as an exercise).\n", 19 | "\n", 20 | "Another option that we will explore in a later section is `numpy`. This provides some very fast optimised data structures and procedures that we will compare to standard python." 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "id": "79ab5782", 26 | "metadata": {}, 27 | "source": [ 28 | "## Conclusions\n", 29 | "\n", 30 | "We've seen that a good algorithm makes a huge difference to what is feasible to compute. A trial and error approach to computing primes prevented us from even finding relatively small prime numbers in a reasonable time frame. The **Sieve of Eratosthenes** made the unfeasible suddenly feasible. We've also seen that the design of code can also affect execution time by an order of magnitude. But not all optimisations will have a huge impact on performance and some optimisation may scale with our problem size. 
" 31 | ] 32 | } 33 | ], 34 | "metadata": { 35 | "kernelspec": { 36 | "display_name": "Python 3 (ipykernel)", 37 | "language": "python", 38 | "name": "python3" 39 | }, 40 | "language_info": { 41 | "codemirror_mode": { 42 | "name": "ipython", 43 | "version": 3 44 | }, 45 | "file_extension": ".py", 46 | "mimetype": "text/x-python", 47 | "name": "python", 48 | "nbconvert_exporter": "python", 49 | "pygments_lexer": "ipython3", 50 | "version": "3.8.8" 51 | } 52 | }, 53 | "nbformat": 4, 54 | "nbformat_minor": 5 55 | } 56 | -------------------------------------------------------------------------------- /content/01_algorithms/02_oop.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "communist-courage", 6 | "metadata": {}, 7 | "source": [ 8 | "# Designing using objects" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "regulated-coral", 14 | "metadata": {}, 15 | "source": [ 16 | "> For a more detailed treatment of classes see Part IV of *Lutz. (2013). Learning Python. 5th Ed. O'Reilly.*\n", 17 | "\n", 18 | "To get the most out of python when coding algorithms and models it is essential that you understand the basics of python classes and object orientated programming (OOP). **The key takeaways from this section are that class aid code reuse and design (although when used unwisely they can overcomplicate designs!)**. We will try and do this in a fun way so you can see the benefits.\n", 19 | "\n", 20 | "**In this section you will learn:**\n", 21 | "\n", 22 | "* How to instantiate multiple instances of a class.\n", 23 | "* How to declare a class and define a class constructor method.\n", 24 | "* What is meant by class attributes and methods.\n", 25 | "\n", 26 | "> It is worth noting that in python you don't have to use classes you can actually achieve everything with functions. 
However, the abstraction benefits of OO are really important for the design and organisation of complex projects." 27 | ] 28 | } 29 | ], 30 | "metadata": { 31 | "kernelspec": { 32 | "display_name": "Python 3 (ipykernel)", 33 | "language": "python", 34 | "name": "python3" 35 | }, 36 | "language_info": { 37 | "codemirror_mode": { 38 | "name": "ipython", 39 | "version": 3 40 | }, 41 | "file_extension": ".py", 42 | "mimetype": "text/x-python", 43 | "name": "python", 44 | "nbconvert_exporter": "python", 45 | "pygments_lexer": "ipython3", 46 | "version": "3.8.8" 47 | } 48 | }, 49 | "nbformat": 4, 50 | "nbformat_minor": 5 51 | } 52 | -------------------------------------------------------------------------------- /content/01_algorithms/02_oop/text_adventure/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/content/01_algorithms/02_oop/text_adventure/__init__.py -------------------------------------------------------------------------------- /content/01_algorithms/03_numpy.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "db806001", 6 | "metadata": {}, 7 | "source": [ 8 | "# Scientific coding in `numpy`" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "1e0fd607", 14 | "metadata": {}, 15 | "source": [ 16 | "So far we have only covered standard python for health data science. Its very important to develop your skills in standard python as you will undoubtedly use them the most. Beyond the standard library you will learn why python is an exciting and popular language in data science: there is a whole ecosystem of scientific libraries built around algorithms, modelling data manipulating, processing and visualision. The fundamental and in my view most important of these is [numpy](https://numpy.org/). 
The most important contribution of the `numpy` library is the concept of an efficient n-dimensional **array**. \n", 17 | "\n", 18 | "**I know what you are thinking**: 'why does python need an array when we already have lists and other similar data structures?' The answer is simply speed of computation: `numpy` arrays are lightning fast relative to standard python. As you will learn, underneath the hood a `list` is very different from a `numpy` array. \n", 19 | "\n", 20 | "---\n", 21 | "\n", 22 | "```{admonition} \"This is too much\"\n", 23 | "It's worth saying that in my view this (enormous) efficiency benefit does come with a trade-off. `numpy` does have a higher learning curve than standard python for new coders. When I first taught numpy it was part of a course in analytics introducing mathematics and business students to data science in python. After our first `numpy` computer lab my favourite quote from a group of distressed students was **\"this is too much... TOO MUCH\"** (it was actually shouted at me). `numpy` is less pythonic and it can at times be difficult to get the elegance of design you want matched with `numpy` code. However, for many mathematical operations `numpy` code can be more readable due to a concept called 'broadcasting' that we shall cover in a later section. Do persevere if you find it difficult at first. To be clear if you are doing substantive computational work in python you should be using `numpy` and you need to know how to use it to be employable in health data science.\n", 24 | "\n", 25 | "The material in the following sections has been designed to try and avoid my \"too much, too much\" problem. I hope you enjoy it. 
If you have any suggestions on what you need explaining do let me know.\n", 26 | "```\n", 27 | "\n", 28 | "\n", 29 | "\n" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "id": "dd1e9495", 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [] 39 | } 40 | ], 41 | "metadata": { 42 | "kernelspec": { 43 | "display_name": "Python 3 (ipykernel)", 44 | "language": "python", 45 | "name": "python3" 46 | }, 47 | "language_info": { 48 | "codemirror_mode": { 49 | "name": "ipython", 50 | "version": 3 51 | }, 52 | "file_extension": ".py", 53 | "mimetype": "text/x-python", 54 | "name": "python", 55 | "nbconvert_exporter": "python", 56 | "pygments_lexer": "ipython3", 57 | "version": "3.8.12" 58 | } 59 | }, 60 | "nbformat": 4, 61 | "nbformat_minor": 5 62 | } 63 | -------------------------------------------------------------------------------- /content/01_algorithms/03_numpy/07_advanced_iter.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "id": "e3650b93-d8bc-40fc-9740-c78e07617b0a", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import numpy as np" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "id": "a3fae3c4-a320-4328-99cc-69fffb471c12", 16 | "metadata": {}, 17 | "source": [ 18 | "# Advanced Array Iteration\n", 19 | "\n", 20 | "For most of the `numpy` code that I write in data science applications I make use of slicing, indexing and standard operations. However, occationally there is a need to use a `numpy` iteration function called `nditer`. This might be useful in an instance where I need to iterate over each element of an 2, 3 or 4D array without including multiple for loops. There is extensive documentation for this on the `numpy` docs. 
Here we will consider some basic functionality that I have found useful in applied work.\n", 21 | "\n", 22 | "## A matrix example\n", 23 | "\n", 24 | "We will consider how to iterate over each element in a 2 dimensional array. You obviously easily do this in standard python. Here's a simple example:" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 166, 30 | "id": "807cb26b-1d55-45a0-ab39-9d4c8045a0b5", 31 | "metadata": {}, 32 | "outputs": [ 33 | { 34 | "name": "stdout", 35 | "output_type": "stream", 36 | "text": [ 37 | "[[0 1 2]\n", 38 | " [3 4 5]]\n" 39 | ] 40 | } 41 | ], 42 | "source": [ 43 | "a = np.arange(6).reshape(2, 3)\n", 44 | "print(a)" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "id": "9761376f-4d23-4115-9cd9-015539e211f8", 50 | "metadata": {}, 51 | "source": [ 52 | "A standard python implementation to iterate over all combinations is as follows. Note the requirement of an inner loop." 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 167, 58 | "id": "6d52334c-ff03-4956-993d-57865b8f3a94", 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "def standard_all_element_iteration(a, print_out=True):\n", 63 | " for row in a:\n", 64 | " for col in row:\n", 65 | " if print_out: print(col, end= ' ')" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 168, 71 | "id": "9218afa6-9bdb-4525-97e9-85b6fa54c789", 72 | "metadata": {}, 73 | "outputs": [ 74 | { 75 | "name": "stdout", 76 | "output_type": "stream", 77 | "text": [ 78 | "0 1 2 3 4 5 " 79 | ] 80 | } 81 | ], 82 | "source": [ 83 | "standard_all_element_iteration(a)" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "id": "a3f023c9-7b22-41c2-93cd-201df8937cdb", 89 | "metadata": {}, 90 | "source": [ 91 | "When we need to iterate over all elements of an array then we can use nditer to eliminate the inner loop." 
92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 169, 97 | "id": "6be1a102-5e0f-4842-b121-847bb8246e9c", 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "def nditer_all_element_iteration(a, print_out=True):\n", 102 | " for element in np.nditer(a):\n", 103 | " if print_out: print(element, end=' ')" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 170, 109 | "id": "46f5566c-ce9d-4708-8ce8-4e0516eb5a62", 110 | "metadata": {}, 111 | "outputs": [ 112 | { 113 | "name": "stdout", 114 | "output_type": "stream", 115 | "text": [ 116 | "0 1 2 3 4 5 " 117 | ] 118 | } 119 | ], 120 | "source": [ 121 | "nditer_all_element_iteration(a)" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "id": "f8fb9b51-5b7f-40e1-bb3e-08b3693523c6", 127 | "metadata": {}, 128 | "source": [ 129 | "The result is that we have considerably faster iteration because the inner loop executes in C." 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 171, 135 | "id": "ebdf3b65-91c1-4323-b676-89db3407d044", 136 | "metadata": {}, 137 | "outputs": [ 138 | { 139 | "name": "stdout", 140 | "output_type": "stream", 141 | "text": [ 142 | "1.29 µs ± 5.3 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)\n", 143 | "640 ns ± 5.14 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)\n" 144 | ] 145 | } 146 | ], 147 | "source": [ 148 | "%timeit standard_all_element_iteration(a, print_out=False)\n", 149 | "%timeit nditer_all_element_iteration(a, print_out=False)" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "id": "9b885b6d-7917-413d-8ef5-56a64606f8ca", 155 | "metadata": {}, 156 | "source": [ 157 | "Note that the iteration took place in across the rows our the array `a`. 
To iterate across the all elements column-wise you can use 'Fortran' ordering by passing the parameter `order='F'` to `np.nditer`" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 174, 163 | "id": "d6086e14-5018-44cc-9adb-b045b3fd0a9f", 164 | "metadata": {}, 165 | "outputs": [ 166 | { 167 | "name": "stdout", 168 | "output_type": "stream", 169 | "text": [ 170 | "[[0 1 2]\n", 171 | " [3 4 5]]\n" 172 | ] 173 | } 174 | ], 175 | "source": [ 176 | "print(a)" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": 173, 182 | "id": "45bfde47-bd52-44db-a0ef-8d91de997b9e", 183 | "metadata": {}, 184 | "outputs": [ 185 | { 186 | "name": "stdout", 187 | "output_type": "stream", 188 | "text": [ 189 | "0 3 1 4 2 5 " 190 | ] 191 | } 192 | ], 193 | "source": [ 194 | "for element in np.nditer(a, order='F'):\n", 195 | " print(element, end=' ')" 196 | ] 197 | } 198 | ], 199 | "metadata": { 200 | "kernelspec": { 201 | "display_name": "Python 3", 202 | "language": "python", 203 | "name": "python3" 204 | }, 205 | "language_info": { 206 | "codemirror_mode": { 207 | "name": "ipython", 208 | "version": 3 209 | }, 210 | "file_extension": ".py", 211 | "mimetype": "text/x-python", 212 | "name": "python", 213 | "nbconvert_exporter": "python", 214 | "pygments_lexer": "ipython3", 215 | "version": "3.7.3" 216 | } 217 | }, 218 | "nbformat": 4, 219 | "nbformat_minor": 5 220 | } 221 | -------------------------------------------------------------------------------- /content/01_algorithms/03_numpy/09_cs2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "d01ee12f", 7 | "metadata": { 8 | "tags": [ 9 | "hide-input" 10 | ] 11 | }, 12 | "outputs": [], 13 | "source": [ 14 | "import numpy as np\n", 15 | "import math" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "id": "991310e4", 21 | "metadata": {}, 22 | "source": [ 23 | "# 
Case study 2: prime sieve\n", 24 | "\n", 25 | "This chapter opened by exploring the importance of good algorithm and code design. We spent a fair bit of time redesigning and micro-optimising a function in standard python that implemented a prime sieve. For large n, for example greater than 10 million, the function `prime_sieve_best` was our fastest option." 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 2, 31 | "id": "80ad6468", 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "def prime_sieve_best(n):\n", 36 | " '''\n", 37 | " Our fastest prime sieve in standard python\n", 38 | " Fastest for large n e.g. > 10m.\n", 39 | " '''\n", 40 | " candidates = bytearray(b\"\\x01\") * (n + 1)\n", 41 | " candidates[0] = 0\n", 42 | " candidates[1] = 0\n", 43 | " limit = int(math.sqrt(n)) + 1 \n", 44 | " \n", 45 | " for i in range(2, limit): \n", 46 | " if candidates[i]:\n", 47 | " candidates[i+i::i] = [0] * ((n - i) // i)\n", 48 | " \n", 49 | " return [i for i in range(n+1) if candidates[i]] " 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "id": "517eef0e", 55 | "metadata": {}, 56 | "source": [ 57 | "The function `prime_sieve_np` again reimplements the algorithm, but this time using `numpy` optimised arrays and functions." 
58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 3, 63 | "id": "d266ffac", 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "def prime_sieve_np(n):\n", 68 | " '''\n", 69 | " Prime sieve reimplemented in NumPy.\n", 70 | " '''\n", 71 | " candidates = np.ones(n, dtype=bool)\n", 72 | " limit = int(np.sqrt(n)) + 1\n", 73 | " candidates[0:2] = False\n", 74 | " \n", 75 | " for i in range(2, limit):\n", 76 | " if candidates[i]:\n", 77 | " candidates[i+i::i] = False\n", 78 | " return np.flatnonzero(candidates)" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "id": "1681fee5", 84 | "metadata": {}, 85 | "source": [ 86 | "You should see a reasonable speed up, for free, using `numpy`. Let's compare it for an even larger n." 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 4, 92 | "id": "3015a9cd", 93 | "metadata": {}, 94 | "outputs": [ 95 | { 96 | "name": "stdout", 97 | "output_type": "stream", 98 | "text": [ 99 | "11.4 s ± 601 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" 100 | ] 101 | } 102 | ], 103 | "source": [ 104 | "HUNDRED_MILLION = 100_000_000\n", 105 | "%timeit len(prime_sieve_best(HUNDRED_MILLION))" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 5, 111 | "id": "a87aea88", 112 | "metadata": {}, 113 | "outputs": [ 114 | { 115 | "name": "stdout", 116 | "output_type": "stream", 117 | "text": [ 118 | "1.21 s ± 42.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" 119 | ] 120 | } 121 | ], 122 | "source": [ 123 | "%timeit len(prime_sieve_np(HUNDRED_MILLION))" 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "id": "53518015", 129 | "metadata": {}, 130 | "source": [ 131 | "That's should provide around a factor of 10 speed up. On my machine runtime dropped from around 1 seconds on average to 1.1 seconds on average.\n", 132 | "\n", 133 | "This is also a nice example where, in my opinion, the numpy code is more readable than the standard python. 
This is partly because `numpy` broadcasting means we can the elements in a slice cleanly. i.e.\n", 134 | "\n", 135 | "```python\n", 136 | "candidates[i+i::i] = False\n", 137 | "```\n", 138 | "verus standard python\n", 139 | "\n", 140 | "```python\n", 141 | "candidates[i+i::i] = [0] * ((n - i) // i)\n", 142 | "```" 143 | ] 144 | } 145 | ], 146 | "metadata": { 147 | "kernelspec": { 148 | "display_name": "Python 3", 149 | "language": "python", 150 | "name": "python3" 151 | }, 152 | "language_info": { 153 | "codemirror_mode": { 154 | "name": "ipython", 155 | "version": 3 156 | }, 157 | "file_extension": ".py", 158 | "mimetype": "text/x-python", 159 | "name": "python", 160 | "nbconvert_exporter": "python", 161 | "pygments_lexer": "ipython3", 162 | "version": "3.7.3" 163 | } 164 | }, 165 | "nbformat": 4, 166 | "nbformat_minor": 5 167 | } 168 | -------------------------------------------------------------------------------- /content/01_algorithms/03_numpy/data/lysis.csv: -------------------------------------------------------------------------------- 1 | per_treated 2 | 0.018867925 3 | 0.03030303 4 | 0.018867925 5 | 0.018867925 6 | 0.060606061 7 | 0.018867925 8 | 0.045454545 9 | 0.045454545 10 | 0.018867925 11 | 0.075757576 12 | 0.045454545 13 | 0.015151515 14 | 0.03030303 15 | 0.018867925 16 | 0.015151515 17 | 0.060606061 18 | 0.060606061 19 | 0.03030303 20 | 0.015151515 21 | 0.045454545 22 | 0.015151515 23 | 0.03030303 24 | 0.03030303 25 | 0.075757576 26 | 0.060606061 27 | 0.03030303 28 | 0.060606061 29 | 0.03030303 30 | 0.060606061 31 | 0.03030303 32 | 0.060606061 33 | 0.060606061 34 | 0.075757576 35 | 0.045454545 36 | 0.075757576 37 | 0.03030303 38 | 0.045454545 39 | 0.015151515 40 | 0.121212121 41 | 0.03030303 42 | 0.045454545 43 | 0.045454545 44 | 0.075757576 45 | 0.166666667 46 | 0.03030303 47 | 0.090909091 48 | 0.090909091 49 | 0.090909091 50 | 0.25 51 | 0.147058824 52 | 0.161764706 53 | 0.117647059 54 | 0.191176471 55 | 0.132352941 56 | 
-------------------------------------------------------------------------------- /content/01_algorithms/03_numpy/data/minor_illness_ed_attends.csv: -------------------------------------------------------------------------------- 1 | attends_rate_per_10k_pop 2 | 2.11927795 3 | 3.490575446 4 | 3.989229081 5 | 2.368604767 6 | 3.241248629 7 | 2.867258402 8 | 3.11658522 9 | 2.742594994 10 | 3.615238855 11 | 3.615238855 12 | 4.363219308 13 | 3.11658522 14 | 3.739902264 15 | 2.243941358 16 | 3.241248629 17 | 1.620624314 18 | 2.243941358 19 | 2.991921811 20 | 2.368604767 21 | 2.368604767 22 | 2.368604767 23 | 3.11658522 24 | 2.493268176 25 | 2.368604767 26 | 3.11658522 27 | 2.742594994 28 | 3.739902264 29 | 1.994614541 30 | 2.867258402 31 | 1.620624314 32 | 2.991921811 33 | 3.365912037 34 | 1.620624314 35 | 3.739902264 36 | 2.742594994 37 | 3.11658522 38 | 2.493268176 39 | 2.368604767 40 | 3.739902264 41 | 3.864565673 42 | 1.745287723 43 | 4.238555899 44 | 2.368604767 45 | 3.615238855 46 | 1.994614541 47 | 2.11927795 48 | 2.991921811 49 | 2.617931585 50 | 2.243941358 51 | 2.368604767 52 | 3.490575446 53 | 2.368604767 54 | 2.867258402 55 | 2.991921811 56 | 2.867258402 57 | 2.867258402 58 | 2.991921811 59 | 3.241248629 60 | 2.617931585 61 | 3.11658522 62 | 3.11658522 63 | 2.11927795 64 | 3.864565673 65 | 2.867258402 66 | 3.989229081 67 | 5.111199761 68 | 2.867258402 69 | 2.867258402 70 | 2.493268176 71 | 3.739902264 72 | 3.739902264 73 | 1.869951132 74 | 2.11927795 75 | 3.615238855 76 | -------------------------------------------------------------------------------- /content/01_algorithms/04_exercises.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "strange-apollo", 6 | "metadata": {}, 7 | "source": [ 8 | "# Exercises" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "id": "north-destination", 15 | "metadata": {}, 16 | "outputs": [], 17 | 
"source": [] 18 | } 19 | ], 20 | "metadata": { 21 | "kernelspec": { 22 | "display_name": "Python 3 (ipykernel)", 23 | "language": "python", 24 | "name": "python3" 25 | }, 26 | "language_info": { 27 | "codemirror_mode": { 28 | "name": "ipython", 29 | "version": 3 30 | }, 31 | "file_extension": ".py", 32 | "mimetype": "text/x-python", 33 | "name": "python", 34 | "nbconvert_exporter": "python", 35 | "pygments_lexer": "ipython3", 36 | "version": "3.8.8" 37 | } 38 | }, 39 | "nbformat": 4, 40 | "nbformat_minor": 5 41 | } 42 | -------------------------------------------------------------------------------- /content/01_algorithms/04_exercises/02_basic_oop.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "concerned-excess", 6 | "metadata": {}, 7 | "source": [ 8 | "# Basic Object Orientated Methods\n", 9 | "\n", 10 | "Follow these simple OOP exercises to practice and gain confidence in coding classes. " 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "id": "listed-equilibrium", 16 | "metadata": {}, 17 | "source": [ 18 | "## Exercise 1\n", 19 | "\n", 20 | "**Task:**\n", 21 | "* Create a class called `Patient`. \n", 22 | "* The class should contain a constructor that accepts the following parameters. The parameters should be stored in appropriately named attributes.\n", 23 | " * patient_id: int\n", 24 | " * age: int" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "id": "behind-choice", 30 | "metadata": {}, 31 | "source": [ 32 | "**Hints**\n", 33 | "* Don't forget to include the `self` parameter!\n", 34 | "* Make sure you use correct case for the class name. `Patient` follows PEP8 guidelines while `patient` does not!" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "id": "valued-detroit", 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "# your code here ..." 
45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "id": "upset-stanford", 50 | "metadata": {}, 51 | "source": [ 52 | "## Exercise 2:\n", 53 | "\n", 54 | "**Task:**\n", 55 | "* Create a class called `Ward`\n", 56 | "* Code a constructor method. \n", 57 | " * It should accept `ward_id` (int) as parameter and assign it to an attribute\n", 58 | " * It should create a new empty list attribute that will hold patients staying on the ward. \n", 59 | "* Create a method called `add_patient`. It should accept a parameter called `patient` (that is a patient class).\n", 60 | "* Create a method or property called `n_patients`. It should return the number of patients on the ward.\n", 61 | "\n", 62 | "**Hints:**\n", 63 | "* Don't forget the `self` parameter in the method!\n" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 21, 69 | "id": "offensive-devices", 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "# your code here ..." 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "id": "widespread-dubai", 79 | "metadata": {}, 80 | "source": [ 81 | "## Exercise 3:\n", 82 | "\n", 83 | "You will now test the `Ward` class by generating a number of patients and adding them to a ward object.\n", 84 | "\n", 85 | "**Task:**\n", 86 | "* Code a function that first creates a `Ward` object and then adds a user specified number of `Patient` instances via the `add_patient` function.\n", 87 | "* The function must return the ward object.\n", 88 | "* Test your function with 5 patients.\n", 89 | "\n", 90 | "**Hints**:\n", 91 | "* You will need to design the function so that it allocates a patient an age. One option is to randomly generate an age in a given range. You could achieve this using the `random.randint()` function. 
E.g.\n", 92 | "\n", 93 | "```python\n", 94 | "from random import randint\n", 95 | "lower, upper = 60, 95\n", 96 | "age = randint(lower, upper)\n", 97 | "\n", 98 | "```" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 9, 104 | "id": "threatened-great", 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "# your code here ..." 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "id": "naughty-rugby", 114 | "metadata": {}, 115 | "source": [ 116 | "## Exercise 4:\n", 117 | "\n", 118 | "**Task:**\n", 119 | "* Now create a `Hospital` class\n", 120 | "* The class should allow the creation of new wards as well as adding a patient to a user specified ward.\n", 121 | "* The class must provide a `n_patients` method or property that returns the uptodate total of patients in the hospital.\n", 122 | "* Create some test data and create a `Hospital` object. Return the total number of patients in the hospital." 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "id": "bored-injection", 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "# your code here ..." 133 | ] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "id": "neutral-friendship", 138 | "metadata": {}, 139 | "source": [ 140 | "## Exercise 5\n", 141 | "\n", 142 | "**Task:**\n", 143 | "\n", 144 | "Let's create a new type of patient specific to those with respiratory conditions. \n", 145 | "\n", 146 | "The new class will also accept `patient_id` and `age`. You will need to create two new parameters as well: `pack_yrs` and `fev1`. Fyi:\n", 147 | "\n", 148 | "* A pack year is defined as twenty cigarettes smoked everyday for one year \n", 149 | "* FEV1 stands for Forced Expiratory Volumne and is a percentage measured out of 100%. 
Lower values are worse.\n", 150 | "\n", 151 | "\n", 152 | "Call the class `RespiratoryPatient`\n", 153 | "\n", 154 | "**Hints**:\n", 155 | "* You can solve this exercise by either using inheritance or composition. Compositin is a bit harder (and more code), but its more flexible and safer in practice." 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 38, 161 | "id": "returning-status", 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [ 165 | "# your code here ..." 166 | ] 167 | } 168 | ], 169 | "metadata": { 170 | "kernelspec": { 171 | "display_name": "Python 3 (ipykernel)", 172 | "language": "python", 173 | "name": "python3" 174 | }, 175 | "language_info": { 176 | "codemirror_mode": { 177 | "name": "ipython", 178 | "version": 3 179 | }, 180 | "file_extension": ".py", 181 | "mimetype": "text/x-python", 182 | "name": "python", 183 | "nbconvert_exporter": "python", 184 | "pygments_lexer": "ipython3", 185 | "version": "3.8.8" 186 | } 187 | }, 188 | "nbformat": 4, 189 | "nbformat_minor": 5 190 | } 191 | -------------------------------------------------------------------------------- /content/01_algorithms/04_exercises/breach.csv: -------------------------------------------------------------------------------- 1 | breaches 2 | 33184 3 | 41151 4 | 47414 5 | 46436.42857 6 | 89917.28571 7 | 72889.28571 8 | 46942.85714 9 | 59172.71429 10 | 57379.85714 11 | 54805.42857 12 | 51701.71429 13 | 44885.42857 14 | 45076.57143 15 | 55392.42857 16 | 62452.71429 17 | 59131.85714 18 | 72832 19 | 73746.57143 20 | 85427.14286 21 | 66947 22 | 64561 23 | 61520.71429 24 | 61439.28571 25 | 56433.57143 26 | 54607 27 | 59110.14286 28 | 68868.42857 29 | 70014.28571 30 | 97186.57143 31 | 94561.42857 32 | 94112.28571 33 | 120995 34 | 120121.7143 35 | 64823.28571 36 | 56340 37 | 65588.71429 38 | 68396.14286 39 | 74543.71429 40 | 79250 41 | 70781.14286 42 | 84049.28571 43 | 83995.71429 44 | 89101.28571 45 | 85828.71429 46 | 88957.14286 47 | 102189.7143 48 | 
92863.71429 49 | 96272 50 | 88879.71429 51 | 99849.71429 52 | 118978.1429 53 | 119731.1429 54 | 192155 55 | 153156.4286 56 | 132867.8571 57 | 141194 58 | 125799.2857 59 | 110284.4286 60 | 99324 61 | 97475 62 | 106475 63 | 123111 64 | 147554 65 | 163472 66 | 168604 67 | 216286 68 | 229081 69 | 265834 70 | 186122 71 | 201329 72 | 184912 73 | 201973 74 | 174419 75 | 182597 76 | 219137 77 | 221713 78 | 268818 79 | 281612 80 | 216416 81 | 201392 82 | -------------------------------------------------------------------------------- /content/01_algorithms/04_exercises/data/breach.csv: -------------------------------------------------------------------------------- 1 | breaches 2 | 33184 3 | 41151 4 | 47414 5 | 46436.42857 6 | 89917.28571 7 | 72889.28571 8 | 46942.85714 9 | 59172.71429 10 | 57379.85714 11 | 54805.42857 12 | 51701.71429 13 | 44885.42857 14 | 45076.57143 15 | 55392.42857 16 | 62452.71429 17 | 59131.85714 18 | 72832 19 | 73746.57143 20 | 85427.14286 21 | 66947 22 | 64561 23 | 61520.71429 24 | 61439.28571 25 | 56433.57143 26 | 54607 27 | 59110.14286 28 | 68868.42857 29 | 70014.28571 30 | 97186.57143 31 | 94561.42857 32 | 94112.28571 33 | 120995 34 | 120121.7143 35 | 64823.28571 36 | 56340 37 | 65588.71429 38 | 68396.14286 39 | 74543.71429 40 | 79250 41 | 70781.14286 42 | 84049.28571 43 | 83995.71429 44 | 89101.28571 45 | 85828.71429 46 | 88957.14286 47 | 102189.7143 48 | 92863.71429 49 | 96272 50 | 88879.71429 51 | 99849.71429 52 | 118978.1429 53 | 119731.1429 54 | 192155 55 | 153156.4286 56 | 132867.8571 57 | 141194 58 | 125799.2857 59 | 110284.4286 60 | 99324 61 | 97475 62 | 106475 63 | 123111 64 | 147554 65 | 163472 66 | 168604 67 | 216286 68 | 229081 69 | 265834 70 | 186122 71 | 201329 72 | 184912 73 | 201973 74 | 174419 75 | 182597 76 | 219137 77 | 221713 78 | 268818 79 | 281612 80 | 216416 81 | 201392 82 | -------------------------------------------------------------------------------- /content/01_algorithms/04_exercises/data/dtocs.csv: 
-------------------------------------------------------------------------------- 1 | dtoc 2 | 4940 3 | 5004 4 | 4588 5 | 4409 6 | 3861 7 | 4597 8 | 4404 9 | 4170 10 | 3910 11 | 4056 12 | 4137 13 | 4228 14 | 4144 15 | 4165 16 | 4150 17 | 4165 18 | 3617 19 | 4094 20 | 4007 21 | 4028 22 | 3954 23 | 3857 24 | 4086 25 | 4031 26 | 3961 27 | 4102 28 | 4115 29 | 3894 30 | 3448 31 | 4188 32 | 4007 33 | 4053 34 | 4046 35 | 4184 36 | 3888 37 | 3961 38 | 4084 39 | 4231 40 | 4147 41 | 4200 42 | 3649 43 | 4221 44 | 4276 45 | 4327 46 | 4207 47 | 4516 48 | 4363 49 | 4612 50 | 4704 51 | 4960 52 | 4930 53 | 5063 54 | 4475 55 | 5221 56 | 4942 57 | 4948 58 | 4739 59 | 4972 60 | 4996 61 | 4888 62 | 5114 63 | 5247 64 | 5330 65 | 5573 66 | 5004 67 | 5777 68 | 5714 69 | 5601 70 | 5852 71 | 5996 72 | 6152 73 | 6361 74 | 6387 75 | 6759 76 | 6777 77 | 6771 78 | 6167 79 | 7118 80 | 6855 81 | 6648 82 | -------------------------------------------------------------------------------- /content/01_algorithms/04_exercises/data/lysis.csv: -------------------------------------------------------------------------------- 1 | per_treated 2 | 0.018867925 3 | 0.03030303 4 | 0.018867925 5 | 0.018867925 6 | 0.060606061 7 | 0.018867925 8 | 0.045454545 9 | 0.045454545 10 | 0.018867925 11 | 0.075757576 12 | 0.045454545 13 | 0.015151515 14 | 0.03030303 15 | 0.018867925 16 | 0.015151515 17 | 0.060606061 18 | 0.060606061 19 | 0.03030303 20 | 0.015151515 21 | 0.045454545 22 | 0.015151515 23 | 0.03030303 24 | 0.03030303 25 | 0.075757576 26 | 0.060606061 27 | 0.03030303 28 | 0.060606061 29 | 0.03030303 30 | 0.060606061 31 | 0.03030303 32 | 0.060606061 33 | 0.060606061 34 | 0.075757576 35 | 0.045454545 36 | 0.075757576 37 | 0.03030303 38 | 0.045454545 39 | 0.015151515 40 | 0.121212121 41 | 0.03030303 42 | 0.045454545 43 | 0.045454545 44 | 0.075757576 45 | 0.166666667 46 | 0.03030303 47 | 0.090909091 48 | 0.090909091 49 | 0.090909091 50 | 0.25 51 | 0.147058824 52 | 0.161764706 53 | 0.117647059 54 | 0.191176471 55 | 
0.132352941 56 | -------------------------------------------------------------------------------- /content/01_algorithms/04_exercises/data/moviedb.csv: -------------------------------------------------------------------------------- 1 | ID,Title,Budget,Box_office,Year,Meta_Critic 2 | 1,Amazing spiderman,230,757.9,2012,66 3 | 2,Ironman,140,585.2,2008,57 4 | 3,Thor,150,449.3,2011,54 5 | 4,Captain America: the first avenger,140,370.6,2011,66 6 | 5,Antman,130,519.3,2015,64 7 | 6,Guardians of the Galaxy,232.3,773,2014,76 8 | -------------------------------------------------------------------------------- /content/01_algorithms/04_exercises/data/pieces/p1.csv: -------------------------------------------------------------------------------- 1 | 0,1,0 2 | 1,1,1 3 | 0,1,0 4 | 0,1,0 5 | -------------------------------------------------------------------------------- /content/01_algorithms/04_exercises/data/pieces/p10.csv: -------------------------------------------------------------------------------- 1 | 1,1,1,1,1 2 | 1,0,1,0,1 3 | 1,0,1,0,1 -------------------------------------------------------------------------------- /content/01_algorithms/04_exercises/data/pieces/p2.csv: -------------------------------------------------------------------------------- 1 | 1,1 2 | 1,1 3 | 1,1 4 | -------------------------------------------------------------------------------- /content/01_algorithms/04_exercises/data/pieces/p3.csv: -------------------------------------------------------------------------------- 1 | 1,0,0 2 | 1,1,0 3 | 0,1,0 4 | -------------------------------------------------------------------------------- /content/01_algorithms/04_exercises/data/pieces/p4.csv: -------------------------------------------------------------------------------- 1 | 1 2 | 1 3 | 1 4 | 1 5 | 1 6 | -------------------------------------------------------------------------------- /content/01_algorithms/04_exercises/data/pieces/p5.csv: 
-------------------------------------------------------------------------------- 1 | 0,1,0 2 | 1,1,1 3 | -------------------------------------------------------------------------------- /content/01_algorithms/04_exercises/data/pieces/p6.csv: -------------------------------------------------------------------------------- 1 | 0,1,1 2 | 1,1,1 3 | 1,1,1 4 | 1,1,0 -------------------------------------------------------------------------------- /content/01_algorithms/04_exercises/data/pieces/p7.csv: -------------------------------------------------------------------------------- 1 | 0,1 2 | 0,1 3 | 1,1 -------------------------------------------------------------------------------- /content/01_algorithms/04_exercises/data/pieces/p8.csv: -------------------------------------------------------------------------------- 1 | 1,1,1,1 -------------------------------------------------------------------------------- /content/01_algorithms/04_exercises/data/pieces/p9.csv: -------------------------------------------------------------------------------- 1 | 1,1 2 | 0,1 -------------------------------------------------------------------------------- /content/01_algorithms/04_exercises/dtocs.csv: -------------------------------------------------------------------------------- 1 | dtoc 2 | 4940 3 | 5004 4 | 4588 5 | 4409 6 | 3861 7 | 4597 8 | 4404 9 | 4170 10 | 3910 11 | 4056 12 | 4137 13 | 4228 14 | 4144 15 | 4165 16 | 4150 17 | 4165 18 | 3617 19 | 4094 20 | 4007 21 | 4028 22 | 3954 23 | 3857 24 | 4086 25 | 4031 26 | 3961 27 | 4102 28 | 4115 29 | 3894 30 | 3448 31 | 4188 32 | 4007 33 | 4053 34 | 4046 35 | 4184 36 | 3888 37 | 3961 38 | 4084 39 | 4231 40 | 4147 41 | 4200 42 | 3649 43 | 4221 44 | 4276 45 | 4327 46 | 4207 47 | 4516 48 | 4363 49 | 4612 50 | 4704 51 | 4960 52 | 4930 53 | 5063 54 | 4475 55 | 5221 56 | 4942 57 | 4948 58 | 4739 59 | 4972 60 | 4996 61 | 4888 62 | 5114 63 | 5247 64 | 5330 65 | 5573 66 | 5004 67 | 5777 68 | 5714 69 | 5601 70 | 5852 71 | 5996 72 | 6152 73 | 6361 
74 | 6387 75 | 6759 76 | 6777 77 | 6771 78 | 6167 79 | 7118 80 | 6855 81 | 6648 82 | -------------------------------------------------------------------------------- /content/01_algorithms/04_exercises/ex_templates/ex1_quickstart.py: -------------------------------------------------------------------------------- 1 | #### Define the function heatmap_of_minima(trials, max_rand) here ### 2 | 3 | 4 | def main(): 5 | ''' This is the function that runs your code 6 | 7 | Feel free to try different numbers and see the results 8 | ''' 9 | 10 | # Parameters: 11 | trials = 10000 12 | max_rand = 99999 13 | 14 | # You need to provide this function! 15 | heatmap_of_minima(trials, max_rand) 16 | 17 | 18 | if __name__ == "__main__": 19 | main() 20 | -------------------------------------------------------------------------------- /content/01_algorithms/04_exercises/ex_templates/ex2_quickstart.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | MATH6005 - Lab 4, exercise 2 4 | Solution. 5 | 6 | @author: Carlos Lamas-Fernandez 7 | """ 8 | import os 9 | import numpy as np 10 | import matplotlib.pyplot as plt 11 | 12 | 13 | def main(): 14 | ''' Defines the main parameters and runs the algorithm 15 | ''' 16 | 17 | directory = '../data/pieces' 18 | piece_names = ['p1.csv', 'p2.csv', 'p3.csv', 'p4.csv', 'p5.csv', 19 | 'p6.csv', 'p7.csv', 'p8.csv', 'p9.csv', 'p10.csv'] 20 | 21 | container_width = 6 22 | container_height = 12 23 | 24 | piece_list = read_pieces(directory, piece_names) 25 | 26 | container, packed_pieces = top_left_corner(piece_list, 27 | container_width, 28 | container_height) 29 | # Show final packing 30 | plot_container(container) 31 | 32 | # See if it was possible to pack all pieces: 33 | if packed_pieces < len(piece_list): 34 | print('WARNING: only {0} pieces were packed, out of {1}'. 
35 | format(packed_pieces, len(piece_list))) 36 | 37 | 38 | def top_left_corner(piece_list, container_width, container_height): 39 | ''' Top left corner algorithm 40 | 41 | Keyword arguments: 42 | piece_list -- list of pieces (float numpy arrays) 43 | container_width -- size of the container along the x axis (int) 44 | container_height -- size of the container along the y axis (int) 45 | ''' 46 | 47 | packed_pieces = 0 48 | container = np.zeros((container_height, container_width)) 49 | 50 | for piece in piece_list: 51 | 52 | 53 | # Remove if statement!!! 54 | if (packed_pieces > 0): 55 | break 56 | ### 57 | 58 | 59 | piece_placed = False 60 | 61 | for c_row in range(container_height): 62 | for c_col in range(container_width): 63 | if not pieces_overlap(container, piece, c_row, c_col): 64 | place_piece(container, piece, c_row, c_col) 65 | piece_placed = True 66 | break 67 | else: 68 | continue 69 | 70 | if piece_placed: 71 | break 72 | 73 | if piece_placed: 74 | packed_pieces += 1 75 | 76 | return(container, packed_pieces) 77 | 78 | 79 | def pieces_overlap(container, piece, x_coord, y_coord): 80 | '''Returns true if the piece overlaps with others, false if not 81 | 82 | Keyword arguments: 83 | container -- binary representation of the container (numpy array) 84 | piece -- binary representation of the piece tested (numpy array) 85 | x_coord -- x position in the container (int) 86 | y_coord -- y position in the container (int) 87 | ''' 88 | 89 | # Check for overlap, if found, return True 90 | 91 | return False 92 | 93 | 94 | def place_piece(container, piece, x_coord, y_coord): 95 | ''' Modifies the container to have piece placed at position (x,y) 96 | 97 | Keyword arguments: 98 | container -- binary representation of the container (numpy array) 99 | piece -- binary representation of the piece placed (numpy array) 100 | x_coord -- x position in the container (int) 101 | y_coord -- y position in the container (int) 102 | ''' 103 | 104 | p_x, p_y = piece.shape 105 | 106 
| container_bit = container[x_coord:x_coord + p_x, y_coord:y_coord + p_y] 107 | container_bit += piece 108 | 109 | 110 | def read_pieces(directory, piece_names): 111 | ''' Returns a list of pieces composed of numpy arrays for each of them 112 | 113 | The pieces should be in the directory "directory" and their names listed 114 | in the list "piece_names". It is expected that pieces are CSV files 115 | containing only 0 and 1 values. The function adds a small random number 116 | (i.e. <= 0.4) in order to achieve different colours when plotting the 117 | resulting container. 118 | 119 | Keyword arguments: 120 | directory -- string containing the name of directory of the piece files 121 | piece_names -- list with the names of the piece CSV files 122 | x -- x position in the container (int) 123 | y -- y position in the container (int) 124 | ''' 125 | n_pieces = len(piece_names) 126 | piece_list = [] 127 | for pname in piece_names: 128 | piece = np.loadtxt(directory + os.sep + pname, 129 | delimiter=',', 130 | dtype=np.float64, 131 | ndmin=2) 132 | 133 | # Add a small random number to have different colours in the plot 134 | piece = piece*(1 + 0.1 + (1 + len(piece_list))/n_pieces*0.4) 135 | piece_list.append(piece) 136 | 137 | return piece_list 138 | 139 | 140 | def plot_container(container): 141 | ''' Plots the contents of the container 142 | 143 | To have different colours, loops through the pieces and changes their 144 | value from 1.xx to their index + 2 145 | 146 | Keyword arguments: 147 | container -- binary representation of the container (numpy array) 148 | ''' 149 | 150 | container_for_plot = container 151 | 152 | n_pieces = 0 153 | while True: 154 | min_val = np.min(container_for_plot[container_for_plot > 0]) 155 | if min_val > 2: 156 | break 157 | 158 | container_for_plot[container_for_plot == min_val] = n_pieces + 2 159 | 160 | n_pieces += 1 161 | 162 | plt.imshow(container_for_plot, cmap='tab20b') 163 | 164 | if __name__ == "__main__": 165 | main() 166 | 
-------------------------------------------------------------------------------- /content/01_algorithms/04_exercises/ex_templates/lab4_debug_challenge.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Lab 4 - Debug Challenge 4 | 5 | @author: Carlos Lamas-Fernandez 6 | """ 7 | import numpy as np 8 | import matplotlib.pyplot as plt 9 | 10 | def main(): 11 | ''' Defines the main parameters and runs the algorithm 12 | ''' 13 | 14 | directory = '../data/pieces' 15 | piece_names = ['p1.csv', 'p2.csv', 'p3.csv', 'p4.csv', 'p5.csv', 16 | 'p6.csv', 'p7.csv', 'p8.csv', 'p9.csv', 'p10.csv'] 17 | 18 | piece_list = read_pieces(directory, piece_names) 19 | 20 | 21 | # Print information for the pieces: 22 | for p_i in piece_list: 23 | # Print the piece first: 24 | print('===== PIECE {0}=====') 25 | print_piece(p_i) 26 | 27 | print('Piece rotations:') 28 | for rot in range(4): 29 | print('Rotated {0} degrees:'.format(90*rot) 30 | p_i = rotate_right(p_i) 31 | print_piece(p_i) 32 | 33 | 34 | def rotate_right(piece) 35 | ''' Rotate a piece to the right. 36 | We first transpose it and then flip it horizontally 37 | ''' 38 | rotated_piece = np.transpose(piece) 39 | rotated_piece = np.fliplr(piece) 40 | return(rotated_piece) 41 | 42 | def print_piece(piece): 43 | ''' Shows the piece on screen 44 | ''' 45 | plt.imshow(-1*piece, cmap='gray') 46 | plt.show() 47 | 48 | 49 | 50 | def read_pieces(directory, piece_names): 51 | ''' Returns a list of pieces composed of numpy arrays for each of them 52 | 53 | The pieces should be in the directory "directory" and their names listed 54 | in the list "piece_names". It is expected that pieces are CSV files 55 | containing only 0 and 1 values. The function adds a small random number 56 | (i.e. <= 0.4) in order to achieve different colours when plotting the 57 | resulting container. 
58 | 59 | Keyword arguments: 60 | directory -- string containing the name of directory of the piece files 61 | piece_names -- list with the names of the piece CSV files 62 | x -- x position in the container (int) 63 | y -- y position in the container (int) 64 | ''' 65 | piece_list = [] 66 | for pname in piece_names: 67 | piece = np.loadtxt(directory + os.sep + pname, 68 | delimiter=',', 69 | dtype=np.float64, 70 | ndmin=2) 71 | 72 | piece_list.append(piece) 73 | 74 | return piece_list 75 | 76 | if __name__ == "__main__": 77 | main() 78 | -------------------------------------------------------------------------------- /content/01_algorithms/04_exercises/im/all_overlap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/content/01_algorithms/04_exercises/im/all_overlap.png -------------------------------------------------------------------------------- /content/01_algorithms/04_exercises/im/brb_sol.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/content/01_algorithms/04_exercises/im/brb_sol.png -------------------------------------------------------------------------------- /content/01_algorithms/04_exercises/im/one_piece.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/content/01_algorithms/04_exercises/im/one_piece.PNG -------------------------------------------------------------------------------- /content/01_algorithms/04_exercises/im/only_one_piece.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/content/01_algorithms/04_exercises/im/only_one_piece.png -------------------------------------------------------------------------------- /content/01_algorithms/04_exercises/im/outline_pane.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/content/01_algorithms/04_exercises/im/outline_pane.PNG -------------------------------------------------------------------------------- /content/01_algorithms/04_exercises/im/valid_layout.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/content/01_algorithms/04_exercises/im/valid_layout.png -------------------------------------------------------------------------------- /content/01_algorithms/05_debug.md: -------------------------------------------------------------------------------- 1 | # Debug challenges 2 | 3 | To test your `numpy` skill there are two debug challenges 4 | 5 | 1. Debug a monte-carlo simulation of an acute stroke pathway. -------------------------------------------------------------------------------- /content/01_algorithms/05_debug/00_debug_cv.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | 4 | def synthetic_classification(n_samples=10, n_features=1, shuffle=False, 5 | random_seed=None): 6 | ''' 7 | Generates a simple random synthetic dataset in a given shape. 8 | Used for testing of generator classes. 9 | 10 | X: each feature is a sequence i to i + n_samples where i is the feature no. 11 | y data is 0 or 1 weighted very roughly 50/50. 12 | 13 | These sequences are randomised if shuffle is set to True. 
14 | 15 | No error checking. Assumes all inputs are valid. 16 | 17 | Params: 18 | ------ 19 | n_samples: int, optional (default=10) 20 | The number of samples 21 | 22 | n_features: int, optional (default=1) 23 | The number of features in the classification problem 24 | 25 | shuffle: bool, optional (default=False) 26 | If true then sequences are randomly shuffled 27 | 28 | random_seed: int or None, optional (default=None) 29 | If shuffle then controls the ordering of the sequences generated. 30 | 31 | Returns: 32 | -------- 33 | X, y 34 | Where X and y are np.ndarrays and X will have shape 35 | (n_samples, n_features) 36 | 37 | ''' 38 | X = [[(col * (n_samples)) + row for col in range(n_features)] 39 | for row in range(n_samples)] 40 | y = ([1] * (n_samples // 2)) + ([0] * ((n_samples // 2) + (n_samples % 2))) 41 | 42 | if shuffle: 43 | for lst in [X, y]: 44 | random.seed(random_seed) 45 | random.shuffle(lst) 46 | return np.array(X), np.array(y) 47 | 48 | 49 | class KFold: 50 | ''' 51 | K-fold cross validation of a X, y formatted dataset. 52 | Optional random shuffling of input data. 53 | Note that original data is not shuffled, but a copy of a shuffled 54 | array is created. 55 | ''' 56 | def __init__(self, k=5, shuffle=False, random_seed=None): 57 | ''' 58 | Params: 59 | ------- 60 | k: int 61 | The number of folds 62 | 63 | shuffle: bool, optional (default=False) 64 | When True the data are randomly shuffled 65 | 66 | random_seed: int or None, optional (default=None) 67 | When shuffle set to true and random_seed is an integer the shuffling 68 | of the dataset is controlled prior to folding. 69 | ''' 70 | self.k = k 71 | self.shuffle = shuffle 72 | self.rng = np.random.default_rng(random_seed) 73 | 74 | def __repr__(self): 75 | rep = f'KFoldCV(k={self.k}, shuffle={self.shuffle},' \ 76 | + f'random_seed={self.random_seed})' 77 | 78 | 79 | def get_n_splits(self, X): 80 | ''' 81 | Return an integer representing the number of splits that 82 | will be generated. 
83 | 84 | ''' 85 | return self.k 86 | 87 | def split(self, X, y): 88 | ''' 89 | Generator method. Returns incremental splits of the dataset 90 | on each call. 91 | 92 | Params: 93 | ------ 94 | X: array-like 95 | python list or numpy.ndarray containing X data. For multiple features 96 | shape should be (n_samples, n_features) 97 | 98 | y: array-like 99 | python list or numpy.ndarray containing y target data. For multiple 100 | targets shape should be (n_samples, n_targets) 101 | 102 | Returns: 103 | -------- 104 | train_X, test_X, train_y, test_y 105 | 106 | Where each is a np.ndarray 107 | ''' 108 | # convert lists to numpy arrays 109 | X, y = np.asarray(X), np.asarray(y) 110 | 111 | # store the indexes of each element - its these that get shuffled. 112 | if self.shuffle: 113 | idx = self.rng.integers(0, len(X), size=len(X)) 114 | else: 115 | idx = np.arange(len(X), dtype=np.int16) 116 | 117 | # length of k - 1 splits... final split continues to end. 118 | split_len = int(len(X) / (self.k)) 119 | 120 | for test_idx in range(0, len(X), split_len): 121 | 122 | # create k - 1 training folds for X 123 | train_X = self._fold_training_data(X, idx, test_idx, split_len) 124 | # X test data for fold 125 | test_X = X[idx[test_idx: test_idx+split_len]] 126 | 127 | # create k - 1 training segments for y 128 | train_y = self._fold_training_data(y, idx, test_idx, split_len) 129 | # y test data fold 130 | test_y = y[idx[test_idx: test_idx+split_len]] 131 | 132 | yield train_X, test_X, train_y, test_y 133 | 134 | 135 | def _fold_training_data(self, data, idx, test_idx, split_len): 136 | ''' 137 | create training segments for X or y 138 | ''' 139 | train_seg1 = data[idx[:test_idx]] 140 | train_seg2 = data[idx[test_idx + split_len: ]] 141 | return np.concatenate([train_seg1, train_seg2]) 142 | 143 | 144 | if __name__ == '__main__': 145 | # generate test dataset 146 | X, y = synthetic_classification(n_samples=10, n_features=1, shuffle=False) 147 | 148 | # create an instance of 
LeaveNOut 149 | cv = KFold(k=5) 150 | 151 | # basic cross validation loop. 152 | # I've zipped together a range and the splits into order to get fold no. 153 | for i, split_data in zip(range(cv.get_n_splits(X)), cv.split(X, y)): 154 | train_X, train_y, test_X, test_y = split_data 155 | print(f'Fold {i+1}:\nTrain:\tX:{train_X}, y:{train_y}') 156 | print(f'Test:\tX:{test_X}, y:{test_y}') 157 | 158 | 159 | 160 | -------------------------------------------------------------------------------- /content/01_algorithms/06_solutions.md: -------------------------------------------------------------------------------- 1 | # Solutions -------------------------------------------------------------------------------- /content/01_algorithms/06_solutions/data/breach.csv: -------------------------------------------------------------------------------- 1 | breaches 2 | 33184 3 | 41151 4 | 47414 5 | 46436.42857 6 | 89917.28571 7 | 72889.28571 8 | 46942.85714 9 | 59172.71429 10 | 57379.85714 11 | 54805.42857 12 | 51701.71429 13 | 44885.42857 14 | 45076.57143 15 | 55392.42857 16 | 62452.71429 17 | 59131.85714 18 | 72832 19 | 73746.57143 20 | 85427.14286 21 | 66947 22 | 64561 23 | 61520.71429 24 | 61439.28571 25 | 56433.57143 26 | 54607 27 | 59110.14286 28 | 68868.42857 29 | 70014.28571 30 | 97186.57143 31 | 94561.42857 32 | 94112.28571 33 | 120995 34 | 120121.7143 35 | 64823.28571 36 | 56340 37 | 65588.71429 38 | 68396.14286 39 | 74543.71429 40 | 79250 41 | 70781.14286 42 | 84049.28571 43 | 83995.71429 44 | 89101.28571 45 | 85828.71429 46 | 88957.14286 47 | 102189.7143 48 | 92863.71429 49 | 96272 50 | 88879.71429 51 | 99849.71429 52 | 118978.1429 53 | 119731.1429 54 | 192155 55 | 153156.4286 56 | 132867.8571 57 | 141194 58 | 125799.2857 59 | 110284.4286 60 | 99324 61 | 97475 62 | 106475 63 | 123111 64 | 147554 65 | 163472 66 | 168604 67 | 216286 68 | 229081 69 | 265834 70 | 186122 71 | 201329 72 | 184912 73 | 201973 74 | 174419 75 | 182597 76 | 219137 77 | 221713 78 | 268818 79 | 281612 
80 | 216416 81 | 201392 82 | -------------------------------------------------------------------------------- /content/01_algorithms/06_solutions/data/dtocs.csv: -------------------------------------------------------------------------------- 1 | dtoc 2 | 4940 3 | 5004 4 | 4588 5 | 4409 6 | 3861 7 | 4597 8 | 4404 9 | 4170 10 | 3910 11 | 4056 12 | 4137 13 | 4228 14 | 4144 15 | 4165 16 | 4150 17 | 4165 18 | 3617 19 | 4094 20 | 4007 21 | 4028 22 | 3954 23 | 3857 24 | 4086 25 | 4031 26 | 3961 27 | 4102 28 | 4115 29 | 3894 30 | 3448 31 | 4188 32 | 4007 33 | 4053 34 | 4046 35 | 4184 36 | 3888 37 | 3961 38 | 4084 39 | 4231 40 | 4147 41 | 4200 42 | 3649 43 | 4221 44 | 4276 45 | 4327 46 | 4207 47 | 4516 48 | 4363 49 | 4612 50 | 4704 51 | 4960 52 | 4930 53 | 5063 54 | 4475 55 | 5221 56 | 4942 57 | 4948 58 | 4739 59 | 4972 60 | 4996 61 | 4888 62 | 5114 63 | 5247 64 | 5330 65 | 5573 66 | 5004 67 | 5777 68 | 5714 69 | 5601 70 | 5852 71 | 5996 72 | 6152 73 | 6361 74 | 6387 75 | 6759 76 | 6777 77 | 6771 78 | 6167 79 | 7118 80 | 6855 81 | 6648 82 | -------------------------------------------------------------------------------- /content/01_algorithms/06_solutions/data/lysis.csv: -------------------------------------------------------------------------------- 1 | per_treated 2 | 0.018867925 3 | 0.03030303 4 | 0.018867925 5 | 0.018867925 6 | 0.060606061 7 | 0.018867925 8 | 0.045454545 9 | 0.045454545 10 | 0.018867925 11 | 0.075757576 12 | 0.045454545 13 | 0.015151515 14 | 0.03030303 15 | 0.018867925 16 | 0.015151515 17 | 0.060606061 18 | 0.060606061 19 | 0.03030303 20 | 0.015151515 21 | 0.045454545 22 | 0.015151515 23 | 0.03030303 24 | 0.03030303 25 | 0.075757576 26 | 0.060606061 27 | 0.03030303 28 | 0.060606061 29 | 0.03030303 30 | 0.060606061 31 | 0.03030303 32 | 0.060606061 33 | 0.060606061 34 | 0.075757576 35 | 0.045454545 36 | 0.075757576 37 | 0.03030303 38 | 0.045454545 39 | 0.015151515 40 | 0.121212121 41 | 0.03030303 42 | 0.045454545 43 | 0.045454545 44 | 0.075757576 45 | 
0.166666667 46 | 0.03030303 47 | 0.090909091 48 | 0.090909091 49 | 0.090909091 50 | 0.25 51 | 0.147058824 52 | 0.161764706 53 | 0.117647059 54 | 0.191176471 55 | 0.132352941 56 | -------------------------------------------------------------------------------- /content/01_algorithms/06_solutions/data/moviedb.csv: -------------------------------------------------------------------------------- 1 | ID,Title,Budget,Box_office,Year,Meta_Critic 2 | 1,Amazing spiderman,230,757.9,2012,66 3 | 2,Ironman,140,585.2,2008,57 4 | 3,Thor,150,449.3,2011,54 5 | 4,Captain America: the first avenger,140,370.6,2011,66 6 | 5,Antman,130,519.3,2015,64 7 | 6,Guardians of the Galaxy,232.3,773,2014,76 8 | -------------------------------------------------------------------------------- /content/01_algorithms/data/hist.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/content/01_algorithms/data/hist.csv -------------------------------------------------------------------------------- /content/01_algorithms/data/minor_illness_ed_attends.csv: -------------------------------------------------------------------------------- 1 | attends_rate_per_10k_pop 2 | 2.11927795 3 | 3.490575446 4 | 3.989229081 5 | 2.368604767 6 | 3.241248629 7 | 2.867258402 8 | 3.11658522 9 | 2.742594994 10 | 3.615238855 11 | 3.615238855 12 | 4.363219308 13 | 3.11658522 14 | 3.739902264 15 | 2.243941358 16 | 3.241248629 17 | 1.620624314 18 | 2.243941358 19 | 2.991921811 20 | 2.368604767 21 | 2.368604767 22 | 2.368604767 23 | 3.11658522 24 | 2.493268176 25 | 2.368604767 26 | 3.11658522 27 | 2.742594994 28 | 3.739902264 29 | 1.994614541 30 | 2.867258402 31 | 1.620624314 32 | 2.991921811 33 | 3.365912037 34 | 1.620624314 35 | 3.739902264 36 | 2.742594994 37 | 3.11658522 38 | 2.493268176 39 | 2.368604767 40 | 3.739902264 41 | 3.864565673 42 | 1.745287723 43 | 4.238555899 44 | 
2.368604767 45 | 3.615238855 46 | 1.994614541 47 | 2.11927795 48 | 2.991921811 49 | 2.617931585 50 | 2.243941358 51 | 2.368604767 52 | 3.490575446 53 | 2.368604767 54 | 2.867258402 55 | 2.991921811 56 | 2.867258402 57 | 2.867258402 58 | 2.991921811 59 | 3.241248629 60 | 2.617931585 61 | 3.11658522 62 | 3.11658522 63 | 2.11927795 64 | 3.864565673 65 | 2.867258402 66 | 3.989229081 67 | 5.111199761 68 | 2.867258402 69 | 2.867258402 70 | 2.493268176 71 | 3.739902264 72 | 3.739902264 73 | 1.869951132 74 | 2.11927795 75 | 3.615238855 76 | -------------------------------------------------------------------------------- /content/01_algorithms/data/salaries.csv: -------------------------------------------------------------------------------- 1 | ID,Age,"Gender (0=Female, 1=Male)",Salary 2 | 0,22,0,72000 3 | 1,47,0,27000 4 | 2,35,0,36000 5 | 3,33,1,19000 6 | 4,34,1,104000 7 | 5,65,1,86000 8 | 6,54,1,104000 9 | 7,19,0,21000 10 | 8,62,1,52000 11 | -------------------------------------------------------------------------------- /content/01_algorithms/data/salaries_extended.csv: -------------------------------------------------------------------------------- 1 | Name,ID,Age,Department,Salary 2 | John,0,22,Marketing,72000 3 | Anna,1,47,Marketing,27000 4 | Joseph,2,35,Sales,36000 5 | Mary,3,33,Logistics,19000 6 | Anthony,4,34,Logistics,104000 7 | Claire,5,65,Logistics,86000 8 | Bernard,6,54,Logistics,104000 9 | Sarah,7,19,Sales,21000 10 | Nick,8,62,Sales,52000 11 | -------------------------------------------------------------------------------- /content/01_algorithms/im/gsearch.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/content/01_algorithms/im/gsearch.PNG -------------------------------------------------------------------------------- /content/01_algorithms/im/salaries.PNG: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/content/01_algorithms/im/salaries.PNG -------------------------------------------------------------------------------- /content/01_algorithms/im/salaries_extended.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/content/01_algorithms/im/salaries_extended.PNG -------------------------------------------------------------------------------- /content/02_stat_prog/01_pandas_front_page.md: -------------------------------------------------------------------------------- 1 | # Data wrangling 2 | 3 | `pandas` is a data science package orignally developed by Wes McKinney. It builds on top of `numpy` to provide a higher level API for wrangling, analysing and visualising data. It is also closely coupled to matplotlib with a number of shorthand methods to create plots of data. 4 | 5 | Our labs on `pandas` will cover beginner and intermediate techniques in data wrangling, manipulation and visualisation. There is also an exercise on creating a reproducible pipeline for downloading and efficiently storing a large data file in memory. 
6 | 7 | 8 | -------------------------------------------------------------------------------- /content/02_stat_prog/02_matplotlib/explore.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/content/02_stat_prog/02_matplotlib/explore.png -------------------------------------------------------------------------------- /content/02_stat_prog/02_matplotlib/stacked.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/content/02_stat_prog/02_matplotlib/stacked.png -------------------------------------------------------------------------------- /content/02_stat_prog/02_visual_front_page.md: -------------------------------------------------------------------------------- 1 | # Visualising data -------------------------------------------------------------------------------- /content/02_stat_prog/03_exercises/03_visualise_ts.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Visualising time series data" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import pandas as pd\n", 17 | "import numpy as np\n", 18 | "import matplotlib.pyplot as plt" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "**Step 1: Import emergency department reattendance data.** \n", 26 | "\n", 27 | "This is a time series from a hospital that measures the number of patients per month that have reattended an ED within 7 days of a previous attendance.\n", 28 | "\n", 29 | "This can be found in **\"data/ed_reattend.csv\"**\n", 30 | "or \n", 31 | 
"'https://raw.githubusercontent.com/hsma-master/hsma/master/12_forecasting/data/ed_reattend.csv'\n", 32 | "\n", 33 | "* **Hint 1**: look back at the lecture notes and see how `pd.read_csv()` was used. \n", 34 | "\n", 35 | "* **Hint 2**: The format of the 'date' column is in UK standard dd/mm/yyyy. You will need to set the `dayfirst=True` of `pd.read_csv()` to make sure pandas interprets the dates correctly.\n", 36 | "\n", 37 | "* **Hint 3**: The data is monthly and the dates are all the first day of the month. This is called monthly start and its shorthand is 'MS'" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 1, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "#your code here\n", 47 | "url = 'https://raw.githubusercontent.com/hsma-master/hsma/master/' \\\n", 48 | " + '12_forecasting/data/ed_reattend.csv'" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "**Step 2: Check the shape of the `DataFrame` and print out the first 5 observations**" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "#your code here" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "**Step 3: Check the minimum and maximum date of the series**\n", 72 | "\n" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "#your code here" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "**Step 4: Create a basic plot of the time series**" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "#your code here" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "**Step 5: Improve the appearance of your chart**\n", 105 | " 
\n", 106 | "Try the following:\n", 107 | " \n", 108 | "* Add a y-axis label\n", 109 | "* Add gridlines to the plot\n", 110 | "* Add markers to block\n", 111 | "* Change the colour of the line\n", 112 | "* Experiment with using seaborn" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "#your code here" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "**Step 6: Perform a calender adjustment**\n", 129 | "\n", 130 | "The data is at the monthly level. Therefore some of the noise in the time series is due to the differing number of days per month. Perform a calender adjust and plot the daily rate of reattendance." 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "metadata": {}, 137 | "outputs": [], 138 | "source": [ 139 | "#your code here" 140 | ] 141 | }, 142 | { 143 | "cell_type": "markdown", 144 | "metadata": {}, 145 | "source": [ 146 | "**Step 7: Run a smoother through the series to assess trend**\n", 147 | "\n", 148 | "Hint: Try using the `.rolling` method of dataframe with a `window=12` and `center=True` to create a 12 month centred moving average \n", 149 | "\n", 150 | "Is there any benefit from switchoing to a 6 month MA? Why does the 6-MA look different to the 12-MA.\n", 151 | "\n", 152 | "Use the calender adjusted data." 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [ 161 | "#your code here" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "**Step 8: Perform a seasonal decomposition on the time series**\n", 169 | "\n", 170 | "Plot the trend, seasonal and remainder components of the decomposition.\n", 171 | "\n", 172 | "Try both an additive and multiplicative model. 
What is the difference between the two models?\n", 173 | "\n", 174 | "* Hint: Look back at the lecture for a function to help you.\n", 175 | "\n", 176 | "\n" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": null, 182 | "metadata": {}, 183 | "outputs": [], 184 | "source": [ 185 | "#your code here" 186 | ] 187 | } 188 | ], 189 | "metadata": { 190 | "kernelspec": { 191 | "display_name": "Python 3 (ipykernel)", 192 | "language": "python", 193 | "name": "python3" 194 | }, 195 | "language_info": { 196 | "codemirror_mode": { 197 | "name": "ipython", 198 | "version": 3 199 | }, 200 | "file_extension": ".py", 201 | "mimetype": "text/x-python", 202 | "name": "python", 203 | "nbconvert_exporter": "python", 204 | "pygments_lexer": "ipython3", 205 | "version": "3.11.9" 206 | } 207 | }, 208 | "nbformat": 4, 209 | "nbformat_minor": 4 210 | } 211 | -------------------------------------------------------------------------------- /content/02_stat_prog/03_exercises/data/sw_imaging.csv: -------------------------------------------------------------------------------- 1 | region,org_code,provider,imaging_type,n_referrals,mdn_days_rtt,mdn_days_ttr 2 | Y58,REF,Royal Cornwall Hospitals NHS Trust,Computerized Axial Tomography,46160,3.0,0.0 3 | Y58,REF,Royal Cornwall Hospitals NHS Trust,Diagnostic Ultrasonography,72985,14.0,0.0 4 | Y58,REF,Royal Cornwall Hospitals NHS Trust,Fluoroscopy,12320,0.0,0.0 5 | Y58,REF,Royal Cornwall Hospitals NHS Trust,Magnetic Resonance Imaging,27535,40.0,5.0 6 | Y58,REF,Royal Cornwall Hospitals NHS Trust,Nuclear Medicine Procedure,4245,28.0,1.0 7 | Y58,REF,Royal Cornwall Hospitals NHS Trust,Plain Radiography,198295,0.0,1.0 8 | Y58,REF,Royal Cornwall Hospitals NHS Trust,Single Photon Emission Computerized Tomography,425,28.0,0.0 9 | Y58,RH8,Royal Devon and Exeter NHS Foundation Trust,Computerized Axial Tomography,40870,3.0,0.0 10 | Y58,RH8,Royal Devon and Exeter NHS Foundation Trust,Diagnostic Ultrasonography,40070,14.0,0.0 11 | 
Y58,RH8,Royal Devon and Exeter NHS Foundation Trust,Fluoroscopy,4745,16.0,0.0 12 | Y58,RH8,Royal Devon and Exeter NHS Foundation Trust,Magnetic Resonance Imaging,22380,23.0,7.0 13 | Y58,RH8,Royal Devon and Exeter NHS Foundation Trust,Nuclear Medicine Procedure,3465,14.0,1.0 14 | Y58,RH8,Royal Devon and Exeter NHS Foundation Trust,Plain Radiography,161850,0.0,1.0 15 | Y58,RH8,Royal Devon and Exeter NHS Foundation Trust,Positron Emission Tomography,1000,9.0,4.0 16 | Y58,RH8,Royal Devon and Exeter NHS Foundation Trust,Single Photon Emission Computerized Tomography,545,17.0,2.0 17 | Y58,RA9,Torbay and South Devon NHS Foundation Trust,Computerized Axial Tomography,31165,3.0,0.0 18 | Y58,RA9,Torbay and South Devon NHS Foundation Trust,Diagnostic Ultrasonography,42690,13.0,0.0 19 | Y58,RA9,Torbay and South Devon NHS Foundation Trust,Fluoroscopy,8230,0.0,0.0 20 | Y58,RA9,Torbay and South Devon NHS Foundation Trust,Magnetic Resonance Imaging,14730,14.0,2.0 21 | Y58,RA9,Torbay and South Devon NHS Foundation Trust,Nuclear Medicine Procedure,1975,20.0,1.0 22 | Y58,RA9,Torbay and South Devon NHS Foundation Trust,Plain Radiography,110505,0.0,0.0 23 | Y58,RA9,Torbay and South Devon NHS Foundation Trust,Positron Emission Tomography,810,14.0,2.0 24 | Y58,RA9,Torbay and South Devon NHS Foundation Trust,Single Photon Emission Computerized Tomography,90,33.0,5.5 25 | Y58,RK9,University Hospitals Plymouth NHS Trust,Computerized Axial Tomography,54575,2.0,0.0 26 | Y58,RK9,University Hospitals Plymouth NHS Trust,Diagnostic Ultrasonography,89160,2.0,0.0 27 | Y58,RK9,University Hospitals Plymouth NHS Trust,Fluoroscopy,11500,0.0,0.0 28 | Y58,RK9,University Hospitals Plymouth NHS Trust,Magnetic Resonance Imaging,27210,30.0,3.0 29 | Y58,RK9,University Hospitals Plymouth NHS Trust,Nuclear Medicine Procedure,5030,23.0,0.0 30 | Y58,RK9,University Hospitals Plymouth NHS Trust,Plain Radiography,188830,0.0,3.0 31 | Y58,RK9,University Hospitals Plymouth NHS Trust,Positron Emission 
Tomography,270,14.0,1.0 32 | Y58,RK9,University Hospitals Plymouth NHS Trust,Single Photon Emission Computerized Tomography,1285,0.0,0.0 33 | -------------------------------------------------------------------------------- /content/02_stat_prog/03_exercises/data/total_referrals.csv: -------------------------------------------------------------------------------- 1 | provider,n_referrals 2 | Royal Cornwall Hospitals NHS Trust,361965 3 | Royal Devon and Exeter NHS Foundation Trust,274925 4 | Torbay and South Devon NHS Foundation Trust,210195 5 | University Hospitals Plymouth NHS Trust,377860 6 | -------------------------------------------------------------------------------- /content/02_stat_prog/03_exercises/hosp_1_ed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/content/02_stat_prog/03_exercises/hosp_1_ed.png -------------------------------------------------------------------------------- /content/02_stat_prog/03_exercises_front_page.md: -------------------------------------------------------------------------------- 1 | # Exercises 2 | 3 | `pandas` is a data science package orignally developed by Wes McKinney. It builds on top of `numpy` to provide a higher level API for wrangling, analysing and visualising data. It is also closely coupled to matplotlib with a number of shorthand methods to create plots of data. 4 | 5 | Our lectures on `pandas` will cover beginner and intermediate techniques in data wrangling, manipulation and visualisation. 
6 | -------------------------------------------------------------------------------- /content/02_stat_prog/04_solutions/total_referrals.csv: -------------------------------------------------------------------------------- 1 | provider,n_referrals 2 | Royal Cornwall Hospitals NHS Trust,361965 3 | Royal Devon and Exeter NHS Foundation Trust,274925 4 | Torbay and South Devon NHS Foundation Trust,210195 5 | University Hospitals Plymouth NHS Trust,377860 6 | -------------------------------------------------------------------------------- /content/02_stat_prog/04_solutions_front_page.md: -------------------------------------------------------------------------------- 1 | # Solutions 2 | 3 | The following sections provide example solutions to the statistical programming exercises. There are often many ways to solve data wrangling and visualisation problems. The solutions provided should be taken as guides only. If you feel you have a better way feel free to raise an issue and suggest your solution is adopted instead! -------------------------------------------------------------------------------- /content/03_mgt/01_git/01_why.md: -------------------------------------------------------------------------------- 1 | # The case for version control 2 | 3 | ```{admonition} "Wait.. you DON'T use version control for your code?!" 4 | I'm going to be very honest and say that I find it odd that I still need to have a conversation with people who code about basic version control. I can't really understand why it isn't taught routinely and second nature to everyone. Its actually best not to admit you don't use it when I'm in the room. 5 | 6 | I'm old, and in my professional career, I can't remember a time I didn't use version control software. I think its so important for data science that I'm going to admit a few embarrassing (in a geeky sort of way) things about my history with version control. 
My first admission is that I first ventured into the world of version control in the summer of 2001 for a summer VB6(!) coding job. At the time I was doing my undergrad in Computer Science and let me tell you - I made a lot of mistakes in my coding! My second admission is that I used to be an avid MS Windows user (so sorry Stallman) and from around 2003-2008 I used TortoiseSVN. This is a GUI extension of the excellent subversion software for version control. I switched to equally excellent Git after I returned to university to study for a PhD (introduced courtesy of the Warwick Computing Society) where I used it to control R, S-PLUS(!), and C# code. It was not until June 28th 2011 that I pushed my first commit to the now famous GitHub (some dodgy C# code to automate a commercial simulation package via the Windows Common Object Model - yuk!). 7 | 8 | You don't need to remember any of that, just take home the message, that I'm pro version control for one single reason. Part of any data science study is carefully controlling and managing your code. If you don't then you will fail to get it producing the same results or perhaps even working again in 6 months time! **You should view your code as a first class citizen in data science. Do your code, yourself and others a favour - use version control.** 9 | ``` 10 | 11 | ## Why use version control? 12 | 13 | ### Scenario 1 14 | 15 | Consider a scenario where you take up a position as a data scientist in a government organisation. On your first day you are told that your predecessor has left already, but all code needed for your job is saved to the server. 
You log in and have a look in the directory: 16 | 17 | ``` 18 | uber_import_gov_proj 19 | ├── 20190320_main_v2.py 20 | ├── archive 21 | │   ├── 20190504_v3_main_not_final.py 22 | │   ├── tests_before_fix.py 23 | │   ├── v1_main.py 24 | │   └── v3_main_final.py 25 | ├── v2_main_20190320.py 26 | ├── v3_main_final.py 27 | ├── v2v3_main_final_TM_MP_MA_DC(MA_conflicted_copy).py 28 | └── v3_main_final_TMonks_Conflicted_Copy.py 29 | ``` 30 | Take a moment to take in the mess of this project. Perhaps you can laugh about it. The questions you should be asking yourself are: 31 | 32 | * have you ever ended up in a mess like this even though you have had the most noble of intentions at the start of a project? 33 | * have you ever worked with someone who has managed work in this way? 34 | 35 | In my experience this sort of structure turns up surprisingly often, for all sorts of data science and non-data science projects. It is certainly more common than a cleanly organised data science project. This is a totally unnecessary situation. With version control we actually only need this structure: 36 | 37 | ``` 38 | uber_import_gov_proj 39 | ├── main.py 40 | ``` 41 | 42 | ### Scenario 2 43 | 44 | Even though the code is a complete mess, you are still working for that government organisation several months later. It is a Monday morning and you stroll into work with the intention of trying again to work out if you should run the analysis code in `v3_main_final.py` or `archive/v3_main_final.py`. But alas your plans are interrupted! Some organisation critical code originally written years ago, by an analyst long since departed, failed to run over the weekend. It's your job to fix it! You open up the code and after the initial horror of finding its a single 'god function' with a repeating verbose code, begin to try and make sense of the problem. 
Your initial findings are: 45 | 46 | * Its clear from comments in the code that it has been modified by several people over the years, but it is not clear how many times, who the coders were, what the changes were made and in what order. 47 | * There's no 'archive' folder listing older versions of the code and no documentation. So there's no way to roll back changes. 48 | * There's no code to test if the main analysis code runs as expected. 49 | 50 | Before you laugh again this is actually a situation I found myself in many years ago. It wasn't fun (at all - especially as I had lots of people checking if "I'd fixed it yet?" quite frequently). It did turn out that a change had introduced the bug under a given set of conditions. So, after quite a while, I fixed what turned out to be an extremely important piece of code for the organisation. There was no version control system in place so I carefully documented the changes both in the code via comments and in external documentation. 51 | 52 | Can you think of any software that's open source and free that would have made this a bit easier? 53 | -------------------------------------------------------------------------------- /content/03_mgt/01_git/02_git.md: -------------------------------------------------------------------------------- 1 | # Introducing Git 2 | 3 | Congratulations, you have reached **a very important topic** in your data science studies! Before we get into **Git** I want to acknowledge there other high quality version control tools available for your python or code in any other language; for example, subversion. 4 | 5 | Git is a distributed version control system for files. Git was originally developed by Linus Torvolds (who famously created the **Linux kernel**). 6 | 7 | > The origin of the name Git is quite amusing. I will leave you to look these stories up and enjoy! 
I'm going to teach you about Git not because it's the best, but because **it is software I use on a daily basis** and also it's the software I see most researchers using (which isn't that many!).
For example, with this project git has a log that reports: 54 | 55 | ```bash 56 | commit 4be943efd265dd58020d64af770ff63d229fd8d8 (HEAD -> master) 57 | Author: Tom Monks 58 | Date: Mon Aug 2 16:13:24 2021 +0100 59 | 60 | MAIN: added do_something_else() 61 | 62 | commit 2e09f233e392448fdcf82b3c8ed45cd8a72c3e0e 63 | Author: Tom Monks 64 | Date: Mon Aug 2 16:11:58 2021 +0100 65 | 66 | MAIN: main.py -> do_something() 67 | 68 | ``` 69 | 70 | In the output above the first line ends with `(HEAD -> master)`. This the the latest commit (the **head**). 71 | 72 | Git commits track the changes to files between commits (or the history of changes to a file). We can view changes between specific commits. This is called the **difference** or **diff**. For example for our two simple commits Git outputs: 73 | 74 | ```shell 75 | diff --git a/main.py b/main.py 76 | index 38056d8..431d04e 100644 77 | --- a/main.py 78 | +++ b/main.py 79 | @@ -6,5 +6,9 @@ main module 80 | def do_something(): 81 | pass 82 | 83 | +def do_something_else(): 84 | + pass 85 | + 86 | if __name__ == '__main__': 87 | do_something() 88 | + do_something_else() 89 | 90 | ``` 91 | 92 | This output is designed to be fairly intuitive. The `+` at the start of a line indicates that this is new code in the second commit. This is incredibly helpful when you need to understand what has changed and how this might affect an analysis (or introduce bugs). If we had removed a line of code the it would have been prefixed with a `-`. 93 | 94 | > Side note: commits become harder to followed the more changes are included. So try to avoid huge commits where many many files and lines of code have been changed. Commit often and thoughtfully. 95 | 96 | Now imagine a scenario where you arrive at work the next day, re-run your analysis code and realise that you have made a mistake in the modified `main.py`: do_something_else() is not needed and in fact the new code has broken the original analysis. 
You need to roll back to the first iteration of the code that you know works. This is called a **rollback**. After the rollback the git log looks like: 97 | 98 | ```bash 99 | commit 2e09f233e392448fdcf82b3c8ed45cd8a72c3e0e (HEAD -> master) 100 | Author: Tom Monks 101 | Date: Mon Aug 2 16:11:58 2021 +0100 102 | ``` 103 | Referring back to our previous log we can see that the `HEAD` is now the original commit. Indeed `main.py` has reverted to: 104 | 105 | ```python 106 | ''' 107 | main module 108 | 109 | ''' 110 | 111 | def do_something(): 112 | pass 113 | 114 | if __name__ = '__main__': 115 | do_something() 116 | ``` 117 | 118 | > There are various ways to rollback. In this case I've demoed what is called a a **hard** reset. In practice, for a bug, you may want to do a **soft** reset or even safer a **revert + restore**. 119 | 120 | ## Distributed? 121 | 122 | A defining feature of Git is that it is **distributed**. This means that each git **repository** is a complete history of a project and that multiple users are required to merge their changes together. 123 | 124 | ## Other Git resources 125 | 126 | In addition to the resources in this book I very much recommend exploring the Git material provided by [Software Carpentry](https://swcarpentry.github.io/git-novice/). This is wonderful novice friendly material that is open and free to use. There's also the main [git website](https://git-scm.com/) which includes a free copy of an excellent git book. 127 | 128 | 129 | -------------------------------------------------------------------------------- /content/03_mgt/01_git/04_cs_2.md: -------------------------------------------------------------------------------- 1 | # Case study: roll back and bug fix 2 | 3 | This case study assumes you have worked through case study 1 and have a the following project that is a git repo. 
4 | 5 | ```bash 6 | analysis_code 7 | ├── main.py 8 | ├── readme.md 9 | ├── run_1.log 10 | └── run_2.log 11 | ``` 12 | 13 | Issuing the `git log --oneline` command you should get the following history. 14 | 15 | > Remember that your commit hash values will be unique to your repo. 16 | 17 | ```bash 18 | 6475df6 DOCS: run instructions 19 | 186d91b SETUP: .gitignore + *.log 20 | d404939 INIT: add main.py 21 | ``` 22 | 23 | ## Scenario 24 | 25 | You have written an analysis program. This code executes each night on critical patient level data. At a recent meeting a senior researcher requested a new subgroup analysis in the code. You know how to do this and quickly make the change to the data. Unfortunately your change introduces an unexpected bug into the main analysis. Given the importance of the code, it is necessary to roll back to the previous version that works with no problems while a fix is found. 26 | 27 | To simulate this, you are going to commit a change to an existing code base that results in a bug. We will explore using git to undo the change by rolling back to a previous commit and then fixing the bug. 28 | 29 | ## Step 1: Modify and commit changes to`main.py` 30 | 31 | Let's make a change to the `main.py` and commit it to the repo. The twist is that our new modifications are going to contain a hidden and sneaky bug! 32 | 33 | ```{admonition} This would never happen in real life 34 | You may be reading this and thinking - I will never commit a bug to the main branch of my git repo because all of my code is tested and triple checked beforehand. If so good for you. It never happens to me either... ahem. 
35 | ``` 36 | 37 | ```python 38 | ''' 39 | main module 40 | 41 | ''' 42 | 43 | def do_something(): 44 | print('Friendly code') 45 | 46 | def do_something_else(): 47 | print('This is a major bug!') 48 | 49 | if __name__ == '__main__': 50 | do_something() 51 | do_something_else() 52 | 53 | ``` 54 | 55 | The git command to commit is 56 | 57 | ```bash 58 | git status 59 | git add main.py 60 | git status 61 | git commit -m "MAIN:+do_something_else() extends analysis" 62 | git log -2 --oneline 63 | ``` 64 | 65 | > Note that I checked the status of my repo before and after staging a file. 66 | 67 | The final git command requests a one line summary of the last two commits: 68 | 69 | ```bash 70 | e1c4fd3 MAIN:+do_something_else() extends analysis 71 | 6475df6 DOCS: run instructions 72 | ``` 73 | ## Step 2: Finding the bug 74 | 75 | When you run the analysis code: 76 | 77 | > This assumes you are in the `analysis_code` directory that contains `main.py` 78 | 79 | ```bash 80 | python3 main.py 81 | ``` 82 | You are shocked to receive the following output: 83 | 84 | ```shell 85 | Friendly code 86 | This is a major bug! 87 | ``` 88 | This is a major problem for your project. This code runs every night and needs to be rolled back to a previous working version. Luckily this is simple because it is only 1 commit previous. 89 | 90 | ## Step 3: Reviewing changes 91 | 92 | Before we undo the last commit to the repo let's have quick look at what changes were actually made to `main.py`. We can do that by using commit id or by using `HEAD~1` where `~1` refers to 1 commit previous. 93 | 94 | ```bash 95 | git diff HEAD~1 main.py 96 | ``` 97 | In English, this asks git for the difference in main.py one commit previous. 
This results in: 98 | 99 | ```shell 100 | diff --git a/main.py b/main.py 101 | index 38056d8..d1c50df 100644 102 | --- a/main.py 103 | +++ b/main.py 104 | @@ -4,7 +4,11 @@ main module 105 | 106 | 107 | def do_something(): 108 | - pass 109 | + print('Friendly code') 110 | 111 | -if __name__ = '__main__': 112 | +def do_something_else(): 113 | + print('This is a major bug!') 114 | + 115 | +if __name__ == '__main__': 116 | do_something() 117 | + do_something_else() 118 | 119 | ``` 120 | 121 | ## Step 3: Rolling back 122 | 123 | In my view "undo" operations in git can be some of the most confusing because there is more than one way to do it. 124 | 125 | ### `git revert` 126 | 127 | Here we will take a safe option and **git revert** a commit. This command creates a **new commit** and reverses changes made in a previous commit. It is safe because **you don't lose any history**. The old buggy commit remains and you can access the code within it. 128 | 129 | The commit we want to revert is the last one. To be clear its is the commit that contains the code that introduced the bug. We first look up its commit hash: 130 | 131 | ```bash 132 | git log -2 --oneline 133 | ``` 134 | 135 | ```bash 136 | e1c4fd3 (HEAD -> master) MAIN:+do_something_else() extends analysis 137 | 6475df6 DOCS: run instructions 138 | ``` 139 | 140 | and then issue the revert command referencing e1c4fd3 141 | 142 | > a reminder again that this is the commit that introduced the bug! 143 | 144 | ```bash 145 | git revert e1c4fd3 146 | ``` 147 | 148 | When you do this you will be prompted to add a commit message. One is provided for you by git. I'm just going to accept it as is. 149 | 150 | ```nano 151 | Revert "MAIN:+do_something_else() extends analysis" 152 | 153 | This reverts commit e1c4fd3ce836f6fe1f7df3a6d1fb805209a790d8. 154 | ``` 155 | 156 | After reverting git we can check `main.py` and find that it has returned to a bug free state! 
157 | 158 | ```python 159 | ''' 160 | main module 161 | 162 | ''' 163 | 164 | def do_something(): 165 | pass 166 | 167 | if __name__ = '__main__': 168 | do_something() 169 | ``` 170 | 171 | We can also confirm that our history is intact and `git revert` has created a new commit by `git log -3 --oneline` 172 | 173 | ```bash 174 | 73fcaa5 (HEAD -> master) Revert "MAIN:+do_something_else() extends analysis" 175 | e1c4fd3 MAIN:+do_something_else() extends analysis 176 | 6475df6 DOCS: run instructions 177 | 178 | ``` 179 | 180 | 181 | 182 | 183 | -------------------------------------------------------------------------------- /content/03_mgt/01_git/05_cs_3.md: -------------------------------------------------------------------------------- 1 | # Case study: Branching and merging 2 | 3 | > This case study assumes you have worked through case studies 1 and 2. 4 | 5 | A key feature of version control is the ability to create a **branch** seperate from the main code that are used for development. Once the code within a branch is complete it is **merged** into the repo's master branch. 6 | 7 | ```{admonition} You should always use branches even when working alone. 8 | Strictly speaking it is best practice to use a branch to create new features and conduct testing. We have ignored branching so far to keep our introductory material as simple as possible. Ideally you always have a clean (and as best you can bug free) main (master) branch that you or your users/colleagues use for production. Obviously if you are working on your own then simplicity is key. A minimum safe option is to create a `dev` branch where you work towards a new version of your code (perhaps a group of new features) and then merge when everything is complete and tested. 9 | ``` 10 | 11 | ## Scenario 12 | Now that you reverted the master branch to the last working version the code will run correctly every night. In parrallel you will create a new branch and fix the bug in the modified code. 
Once you are satisfied it works
58 | 59 | ```bash 60 | $ git restore --source e1c4fd3 main.py 61 | $ git add main.py 62 | $ git commit -m "FIX: restore main.py to e1c4fd3" 63 | ``` 64 | The file `main.py`, including the bug, has now been restored, but it only in the `dev` branch. 65 | 66 | ```python 67 | ''' 68 | main module 69 | 70 | ''' 71 | 72 | def do_something(): 73 | print('Friendly code') 74 | 75 | def do_something_else(): 76 | print('This is a major bug!') 77 | 78 | if __name__ == '__main__': 79 | do_something() 80 | do_something_else() 81 | ``` 82 | 83 | ## Step 3: Fix the bug 84 | 85 | The fixed `main.py` now looks like 86 | 87 | ```python 88 | ''' 89 | main module 90 | 91 | Fix issued by TM. 92 | ''' 93 | 94 | def do_something(): 95 | print('Friendly code') 96 | 97 | def do_something_else(): 98 | print('Expected value') 99 | 100 | if __name__ == '__main__': 101 | do_something() 102 | do_something_else() 103 | ``` 104 | 105 | It's the end of the working day. You haven't had time to test your fix, so you won't risk merging it with master. But you stage and commit it to `dev` before you leave. 106 | 107 | ```bash 108 | $ git add main.py 109 | $ git commit -m "FIX: do_something_else() patched." 110 | ``` 111 | 112 | You can also quickly switch back to master (`git switch master`) we can confirm the code in `main.py` that will run overnight is the original version. 113 | 114 | ## Step 4: Test and merge the code. 115 | 116 | The next day you have plenty of time to test the new code works. Switching to the `dev` branch we can run 117 | 118 | ```bash 119 | $ python3 main.py 120 | 121 | Friendly code 122 | Expected value 123 | ``` 124 | This confirms everything has worked as expected so we can **merge** the new version of the code into master. 125 | 126 | To complete the merge we need to switch to the `master` branch and use the `merge` command. 
127 | 128 | ```bash 129 | $ git switch master 130 | $ git merge dev 131 | 132 | Updating 73fcaa5..6fcbf0d 133 | Fast-forward 134 | main.py | 10 ++++++++-- 135 | 1 file changed, 8 insertions(+), 2 deletions(-) 136 | ``` 137 | 138 | Just to prove to ourselves that the merge has worked we can check the log. 139 | 140 | ```bash 141 | $ git log -4 --oneline 142 | 143 | 6fcbf0d (HEAD -> master, dev) FIX: do_something_else() patched. 144 | 2e9cfcd FIX: restore main.py to e1c4fd3 145 | 73fcaa5 Revert "MAIN:+do_something_else() extends analysis" 146 | e1c4fd3 MAIN:+do_something_else() extends analysis 147 | ``` 148 | and that's it you now have updated your code base. 149 | -------------------------------------------------------------------------------- /content/03_mgt/02_packaging/my_package_name/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = '0.1.0' 2 | __author__ = 'Tom Monks' -------------------------------------------------------------------------------- /content/03_mgt/02_packaging/my_package_name/datasets.py: -------------------------------------------------------------------------------- 1 | ''' 2 | The datasets module. 
3 | 4 | Dummy module for illustration 5 | ''' -------------------------------------------------------------------------------- /content/03_mgt/02_packaging/my_package_name/package_data/example_datset_1.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/content/03_mgt/02_packaging/my_package_name/package_data/example_datset_1.csv -------------------------------------------------------------------------------- /content/03_mgt/02_packaging/my_package_name/package_data/example_datset_2.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/content/03_mgt/02_packaging/my_package_name/package_data/example_datset_2.csv -------------------------------------------------------------------------------- /content/03_mgt/02_packaging/my_package_name/plotting.py: -------------------------------------------------------------------------------- 1 | ''' 2 | The plotting module. 
3 | 4 | Dummy module for illustration 5 | ''' 6 | 7 | def plt_diagostics(): 8 | '''dummy function just for illustration''' 9 | pass -------------------------------------------------------------------------------- /content/03_mgt/02_packaging/ts_emergency/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = '0.1.0' 2 | __author__ = 'Tom Monks' -------------------------------------------------------------------------------- /content/03_mgt/02_packaging/ts_emergency/data/.~lock.ts_ed.csv#: -------------------------------------------------------------------------------- 1 | ,tom,pop-os.localdomain,16.07.2021 15:37,file:///home/tom/.config/libreoffice/4; -------------------------------------------------------------------------------- /content/03_mgt/02_packaging/ts_emergency/datasets.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Functions to load built in datasets for ts_emergency. 3 | Datasets are downloaded from an external github repo. 4 | 5 | The key loading function is load_ed_ts 6 | ''' 7 | 8 | import pandas as pd 9 | import numpy as np 10 | 11 | LONG_URL = 'https://raw.githubusercontent.com/health-data-science-OR/' \ 12 | + 'hpdm139-datasets/main/syn_ts_ed_long.csv' 13 | 14 | WIDE_URL = 'https://raw.githubusercontent.com/health-data-science-OR/' \ 15 | + 'hpdm139-datasets/main/syn_ts_ed_wide.csv' 16 | 17 | def load_ed_ts(data_format='wide', as_pandas=True): 18 | ''' 19 | Load the built-in ED dataset 20 | 21 | Params: 22 | ------ 23 | data_format: str 24 | 'Wide' or 'long' format. Wide format provides hospital columns. 25 | Long format provides a categorical hospital column and single attends 26 | column. 27 | 28 | as_pandas: bool, optional (default = True) 29 | Return as `pandas.Dataframe`. 
If False then `numpy.ndarray` 30 | 31 | Returns: 32 | ------- 33 | pandas.Dataframe or if `as_pandas=False` then returns `numpy.ndarray` 34 | 35 | ''' 36 | valid_formats = ['wide', 'w', 'long', 'l'] 37 | 38 | if data_format.lower() not in valid_formats: 39 | raise ArgumentError(f'data format should be one of {valid_formats}') 40 | 41 | if data_format == 'wide' or data_format == 'w': 42 | df = _ed_data_to_wide(LONG_URL) 43 | else: 44 | df = _ed_data_to_long(WIDE_URL) 45 | 46 | if as_pandas: 47 | return df 48 | else: 49 | return df.to_numpy() 50 | 51 | 52 | 53 | def _ed_data_to_wide(file_path): 54 | ''' 55 | Return the ED data in wide format. 56 | 57 | 1. Pivot table 58 | 2. Transpose and drop the ('attends', hosp_i) multi-index 59 | 3. Rename columns [0, 1, 2, 4] tp ['hosp_1', 'hosp_2', 'hosp_3', 'hosp_4'] 60 | 4. Index to DateTimeIndex 61 | 5. Drop the additional uneeded series 'date' (as stored in index as well) 62 | 6. Convert attendence numbers from int64 to int16 63 | 64 | Params: 65 | ------ 66 | file_path: str 67 | Path to wide format file 68 | 69 | Returns: 70 | ------- 71 | pandas.DataFrame 72 | ''' 73 | # column name transfers 74 | translated_names = {0:'hosp_1', 75 | 1:'hosp_2', 76 | 2:'hosp_3', 77 | 3:'hosp_4'} 78 | 79 | data_types = {'hosp_1':np.int16, 80 | 'hosp_2':np.int16, 81 | 'hosp_3':np.int16, 82 | 'hosp_4':np.int16} 83 | 84 | df = (pd.read_csv(file_path) 85 | .pivot_table(values=['attends'], index=['date'], columns=['hosp']) 86 | .T.reset_index(drop=True) 87 | .T.rename(columns=translated_names) 88 | .assign(date=lambda x: pd.to_datetime(x.index)) 89 | .set_index('date') 90 | .astype(data_types) 91 | ) 92 | 93 | return df 94 | 95 | 96 | 97 | def _ed_data_to_long(file_path): 98 | ''' 99 | Return the ED data in long format. Uses pd.wide_to_long() 100 | Assume wide format file is used. 101 | 102 | 1. pd.wide_to_long() 103 | 2. reset_index() to remove multi-index 104 | 3. rename col 'hosp_' to 'attends' 105 | 4. date to datetime 106 | 5. 
Convert attendence numbers from int64 to int16 amd hosp_id to int8. 107 | (could also be a categorical field.) 108 | 109 | Params: 110 | ------ 111 | file_path: str 112 | Path to wide format file 113 | 114 | Returns: 115 | ------- 116 | pandas.DataFrame 117 | ''' 118 | 119 | translated_names = {'hosp_':'attends'} 120 | data_types = {'hosp': np.int8, 'attends':np.int16} 121 | 122 | long_df = ( 123 | pd.wide_to_long(pd.read_csv(file_path), stubnames='hosp_', 124 | i=['date'], j='hosp') 125 | .reset_index() 126 | .rename(columns=translated_names) 127 | .assign(date=lambda x: pd.to_datetime(x['date'])) 128 | .astype(data_types) 129 | ) 130 | 131 | return long_df 132 | -------------------------------------------------------------------------------- /content/03_mgt/02_packaging/ts_emergency/plotting.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | 3 | DEFAULT_LABEL_FS = 12 4 | DEFAULT_AXIS_FS = 12 5 | DEFAULT_FIGSIZE = (12,8) 6 | 7 | def plot_single_ed(wide_df, hosp_id, figsize=(12,3), 8 | fontsize=DEFAULT_LABEL_FS, line_width=2): 9 | ''' 10 | Plot a single ED's data 11 | Assumes data are passed in wide format. 12 | 13 | Params: 14 | ------- 15 | wide_df: pandas.Dataframe 16 | ED time series data in wide format 17 | 18 | hosp_id: str 19 | name of hospital column to plot e.g. 
'hosp_1' 20 | 21 | figsize: tuple(int, int), optional (default=(12,3)) 22 | `matplotlib` figure size 23 | 24 | fontsize: int, optional (default=DEFAULT_LABEL_FS) 25 | Size of label font 26 | 27 | line_width: int 28 | Width of the line plot 29 | 30 | Returns: 31 | ------- 32 | matplotlib fig, ax 33 | 34 | ''' 35 | fig = plt.figure(figsize=figsize) 36 | ax = fig.add_subplot() 37 | ax.set_xlabel("Date", fontsize=fontsize) 38 | ax.set_ylabel("Attendances", fontsize=fontsize) 39 | 40 | _ = ax.plot(wide_df[hosp_id], lw=line_width) 41 | # include x, y grid 42 | _ = ax.grid(ls='--') 43 | 44 | # set size of x, y ticks 45 | _ = ax.tick_params(axis='both', labelsize=fontsize) 46 | 47 | # return the figure 48 | return fig, ax 49 | 50 | 51 | def plot_eds(wide_df, figsize=DEFAULT_FIGSIZE, label_font_size=DEFAULT_LABEL_FS, 52 | axis_font_size=DEFAULT_AXIS_FS): 53 | ''' 54 | Plot all ED's attendances in a 1x4 grid layout. 55 | 56 | Params: 57 | ------ 58 | wide_df: pandas.Dataframe 59 | ED time series data in wide format 60 | 61 | figsize: tuple(int, int), optional (default=(12,3)) 62 | `matplotlib` figure size 63 | 64 | label_font_size: int, optional (default=DEFAULT_LABEL_FS) 65 | Size of label font 66 | 67 | axis_font_size: int, optional (default=DEFAULT_AXIS_FS) 68 | Size of axis tick font 69 | 70 | Returns: 71 | -------- 72 | matplotlib fig 73 | ''' 74 | 75 | fig, axs = plt.subplots(nrows=4, ncols=1, tight_layout=True, figsize=(12,8), 76 | sharex=True) 77 | 78 | # note that axs is a 2D array 79 | for hosp_idx in range(0, 4): 80 | _ = axs[hosp_idx].plot(wide_df[f'hosp_{hosp_idx+1}']) 81 | _ = axs[hosp_idx].set_title(f'Hospital {hosp_idx+1}', 82 | fontsize=label_font_size) 83 | _ = axs[hosp_idx].grid(ls='--') 84 | 85 | # axis labels matplotlib >=3.4 86 | AXIS_LABEL_SIZE = 12 87 | _ = fig.supylabel('ED Attendances', fontsize=axis_font_size) 88 | _ = fig.supxlabel('Date', fontsize=axis_font_size) 89 | 90 | return fig 
-------------------------------------------------------------------------------- /content/03_mgt/03_mgt_front_page.md: -------------------------------------------------------------------------------- 1 | # Deployment 2 | 3 | So you now know how to code in Python and a few of its data science and machine learning extensions. Congratualations! The truth is, however, that this is only the beginning of your journey in health data science. When you start your first job, perhaps in genomics or building machine learning models for a big company, you need to know how to manage and deploy code. 4 | 5 | In the final part of the book we will look at: 6 | 7 | 1. Local and remote version control 8 | 2. Setting up and orgaising local python packages 9 | 3. Deploying python packages on the Python Package Index (PyPi) 10 | 11 | 12 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /content/03_mgt/03_pypi/02_github.md: -------------------------------------------------------------------------------- 1 | # Install from GitHub 2 | 3 | The steps we followed in the [last section](./01_local.md) have led to an installable package. So far we have installed the package from a local repository. To make installation even easier for our users we will now set things up so they can install the package on GitHub. This is a good option for early development of the package. You can have an remotely installable package, without the need to make things more complicated by using PyPI. 4 | 5 | As a reminder we have the following basic package structure: 6 | 7 | ``` 8 | analysis-package 9 | ├── analysis_package 10 | │ ├── __init__.py 11 | │ ├── model.py 12 | │ ├── data 13 | │ | ├── model_data.csv 14 | ├── tests 15 | │ ├── test_model.py 16 | ├── LICENSE 17 | ├── environment.yml 18 | ├── README.md 19 | └── pyproject.toml 20 | ``` 21 | 22 | > As a reminder that `pyproject.toml` is the key to allowing our package to be installed via `pip`. 
23 | 24 | To make this package installable on GitHub we need to create a GitHub repository and **push** our repository to it. The example repository for `package-template` is available [here](https://github.com/health-data-science-OR/analysis-package). 25 | 26 | To install from GitHub we need to activate the Python environment that we wish to install into e.g. 27 | 28 | ```bash 29 | conda activate hds_code 30 | ``` 31 | 32 | and then we issue the following command: 33 | 34 | ```bash 35 | pip install git+https://github.com/health-data-science-OR/analysis-package@main 36 | ``` 37 | 38 | The below is an exert from the output generated by the modified pip install. It reveals how the process differs from the local install! In summary, the **main branch** of the repository is **cloned** to your local machine (stored in a temporary directory). Once the repository has been downloaded the normal pip install process proceeds along with dependency installation. 39 | 40 | ```bash 41 | Collecting git+https://github.com/health-data-science-OR/package-template@main 42 | Cloning https://github.com/health-data-science-OR/package-template (to revision main) to /tmp/pip-req-build-raw3ilfx 43 | Running command git clone --filter=blob:none --quiet https://github.com/health-data-science-OR/package-template /tmp/pip-req-build-raw3ilfx 44 | Resolved https://github.com/health-data-science-OR/package-template to commit cc91d307285b9f10f9cab8cc8290525d84637352 45 | Installing build dependencies ... done 46 | Getting requirements to build wheel ... done 47 | Preparing metadata (pyproject.toml) ... 
done 48 | Collecting matplotlib>=3.1.3 (from analysis_package==0.1.0) 49 | ``` 50 | 51 | In general to use GitHub for installations we issue a modification of the following command: 52 | 53 | ```bash 54 | pip install git+https://github.com/user/repo.git@branch_or_tag 55 | ``` 56 | 57 | 58 | 59 | 60 | 61 | -------------------------------------------------------------------------------- /content/03_mgt/03_pypi/03_pypi.md: -------------------------------------------------------------------------------- 1 | # Publish a package on PyPI 2 | 3 | The first thing to say is that **there is a PyPI test site!** It is called [TestPyPI](https://testpypi.python.org) and is incredibly helpful and I recommend you make use of it while learning instead of the main PyPI site. I found that I made several mistakes the first time I attempted publication (and still do with new packages!). The great thing about TestPyPI is that once your package is published you can install it, just like you would if it were on the production PyPI site. 4 | 5 | ```{admonition} pyOpenSci.org: An (excellent) alternative guide to PyPI 6 | :class: information, dropdown 7 | While I was updating version 3 of this book I came across an excellent easy to follow guide to python package publishing on PyPI [available on pyopensci.org](https://www.pyopensci.org/python-package-guide/tutorials/publish-pypi.html). Its another good source of information if you want a different perspective. 8 | ``` 9 | ## IMPORTANT: Use a unique package name 10 | 11 | Before you proceed any further I recommend visiting PyPI and TestPyPI searching for your package name. You might find a package with an identical name. If an identically named package exists then you need to **rename your package, before attempting any of the steps outlined in this section**. 12 | 13 | In the past I have also regretted using a similar name to existing packages. Don't make the same mistakes as me! 
Keep things simple for yourself: make your package name distinctive. 14 | 15 | If you do rename your package then make sure you must update the following: 16 | 17 | 1. The `pyproject.toml` meta-data 18 | 2. The package directory 19 | 3. The GitHub repository 20 | 21 | ## Setting up TestPyPI 22 | 23 | ### Get a TestPyPI account 24 | 25 | You need to go to https://test.pypi.org and create an account. You should be greeted by a webpage similar to the below. Note that the banner that is making clear you are on the test site. As part of the account creation process you will be required to setup two factor authentication. 26 | 27 | ![testpypi](../../../images/testpypi.png) 28 | 29 | ### API Tokens 30 | 31 | Rather than use your username and password to upload to TestPyPI you need to use an **API token**. Tokens come in two levels of scope: 32 | 33 | * **account wide**: an API token scoped to your entire account will have upload permissions for all of your current and future projects. 34 | * **project specific**: this is self explanatory an API token that allows uploads of a specific package. 35 | 36 | There's a catch to this framework! You can only create a project specific token for an existing project. This means that for a new package you need to use an account wide token. Once the package is uploaded you can then create the project specific token. 37 | 38 | To create a token head to account settings and select **Create API token**. You should be presented with a page similar to the below. I've selected account wide token and chosen the name "new_package_uploads" so that its use is clear. Click on **Create Token** 39 | 40 | ![testpypi](../../../images/test_pypi2.png) 41 | 42 | You will then be shown the generated token. **IMPORTANT** - you need to save this token to a very safe place. You won't be shown it again and you don't want to share it with others as it can access all projects in your account. 
The token will take the following form: `pypi-[random string]` 43 | 44 | ### Using hatch to publish to and TestPyPI 45 | 46 | To publish on PyPI you need to upload a source tarball and wheel distribution. If you need a reminder of what a wheel is head over to the [introduction to installable packages](./01_local.md). To generate these files issue the following command in the top level of repo directory: 47 | 48 | ```bash 49 | hatch build 50 | ``` 51 | 52 | This will create a new directory `dist/` containing the source and wheel files. 53 | 54 | ```bash 55 | ├── dist 56 | │   ├── analysis_package-0.1.0-py3-none-any.whl 57 | │   └── analysis_package-0.1.0.tar.gz 58 | ``` 59 | 60 | You are not ready to upload! Have your account wide API token to hand. To publish to `TestPyPI` we simply run 61 | 62 | ```bash 63 | hatch publish -r test 64 | ``` 65 | 66 | * You will be prompted for a username enter `__token__` 67 | * You will then be prompted for the API token. Paste in your API token (this won't be displayed). 68 | 69 | The package will then be uploaded to TestPyPI and `hatch` will inform you if this has been successful. If it has you will be prompted with a URL to the TestPyPI page for your new package. For example, https://test.pypi.org/project/analysis_package/0.1.0/. Go take a look at your page! 70 | 71 | ### Install your package from TestPyPI 72 | 73 | On your web page will be a special TestPyPI link to install your package. This looks a bit different from production PyPI, but has the same result. For example, for `analysis_package` we install it as follows: 74 | 75 | ```bash 76 | # let's intall into the hds_code env 77 | conda activate hds_code 78 | 79 | # pip install analysis_package from TestPyPI 80 | pip install -i https://test.pypi.org/simple/analysis_package==0.1.0 81 | ``` 82 | 83 | ### Recommended: create a package specific API token. 
84 | 85 | Now that you have created your package, I recommend logging back into your TestPyPI account and creating a package specific API token. It is more secure to work with API tokens that are specific to packages (especially if working in a team or group when developing the work). This avoids accidental uploads to different packages. To do this select **projects**, your project (e.g. `analysis_package`) and then **settings**. 86 | 87 | ![testpypi](../../../images/test_pypi3.png) 88 | 89 | ### Publish on PyPI production 90 | 91 | First I just want to say that you should not publish on the main production PyPI platform unless it is needed. Use PyPI when necessary to help your own research, work or colleagues, but not for testing purposes: use TestPyPI instead. **You will need a separate account for PyPI.**. If you intend to publish to PyPI then you need to follow all of the same steps we used for setting up TestPyPI. 92 | 93 | When you are ready to upload there is a different `hatch` command to publish: 94 | 95 | ```bash 96 | hatch publish 97 | ``` -------------------------------------------------------------------------------- /content/03_mgt/03_pypi/04_automation.md: -------------------------------------------------------------------------------- 1 | # Automation 2 | 3 | The manual upload steps I've outlined in the [PyPI section](./03_pypi.md) are somewhat historical. We know that most modern projects make use of version control in the cloud such as GitLab or GitHub. These tools include ways to automatically publish updates to PyPI. 4 | 5 | ## GitHub Actions 6 | 7 | One option to automate publication of updates to a PyPI package is a GitHub action. An action can be described as a job or a workflow that runs when certain events are triggered. For example, when code is pushed to a remote repository or when a new release is created. To be clear 8 | actions aren't part of the package - they are instead a tools for continuous integration of code. 
They help the package managers do repetitive tasks needed for maintenance and publishing efficiently and consistently. 9 | 10 | Actions are specified in YAML (Yet Another Markup Language). And are actually quite straightforward to read. GitHub has a large number of templates available you can use and adapt. 11 | 12 | You can read more about GitHub actions [here](https://docs.github.com/en/actions) 13 | 14 | ## Automating package publication to PyPI 15 | 16 | The YAML below is an action that is used to automate the updating of a package on PyPI. It is triggered when a new **release** of the code is made on the main branch. 17 | 18 | ```yaml 19 | name: Upload Python Package 20 | 21 | on: 22 | release: 23 | types: [published] 24 | 25 | permissions: 26 | contents: read 27 | 28 | jobs: 29 | deploy: 30 | 31 | runs-on: ubuntu-latest 32 | 33 | steps: 34 | - uses: actions/checkout@v3 35 | - name: Set up Python 36 | uses: actions/setup-python@v3 37 | with: 38 | python-version: '3.x' 39 | - name: Install dependencies 40 | run: | 41 | python -m pip install --upgrade pip 42 | pip install hatch 43 | - name: Build package 44 | run: hatch build 45 | - name: Publish package 46 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 47 | with: 48 | user: __token__ 49 | password: ${{ secrets.PYPI_TOKEN }} 50 | ``` 51 | 52 | To set this up you will need to supply GitHub with the API token for your package. Its stored securely in GitHub in what is called a **Secret**. In the YAML above you see the final line uses `${{ secrets.PYPI_TOKEN }}`. This means I have named my secret that stores the API project token as `PYPI_TOKEN`. It is essential to create and use a package specific API token for PyPI (or TestPyPI). Do not use an account wide token. 53 | 54 | This action runs on a new release of the code. A release is version of the package that follows the {major}.{minor}.{patch} (e.g. 
v1.1.2) naming convention we introduced when first learning how to [structure a local python package](./01_local.md). For simplicity I recommend ensuring release numbering matching the package version you have in `__init__.py`. For any package on GitHub you can see the current version on the landing page. For example, for a package I am developing called `sim-tools` you can see the current version highlighted in the screenshot below. 55 | 56 | ![release](../../../images/release.png) 57 | 58 | To create a new release is simple. Click on the **Releases** link highlighted above followed by **Draft new release**. You can read more about releases [here](https://docs.github.com/en/repositories/releasing-projects-on-github/managing-releases-in-a-repository) 59 | 60 | ## Where are actions stored? 61 | 62 | When we have added added an action to GitHub our repo looks slightly different. We now have a new directory called `.github` that contains the YAML file describing the action. 63 | 64 | ``` 65 | analysis-package 66 | ├── .github 67 | │ ├── workflows 68 | │ | ├── publish_package.yml 69 | ├── analysis_package 70 | │ ├── __init__.py 71 | │ ├── model.py 72 | │ ├── data 73 | │ | ├── model_data.csv 74 | ├── tests 75 | │ ├── test_model.py 76 | ├── LICENSE 77 | ├── environment.yml 78 | ├── README.md 79 | └── pyproject.toml 80 | ``` 81 | 82 | -------------------------------------------------------------------------------- /content/03_mgt/03_pypi/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Tom Monks 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 
10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /content/03_mgt/03_pypi/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.txt 2 | recursive-include test_package/data *.csv 3 | -------------------------------------------------------------------------------- /content/03_mgt/03_pypi/environment.yml: -------------------------------------------------------------------------------- 1 | name: pypi_package_dev 2 | channels: 3 | - defaults 4 | dependencies: 5 | - jupyterlab=1.2.6 6 | - matplotlib=3.1.3 7 | - numpy=1.18.1 8 | - pandas=1.0.1 9 | - pip=20.0.2 10 | - pytest=5.3.5 11 | - python=3.8.1 12 | - scipy=1.4.1 13 | - seaborn=0.10.0 14 | - pip: 15 | - pytest-cov==2.10.0 16 | - setuptools>=51.1.2 17 | - twine>=3.3.0 18 | - wheel>0.36.2 19 | -------------------------------------------------------------------------------- /content/03_mgt/03_pypi/requirements.txt: -------------------------------------------------------------------------------- 1 | matplotlib>=3.1.3 2 | numpy>=1.18.1 3 | pandas>=1.0.1 4 | scipy>=1.4.1 5 | seaborn>=0.10.0 6 | -------------------------------------------------------------------------------- /content/03_mgt/03_pypi/setup.py: 
-------------------------------------------------------------------------------- 1 | import setuptools 2 | from test_package import __version__ 3 | 4 | # Read in the requirements.txt file 5 | with open("requirements.txt") as f: 6 | requirements = [] 7 | for library in f.read().splitlines(): 8 | requirements.append(library) 9 | 10 | with open("README.md", "r") as fh: 11 | long_description = fh.read() 12 | 13 | setuptools.setup( 14 | name="pypi-template_2222", 15 | #there must be an way to auto tick up the version number... 16 | version=__version__, 17 | author="Thomas Monks", 18 | #I've created a specific email account before and forwarded to my own. 19 | author_email="generic@genericemail.com", 20 | license="The MIT License (MIT)", 21 | description="A short, but useful description to appear on pypi", 22 | #read in from readme.md and will appear on PyPi 23 | long_description=long_description, 24 | long_description_content_type="text/markdown", 25 | url="https://github.com/TomMonks/pypi-template", 26 | packages=setuptools.find_packages(), 27 | #if true look in MANIFEST.in for data files to include 28 | include_package_data=True, 29 | #2nd approach to include data is include_package_data=False 30 | package_data={"test_package": ["data/*.csv"]}, 31 | #these are for documentation 32 | classifiers=[ 33 | "Programming Language :: Python :: 3.6", 34 | "Programming Language :: Python :: 3.7", 35 | "Programming Language :: Python :: 3.8", 36 | "License :: OSI Approved :: MIT License", 37 | "Operating System :: OS Independent", 38 | ], 39 | python_requires='>=3.6.9', 40 | install_requires=requirements, 41 | ) 42 | -------------------------------------------------------------------------------- /content/03_mgt/03_pypi/test_package/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | test_package 3 | 4 | Part of a repo containing boilerplate code for publishing 5 | on PyPi. 
6 | 7 | """ 8 | __version__ = '0.1.0' 9 | __author__ = 'Thomas Monks' -------------------------------------------------------------------------------- /content/03_mgt/03_pypi/test_package/data/test_data.csv: -------------------------------------------------------------------------------- 1 | "10", "20", "30" 2 | "40", "50", "60" -------------------------------------------------------------------------------- /content/03_mgt/03_pypi/test_package/test.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/content/03_mgt/03_pypi/test_package/test.py -------------------------------------------------------------------------------- /content/03_mgt/03_vc_front_page.md: -------------------------------------------------------------------------------- 1 | # Version control 2 | 3 | So you now know how to code in Python and a few of its data science and machine learning extensions. Congratualations! The truth is, however, that this is only the beginning of your journey in health data science. When you start your first job, perhaps in genomics or building machine learning models for a big company, you need to know how to manage and deploy code. 4 | 5 | In the first part of this topic we will look at version control for source code. 6 | 7 | ## The case for version control 8 | 9 | ```{admonition} "Wait.. you DON'T use version control for your code?!" 10 | I'm going to be very honest and say that I find it odd that I still need to have a conversation with people who code about basic version control. I can't really understand why it isn't taught routinely and second nature to everyone. Its actually best not to admit you don't use it when I'm in the room. 11 | 12 | I'm old, and in my professional career, I can't remember a time I didn't use version control software. 
I think its so important for data science that I'm going to admit a few embarrassing (in a geeky sort of way) things about my history with version control. My first admission is that I first ventured into the world of version control in the summer of 2001 for a summer VB6(!) coding job. At the time I was doing my undergrad in Computer Science and let me tell you - I made a lot of mistakes in my coding! My second admission is that I used to be an avid MS Windows user (so sorry Stallman) and from around 2003-2008 I used TortoiseSVN. This is a GUI extension of the excellent subversion software for version control. I switched to equally excellent Git after I returned to university to study for a PhD (introduced courtesy of the Warwick Computing Society) where I used it to control R, S-PLUS(!), and C# code. It was not until June 28th 2011 that I pushed my first commit to the now famous GitHub (some dodgy C# code to automate a commercial simulation package via the Windows Common Object Model - yuk!). 13 | 14 | You don't need to remember any of that, just take home the message, that I'm pro version control for one single reason. Part of any data science study is carefully controlling and managing your code. If you don't then you will fail to get it producing the same results or perhaps even working again in 6 months time! **You should view your code as a first class citizen in data science. Do your code, yourself and others a favour - use version control.** 15 | ``` 16 | 17 | ## Why use version control? 18 | 19 | ### Scenario 1 20 | 21 | Consider a scenario where you take up a position as a data scientist in a government organisation. On your first day you are told that your predecessor has left already, but all code needed for your job is saved to the server. 
You log in and have a look in the directory: 22 | 23 | ``` 24 | uber_import_gov_proj 25 | ├── 20190320_main_v2.py 26 | ├── archive 27 | │ ├── 20190504_v3_main_not_final.py 28 | │ ├── tests_before_fix.py 29 | │ ├── v1_main.py 30 | │ └── v3_main_final.py 31 | ├── v2_main_20190320.py 32 | ├── v3_main_final.py 33 | ├── v2v3_main_final_TM_MP_MA_DC(MA_conflicted_copy).py 34 | └── v3_main_final_TMonks_Conflicted_Copy.py 35 | ``` 36 | Take a moment to take in the mess of this project. Perhaps you can laugh about it. The questions you should be asking yourself are: 37 | 38 | * have you ever ended up in a mess like this even though you have had the most noble of intentions at the start of a project? 39 | * have you ever worked with someone who has managed work in this way? 40 | 41 | In my experience this sort of structure turns up surprisingly often, for all sorts of data science and non-data science projects. It is certainly more common than a cleanly organised data science project. This is a totally unnecessary situation. With version control we actually only need this structure: 42 | 43 | ``` 44 | uber_import_gov_proj 45 | ├── main.py 46 | ``` 47 | 48 | ### Scenario 2 49 | 50 | Even though the code is a complete mess, you are still working for that government organisation several months later. It is a Monday morning and you stroll into work with the intention of trying again to work out if you should run the analysis code in `v3_main_final.py` or `archive/v3_main_final.py`. But alas your plans are interrupted! Some organisation-critical code originally written years ago, by an analyst long since departed, failed to run over the weekend. It's your job to fix it! You open up the code and after the initial horror of finding it's a single 'god function' with repeating, verbose code, begin to try and make sense of the problem. 
Your initial findings are: 51 | 52 | * It's clear from comments in the code that it has been modified by several people over the years, but it is not clear how many times, who the coders were, what changes were made and in what order. 53 | * There's no 'archive' folder listing older versions of the code and no documentation. So there's no way to roll back changes. 54 | * There's no code to test if the main analysis code runs as expected. 55 | 56 | Before you laugh again this is actually a situation I found myself in many years ago. It wasn't fun (at all - especially as I had lots of people checking if "I'd fixed it yet?" quite frequently). It did turn out that a change had introduced the bug under a given set of conditions. So, after quite a while, I fixed what turned out to be an extremely important piece of code for the organisation. There was no version control system in place so I carefully documented the changes both in the code via comments and in external documentation. 57 | 58 | Can you think of any software that's open source and free that would have made this a bit easier? 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | -------------------------------------------------------------------------------- /content/03_mgt/04_exercises/02_conda.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "ad63482d-4247-4589-a572-68345afa9ae5", 6 | "metadata": {}, 7 | "source": [ 8 | "# Using `conda`\n", 9 | "\n", 10 | "To help you with your deployment either via PyPi, Binder or handing over a local python package it is a good idea to improve your `conda` package manager skills and working with conda virtual environments. \n", 11 | "\n", 12 | "> If you are working on a Windows OS I recommend running these commands from Anaconda prompt. 
If you are working on a Mac or a Linux machine then use a terminal.\n", 13 | "\n", 14 | "## Exercise 1\n", 15 | "\n", 16 | "* List the `conda` environments on your computer.\n", 17 | "\n", 18 | "```\n", 19 | "conda env list\n", 20 | "```\n", 21 | "\n", 22 | "## Exercise 2\n", 23 | "\n", 24 | "* By default you will be in the `base` conda environment. List the packages installed.\n", 25 | "\n", 26 | "```bash\n", 27 | "conda list\n", 28 | "```\n", 29 | "\n", 30 | "## Exercise 3\n", 31 | "\n", 32 | "Let's practice creating an empty environment, activating it, checking it is empty and then remove it.\n", 33 | "\n", 34 | "\n", 35 | "* Create an empty conda environment `empty_env`\n", 36 | "\n", 37 | "```bash\n", 38 | "conda create --name empty_env\n", 39 | "```\n", 40 | "\n", 41 | "> You will be prompted if you want to proceed. Answer Yes!\n", 42 | "\n", 43 | "* Activate the environment\n", 44 | "\n", 45 | "```bash\n", 46 | "$ conda activate empty_env\n", 47 | "```\n", 48 | "\n", 49 | "* List the packages installed\n", 50 | "\n", 51 | "```bash\n", 52 | "conda list\n", 53 | "```\n", 54 | "\n", 55 | "> There should be no packages! If they are then you are probably in the wrong environment. Check this with `conda env list`. 
The active env is marked with `*`\n", 56 | "\n", 57 | "* Deactivate the env to return to `base`\n", 58 | "\n", 59 | "```bash\n", 60 | "conda deactivate\n", 61 | "```\n", 62 | "\n", 63 | "* Remove the environment\n", 64 | "\n", 65 | "```bash\n", 66 | "conda env remove --name empty_env\n", 67 | "```\n", 68 | "\n", 69 | "* Verify the environment is removed using list\n", 70 | "\n", 71 | "```bash\n", 72 | "conda env list\n", 73 | "```\n", 74 | "\n", 75 | "\n", 76 | "## Exercise 4\n", 77 | "\n", 78 | "Now let's create an environment and install a few packages from the command line.\n", 79 | "\n", 80 | "* Create an environment called `test_env` \n", 81 | "* Activate `test_env`\n", 82 | "* Install `python` version 3.8.8 and `numpy` 1.20.3 \n", 83 | "\n", 84 | "```bash\n", 85 | "conda install python=3.8.8 numpy=1.20.3\n", 86 | "```\n", 87 | "\n", 88 | "> Conda will report what dependencies are going to be installed. This might vary depending on what operating system you use. You will also be prompted if you are happy to proceed. It will take a few seconds to install.\n", 89 | "\n", 90 | "\n", 91 | "\n", 92 | "List all the packages installed in `test_env`. Check that the `python` and `numpy` versions match those you used.\n", 93 | "\n", 94 | "\n", 95 | "## Exercise 5\n", 96 | "\n", 97 | "Staying with `test_env `create a `environment.yml` file that contains **only** the packages you installed from the command line.\n", 98 | "\n", 99 | "* Issue the following export command. Make sure you include the `--from-history` option or you will get a full list of everything in the environment. 
The output you should see is displayed below as well.\n", 100 | "\n", 101 | "```bash\n", 102 | "$ conda env export --from-history\n", 103 | "\n", 104 | "name: test_env\n", 105 | "channels:\n", 106 | " - defaults\n", 107 | "dependencies:\n", 108 | " - numpy=1.20.3\n", 109 | " - python=3.8.8\n", 110 | "```\n", 111 | "\n", 112 | "* It is also possible to export this to a named file (typically `environment.yml`)\n", 113 | "\n", 114 | "```bash\n", 115 | "$ conda env export --from-history -f environment.yml\n", 116 | "```\n", 117 | "\n", 118 | "> Remember this will export the file to the current working directory. For simplicity I recommend working in the same directory as your code. This makes even more sense for example if you have a git repo.\n", 119 | "\n", 120 | "## Exercise 6\n", 121 | "\n", 122 | "Now let's practice creating a conda env from file. I recommend working in the same directory as exercise 5.\n", 123 | "\n", 124 | "* Deactivate the `test_env` environment\n", 125 | "* Remove the `test_env` \n", 126 | "* Create the conda environment from file\n", 127 | "\n", 128 | "```bash\n", 129 | "$ conda env create -f environment.yml\n", 130 | "```\n", 131 | "\n", 132 | "> This (re)creates `test_env`.\n", 133 | "\n", 134 | "* Activate `test_env`\n", 135 | "* Check what packages are installed."
136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "id": "35d74941-b5e2-49b9-bced-0e0444026108", 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [] 145 | } 146 | ], 147 | "metadata": { 148 | "kernelspec": { 149 | "display_name": "Python 3 (ipykernel)", 150 | "language": "python", 151 | "name": "python3" 152 | }, 153 | "language_info": { 154 | "codemirror_mode": { 155 | "name": "ipython", 156 | "version": 3 157 | }, 158 | "file_extension": ".py", 159 | "mimetype": "text/x-python", 160 | "name": "python", 161 | "nbconvert_exporter": "python", 162 | "pygments_lexer": "ipython3", 163 | "version": "3.8.8" 164 | } 165 | }, 166 | "nbformat": 4, 167 | "nbformat_minor": 5 168 | } 169 | -------------------------------------------------------------------------------- /content/03_mgt/04_exercises/02_use_conda.md: -------------------------------------------------------------------------------- 1 | # Using `conda` 2 | 3 | To help you with your deployment either via PyPi, Binder or handing over a local python package it is a good idea to improve your `conda` package manager skills and working with conda virtual environments. 4 | 5 | > If you are working on a Windows OS I recommend running these commands from Anaconda prompt. If you are working on a Mac or a Linux machine then use a terminal. 6 | 7 | > For more detail on conda check out the [docs](https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html) 8 | 9 | ## Exercise 1 10 | 11 | * List the `conda` environments on your computer. 12 | 13 | ```bash 14 | $ conda env list 15 | ``` 16 | 17 | ## Exercise 2 18 | 19 | * By default you will be in the `base` conda environment. List the packages installed. 20 | 21 | ```bash 22 | $ conda list 23 | ``` 24 | 25 | ## Exercise 3 26 | 27 | Let's practice creating an empty environment, activating it, checking it is empty and then remove it. 
28 | 29 | 30 | * Create an empty conda environment `empty_env` 31 | 32 | ```bash 33 | $ conda create --name empty_env 34 | ``` 35 | 36 | > You will be prompted if you want to proceed. Answer Yes! 37 | 38 | * Activate the environment 39 | 40 | ```bash 41 | $ conda activate empty_env 42 | ``` 43 | 44 | * List the packages installed 45 | 46 | ```bash 47 | $ conda list 48 | ``` 49 | 50 | > There should be no packages! If there are then you are probably in the wrong environment. Check this with `conda env list`. The active env is marked with `*` 51 | 52 | * Deactivate the env to return to `base` 53 | 54 | ```bash 55 | $ conda deactivate 56 | ``` 57 | 58 | * Remove the environment 59 | 60 | ```bash 61 | $ conda env remove --name empty_env 62 | ``` 63 | 64 | * Verify the environment is removed using list 65 | 66 | ```bash 67 | $ conda env list 68 | ``` 69 | 70 | ## Exercise 4 71 | 72 | Now let's create an environment and install a few packages from the command line. 73 | 74 | * Create an environment called `test_env` 75 | * Activate `test_env` 76 | * Install `python` version 3.8.8 and `numpy` 1.20.3 77 | 78 | ```bash 79 | $ conda install python=3.8.8 numpy=1.20.3 80 | ``` 81 | 82 | > Conda will report what dependencies are going to be installed. This might vary depending on what operating system you use. You will also be prompted if you are happy to proceed. It will take a few seconds to install. 83 | 84 | * List all the packages installed in `test_env`. Check that the `python` and `numpy` versions match those you used. 85 | 86 | 87 | ## Exercise 5 88 | 89 | Staying with `test_env` create an `environment.yml` file that contains **only** the packages you installed from the command line. 90 | 91 | * Issue the following export command. Make sure you include the `--from-history` option or you will get a full list of everything in the environment. The output you should see is displayed below as well. 
92 | 93 | ```bash 94 | $ conda env export --from-history 95 | 96 | name: test_env 97 | channels: 98 | - defaults 99 | dependencies: 100 | - numpy=1.20.3 101 | - python=3.8.8 102 | ``` 103 | 104 | * It is also possible to export this to a named file (typically `environment.yml`) 105 | 106 | ```bash 107 | $ conda env export --from-history -f environment.yml 108 | ``` 109 | 110 | > Remember this will export the file to the current working directory. For simplicity I recommend working in the same directory as your code. This makes even more sense for example if you have a git repo. 111 | 112 | ## Exercise 6 113 | 114 | Now let's practice creating a conda env from file. I recommend working in the same directory as exercise 5. 115 | 116 | * Deactivate the `test_env` environment 117 | * Remove the `test_env` 118 | * Create the conda environment from file 119 | 120 | ```bash 121 | $ conda env create -f environment.yml 122 | ``` 123 | 124 | > This (re)creates `test_env`. 125 | 126 | * Activate `test_env` 127 | * Check what packages are installed. 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | -------------------------------------------------------------------------------- /content/03_mgt/04_exercises/03_binder.md: -------------------------------------------------------------------------------- 1 | # Binderhub exercises 2 | 3 | > For this exercise you will need a Github account. Sign-up via [https://github.com/](https://github.com/). 4 | 5 | In exercises 1 to 4 you are going to upload a Jupyter notebook to Github and share it via Binderhub. 6 | 7 | ## Exercise 1: 8 | 9 | First create the Github repo and insert a notebook file. 10 | 11 | **Task**: 12 | 13 | * Create a Github repo. You can use any repo name you choose. If you cannot decide a suggestion is 'binder_exercise' 14 | * Make a local copy of the notebook that contains the solutions to ED data wrangling exercise. 15 | * Push the notebook to the repo. 
16 | 17 | **Hints** 18 | * If you do not know how to use GitHub you can create the repository and then click on the green **upload** button. This will allow you to select the notebook and add a commit message. 19 | * If you prefer to do this via git then I recommend creating the remote repo first, cloning locally, add (and stage), commit the notebook. Finally push using `git push`. Depending on your authentication method you may be asked for your GitHub username and password. 20 | 21 | ## Exercise 2: 22 | 23 | You now need to create a conda environment file so that binderhub knows what version of python and data science packages to install. 24 | 25 | **Task**: 26 | * Create a directory in the repo called `binder` 27 | * Create a conda environment file in `binder/environment.yml` with the appropriate libraries. A suggestion is: 28 | 29 | ```YAML 30 | name: binder_ex 31 | channels: 32 | - defaults 33 | - conda-forge 34 | dependencies: 35 | - matplotlib=3.4.2 36 | - numpy=1.20.3 37 | - pandas=1.3.1 38 | - python=3.8.8 39 | ``` 40 | 41 | * Commit the changes and push to github using your preferred method. 42 | 43 | 44 | ## Exercise 3: 45 | 46 | You are now ready to share your notebook via binder. 47 | 48 | **Task** 49 | * Copy the URL of your GitHub repo's main page. 50 | * Using your browser navigate to [https://mybinder.org](https://mybinder.org) 51 | * Paste the URL of your Github repo and click on 'launch' (the build will take several minutes) 52 | 53 | ## Exercise 4: 54 | 55 | Let's add a 'launch binder badge' to a `README.md` file in your repo. 56 | 57 | **Task**: 58 | * From the BinderHub setup page copy the markdown text that you will use to create the badge. 59 | * If required (i.e. you don't already have one). Create a `README.md` file and add it to your GitHub repo. 60 | * Open `README.md` for editing. At the top paste in the copied launch binderhub markdown. 61 | * Push the update to your GitHub repo. 
62 | * Navigate to your GitHub repo and click on the badge to launch binderhub! 63 | 64 | 65 | ## Exercise 5: 66 | 67 | **Task** 68 | * Use BinderHub to share the more advanced `ts_emergency` package you created in the exercises. -------------------------------------------------------------------------------- /content/03_mgt/04_exercises/im/detrended.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/content/03_mgt/04_exercises/im/detrended.jpg -------------------------------------------------------------------------------- /content/03_mgt/04_exercises/im/diag.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/content/03_mgt/04_exercises/im/diag.jpg -------------------------------------------------------------------------------- /content/03_mgt/04_exercises_front_page.md: -------------------------------------------------------------------------------- 1 | # Exercises 2 | 3 | Managing python data science projects - exercises. 
4 | -------------------------------------------------------------------------------- /content/03_mgt/05_solutions/im/detrended.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/content/03_mgt/05_solutions/im/detrended.jpg -------------------------------------------------------------------------------- /content/03_mgt/05_solutions/im/diag.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/content/03_mgt/05_solutions/im/diag.jpg -------------------------------------------------------------------------------- /content/03_mgt/05_solutions/ts_emergency/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = '0.1.0' 2 | __author__ = 'Tom Monks' -------------------------------------------------------------------------------- /content/03_mgt/05_solutions/ts_emergency/data/.~lock.ts_ed.csv#: -------------------------------------------------------------------------------- 1 | ,tom,pop-os.localdomain,16.07.2021 15:37,file:///home/tom/.config/libreoffice/4; -------------------------------------------------------------------------------- /content/03_mgt/05_solutions/ts_emergency/datasets.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Functions to load built in datasets for ts_emergency. 3 | Datasets are downloaded from an external github repo. 
4 | 5 | The key loading function is load_ed_ts 6 | ''' 7 | 8 | import pandas as pd 9 | import numpy as np 10 | 11 | LONG_URL = 'https://raw.githubusercontent.com/health-data-science-OR/' \ 12 | + 'hpdm139-datasets/main/syn_ts_ed_long.csv' 13 | 14 | WIDE_URL = 'https://raw.githubusercontent.com/health-data-science-OR/' \ 15 | + 'hpdm139-datasets/main/syn_ts_ed_wide.csv' 16 | 17 | def load_ed_ts(data_format='wide', as_pandas=True): 18 | ''' 19 | Load the built-in ED dataset 20 | 21 | Params: 22 | ------ 23 | data_format: str 24 | 'Wide' or 'long' format. Wide format provides hospital columns. 25 | Long format provides a categorical hospital column and single attends 26 | column. 27 | 28 | as_pandas: bool, optional (default = True) 29 | Return as `pandas.Dataframe`. If False then `numpy.ndarray` 30 | 31 | Returns: 32 | ------- 33 | pandas.Dataframe or if `as_pandas=False` then returns `numpy.ndarray` 34 | 35 | ''' 36 | valid_formats = ['wide', 'w', 'long', 'l'] 37 | 38 | if data_format.lower() not in valid_formats: 39 | raise ArgumentError(f'data format should be one of {valid_formats}') 40 | 41 | if data_format == 'wide' or data_format == 'w': 42 | df = _ed_data_to_wide(LONG_URL) 43 | else: 44 | df = _ed_data_to_long(WIDE_URL) 45 | 46 | if as_pandas: 47 | return df 48 | else: 49 | return df.to_numpy() 50 | 51 | 52 | 53 | def _ed_data_to_wide(file_path): 54 | ''' 55 | Return the ED data in wide format. 56 | 57 | 1. Pivot table 58 | 2. Transpose and drop the ('attends', hosp_i) multi-index 59 | 3. Rename columns [0, 1, 2, 4] tp ['hosp_1', 'hosp_2', 'hosp_3', 'hosp_4'] 60 | 4. Index to DateTimeIndex 61 | 5. Drop the additional uneeded series 'date' (as stored in index as well) 62 | 6. 
Convert attendence numbers from int64 to int16 63 | 64 | Params: 65 | ------ 66 | file_path: str 67 | Path to wide format file 68 | 69 | Returns: 70 | ------- 71 | pandas.DataFrame 72 | ''' 73 | # column name transfers 74 | translated_names = {0:'hosp_1', 75 | 1:'hosp_2', 76 | 2:'hosp_3', 77 | 3:'hosp_4'} 78 | 79 | data_types = {'hosp_1':np.int16, 80 | 'hosp_2':np.int16, 81 | 'hosp_3':np.int16, 82 | 'hosp_4':np.int16} 83 | 84 | df = (pd.read_csv(file_path) 85 | .pivot_table(values=['attends'], index=['date'], columns=['hosp']) 86 | .T.reset_index(drop=True) 87 | .T.rename(columns=translated_names) 88 | .assign(date=lambda x: pd.to_datetime(x.index)) 89 | .set_index('date') 90 | .astype(data_types) 91 | ) 92 | 93 | return df 94 | 95 | 96 | 97 | def _ed_data_to_long(file_path): 98 | ''' 99 | Return the ED data in long format. Uses pd.wide_to_long() 100 | Assume wide format file is used. 101 | 102 | 1. pd.wide_to_long() 103 | 2. reset_index() to remove multi-index 104 | 3. rename col 'hosp_' to 'attends' 105 | 4. date to datetime 106 | 5. Convert attendence numbers from int64 to int16 amd hosp_id to int8. 107 | (could also be a categorical field.) 
108 | 109 | Params: 110 | ------ 111 | file_path: str 112 | Path to wide format file 113 | 114 | Returns: 115 | ------- 116 | pandas.DataFrame 117 | ''' 118 | 119 | translated_names = {'hosp_':'attends'} 120 | data_types = {'hosp': np.int8, 'attends':np.int16} 121 | 122 | long_df = ( 123 | pd.wide_to_long(pd.read_csv(file_path), stubnames='hosp_', 124 | i=['date'], j='hosp') 125 | .reset_index() 126 | .rename(columns=translated_names) 127 | .assign(date=lambda x: pd.to_datetime(x['date'])) 128 | .astype(data_types) 129 | ) 130 | 131 | return long_df 132 | -------------------------------------------------------------------------------- /content/03_mgt/05_solutions/ts_emergency/plotting/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/content/03_mgt/05_solutions/ts_emergency/plotting/__init__.py -------------------------------------------------------------------------------- /content/03_mgt/05_solutions/ts_emergency/plotting/tsa.py: -------------------------------------------------------------------------------- 1 | ''' 2 | tsa - time series analysis module 3 | 4 | plotting functions for time series analysis 5 | ''' 6 | 7 | # standard imports 8 | import matplotlib.pyplot as plt 9 | import numpy as np 10 | from statsmodels.graphics.tsaplots import plot_acf, plot_pacf 11 | 12 | # cross package imports 13 | from ts_emergency.plotting.view import plot_single_ed 14 | 15 | def plot_detrended(wide_df, hosp_id, ax=None): 16 | ''' 17 | Plot the first difference of the ED time series 18 | ''' 19 | 20 | # create differenced dataframe 21 | diff_df = wide_df.diff(periods=1) 22 | 23 | fig, ax = plot_single_ed(diff_df, hosp_id, ax) 24 | ax.set_title('Detrended') 25 | 26 | return fig, ax 27 | 28 | 29 | def diagnostic_plot(wide_df, hosp_id, figsize=(9, 6), maxlags=56, 30 | include_zero=False): 31 | ''' 32 | Basic plot of 
diagnostics for ED time series. 33 | 34 | 1. Detrended series 35 | 2. ACF 36 | 3. PACF 37 | 38 | Params: 39 | ------ 40 | wide_df: pandas.Dataframe 41 | ED data in wide format 42 | 43 | hosp_id: str 44 | column name for hospital 45 | 46 | figsize: (int, int), optional (default=(9,6)) 47 | size of figure 48 | 49 | maxlags: int, optional (default=56) 50 | The number of lags to include int the ACF and PACF 51 | 52 | include_zero: bool, optional (default=False) 53 | Include ACF and PACF of observation with itself in plot (=1.0) 54 | 55 | Returns: 56 | ------- 57 | fig, np.ndarray 58 | ''' 59 | fig = plt.figure(figsize=figsize, tight_layout=True) 60 | 61 | # add gridspec 62 | gs = fig.add_gridspec(3, 2) 63 | 64 | # detrended axis spans two columns 65 | ax1 = fig.add_subplot(gs[0, :]) 66 | # acf axis spans 2 rows in column idx 0 67 | ax2 = fig.add_subplot(gs[1:,0]) 68 | # pacf axis spans 2 rows in column idx 1 69 | ax3 = fig.add_subplot(gs[1:, 1]) 70 | 71 | # plot detrended on axis 1 72 | _ = plot_detrended(wide_df, hosp_id, ax=ax1) 73 | 74 | # plot acf on axis 2 75 | _ = plot_acf(wide_df[hosp_id], lags=maxlags, ax=ax2, zero=include_zero) 76 | # plot pacf on axi 77 | _ = plot_pacf(wide_df[hosp_id], lags=maxlags, ax=ax3, zero=include_zero) 78 | 79 | axs = np.array([ax1, ax2, ax3]) 80 | return fig, axs 81 | 82 | -------------------------------------------------------------------------------- /content/03_mgt/05_solutions/ts_emergency/plotting/view.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | 3 | DEFAULT_LABEL_FS = 12 4 | DEFAULT_AXIS_FS = 12 5 | DEFAULT_FIGSIZE = (12,8) 6 | 7 | def plot_single_ed(wide_df, hosp_id, ax=None, figsize=(12,3), 8 | fontsize=DEFAULT_LABEL_FS, line_width=2): 9 | ''' 10 | Plot a single ED's data 11 | Assumes data are passed in wide format. 
12 | 13 | Params: 14 | ------- 15 | wide_df: pandas.Dataframe 16 | ED time series data in wide format 17 | 18 | hosp_id: str 19 | name of hospital column to plot e.g. 'hosp_1' 20 | 21 | figsize: tuple(int, int), optional (default=(12,3)) 22 | `matplotlib` figure size 23 | 24 | fontsize: int, optional (default=DEFAULT_LABEL_FS) 25 | Size of label font 26 | 27 | line_width: int 28 | Width of the line plot 29 | 30 | Returns: 31 | ------- 32 | matplotlib fig, ax 33 | 34 | ''' 35 | 36 | if ax is None: 37 | fig = plt.figure(figsize=figsize) 38 | ax = fig.add_subplot() 39 | 40 | ax.set_xlabel("Date", fontsize=fontsize) 41 | ax.set_ylabel("Attendances", fontsize=fontsize) 42 | 43 | _ = ax.plot(wide_df[hosp_id], lw=line_width) 44 | # include x, y grid 45 | _ = ax.grid(ls='--') 46 | 47 | # set size of x, y ticks 48 | _ = ax.tick_params(axis='both', labelsize=fontsize) 49 | 50 | # return the figure 51 | return ax.figure, ax 52 | 53 | 54 | def plot_eds(wide_df, figsize=DEFAULT_FIGSIZE, label_font_size=DEFAULT_LABEL_FS, 55 | axis_font_size=DEFAULT_AXIS_FS): 56 | ''' 57 | Plot all ED's attendances in a 1x4 grid layout. 
58 | 59 | Params: 60 | ------ 61 | wide_df: pandas.Dataframe 62 | ED time series data in wide format 63 | 64 | figsize: tuple(int, int), optional (default=(12,3)) 65 | `matplotlib` figure size 66 | 67 | label_font_size: int, optional (default=DEFAULT_LABEL_FS) 68 | Size of label font 69 | 70 | axis_font_size: int, optional (default=DEFAULT_AXIS_FS) 71 | Size of axis tick font 72 | 73 | Returns: 74 | -------- 75 | matplotlib fig 76 | ''' 77 | 78 | fig, axs = plt.subplots(nrows=4, ncols=1, tight_layout=True, figsize=(12,8), 79 | sharex=True) 80 | 81 | # note that axs is a 2D array 82 | for hosp_idx in range(0, 4): 83 | _ = axs[hosp_idx].plot(wide_df[f'hosp_{hosp_idx+1}']) 84 | _ = axs[hosp_idx].set_title(f'Hospital {hosp_idx+1}', 85 | fontsize=label_font_size) 86 | _ = axs[hosp_idx].grid(ls='--') 87 | 88 | # axis labels matplotlib >=3.4 89 | AXIS_LABEL_SIZE = 12 90 | _ = fig.supylabel('ED Attendances', fontsize=axis_font_size) 91 | _ = fig.supxlabel('Date', fontsize=axis_font_size) 92 | 93 | return fig -------------------------------------------------------------------------------- /content/03_mgt/05_solutions_front_page.md: -------------------------------------------------------------------------------- 1 | # Solutions 2 | 3 | The following sections provide example solutions to the managing python project exercises. There are often many ways to solve these problems. The solutions provided should be taken as guides only. If you feel you have a better way feel free to raise an issue and suggest your solution is adopted instead! 4 | -------------------------------------------------------------------------------- /content/appendix/acknowledge.md: -------------------------------------------------------------------------------- 1 | # Acknowledgements 2 | 3 | I'd like to extend my thanks to the following people for their contributions to the book. All contributions no matter the size are welcome. 
4 | 5 | * [agh208](https://github.com/agh208): Amy Heather (MSc Health Data Science, University of Exeter. 2021/22) 6 | * [SubaruSpirit](https://github.com/SubaruSpirit). 7 | * [tristar82](https://github.com/tristar82). Elliott Coyne (MSc Health Data Science, University of Exeter. 2021/22) 8 | * [reevesglobal](https://github.com/reevesglobal) 9 | * [trptaylor](https://github.com/trptaylor) 10 | * [kaungmyatwaiyan](https://github.com/kaungmyatwaiyan) MSc Health Data Science 2021/22. 11 | * [JeffAkkerman](https://github.com/JeffAkkerman) 12 | * [ploginovic](https://github.com/ploginovic) Pavel Loginovic (MSc Health Data Science, University of Exeter. 2023/24) -------------------------------------------------------------------------------- /content/appendix/fp_lectures.md: -------------------------------------------------------------------------------- 1 | # Lectures -------------------------------------------------------------------------------- /content/appendix/fp_practicals.md: -------------------------------------------------------------------------------- 1 | # Practicals -------------------------------------------------------------------------------- /content/appendix/labs/debug1.md: -------------------------------------------------------------------------------- 1 | # Debug challenge 1 2 | 3 | ```{admonition} Challenge 4 | 5 | The simple code listing below contains a number of bugs. 6 | Can you fix the code and help it to run? 7 | ``` 8 | **Hints:** 9 | * Use a Python IDE such as`spyder` or `Visual Studio Code` it will help you debug. 10 | * Read the Python interpreter output. 11 | * The errors reported can look confusing at first, but read them carefully and they will point you to the lines of code with problems. 12 | * The `Spyder` IDE may give you some hints about formatting errors 13 | * It can be useful to use `print()` to display intermediate calculations and variable values. 
14 | * Remember that `Spyder` has a variable viewer where you can look at the value of all variables created. 15 | * There might be multiple bugs! When you fix one and try to run the code you might find another! 16 | 17 | Have a go **yourself** and then watch our approach: 18 | 19 | * https://www.youtube.com/watch?v=XCuD59bYKx0 20 | 21 | 22 | ```python 23 | 24 | 25 | def split_word_in_two(to_split): 26 | """ 27 | Returns string split into two parts 28 | 29 | If the word's length is even the two parts have 30 | equal number of characters 31 | 32 | Params: 33 | ------- 34 | to_split: str 35 | the string to split int o 36 | """ 37 | length = len(to_spit) 38 | half_length = length / 2 39 | 40 | part1 = to_split[:half] 41 | part2 = to_split[half:] 42 | 43 | return part1, part2 44 | 45 | 46 | def main(): 47 | """ 48 | Tests the split_word_in_two function. 49 | Input word = 'faster' 50 | Expected output = ('fas', 'ter') 51 | """ 52 | word_to_split = 'faster' 53 | result = split_word_in_two(word_to_split) 54 | print('Part 1 = {0}; Part 2 = {1}'.format(result[0], result[1])) 55 | 56 | 57 | if __name__ == "__main__": 58 | main() 59 | 60 | ``` -------------------------------------------------------------------------------- /content/appendix/labs/debug2.md: -------------------------------------------------------------------------------- 1 | # Debug challenge 2 2 | 3 | **This weeks debug challenges loops and nested loops.** 4 | 5 | A classic task in programming is to sort a list. Python makes this 6 | very simple by including a number of ways to sort a list. 7 | 8 | Under the hood the sorting routines are all variations on loops 9 | where values in the array are swapped until it is in ascending order. 10 | 11 | ```{admonition} Challenge 12 | 13 | The function below implements **insertion sort**. Insertion sort is an 14 | efficient algorithm for sorting a list made of two loops. An outer loop 15 | that iterates forward through the list and an inner list that iterates backwards. 
16 | 17 | The code below is not running. Can you debug it? 18 | ``` 19 | 20 | ```python 21 | 22 | def insertion_sort(to_sort): 23 | """ 24 | Sort a list of numbers using the insertion sort algorithm 25 | Return a list of sorted numbers 26 | 27 | Keyword arguments: 28 | to_sort -- an unsorted python list of numbers 29 | """ 30 | 31 | #This is the outer loop. 32 | for i in range(1, to_sort): 33 | 34 | j = i 35 | 36 | #This inner while loop. Note the backwards iteration. 37 | #The while loop terminates when either: 38 | #1. j == 0 i.e. the first element in the list is reached 39 | #2. there is no need to do any sorting i.e. to_sort[j-1] < to_sort[j] 40 | while j > 0 and to_sort[j-1] > to_sort[j] 41 | 42 | #to swap the values we need a 3rd variables (temp) 43 | temp = to_sort[j] 44 | to_sort[j] = to_sort[j-1] 45 | to_sort[j-1] = temp 46 | j -= 1 47 | 48 | return to_sort 49 | 50 | 51 | if __name__ == "__main__": 52 | list_to_sort = [14,33,27,10,35,19,42,44] 53 | sorted_list = insertion_sort(list_to_sort) 54 | print(sorted_list) 55 | 56 | ``` -------------------------------------------------------------------------------- /content/appendix/labs/src/cinema_exercise.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ Exercise: Booking Cinema Tickets and Refreshments 4 | 5 | You are going to the cinema and want to know the cost. 6 | 7 | Create 3 functions. 8 | 9 | tickets. returns the costs of tickets (i.e one or more) purchased. 10 | Normal tickets cost 10.99. Wednesdays reduce the cost by 2.00. 11 | Premium seating adds an extra 1.50 regardless of the day 12 | 13 | refreshments. returns the cost of refreshments. A user could buy 'popcorn' for 2.00 or 'fizzy pop' for 3.50 14 | 15 | cinema_trip. Adds the cost of tickets and refreshments together. 16 | 17 | """ 18 | 19 | 20 | def tickets(number, day, premium_seating): 21 | """ 22 | The cost of the cinema ticket. 
23 | Normal ticket cost is $10.99 24 | Wednesdays reduce the cost by $2.00 25 | Premium seating adds an extra $1.50 regardless of the day 26 | 27 | Parameters: 28 | ---------- 29 | number: int 30 | integer value representing the number of seats to book 31 | 32 | day: int 33 | day of the week to book (1 = Monday ... 7 = Sunday) 34 | 35 | premium_seating: bool 36 | boolean True/False. Are premium seats required. 37 | 38 | Returns: 39 | ------- 40 | float 41 | """ 42 | #fill in your code here. 43 | return 0.0 44 | 45 | 46 | def refreshment(choice ='popcorn'): 47 | """ 48 | The cost of refrehments. Choices are popcorn or fizzy pop 49 | 50 | Parameters: 51 | ---------- 52 | choice The users choice of refreshment (default = 'popcorn') 53 | 54 | Returns: 55 | ------- 56 | float 57 | """ 58 | 59 | #fill in your code here 60 | return 0.0 61 | 62 | 63 | def cinema_trip(persons, day, premium_seating, treat): 64 | """ 65 | The total cost of going to the cinema 66 | 67 | Parameters: 68 | ---------- 69 | persons: int 70 | number of people who need a ticket 71 | 72 | day: int 73 | day of the week to book (1 = Monday, 7 = Sunday) 74 | 75 | preimum_seating: bool 76 | boolean True/False if premium seats are required 77 | 78 | treat: str 79 | string value representing a choice of refreshment 80 | 81 | Returns: 82 | ------- 83 | float 84 | """ 85 | #fill in your code here 86 | return tickets(persons, day, premium_seating) + refreshment(treat) 87 | 88 | 89 | if __name__ == '__main__': 90 | persons = 2 91 | day = 1 92 | premium_seating = True 93 | treat = "popcorn" 94 | 95 | total_cost = cinema_trip(persons, day, premium_seating, treat) 96 | 97 | msg = f'today a trip to the cineman will cost you £{total_cost:.2f}' 98 | print(msg) 99 | #expected answer = £26.98 100 | 101 | persons = 3 102 | day = 3 103 | premium_seating = True 104 | treat = "fizzy pop" 105 | 106 | total_cost = cinema_trip(persons, day, premium_seating, treat) 107 | 108 | msg = f'today a trip to the cineman will cost you 
£{total_cost:.2f}' 109 | print(msg) 110 | #expected answer = £34.97 111 | -------------------------------------------------------------------------------- /content/appendix/labs/src/list_comprehensions.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | List comprehension examples 5 | 6 | List comprehensions are an alternative to for loops 7 | They work specifically with Python Lists. 8 | 9 | The code examples below give an introduction to using them. 10 | 11 | 1. Double a list of numbers 12 | 2. Call a function from a list comprehension 13 | 3. Using zip within a list function to iterate multiple lists 14 | 4. Using If statments within a list comprehension 15 | 5. Creating a list of lists using a nested list comprehension 16 | 6. Looping through a list of lists using a list comprehension 17 | 18 | @author: tom 19 | 20 | """ 21 | 22 | #%% 23 | # ============================================================================= 24 | # Example 1 - double the numbers 25 | # ============================================================================= 26 | 27 | foo = [1, 2, 3, 4] 28 | bar = [] 29 | 30 | for x in foo: 31 | bar.append(x * 2) 32 | 33 | print(bar) 34 | 35 | #%% 36 | 37 | # ============================================================================= 38 | # list comprehension approach for the same result... 39 | # ============================================================================= 40 | 41 | foo = [1, 2, 3, 4] 42 | bar = [x * 2 for x in foo] 43 | print(bar) 44 | 45 | #%% 46 | 47 | # ============================================================================= 48 | # Example 2 - convert celsius to fahrenheit 49 | # This example calls a function from within the list comprehension. 
50 | # ============================================================================= 51 | 52 | def convert_celsius_to_fahrenheit(deg_celsius): 53 | """ 54 | Convert degress celsius to fahrenheit 55 | Returns float value - temp in fahrenheit 56 | Keyword arguments: 57 | def_celcius -- temp in degrees celsius 58 | """ 59 | return (9/5) * deg_celsius + 32 60 | 61 | #list of temps in degree celsius to convert to fahrenheit 62 | celsius = [39.2, 36.5, 37.3, 41.0] 63 | 64 | #standard for loop approach 65 | fahrenheit = [] 66 | for x in celsius: 67 | fahrenheit.append(convert_celsius_to_fahrenheit(x)) 68 | 69 | 70 | print('using standard for loop: {}'.format(fahrenheit)) 71 | 72 | #implementation using a list comprehension 73 | fahrenheit = [convert_celsius_to_fahrenheit(x) for x in celsius] 74 | print('using list comprehension: {}'.format(fahrenheit)) 75 | 76 | #%% 77 | # ============================================================================= 78 | # Example 3 - convert the strings to different data types 79 | # This example also make ue of the zip function 80 | # Zip allow you to iterate through two lists at the same time 81 | # ============================================================================= 82 | 83 | inputs = ["1", "3.142", "True", "spam"] 84 | converters = [int, float, bool, str] 85 | 86 | values_with_correct_data_types = [t(s) for (s, t) in zip(inputs, converters)] 87 | print(values_with_correct_data_types) 88 | 89 | #%% 90 | # ============================================================================= 91 | # Example 4 - Using if statements within a list comprehension 92 | # The example filters a list of file names to the python files only 93 | # ============================================================================= 94 | 95 | unfiltered_files = ['test.py', 'names.csv', 'fun_module.py', 'prog.config'] 96 | 97 | python_files = [] 98 | 99 | # filter the files using a standard for loop 100 | for file in unfiltered_files: 101 | if file[-2:] == 
'py': 102 | python_files.append(file) 103 | 104 | print('using standard for loop: {}'.format(python_files)) 105 | 106 | #list comprehension 107 | python_files = [file for file in unfiltered_files if file[-2:] == 'py'] 108 | 109 | print('using list comprehension {}'.format(python_files)) 110 | 111 | 112 | #%% 113 | # ============================================================================= 114 | # Example 5 - List comprehension to create a list of lists 115 | # List comprehensions can greatly reduce the complexity of code 116 | # needed to create a list of lists. 117 | # ============================================================================= 118 | 119 | list_of_lists = [] 120 | 121 | for i in range(5): 122 | sub_list = [] 123 | for j in range(3): 124 | sub_list.append(i * j) 125 | list_of_lists.append(sub_list) 126 | 127 | print(list_of_lists) 128 | 129 | #a lists comprehension reduces 6 lines of code to 1 130 | list_of_lists = [[i * j for j in range(3)] for i in range(5)] 131 | 132 | print(list_of_lists) 133 | 134 | 135 | #%% 136 | # ============================================================================= 137 | # Example 6: Iterate over all items in a list of lists 138 | # using a list comprehension 139 | # The code converts a list of lists to a list of items 140 | # We call this flattening the list. 
141 | # ============================================================================= 142 | 143 | list_of_lists = [[8, 2, 1], [9, 1, 2], [4, 5, 100]] 144 | 145 | flat_list = [] 146 | for row in list_of_lists: 147 | for col in row: 148 | flat_list.append(col) 149 | 150 | print(flat_list) 151 | 152 | #implementation as list comprehension 153 | flat_list = [item for sublist in list_of_lists for item in sublist] 154 | 155 | print(flat_list) 156 | 157 | 158 | #%% -------------------------------------------------------------------------------- /content/appendix/labs/src/moviedb.csv: -------------------------------------------------------------------------------- 1 | "ID","Title","Budget","Box_office","Year","Meta_Critic" 2 | 1,"Amazing Spiderman",230,757.9,2012,66 3 | 2,"Ironman",140,585.2,2008,57 4 | 3,"Thor",150,449.3,2011,54 5 | 4,"Captain America: the first avenger",140,370.6,2011,66 6 | 5,"Antman",130,519.3,2015,64 7 | 6,"Guardians of the Galaxy",232.3,773,2014,76 8 | -------------------------------------------------------------------------------- /content/appendix/labs/src/py_finance.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | This module is used to demonstrate how to avoid code running on 5 | import. 6 | 7 | It contains some example functions. 8 | 9 | Example usage: 10 | 11 | import py_finance 12 | """ 13 | 14 | def pv(future_value, rate, n): 15 | 16 | ''' 17 | Discount a value at defined rate n time periods into the future. 
18 | 19 | Forumula: 20 | PV = FV / (1 + r)^n 21 | Where 22 | FV = future value 23 | r = the comparator (interest) rate 24 | n = number of years in the future 25 | 26 | Keyword arguments: 27 | future value -- the value to discount 28 | rate -- the rate at which to do the discounting 29 | n -- the number of time periods into the future 30 | ''' 31 | return future_value / (1 + rate)**n 32 | 33 | 34 | def print_pv(future_value, rate, n, present_value): 35 | ''' 36 | Prints a sentence reporting the present value of a 37 | future_value assuming a rate in n time units 38 | 39 | Keyword arguments: 40 | future value -- the value to discount 41 | rate -- the rate at which to do the discounting 42 | n -- the number of time periods into the future 43 | present_value -- the present value of the transaction 44 | ''' 45 | msg = 'Using an interest rate of {0}, ' + \ 46 | 'a payment of £{1:.2f} in {2} years time is worth £{3:.2f} today' 47 | 48 | print(msg.format(rate, future_value, n, present_value)) 49 | 50 | 51 | def test_case1(): 52 | #Test case 1 53 | future_value = 2000 54 | rate = 0.035 55 | years = 5 56 | result = pv(future_value, rate, years) 57 | 58 | print_pv(future_value, rate, years, result) 59 | 60 | def test_case2(): 61 | #Test case 2 62 | future_value = 350 63 | rate = 0.01 64 | years = 10 65 | result = pv(future_value, rate, years) 66 | 67 | print_pv(future_value, rate, years, result) 68 | 69 | def main(): 70 | test_case1() 71 | test_case2() 72 | 73 | 74 | 75 | if __name__ == '__main__': 76 | main() 77 | 78 | 79 | 80 | 81 | -------------------------------------------------------------------------------- /content/appendix/labs/src/string_manipulation.py: -------------------------------------------------------------------------------- 1 | ####################################################################### 2 | # Example: Illustrates how to use, format and manipulate strings 3 | # Author: T.Monks 4 | 
###################################################################### 5 | 6 | 7 | #print a string to console 8 | print("foo") 9 | 10 | #access a specific letter 11 | print("Spam"[0]) #access the first character in string 12 | print("Spam"[2]) #access the third character in string 13 | 14 | 15 | mystr = "Spam" 16 | print(mystr[-1]) #alternative count from end to access the last character 17 | print(mystr[-4]) #alternative count from end to access the first character 18 | 19 | #string slicing (substrings) 20 | 21 | print(mystr[:2]) #this will print the first two chars. Same as writing mystr[0:2] 22 | 23 | print(mystr[1:4]) #print chars 1 to 4 i.e. pam 24 | 25 | print(mystr[-2:]) #last two chars i.e. am 26 | print(mystr[-3:]) #last three chars i.e. pam 27 | 28 | print(mystr[1:]) #print string starting from char pos 1 i.e pam 29 | print(mystr[2:]) #print string starting from char pos 2 i.e. am 30 | 31 | mystr = "123456789" 32 | print(mystr[1:8:2]) #starting from index 1 return every other char up to index 8 = "2468" 33 | print(mystr[0:8:2]) #1357 34 | 35 | #concatenation 36 | myvar1 = "foo" 37 | myvar2 = "bar" 38 | 39 | print(myvar1 + myvar2) 40 | 41 | #string case 42 | print(myvar1.lower()) 43 | print("LoWeRCAse".lower()) 44 | print(myvar1.upper()) 45 | 46 | #string length 47 | print(len(myvar1)) 48 | 49 | #convert numeric values to strings 50 | my_num= 2 51 | print("my daughter is " + str(my_num)) 52 | 53 | #advanced formatting of output 54 | language = "python" 55 | skill = "productivity" 56 | 57 | print("%s increases your programming %s" %(language, skill)) 58 | 59 | #modern alternative output formatting using string.format 60 | print("{} increases your programming {}".format(language, skill)) 61 | 62 | #use an optional index to easily rearrange the order of output. 
63 | print("{0} increases your programming {1}".format(language, skill)) 64 | print("{1} increases your programming {0}".format(language, skill)) 65 | 66 | #reuse the same variable multiple times 67 | print("{0} increases {0} your {0} programming {1}".format(language, skill)) 68 | 69 | #formatting output to n decimal places 70 | print("{:.2f}".format(0.123456789)) 71 | print("{:.3f}".format(0.123456789)) 72 | 73 | #if you need {} in your output - double up. 74 | print("{{}}".format("double")) 75 | 76 | #splitting strings 77 | 78 | sentence = "we are the knights who say ni!" 79 | print(sentence.split()) #default is to split by space. 80 | 81 | sentence = "we|are|the|knights|who|say|ni!" 82 | print(sentence.split("|")) 83 | print(sentence.split("|", 1)) #the 2nd parameter limits the number of splits 84 | print(sentence.split("|", 2)) 85 | 86 | #the reverse of split is the join command 87 | split_data = ["we", "are", "the", "knights", "that", "say", "ni!"] 88 | sentence = " ".join(split_data) #join words with " " as between each word. 89 | print(sentence) 90 | 91 | sentence = "-".join(split_data) #join words with ", " as between each word. 
92 | print(sentence) 93 | 94 | 95 | #Strings are iterable 96 | mystr = "123456789" 97 | for c in mystr: 98 | print(c) -------------------------------------------------------------------------------- /content/appendix/labs/src/test_finance.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Fri Jan 11 16:03:41 2019 5 | 6 | @author: tom 7 | """ 8 | 9 | from py_finance import pv, print_pv 10 | 11 | future_value = 1000 12 | rate = 0.05 13 | years = 10 14 | result = pv(future_value, rate, years) 15 | 16 | print_pv(future_value, rate, years, result) 17 | -------------------------------------------------------------------------------- /content/appendix/labs/src/week1_debug_challenge1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Debug Exercise Python Lab 1 5 | 6 | The code below has a number of bugs. 7 | 8 | Can you fix the code and help it to run? 9 | 10 | 11 | """ 12 | 13 | def split_word_in_two(to_split): 14 | """ 15 | Returns string split into two parts 16 | 17 | If the word's length is even the two parts have 18 | equal number of characters 19 | 20 | Params: 21 | ------- 22 | to_split: str 23 | the string to split int o 24 | """ 25 | length = len(to_spit) 26 | half_length = length / 2 27 | 28 | part1 = to_split[:half] 29 | part2 = to_split[half:] 30 | 31 | return part1, part2 32 | 33 | 34 | def main(): 35 | """ 36 | Tests the split_word_in_two function. 
37 | Input word = 'faster' 38 | Expected output = ('fas', 'ter') 39 | """ 40 | word_to_split = 'faster' 41 | result = split_word_in_two(word_to_split) 42 | print('Part 1 = {0}; Part 2 = {1}'.format(result[0], result[1])) 43 | 44 | 45 | if __name__ == "__main__": 46 | main() 47 | 48 | -------------------------------------------------------------------------------- /content/appendix/labs/src/wk2_debug_challenge.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | 6 | Week 2: Debug Challenge. 7 | 8 | This weeks debug challenges loops and nested loops. 9 | 10 | A classic task in programming is to sort a list. Python makes this 11 | very simple by including a number of ways to sort a list. 12 | 13 | Under the hood the sorting routines are all variations on loops 14 | where values in the array are swapped until it is in ascending order. 15 | 16 | The function below implements insertion sort. Insertion sort is an 17 | efficient algorithm for sorting a list made of two loops. An outer loop 18 | that iterates forward through the list and an inner list that iterates backwards. 19 | 20 | The code below is not running. Can you debug it? 21 | 22 | Good luck! 23 | 24 | 25 | """ 26 | 27 | def insertion_sort(to_sort): 28 | """ 29 | Sort a list of numbers using the insertion sort algorithm 30 | Return a list of sorted numbers 31 | 32 | Keyword arguments: 33 | to_sort -- an unsorted python list of numbers 34 | """ 35 | 36 | #This is the outer loop. 37 | for i in range(1, to_sort): 38 | 39 | j = i 40 | 41 | #This inner while loop. Note the backwards iteration. 42 | #The while loop terminates when either: 43 | #1. j == 0 i.e. the first element in the list is reached 44 | #2. there is no need to do any sorting i.e. 
to_sort[j-1] < to_sort[j] 45 | while j > 0 and to_sort[j-1] > to_sort[j] 46 | 47 | #to swap the values we need a 3rd variables (temp) 48 | temp = to_sort[j] 49 | to_sort[j] = to_sort[j-1] 50 | to_sort[j-1] = temp 51 | j -= 1 52 | 53 | return to_sort 54 | 55 | 56 | if __name__ == "__main__": 57 | list_to_sort = [14,33,27,10,35,19,42,44] 58 | sorted_list = insertion_sort(list_to_sort) 59 | print(sorted_list) -------------------------------------------------------------------------------- /content/front_page.md: -------------------------------------------------------------------------------- 1 | ![title image](imgs/title_logo.png) 2 | 3 | # Preface 4 | 5 | Welcome to my online textbook for learning enough python to be a credible data scientist. 6 | 7 | The book was written with three audiences in mind 8 | 9 | 1. Post-graduate students studying or researching a health data science related topic 10 | 2. My MSc students in [health data science](https://www.exeter.ac.uk/postgraduate/courses/medicine/healthdatasciencemsc/) at the University of Exeter. Indeed I use this book in the module coding for ML and data science. 11 | 3. Health service analysts (particularly in the UK's NHS) who are looking to boost their python skills to be a more rounded data scientist. 12 | 13 | The book aims to support these groups because data science is a rapidly evolving discipline that offers huge potential for the future of health care, medicine and wider areas of science. I'm very exited about health data science using python and you should be too. We now have wonderful python machine learning packages such as `sklearn`, `keras` + `tensorflow` and `pytorch`. These packages are very easy to use and scripting in them can be learnt using more online tutorials than you can count. For that reason I am not going to write about these popular machine learning packages in this book. I don't really care if you can write an `sklearn` script. 
Instead I'm going to focus on making you a more rounded data scientist that can write code that's going to stand the test of time. My aim is that, by the end of the book, you will be able to write clean code that can be **confidently** published alongside your research, can be run by others and can be returned to by yourself in 5-10 years and still understood (the person most likely to reuse your work is you!). By the end of the book you will be a health data scientist and a "coder" (or if you are old like me a "programmer") and view code as a first class citizen in your data science projects. You will be able to focus on the data science as opposed to getting bogged down in the frequent coding problems you will face in real studies. This means we are going to focus a bit on code design, a bit on scientific problems, a bit on statistical programming and a bit on the management and deployment of data science code projects. 14 | 15 | This all sounds a bit pretentious doesn't it? Well perhaps a bit, but it comes from a good place. Through my work I regularly meet people and students within the data science discipline who can use a package such as `sklearn`, but can't implement a very basic algorithm in python (or anything else), control code dependencies and versioning, and (to my horror) have manual (or semi-manual) pipelines for wrangling their data in shape. These data scientists would be far more employable and useful to an organisation if they took their coding to the next level. To be more blunt kids there's money, kudos, better science, and real benefits for society that are up for grabs if people were willing to put in the effort. 16 | 17 | The book is powered by [Jupyter Book](https://jupyterbook.org/intro.html). This means that the parts of the book containing code (with a few minor caveats) are executable online using [BinderHub](https://binderhub.readthedocs.io/en/latest/index.html#) or the free version of Google Colabratory. 
To use Google Colab you will need to login to a Google account. BinderHub does not require a login. You can also download sections of the book as Jupyter notebooks (.ipynb) that you can run locally and with higher performance in Jupyter-Lab or Notebook. 18 | 19 | My decision to make the textbook entirely open and free is influenced by three factors. The first is Dr Michael Allen's - relentless and subversive - crusade to make science more open and democratic in our discipline. Over time it appears some of his philosophy has rubbed off on me as well; or he has shamelessly brainwashed me. Either way I agree that data science and the knowledge that underpins it should be free to all. The second influence was Rob J Hyndman's and George Athanasopoulos' fantastic and hugely successful online textbook [forecasting: principles and practice](https://otexts.com/fpp3/). After reading an early edition of this many years ago I was immediately convinced of the benefit of sharing and updating knowledge in this way. I've partly modelled the book on these ideas and content will be continually updated to remove errors and expanded WITHOUT needing to purchase a new edition. I'll archive all editions permanently at [Cern via Zenodo](https://zenodo.org/) (I'm confident this is a good place and will make it to the heat death of the universe). Finally, when I saw the wonderful [Jupyter Book project](https://jupyterbook.org/intro.html) I knew immediately that this was the right tool to create a book of my own. The team that develop Jupyter book are so fantastic I even forgive them for changing the way Juypter Book table of content pages work between versions 0.10 and 0.11. As I write this the book is served up for free via Microsoft's [GitHub pages](https://pages.github.com/) and provides interactive code for you to run in the cloud. The book title image and python symbol were created by me in [Inkscape](https://inkscape.org/). 20 | 21 | I don't have a copy editor...sorry. 
Instructions for reporting mistakes and any garbage are [here](001_setup/contributing.md).
42 | 43 | License info: https://creativecommons.org/licenses/by/4.0/ 44 | 45 | ## Code: MIT Licensed 46 | 47 | All code in this book is licensed under a [MIT permissive license](https://github.com/health-data-science-OR/coding-for-ml/blob/main/LICENSE) -------------------------------------------------------------------------------- /content/imgs/logo_v1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/content/imgs/logo_v1.png -------------------------------------------------------------------------------- /content/imgs/package_versus_project.drawio: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | -------------------------------------------------------------------------------- /content/imgs/package_versus_project.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/content/imgs/package_versus_project.png -------------------------------------------------------------------------------- /content/imgs/small_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/content/imgs/small_logo.png -------------------------------------------------------------------------------- /content/imgs/title.odg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/content/imgs/title.odg -------------------------------------------------------------------------------- /content/imgs/title_cropped.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/content/imgs/title_cropped.png -------------------------------------------------------------------------------- /content/imgs/title_cropped.png~: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/content/imgs/title_cropped.png~ -------------------------------------------------------------------------------- /content/imgs/title_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/content/imgs/title_logo.png -------------------------------------------------------------------------------- /images/binder_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/images/binder_1.png -------------------------------------------------------------------------------- /images/binder_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/images/binder_2.png -------------------------------------------------------------------------------- /images/book_title_page_log.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 17 | 38 | 
41 | 42 | 44 | 47 | 51 | 55 | 56 | 59 | 63 | 67 | 68 | 77 | 86 | 87 | 91 | 103 | PYTHON FOR HEALTH DATA SCIENCE 118 | 124 | 129 | 134 | 141 | 148 | 149 | 150 | 151 | -------------------------------------------------------------------------------- /images/detrended.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/images/detrended.jpg -------------------------------------------------------------------------------- /images/diag.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/images/diag.jpg -------------------------------------------------------------------------------- /images/release.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/images/release.png -------------------------------------------------------------------------------- /images/test_pypi2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/images/test_pypi2.png -------------------------------------------------------------------------------- /images/test_pypi3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/images/test_pypi3.png -------------------------------------------------------------------------------- /images/testpypi.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/health-data-science-OR/coding-for-ml/1956470affd86afce824c240a849ec7d1b0fdd18/images/testpypi.png --------------------------------------------------------------------------------