├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── TUTORIAL.md ├── environment.yml ├── notebooks ├── 10-reproducible-environment.ipynb ├── 11-reproducible-environment-solutions.ipynb ├── 20-creating-datasets.ipynb ├── 21-creating-datasets-solutions.ipynb ├── 22-transform-datasource.ipynb ├── 2x-bjorn-add-lvqpak.ipynb ├── 30-bjorn-train-predict.ipynb ├── 40-bjorn-analysis.ipynb ├── 50-mark-add-fmnist.ipynb ├── Notes on the Tutorial.ipynb ├── README.md ├── charts │ ├── munge-supervised.png │ └── munge-unsupervised.png └── references │ ├── charts │ ├── munge-supervised.png │ └── munge-unsupervised.png │ ├── cheat_sheet.pdf │ ├── cheat_sheet.png │ └── workflow │ ├── make-analyze.png │ ├── make-data.png │ ├── make-predict.png │ ├── make-publish.png │ ├── make-raw.png │ └── make-train.png ├── references ├── README.md ├── bus_number_slides.pdf └── cheat_sheet.pdf └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *,cover 45 | 46 | # Translations 47 | *.mo 48 | *.pot 49 | 50 | # Django stuff: 51 | *.log 52 | 53 | # Sphinx documentation 54 | docs/_build/ 55 | 56 | # PyBuilder 57 | target/ 58 | 59 | # DotEnv configuration 60 | .env 61 | 62 | # Database 63 | *.db 64 | *.rdb 65 | 66 | # Pycharm 67 | .idea 68 | 69 | # VS Code 70 | .vscode/ 71 | 72 | # Spyder 73 | .spyproject/ 74 | 75 | # Jupyter NB Checkpoints 76 | .ipynb_checkpoints/ 77 | 78 | # exclude data from source control by default 79 | /data/ 80 | 81 | # Mac OS-specific storage files 82 | .DS_Store 83 | 84 | # Emacs 85 | *~ 86 | .*~ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | The MIT License (MIT) 3 | Copyright (c) 2018, Tutte Institute for Mathematics and Computing 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 6 | 7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 10 | 11 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: clean requirements create_environment test_environment delete_environment show-help help-prefix 2 | 3 | PROJECT_NAME = bus_number 4 | PYTHON_INTERPRETER = python3 5 | VIRTUALENV = conda 6 | 7 | ## Install or update Python Dependencies 8 | requirements: test_environment environment.lock 9 | 10 | ## Delete all compiled Python files 11 | clean: 12 | find . -type f -name "*.py[co]" -delete 13 | find . -type d -name "__pycache__" -delete 14 | 15 | environment.lock: environment.yml 16 | ifeq (conda, $(VIRTUALENV)) 17 | $(CONDA_EXE) env update -n $(PROJECT_NAME) -f $< 18 | $(CONDA_EXE) env export -n $(PROJECT_NAME) -f $@ 19 | else 20 | $(error Unsupported Environment `$(VIRTUALENV)`. Use conda) 21 | endif 22 | 23 | ## Set up python interpreter environment 24 | create_environment: 25 | ifeq (conda,$(VIRTUALENV)) 26 | @echo ">>> Detected conda, creating conda environment." 27 | ifneq ("X$(wildcard ./environment.lock)","X") 28 | $(CONDA_EXE) env create --name $(PROJECT_NAME) -f environment.lock 29 | else 30 | @echo ">>> Creating lockfile from $(CONDA_EXE) environment specification." 31 | $(CONDA_EXE) env create --name $(PROJECT_NAME) -f environment.yml 32 | $(CONDA_EXE) env export --name $(PROJECT_NAME) -f environment.lock 33 | endif 34 | @echo ">>> New conda env created. Activate with: 'conda activate $(PROJECT_NAME)'" 35 | else 36 | @pip install -q virtualenv virtualenvwrapper 37 | @echo ">>> Installing virtualenvwrapper if not already intalled.\nMake sure the following lines are in shell startup file\n\ 38 | export WORKON_HOME=$$HOME/.virtualenvs\nexport PROJECT_HOME=$$HOME/Devel\nsource /usr/local/bin/virtualenvwrapper.sh\n" 39 | @bash -c "source `which virtualenvwrapper.sh`;mkvirtualenv $(PROJECT_NAME) --python=$(PYTHON_INTERPRETER)" 40 | @echo ">>> New virtualenv created. Activate with:\nworkon $(PROJECT_NAME)" 41 | endif 42 | 43 | delete_environment: 44 | ifeq (conda,$(VIRTUALENV)) 45 | @echo "Deleting conda environment." 
46 | $(CONDA_EXE) env remove -n $(PROJECT_NAME) 47 | endif 48 | 49 | 50 | ## Test python environment is set-up correctly 51 | test_environment: 52 | ifeq (conda,$(VIRTUALENV)) 53 | ifneq (${CONDA_DEFAULT_ENV}, $(PROJECT_NAME)) 54 | $(error Must activate `$(PROJECT_NAME)` environment before proceeding) 55 | endif 56 | endif 57 | $(PYTHON_INTERPRETER) test_environment.py 58 | 59 | 60 | .DEFAULT_GOAL := show-help 61 | 62 | # Inspired by 63 | # sed script explained: 64 | # /^##/: 65 | # * save line in hold space 66 | # * purge line 67 | # * Loop: 68 | # * append newline + line to hold space 69 | # * go to next line 70 | # * if line starts with doc comment, strip comment character off and loop 71 | # * remove target prerequisites 72 | # * append hold space (+ newline) to line 73 | # * replace newline plus comments by `---` 74 | # * print line 75 | # Separate expressions are necessary because labels cannot be delimited by 76 | # semicolon; see 77 | 78 | HELP_VARS := PROJECT_NAME 79 | 80 | print-% : ; @echo $* = $($*) 81 | 82 | help-prefix: 83 | @echo "To get started:" 84 | @echo " >>> $$(tput bold)make create_environment$$(tput sgr0)" 85 | @echo " >>> $$(tput bold)conda activate $(PROJECT_NAME)$$(tput sgr0)" 86 | @echo 87 | @echo "$$(tput bold)Project Variables:$$(tput sgr0)" 88 | 89 | show-help: help-prefix $(addprefix print-, $(HELP_VARS)) 90 | @echo 91 | @echo "$$(tput bold)Available rules:$$(tput sgr0)" 92 | @sed -n -e "/^## / { \ 93 | h; \ 94 | s/.*//; \ 95 | :doc" \ 96 | -e "H; \ 97 | n; \ 98 | s/^## //; \ 99 | t doc" \ 100 | -e "s/:.*//; \ 101 | G; \ 102 | s/\\n## /---/; \ 103 | s/\\n/ /g; \ 104 | p; \ 105 | }" ${MAKEFILE_LIST} \ 106 | | LC_ALL='C' sort --ignore-case \ 107 | | awk -F '---' \ 108 | -v ncol=$$(tput cols) \ 109 | -v indent=19 \ 110 | -v col_on="$$(tput setaf 6)" \ 111 | -v col_off="$$(tput sgr0)" \ 112 | '{ \ 113 | printf "%s%*s%s ", col_on, -indent, $$1, col_off; \ 114 | n = split($$2, words, " "); \ 115 | line_length = ncol - indent; \ 116 | for (i = 1; i <= n; i++) { \ 117 | line_length -= length(words[i]) + 1; \ 118 | if (line_length <= 0) { \ 119 | line_length = ncol - indent - length(words[i]) - 1; \ 120 | printf "\n%*s ", -indent, " "; \ 121 | } \ 122 | printf "%s ", words[i]; \ 123 | } \ 124 | printf "\n"; \ 125 | }' \ 126 | | more $(shell test $(shell uname) = Darwin && echo '--no-init --raw-control-chars') 127 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Bus Number: A Workflow for Reproducible Data Science 2 | 3 | > **Bus Number** (bŭs nŭmʹbər), *noun*: 4 | > 5 | > *The number of people that need to get hit by a bus before your data science 6 | project becomes irreproducible.* 7 | 8 | This number might be **zero**. In this tutorial, we aim to increase your bus number. 9 | 10 | ## The Reproducible Data Science Process 11 | ### How do you spend your "Data Science" time? 12 | A typical data science process involves three main kinds of tasks: 13 | * Munge: Fetch, process data, do EDA 14 | * Science: Train models, Predict, Transform data 15 | * Deliver: Analyze, summarize, publish 16 | 17 | where our time tends to be allocated something like this: 18 | 19 | Typical Data science Process 20 | 21 | Unfortunately, even though most of the work tends to be in the **munge** part of the process, when we do try and make data science reproducible, we tend to focus mainly on reprodibility of the **science** step. 
22 | 
23 | That seems like a bad idea, especially if we're doing unsupervised learning, where often our time is spent like this:
24 | 
25 | Typical Data science Process
26 | 
27 | We're going to try to improve this to a process that is **reproducible from start to finish**.
28 | 
29 | There are 4 steps to a fully reproducible data science flow:
30 | * Creating a **Reproducible Environment**
31 | * Creating **Reproducible Data**
32 | * Building **Reproducible Models**
33 | * Achieving **Reproducible Results**
34 | 
35 | In this series of tutorials, we will look at each of these steps in turn.
36 | This repo is all about **getting you started doing Reproducible Data Science**, and giving you a **deeper look** at some of the concepts we will cover in this tutorial. For the latest version, visit:
37 | 
38 | https://github.com/hackalog/bus_number
39 | 
40 | To get started, open [Tutorial 1: Reproducible Environments](TUTORIAL.md).
41 | 
42 | 
--------------------------------------------------------------------------------
/TUTORIAL.md:
--------------------------------------------------------------------------------
1 | # Tutorial 1: Reproducible Environments
2 | 
3 | ## Overview
4 | 
5 | * Requirements: The Bare Minimum
6 | 
7 | * Using a Data Science Template: `cookiecutter`
8 | 
9 | * Virtual Environments: `conda` and environment files
10 | * Revision Control: git and a git workflow
11 |   * Installing, Enabling, and using nbdime
12 | * The Data Science DAG
13 |   * make, Makefiles and data flow
14 | * Python Modules
15 |   * Creating an editable module
16 | * Testing: doctest, pytest, hypothesis
17 | 
18 | ## The Bare Minimum
19 | You will need:
20 | * `conda` (via anaconda or miniconda)
21 | * `cookiecutter`
22 | * `make`
23 | * `git`
24 | * `python >= 3.6` (via `conda`)
25 | 
26 | ### ASIDE: Our Favourite Python Parts
27 | Why the `python>=3.6` requirement?
28 | * f-strings: Finally, long, readable strings in our code.
29 | * dictionaries: insertion order is preserved!
30 | 
31 | Other great tools:
32 | * `pathlib`: Sane, multiplatform path handling: https://realpython.com/python-pathlib/
33 | * `doctest`: Examples that always work: https://docs.python.org/3/library/doctest.html
34 | * `joblib`: Especially the persistence part: https://joblib.readthedocs.io/en/latest/persistence.html
35 | 
36 | ### Installing Anaconda
37 | We use `conda` for handling package dependencies, maintaining virtual environments, and installing particular versions of python. For proper integration with pip, you should make sure you are running conda >= 4.4.0. Some earlier versions of conda have difficulty with editable packages (which is how we install our `src` package).
38 | 
39 | * See the [Anaconda installation guide](https://conda.io/docs/user-guide/install/index.html) for details
40 | 
41 | ### Installing Cookiecutter
42 | `cookiecutter` is a python tool for creating projects from project templates. We use cookiecutter to create a reproducible data science template for starting our data science projects.
43 | 
44 | To install it:
45 | ```
46 | conda install -c conda-forge cookiecutter
47 | ```
48 | ### make
49 | We use GNU `make` (and `Makefiles`) as a convenient interface to the various stages of the reproducible data science data flow. If for some reason your system doesn't have make installed, try:
50 | ```
51 | conda install -c anaconda make
52 | ```
53 | ### git
54 | We use git (in conjunction with a workflow tool like GitHub, BitBucket, or GitLab) to manage version control.
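If git is already installed but you have never used it on the machine you are working from, you will also want to tell git who you are before making commits. A minimal one-time setup looks like this (the name and email below are placeholders; substitute your own):

```
git config --global user.name "Your Name"
git config --global user.email "you@example.com"

# confirm the settings git will attach to your commits
git config --list
```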
55 | 
56 | Atlassian has good [instructions for installing git](https://www.atlassian.com/git/tutorials/install-git) if it is not already available on your platform.
57 | 
58 | ### Exercise 1: Install the requirements
59 | * Anaconda
60 | * Cookiecutter
61 | * make
62 | * git
63 | 
64 | Check your installations:
65 | 
66 | ```
67 | conda --version
68 | make --version
69 | git --version
70 | cookiecutter --version
71 | ```
72 | 
73 | ## Using a Data Science Template: `cookiecutter`
74 | 
75 | We use cookiecutter to create a reproducible data science template for starting our data science projects.
76 | 
77 | 
78 | You can install from the GitHub/BitBucket repo directly, even from a particular branch. For example, we will want to use the `bus_number` branch of `cookiecutter-easydata`.
79 | 
80 | ```
81 | cookiecutter https://github.com/hackalog/cookiecutter-easydata.git --checkout bus_number
82 | ```
83 | 
84 | **Note**: Once you have completed this tutorial, we recommend that you use the standard version of `cookiecutter-easydata`, as the `bus_number` branch has been customized for this tutorial:
85 | ```
86 | cookiecutter https://github.com/hackalog/cookiecutter-easydata.git
87 | ```
88 | 
89 | ### Exercise 2: Start your cookiecutter-based project
90 | Create a project called `bus_number_tutorial`:
91 | * Use `conda` as your virtualenv manager
92 | * Use python 3.6 or greater
93 | 
94 | When complete, you should have a fully populated project directory (`bus_number_tutorial`), complete with a customized `README.md`.
95 | 
96 | We will be working in this project from now on.
97 | 
98 | 
99 | ## Virtual Environments: `conda` and environment files
100 | 
101 | Everyone's computing environment is different. How can we ensure that another user running a different platform can successfully run the code you are creating? How do we know they are using the same versions of your code and all its various supporting libraries? How do we reproduce your working environment on someone else's machine?
102 | 
103 | In short, by using **virtual environments**.
104 | 
105 | In this toolkit, we use `conda` (as provided by either *anaconda* or *miniconda*) to create and manage virtual environments. Furthermore, we use an **environment file**, `environment.yml`, to specify all of the dependencies that need to be installed to run our code.
106 | 
107 | Two `make` commands ensure that we have the appropriate environment. Use
108 | * `make create_environment`: for the initial creation of a project-specific conda environment
109 | * `make requirements`: to update your environment whenever you change your `environment.yml` specs.
110 | 
111 | If you ever get the urge to delete your environment and start again from scratch, you can do a
112 | * `make delete_environment`
113 | 
114 | We will get to `make` in the next section of this tutorial.
115 | 
116 | **Caveat**: Technically speaking, a `conda` environment created from an `environment.yml` file is likely **not reproducible**. Even if you specify a specific version of a package in your `environment.yml`, the way its dependencies get resolved may differ from system to system. One way to work around this ambiguity is to have an additional file (called a **lockfile**) that explicitly records all dependencies and version numbers. This is a **great way** to handle the ambiguity, while keeping your `environment.yml` requirements manageable. In this toolkit, we have implemented this lockfile mechanism by automatically generating an `environment.lock` file from your `environment.yml` whenever it changes.
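Concretely, the lockfile is produced by the `environment.lock` rule in the `Makefile` shown earlier. With the Makefile variables expanded for a project named `bus_number_tutorial` (your conda executable and project name may differ), `make requirements` boils down to something like:

```
# resolve and install the loose specs from environment.yml
conda env update -n bus_number_tutorial -f environment.yml

# record every package that actually got installed, with exact versions
conda env export -n bus_number_tutorial -f environment.lock

# compare the hand-written specs against the fully pinned lockfile
diff -u environment.yml environment.lock
```

The `diff` at the end is exactly what Exercise 3 below asks you to look at: the `.yml` file stays short and human-edited, while the `.lock` file records the complete resolved environment.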
117 | 118 | ### Exercise 3: Set up your virtual environment and install all dependencies 119 | * Create and activate your `bus_number_tutorial` conda environment using the above `make` commands. 120 | * Look at the difference between `environment.yml` and the generated `environment.lock` 121 | 122 | ### Exercise 4: Pick up this tutorial in your new conda environment 123 | * Run `jupyter notebook` and open `notebooks/10-reproducible-environment.ipynb`. 124 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: bus_number 2 | dependencies: 3 | - pip 4 | - pip: 5 | - cookiecutter 6 | - setuptools 7 | - wheel 8 | - ipykernel 9 | - jupyter 10 | - nb_conda 11 | - python>=3.6 12 | -------------------------------------------------------------------------------- /notebooks/10-reproducible-environment.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Tutorial 1: Reproducible Environments\n", 8 | "(Continued from `README.md`)\n", 9 | "\n", 10 | "## Overview\n", 11 | "\n", 12 | "* Requirements: The Bare Minimum \n", 13 | "\n", 14 | "* Using a Data Science Template: `cookiecutter`\n", 15 | "\n", 16 | "* Virtual Environments: `conda` and environment files\n", 17 | "* Revision Control: git and a git workflow\n", 18 | " * Installing, Enabling, and using nbdime\n", 19 | "* The Data Science DAG\n", 20 | " * make, Makefiles and data flow\n", 21 | "* Python Modules\n", 22 | " * Creating an editable module\n", 23 | "* Testing: doctest, pytest, hypothesis" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "We'll start out by checking that all the requirements are met from the previous exercises (started in `README.md`)" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "### Exercise 1: Install the requirements\n", 38 | "\n", 39 | "* Anaconda\n", 40 | "* Cookiecutter\n", 41 | "* make\n", 42 | "* git" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "### Test your installation" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "!conda --version # or `$CONDA_EXE --version` in some environments" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "!make --version" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "!git --version" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "!cookiecutter --version" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "### Exercise 2: Start your cookiecutter-based project\n", 93 | "\n", 94 | "Create a project called `bus_number_tutorial`:\n", 95 | "\n", 96 | " Use conda as your virtualenv manager\n", 97 | " Use python 3.6 or greater\n", 98 | "\n", 99 | "When complete, you should have a fully populated project directory, complete with customized README.md.\n", 100 | "\n", 101 | "We will be working in this project from now on." 
102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "metadata": {}, 107 | "source": [ 108 | "### Exercise 2b:\n", 109 | "\n", 110 | "Explore the `README.md` from your new `bus_number_tutorial` project\n", 111 | "\n", 112 | "(Hint: You can use the `%load` magic, or `!cat` to look at it in your notebook)" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": {}, 126 | "outputs": [], 127 | "source": [] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "### Exercise 3: Set up your virtual environment and install all dependencies\n", 141 | "\n", 142 | "Create and activate your `bus_number_tutorial` conda environment using the above make commands." 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": {}, 148 | "source": [ 149 | "Your `active environment` should be `bus_number_tutorial`\n" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "!conda info" 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": {}, 164 | "source": [ 165 | "If done correctly, you should also be able to import from `src`" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [ 174 | "# if importing src doesn't work, try `make requirements`\n", 175 | "import src" 176 | ] 177 | }, 178 | { 179 | "cell_type": "markdown", 180 | "metadata": {}, 181 | "source": [ 182 | "### Exercise 4: Pick up this tutorial in your new repo\n", 183 | "\n", 184 | "* Copy the notebooks from `bus_number` into your new `bus_number_tutorial` repo\n", 185 | "* Run jupyter notebook and open `notebooks/10-reproducible-environment.ipynb`\n", 186 | "\n", 187 | "If you're currently running this notebook and the checks from the previous exercises worked, then you're in business!\n", 188 | "\n", 189 | "Keep going from here!" 190 | ] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "metadata": {}, 195 | "source": [ 196 | "## Revision Control: `git`\n", 197 | "\n", 198 | "How do we keep track of our changes? We use **git**.\n", 199 | "\n", 200 | "Before we do anything interesting, let's initialize a git repository (repo) here.\n" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": {}, 206 | "source": [ 207 | "### Exercise 5: Initialize a git repo for `bus_number_tutorial`" 208 | ] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": {}, 213 | "source": [ 214 | "```\n", 215 | "git init\n", 216 | "git add .\n", 217 | "git commit -m \"Initial Import\"\n", 218 | "```" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": null, 224 | "metadata": {}, 225 | "outputs": [], 226 | "source": [ 227 | "!git status" 228 | ] 229 | }, 230 | { 231 | "cell_type": "markdown", 232 | "metadata": {}, 233 | "source": [ 234 | "We will get back to using git again soon." 
235 | ] 236 | }, 237 | { 238 | "cell_type": "markdown", 239 | "metadata": {}, 240 | "source": [ 241 | "### Exercise 6: Add a dependency\n", 242 | "Modify the environment file so that `make requirements` installs some additional packages\n", 243 | "* install `joblib` using conda\n", 244 | "* install `nbdime` using pip" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": null, 250 | "metadata": {}, 251 | "outputs": [], 252 | "source": [] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": null, 257 | "metadata": {}, 258 | "outputs": [], 259 | "source": [] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": null, 264 | "metadata": {}, 265 | "outputs": [], 266 | "source": [ 267 | "# Check that you now have scikit-learn and nbdime installed\n", 268 | "# Don't forget that you need to run `make requirements` once you've change the `environment.yml` file\n", 269 | "import joblib\n", 270 | "import nbdime" 271 | ] 272 | }, 273 | { 274 | "cell_type": "markdown", 275 | "metadata": {}, 276 | "source": [ 277 | "### Exercise 7: Basic git interactions\n", 278 | "\n", 279 | "Check the changes to your `environment.yml` file into your git repo" 280 | ] 281 | }, 282 | { 283 | "cell_type": "markdown", 284 | "metadata": {}, 285 | "source": [ 286 | "See what has changed with git:" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": null, 292 | "metadata": {}, 293 | "outputs": [], 294 | "source": [ 295 | "!git status" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": null, 301 | "metadata": {}, 302 | "outputs": [], 303 | "source": [ 304 | "!git diff -u ../environment.yml" 305 | ] 306 | }, 307 | { 308 | "cell_type": "markdown", 309 | "metadata": {}, 310 | "source": [ 311 | "To add or reject your changes incrementally:" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": null, 317 | "metadata": {}, 318 | "outputs": [], 319 | "source": [ 320 | "#!git add -p\n", 321 | "#!git reset -p\n" 322 | ] 323 | }, 324 | { 325 | "cell_type": "markdown", 326 | "metadata": {}, 327 | "source": [ 328 | "Commit the changes" 329 | ] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "execution_count": null, 334 | "metadata": {}, 335 | "outputs": [], 336 | "source": [ 337 | "#!git commit -v" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": null, 343 | "metadata": {}, 344 | "outputs": [], 345 | "source": [ 346 | "# You should have no differences in your branch now\n", 347 | "# Except for those that you've made by running notebooks\n", 348 | "!git status" 349 | ] 350 | }, 351 | { 352 | "cell_type": "markdown", 353 | "metadata": {}, 354 | "source": [ 355 | "## The Data Science DAG\n", 356 | "DAG = Directed Acyclic Graph. \n", 357 | "\n", 358 | "That means the process eventually stops. (This is a good thing!) 
\n", 359 | "\n", 360 | "It also means we can use a super old, but incredibly handy tool to implement this workflow: `make`.\n", 361 | "\n", 362 | "### Make, Makefiles, and the Data Flow\n", 363 | "\n", 364 | "\n", 365 | "We use a `Makefile` to organize and invoke the various steps in our Data Science pipeline.\n", 366 | "You have already used this file when you created your virtual environment in the first place:\n", 367 | "```\n", 368 | "make create_environment\n", 369 | "```\n", 370 | "Here are the steps we will be working through in this tutorial:\n", 371 | "\"Reproducible\n", 372 | "\n", 373 | "A [PDF version of the cheat sheet](references/cheat_sheet.pdf) is also available.\n" 374 | ] 375 | }, 376 | { 377 | "cell_type": "markdown", 378 | "metadata": {}, 379 | "source": [ 380 | "\n", 381 | "### What's my make target doing?\n", 382 | "If you are ever curious what commands a `make` command will invoke (including any invoked dependencies), use `make -n`, which lists the commands without executing them:" 383 | ] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "execution_count": null, 388 | "metadata": {}, 389 | "outputs": [], 390 | "source": [ 391 | "%%bash\n", 392 | "cd .. && make -n requirements" 393 | ] 394 | }, 395 | { 396 | "cell_type": "markdown", 397 | "metadata": {}, 398 | "source": [ 399 | "We use a cute **self-documenting makefiles trick** (borrowed from `cookiecutter-datascience`) to make it easy to document the various targets that you add. This documentation is produced when you type a plain `make`:" 400 | ] 401 | }, 402 | { 403 | "cell_type": "code", 404 | "execution_count": null, 405 | "metadata": {}, 406 | "outputs": [], 407 | "source": [ 408 | "%%bash\n", 409 | "cd .. && make" 410 | ] 411 | }, 412 | { 413 | "cell_type": "markdown", 414 | "metadata": {}, 415 | "source": [ 416 | "### Under the Hood: The Format of a Makefile\n", 417 | "\n", 418 | "```\n", 419 | "## Comment to appear in the auto-generated documentation\n", 420 | "thing_to_build: space separated list of dependencies\n", 421 | "\tcommand_to_run # there is a tab before this command.\n", 422 | "\tanother_command_to_run # every line gets run in a *new shell*\n", 423 | "```\n", 424 | "\n" 425 | ] 426 | }, 427 | { 428 | "cell_type": "markdown", 429 | "metadata": {}, 430 | "source": [ 431 | "### Exercise 8: What does this makefile print when you run `make train`?" 432 | ] 433 | }, 434 | { 435 | "cell_type": "code", 436 | "execution_count": null, 437 | "metadata": {}, 438 | "outputs": [], 439 | "source": [ 440 | "%%file Makefile.test\n", 441 | "\n", 442 | "data: raw\n", 443 | "\t@echo \"Build Datasets\"\n", 444 | "train_test_split:\n", 445 | "\t@echo \"do train/test split\"\n", 446 | "train: data transform_data train_test_split\n", 447 | "\t@echo \"Train Models\"\n", 448 | "transform_data:\n", 449 | "\t@echo \"do a data transformation\"\n", 450 | "raw:\n", 451 | "\t@echo \"Fetch raw data\"\n" 452 | ] 453 | }, 454 | { 455 | "cell_type": "markdown", 456 | "metadata": {}, 457 | "source": [ 458 | "Note: If you see: ```*** missing separator. Stop.``` it's because you have used spaces instead of **tabs** before your commands. 
" 459 | ] 460 | }, 461 | { 462 | "cell_type": "code", 463 | "execution_count": null, 464 | "metadata": {}, 465 | "outputs": [], 466 | "source": [] 467 | }, 468 | { 469 | "cell_type": "code", 470 | "execution_count": null, 471 | "metadata": {}, 472 | "outputs": [], 473 | "source": [] 474 | }, 475 | { 476 | "cell_type": "code", 477 | "execution_count": null, 478 | "metadata": {}, 479 | "outputs": [], 480 | "source": [] 481 | }, 482 | { 483 | "cell_type": "code", 484 | "execution_count": null, 485 | "metadata": {}, 486 | "outputs": [], 487 | "source": [] 488 | }, 489 | { 490 | "cell_type": "markdown", 491 | "metadata": {}, 492 | "source": [ 493 | "### Exercise 9: What happens when you add a cycle to a Makefile\n", 494 | "Set up a makefile with a cyclic dependency and run it" 495 | ] 496 | }, 497 | { 498 | "cell_type": "code", 499 | "execution_count": null, 500 | "metadata": {}, 501 | "outputs": [], 502 | "source": [] 503 | }, 504 | { 505 | "cell_type": "code", 506 | "execution_count": null, 507 | "metadata": {}, 508 | "outputs": [], 509 | "source": [] 510 | }, 511 | { 512 | "cell_type": "markdown", 513 | "metadata": {}, 514 | "source": [ 515 | "Using a Makefile like this is an easy way to set up a process flow expressed as a Directed Acyclic Graph (DAG).\n", 516 | "\n", 517 | "**Note**: We have only scratched the surface here. The are lots of interesting tricks you can do with make.\n", 518 | "* http://zmjones.com/make/\n", 519 | "* http://blog.byronjsmith.com/makefile-shortcuts.html\n", 520 | "* https://www.gnu.org/software/make/manual/\n" 521 | ] 522 | }, 523 | { 524 | "cell_type": "code", 525 | "execution_count": null, 526 | "metadata": {}, 527 | "outputs": [], 528 | "source": [] 529 | }, 530 | { 531 | "cell_type": "markdown", 532 | "metadata": {}, 533 | "source": [ 534 | "## Back to Revision Control: git workflows\n", 535 | "\n", 536 | "Git isn't really a collaboration tool. It's more a tool for implementing collaboration workflows.\n", 537 | "\n", 538 | "What do we mean by workflow? A process built on top of git that incorporates **pull requests** and **branches**. Typically, this is provided by sites like: GitHub, GitLab, BitBucket.\n" 539 | ] 540 | }, 541 | { 542 | "cell_type": "markdown", 543 | "metadata": {}, 544 | "source": [ 545 | "#### Some useful references if `gitflow` isn't second nature to you yet\n", 546 | "* Introduction to GitHub tutorial: https://lab.github.com/githubtraining/introduction-to-github\n", 547 | "* Git Handbook: https://guides.github.com/introduction/git-handbook/\n", 548 | "* GitHub workflow cheatsheet: https://github.com/hackalog/bus_number/wiki/Github-Workflow-Cheat-Sheet" 549 | ] 550 | }, 551 | { 552 | "cell_type": "markdown", 553 | "metadata": {}, 554 | "source": [ 555 | "## Life Rules for using `git`\n", 556 | "\n", 557 | "* Always work on a branch: `git checkout -b my_branch_name`. Delete branches once they are merged.\n", 558 | "* **Never** push to master. Always **work on a branch** and do a pull request.\n", 559 | "* Seriously, don't do work on master if you are collaborating with **anyone**.\n", 560 | "* If you pushed it anywhere, or shared it with anyone, don't `git rebase`. In fact, if you're reading this, don't `git rebase`. 
Save that for when you are comfortable solving git merge nightmares on your own.\n" 561 | ] 562 | }, 563 | { 564 | "cell_type": "markdown", 565 | "metadata": {}, 566 | "source": [ 567 | "### Exercise 10: \n", 568 | "\n", 569 | "Create a GitHub/GitLab/BitBucket repo and sync your repo to it.\n" 570 | ] 571 | }, 572 | { 573 | "cell_type": "code", 574 | "execution_count": null, 575 | "metadata": {}, 576 | "outputs": [], 577 | "source": [] 578 | }, 579 | { 580 | "cell_type": "code", 581 | "execution_count": null, 582 | "metadata": {}, 583 | "outputs": [], 584 | "source": [] 585 | }, 586 | { 587 | "cell_type": "code", 588 | "execution_count": null, 589 | "metadata": {}, 590 | "outputs": [], 591 | "source": [] 592 | }, 593 | { 594 | "cell_type": "markdown", 595 | "metadata": {}, 596 | "source": [ 597 | "### Exercise 11:\n", 598 | "* Create a branch called `add_sklearn`\n", 599 | "* Add a scikit-learn dependency\n", 600 | "* Check in these changes using git to your local repo\n", 601 | "* Push the new branch to GitHub\n", 602 | "* Create a pull request to merge this branch into master\n", 603 | "* Merge your PR (delete the branch afterwards)\n", 604 | "* Sync your local repo with GitHub, including deleting the merged branches" 605 | ] 606 | }, 607 | { 608 | "cell_type": "code", 609 | "execution_count": null, 610 | "metadata": {}, 611 | "outputs": [], 612 | "source": [] 613 | }, 614 | { 615 | "cell_type": "code", 616 | "execution_count": null, 617 | "metadata": {}, 618 | "outputs": [], 619 | "source": [] 620 | }, 621 | { 622 | "cell_type": "code", 623 | "execution_count": null, 624 | "metadata": {}, 625 | "outputs": [], 626 | "source": [] 627 | }, 628 | { 629 | "cell_type": "markdown", 630 | "metadata": {}, 631 | "source": [ 632 | "## Python Modules\n", 633 | "By default, we keep our source code in a module called `src`. 
(this can be overridden in the cookieccutter)\n", 634 | "\n", 635 | "This is enabled via one line in `environment.yml`:\n", 636 | "```\n", 637 | "- pip:\n", 638 | " - -e .\n", 639 | "```\n", 640 | "\n", 641 | "This creates an **editable module**, and looks in the current directory for a file called `setup.py` to indicate the module name and location" 642 | ] 643 | }, 644 | { 645 | "cell_type": "raw", 646 | "metadata": {}, 647 | "source": [ 648 | "# %load ../setup.py\n", 649 | "from setuptools import find_packages, setup\n", 650 | "\n", 651 | "setup(\n", 652 | " name='src',\n", 653 | " packages=find_packages(),\n", 654 | " version='0.0.1',\n", 655 | " description='Up Your Bus Number: A Primer for Reproducible Data Science',\n", 656 | " author='Tutte Institute for Mathematics and Computing',\n", 657 | " license='MIT',\n", 658 | ")\n" 659 | ] 660 | }, 661 | { 662 | "cell_type": "markdown", 663 | "metadata": {}, 664 | "source": [ 665 | "This lets you easily use your code in notebooks and other scripts, and avoids any `sys.path.append` silliness" 666 | ] 667 | }, 668 | { 669 | "cell_type": "markdown", 670 | "metadata": {}, 671 | "source": [ 672 | "### ASIDE: Semantic Versioning\n", 673 | "\n", 674 | "Semantic versioning (or *semver*), refers to the convention of versioning with a triple:\n", 675 | "\n", 676 | " MAJOR.MINOR.PATCH\n", 677 | "\n", 678 | "With the following convention: when releasing new versions, increment the:\n", 679 | "\n", 680 | "* MAJOR version when you make **incompatible API changes**,\n", 681 | "* MINOR version when you **add functionality** in a backwards-compatible manner, and\n", 682 | "* PATCH version when you make backwards-compatible **bug fixes**.\n", 683 | "\n", 684 | "If you have no other plan, this is a great convention to follow.\n", 685 | "\n", 686 | "For an obscene amount of detail on this concept, see https://semver.org/" 687 | ] 688 | }, 689 | { 690 | "cell_type": "code", 691 | "execution_count": null, 692 | "metadata": {}, 693 | "outputs": [], 694 | "source": [] 695 | }, 696 | { 697 | "cell_type": "markdown", 698 | "metadata": {}, 699 | "source": [ 700 | "### Exercise 11:\n", 701 | "* add your favorite utility function to `src/utils`\n", 702 | "* increment the version number of the editable package\n", 703 | "* run `make requirements` (required if you added dependencies for your utility function)\n", 704 | "* import your utility function and run it from this notebook" 705 | ] 706 | }, 707 | { 708 | "cell_type": "code", 709 | "execution_count": null, 710 | "metadata": {}, 711 | "outputs": [], 712 | "source": [] 713 | }, 714 | { 715 | "cell_type": "code", 716 | "execution_count": null, 717 | "metadata": {}, 718 | "outputs": [], 719 | "source": [] 720 | }, 721 | { 722 | "cell_type": "code", 723 | "execution_count": null, 724 | "metadata": {}, 725 | "outputs": [], 726 | "source": [] 727 | }, 728 | { 729 | "cell_type": "markdown", 730 | "metadata": {}, 731 | "source": [ 732 | "## Testing: doctest, pytest, coverage\n", 733 | "\n", 734 | "\n", 735 | "Python has built in testing frameworks via:\n", 736 | "* doctests:https://docs.python.org/3/library/doctest.html#module-doctest\n", 737 | "* unittest: https://docs.python.org/3/library/unittest.html\n", 738 | "\n", 739 | "Additionally, you'll want to make regular use of:\n", 740 | "* pytest: https://docs.pytest.org/en/latest/\n", 741 | "* pytest-cov: https://pypi.org/project/pytest-cov/\n", 742 | "* hypothesis: https://hypothesis.readthedocs.io/en/latest\n", 743 | "\n", 744 | "Cookiecutter (vanilla flavoured) comes 
witha setup for the `tox` testing framework built in.\n", 745 | "* https://tox.readthedocs.io/en/latest/" 746 | ] 747 | }, 748 | { 749 | "cell_type": "markdown", 750 | "metadata": {}, 751 | "source": [ 752 | "### Exercise 12:\n", 753 | "\n", 754 | "Add a `make test` target to your makefile that:\n", 755 | "* runs doctests\n", 756 | "* runs pytest unit tests\n", 757 | "* (extra credit) Displays test coverage results\n", 758 | " \n", 759 | "When you run `make test`, you will find tests that will fail in `src/test_example.py`. Fix them in the next exercise." 760 | ] 761 | }, 762 | { 763 | "cell_type": "code", 764 | "execution_count": null, 765 | "metadata": {}, 766 | "outputs": [], 767 | "source": [] 768 | }, 769 | { 770 | "cell_type": "code", 771 | "execution_count": null, 772 | "metadata": {}, 773 | "outputs": [], 774 | "source": [] 775 | }, 776 | { 777 | "cell_type": "code", 778 | "execution_count": null, 779 | "metadata": {}, 780 | "outputs": [], 781 | "source": [ 782 | "!cd .. && make test" 783 | ] 784 | }, 785 | { 786 | "cell_type": "markdown", 787 | "metadata": {}, 788 | "source": [ 789 | "***Note:*** `make test` is normally functionality built into `cookiecutter-easydata`. We're building it from scratch here for the sake of practice." 790 | ] 791 | }, 792 | { 793 | "cell_type": "markdown", 794 | "metadata": {}, 795 | "source": [ 796 | "### Exercise 13:\n", 797 | "Fix the failing tests" 798 | ] 799 | }, 800 | { 801 | "cell_type": "code", 802 | "execution_count": null, 803 | "metadata": {}, 804 | "outputs": [], 805 | "source": [] 806 | }, 807 | { 808 | "cell_type": "code", 809 | "execution_count": null, 810 | "metadata": {}, 811 | "outputs": [], 812 | "source": [] 813 | }, 814 | { 815 | "cell_type": "code", 816 | "execution_count": null, 817 | "metadata": {}, 818 | "outputs": [], 819 | "source": [ 820 | "# Should pass all tests now!\n", 821 | "!cd .. 
&& make test" 822 | ] 823 | }, 824 | { 825 | "cell_type": "markdown", 826 | "metadata": {}, 827 | "source": [ 828 | "\n" 829 | ] 830 | }, 831 | { 832 | "cell_type": "markdown", 833 | "metadata": {}, 834 | "source": [ 835 | "### Exercise 14:\n", 836 | "* Check in all your changes to git\n", 837 | "* Merge them into your master branch via a PR in GitHub" 838 | ] 839 | }, 840 | { 841 | "cell_type": "code", 842 | "execution_count": null, 843 | "metadata": {}, 844 | "outputs": [], 845 | "source": [] 846 | }, 847 | { 848 | "cell_type": "code", 849 | "execution_count": null, 850 | "metadata": {}, 851 | "outputs": [], 852 | "source": [] 853 | }, 854 | { 855 | "cell_type": "code", 856 | "execution_count": null, 857 | "metadata": {}, 858 | "outputs": [], 859 | "source": [ 860 | "!git status" 861 | ] 862 | }, 863 | { 864 | "cell_type": "code", 865 | "execution_count": null, 866 | "metadata": {}, 867 | "outputs": [], 868 | "source": [] 869 | } 870 | ], 871 | "metadata": { 872 | "kernelspec": { 873 | "display_name": "Python 3", 874 | "language": "python", 875 | "name": "python3" 876 | }, 877 | "language_info": { 878 | "codemirror_mode": { 879 | "name": "ipython", 880 | "version": 3 881 | }, 882 | "file_extension": ".py", 883 | "mimetype": "text/x-python", 884 | "name": "python", 885 | "nbconvert_exporter": "python", 886 | "pygments_lexer": "ipython3", 887 | "version": "3.7.2" 888 | } 889 | }, 890 | "nbformat": 4, 891 | "nbformat_minor": 2 892 | } 893 | -------------------------------------------------------------------------------- /notebooks/11-reproducible-environment-solutions.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Tutorial 1: Reproducible Environments\n", 8 | "(Continued from `README.md`)\n", 9 | "\n", 10 | "## Overview\n", 11 | "\n", 12 | "* Requirements: The Bare Minimum \n", 13 | "\n", 14 | "* Using a Data Science Template: `cookiecutter`\n", 15 | "\n", 16 | "* Virtual Environments: `conda` and environment files\n", 17 | "* Revision Control: git and a git workflow\n", 18 | " * Installing, Enabling, and using nbdime\n", 19 | "* The Data Science DAG\n", 20 | " * make, Makefiles and data flow\n", 21 | "* Python Modules\n", 22 | " * Creating an editable module\n", 23 | "* Testing: doctest, pytest, hypothesis" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "We'll start out by checking that all the requirements are met from the previous exercises (started in `README.md`)" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "### Exercise 1: Install the requirements\n", 38 | "\n", 39 | "* Anaconda\n", 40 | "* Cookiecutter\n", 41 | "* make\n", 42 | "* git" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "### Test your installation" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "!conda --version # or `$CONDA_EXE --version` in some environments" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "!make --version" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "!git --version" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 
82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "!cookiecutter --version" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "### Exercise 2: Start your cookiecutter-based project\n", 93 | "\n", 94 | "Create a project called `bus_number_tutorial`:\n", 95 | "\n", 96 | " Use conda as your virtualenv manager\n", 97 | " Use python 3.6 or greater\n", 98 | "\n", 99 | "When complete, you should have a fully populated project directory, complete with customized README.md.\n", 100 | "\n", 101 | "We will be working in this project from now on." 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "metadata": {}, 107 | "source": [ 108 | "### Solution 2\n", 109 | "\n", 110 | "
\n",
111 |     " $ cookiecutter https://github.com/hackalog/cookiecutter-easydata.git --checkout bus_number\n",
 112 |     "\n",
 113 |     "project_name [project_name]: bus_number_tutorial\n",
 114 |     "repo_name [bus_number]: \n",
 115 |     "module_name [src]: \n",
 116 |     "author_name [Your name (or your organization/company/team)]: Kjell Wooding\n",
 117 |     "description [A short description of this project.]: Reproducible Data Science\n",
 118 |     "Select open_source_license:\n",
 119 |     "1 - MIT\n",
 120 |     "2 - BSD-2-Clause\n",
 121 |     "3 - Proprietary\n",
 122 |     "Choose from 1, 2, 3 [1]: \n",
 123 |     "s3_bucket [[OPTIONAL] your-bucket-for-syncing-data (do not include 's3://')]: \n",
 124 |     "aws_profile [default]: \n",
 125 |     "Select virtualenv:\n",
 126 |     "1 - conda\n",
 127 |     "2 - virtualenv\n",
 128 |     "Choose from 1, 2 [1]: \n",
 129 |     "Select python_interpreter:\n",
 130 |     "1 - python3\n",
 131 |     "2 - python\n",
 132 |     "Choose from 1, 2 [1]: \n",
 133 |     "\n",
 134 |     "\n",
 135 |     " $ cd bus_number_tutorial\n",
 136 |     "\n",
 137 |     "
" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "### Exercise 2b:\n", 145 | "\n", 146 | "Explore the `README.md` from your new `bus_number_tutorial` project\n", 147 | "\n", 148 | "(Hint: You can use the `%load` magic, or `!cat` to look at it in your notebook)" 149 | ] 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "metadata": {}, 154 | "source": [ 155 | "### Solution 2b:" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "# %load ../README.md" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": {}, 170 | "source": [ 171 | "bus_number_tutorial\n", 172 | "==============================\n", 173 | "\n", 174 | "Reproducible data science tutorial\n", 175 | "\n", 176 | "GETTING STARTED\n", 177 | "---------------\n", 178 | "\n", 179 | "For complete instructions, visit: https://github.com/hackalog/bus_number/wiki/Getting-Started\n", 180 | "\n", 181 | "* Create and switch to the virtual environment:\n", 182 | "```\n", 183 | "cd bus_number_tutorial\n", 184 | "make create_environment\n", 185 | "conda activate bus_number_tutorial\n", 186 | "make requirements\n", 187 | "```\n", 188 | "* Explore the notebooks in the `notebooks` directory\n", 189 | "\n", 190 | "Project Organization\n", 191 | "------------\n", 192 | "* `LICENSE`\n", 193 | "* `Makefile`\n", 194 | " * top-level makefile. Type `make` for a list of valid commands\n", 195 | "* `README.md`\n", 196 | " * this file\n", 197 | "* `data`\n", 198 | " * Data directory. often symlinked to a filesystem with lots of space\n", 199 | " * `data/raw`\n", 200 | " * Raw (immutable) hash-verified downloads\n", 201 | " * `data/interim`\n", 202 | " * Extracted and interim data representations\n", 203 | " * `data/processed`\n", 204 | " * The final, canonical data sets for modeling.\n", 205 | "* `docs`\n", 206 | " * A default Sphinx project; see sphinx-doc.org for details\n", 207 | "* `models`\n", 208 | " * Trained and serialized models, model predictions, or model summaries\n", 209 | " * `models/trained`\n", 210 | " * Trained models\n", 211 | " * `models/output`\n", 212 | " * predictions and transformations from the trained models\n", 213 | "* `notebooks`\n", 214 | " * Jupyter notebooks. Naming convention is a number (for ordering),\n", 215 | " the creator's initials, and a short `-` delimited description,\n", 216 | " e.g. `1.0-jqp-initial-data-exploration`.\n", 217 | "* `references`\n", 218 | " * Data dictionaries, manuals, and all other explanatory materials.\n", 219 | "* `reports`\n", 220 | " * Generated analysis as HTML, PDF, LaTeX, etc.\n", 221 | " * `reports/figures`\n", 222 | " * Generated graphics and figures to be used in reporting\n", 223 | " * `reports/tables`\n", 224 | " * Generated data tables to be used in reporting\n", 225 | " * `reports/summary`\n", 226 | " * Generated summary information to be used in reporting\n", 227 | "* `requirements.txt`\n", 228 | " * (if using pip+virtualenv) The requirements file for reproducing the\n", 229 | " analysis environment, e.g. 
generated with `pip freeze > requirements.txt`\n", 230 | "* `environment.yml`\n", 231 | " * (if using conda) The YAML file for reproducing the analysis environment\n", 232 | "* `setup.py`\n", 233 | " * Turns contents of `src` into a\n", 234 | " pip-installable python module (`pip install -e .`) so it can be\n", 235 | " imported in python code\n", 236 | "* `src`\n", 237 | " * Source code for use in this project.\n", 238 | " * `src/__init__.py`\n", 239 | " * Makes src a Python module\n", 240 | " * `src/data`\n", 241 | " * Scripts to fetch or generate data. In particular:\n", 242 | " * `src/data/make_dataset.py`\n", 243 | " * Run with `python -m src.data.make_dataset fetch`\n", 244 | " or `python -m src.data.make_dataset process`\n", 245 | " * `src/analysis`\n", 246 | " * Scripts to turn datasets into output products\n", 247 | " * `src/models`\n", 248 | " * Scripts to train models and then use trained models to make predictions.\n", 249 | " e.g. `predict_model.py`, `train_model.py`\n", 250 | "* `tox.ini`\n", 251 | " * tox file with settings for running tox; see tox.testrun.org\n", 252 | "\n", 253 | "\n", 254 | "--------\n", 255 | "\n", 256 | "

This project was built using cookiecutter-easydata, an experimental fork of [cookiecutter-data-science](https://github.com/drivendata/cookiecutter-data-science) aimed at making your data science workflow reproducible.

\n" 257 | ] 258 | }, 259 | { 260 | "cell_type": "markdown", 261 | "metadata": {}, 262 | "source": [ 263 | "### Exercise 3: Set up your virtual environment and install all dependencies\n", 264 | "\n", 265 | "Create and activate your `bus_number_tutorial` conda environment using the above make commands." 266 | ] 267 | }, 268 | { 269 | "cell_type": "markdown", 270 | "metadata": {}, 271 | "source": [ 272 | "Your `active environment` should be `bus_number_tutorial`\n" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": null, 278 | "metadata": {}, 279 | "outputs": [], 280 | "source": [ 281 | "!conda info" 282 | ] 283 | }, 284 | { 285 | "cell_type": "markdown", 286 | "metadata": {}, 287 | "source": [ 288 | "If done correctly, you should also be able to import from `src`" 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": null, 294 | "metadata": {}, 295 | "outputs": [], 296 | "source": [ 297 | "# if importing src doesn't work, try `make requirements`\n", 298 | "import src" 299 | ] 300 | }, 301 | { 302 | "cell_type": "markdown", 303 | "metadata": {}, 304 | "source": [ 305 | "### Exercise 4: Pick up this tutorial in your new repo\n", 306 | "\n", 307 | "* Copy the notebooks from `bus_number` into your new `bus_number_tutorial` repo\n", 308 | "* Run jupyter notebook and open `notebooks/10-reproducible-environment.ipynb`\n", 309 | "\n", 310 | "If you're currently running this notebook and the checks from the previous exercises worked, then you're in business!\n", 311 | "\n", 312 | "Keep going from here!" 313 | ] 314 | }, 315 | { 316 | "cell_type": "markdown", 317 | "metadata": {}, 318 | "source": [ 319 | "## Revision Control: `git`\n", 320 | "\n", 321 | "How do we keep track of our changes? We use **git**.\n", 322 | "\n", 323 | "Before we do anything interesting, let's initialize a git repository (repo) here.\n" 324 | ] 325 | }, 326 | { 327 | "cell_type": "markdown", 328 | "metadata": {}, 329 | "source": [ 330 | "### Exercise 5: Initialize a git repo for `bus_number_tutorial`" 331 | ] 332 | }, 333 | { 334 | "cell_type": "markdown", 335 | "metadata": {}, 336 | "source": [ 337 | "```\n", 338 | "git init\n", 339 | "git add .\n", 340 | "git commit -m \"Initial Import\"\n", 341 | "```" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": null, 347 | "metadata": {}, 348 | "outputs": [], 349 | "source": [ 350 | "!git status" 351 | ] 352 | }, 353 | { 354 | "cell_type": "markdown", 355 | "metadata": {}, 356 | "source": [ 357 | "You should see: \n", 358 | " \n", 359 | " # On branch master\n", 360 | " nothing to commit, working directory clean\n" 361 | ] 362 | }, 363 | { 364 | "cell_type": "markdown", 365 | "metadata": {}, 366 | "source": [ 367 | "We will get back to using git again soon." 
368 | ] 369 | }, 370 | { 371 | "cell_type": "markdown", 372 | "metadata": {}, 373 | "source": [ 374 | "### Exercise 6: Add a dependency\n", 375 | "Modify the environment file so that `make requirements` installs some additional packages\n", 376 | "* install `scikit-learn` using conda\n", 377 | "* install `nbdime` using pip" 378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": null, 383 | "metadata": {}, 384 | "outputs": [], 385 | "source": [ 386 | "# Check that you now have scikit-learn and nbdime installed\n", 387 | "# Don't forget that you need to run `make requirements` once you've change the `environment.yml` file\n", 388 | "import sklearn\n", 389 | "import nbdime" 390 | ] 391 | }, 392 | { 393 | "cell_type": "markdown", 394 | "metadata": {}, 395 | "source": [ 396 | "### Solution 6\n", 397 | "Your `environment.yml` should look like this now:\n", 398 | "
\n",
 399 |     "name: bus_number_tutorial\n",
 400 |     "channels:\n",
 401 |     "  - conda-forge\n",
 402 |     "dependencies:\n",
 403 |     "  - pip\n",
 404 |     "  - pip:\n",
 405 |     "    - -e .\n",
 406 |     "    - python-dotenv>=0.5.1\n",
 407 |     "    - nbdime\n",
 408 |     "  - setuptools\n",
 409 |     "  - wheel\n",
 410 |     "  - sphinx\n",
 411 |     "  - click\n",
 412 |     "  - coverage\n",
 413 |     "  - pytest-cov\n",
 414 |     "  - jupyter\n",
 415 |     "  - joblib\n",
 416 |     "  - nb_conda\n",
 417 |     "  - nbval\n",
 418 |     "  - pandas\n",
 419 |     "  - requests\n",
 420 |     "  - python>=3.6\n",
 421 |     "
" 422 | ] 423 | }, 424 | { 425 | "cell_type": "code", 426 | "execution_count": null, 427 | "metadata": {}, 428 | "outputs": [], 429 | "source": [ 430 | "# you should be able to see the difference via git\n", 431 | "!git diff ../environment.yml" 432 | ] 433 | }, 434 | { 435 | "cell_type": "markdown", 436 | "metadata": {}, 437 | "source": [ 438 | "### Exercise 7: Basic git interactions" 439 | ] 440 | }, 441 | { 442 | "cell_type": "markdown", 443 | "metadata": {}, 444 | "source": [ 445 | "See what has changed with git:" 446 | ] 447 | }, 448 | { 449 | "cell_type": "code", 450 | "execution_count": null, 451 | "metadata": {}, 452 | "outputs": [], 453 | "source": [ 454 | "!git status" 455 | ] 456 | }, 457 | { 458 | "cell_type": "code", 459 | "execution_count": null, 460 | "metadata": {}, 461 | "outputs": [], 462 | "source": [ 463 | "!git diff -u ../environment.yml" 464 | ] 465 | }, 466 | { 467 | "cell_type": "markdown", 468 | "metadata": {}, 469 | "source": [ 470 | "Add or reject the changes incrementally" 471 | ] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "execution_count": null, 476 | "metadata": {}, 477 | "outputs": [], 478 | "source": [ 479 | "#!git add -p\n", 480 | "#!git reset -p\n" 481 | ] 482 | }, 483 | { 484 | "cell_type": "markdown", 485 | "metadata": {}, 486 | "source": [ 487 | "Commit the changes" 488 | ] 489 | }, 490 | { 491 | "cell_type": "code", 492 | "execution_count": null, 493 | "metadata": {}, 494 | "outputs": [], 495 | "source": [ 496 | "#!git commit -v" 497 | ] 498 | }, 499 | { 500 | "cell_type": "markdown", 501 | "metadata": {}, 502 | "source": [ 503 | "### Solution 7" 504 | ] 505 | }, 506 | { 507 | "cell_type": "code", 508 | "execution_count": null, 509 | "metadata": {}, 510 | "outputs": [], 511 | "source": [ 512 | "# You should have no differences in your branch now\n", 513 | "# Except for those that you've made by running notebooks\n", 514 | "!git status" 515 | ] 516 | }, 517 | { 518 | "cell_type": "markdown", 519 | "metadata": {}, 520 | "source": [ 521 | "## The Data Science DAG\n", 522 | "DAG = Directed Acyclic Graph. \n", 523 | "\n", 524 | "That means the process eventually stops. (This is a good thing!) \n", 525 | "\n", 526 | "It also means we can use a super old, but incredibly handy tool to implement this workflow: `make`.\n", 527 | "\n", 528 | "### Make, Makefiles, and the Data Flow\n", 529 | "\n", 530 | "\n", 531 | "We use a `Makefile` to organize and invoke the various steps in our Data Science pipeline.\n", 532 | "You have already used this file when you created your virtual environment in the first place:\n", 533 | "```\n", 534 | "make create_environment\n", 535 | "```\n", 536 | "Here are the steps we will be working through in this tutorial:\n", 537 | "\"Reproducible\n", 538 | "\n", 539 | "A [PDF version of the cheat sheet](references/cheat_sheet.pdf) is also available.\n" 540 | ] 541 | }, 542 | { 543 | "cell_type": "markdown", 544 | "metadata": {}, 545 | "source": [ 546 | "\n", 547 | "### What's my make target doing?\n", 548 | "If you are ever curious what commands a `make` command will invoke (including any invoked dependencies), use `make -n`, which lists the commands without executing them:" 549 | ] 550 | }, 551 | { 552 | "cell_type": "code", 553 | "execution_count": null, 554 | "metadata": {}, 555 | "outputs": [], 556 | "source": [ 557 | "%%bash\n", 558 | "cd .. 
&& make -n requirements" 559 | ] 560 | }, 561 | { 562 | "cell_type": "markdown", 563 | "metadata": {}, 564 | "source": [ 565 | "We use a cute **self-documenting makefiles trick** (borrowed from `cookiecutter-datascience`) to make it easy to document the various targets that you add. This documentation is produced when you type a plain `make`:" 566 | ] 567 | }, 568 | { 569 | "cell_type": "code", 570 | "execution_count": null, 571 | "metadata": {}, 572 | "outputs": [], 573 | "source": [ 574 | "%%bash\n", 575 | "cd .. && make" 576 | ] 577 | }, 578 | { 579 | "cell_type": "markdown", 580 | "metadata": {}, 581 | "source": [ 582 | "### Under the Hood: The Format of a Makefile\n", 583 | "\n", 584 | "```\n", 585 | "## Comment to appear in the auto-generated documentation\n", 586 | "thing_to_build: space separated list of dependencies\n", 587 | "\tcommand_to_run # there is a tab before this command.\n", 588 | "\tanother_command_to_run # every line gets run in a *new shell*\n", 589 | "```\n", 590 | "\n" 591 | ] 592 | }, 593 | { 594 | "cell_type": "markdown", 595 | "metadata": {}, 596 | "source": [ 597 | "### Exercise 8: What does this makefile print when you run `make train`?" 598 | ] 599 | }, 600 | { 601 | "cell_type": "code", 602 | "execution_count": null, 603 | "metadata": {}, 604 | "outputs": [], 605 | "source": [ 606 | "%%file Makefile.test\n", 607 | "\n", 608 | "data: raw\n", 609 | "\t@echo \"Build Datasets\"\n", 610 | "train_test_split:\n", 611 | "\t@echo \"do train/test split\"\n", 612 | "train: data transform_data train_test_split\n", 613 | "\t@echo \"Train Models\"\n", 614 | "transform_data:\n", 615 | "\t@echo \"do a data transformation\"\n", 616 | "raw:\n", 617 | "\t@echo \"Fetch raw data\"\n" 618 | ] 619 | }, 620 | { 621 | "cell_type": "markdown", 622 | "metadata": {}, 623 | "source": [ 624 | "Note: If you see: ```*** missing separator. Stop.``` it's because you have used spaces instead of **tabs** before your commands. 
" 625 | ] 626 | }, 627 | { 628 | "cell_type": "markdown", 629 | "metadata": {}, 630 | "source": [ 631 | "### Solution 8" 632 | ] 633 | }, 634 | { 635 | "cell_type": "code", 636 | "execution_count": null, 637 | "metadata": {}, 638 | "outputs": [], 639 | "source": [ 640 | "%%bash\n", 641 | "make -f Makefile.test train" 642 | ] 643 | }, 644 | { 645 | "cell_type": "code", 646 | "execution_count": null, 647 | "metadata": {}, 648 | "outputs": [], 649 | "source": [] 650 | }, 651 | { 652 | "cell_type": "markdown", 653 | "metadata": {}, 654 | "source": [ 655 | "### Exercise 9: What happens when you add a cycle to a Makefile\n", 656 | "Set up a makefile with a cyclic dependency and run it" 657 | ] 658 | }, 659 | { 660 | "cell_type": "markdown", 661 | "metadata": {}, 662 | "source": [ 663 | "### Solution 9" 664 | ] 665 | }, 666 | { 667 | "cell_type": "code", 668 | "execution_count": null, 669 | "metadata": {}, 670 | "outputs": [], 671 | "source": [ 672 | "%%file Makefile.test\n", 673 | "\n", 674 | "cycle: cycle_b\n", 675 | "\t@echo \"in a Makefile\"\n", 676 | "cycle_b: cycle_c\n", 677 | "\t@echo \"have a cycle\"\n", 678 | "cycle_c: cycle\n", 679 | "\t@echo \"You can't\"" 680 | ] 681 | }, 682 | { 683 | "cell_type": "code", 684 | "execution_count": null, 685 | "metadata": {}, 686 | "outputs": [], 687 | "source": [ 688 | "%%bash\n", 689 | "make -f Makefile.test cycle" 690 | ] 691 | }, 692 | { 693 | "cell_type": "markdown", 694 | "metadata": {}, 695 | "source": [ 696 | "Using a Makefile like this is an easy way to set up a process flow expressed as a Directed Acyclic Graph (DAG).\n", 697 | "\n", 698 | "**Note**: We have only scratched the surface here. The are lots of interesting tricks you can do with make.\n", 699 | "* http://zmjones.com/make/\n", 700 | "* http://blog.byronjsmith.com/makefile-shortcuts.html\n", 701 | "* https://www.gnu.org/software/make/manual/\n" 702 | ] 703 | }, 704 | { 705 | "cell_type": "code", 706 | "execution_count": null, 707 | "metadata": {}, 708 | "outputs": [], 709 | "source": [] 710 | }, 711 | { 712 | "cell_type": "markdown", 713 | "metadata": {}, 714 | "source": [ 715 | "## Back to Revision Control: git workflows\n", 716 | "\n", 717 | "Git isn't really a collaboration tool. It's more a tool for implementing collaboration workflows.\n", 718 | "\n", 719 | "What do we mean by workflow? A process built on top of git that incorporates **pull requests** and **branches**. Typically, this is provided by sites like: GitHub, GitLab, BitBucket.\n" 720 | ] 721 | }, 722 | { 723 | "cell_type": "markdown", 724 | "metadata": {}, 725 | "source": [ 726 | "#### Some useful references if `gitflow` isn't second nature to you yet\n", 727 | "* Introduction to GitHub tutorial: https://lab.github.com/githubtraining/introduction-to-github\n", 728 | "* Git Handbook: https://guides.github.com/introduction/git-handbook/\n", 729 | "* GitHub workflow cheatsheet: https://github.com/hackalog/bus_number/wiki/Github-Workflow-Cheat-Sheet" 730 | ] 731 | }, 732 | { 733 | "cell_type": "markdown", 734 | "metadata": {}, 735 | "source": [ 736 | "## Life Rules for using `git`\n", 737 | "\n", 738 | "* Always work on a branch: `git checkout -b my_branch_name`. Delete branches once they are merged.\n", 739 | "* **Never** push to master. Always **work on a branch** and do a pull request.\n", 740 | "* Seriously, don't do work on master if you are collaborating with **anyone**.\n", 741 | "* If you pushed it anywhere, or shared it with anyone, don't `git rebase`. In fact, if you're reading this, don't `git rebase`. 
Save that for when you are comfortable solving git merge nightmares on your own.\n" 742 | ] 743 | }, 744 | { 745 | "cell_type": "markdown", 746 | "metadata": {}, 747 | "source": [ 748 | "### Exercise 10: \n", 749 | "\n", 750 | "Create a GitHub/GitLab/BitBucket repo and sync your repo to it.\n" 751 | ] 752 | }, 753 | { 754 | "cell_type": "code", 755 | "execution_count": null, 756 | "metadata": {}, 757 | "outputs": [], 758 | "source": [] 759 | }, 760 | { 761 | "cell_type": "code", 762 | "execution_count": null, 763 | "metadata": {}, 764 | "outputs": [], 765 | "source": [] 766 | }, 767 | { 768 | "cell_type": "code", 769 | "execution_count": null, 770 | "metadata": {}, 771 | "outputs": [], 772 | "source": [] 773 | }, 774 | { 775 | "cell_type": "markdown", 776 | "metadata": {}, 777 | "source": [ 778 | "### Exercise 11:\n", 779 | "* Create a branch called `add_sklearn`\n", 780 | "* Add a scikit-learn dependency\n", 781 | "* Check in these changes using git to your local repo\n", 782 | "* Push the new branch to GitHub\n", 783 | "* Create a pull request to merge this branch into master\n", 784 | "* Merge your PR (delete the branch afterwards)\n", 785 | "* Sync your local repo with GitHub, including deleting the merged branches" 786 | ] 787 | }, 788 | { 789 | "cell_type": "code", 790 | "execution_count": null, 791 | "metadata": {}, 792 | "outputs": [], 793 | "source": [] 794 | }, 795 | { 796 | "cell_type": "code", 797 | "execution_count": null, 798 | "metadata": {}, 799 | "outputs": [], 800 | "source": [] 801 | }, 802 | { 803 | "cell_type": "code", 804 | "execution_count": null, 805 | "metadata": {}, 806 | "outputs": [], 807 | "source": [ 808 | "# You should now only have a branch called master\n", 809 | "!git branch" 810 | ] 811 | }, 812 | { 813 | "cell_type": "markdown", 814 | "metadata": {}, 815 | "source": [ 816 | "## Python Modules\n", 817 | "By default, we keep our source code in a module called `src`. 
(this can be overridden in the cookieccutter)\n", 818 | "\n", 819 | "This is enabled via one line in `environment.yml`:\n", 820 | "```\n", 821 | "- pip:\n", 822 | " - -e .\n", 823 | "```\n", 824 | "\n", 825 | "This creates an **editable module**, and looks in the current directory for a file called `setup.py` to indicate the module name and location" 826 | ] 827 | }, 828 | { 829 | "cell_type": "raw", 830 | "metadata": {}, 831 | "source": [ 832 | "# %load ../setup.py\n", 833 | "from setuptools import find_packages, setup\n", 834 | "\n", 835 | "setup(\n", 836 | " name='src',\n", 837 | " packages=find_packages(),\n", 838 | " version='0.0.1',\n", 839 | " description='Up Your Bus Number: A Primer for Reproducible Data Science',\n", 840 | " author='Tutte Institute for Mathematics and Computing',\n", 841 | " license='MIT',\n", 842 | ")\n" 843 | ] 844 | }, 845 | { 846 | "cell_type": "markdown", 847 | "metadata": {}, 848 | "source": [ 849 | "This lets you easily use your code in notebooks and other scripts, and avoids any `sys.path.append` silliness" 850 | ] 851 | }, 852 | { 853 | "cell_type": "markdown", 854 | "metadata": {}, 855 | "source": [ 856 | "### ASIDE: Semantic Versioning\n", 857 | "\n", 858 | "Semantic versioning (or *semver*), refers to the convention of versioning with a triple:\n", 859 | "\n", 860 | " MAJOR.MINOR.PATCH\n", 861 | "\n", 862 | "With the following convention: when releasing new versions, increment the:\n", 863 | "\n", 864 | "* MAJOR version when you make **incompatible API changes**,\n", 865 | "* MINOR version when you **add functionality** in a backwards-compatible manner, and\n", 866 | "* PATCH version when you make backwards-compatible **bug fixes**.\n", 867 | "\n", 868 | "If you have no other plan, this is a great convention to follow.\n", 869 | "\n", 870 | "For an obscene amount of detail on this concept, see https://semver.org/" 871 | ] 872 | }, 873 | { 874 | "cell_type": "code", 875 | "execution_count": null, 876 | "metadata": {}, 877 | "outputs": [], 878 | "source": [] 879 | }, 880 | { 881 | "cell_type": "markdown", 882 | "metadata": {}, 883 | "source": [ 884 | "### Exercise 12:\n", 885 | "* add your favorite utility function to `src/utils`\n", 886 | "* increment the version number of the editable package\n", 887 | "* run `make requirements` (required if you added dependencies for your utility function)\n", 888 | "* import your utility function and run it from this notebook" 889 | ] 890 | }, 891 | { 892 | "cell_type": "code", 893 | "execution_count": null, 894 | "metadata": {}, 895 | "outputs": [], 896 | "source": [] 897 | }, 898 | { 899 | "cell_type": "code", 900 | "execution_count": null, 901 | "metadata": {}, 902 | "outputs": [], 903 | "source": [] 904 | }, 905 | { 906 | "cell_type": "markdown", 907 | "metadata": {}, 908 | "source": [ 909 | "### Solution:\n" 910 | ] 911 | }, 912 | { 913 | "cell_type": "code", 914 | "execution_count": null, 915 | "metadata": {}, 916 | "outputs": [], 917 | "source": [ 918 | "%%file -a ../src/utils.py\n", 919 | "def read_space_delimited(filename, skiprows=None, class_labels=True, metadata=None):\n", 920 | " \"\"\"Read an space-delimited file\n", 921 | " \n", 922 | " Data is space-delimited. 
Last column is the (string) label for the data\n", 923 | "\n", 924 | " Note: we can't use automatic comment detection, as `#` characters are also\n", 925 | " used as data labels.\n", 926 | "\n", 927 | " Parameters\n", 928 | " ----------\n", 929 | " skiprows: list-like, int or callable, optional\n", 930 | " list of rows to skip when reading the file. See `pandas.read_csv`\n", 931 | " entry on `skiprows` for more\n", 932 | " class_labels: boolean\n", 933 | " if true, the last column is treated as the class (target) label\n", 934 | " \"\"\"\n", 935 | " with open(filename, 'r') as fd:\n", 936 | " df = pd.read_csv(fd, skiprows=skiprows, skip_blank_lines=True,\n", 937 | " comment=None, header=None, sep=' ', dtype=str)\n", 938 | " # targets are last column. Data is everything else\n", 939 | " if class_labels is True:\n", 940 | " target = df.loc[:, df.columns[-1]].values\n", 941 | " data = df.loc[:, df.columns[:-1]].values\n", 942 | " else:\n", 943 | " data = df.values\n", 944 | " target = np.zeros(data.shape[0])\n", 945 | " return data, target, metadata" 946 | ] 947 | }, 948 | { 949 | "cell_type": "code", 950 | "execution_count": null, 951 | "metadata": {}, 952 | "outputs": [], 953 | "source": [ 954 | "from src.utils import read_space_delimited" 955 | ] 956 | }, 957 | { 958 | "cell_type": "code", 959 | "execution_count": null, 960 | "metadata": {}, 961 | "outputs": [], 962 | "source": [ 963 | "read_space_delimited?" 964 | ] 965 | }, 966 | { 967 | "cell_type": "markdown", 968 | "metadata": {}, 969 | "source": [ 970 | "## Testing: doctest, pytest, coverage\n", 971 | "\n", 972 | "\n", 973 | "Python has built in testing frameworks via:\n", 974 | "* doctests:https://docs.python.org/3/library/doctest.html#module-doctest\n", 975 | "* unittest: https://docs.python.org/3/library/unittest.html\n", 976 | "\n", 977 | "Additionally, you'll want to make regular use of:\n", 978 | "* pytest: https://docs.pytest.org/en/latest/\n", 979 | "* pytest-cov: https://pypi.org/project/pytest-cov/\n", 980 | "* hypothesis: https://hypothesis.readthedocs.io/en/latest\n", 981 | "\n", 982 | "Cookiecutter (vanilla flavoured) comes witha setup for the `tox` testing framework built in.\n", 983 | "* https://tox.readthedocs.io/en/latest/" 984 | ] 985 | }, 986 | { 987 | "cell_type": "markdown", 988 | "metadata": {}, 989 | "source": [ 990 | "### Exercise 12:\n", 991 | "\n", 992 | "Add a `make test` target to your makefile that:\n", 993 | "* runs doctests\n", 994 | "* runs pytest unit tests\n", 995 | "* (extra credit) Displays test coverage results\n", 996 | " \n", 997 | "When you run `make test`, you will find tests that will fail in `src/test_example.py`. Fix them in the next exercise." 998 | ] 999 | }, 1000 | { 1001 | "cell_type": "markdown", 1002 | "metadata": {}, 1003 | "source": [ 1004 | "### Solution 12:\n", 1005 | "\n", 1006 | " test:\n", 1007 | " cd src && pytest --doctest-modules --verbose --cov" 1008 | ] 1009 | }, 1010 | { 1011 | "cell_type": "code", 1012 | "execution_count": null, 1013 | "metadata": {}, 1014 | "outputs": [], 1015 | "source": [ 1016 | "!cd .. && make -n test" 1017 | ] 1018 | }, 1019 | { 1020 | "cell_type": "code", 1021 | "execution_count": null, 1022 | "metadata": {}, 1023 | "outputs": [], 1024 | "source": [ 1025 | "!cd .. 
&& make test" 1026 | ] 1027 | }, 1028 | { 1029 | "cell_type": "markdown", 1030 | "metadata": {}, 1031 | "source": [ 1032 | " cd src && pytest --doctest-modules --verbose --cov\n", 1033 | " ============================= test session starts ==============================\n", 1034 | " platform linux -- Python 3.6.7, pytest-4.2.1, py-1.7.0, pluggy-0.8.1 -- /opt/software/anaconda3/envs/bus_number_tutorial/bin/python\n", 1035 | " cachedir: .pytest_cache\n", 1036 | " rootdir: ~/src/devel/bus_number_tutorial, inifile:\n", 1037 | " plugins: cov-2.6.1, nbval-0.9.1\n", 1038 | " collected 7 items \n", 1039 | "\n", 1040 | " test_example.py::src.test_example.addition FAILED [ 14%]\n", 1041 | " test_example.py::TestExercises::test_addition FAILED [ 28%]\n", 1042 | " data/fetch.py::src.data.fetch.available_hashes PASSED [ 42%]\n", 1043 | " data/fetch.py::src.data.fetch.fetch_file PASSED [ 57%]\n", 1044 | " data/fetch.py::src.data.fetch.fetch_files PASSED [ 71%]\n", 1045 | " data/fetch.py::src.data.fetch.get_dataset_filename PASSED [ 85%]\n", 1046 | " data/utils.py::src.data.utils.normalize_labels PASSED [100%]\n", 1047 | "\n", 1048 | " =================================== FAILURES ===================================\n", 1049 | " _____________________ [doctest] src.test_example.addition ______________________\n", 1050 | " 004 \n", 1051 | " 005 I'm a failing doctest. Please fix me.\n", 1052 | " 006 >>> addition(10, 12)\n", 1053 | " Expected:\n", 1054 | " 20\n", 1055 | " Got:\n", 1056 | " -2\n", 1057 | "\n", 1058 | " ~/src/devel/bus_number_tutorial/src/test_example.py:6: DocTestFailure\n", 1059 | " _________________________ TestExercises.test_addition __________________________\n", 1060 | "\n", 1061 | " self = \n", 1062 | "\n", 1063 | " def test_addition(self):\n", 1064 | " \"\"\"\n", 1065 | " I'm a failing unittest. 
Fix me.\n", 1066 | " \"\"\"\n", 1067 | " > assert subtraction(5, 5) == 0\n", 1068 | " E AssertionError: assert 10 == 0\n", 1069 | " E -10\n", 1070 | " E +0\n", 1071 | "\n", 1072 | " test_example.py:22: AssertionError\n", 1073 | "\n", 1074 | " ----------- coverage: platform linux, python 3.6.7-final-0 -----------\n", 1075 | " Name Stmts Miss Cover\n", 1076 | " ------------------------------------------------\n", 1077 | " __init__.py 0 0 100%\n", 1078 | " analysis/__init__.py 0 0 100%\n", 1079 | " analysis/analysis.py 105 86 18%\n", 1080 | " analysis/run_analysis.py 23 9 61%\n", 1081 | " data/__init__.py 4 0 100%\n", 1082 | " data/apply_transforms.py 27 12 56%\n", 1083 | " data/datasets.py 311 262 16%\n", 1084 | " data/fetch.py 143 109 24%\n", 1085 | " data/localdata.py 1 0 100%\n", 1086 | " data/make_dataset.py 15 4 73%\n", 1087 | " data/transform_data.py 88 72 18%\n", 1088 | " data/transformers.py 42 29 31%\n", 1089 | " data/utils.py 85 61 28%\n", 1090 | " features/__init__.py 0 0 100%\n", 1091 | " features/build_features.py 0 0 100%\n", 1092 | " logging.py 7 0 100%\n", 1093 | " models/__init__.py 3 0 100%\n", 1094 | " models/algorithms.py 5 4 20%\n", 1095 | " models/model_list.py 74 60 19%\n", 1096 | " models/predict.py 100 80 20%\n", 1097 | " models/predict_model.py 22 9 59%\n", 1098 | " models/train.py 54 39 28%\n", 1099 | " models/train_models.py 25 11 56%\n", 1100 | " paths.py 17 0 100%\n", 1101 | " test_example.py 8 0 100%\n", 1102 | " utils.py 58 45 22%\n", 1103 | " visualization/__init__.py 0 0 100%\n", 1104 | " visualization/visualize.py 0 0 100%\n", 1105 | " workflow.py 8 0 100%\n", 1106 | " ------------------------------------------------\n", 1107 | " TOTAL 1225 892 27%\n", 1108 | "\n", 1109 | " ====================== 2 failed, 5 passed in 1.69 seconds ======================\n", 1110 | " make: *** [test] Error 1" 1111 | ] 1112 | }, 1113 | { 1114 | "cell_type": "markdown", 1115 | "metadata": {}, 1116 | "source": [ 1117 | "***Note:*** `make test` is normally functionality built into `cookiecutter-easydata`. We're building it from scratch here for the sake of practice." 1118 | ] 1119 | }, 1120 | { 1121 | "cell_type": "markdown", 1122 | "metadata": {}, 1123 | "source": [ 1124 | "### Exercise 13:\n", 1125 | "Fix the failing tests" 1126 | ] 1127 | }, 1128 | { 1129 | "cell_type": "code", 1130 | "execution_count": null, 1131 | "metadata": {}, 1132 | "outputs": [], 1133 | "source": [] 1134 | }, 1135 | { 1136 | "cell_type": "code", 1137 | "execution_count": null, 1138 | "metadata": {}, 1139 | "outputs": [], 1140 | "source": [] 1141 | }, 1142 | { 1143 | "cell_type": "code", 1144 | "execution_count": null, 1145 | "metadata": {}, 1146 | "outputs": [], 1147 | "source": [ 1148 | "%%file ../src/test_example.py\n", 1149 | "import unittest\n", 1150 | "\n", 1151 | "def addition(n1, n2):\n", 1152 | " \"\"\"\n", 1153 | " I'm addition\n", 1154 | " >>> addition(10, 10)\n", 1155 | " 20\n", 1156 | " \"\"\"\n", 1157 | " return n1 + n2\n", 1158 | "\n", 1159 | "def subtraction(n1, n2):\n", 1160 | " \"\"\"\n", 1161 | " I'm subtraction.\n", 1162 | " \"\"\"\n", 1163 | " return n1 - n2\n", 1164 | "\n", 1165 | "class TestExercises(unittest.TestCase):\n", 1166 | " def test_subtraction(self):\n", 1167 | " \"\"\"\n", 1168 | " I'm a failing unittest. 
Fix me.\n", 1169 | " \"\"\"\n", 1170 | " assert subtraction(5, 5) == 0\n" 1171 | ] 1172 | }, 1173 | { 1174 | "cell_type": "code", 1175 | "execution_count": null, 1176 | "metadata": {}, 1177 | "outputs": [], 1178 | "source": [ 1179 | "# Should pass all tests now!\n", 1180 | "!cd .. && make test" 1181 | ] 1182 | }, 1183 | { 1184 | "cell_type": "markdown", 1185 | "metadata": {}, 1186 | "source": [ 1187 | "\n" 1188 | ] 1189 | }, 1190 | { 1191 | "cell_type": "markdown", 1192 | "metadata": {}, 1193 | "source": [ 1194 | "### Exercise 14:\n", 1195 | "* Check in all your changes to git\n", 1196 | "* Merge them into your master branch via a PR in GitHub" 1197 | ] 1198 | }, 1199 | { 1200 | "cell_type": "code", 1201 | "execution_count": null, 1202 | "metadata": {}, 1203 | "outputs": [], 1204 | "source": [ 1205 | "!git status" 1206 | ] 1207 | }, 1208 | { 1209 | "cell_type": "code", 1210 | "execution_count": null, 1211 | "metadata": {}, 1212 | "outputs": [], 1213 | "source": [] 1214 | } 1215 | ], 1216 | "metadata": { 1217 | "kernelspec": { 1218 | "display_name": "Python 3", 1219 | "language": "python", 1220 | "name": "python3" 1221 | }, 1222 | "language_info": { 1223 | "codemirror_mode": { 1224 | "name": "ipython", 1225 | "version": 3 1226 | }, 1227 | "file_extension": ".py", 1228 | "mimetype": "text/x-python", 1229 | "name": "python", 1230 | "nbconvert_exporter": "python", 1231 | "pygments_lexer": "ipython3", 1232 | "version": "3.7.2" 1233 | } 1234 | }, 1235 | "nbformat": 4, 1236 | "nbformat_minor": 2 1237 | } 1238 | -------------------------------------------------------------------------------- /notebooks/20-creating-datasets.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 2.0: Reproducible Data Sources\n", 8 | "\"In God we trust. All others must bring data.” – W. Edwards Deming\"" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "%load_ext autoreload\n", 18 | "%autoreload 2" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "import logging\n", 28 | "from src.logging import logger\n", 29 | "logger.setLevel(logging.INFO)" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "# Introducing the `DataSource`\n", 37 | "The `DataSource` object handles downloading, unpacking, and processing raw data files, and serves as a container for some basic metadata about the raw data, including **documentation** and **license** information.\n", 38 | "\n", 39 | "Raw data files are downloaded to `paths.raw_data_path`.\n", 40 | " Cache files and unpacked raw files are saved to `paths.interim_data_path`.\n", 41 | " " 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "## Example: LVQ-Pak, a Finnish phonetic dataset\n", 49 | "The Learning Vector Quantization (lvq-pak) project includes a simple Finnish phonetic dataset\n", 50 | "consisting 20-dimensional Mel Frequency Cepstrum Coefficients (MFCCs) labelled with target phoneme information. Our goal is to explore this dataset, process it into a useful form, and make it a part of a reproducible data science workflow. 
The project can be found at: http://www.cis.hut.fi/research/lvq_pak/\n", 51 | "\n" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "\n", 59 | "For this example, we are going create a `DataSource` for the LVQ-Pak dataset. The process will consist of\n", 60 | "1. Downloading and unpacking the raw data files. \n", 61 | "2. Generating (and recording) hash values for these files.\n", 62 | "3. Adding LICENSE and DESCR (description) metadata to this DataSource\n", 63 | "4. Adding the complete `DataSource` to the Catalog \n" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "### Downloading Raw Data Source Files" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "from src.data import DataSource\n", 80 | "from src.utils import list_dir\n", 81 | "from src import paths" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "# Create a data source object\n", 91 | "datasource_name = 'lvq-pak'\n", 92 | "dsrc = DataSource(datasource_name)" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "# Add URL(s) for raw data files\n", 102 | "dsrc.add_url(\"http://www.cis.hut.fi/research/lvq_pak/lvq_pak-3.1.tar\")" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "# Fetch the files\n", 112 | "logger.setLevel(logging.DEBUG)\n", 113 | "dsrc.fetch()" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": {}, 119 | "source": [ 120 | "By default, data files are downloaded to the `paths.raw_data_path` directory:" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [ 129 | "!ls -la $paths.raw_data_path" 130 | ] 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "metadata": {}, 135 | "source": [ 136 | "Since we did not specify a hash, or target filename, these are inferred from the downloaded file:" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "dsrc.file_list" 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": {}, 151 | "source": [ 152 | "### Remove a file from the file_list" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [ 161 | "# Note that if we add a url again, we end up with more of the same file in the file list\n", 162 | "dsrc.add_url(\"http://www.cis.hut.fi/research/lvq_pak/lvq_pak-3.1.tar\")" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "dsrc.file_list" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [ 180 | "dsrc.fetch()" 181 | ] 182 | }, 183 | { 184 | "cell_type": "markdown", 185 | "metadata": {}, 186 | "source": [ 187 | "Fetch is smart enough to not redownload the same file in this case. Still, this is messy and cumbersome. We can remove entries by removing them from the `file_list`." 
188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 193 | "metadata": {}, 194 | "outputs": [], 195 | "source": [ 196 | "dsrc.file_list.pop(1)" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "metadata": {}, 203 | "outputs": [], 204 | "source": [ 205 | "dsrc.file_list" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": null, 211 | "metadata": {}, 212 | "outputs": [], 213 | "source": [ 214 | "dsrc.fetch(force=True)" 215 | ] 216 | }, 217 | { 218 | "cell_type": "markdown", 219 | "metadata": {}, 220 | "source": [ 221 | "### Sometimes we make mistakes when entering information" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": null, 227 | "metadata": {}, 228 | "outputs": [], 229 | "source": [ 230 | "dsrc.add_url(\"http://www.cis.hut.fi/research/lvq_pak/lvq_pak-3.1.tar\", name='cat', file_name='dog')" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": null, 236 | "metadata": {}, 237 | "outputs": [], 238 | "source": [ 239 | "dsrc.file_list" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": null, 245 | "metadata": {}, 246 | "outputs": [], 247 | "source": [ 248 | "dsrc.fetch()" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": null, 254 | "metadata": {}, 255 | "outputs": [], 256 | "source": [ 257 | "!ls -la $paths.raw_data_path" 258 | ] 259 | }, 260 | { 261 | "cell_type": "markdown", 262 | "metadata": {}, 263 | "source": [ 264 | "We now have a copy of `lvq_pak-3.1.tar` called `dog`. Every time we fetch, we will fetch twice unless we get rid of the entry for `dog`.\n", 265 | "\n", 266 | "First, we will want to remove `dog` from our raw data.\n", 267 | "\n", 268 | "Let's take the \"Nuke it from orbit. It's the only way to be sure\" approach and clean our entire raw data directory. " 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": null, 274 | "metadata": {}, 275 | "outputs": [], 276 | "source": [ 277 | "!cd .. && make clean_raw" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": null, 283 | "metadata": {}, 284 | "outputs": [], 285 | "source": [ 286 | "!ls -la $paths.raw_data_path" 287 | ] 288 | }, 289 | { 290 | "cell_type": "markdown", 291 | "metadata": {}, 292 | "source": [ 293 | "The other option would have been to manually remove the `dog` file and then forced a refetch." 
294 | ] 295 | }, 296 | { 297 | "cell_type": "markdown", 298 | "metadata": {}, 299 | "source": [ 300 | "### Exercise: Remove the entry for dog and refetch" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": null, 306 | "metadata": {}, 307 | "outputs": [], 308 | "source": [] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": null, 313 | "metadata": {}, 314 | "outputs": [], 315 | "source": [] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": null, 320 | "metadata": {}, 321 | "outputs": [], 322 | "source": [] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": null, 327 | "metadata": {}, 328 | "outputs": [], 329 | "source": [] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "execution_count": null, 334 | "metadata": {}, 335 | "outputs": [], 336 | "source": [ 337 | "# You should now only see the lvq_pak-3.1.tar file\n", 338 | "!ls -la $paths.raw_data_path" 339 | ] 340 | }, 341 | { 342 | "cell_type": "markdown", 343 | "metadata": {}, 344 | "source": [ 345 | "#### Cached Downloads" 346 | ] 347 | }, 348 | { 349 | "cell_type": "markdown", 350 | "metadata": {}, 351 | "source": [ 352 | "The DataSource object keeps track of whether the fetch has been performed successfully. Subsequent downloads will be skipped by default:" 353 | ] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "execution_count": null, 358 | "metadata": {}, 359 | "outputs": [], 360 | "source": [ 361 | "dsrc.fetch()" 362 | ] 363 | }, 364 | { 365 | "cell_type": "markdown", 366 | "metadata": {}, 367 | "source": [ 368 | "We can override this, which will check if the downloaded file exists, redownloading if necessary" 369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "execution_count": null, 374 | "metadata": {}, 375 | "outputs": [], 376 | "source": [ 377 | "dsrc.fetch(force=True)" 378 | ] 379 | }, 380 | { 381 | "cell_type": "markdown", 382 | "metadata": {}, 383 | "source": [ 384 | "In the previous case, the raw data file existed on the filesystem, and had the correct hash. If the local file has a checksum that doesn't match the saved hash, it will be re-downloaded automatically. Let's corrupt the file and see what happens." 385 | ] 386 | }, 387 | { 388 | "cell_type": "code", 389 | "execution_count": null, 390 | "metadata": {}, 391 | "outputs": [], 392 | "source": [ 393 | "!echo \"XXX\" >> $paths.raw_data_path/lvq_pak-3.1.tar" 394 | ] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "execution_count": null, 399 | "metadata": {}, 400 | "outputs": [], 401 | "source": [ 402 | "dsrc.fetch(force=True)" 403 | ] 404 | }, 405 | { 406 | "cell_type": "markdown", 407 | "metadata": {}, 408 | "source": [ 409 | "## Exercise: Creating an F-MNIST `DataSource`" 410 | ] 411 | }, 412 | { 413 | "cell_type": "markdown", 414 | "metadata": {}, 415 | "source": [ 416 | "For this excercise, you are going build a `DataSource` out of the Fashion-MNIST dataset.\n", 417 | "\n", 418 | "[Fashion-MNIST][FMNIST] is available from GitHub. 
Looking at their [README], we see that the raw data is distributed as a set of 4 files with the following checksums:\n", 419 | "\n", 420 | "[FMNIST]: https://github.com/zalandoresearch/fashion-mnist\n", 421 | "[README]: https://github.com/zalandoresearch/fashion-mnist/blob/master/README.md\n", 422 | "\n", 423 | "| Name | Content | Examples | Size | Link | MD5 Checksum|\n", 424 | "| --- | --- |--- | --- |--- |--- |\n", 425 | "| `train-images-idx3-ubyte.gz` | training set images | 60,000|26 MBytes | [Download](http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz)|`8d4fb7e6c68d591d4c3dfef9ec88bf0d`|\n", 426 | "| `train-labels-idx1-ubyte.gz` | training set labels |60,000|29 KBytes | [Download](http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz)|`25c81989df183df01b3e8a0aad5dffbe`|\n", 427 | "| `t10k-images-idx3-ubyte.gz` | test set images | 10,000|4.3 MBytes | [Download](http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz)|`bef4ecab320f06d8554ea6380940ec79`|\n", 428 | "| `t10k-labels-idx1-ubyte.gz` | test set labels | 10,000| 5.1 KBytes | [Download](http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz)|`bb300cfdad3c16e7a12a480ee83cd310`|\n", 429 | "\n", 430 | "By the end of this running example, you will build a `DataSource` that downloads these raw files and verifies that the hash values are as expected. You should make sure to include **Description** and **License** metadata in this `DataSource`. When you are finished, save the `DataSource` to the Catalog." 431 | ] 432 | }, 433 | { 434 | "cell_type": "markdown", 435 | "metadata": {}, 436 | "source": [ 437 | "### Exercise: Download Raw Data Source Files for F-MNIST" 438 | ] 439 | }, 440 | { 441 | "cell_type": "code", 442 | "execution_count": null, 443 | "metadata": {}, 444 | "outputs": [], 445 | "source": [ 446 | "# Create an fmnist data source object\n" 447 | ] 448 | }, 449 | { 450 | "cell_type": "code", 451 | "execution_count": null, 452 | "metadata": {}, 453 | "outputs": [], 454 | "source": [ 455 | "# Add URL(s) for raw data files\n", 456 | "# Note that you will be adding four files to the DataSource object\n", 457 | "# and that the hash values have already been provided above!\n" 458 | ] 459 | }, 460 | { 461 | "cell_type": "code", 462 | "execution_count": null, 463 | "metadata": {}, 464 | "outputs": [], 465 | "source": [ 466 | "# Fetch the files\n" 467 | ] 468 | }, 469 | { 470 | "cell_type": "code", 471 | "execution_count": null, 472 | "metadata": {}, 473 | "outputs": [], 474 | "source": [ 475 | "# Check for your new files\n", 476 | "!ls -la $paths.raw_data_path" 477 | ] 478 | }, 479 | { 480 | "cell_type": "markdown", 481 | "metadata": {}, 482 | "source": [ 483 | "### Unpacking Raw Data Files" 484 | ] 485 | }, 486 | { 487 | "cell_type": "code", 488 | "execution_count": null, 489 | "metadata": {}, 490 | "outputs": [], 491 | "source": [ 492 | "unpack_dir = dsrc.unpack()" 493 | ] 494 | }, 495 | { 496 | "cell_type": "markdown", 497 | "metadata": {}, 498 | "source": [ 499 | "By default, files are decompressed/unpacked to the `paths.interim_data_path`/`datasource_name` directory:" 500 | ] 501 | }, 502 | { 503 | "cell_type": "code", 504 | "execution_count": null, 505 | "metadata": {}, 506 | "outputs": [], 507 | "source": [ 508 | "!ls -la $paths.interim_data_path" 509 | ] 510 | }, 511 | { 512 | "cell_type": "code", 513 | "execution_count": null, 514 | "metadata": {}, 515 | "outputs": [], 516 | "source": [ 517 | "# We 
unpack everything into interim_data_path/datasource_name, which is returned by `unpack()`" 518 | ] 519 | }, 520 | { 521 | "cell_type": "code", 522 | "execution_count": null, 523 | "metadata": {}, 524 | "outputs": [], 525 | "source": [ 526 | "!ls -la $unpack_dir" 527 | ] 528 | }, 529 | { 530 | "cell_type": "code", 531 | "execution_count": null, 532 | "metadata": {}, 533 | "outputs": [], 534 | "source": [ 535 | "!ls -la $unpack_dir/lvq_pak-3.1" 536 | ] 537 | }, 538 | { 539 | "cell_type": "markdown", 540 | "metadata": {}, 541 | "source": [ 542 | "### Exercise: Unpack raw data files for F-MNIST" 543 | ] 544 | }, 545 | { 546 | "cell_type": "code", 547 | "execution_count": null, 548 | "metadata": {}, 549 | "outputs": [], 550 | "source": [] 551 | }, 552 | { 553 | "cell_type": "code", 554 | "execution_count": null, 555 | "metadata": {}, 556 | "outputs": [], 557 | "source": [] 558 | }, 559 | { 560 | "cell_type": "code", 561 | "execution_count": null, 562 | "metadata": {}, 563 | "outputs": [], 564 | "source": [] 565 | }, 566 | { 567 | "cell_type": "code", 568 | "execution_count": null, 569 | "metadata": {}, 570 | "outputs": [], 571 | "source": [ 572 | "# Check for your files in the unpacked dirs\n" 573 | ] 574 | }, 575 | { 576 | "cell_type": "markdown", 577 | "metadata": {}, 578 | "source": [ 579 | "### Adding Metadata to Raw Data\n", 580 | "Wait, what have we actually downloaded, and are we actually allowed to **use** this data? We keep track of two key pieces of metadata along with a raw dataset:\n", 581 | "* Description (`DESCR`) Text: Human-readable text describing the dataset, its source, and what it represents\n", 582 | "* License (`LICENSE`) Text: Terms of use for this dataset, often in the form of a license agreement" 583 | ] 584 | }, 585 | { 586 | "cell_type": "markdown", 587 | "metadata": {}, 588 | "source": [ 589 | "Often, a dataset comes complete with its own README and LICENSE files. If these are available via URL, we can add these like we add any other data file, tagging them as metadata using the `name` field:" 590 | ] 591 | }, 592 | { 593 | "cell_type": "code", 594 | "execution_count": null, 595 | "metadata": {}, 596 | "outputs": [], 597 | "source": [ 598 | "dsrc.add_url(\"http://www.cis.hut.fi/research/lvq_pak/README\",\n", 599 | " file_name='lvq-pak.readme', name='DESCR')" 600 | ] 601 | }, 602 | { 603 | "cell_type": "code", 604 | "execution_count": null, 605 | "metadata": {}, 606 | "outputs": [], 607 | "source": [ 608 | "dsrc.fetch()\n", 609 | "dsrc.unpack()" 610 | ] 611 | }, 612 | { 613 | "cell_type": "code", 614 | "execution_count": null, 615 | "metadata": {}, 616 | "outputs": [], 617 | "source": [ 618 | "# We now fetch 2 files. Note the metadata has been tagged accordingly in the `name` field\n", 619 | "dsrc.file_list" 620 | ] 621 | }, 622 | { 623 | "cell_type": "markdown", 624 | "metadata": {}, 625 | "source": [ 626 | "We need to dig a little deeper to find the license. we find it at the beginning of the README file contained within that distribution:" 627 | ] 628 | }, 629 | { 630 | "cell_type": "code", 631 | "execution_count": null, 632 | "metadata": {}, 633 | "outputs": [], 634 | "source": [ 635 | "!head -35 $paths.interim_data_path/lvq-pak/lvq_pak-3.1/README" 636 | ] 637 | }, 638 | { 639 | "cell_type": "markdown", 640 | "metadata": {}, 641 | "source": [ 642 | "Rather than trying to be clever, let's just add the license metadata from a python string that we cut and paste from the above." 
643 | ] 644 | }, 645 | { 646 | "cell_type": "code", 647 | "execution_count": null, 648 | "metadata": {}, 649 | "outputs": [], 650 | "source": [ 651 | "license_txt = '''\n", 652 | "************************************************************************\n", 653 | "* *\n", 654 | "* LVQ_PAK *\n", 655 | "* *\n", 656 | "* The *\n", 657 | "* *\n", 658 | "* Learning Vector Quantization *\n", 659 | "* *\n", 660 | "* Program Package *\n", 661 | "* *\n", 662 | "* Version 3.1 (April 7, 1995) *\n", 663 | "* *\n", 664 | "* Prepared by the *\n", 665 | "* LVQ Programming Team of the *\n", 666 | "* Helsinki University of Technology *\n", 667 | "* Laboratory of Computer and Information Science *\n", 668 | "* Rakentajanaukio 2 C, SF-02150 Espoo *\n", 669 | "* FINLAND *\n", 670 | "* *\n", 671 | "* Copyright (c) 1991-1995 *\n", 672 | "* *\n", 673 | "************************************************************************\n", 674 | "* *\n", 675 | "* NOTE: This program package is copyrighted in the sense that it *\n", 676 | "* may be used for scientific purposes. The package as a whole, or *\n", 677 | "* parts thereof, cannot be included or used in any commercial *\n", 678 | "* application without written permission granted by its producents. *\n", 679 | "* No programs contained in this package may be copied for commercial *\n", 680 | "* distribution. *\n", 681 | "* *\n", 682 | "* All comments concerning this program package may be sent to the *\n", 683 | "* e-mail address 'lvq@nucleus.hut.fi'. *\n", 684 | "* *\n", 685 | "************************************************************************\n", 686 | "'''\n", 687 | "dsrc.add_metadata(contents=license_txt, kind='LICENSE')" 688 | ] 689 | }, 690 | { 691 | "cell_type": "markdown", 692 | "metadata": {}, 693 | "source": [ 694 | "Under the hood, this will create a file, storing the creation instructions in the same `file_list` we use to store the URLs we wish to download:" 695 | ] 696 | }, 697 | { 698 | "cell_type": "code", 699 | "execution_count": null, 700 | "metadata": {}, 701 | "outputs": [], 702 | "source": [ 703 | "dsrc.file_list" 704 | ] 705 | }, 706 | { 707 | "cell_type": "markdown", 708 | "metadata": {}, 709 | "source": [ 710 | "Now when we fetch, the license file is created from this information:" 711 | ] 712 | }, 713 | { 714 | "cell_type": "code", 715 | "execution_count": null, 716 | "metadata": {}, 717 | "outputs": [], 718 | "source": [ 719 | "logger.setLevel(logging.DEBUG)\n", 720 | "dsrc.fetch(force=True)\n", 721 | "dsrc.unpack()" 722 | ] 723 | }, 724 | { 725 | "cell_type": "code", 726 | "execution_count": null, 727 | "metadata": {}, 728 | "outputs": [], 729 | "source": [ 730 | "!ls -la $paths.raw_data_path" 731 | ] 732 | }, 733 | { 734 | "cell_type": "markdown", 735 | "metadata": {}, 736 | "source": [ 737 | "### Exercise: Add metadata to F-MNIST" 738 | ] 739 | }, 740 | { 741 | "cell_type": "code", 742 | "execution_count": null, 743 | "metadata": {}, 744 | "outputs": [], 745 | "source": [] 746 | }, 747 | { 748 | "cell_type": "code", 749 | "execution_count": null, 750 | "metadata": {}, 751 | "outputs": [], 752 | "source": [] 753 | }, 754 | { 755 | "cell_type": "code", 756 | "execution_count": null, 757 | "metadata": {}, 758 | "outputs": [], 759 | "source": [] 760 | }, 761 | { 762 | "cell_type": "code", 763 | "execution_count": null, 764 | "metadata": {}, 765 | "outputs": [], 766 | "source": [] 767 | }, 768 | { 769 | "cell_type": "code", 770 | "execution_count": null, 771 | "metadata": {}, 772 | "outputs": [], 773 | "source": [] 774 | }, 775 | { 776 | 
"cell_type": "markdown", 777 | "metadata": {}, 778 | "source": [ 779 | "### Adding Raw Data to the Catalog" 780 | ] 781 | }, 782 | { 783 | "cell_type": "code", 784 | "execution_count": null, 785 | "metadata": {}, 786 | "outputs": [], 787 | "source": [ 788 | "from src import workflow" 789 | ] 790 | }, 791 | { 792 | "cell_type": "code", 793 | "execution_count": null, 794 | "metadata": {}, 795 | "outputs": [], 796 | "source": [ 797 | "workflow.available_datasources()" 798 | ] 799 | }, 800 | { 801 | "cell_type": "code", 802 | "execution_count": null, 803 | "metadata": {}, 804 | "outputs": [], 805 | "source": [ 806 | "workflow.add_datasource(dsrc)" 807 | ] 808 | }, 809 | { 810 | "cell_type": "code", 811 | "execution_count": null, 812 | "metadata": {}, 813 | "outputs": [], 814 | "source": [ 815 | "workflow.available_datasources()" 816 | ] 817 | }, 818 | { 819 | "cell_type": "markdown", 820 | "metadata": {}, 821 | "source": [ 822 | "We will make use of this raw dataset catalog later in this tutorial. We can now load our `DataSource` by name:" 823 | ] 824 | }, 825 | { 826 | "cell_type": "code", 827 | "execution_count": null, 828 | "metadata": {}, 829 | "outputs": [], 830 | "source": [ 831 | "ds = DataSource.from_name('lvq-pak')" 832 | ] 833 | }, 834 | { 835 | "cell_type": "code", 836 | "execution_count": null, 837 | "metadata": { 838 | "scrolled": true 839 | }, 840 | "outputs": [], 841 | "source": [ 842 | "ds.file_list" 843 | ] 844 | }, 845 | { 846 | "cell_type": "markdown", 847 | "metadata": {}, 848 | "source": [ 849 | "### Exercise: Add F-MNIST to the Raw Dataset Catalog" 850 | ] 851 | }, 852 | { 853 | "cell_type": "code", 854 | "execution_count": null, 855 | "metadata": {}, 856 | "outputs": [], 857 | "source": [] 858 | }, 859 | { 860 | "cell_type": "code", 861 | "execution_count": null, 862 | "metadata": {}, 863 | "outputs": [], 864 | "source": [ 865 | "# Your fmnist dataset should now show up here:\n", 866 | "workflow.available_datasources()" 867 | ] 868 | }, 869 | { 870 | "cell_type": "markdown", 871 | "metadata": {}, 872 | "source": [ 873 | "### Nuke it from Orbit\n", 874 | "\n", 875 | "Now we can blow away all the data that we've downloaded and set up so far, and recreate it from the workflow datasource. Or, use some of our `make` commands!" 876 | ] 877 | }, 878 | { 879 | "cell_type": "code", 880 | "execution_count": null, 881 | "metadata": {}, 882 | "outputs": [], 883 | "source": [ 884 | "!cd .. && make clean_raw" 885 | ] 886 | }, 887 | { 888 | "cell_type": "code", 889 | "execution_count": null, 890 | "metadata": {}, 891 | "outputs": [], 892 | "source": [ 893 | "!ls -la $paths.raw_data_path" 894 | ] 895 | }, 896 | { 897 | "cell_type": "code", 898 | "execution_count": null, 899 | "metadata": {}, 900 | "outputs": [], 901 | "source": [ 902 | "!cd .. && make fetch_sources" 903 | ] 904 | }, 905 | { 906 | "cell_type": "code", 907 | "execution_count": null, 908 | "metadata": {}, 909 | "outputs": [], 910 | "source": [ 911 | "!ls -la $paths.raw_data_path" 912 | ] 913 | }, 914 | { 915 | "cell_type": "code", 916 | "execution_count": null, 917 | "metadata": {}, 918 | "outputs": [], 919 | "source": [ 920 | "# What about fetch and unpack?\n", 921 | "!cd .. 
&& make clean_raw && make clean_interim" 922 | ] 923 | }, 924 | { 925 | "cell_type": "code", 926 | "execution_count": null, 927 | "metadata": {}, 928 | "outputs": [], 929 | "source": [ 930 | "!ls -la $paths.raw_data_path" 931 | ] 932 | }, 933 | { 934 | "cell_type": "code", 935 | "execution_count": null, 936 | "metadata": {}, 937 | "outputs": [], 938 | "source": [ 939 | "!cd .. && make unpack_sources" 940 | ] 941 | }, 942 | { 943 | "cell_type": "code", 944 | "execution_count": null, 945 | "metadata": {}, 946 | "outputs": [], 947 | "source": [ 948 | "!ls -la $paths.raw_data_path" 949 | ] 950 | }, 951 | { 952 | "cell_type": "code", 953 | "execution_count": null, 954 | "metadata": {}, 955 | "outputs": [], 956 | "source": [ 957 | "!ls -la $paths.interim_data_path" 958 | ] 959 | }, 960 | { 961 | "cell_type": "markdown", 962 | "metadata": {}, 963 | "source": [ 964 | "### Your data is now reproducible!" 965 | ] 966 | }, 967 | { 968 | "cell_type": "code", 969 | "execution_count": null, 970 | "metadata": {}, 971 | "outputs": [], 972 | "source": [] 973 | } 974 | ], 975 | "metadata": { 976 | "kernelspec": { 977 | "display_name": "Python [default]", 978 | "language": "python", 979 | "name": "python3" 980 | }, 981 | "language_info": { 982 | "codemirror_mode": { 983 | "name": "ipython", 984 | "version": 3 985 | }, 986 | "file_extension": ".py", 987 | "mimetype": "text/x-python", 988 | "name": "python", 989 | "nbconvert_exporter": "python", 990 | "pygments_lexer": "ipython3", 991 | "version": "3.7.2" 992 | } 993 | }, 994 | "nbformat": 4, 995 | "nbformat_minor": 2 996 | } 997 | -------------------------------------------------------------------------------- /notebooks/21-creating-datasets-solutions.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 2.0: Reproducible Data Sources\n", 8 | "\"In God we trust. All others must bring data.” – W. Edwards Deming\"" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "%load_ext autoreload\n", 18 | "%autoreload 2" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "import logging\n", 28 | "from src.logging import logger\n", 29 | "logger.setLevel(logging.INFO)" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "# Introducing the `DataSource`\n", 37 | "The `DataSource` object handles downloading, unpacking, and processing raw data files, and serves as a container for some basic metadata about the raw data, including **documentation** and **license** information.\n", 38 | "\n", 39 | "Raw data files are downloaded to `paths.raw_data_path`.\n", 40 | " Cache files and unpacked raw files are saved to `paths.interim_data_path`.\n", 41 | " " 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "## Example: LVQ-Pak, a Finnish phonetic dataset\n", 49 | "The Learning Vector Quantization (lvq-pak) project includes a simple Finnish phonetic dataset\n", 50 | "consisting 20-dimensional Mel Frequency Cepstrum Coefficients (MFCCs) labelled with target phoneme information. Our goal is to explore this dataset, process it into a useful form, and make it a part of a reproducible data science workflow. 
The project can be found at: http://www.cis.hut.fi/research/lvq_pak/\n", 51 | "\n" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "\n", 59 | "For this example, we are going create a `DataSource` for the LVQ-Pak dataset. The process will consist of\n", 60 | "1. Downloading and unpacking the raw data files. \n", 61 | "2. Generating (and recording) hash values for these files.\n", 62 | "3. Adding LICENSE and DESCR (description) metadata to this DataSource\n", 63 | "4. Adding the complete `DataSource` to the Catalog \n" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "### Downloading Raw Data Source Files" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "from src.data import DataSource\n", 80 | "from src.utils import list_dir\n", 81 | "from src import paths" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "# Create a data source object\n", 91 | "datasource_name = 'lvq-pak'\n", 92 | "dsrc = DataSource(datasource_name)" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "# Add URL(s) for raw data files\n", 102 | "dsrc.add_url(\"http://www.cis.hut.fi/research/lvq_pak/lvq_pak-3.1.tar\")" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "# Fetch the files\n", 112 | "logger.setLevel(logging.DEBUG)\n", 113 | "dsrc.fetch()" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": {}, 119 | "source": [ 120 | "By default, data files are downloaded to the `paths.raw_data_path` directory:" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [ 129 | "!ls -la $paths.raw_data_path" 130 | ] 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "metadata": {}, 135 | "source": [ 136 | "Since we did not specify a hash, or target filename, these are inferred from the downloaded file:" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "dsrc.file_list" 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": {}, 151 | "source": [ 152 | "### Remove a file from the file_list" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [ 161 | "# Note that if we add a url again, we end up with more of the same file in the file list\n", 162 | "dsrc.add_url(\"http://www.cis.hut.fi/research/lvq_pak/lvq_pak-3.1.tar\")" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "dsrc.file_list" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [ 180 | "dsrc.fetch()" 181 | ] 182 | }, 183 | { 184 | "cell_type": "markdown", 185 | "metadata": {}, 186 | "source": [ 187 | "Fetch is smart enough to not redownload the same file in this case. Still, this is messy and cumbersome. We can remove entries by removing them from the `file_list`." 
188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 193 | "metadata": {}, 194 | "outputs": [], 195 | "source": [ 196 | "dsrc.file_list.pop(1)" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "metadata": {}, 203 | "outputs": [], 204 | "source": [ 205 | "dsrc.file_list" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": null, 211 | "metadata": {}, 212 | "outputs": [], 213 | "source": [ 214 | "dsrc.fetch(force=True)" 215 | ] 216 | }, 217 | { 218 | "cell_type": "markdown", 219 | "metadata": {}, 220 | "source": [ 221 | "### Sometimes we make mistakes when entering information" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": null, 227 | "metadata": {}, 228 | "outputs": [], 229 | "source": [ 230 | "dsrc.add_url(\"http://www.cis.hut.fi/research/lvq_pak/lvq_pak-3.1.tar\", name='cat', file_name='dog')" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": null, 236 | "metadata": {}, 237 | "outputs": [], 238 | "source": [ 239 | "dsrc.file_list" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": null, 245 | "metadata": {}, 246 | "outputs": [], 247 | "source": [ 248 | "dsrc.fetch()" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": null, 254 | "metadata": {}, 255 | "outputs": [], 256 | "source": [ 257 | "!ls -la $paths.raw_data_path" 258 | ] 259 | }, 260 | { 261 | "cell_type": "markdown", 262 | "metadata": {}, 263 | "source": [ 264 | "We now have a copy of `lvq_pak-3.1.tar` called `dog`. Every time we fetch, we will fetch twice unless we get rid of the entry for `dog`.\n", 265 | "\n", 266 | "First, we will want to remove `dog` from our raw data.\n", 267 | "\n", 268 | "Let's take the \"Nuke it from orbit. It's the only way to be sure\" approach and clean our entire raw data directory. " 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": null, 274 | "metadata": {}, 275 | "outputs": [], 276 | "source": [ 277 | "!cd .. && make clean_raw" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": null, 283 | "metadata": {}, 284 | "outputs": [], 285 | "source": [ 286 | "!ls -la $paths.raw_data_path" 287 | ] 288 | }, 289 | { 290 | "cell_type": "markdown", 291 | "metadata": {}, 292 | "source": [ 293 | "The other option would have been to manually remove the `dog` file and then forced a refetch." 
294 | ] 295 | }, 296 | { 297 | "cell_type": "markdown", 298 | "metadata": {}, 299 | "source": [ 300 | "### Exercise: Remove the entry for dog and refetch" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": null, 306 | "metadata": {}, 307 | "outputs": [], 308 | "source": [ 309 | "dsrc.file_list" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": null, 315 | "metadata": {}, 316 | "outputs": [], 317 | "source": [ 318 | "dsrc.file_list.pop(1)" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": null, 324 | "metadata": {}, 325 | "outputs": [], 326 | "source": [ 327 | "dsrc.file_list" 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": null, 333 | "metadata": {}, 334 | "outputs": [], 335 | "source": [ 336 | "# The fetch here will need to be forced\n", 337 | "dsrc.fetch(force=True)" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": null, 343 | "metadata": {}, 344 | "outputs": [], 345 | "source": [ 346 | "# You should now only see the lvq_pak-3.1.tar file\n", 347 | "!ls -la $paths.raw_data_path" 348 | ] 349 | }, 350 | { 351 | "cell_type": "markdown", 352 | "metadata": {}, 353 | "source": [ 354 | "#### Cached Downloads" 355 | ] 356 | }, 357 | { 358 | "cell_type": "markdown", 359 | "metadata": {}, 360 | "source": [ 361 | "The DataSource object keeps track of whether the fetch has been performed successfully. Subsequent downloads will be skipped by default:" 362 | ] 363 | }, 364 | { 365 | "cell_type": "code", 366 | "execution_count": null, 367 | "metadata": {}, 368 | "outputs": [], 369 | "source": [ 370 | "dsrc.fetch()" 371 | ] 372 | }, 373 | { 374 | "cell_type": "markdown", 375 | "metadata": {}, 376 | "source": [ 377 | "We can override this, which will check if the downloaded file exists, redownloading if necessary" 378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": null, 383 | "metadata": {}, 384 | "outputs": [], 385 | "source": [ 386 | "dsrc.fetch(force=True)" 387 | ] 388 | }, 389 | { 390 | "cell_type": "markdown", 391 | "metadata": {}, 392 | "source": [ 393 | "In the previous case, the raw data file existed on the filesystem, and had the correct hash. If the local file has a checksum that doesn't match the saved hash, it will be re-downloaded automatically. Let's corrupt the file and see what happens." 394 | ] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "execution_count": null, 399 | "metadata": {}, 400 | "outputs": [], 401 | "source": [ 402 | "!echo \"XXX\" >> $paths.raw_data_path/lvq_pak-3.1.tar" 403 | ] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "execution_count": null, 408 | "metadata": {}, 409 | "outputs": [], 410 | "source": [ 411 | "dsrc.fetch(force=True)" 412 | ] 413 | }, 414 | { 415 | "cell_type": "markdown", 416 | "metadata": {}, 417 | "source": [ 418 | "## Exercise: Creating an F-MNIST `DataSource`" 419 | ] 420 | }, 421 | { 422 | "cell_type": "markdown", 423 | "metadata": {}, 424 | "source": [ 425 | "For this excercise, you are going build a `DataSource` out of the Fashion-MNIST dataset.\n", 426 | "\n", 427 | "[Fashion-MNIST][FMNIST] is available from GitHub. 
Looking at their [README], we see that the raw data is distributed as a set of 4 files with the following checksums:\n", 428 | "\n", 429 | "[FMNIST]: https://github.com/zalandoresearch/fashion-mnist\n", 430 | "[README]: https://github.com/zalandoresearch/fashion-mnist/blob/master/README.md\n", 431 | "\n", 432 | "| Name | Content | Examples | Size | Link | MD5 Checksum|\n", 433 | "| --- | --- |--- | --- |--- |--- |\n", 434 | "| `train-images-idx3-ubyte.gz` | training set images | 60,000|26 MBytes | [Download](http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz)|`8d4fb7e6c68d591d4c3dfef9ec88bf0d`|\n", 435 | "| `train-labels-idx1-ubyte.gz` | training set labels |60,000|29 KBytes | [Download](http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz)|`25c81989df183df01b3e8a0aad5dffbe`|\n", 436 | "| `t10k-images-idx3-ubyte.gz` | test set images | 10,000|4.3 MBytes | [Download](http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz)|`bef4ecab320f06d8554ea6380940ec79`|\n", 437 | "| `t10k-labels-idx1-ubyte.gz` | test set labels | 10,000| 5.1 KBytes | [Download](http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz)|`bb300cfdad3c16e7a12a480ee83cd310`|\n", 438 | "\n", 439 | "By the end of this running example, you will build a `DataSource` that downloads these raw files and verifies that the hash values are as expected. You should make sure to include **Description** and **License** metadata in this `DataSource`. When you are finished, save the `DataSource` to the Catalog." 440 | ] 441 | }, 442 | { 443 | "cell_type": "markdown", 444 | "metadata": {}, 445 | "source": [ 446 | "### Exercise: Download Raw Data Source Files for F-MNIST" 447 | ] 448 | }, 449 | { 450 | "cell_type": "code", 451 | "execution_count": null, 452 | "metadata": {}, 453 | "outputs": [], 454 | "source": [ 455 | "# Create an fmnist data source object\n", 456 | "fmnist_dsname = 'fmnist'\n", 457 | "fmnist = DataSource(fmnist_dsname)" 458 | ] 459 | }, 460 | { 461 | "cell_type": "code", 462 | "execution_count": null, 463 | "metadata": {}, 464 | "outputs": [], 465 | "source": [ 466 | "# Add URL(s) for raw data files\n", 467 | "# Note that you will be adding four files to the DataSource object\n", 468 | "# and that the hash values have already been provided above!\n", 469 | "fmnist.add_url(url='http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz',\n", 470 | " hash_type='md5',\n", 471 | " hash_value='8d4fb7e6c68d591d4c3dfef9ec88bf0d',\n", 472 | " name='train-images')\n" 473 | ] 474 | }, 475 | { 476 | "cell_type": "code", 477 | "execution_count": null, 478 | "metadata": {}, 479 | "outputs": [], 480 | "source": [ 481 | "## Now all the rest at once\n", 482 | "url_base = 'http://fashion-mnist.s3-website.eu-central-1.amazonaws.com'\n", 483 | "file_list = [\n", 484 | " ('train-labels-idx1-ubyte.gz','25c81989df183df01b3e8a0aad5dffbe', 'train-labels'),\n", 485 | " ('t10k-images-idx3-ubyte.gz', 'bef4ecab320f06d8554ea6380940ec79', 'test-images'),\n", 486 | " ('t10k-labels-idx1-ubyte.gz', 'bb300cfdad3c16e7a12a480ee83cd310', 'test-labels'),\n", 487 | "]\n", 488 | "for file, hashval, name in file_list:\n", 489 | " url = f\"{url_base}/{file}\"\n", 490 | " fmnist.add_url(url=url, hash_type='md5', hash_value=hashval, name=name)" 491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "execution_count": null, 496 | "metadata": {}, 497 | "outputs": [], 498 | "source": [ 499 | "fmnist.file_list" 500 | 
] 501 | }, 502 | { 503 | "cell_type": "code", 504 | "execution_count": null, 505 | "metadata": {}, 506 | "outputs": [], 507 | "source": [ 508 | "# Fetch the files\n", 509 | "fmnist.fetch()" 510 | ] 511 | }, 512 | { 513 | "cell_type": "code", 514 | "execution_count": null, 515 | "metadata": {}, 516 | "outputs": [], 517 | "source": [ 518 | "# Check for your new files\n", 519 | "!ls -la $paths.raw_data_path" 520 | ] 521 | }, 522 | { 523 | "cell_type": "markdown", 524 | "metadata": {}, 525 | "source": [ 526 | "### Unpacking Raw Data Files" 527 | ] 528 | }, 529 | { 530 | "cell_type": "code", 531 | "execution_count": null, 532 | "metadata": {}, 533 | "outputs": [], 534 | "source": [ 535 | "unpack_dir = dsrc.unpack()" 536 | ] 537 | }, 538 | { 539 | "cell_type": "markdown", 540 | "metadata": {}, 541 | "source": [ 542 | "By default, files are decompressed/unpacked to the `paths.interim_data_path`/`datasource_name` directory:" 543 | ] 544 | }, 545 | { 546 | "cell_type": "code", 547 | "execution_count": null, 548 | "metadata": {}, 549 | "outputs": [], 550 | "source": [ 551 | "!ls -la $paths.interim_data_path" 552 | ] 553 | }, 554 | { 555 | "cell_type": "code", 556 | "execution_count": null, 557 | "metadata": {}, 558 | "outputs": [], 559 | "source": [ 560 | "# We unpack everything into interim_data_path/datasource_name, which is returned by `unpack()`" 561 | ] 562 | }, 563 | { 564 | "cell_type": "code", 565 | "execution_count": null, 566 | "metadata": {}, 567 | "outputs": [], 568 | "source": [ 569 | "!ls -la $unpack_dir" 570 | ] 571 | }, 572 | { 573 | "cell_type": "code", 574 | "execution_count": null, 575 | "metadata": {}, 576 | "outputs": [], 577 | "source": [ 578 | "!ls -la $unpack_dir/lvq_pak-3.1" 579 | ] 580 | }, 581 | { 582 | "cell_type": "markdown", 583 | "metadata": {}, 584 | "source": [ 585 | "### Exercise: Unpack raw data files for F-MNIST" 586 | ] 587 | }, 588 | { 589 | "cell_type": "code", 590 | "execution_count": null, 591 | "metadata": {}, 592 | "outputs": [], 593 | "source": [ 594 | "fmnist_unpack = fmnist.unpack()" 595 | ] 596 | }, 597 | { 598 | "cell_type": "code", 599 | "execution_count": null, 600 | "metadata": {}, 601 | "outputs": [], 602 | "source": [ 603 | "fmnist_unpack" 604 | ] 605 | }, 606 | { 607 | "cell_type": "code", 608 | "execution_count": null, 609 | "metadata": {}, 610 | "outputs": [], 611 | "source": [ 612 | "!ls -la $paths.interim_data_path" 613 | ] 614 | }, 615 | { 616 | "cell_type": "code", 617 | "execution_count": null, 618 | "metadata": {}, 619 | "outputs": [], 620 | "source": [ 621 | "# Check for your files in the unpacked dirs\n", 622 | "!ls -la $fmnist_unpack" 623 | ] 624 | }, 625 | { 626 | "cell_type": "markdown", 627 | "metadata": {}, 628 | "source": [ 629 | "### Adding Metadata to Raw Data\n", 630 | "Wait, what have we actually downloaded, and are we actually allowed to **use** this data? We keep track of two key pieces of metadata along with a raw dataset:\n", 631 | "* Description (`DESCR`) Text: Human-readable text describing the dataset, its source, and what it represents\n", 632 | "* License (`LICENSE`) Text: Terms of use for this dataset, often in the form of a license agreement" 633 | ] 634 | }, 635 | { 636 | "cell_type": "markdown", 637 | "metadata": {}, 638 | "source": [ 639 | "Often, a dataset comes complete with its own README and LICENSE files. 
If these are available via URL, we can add these like we add any other data file, tagging them as metadata using the `name` field:" 640 | ] 641 | }, 642 | { 643 | "cell_type": "code", 644 | "execution_count": null, 645 | "metadata": {}, 646 | "outputs": [], 647 | "source": [ 648 | "dsrc.add_url(\"http://www.cis.hut.fi/research/lvq_pak/README\",\n", 649 | " file_name='lvq-pak.readme', name='DESCR')" 650 | ] 651 | }, 652 | { 653 | "cell_type": "code", 654 | "execution_count": null, 655 | "metadata": {}, 656 | "outputs": [], 657 | "source": [ 658 | "dsrc.fetch()\n", 659 | "dsrc.unpack()" 660 | ] 661 | }, 662 | { 663 | "cell_type": "code", 664 | "execution_count": null, 665 | "metadata": {}, 666 | "outputs": [], 667 | "source": [ 668 | "# We now fetch 2 files. Note the metadata has been tagged accordingly in the `name` field\n", 669 | "dsrc.file_list" 670 | ] 671 | }, 672 | { 673 | "cell_type": "markdown", 674 | "metadata": {}, 675 | "source": [ 676 | "We need to dig a little deeper to find the license. we find it at the beginning of the README file contained within that distribution:" 677 | ] 678 | }, 679 | { 680 | "cell_type": "code", 681 | "execution_count": null, 682 | "metadata": {}, 683 | "outputs": [], 684 | "source": [ 685 | "!head -35 $paths.interim_data_path/lvq-pak/lvq_pak-3.1/README" 686 | ] 687 | }, 688 | { 689 | "cell_type": "markdown", 690 | "metadata": {}, 691 | "source": [ 692 | "Rather than trying to be clever, let's just add the license metadata from a python string that we cut and paste from the above." 693 | ] 694 | }, 695 | { 696 | "cell_type": "code", 697 | "execution_count": null, 698 | "metadata": {}, 699 | "outputs": [], 700 | "source": [ 701 | "license_txt = '''\n", 702 | "************************************************************************\n", 703 | "* *\n", 704 | "* LVQ_PAK *\n", 705 | "* *\n", 706 | "* The *\n", 707 | "* *\n", 708 | "* Learning Vector Quantization *\n", 709 | "* *\n", 710 | "* Program Package *\n", 711 | "* *\n", 712 | "* Version 3.1 (April 7, 1995) *\n", 713 | "* *\n", 714 | "* Prepared by the *\n", 715 | "* LVQ Programming Team of the *\n", 716 | "* Helsinki University of Technology *\n", 717 | "* Laboratory of Computer and Information Science *\n", 718 | "* Rakentajanaukio 2 C, SF-02150 Espoo *\n", 719 | "* FINLAND *\n", 720 | "* *\n", 721 | "* Copyright (c) 1991-1995 *\n", 722 | "* *\n", 723 | "************************************************************************\n", 724 | "* *\n", 725 | "* NOTE: This program package is copyrighted in the sense that it *\n", 726 | "* may be used for scientific purposes. The package as a whole, or *\n", 727 | "* parts thereof, cannot be included or used in any commercial *\n", 728 | "* application without written permission granted by its producents. *\n", 729 | "* No programs contained in this package may be copied for commercial *\n", 730 | "* distribution. *\n", 731 | "* *\n", 732 | "* All comments concerning this program package may be sent to the *\n", 733 | "* e-mail address 'lvq@nucleus.hut.fi'. 
*\n", 734 | "* *\n", 735 | "************************************************************************\n", 736 | "'''\n", 737 | "dsrc.add_metadata(contents=license_txt, kind='LICENSE')" 738 | ] 739 | }, 740 | { 741 | "cell_type": "markdown", 742 | "metadata": {}, 743 | "source": [ 744 | "Under the hood, this will create a file, storing the creation instructions in the same `file_list` we use to store the URLs we wish to download:" 745 | ] 746 | }, 747 | { 748 | "cell_type": "code", 749 | "execution_count": null, 750 | "metadata": {}, 751 | "outputs": [], 752 | "source": [ 753 | "dsrc.file_list" 754 | ] 755 | }, 756 | { 757 | "cell_type": "markdown", 758 | "metadata": {}, 759 | "source": [ 760 | "Now when we fetch, the license file is created from this information:" 761 | ] 762 | }, 763 | { 764 | "cell_type": "code", 765 | "execution_count": null, 766 | "metadata": {}, 767 | "outputs": [], 768 | "source": [ 769 | "logger.setLevel(logging.DEBUG)\n", 770 | "dsrc.fetch(force=True)\n", 771 | "dsrc.unpack()" 772 | ] 773 | }, 774 | { 775 | "cell_type": "code", 776 | "execution_count": null, 777 | "metadata": {}, 778 | "outputs": [], 779 | "source": [ 780 | "!ls -la $paths.raw_data_path" 781 | ] 782 | }, 783 | { 784 | "cell_type": "markdown", 785 | "metadata": {}, 786 | "source": [ 787 | "### Exercise: Add metadata to F-MNIST" 788 | ] 789 | }, 790 | { 791 | "cell_type": "code", 792 | "execution_count": null, 793 | "metadata": {}, 794 | "outputs": [], 795 | "source": [ 796 | "# Here's the link to the readme\n", 797 | "readme_url = 'https://github.com/zalandoresearch/fashion-mnist/blob/master/README.md'" 798 | ] 799 | }, 800 | { 801 | "cell_type": "code", 802 | "execution_count": null, 803 | "metadata": {}, 804 | "outputs": [], 805 | "source": [ 806 | "# tidying up the readme to a nice useable format for this dataset\n", 807 | "fmnist_readme = '''\n", 808 | "Fashion-MNIST\n", 809 | "=============\n", 810 | "\n", 811 | "Notes\n", 812 | "-----\n", 813 | "Data Set Characteristics:\n", 814 | " :Number of Instances: 70000\n", 815 | " :Number of Attributes: 728\n", 816 | " :Attribute Information: 28x28 8-bit greyscale image\n", 817 | " :Missing Attribute Values: None\n", 818 | " :Creator: Zalando\n", 819 | " :Date: 2017\n", 820 | "\n", 821 | "This is a copy of Zalando's Fashion-MNIST [F-MNIST] dataset:\n", 822 | "https://github.com/zalandoresearch/fashion-mnist\n", 823 | "\n", 824 | "Fashion-MNIST is a dataset of Zalando's article images—consisting of a\n", 825 | "training set of 60,000 examples and a test set of 10,000\n", 826 | "examples. Each example is a 28x28 grayscale image, associated with a\n", 827 | "label from 10 classes. Fashion-MNIST is intended to serve as a direct\n", 828 | "drop-in replacement for the original [MNIST] dataset for benchmarking\n", 829 | "machine learning algorithms. It shares the same image size and\n", 830 | "structure of training and testing splits.\n", 831 | "\n", 832 | "References\n", 833 | "----------\n", 834 | " - [F-MNIST] Fashion-MNIST: a Novel Image Dataset for Benchmarking Machine Learning Algorithms.\n", 835 | " Han Xiao, Kashif Rasul, Roland Vollgraf. arXiv:1708.07747\n", 836 | " - [MNIST] The MNIST Database of handwritten digits. Yann LeCun, Corinna Cortes,\n", 837 | " Christopher J.C. Burges. 
http://yann.lecun.com/exdb/mnist/\n", 838 | "'''" 839 | ] 840 | }, 841 | { 842 | "cell_type": "code", 843 | "execution_count": null, 844 | "metadata": {}, 845 | "outputs": [], 846 | "source": [ 847 | "# Add the readme info as the DESCR\n", 848 | "fmnist.add_metadata(contents=fmnist_readme, kind='DESCR')" 849 | ] 850 | }, 851 | { 852 | "cell_type": "code", 853 | "execution_count": null, 854 | "metadata": {}, 855 | "outputs": [], 856 | "source": [ 857 | "fmnist.file_list" 858 | ] 859 | }, 860 | { 861 | "cell_type": "code", 862 | "execution_count": null, 863 | "metadata": {}, 864 | "outputs": [], 865 | "source": [ 866 | "# We can also find the LICENSE in the repo (note from github you need to select Raw)\n", 867 | "fmnist_license_url = 'https://raw.githubusercontent.com/zalandoresearch/fashion-mnist/master/LICENSE'" 868 | ] 869 | }, 870 | { 871 | "cell_type": "code", 872 | "execution_count": null, 873 | "metadata": {}, 874 | "outputs": [], 875 | "source": [ 876 | "fmnist.add_url(url=fmnist_license_url, name='LICENSE', file_name=\"fmnist.LICENSE\")" 877 | ] 878 | }, 879 | { 880 | "cell_type": "code", 881 | "execution_count": null, 882 | "metadata": {}, 883 | "outputs": [], 884 | "source": [ 885 | "fmnist.fetch(force=True)" 886 | ] 887 | }, 888 | { 889 | "cell_type": "code", 890 | "execution_count": null, 891 | "metadata": {}, 892 | "outputs": [], 893 | "source": [ 894 | "!cat $paths.raw_data_path/fmnist.LICENSE" 895 | ] 896 | }, 897 | { 898 | "cell_type": "code", 899 | "execution_count": null, 900 | "metadata": {}, 901 | "outputs": [], 902 | "source": [ 903 | "fmnist.file_list" 904 | ] 905 | }, 906 | { 907 | "cell_type": "code", 908 | "execution_count": null, 909 | "metadata": {}, 910 | "outputs": [], 911 | "source": [ 912 | "fmnist.fetch(force=True)" 913 | ] 914 | }, 915 | { 916 | "cell_type": "code", 917 | "execution_count": null, 918 | "metadata": {}, 919 | "outputs": [], 920 | "source": [ 921 | "fmnist.unpack(force=True)" 922 | ] 923 | }, 924 | { 925 | "cell_type": "code", 926 | "execution_count": null, 927 | "metadata": {}, 928 | "outputs": [], 929 | "source": [ 930 | "!ls -la $fmnist_unpack" 931 | ] 932 | }, 933 | { 934 | "cell_type": "markdown", 935 | "metadata": {}, 936 | "source": [ 937 | "### Adding Raw Data to the Catalog" 938 | ] 939 | }, 940 | { 941 | "cell_type": "code", 942 | "execution_count": null, 943 | "metadata": {}, 944 | "outputs": [], 945 | "source": [ 946 | "from src import workflow" 947 | ] 948 | }, 949 | { 950 | "cell_type": "code", 951 | "execution_count": null, 952 | "metadata": {}, 953 | "outputs": [], 954 | "source": [ 955 | "workflow.available_datasources()" 956 | ] 957 | }, 958 | { 959 | "cell_type": "code", 960 | "execution_count": null, 961 | "metadata": {}, 962 | "outputs": [], 963 | "source": [ 964 | "workflow.add_datasource(dsrc)" 965 | ] 966 | }, 967 | { 968 | "cell_type": "code", 969 | "execution_count": null, 970 | "metadata": {}, 971 | "outputs": [], 972 | "source": [ 973 | "workflow.available_datasources()" 974 | ] 975 | }, 976 | { 977 | "cell_type": "markdown", 978 | "metadata": {}, 979 | "source": [ 980 | "We will make use of this raw dataset catalog later in this tutorial. 
We can now load our `DataSource` by name:" 981 | ] 982 | }, 983 | { 984 | "cell_type": "code", 985 | "execution_count": null, 986 | "metadata": {}, 987 | "outputs": [], 988 | "source": [ 989 | "ds = DataSource.from_name('lvq-pak')" 990 | ] 991 | }, 992 | { 993 | "cell_type": "code", 994 | "execution_count": null, 995 | "metadata": {}, 996 | "outputs": [], 997 | "source": [ 998 | "ds.file_list" 999 | ] 1000 | }, 1001 | { 1002 | "cell_type": "markdown", 1003 | "metadata": {}, 1004 | "source": [ 1005 | "### Exercise: Add F-MNIST to the Raw Dataset Catalog" 1006 | ] 1007 | }, 1008 | { 1009 | "cell_type": "code", 1010 | "execution_count": null, 1011 | "metadata": {}, 1012 | "outputs": [], 1013 | "source": [ 1014 | "workflow.add_datasource(fmnist)" 1015 | ] 1016 | }, 1017 | { 1018 | "cell_type": "code", 1019 | "execution_count": null, 1020 | "metadata": {}, 1021 | "outputs": [], 1022 | "source": [ 1023 | "# Your fmnist dataset should now show up here:\n", 1024 | "workflow.available_datasources()" 1025 | ] 1026 | }, 1027 | { 1028 | "cell_type": "code", 1029 | "execution_count": null, 1030 | "metadata": {}, 1031 | "outputs": [], 1032 | "source": [] 1033 | }, 1034 | { 1035 | "cell_type": "markdown", 1036 | "metadata": {}, 1037 | "source": [ 1038 | "### Nuke it from Orbit\n", 1039 | "\n", 1040 | "Now we can blow away all the data that we've downloaded and set up so far, and recreate it from the workflow datasource. Or, use some of our `make` commands!" 1041 | ] 1042 | }, 1043 | { 1044 | "cell_type": "code", 1045 | "execution_count": null, 1046 | "metadata": {}, 1047 | "outputs": [], 1048 | "source": [ 1049 | "!cd .. && make clean_raw" 1050 | ] 1051 | }, 1052 | { 1053 | "cell_type": "code", 1054 | "execution_count": null, 1055 | "metadata": {}, 1056 | "outputs": [], 1057 | "source": [ 1058 | "!ls -la $paths.raw_data_path" 1059 | ] 1060 | }, 1061 | { 1062 | "cell_type": "code", 1063 | "execution_count": null, 1064 | "metadata": {}, 1065 | "outputs": [], 1066 | "source": [ 1067 | "!cd .. && make fetch_sources" 1068 | ] 1069 | }, 1070 | { 1071 | "cell_type": "code", 1072 | "execution_count": null, 1073 | "metadata": {}, 1074 | "outputs": [], 1075 | "source": [ 1076 | "!ls -la $paths.raw_data_path" 1077 | ] 1078 | }, 1079 | { 1080 | "cell_type": "code", 1081 | "execution_count": null, 1082 | "metadata": {}, 1083 | "outputs": [], 1084 | "source": [ 1085 | "# What about fetch and unpack?\n", 1086 | "!cd .. && make clean_raw && make clean_interim" 1087 | ] 1088 | }, 1089 | { 1090 | "cell_type": "code", 1091 | "execution_count": null, 1092 | "metadata": {}, 1093 | "outputs": [], 1094 | "source": [ 1095 | "!ls -la $paths.raw_data_path" 1096 | ] 1097 | }, 1098 | { 1099 | "cell_type": "code", 1100 | "execution_count": null, 1101 | "metadata": {}, 1102 | "outputs": [], 1103 | "source": [ 1104 | "!ls -la $paths.interim_data_path" 1105 | ] 1106 | }, 1107 | { 1108 | "cell_type": "code", 1109 | "execution_count": null, 1110 | "metadata": {}, 1111 | "outputs": [], 1112 | "source": [ 1113 | "!cd .. 
&& make unpack_sources" 1114 | ] 1115 | }, 1116 | { 1117 | "cell_type": "code", 1118 | "execution_count": null, 1119 | "metadata": {}, 1120 | "outputs": [], 1121 | "source": [ 1122 | "!ls -la $paths.raw_data_path" 1123 | ] 1124 | }, 1125 | { 1126 | "cell_type": "code", 1127 | "execution_count": null, 1128 | "metadata": {}, 1129 | "outputs": [], 1130 | "source": [ 1131 | "!ls -la $paths.interim_data_path" 1132 | ] 1133 | }, 1134 | { 1135 | "cell_type": "markdown", 1136 | "metadata": {}, 1137 | "source": [ 1138 | "### Your data is now reproducible!" 1139 | ] 1140 | }, 1141 | { 1142 | "cell_type": "code", 1143 | "execution_count": null, 1144 | "metadata": {}, 1145 | "outputs": [], 1146 | "source": [] 1147 | } 1148 | ], 1149 | "metadata": { 1150 | "kernelspec": { 1151 | "display_name": "Python [default]", 1152 | "language": "python", 1153 | "name": "python3" 1154 | }, 1155 | "language_info": { 1156 | "codemirror_mode": { 1157 | "name": "ipython", 1158 | "version": 3 1159 | }, 1160 | "file_extension": ".py", 1161 | "mimetype": "text/x-python", 1162 | "name": "python", 1163 | "nbconvert_exporter": "python", 1164 | "pygments_lexer": "ipython3", 1165 | "version": "3.7.2" 1166 | } 1167 | }, 1168 | "nbformat": 4, 1169 | "nbformat_minor": 2 1170 | } 1171 | -------------------------------------------------------------------------------- /notebooks/22-transform-datasource.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 2.2 Transforming Data Sources into Data\n", 8 | "“It is a capital mistake to theorize before one has data.” Sherlock Holmes, “A Study in Scarlett” (Arthur Conan Doyle).\n", 9 | "\n", 10 | "“If we have data, let’s look at data. If all we have are opinions, let’s go with mine.” – Jim Barksdale, former Netscape CEO" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "%load_ext autoreload\n", 20 | "%autoreload 2" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "import logging\n", 30 | "from src.logging import logger\n", 31 | "logger.setLevel(logging.INFO)" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": {}, 37 | "source": [ 38 | "## Turning a `DataSource` into a `Dataset`\n", 39 | "How do we turn raw data sources into something useful? There are 2 steps:\n", 40 | "1. Write a function to extract meaningful `data` (and optionally, `target`) objects from your raw source files, ( a **parse function**, and\n", 41 | "2. 
package this **parse function** according to a very simple API\n", 42 | "\n", 43 | "\n", 44 | "First, let's grab the dataset we created in the last notebook.\n" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "### Loading a `DataSet` from the Catalog" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "from src import workflow\n", 61 | "from src.data import DataSource\n", 62 | "import pathlib" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "workflow.available_datasources()" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "dsrc = DataSource.from_name('lvq-pak') # load it from the catalog\n", 81 | "unpack_dir = dsrc.unpack() # Find the location of the unpacked files" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "!ls -la $unpack_dir" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "metadata": {}, 96 | "source": [ 97 | "### `parse_function` Template\n", 98 | "A **parse function** is a function that conforms to a very simple API: given some input, it returns a triple\n", 99 | "\n", 100 | "```(data, target, additional_metadata)```\n", 101 | "\n", 102 | "\n", 103 | "where `data` and `target` are in a format ingestible by, say, an sklearn pipeline.\n", 104 | "`additional_metadata` is a dictionary of key-value pairs that will be added to any existing metadata." 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "### Example: Processing lvq-pak data\n", 112 | "Let's convert the lvq-pak data (introduced in the last section) into into `data` and `target` vectors.\n", 113 | "\n", 114 | "#### Some exploratory EDA on lvq-pak datafiles" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "!ls -la $unpack_dir/lvq_pak-3.1 # Files are extracted to a subdirectory:" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "datafile_train = unpack_dir / 'lvq_pak-3.1' / 'ex1.dat'\n", 133 | "datafile_test = unpack_dir / 'lvq_pak-3.1' / 'ex2.dat'\n", 134 | "datafile_train.exists() and datafile_test.exists()" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "metadata": {}, 140 | "source": [ 141 | "What do these datafiles look like?" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "!head -5 $datafile_train" 151 | ] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "metadata": {}, 156 | "source": [ 157 | "So `datafile_train` (`ex1.dat`) appears to consists of:\n", 158 | "* the number of data columns, followed by\n", 159 | "* a comment line, then\n", 160 | "* space-delimited data\n", 161 | "\n", 162 | "**Wait!** There's a gotcha here. Look at the last entry in each row. That's the data label. In the last row, however, we see that `#` is used as a data label (easily confused for a comment). Be careful handling this!" 
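To convince yourself the gotcha is real before writing the parser, a quick check (reusing `datafile_train` from above) might look like the following; the point is that comment detection must stay switched off, exactly as in the `read_space_delimited` function defined below:

```python
import pandas as pd

# Read with comment detection disabled and confirm that '#' really is used as a class label.
df = pd.read_csv(datafile_train, skiprows=[0, 1], comment=None, header=None,
                 sep=' ', dtype=str, skip_blank_lines=True)
labels = df.iloc[:, -1]
print(sorted(labels.dropna().unique()))        # '#' should appear among the phoneme labels
print((labels == '#').sum(), "rows are labelled '#'")
# Had we passed comment='#', pandas would have cut each of those rows off at the
# label, silently corrupting the data.
```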
163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "!head -5 $datafile_test " 172 | ] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "metadata": {}, 177 | "source": [ 178 | " `datafile_test` (`ex2.dat`) is similar, but has no comment header.\n", 179 | " \n", 180 | "#### Parsing lvq-pak data files" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "metadata": {}, 187 | "outputs": [], 188 | "source": [ 189 | "import pandas as pd\n", 190 | "import numpy as np\n", 191 | "from functools import partial" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "metadata": {}, 198 | "outputs": [], 199 | "source": [ 200 | "def read_space_delimited(filename, skiprows=None, class_labels=True, metadata=None):\n", 201 | " \"\"\"Read an space-delimited file\n", 202 | " \n", 203 | " Data is space-delimited. Last column is the (string) label for the data\n", 204 | "\n", 205 | " Note: we can't use automatic comment detection, as `#` characters are also\n", 206 | " used as data labels.\n", 207 | "\n", 208 | " Parameters\n", 209 | " ----------\n", 210 | " skiprows: list-like, int or callable, optional\n", 211 | " list of rows to skip when reading the file. See `pandas.read_csv`\n", 212 | " entry on `skiprows` for more\n", 213 | " class_labels: boolean\n", 214 | " if true, the last column is treated as the class (target) label\n", 215 | " \"\"\"\n", 216 | " with open(filename, 'r') as fd:\n", 217 | " df = pd.read_csv(fd, skiprows=skiprows, skip_blank_lines=True,\n", 218 | " comment=None, header=None, sep=' ', dtype=str)\n", 219 | " # targets are last column. Data is everything else\n", 220 | " if class_labels is True:\n", 221 | " target = df.loc[:, df.columns[-1]].values\n", 222 | " data = df.loc[:, df.columns[:-1]].values\n", 223 | " else:\n", 224 | " data = df.values\n", 225 | " target = np.zeros(data.shape[0])\n", 226 | " return data, target, metadata" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": null, 232 | "metadata": {}, 233 | "outputs": [], 234 | "source": [ 235 | "data, target, metadata = read_space_delimited(datafile_train, skiprows=[0,1])\n", 236 | "data.shape, target.shape, metadata" 237 | ] 238 | }, 239 | { 240 | "cell_type": "markdown", 241 | "metadata": {}, 242 | "source": [ 243 | "We could be done here, but let's go a little further and allow the parsing function to return either `train`, `test` or `all` data:" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "metadata": {}, 250 | "outputs": [], 251 | "source": [ 252 | "def process_lvq_pak(*, unpack_dir, kind='all', extract_dir='lvq_pak-3.1', metadata=None):\n", 253 | " \"\"\"\n", 254 | " Parse LVQ-PAK datafiles into usable numpy arrays\n", 255 | " \n", 256 | " Parameters\n", 257 | " ----------\n", 258 | " unpack_dir: path\n", 259 | " path to unpacked tarfile\n", 260 | " extract_dir: string\n", 261 | " name of directory in the unpacked tarfile containing\n", 262 | " the raw data files\n", 263 | " kind: {'train', 'test', 'all'}\n", 264 | " \n", 265 | " \n", 266 | " Returns\n", 267 | " -------\n", 268 | " A tuple: \n", 269 | " (data, target, additional_metadata)\n", 270 | " \n", 271 | " \"\"\"\n", 272 | " if metadata is None:\n", 273 | " metadata = {}\n", 274 | "\n", 275 | " if unpack_dir:\n", 276 | " unpack_dir = pathlib.Path(unpack_dir)\n", 277 | "\n", 278 | " data_dir = unpack_dir / extract_dir\n", 
279 | "\n", 280 | " if kind == 'train':\n", 281 | " data, target, metadata = read_space_delimited(data_dir / 'ex1.dat',\n", 282 | " skiprows=[0,1],\n", 283 | " metadata=metadata)\n", 284 | " elif kind == 'test':\n", 285 | " data, target, metadata = read_space_delimited(data_dir / 'ex2.dat',\n", 286 | " skiprows=[0],\n", 287 | " metadata=metadata)\n", 288 | " elif kind == 'all':\n", 289 | " data1, target1, metadata = read_space_delimited(data_dir / 'ex1.dat', skiprows=[0,1],\n", 290 | " metadata=metadata)\n", 291 | " data2, target2, metadata = read_space_delimited(data_dir / 'ex2.dat', skiprows=[0],\n", 292 | " metadata=metadata)\n", 293 | " data = np.vstack((data1, data2))\n", 294 | " target = np.append(target1, target2)\n", 295 | " else:\n", 296 | " raise Exception(f'Unknown kind: {kind}')\n", 297 | "\n", 298 | " return data, target, metadata" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": null, 304 | "metadata": {}, 305 | "outputs": [], 306 | "source": [ 307 | "# All data by default\n", 308 | "data, target, metadata = process_lvq_pak(unpack_dir=unpack_dir)\n", 309 | "data.shape, target.shape, metadata" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": null, 315 | "metadata": {}, 316 | "outputs": [], 317 | "source": [ 318 | "# Training data \n", 319 | "data, target, metadata = process_lvq_pak(unpack_dir=unpack_dir, kind='train')\n", 320 | "data.shape, target.shape, metadata" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": null, 326 | "metadata": {}, 327 | "outputs": [], 328 | "source": [ 329 | "# Test data \n", 330 | "data, target, metadata = process_lvq_pak(unpack_dir=unpack_dir, kind='test')\n", 331 | "data.shape, target.shape, metadata" 332 | ] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "execution_count": null, 337 | "metadata": {}, 338 | "outputs": [], 339 | "source": [] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": null, 344 | "metadata": {}, 345 | "outputs": [], 346 | "source": [ 347 | "dsrc.parse_function = partial(process_lvq_pak, unpack_dir=str(unpack_dir))" 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": null, 353 | "metadata": {}, 354 | "outputs": [], 355 | "source": [ 356 | "dsrc.dataset_opts()" 357 | ] 358 | }, 359 | { 360 | "cell_type": "markdown", 361 | "metadata": {}, 362 | "source": [ 363 | "### Write this into the catalog" 364 | ] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "execution_count": null, 369 | "metadata": {}, 370 | "outputs": [], 371 | "source": [ 372 | "# Now we want to save this to the workflow. We can just do the same as before, right?" 
373 | ] 374 | }, 375 | { 376 | "cell_type": "code", 377 | "execution_count": null, 378 | "metadata": {}, 379 | "outputs": [], 380 | "source": [ 381 | "workflow.add_datasource(dsrc)" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": null, 387 | "metadata": {}, 388 | "outputs": [], 389 | "source": [ 390 | "workflow.available_datasources()" 391 | ] 392 | }, 393 | { 394 | "cell_type": "code", 395 | "execution_count": null, 396 | "metadata": {}, 397 | "outputs": [], 398 | "source": [ 399 | "dset_catalog, dset_catalog_file = workflow.available_datasources(keys_only=False)" 400 | ] 401 | }, 402 | { 403 | "cell_type": "code", 404 | "execution_count": null, 405 | "metadata": {}, 406 | "outputs": [], 407 | "source": [ 408 | "dset_catalog['lvq-pak']" 409 | ] 410 | }, 411 | { 412 | "cell_type": "markdown", 413 | "metadata": {}, 414 | "source": [ 415 | "### Create a Dataset" 416 | ] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": null, 421 | "metadata": {}, 422 | "outputs": [], 423 | "source": [ 424 | "ds = dsrc.process() # Use the load_function to convert this DataSource to a real Dataset\n", 425 | "str(ds)" 426 | ] 427 | }, 428 | { 429 | "cell_type": "code", 430 | "execution_count": null, 431 | "metadata": {}, 432 | "outputs": [], 433 | "source": [ 434 | "print(ds)" 435 | ] 436 | }, 437 | { 438 | "cell_type": "code", 439 | "execution_count": null, 440 | "metadata": {}, 441 | "outputs": [], 442 | "source": [ 443 | "ds = dsrc.process(kind=\"test\") # Should be half the size\n", 444 | "print(ds)" 445 | ] 446 | }, 447 | { 448 | "cell_type": "code", 449 | "execution_count": null, 450 | "metadata": {}, 451 | "outputs": [], 452 | "source": [ 453 | "type(ds)" 454 | ] 455 | }, 456 | { 457 | "cell_type": "markdown", 458 | "metadata": {}, 459 | "source": [ 460 | "## EXERCISE: Turn the F-MNIST `DataSource` into a `Dataset`\n", 461 | "In the last exercise, you fetched and unpacked F-MNIST data.\n", 462 | "Now it's time to process it into a `Dataset` object." 
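There is no single right answer to this exercise, but a minimal sketch might look like the following. It reads the gzipped IDX files directly; adjust the paths (or add a decompression step) to match how `unpack()` actually lays out the `fmnist` interim directory on your machine. The helper names and the final attach/process lines are illustrative only:

```python
import gzip
import pathlib
from functools import partial

import numpy as np

def read_idx_images(path):
    """Read a gzipped IDX image file into an (n_samples, 784) uint8 array."""
    with gzip.open(path, 'rb') as fd:
        buf = fd.read()
    # 16-byte header: magic number, image count, rows, cols (4-byte big-endian ints)
    return np.frombuffer(buf, dtype=np.uint8, offset=16).reshape(-1, 28 * 28)

def read_idx_labels(path):
    """Read a gzipped IDX label file into an (n_samples,) uint8 array."""
    with gzip.open(path, 'rb') as fd:
        buf = fd.read()
    # 8-byte header: magic number, label count
    return np.frombuffer(buf, dtype=np.uint8, offset=8)

def process_fmnist(*, unpack_dir, kind='train', metadata=None):
    """Parse function: return the (data, target, additional_metadata) triple for F-MNIST."""
    if metadata is None:
        metadata = {}
    unpack_dir = pathlib.Path(unpack_dir)
    prefix = 'train' if kind == 'train' else 't10k'   # 'test' -> the t10k files
    data = read_idx_images(unpack_dir / f'{prefix}-images-idx3-ubyte.gz')
    target = read_idx_labels(unpack_dir / f'{prefix}-labels-idx1-ubyte.gz')
    return data, target, metadata

# Attach the parse function the same way as for lvq-pak, then build the Dataset:
fmnist = DataSource.from_name('fmnist')
fmnist_unpack = fmnist.unpack()
fmnist.parse_function = partial(process_fmnist, unpack_dir=str(fmnist_unpack))
workflow.add_datasource(fmnist)            # update the catalog entry
fmnist_train = fmnist.process(kind='train')
print(fmnist_train)
```

As with lvq-pak, the `kind` keyword passed to `process()` ends up in the parse function, so the same `DataSource` can produce train and test `Dataset` objects.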
463 | ] 464 | }, 465 | { 466 | "cell_type": "markdown", 467 | "metadata": {}, 468 | "source": [ 469 | "## The `Dataset` and Data Transformations" 470 | ] 471 | }, 472 | { 473 | "cell_type": "code", 474 | "execution_count": null, 475 | "metadata": {}, 476 | "outputs": [], 477 | "source": [] 478 | }, 479 | { 480 | "cell_type": "code", 481 | "execution_count": null, 482 | "metadata": {}, 483 | "outputs": [], 484 | "source": [] 485 | }, 486 | { 487 | "cell_type": "markdown", 488 | "metadata": {}, 489 | "source": [ 490 | "### Tour of the Dataset Object" 491 | ] 492 | }, 493 | { 494 | "cell_type": "markdown", 495 | "metadata": {}, 496 | "source": [ 497 | "### Creating a Simple Transformer" 498 | ] 499 | }, 500 | { 501 | "cell_type": "markdown", 502 | "metadata": {}, 503 | "source": [ 504 | "### More Complicated Transformers" 505 | ] 506 | }, 507 | { 508 | "cell_type": "markdown", 509 | "metadata": {}, 510 | "source": [ 511 | "## Reproducible Data: The Punchline" 512 | ] 513 | }, 514 | { 515 | "cell_type": "code", 516 | "execution_count": null, 517 | "metadata": {}, 518 | "outputs": [], 519 | "source": [] 520 | }, 521 | { 522 | "cell_type": "code", 523 | "execution_count": null, 524 | "metadata": {}, 525 | "outputs": [], 526 | "source": [] 527 | } 528 | ], 529 | "metadata": { 530 | "kernelspec": { 531 | "display_name": "Python 3", 532 | "language": "python", 533 | "name": "python3" 534 | }, 535 | "language_info": { 536 | "codemirror_mode": { 537 | "name": "ipython", 538 | "version": 3 539 | }, 540 | "file_extension": ".py", 541 | "mimetype": "text/x-python", 542 | "name": "python", 543 | "nbconvert_exporter": "python", 544 | "pygments_lexer": "ipython3", 545 | "version": "3.7.2" 546 | } 547 | }, 548 | "nbformat": 4, 549 | "nbformat_minor": 2 550 | } 551 | -------------------------------------------------------------------------------- /notebooks/30-bjorn-train-predict.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from src.data import Dataset\n", 10 | "from src import workflow" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "%load_ext autoreload\n", 20 | "%autoreload 2" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "import logging\n", 30 | "logging.basicConfig(level=logging.DEBUG)\n", 31 | "logger = logging.getLogger()" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": {}, 37 | "source": [ 38 | "# We’re gonna (data) science the *@#! out of this\n", 39 | "\n", 40 | "Now that we're getting good at automating the `Dataset` generation process, let's acutally **use** our data!\n", 41 | "\n", 42 | "## Bjørn's Problem: Supervised Learning\n", 43 | "\n", 44 | "Bjørn employs a large number of Finnish line cooks. He can’t understand a word they say.\n", 45 | "\n", 46 | "Bjørn needs a trained model to do real-time translation from Finnish to Swedish.\n", 47 | "\n", 48 | "Bjørn has decided to start with the Finnish phoneme dataset shipped with a project called lvq-pak. 
His objective is to train three different models, and choose the one with the best overall accuracy score.\n" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "## Load the Dataset\n", 56 | "In a previous notebook, we created training and test versions of the lvq-pak `Dataset` object. Let's reload these and have a look." 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "workflow.available_datasets()" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "**Recall**: the data consists of 20-dimensional MFCC data." 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "ds_train = Dataset.load('lvq-pak_train')" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "ds_train.data.shape" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "ds_train.target" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": {}, 105 | "source": [ 106 | "The target labels are numerical. If for some reason you were interested in phoneme labels themselves, this map is stored in the Dataset metadata:" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "ds_train.LABEL_MAP" 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": {}, 121 | "source": [ 122 | "Let's grab the test set as well." 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "ds_test = Dataset.load('lvq-pak_test')" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [ 140 | "ds_test.data.shape" 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "metadata": {}, 146 | "source": [ 147 | "A quick look at the license verifies that, while we are free to use this data for experimentation, we can't turn around and ship a commercial Finnish to Swedish translator. That's okay. This is for Bjørn's kitchen only:" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": null, 153 | "metadata": {}, 154 | "outputs": [], 155 | "source": [ 156 | "print(ds_train.LICENSE)" 157 | ] 158 | }, 159 | { 160 | "cell_type": "markdown", 161 | "metadata": {}, 162 | "source": [ 163 | "## Let's train a model (the old-fashioned way)\n", 164 | "We will walk through one example of building a model by hand. Later, we will convert this process to a reproducible data science workflow. \n", 165 | "\n", 166 | "Let's add the **Linear Support Vector Classifier** from scikit-learn." 
167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "from sklearn.svm import LinearSVC" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "metadata": {}, 182 | "outputs": [], 183 | "source": [ 184 | "model = LinearSVC(random_state=42)" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "model.fit(ds_train.data, ds_train.target)" 194 | ] 195 | }, 196 | { 197 | "cell_type": "markdown", 198 | "metadata": {}, 199 | "source": [ 200 | "Whoops. We had better increase the number of iterations until the model actually converges." 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "%%time\n", 210 | "model = LinearSVC(random_state=42, max_iter=200000)\n", 211 | "model.fit(ds_train.data, ds_train.target)" 212 | ] 213 | }, 214 | { 215 | "cell_type": "markdown", 216 | "metadata": {}, 217 | "source": [ 218 | "## Use the model to predict phoneme classes\n" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": null, 224 | "metadata": {}, 225 | "outputs": [], 226 | "source": [ 227 | "lsvc_prediction = model.predict(ds_test.data);\n", 228 | "lsvc_prediction[:20]" 229 | ] 230 | }, 231 | { 232 | "cell_type": "markdown", 233 | "metadata": {}, 234 | "source": [ 235 | "## Assess the quality of the prediction\n" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": null, 241 | "metadata": {}, 242 | "outputs": [], 243 | "source": [ 244 | "model.score(ds_test.data, ds_test.target)" 245 | ] 246 | }, 247 | { 248 | "cell_type": "markdown", 249 | "metadata": {}, 250 | "source": [ 251 | "\"Score\" seems a little opaque. What kind of score is being used here? Turns out it's an **accuracy score**. Here it is a little more explicitly:" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": null, 257 | "metadata": {}, 258 | "outputs": [], 259 | "source": [ 260 | "from sklearn.metrics import accuracy_score\n", 261 | "help(accuracy_score)" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": null, 267 | "metadata": {}, 268 | "outputs": [], 269 | "source": [ 270 | "accuracy_score(ds_test.target, lsvc_prediction)" 271 | ] 272 | }, 273 | { 274 | "cell_type": "markdown", 275 | "metadata": {}, 276 | "source": [ 277 | "Now, let's automate this process, and make it reproducible." 278 | ] 279 | }, 280 | { 281 | "cell_type": "markdown", 282 | "metadata": {}, 283 | "source": [ 284 | "# Step 3: Train Models (`make train`)\n", 285 | "In this step, we use the processed datasets we created in *Step 2* (`make data`) to train and save models. For this workflow, a **Model** is an object that conforms to the scikit-learn `BaseEstimator` API.\n", 286 | "\n", 287 | "\"The\n", 288 | "\n", 289 | "\n", 290 | "## Add our algorithm to `available_algorithms()`\n", 291 | "\n", 292 | "How do we make an algorithm available for use with our reproducible data science workflow? We give it a name (a text string), and map this string to the function we wish to call. 
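As a rough illustration of that mapping (not the actual contents of the file), the registry in `src/models/algorithms.py` might end up looking something like this once the exercise below is done:

```python
# Illustrative sketch of src/models/algorithms.py -- only the 'linearSVC' entry is
# what the exercise below asks you to add; everything else here is hypothetical.
from sklearn.svm import LinearSVC

_ALGORITHMS = {
    # ... entries the project already registers ...
    'linearSVC': LinearSVC(),
}
```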
We will use this general technique throughout this flow to make various algorithms, datasets, models, and analyses usable by our workflow process\n" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": null, 298 | "metadata": {}, 299 | "outputs": [], 300 | "source": [ 301 | "workflow.available_algorithms()" 302 | ] 303 | }, 304 | { 305 | "cell_type": "markdown", 306 | "metadata": {}, 307 | "source": [ 308 | "### EXERCISE\n", 309 | "To add an algorithm to this list, you need to add a key:value pair to the dict `_ALGORITHMS` in `src/models/algorithms.py`.\n", 310 | "\n", 311 | "1. add\n", 312 | "```\n", 313 | "'linearSVC': LinearSVC()\n", 314 | "```\n", 315 | "to the `_ALGORITHMS` dict\n", 316 | "2. Add\n", 317 | "```\n", 318 | "from sklearn.svm import LinearSVC\n", 319 | "```\n", 320 | "to the top of the file.\n", 321 | "\n", 322 | "3. Add `linearSVC` to the docstring of `available_algorithms`." 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": null, 328 | "metadata": {}, 329 | "outputs": [], 330 | "source": [ 331 | "help(workflow.available_algorithms)" 332 | ] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "execution_count": null, 337 | "metadata": {}, 338 | "outputs": [], 339 | "source": [ 340 | "workflow.available_algorithms()" 341 | ] 342 | }, 343 | { 344 | "cell_type": "markdown", 345 | "metadata": {}, 346 | "source": [ 347 | "Now we can add **model generation instructions** to our reproducible data science workflow. In this case, apply the `linearSVC` model to the `lvq-pak_train` dataset:" 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": null, 353 | "metadata": {}, 354 | "outputs": [], 355 | "source": [ 356 | "workflow.add_model(dataset_name='lvq-pak_train',\n", 357 | " algorithm_name=\"linearSVC\",\n", 358 | " algorithm_params={'random_state': 42, 'max_iter': 200000})" 359 | ] 360 | }, 361 | { 362 | "cell_type": "markdown", 363 | "metadata": {}, 364 | "source": [ 365 | "We can see the complete list of model/dataset combinations using `get_model_list()`" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": null, 371 | "metadata": {}, 372 | "outputs": [], 373 | "source": [ 374 | "workflow.get_model_list()" 375 | ] 376 | }, 377 | { 378 | "cell_type": "markdown", 379 | "metadata": {}, 380 | "source": [ 381 | "To actually train this model:" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": null, 387 | "metadata": {}, 388 | "outputs": [], 389 | "source": [ 390 | "workflow.make_train()" 391 | ] 392 | }, 393 | { 394 | "cell_type": "markdown", 395 | "metadata": {}, 396 | "source": [ 397 | "Or alternately, from the Makefile:" 398 | ] 399 | }, 400 | { 401 | "cell_type": "code", 402 | "execution_count": null, 403 | "metadata": {}, 404 | "outputs": [], 405 | "source": [ 406 | "!cd .. && make train" 407 | ] 408 | }, 409 | { 410 | "cell_type": "markdown", 411 | "metadata": {}, 412 | "source": [ 413 | "The output of this process is a **trained model**. We currently record this in two places:\n", 414 | "* A trained model in `models/trained_models`\n", 415 | "* A json file on disk (`models/trained_models.json`). \n", 416 | "\n", 417 | "Of course, we also make this information available via a workflow command: `available_models()`. 
Notice the clever naming scheme for the model produced by applying `linearSVC` to `lvq-pak_train`:" 418 | ] 419 | }, 420 | { 421 | "cell_type": "code", 422 | "execution_count": null, 423 | "metadata": {}, 424 | "outputs": [], 425 | "source": [ 426 | "workflow.available_models()" 427 | ] 428 | }, 429 | { 430 | "cell_type": "markdown", 431 | "metadata": {}, 432 | "source": [ 433 | "### ASIDE: Under the Hood" 434 | ] 435 | }, 436 | { 437 | "cell_type": "markdown", 438 | "metadata": {}, 439 | "source": [ 440 | "If you take a peek into the `Makefile`, you'll notice that `make train` takes a `models/model_list.json` as input.\n", 441 | "```\n", 442 | "## train / fit / build models\n", 443 | "train: models/model_list.json\n", 444 | "\t$(PYTHON_INTERPRETER) -m src.models.train_model model_list.json\n", 445 | "```\n", 446 | "\n", 447 | "Under the hood, a `model_list.json` is a list of dicts, where each dict specifices a combination of:\n", 448 | "* `dataset_name`: A valid dataset name from `available_datasets()`\n", 449 | "* `algorithm_name`: A valid dataset name from `available_algorithms()`\n", 450 | "* `algorithm_params`: A dictionary of parameters to use when running the specified algorithm\n", 451 | "* `run_number`: (optional, default 1) A unique integer used to distinguish between different builds with otherwise identical parameters\n", 452 | "\n", 453 | "Throughout this reproducible data science workflow, we are constantly creating and storing information in json files on disk. " 454 | ] 455 | }, 456 | { 457 | "cell_type": "code", 458 | "execution_count": null, 459 | "metadata": {}, 460 | "outputs": [], 461 | "source": [ 462 | "!cat ../models/model_list.json" 463 | ] 464 | }, 465 | { 466 | "cell_type": "markdown", 467 | "metadata": {}, 468 | "source": [ 469 | "You don't necessarily need to know any of this, but sometimes it's nice to know what's going on under the hood." 470 | ] 471 | }, 472 | { 473 | "cell_type": "markdown", 474 | "metadata": {}, 475 | "source": [ 476 | "### What exactly is a trained model in our reproducible workflow?\n", 477 | "Let's take a look at the output from `make train`" 478 | ] 479 | }, 480 | { 481 | "cell_type": "code", 482 | "execution_count": null, 483 | "metadata": {}, 484 | "outputs": [], 485 | "source": [ 486 | "from src.paths import trained_model_path" 487 | ] 488 | }, 489 | { 490 | "cell_type": "code", 491 | "execution_count": null, 492 | "metadata": {}, 493 | "outputs": [], 494 | "source": [ 495 | "workflow.available_models()" 496 | ] 497 | }, 498 | { 499 | "cell_type": "code", 500 | "execution_count": null, 501 | "metadata": {}, 502 | "outputs": [], 503 | "source": [ 504 | "# load up the trained model\n", 505 | "from src.models.train import load_model\n", 506 | "\n", 507 | "tm, tm_metadata = load_model(model_name='linearSVC_lvq-pak_train_1')" 508 | ] 509 | }, 510 | { 511 | "cell_type": "code", 512 | "execution_count": null, 513 | "metadata": {}, 514 | "outputs": [], 515 | "source": [ 516 | "tm" 517 | ] 518 | }, 519 | { 520 | "cell_type": "markdown", 521 | "metadata": {}, 522 | "source": [ 523 | "Just as before, this is function that conforms to the sklearn `BaseEstimator` API. 
In addition to the trained model, we also returned some useful metadata, which includes the hashes of the input data, the hash of the generated model, and everything we need to know to train the model from scratch" 524 | ] 525 | }, 526 | { 527 | "cell_type": "code", 528 | "execution_count": null, 529 | "metadata": {}, 530 | "outputs": [], 531 | "source": [ 532 | "tm_metadata" 533 | ] 534 | }, 535 | { 536 | "cell_type": "markdown", 537 | "metadata": {}, 538 | "source": [ 539 | "Just to check, we can verify that the stored dataset called `lvq-pak_train` was the same one used to train this model: (**data provenance** in action!)" 540 | ] 541 | }, 542 | { 543 | "cell_type": "code", 544 | "execution_count": null, 545 | "metadata": {}, 546 | "outputs": [], 547 | "source": [ 548 | "ds = Dataset.load('lvq-pak_train')\n", 549 | "ds.DATA_HASH" 550 | ] 551 | }, 552 | { 553 | "cell_type": "markdown", 554 | "metadata": {}, 555 | "source": [ 556 | "## An Aside: sklearn Estimator API\n", 557 | "To implement the notion of a model, we borrow a basic data type from scikit-learn: the **BaseEstimator**. To use an algorithm as a model, we must build it into a class which:\n", 558 | "* is a subclass of the sklearn `BaseEstimator` class (or implements `get_params`, `set_params`)\n", 559 | "* has a `fit` method (needed for `make train`)\n", 560 | "* has either a `predict` method (if it's a **supervised learning** problem) or a `transform` method (**unsupervised learning** problem) (needed for `make predict`)\n", 561 | "\n", 562 | "We will see how things work in the unsupervised case in the next workbook. \n", 563 | "\n", 564 | "One of the advantages of using the sklearn **Estimator** API is that a model can consist of any combination of \"algorithms\" as long as that combination is a `BaseEstimator` implementing above methods. For example, you can use an sklearn `Pipeline`, or an sklearn meta-estimator like `GridSearchCV` to implement a model. \n", 565 | "\n", 566 | "If your algorithm of choice is **not yet** a `BaseEstimator` with the appropriate API, it is fairly easy to wrap it to be used in this way. We'll leave this as an exercise for the reader.\n" 567 | ] 568 | }, 569 | { 570 | "cell_type": "markdown", 571 | "metadata": {}, 572 | "source": [ 573 | "# Step 4: `make predict`\n", 574 | "\n", 575 | "In the **Predict/Transform** step, we flow data through our trained models to obtain **new Datasets** - either predictions, or transformations, depending whether we are using supervised or unsupervised-style algorithms. \n", 576 | "\n", 577 | "\"The\n", 578 | "\n", 579 | "\n" 580 | ] 581 | }, 582 | { 583 | "cell_type": "markdown", 584 | "metadata": {}, 585 | "source": [ 586 | "### Predicting Phonemes\n", 587 | "Bjørn is doing supervised learning, (and he did a train/test split on the data before we started), so let's use the test set here to do the prediction." 
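Before running the prediction, here is a rough sketch of the wrapping exercise left to the reader in the aside above: taking a rule that is not yet an sklearn estimator and packaging it as a `BaseEstimator` with `fit` and `predict`, so it could be registered as an algorithm like any other. The class and its behaviour are purely illustrative:

```python
import numpy as np
from sklearn.base import BaseEstimator

class NearestMeanClassifier(BaseEstimator):
    """Toy wrapper: predict the class whose training mean is closest to the sample."""

    def __init__(self, metric='euclidean'):
        # Constructor arguments must be stored unchanged so the inherited
        # get_params/set_params machinery works. Only 'euclidean' is implemented here.
        self.metric = metric

    def fit(self, X, y):
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)
        self.classes_ = np.unique(y)
        self.means_ = np.stack([X[y == c].mean(axis=0) for c in self.classes_])
        return self

    def predict(self, X):
        X = np.asarray(X, dtype=float)
        # Distance from every sample to every class mean; pick the closest.
        dists = np.linalg.norm(X[:, None, :] - self.means_[None, :, :], axis=-1)
        return self.classes_[dists.argmin(axis=1)]
```

Registered under a name in `_ALGORITHMS`, a class like this can be used with `workflow.add_model` exactly like `linearSVC`.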
588 | ] 589 | }, 590 | { 591 | "cell_type": "code", 592 | "execution_count": null, 593 | "metadata": {}, 594 | "outputs": [], 595 | "source": [ 596 | "workflow.add_prediction(dataset_name='lvq-pak_test',\n", 597 | " model_name='linearSVC_lvq-pak_train_1',\n", 598 | " is_supervised=True)" 599 | ] 600 | }, 601 | { 602 | "cell_type": "code", 603 | "execution_count": null, 604 | "metadata": {}, 605 | "outputs": [], 606 | "source": [ 607 | "workflow.get_prediction_list()" 608 | ] 609 | }, 610 | { 611 | "cell_type": "code", 612 | "execution_count": null, 613 | "metadata": {}, 614 | "outputs": [], 615 | "source": [ 616 | "workflow.make_predict()" 617 | ] 618 | }, 619 | { 620 | "cell_type": "code", 621 | "execution_count": null, 622 | "metadata": {}, 623 | "outputs": [], 624 | "source": [ 625 | "# This is the same as\n", 626 | "!cd .. && make predict" 627 | ] 628 | }, 629 | { 630 | "cell_type": "code", 631 | "execution_count": null, 632 | "metadata": {}, 633 | "outputs": [], 634 | "source": [ 635 | "workflow.available_predictions()" 636 | ] 637 | }, 638 | { 639 | "cell_type": "markdown", 640 | "metadata": {}, 641 | "source": [ 642 | "Yuck. We didn't specify an output dataset name, so our workflow just inferred one that makes sense (though it is a bit of a mouthful). Let's fix that." 643 | ] 644 | }, 645 | { 646 | "cell_type": "code", 647 | "execution_count": null, 648 | "metadata": {}, 649 | "outputs": [], 650 | "source": [ 651 | "workflow.get_prediction_list()" 652 | ] 653 | }, 654 | { 655 | "cell_type": "code", 656 | "execution_count": null, 657 | "metadata": {}, 658 | "outputs": [], 659 | "source": [ 660 | "prediction = workflow.pop_prediction()\n", 661 | "prediction['output_dataset'] = 'lvq-test-svc'\n", 662 | "workflow.add_prediction(**prediction)\n", 663 | "workflow.get_prediction_list()" 664 | ] 665 | }, 666 | { 667 | "cell_type": "code", 668 | "execution_count": null, 669 | "metadata": {}, 670 | "outputs": [], 671 | "source": [ 672 | "workflow.make_predict()" 673 | ] 674 | }, 675 | { 676 | "cell_type": "code", 677 | "execution_count": null, 678 | "metadata": {}, 679 | "outputs": [], 680 | "source": [ 681 | "workflow.available_predictions()" 682 | ] 683 | }, 684 | { 685 | "cell_type": "markdown", 686 | "metadata": {}, 687 | "source": [ 688 | "Now we have two predictions. We'll see here that they are the same." 689 | ] 690 | }, 691 | { 692 | "cell_type": "markdown", 693 | "metadata": {}, 694 | "source": [ 695 | "### What is a Prediction?\n", 696 | "\n", 697 | "Under the hood, a Prediction is just a `Dataset` with an added `experiment` metadata header." 
698 | ] 699 | }, 700 | { 701 | "cell_type": "code", 702 | "execution_count": null, 703 | "metadata": {}, 704 | "outputs": [], 705 | "source": [ 706 | "from src.paths import model_output_path\n", 707 | "from src.utils import list_dir" 708 | ] 709 | }, 710 | { 711 | "cell_type": "code", 712 | "execution_count": null, 713 | "metadata": {}, 714 | "outputs": [], 715 | "source": [ 716 | "list_dir(model_output_path)" 717 | ] 718 | }, 719 | { 720 | "cell_type": "code", 721 | "execution_count": null, 722 | "metadata": {}, 723 | "outputs": [], 724 | "source": [ 725 | "predict_ds = Dataset.load('lvq-test-svc', data_path=model_output_path)" 726 | ] 727 | }, 728 | { 729 | "cell_type": "code", 730 | "execution_count": null, 731 | "metadata": {}, 732 | "outputs": [], 733 | "source": [ 734 | "predict_ds.data.shape" 735 | ] 736 | }, 737 | { 738 | "cell_type": "code", 739 | "execution_count": null, 740 | "metadata": {}, 741 | "outputs": [], 742 | "source": [ 743 | "predict_ds.metadata['experiment']" 744 | ] 745 | }, 746 | { 747 | "cell_type": "markdown", 748 | "metadata": {}, 749 | "source": [ 750 | "Here we have saved all sorts of useful information, such as the hashes of the data that went in, and the start time/diration of the prediction itself. Most importantly, the prediction we got via this process was exactly the same as the one we did manually, before converting our process to a reproducible workflow." 751 | ] 752 | }, 753 | { 754 | "cell_type": "code", 755 | "execution_count": null, 756 | "metadata": {}, 757 | "outputs": [], 758 | "source": [ 759 | "ds = Dataset.load('lvq-pak_test')\n", 760 | "ds.DATA_HASH" 761 | ] 762 | }, 763 | { 764 | "cell_type": "markdown", 765 | "metadata": {}, 766 | "source": [ 767 | "Finally, check that our prediction matches what we got **before** we turned this into an automated reproducible workflow:\n" 768 | ] 769 | }, 770 | { 771 | "cell_type": "code", 772 | "execution_count": null, 773 | "metadata": {}, 774 | "outputs": [], 775 | "source": [ 776 | "all(predict_ds.data == lsvc_prediction)" 777 | ] 778 | }, 779 | { 780 | "cell_type": "markdown", 781 | "metadata": {}, 782 | "source": [ 783 | "### An Aside: \"Randomness\" and `random_state`\n", 784 | "Randomness is often a key feature of machine learning algorithms, but for reproducible data science, it is death. It's essential, when building reproducible data science flows, that our randomness is controlled by a deterministic `random_state` (or random_seed). " 785 | ] 786 | }, 787 | { 788 | "cell_type": "code", 789 | "execution_count": null, 790 | "metadata": {}, 791 | "outputs": [], 792 | "source": [ 793 | "model, model_meta = load_model('linearSVC_lvq-pak_train_1')\n", 794 | "model_meta['algorithm_params']" 795 | ] 796 | }, 797 | { 798 | "cell_type": "markdown", 799 | "metadata": {}, 800 | "source": [ 801 | "**Always** pass in a `random_state`. If we want to run our algorithm multiple times with different random states, we can even use `GridSearchCV` where the only parameter that we're varying over is the `random_state`. " 802 | ] 803 | }, 804 | { 805 | "cell_type": "markdown", 806 | "metadata": {}, 807 | "source": [ 808 | "## Summary: `make train`, `make predict`\n", 809 | "That felt like a lot of exposition. 
In fact, here's what we ended up doing:" 810 | ] 811 | }, 812 | { 813 | "cell_type": "raw", 814 | "metadata": {}, 815 | "source": [ 816 | "# Add `linearSVC` to the algorithm list in `src/models/algorithms.py`\n", 817 | "\n", 818 | "# train a model called \"linearSVC_lvq-pak_train_1\"\n", 819 | "workflow.add_model(dataset_name='lvq-pak_train',\n", 820 | " algorithm_name=\"linearSVC\",\n", 821 | " algorithm_params={'random_state': 42, 'max_iter': 200000})\n", 822 | "# \n", 823 | "workflow.add_prediction(dataset_name='lvq-pak_test',\n", 824 | " model_name='linearSVC_lvq-pak_train_1', \n", 825 | " is_supervised=True, output_dataset='lvq-test-svc')\n", 826 | "\n", 827 | "workflow.make_train() # or `make train`\n", 828 | "workflow.make_predict() # or `make predict`" 829 | ] 830 | } 831 | ], 832 | "metadata": { 833 | "kernelspec": { 834 | "display_name": "Python [conda env:bus_number]", 835 | "language": "python", 836 | "name": "conda-env-bus_number-py" 837 | }, 838 | "language_info": { 839 | "codemirror_mode": { 840 | "name": "ipython", 841 | "version": 3 842 | }, 843 | "file_extension": ".py", 844 | "mimetype": "text/x-python", 845 | "name": "python", 846 | "nbconvert_exporter": "python", 847 | "pygments_lexer": "ipython3", 848 | "version": "3.6.6" 849 | } 850 | }, 851 | "nbformat": 4, 852 | "nbformat_minor": 2 853 | } 854 | -------------------------------------------------------------------------------- /notebooks/40-bjorn-analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from src import workflow\n", 10 | "from src.paths import summary_path" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "# I'm Only Here for the Pretty Pictures\n", 18 | "\n", 19 | "## Step 5: Analyze and Summarize\n", 20 | "\n", 21 | "An **Analysis** takes Datasets (the predictions or transformed datasets obtained from the last step), and produces data (e.g. Pandas DataFrame, CSV). This is how we would generate tables in a paper, for example, or data to be consumed by a graphing function.\n", 22 | "\n", 23 | "\"\"\n", 24 | "In Bjørn's case, he has a target vector associated with the original dataset, so he can compare his prediction performance against that target. We will use a **scoring function** to compare these vectors."
25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "Like in sklearn, A **scoring function** is a function with the signature:\n", 32 | " `score_func(y, y_pred, **kwargs)`\n", 33 | " " 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "workflow.available_scorers()" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "## TODO: Add all of this to the standard workflow" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "workflow.available_analyses()" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "help(workflow.available_analyses)" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "workflow.available_predictions()" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "workflow.add_analysis(analysis_name='score_predictions')\n", 86 | "workflow.get_analysis_list()" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "workflow.make_analysis()" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "!cd .. && make analysis" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "!cat ../reports/analyses.json" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": {}, 119 | "source": [ 120 | "### EXERCISE: Implement `available_results()`" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "metadata": {}, 147 | "source": [ 148 | "## Look at the Summary Statistics" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | "import pandas as pd" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": null, 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [ 166 | "df = pd.read_csv(summary_path / 'score_predictions.csv')\n", 167 | "df" 168 | ] 169 | }, 170 | { 171 | "cell_type": "markdown", 172 | "metadata": {}, 173 | "source": [ 174 | "# Add other algorithms\n", 175 | "\n", 176 | "Comparing one algorithm isn't that interesting. Let's add a couple more so Bjørn can compare them." 
177 | ] 178 | }, 179 | { 180 | "cell_type": "markdown", 181 | "metadata": {}, 182 | "source": [ 183 | "### Exercise: Add a GradientBoostingClassifier \n", 184 | "Modify `src/models/algorithms.py` so that the next cell works.\n", 185 | "\n", 186 | "Hint: ```sklearn.ensemble.GradientBoostingClassifier```" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [ 195 | "workflow.add_model(\n", 196 | " dataset_name = 'lvq-pak_train',\n", 197 | " algorithm_name = 'GradientBoostingClassifier',\n", 198 | " algorithm_params = {'random_state': 42} \n", 199 | ")" 200 | ] 201 | }, 202 | { 203 | "cell_type": "markdown", 204 | "metadata": {}, 205 | "source": [ 206 | "### Exercise: Add your choice of classifier here" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [ 215 | "workflow.add_model(\n", 216 | " dataset_name = 'lvq-pak_train',\n", 217 | " ## add your algorithm here!\n", 218 | ")" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": null, 224 | "metadata": {}, 225 | "outputs": [], 226 | "source": [ 227 | "### Take a look to see what's there\n", 228 | "workflow.get_model_list()" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [ 237 | "workflow.available_algorithms(keys_only=False)" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": null, 243 | "metadata": {}, 244 | "outputs": [], 245 | "source": [ 246 | "workflow.make_train()" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": null, 252 | "metadata": {}, 253 | "outputs": [], 254 | "source": [ 255 | "workflow.available_models()" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": null, 261 | "metadata": {}, 262 | "outputs": [], 263 | "source": [ 264 | "workflow.get_prediction_list()" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": null, 270 | "metadata": {}, 271 | "outputs": [], 272 | "source": [ 273 | "## Set up predictions using all of the available models\n", 274 | "for model in workflow.available_models():\n", 275 | " workflow.add_prediction(\n", 276 | " dataset_name = 'lvq-pak_test',\n", 277 | " model_name = model,\n", 278 | " is_supervised = True,\n", 279 | " )" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": null, 285 | "metadata": {}, 286 | "outputs": [], 287 | "source": [ 288 | "workflow.get_prediction_list()" 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": null, 294 | "metadata": {}, 295 | "outputs": [], 296 | "source": [ 297 | "workflow.make_predict()" 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": null, 303 | "metadata": {}, 304 | "outputs": [], 305 | "source": [ 306 | "workflow.available_predictions()" 307 | ] 308 | }, 309 | { 310 | "cell_type": "markdown", 311 | "metadata": {}, 312 | "source": [ 313 | "By default, the summary df is generated from all available predictions, so we don't need to add anything to our existing script to pick up the new scores. " 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": null, 319 | "metadata": {}, 320 | "outputs": [], 321 | "source": [ 322 | "!cd ..
&& make analysis" 323 | ] 324 | }, 325 | { 326 | "cell_type": "markdown", 327 | "metadata": {}, 328 | "source": [ 329 | "## Look at summary statistics" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": null, 335 | "metadata": {}, 336 | "outputs": [], 337 | "source": [ 338 | "df = pd.read_csv(summary_path / 'score_predictions.csv')\n", 339 | "df" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": null, 345 | "metadata": {}, 346 | "outputs": [], 347 | "source": [] 348 | }, 349 | { 350 | "cell_type": "markdown", 351 | "metadata": {}, 352 | "source": [ 353 | "## Make Publish\n", 354 | "\"\"" 355 | ] 356 | }, 357 | { 358 | "cell_type": "markdown", 359 | "metadata": {}, 360 | "source": [ 361 | "### Publishing Data:\n", 362 | "A DOI is a commonly used **digital object identifier**, and can be used to publish a dataset.\n", 363 | "\n", 364 | "There are a few easy ways to get a DOI for your work:\n", 365 | "* [Figshare](http://figshare.com/) will provide a DOI for virtually any digital work, which includes data.\n", 366 | "* [Zenodo](https://zenodo.org/) provides DOIs for research output, which may include datasets." 367 | ] 368 | }, 369 | { 370 | "cell_type": "markdown", 371 | "metadata": {}, 372 | "source": [ 373 | "We aren't going to take you through the publishing process here." 374 | ] 375 | }, 376 | { 377 | "cell_type": "code", 378 | "execution_count": null, 379 | "metadata": {}, 380 | "outputs": [], 381 | "source": [] 382 | } 383 | ], 384 | "metadata": { 385 | "kernelspec": { 386 | "display_name": "Python [conda env:bus_number]", 387 | "language": "python", 388 | "name": "conda-env-bus_number-py" 389 | }, 390 | "language_info": { 391 | "codemirror_mode": { 392 | "name": "ipython", 393 | "version": 3 394 | }, 395 | "file_extension": ".py", 396 | "mimetype": "text/x-python", 397 | "name": "python", 398 | "nbconvert_exporter": "python", 399 | "pygments_lexer": "ipython3", 400 | "version": "3.6.6" 401 | } 402 | }, 403 | "nbformat": 4, 404 | "nbformat_minor": 2 405 | } 406 | -------------------------------------------------------------------------------- /notebooks/50-mark-add-fmnist.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Mark's Problem: Unsupervised Learning\n", 8 | "\n", 9 | "Mark regularly gets handed files full of fashion images, labelled by category. He wants to know how he can use this to help keep up with the latest trends for the magazine.\n", 10 | "\n", 11 | "For now, he's interested in producing a visualization of the various categories so that he can learn more about them. He's hoping these explorations will eventually help him speed up the process of sorting through what he gets sent to review every week. \n", 12 | "\n", 13 | "But first, he has to put this data in a usable format."
14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "from src.data import RawDataset, Dataset\n", 23 | "from src.utils import list_dir\n", 24 | "from src.paths import raw_data_path" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "When you are developing in a module, it's really handy to have these lines:" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "%load_ext autoreload\n", 41 | "%autoreload 2" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "We want to see debug-level logging in the notebook. Here's the incantation:" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "import logging\n", 58 | "logging.basicConfig(level=logging.DEBUG)\n", 59 | "logger = logging.getLogger()\n", 60 | "logger.setLevel(logging.INFO)" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "# More Datasets! Practice Makes Perfect. \n", 68 | "Actually, practice just makes permanent. **Perfect practice** makes perfect, but we digress." 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "## Adding and processing the Fashion-MNIST (FMNIST) Dataset\n", 76 | "\n", 77 | "\n", 78 | "Recall that our approach to building a usable dataset is:\n", 79 | "\n", 80 | "1. Assemble the raw data files. Generate (and record) hashes to ensure the validity of these files.\n", 81 | "2. Add LICENSE and DESCR (description) metadata to make the raw data usable for other people, and\n", 82 | "3. Write a function to process the raw data into a usable format (for us, a `Dataset` object)\n", 83 | "4. Write transformation functions on `Dataset` objects that fit our data munging into an automated reproducible workflow. \n", 84 | "\n", 85 | "In practice, that means:\n", 86 | "\n", 87 | "* Create a `RawDataset`\n", 88 | " * `add_url()`: give instructions for how to `fetch` your data and add a `DESCR` and `LICENSE`\n", 89 | " * `add_process()`: add a function that knows how to process your specific dataset\n", 90 | "* `workflow.add_raw_dataset()`: add the `RawDataset` to your `workflow`\n", 91 | "* Transform your `Dataset`\n", 92 | " * (Optionally add a `transformer` function to the `workflow`)\n", 93 | " * `workflow.add_transformer()`: further transform your data. \n", 94 | "* Run `make data`" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": {}, 100 | "source": [ 101 | "Looking at the FMNIST GitHub documentation, we see that the raw data is distributed as a set of 4 files.
\n", 102 | "\n", 103 | "| Name | Content | Examples | Size | Link | MD5 Checksum|\n", 104 | "| --- | --- |--- | --- |--- |--- |\n", 105 | "| `train-images-idx3-ubyte.gz` | training set images | 60,000|26 MBytes | [Download](http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz)|`8d4fb7e6c68d591d4c3dfef9ec88bf0d`|\n", 106 | "| `train-labels-idx1-ubyte.gz` | training set labels |60,000|29 KBytes | [Download](http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz)|`25c81989df183df01b3e8a0aad5dffbe`|\n", 107 | "| `t10k-images-idx3-ubyte.gz` | test set images | 10,000|4.3 MBytes | [Download](http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz)|`bef4ecab320f06d8554ea6380940ec79`|\n", 108 | "| `t10k-labels-idx1-ubyte.gz` | test set labels | 10,000| 5.1 KBytes | [Download](http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz)|`bb300cfdad3c16e7a12a480ee83cd310`|\n" 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "metadata": {}, 114 | "source": [ 115 | "Let's give our dataset a name." 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "dataset_name=\"f-mnist\"" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": {}, 130 | "source": [ 131 | "### Download and Check Hashes\n", 132 | "Because Zalando are excellent data citizens, they have conveniently given us MD5 hashes that we can verify when we download this data." 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "# Set the log level to DEBUG so we can see what's going on\n", 142 | "logger.setLevel(logging.DEBUG)" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "# Specify the raw files and their hashes\n", 152 | "data_site = 'http://fashion-mnist.s3-website.eu-central-1.amazonaws.com'\n", 153 | "file_list = [\n", 154 | " ('train-images-idx3-ubyte.gz','8d4fb7e6c68d591d4c3dfef9ec88bf0d'),\n", 155 | " ('train-labels-idx1-ubyte.gz','25c81989df183df01b3e8a0aad5dffbe'),\n", 156 | " ('t10k-images-idx3-ubyte.gz', 'bef4ecab320f06d8554ea6380940ec79'),\n", 157 | " ('t10k-labels-idx1-ubyte.gz', 'bb300cfdad3c16e7a12a480ee83cd310'),\n", 158 | "]" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [ 167 | "fmnist = RawDataset(dataset_name)\n", 168 | "for file, hashval in file_list:\n", 169 | " url = f\"{data_site}/{file}\"\n", 170 | " fmnist.add_url(url=url, hash_type='md5', hash_value=hashval)\n", 171 | "# Download and check the hashes\n", 172 | "fmnist.fetch()" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "list_dir(raw_data_path)" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": {}, 187 | "source": [ 188 | "### Don't forget the License and Description" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "metadata": {}, 195 | "outputs": [], 196 | "source": [ 197 | "# Easy case. 
Zalando are good data citizens, so their data License is directly available from\n", 198 | "# their Raw Data Repo on github\n", 199 | "\n", 200 | "# Notice we tag this data with the name `LICENSE`\n", 201 | "fmnist.add_url(url='https://raw.githubusercontent.com/zalandoresearch/fashion-mnist/master/LICENSE',\n", 202 | " name='LICENSE', file_name=f'{dataset_name}.license')\n" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": null, 208 | "metadata": {}, 209 | "outputs": [], 210 | "source": [ 211 | "# What does the raw data look like?\n", 212 | "# Where did I get it from? \n", 213 | "# What format is it in?\n", 214 | "# What should it look like when it's processed?\n", 215 | "fmnist_readme = '''\n", 216 | "Fashion-MNIST\n", 217 | "=============\n", 218 | "\n", 219 | "Notes\n", 220 | "-----\n", 221 | "Data Set Characteristics:\n", 222 | " :Number of Instances: 70000\n", 223 | " :Number of Attributes: 728\n", 224 | " :Attribute Information: 28x28 8-bit greyscale image\n", 225 | " :Missing Attribute Values: None\n", 226 | " :Creator: Zalando\n", 227 | " :Date: 2017\n", 228 | "\n", 229 | "This is a copy of Zalando's Fashion-MNIST [F-MNIST] dataset:\n", 230 | "https://github.com/zalandoresearch/fashion-mnist\n", 231 | "\n", 232 | "Fashion-MNIST is a dataset of Zalando's article images—consisting of a\n", 233 | "training set of 60,000 examples and a test set of 10,000\n", 234 | "examples. Each example is a 28x28 grayscale image, associated with a\n", 235 | "label from 10 classes. Fashion-MNIST is intended to serve as a direct\n", 236 | "drop-in replacement for the original [MNIST] dataset for benchmarking\n", 237 | "machine learning algorithms. It shares the same image size and\n", 238 | "structure of training and testing splits.\n", 239 | "\n", 240 | "References\n", 241 | "----------\n", 242 | " - [F-MNIST] Fashion-MNIST: a Novel Image Dataset for Benchmarking Machine Learning Algorithms.\n", 243 | " Han Xiao, Kashif Rasul, Roland Vollgraf. arXiv:1708.07747\n", 244 | " - [MNIST] The MNIST Database of handwritten digits. Yann LeCun, Corinna Cortes,\n", 245 | " Christopher J.C. Burges. http://yann.lecun.com/exdb/mnist/\n", 246 | "'''\n", 247 | "\n", 248 | "fmnist.add_metadata(kind=\"DESCR\", contents=fmnist_readme)" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": null, 254 | "metadata": {}, 255 | "outputs": [], 256 | "source": [ 257 | "fmnist.fetch()" 258 | ] 259 | }, 260 | { 261 | "cell_type": "markdown", 262 | "metadata": {}, 263 | "source": [ 264 | "Recall, most unpacking can be handled automagically. Just run it." 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": null, 270 | "metadata": {}, 271 | "outputs": [], 272 | "source": [ 273 | "fmnist.unpack()" 274 | ] 275 | }, 276 | { 277 | "cell_type": "markdown", 278 | "metadata": {}, 279 | "source": [ 280 | "## Converting a `RawDataset` into a usable `Dataset`\n", 281 | "\n", 282 | "Recall that we need to write a processing function and add it to our `RawDataset`." 283 | ] 284 | }, 285 | { 286 | "cell_type": "markdown", 287 | "metadata": {}, 288 | "source": [ 289 | "### Processing the raw data\n", 290 | "Finally, we need to convert the raw data into usable `data` and `target` vectors.\n", 291 | "The code at https://github.com/zalandoresearch/fashion-mnist/blob/master/utils/mnist_reader.py tells us how to do that. Having a look at the sample code, we notice that we need numpy. 
How do we add this to the environment?\n", 292 | "* Add it to `environment.yml`\n", 293 | "* `make requirements`\n", 294 | "\n", 295 | "Once we have done this, we can do the following processing and setup:" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": null, 301 | "metadata": {}, 302 | "outputs": [], 303 | "source": [ 304 | "import numpy as np\n", 305 | "\n", 306 | "unpack_path = fmnist.unpack()\n", 307 | "kind = \"train\"\n", 308 | "\n", 309 | "label_path = unpack_path / f\"{kind}-labels-idx1-ubyte\"\n", 310 | "with open(label_path, 'rb') as fd:\n", 311 | " target = np.frombuffer(fd.read(), dtype=np.uint8, offset=8)\n", 312 | "dataset_path = unpack_path / f\"{kind}-images-idx3-ubyte\"\n", 313 | "with open(dataset_path, 'rb') as fd:\n", 314 | " data = np.frombuffer(fd.read(), dtype=np.uint8, offset=16).reshape(len(target), 784)\n", 315 | "\n", 316 | "print(f'Data: {data.shape}, Target: {target.shape}')" 317 | ] 318 | }, 319 | { 320 | "cell_type": "markdown", 321 | "metadata": {}, 322 | "source": [ 323 | "### Building a `Dataset`\n", 324 | "\n", 325 | "Time to build a processing function. Recall that a processing function produces a dictionary of kwargs that can be used as a `Dataset` constructor:\n", 326 | " " 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": null, 332 | "metadata": {}, 333 | "outputs": [], 334 | "source": [ 335 | "from src.data import Dataset\n", 336 | "help(Dataset.__init__)" 337 | ] 338 | }, 339 | { 340 | "cell_type": "markdown", 341 | "metadata": {}, 342 | "source": [ 343 | "Rewriting the sample code into the framework gives us this:\n", 344 | "### EXERCISE: Add this into the right place" 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": null, 350 | "metadata": {}, 351 | "outputs": [], 352 | "source": [ 353 | "#%%file -a ../src/data/localdata.py\n", 354 | "#__all__ += ['process_mnist']\n", 355 | "\n", 356 | "def process_mnist(dataset_name='mnist', kind='train', metadata=None):\n", 357 | " '''\n", 358 | " Load the MNIST dataset (or a compatible variant; e.g. 
F-MNIST)\n", 359 | "\n", 360 | " dataset_name: {'mnist', 'f-mnist'}\n", 361 | " Which variant to load\n", 362 | " kind: {'train', 'test'}\n", 363 | " Dataset comes pre-split into training and test data.\n", 364 | " Indicates which dataset to load\n", 365 | " metadata: dict\n", 366 | " Additional metadata fields will be added to this dict.\n", 367 | " 'kind': value of `kind` used to generate a subset of the data\n", 368 | " '''\n", 369 | " if metadata is None:\n", 370 | " metadata = {}\n", 371 | " \n", 372 | " if kind == 'test':\n", 373 | " kind = 't10k'\n", 374 | "\n", 375 | " label_path = interim_data_path / dataset_name / f\"{kind}-labels-idx1-ubyte\"\n", 376 | " with open(label_path, 'rb') as fd:\n", 377 | " target = np.frombuffer(fd.read(), dtype=np.uint8, offset=8)\n", 378 | " dataset_path = interim_data_path / dataset_name / f\"{kind}-images-idx3-ubyte\"\n", 379 | " with open(dataset_path, 'rb') as fd:\n", 380 | " data = np.frombuffer(fd.read(), dtype=np.uint8,\n", 381 | " offset=16).reshape(len(target), 784)\n", 382 | " metadata['subset'] = kind\n", 383 | " \n", 384 | " dset_opts = {\n", 385 | " 'dataset_name': dataset_name,\n", 386 | " 'data': data,\n", 387 | " 'target': target,\n", 388 | " 'metadata': metadata,\n", 389 | " }\n", 390 | " return dset_opts\n" 391 | ] 392 | }, 393 | { 394 | "cell_type": "markdown", 395 | "metadata": {}, 396 | "source": [ 397 | "Now add this process function to the built in workflow in order to automate `Dataset` creation." 398 | ] 399 | }, 400 | { 401 | "cell_type": "code", 402 | "execution_count": null, 403 | "metadata": {}, 404 | "outputs": [], 405 | "source": [ 406 | "from functools import partial\n", 407 | "from src.data.localdata import process_mnist" 408 | ] 409 | }, 410 | { 411 | "cell_type": "code", 412 | "execution_count": null, 413 | "metadata": {}, 414 | "outputs": [], 415 | "source": [ 416 | "fmnist.unpack(force=True)\n", 417 | "fmnist.load_function = partial(process_mnist, dataset_name='f-mnist')\n", 418 | "ds = fmnist.process(force=True)" 419 | ] 420 | }, 421 | { 422 | "cell_type": "code", 423 | "execution_count": null, 424 | "metadata": {}, 425 | "outputs": [], 426 | "source": [ 427 | "ds.data.shape, ds.target.shape" 428 | ] 429 | }, 430 | { 431 | "cell_type": "markdown", 432 | "metadata": {}, 433 | "source": [ 434 | "## Add this Dataset to the master dataset list" 435 | ] 436 | }, 437 | { 438 | "cell_type": "code", 439 | "execution_count": null, 440 | "metadata": {}, 441 | "outputs": [], 442 | "source": [ 443 | "from src import workflow" 444 | ] 445 | }, 446 | { 447 | "cell_type": "code", 448 | "execution_count": null, 449 | "metadata": {}, 450 | "outputs": [], 451 | "source": [ 452 | "# Add the Raw Dataset to the master list of Raw Datasets\n", 453 | "workflow.add_raw_dataset(fmnist)\n", 454 | "workflow.available_raw_datasets()" 455 | ] 456 | }, 457 | { 458 | "cell_type": "code", 459 | "execution_count": null, 460 | "metadata": {}, 461 | "outputs": [], 462 | "source": [ 463 | "# Create a pair of Datasets from this Raw Dataset, by specifying different options for the RawDataset creation\n", 464 | "for kind in ['train', 'test']:\n", 465 | " workflow.add_transformer(from_raw=fmnist.name, raw_dataset_opts={'kind':kind}, \n", 466 | " output_dataset=f\"{fmnist.name}_{kind}\")\n", 467 | "\n", 468 | "workflow.get_transformer_list()" 469 | ] 470 | }, 471 | { 472 | "cell_type": "markdown", 473 | "metadata": {}, 474 | "source": [ 475 | "Apply the transforms and save the resulting Datasets. 
This is the same as doing a `make data`\n" 476 | ] 477 | }, 478 | { 479 | "cell_type": "code", 480 | "execution_count": null, 481 | "metadata": {}, 482 | "outputs": [], 483 | "source": [ 484 | "logger.setLevel(logging.INFO)\n", 485 | "workflow.make_data()" 486 | ] 487 | }, 488 | { 489 | "cell_type": "code", 490 | "execution_count": null, 491 | "metadata": {}, 492 | "outputs": [], 493 | "source": [ 494 | "!cd .. && make data" 495 | ] 496 | }, 497 | { 498 | "cell_type": "markdown", 499 | "metadata": {}, 500 | "source": [ 501 | "Now we can load these datasets by name:\n" 502 | ] 503 | }, 504 | { 505 | "cell_type": "code", 506 | "execution_count": null, 507 | "metadata": {}, 508 | "outputs": [], 509 | "source": [ 510 | "ds = Dataset.load(\"f-mnist_test\")\n", 511 | "print(f\"Data:{ds.data.shape}, Target:{ds.target.shape}\")" 512 | ] 513 | }, 514 | { 515 | "cell_type": "code", 516 | "execution_count": null, 517 | "metadata": {}, 518 | "outputs": [], 519 | "source": [ 520 | "ds = Dataset.load(\"f-mnist_train\")\n", 521 | "print(f\"Data:{ds.data.shape}, Target:{ds.target.shape}\")" 522 | ] 523 | }, 524 | { 525 | "cell_type": "markdown", 526 | "metadata": {}, 527 | "source": [ 528 | "### Don't forget: check in your changes using `git`" 529 | ] 530 | }, 531 | { 532 | "cell_type": "markdown", 533 | "metadata": {}, 534 | "source": [ 535 | "* Check the generated `raw_datasets.json` and `transformer_list.json` into source code control\n", 536 | "* Do a `make data`\n", 537 | "* Add tests if you haven't yet\n" 538 | ] 539 | }, 540 | { 541 | "cell_type": "markdown", 542 | "metadata": {}, 543 | "source": [ 544 | "## Summary\n", 545 | "Mark is well on his way to doing data science on his fashion data. In this example, he:\n", 546 | "* Created a `RawDataset` consisting of 4 raw data files\n", 547 | "* Checked the hashes of these files against known (published) values\n", 548 | "* Added license and description metadata\n", 549 | "* Added a processing function to parse the contents of these raw data files into a usable format, and\n", 550 | "* Created \"test\" and \"train\" variants of a `Dataset` object from this `RawDataset`\n" 551 | ] 552 | }, 553 | { 554 | "cell_type": "raw", 555 | "metadata": {}, 556 | "source": [ 557 | "from functools import partial\n", 558 | "from src.data.localdata import process_mnist\n", 559 | "\n", 560 | "# Create a RawDataset from known hashes\n", 561 | "fmnist = RawDataset('f-mnist')\n", 562 | "data_site = 'http://fashion-mnist.s3-website.eu-central-1.amazonaws.com'\n", 563 | "file_list = [\n", 564 | " ('train-images-idx3-ubyte.gz','8d4fb7e6c68d591d4c3dfef9ec88bf0d'),\n", 565 | " ('train-labels-idx1-ubyte.gz','25c81989df183df01b3e8a0aad5dffbe'),\n", 566 | " ('t10k-images-idx3-ubyte.gz', 'bef4ecab320f06d8554ea6380940ec79'),\n", 567 | " ('t10k-labels-idx1-ubyte.gz', 'bb300cfdad3c16e7a12a480ee83cd310'),\n", 568 | "]\n", 569 | "for file, hashval in file_list:\n", 570 | " fmnist.add_url(url=f\"{data_site}/{file}\", hash_type='md5', hash_value=hashval)\n", 571 | "# Add metadata and processing functions\n", 572 | "fmnist.add_url(url='https://raw.githubusercontent.com/zalandoresearch/fashion-mnist/master/LICENSE',\n", 573 | " name='LICENSE', file_name=f'{dataset_name}.license')\n", 574 | "fmnist.add_metadata(kind=\"DESCR\", contents=fmnist_readme)\n", 575 | "fmnist.load_function = partial(process_mnist, dataset_name='f-mnist')\n", 576 | "workflow.add_raw_dataset(fmnist)\n", 577 | "workflow.make_raw()\n", 578 | "\n", 579 | "# Add Datasets (directly from raw)\n", 580 | "for kind in
['train', 'test']:\n", 581 | " workflow.add_transformer(from_raw=fmnist.name, raw_dataset_opts={'kind':kind}, \n", 582 | " output_dataset=f\"{fmnist.name}_{kind}\")\n", 583 | "workflow.make_data()" 584 | ] 585 | }, 586 | { 587 | "cell_type": "code", 588 | "execution_count": null, 589 | "metadata": {}, 590 | "outputs": [], 591 | "source": [ 592 | "workflow.available_datasets()" 593 | ] 594 | } 595 | ], 596 | "metadata": { 597 | "kernelspec": { 598 | "display_name": "Python [default]", 599 | "language": "python", 600 | "name": "python3" 601 | }, 602 | "language_info": { 603 | "codemirror_mode": { 604 | "name": "ipython", 605 | "version": 3 606 | }, 607 | "file_extension": ".py", 608 | "mimetype": "text/x-python", 609 | "name": "python", 610 | "nbconvert_exporter": "python", 611 | "pygments_lexer": "ipython3", 612 | "version": "3.6.6" 613 | } 614 | }, 615 | "nbformat": 4, 616 | "nbformat_minor": 2 617 | } 618 | -------------------------------------------------------------------------------- /notebooks/Notes on the Tutorial.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "* cookiecutter is too slow to do live\n", 8 | "** nbdime, nbval was broken\n", 9 | "\n", 10 | "* Too long talking after setup, before notebook\n", 11 | "\n", 12 | "* More instructions for what doing while talking\n", 13 | "\n", 14 | "* Too much text in notebooks - fine for SOLUTION notebook, but the problem notebooks should be pretty bare\n", 15 | "\n", 16 | "Problem/solution.\n", 17 | "\n", 18 | "* Need stop/talk points in notebook (and shift-enter sections)\n", 19 | "* Explain what they did. (not what they are about to do)\n", 20 | "Clearer instructions on what they should be doing.\n", 21 | "\n", 22 | "\n" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "Exposition: Introduce a problem, the tools to solve it.\n", 30 | "* Give a quick example.\n", 31 | "Give an exercise:\n", 32 | " * Give an explicit problem to be solved\n", 33 | " * Have an outcome in mind\n", 34 | "review our solution to the same problem\n", 35 | "\n", 36 | "Rinse, Lather, Repeat\n", 37 | "\n", 38 | "e.g. here's the site. PRodice a RawDataset with tar URL, license, readme, and do a \"fetch\"\n", 39 | "\n", 40 | "You should end up with 3 files.\n", 41 | "\n", 42 | "\n", 43 | "* Suggestion: Visualize the flow/graph of steps\n" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "* instructions available for installation of \n", 51 | " * git\n", 52 | " * make\n", 53 | " * editor (not familiar with their laptop)\n", 54 | " " 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "## Tutorial 0: Reproducible Environment (30m)\n", 62 | "\n", 63 | "* Code Flow: make and makefiles\n", 64 | "* Templates: cookiecutter\n", 65 | "* Revision Control: git and github\n", 66 | "* Virtualenv: conda / pipenv\n", 67 | "* Testing: doctest, pytest, hypothesis\n", 68 | "\n", 69 | "## Tutorial 1: Reproducible Data (1h)\n", 70 | "\"Raw Data is Read Only. 
Sing it with me\"\n", 71 | "\n", 72 | "\n", 73 | "* RawDataset\n", 74 | " * Fetching + Unpack\n", 75 | " * Example 1: lvq-pak\n", 76 | " * Exercise: fmnist\n", 77 | " * Processing data\n", 78 | " * Process into data, (optionally, target)\n", 79 | " * create a process_my_dataset() function\n", 80 | " * Example 1: lvq-pak\n", 81 | " * Exercise: fmnist\n", 82 | " * save the raw dataset to the raw dataset catalog\n", 83 | "\n", 84 | "* Datasets and Data Transformers\n", 85 | " * Create a transformer to produce a Dataset from the RawDataset\n", 86 | " * Add this dataset to the catalog\n", 87 | " * Load the dataset\n", 88 | " * example: lvq-pak\n", 89 | " * exercise: fmnist_test, fmnist_train\n", 90 | " \n", 91 | " * More Complicated Transformers\n", 92 | " * Example: Train/Test Split on lvq-pak\n", 93 | " * Exercise: merge labels on lvq-pak\n", 94 | " * Exercise: merge labels on fmnist\n", 95 | " \n", 96 | "* Punchline: \n", 97 | " * make clean_raw, clean_cache, clean_processed, (clean_data?) `make data`\n", 98 | "\n", 99 | "\n", 100 | "## Tutorial 2: Reproducible Models (1h)\n", 101 | "\"We're gonna data science the @#&! out of this\"\n", 102 | "\n", 103 | "* Models (Estimators with metadata)\n", 104 | "\n", 105 | "* Experiments (Datasets with metadata)\n", 106 | "\n", 107 | "* Punchline:\n", 108 | " * make clean_models, clean_predictions. `make predict`\n", 109 | " \n", 110 | "## Tutorial 3: Reproducible Results (30m)\n", 111 | "\"I'm only here for the pretty pictures\"\n", 112 | "\n", 113 | "* Punchline\n", 114 | " * make clean_analysis, clean_results, `make results`\n", 115 | "\n", 116 | "\n", 117 | "## The Big Punchline\n", 118 | "```\n", 119 | "make clean\n", 120 | "make results\n", 121 | "```" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [] 135 | } 136 | ], 137 | "metadata": { 138 | "kernelspec": { 139 | "display_name": "Python [conda env:bus_number]", 140 | "language": "python", 141 | "name": "conda-env-bus_number-py" 142 | }, 143 | "language_info": { 144 | "codemirror_mode": { 145 | "name": "ipython", 146 | "version": 3 147 | }, 148 | "file_extension": ".py", 149 | "mimetype": "text/x-python", 150 | "name": "python", 151 | "nbconvert_exporter": "python", 152 | "pygments_lexer": "ipython3", 153 | "version": "3.6.6" 154 | } 155 | }, 156 | "nbformat": 4, 157 | "nbformat_minor": 2 158 | } 159 | -------------------------------------------------------------------------------- /notebooks/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hackalog/bus_number/43d4062b33515270de5d7e6677847c730390b653/notebooks/README.md -------------------------------------------------------------------------------- /notebooks/charts/munge-supervised.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hackalog/bus_number/43d4062b33515270de5d7e6677847c730390b653/notebooks/charts/munge-supervised.png -------------------------------------------------------------------------------- /notebooks/charts/munge-unsupervised.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hackalog/bus_number/43d4062b33515270de5d7e6677847c730390b653/notebooks/charts/munge-unsupervised.png 
-------------------------------------------------------------------------------- /notebooks/references/charts/munge-supervised.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hackalog/bus_number/43d4062b33515270de5d7e6677847c730390b653/notebooks/references/charts/munge-supervised.png -------------------------------------------------------------------------------- /notebooks/references/charts/munge-unsupervised.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hackalog/bus_number/43d4062b33515270de5d7e6677847c730390b653/notebooks/references/charts/munge-unsupervised.png -------------------------------------------------------------------------------- /notebooks/references/cheat_sheet.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hackalog/bus_number/43d4062b33515270de5d7e6677847c730390b653/notebooks/references/cheat_sheet.pdf -------------------------------------------------------------------------------- /notebooks/references/cheat_sheet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hackalog/bus_number/43d4062b33515270de5d7e6677847c730390b653/notebooks/references/cheat_sheet.png -------------------------------------------------------------------------------- /notebooks/references/workflow/make-analyze.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hackalog/bus_number/43d4062b33515270de5d7e6677847c730390b653/notebooks/references/workflow/make-analyze.png -------------------------------------------------------------------------------- /notebooks/references/workflow/make-data.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hackalog/bus_number/43d4062b33515270de5d7e6677847c730390b653/notebooks/references/workflow/make-data.png -------------------------------------------------------------------------------- /notebooks/references/workflow/make-predict.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hackalog/bus_number/43d4062b33515270de5d7e6677847c730390b653/notebooks/references/workflow/make-predict.png -------------------------------------------------------------------------------- /notebooks/references/workflow/make-publish.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hackalog/bus_number/43d4062b33515270de5d7e6677847c730390b653/notebooks/references/workflow/make-publish.png -------------------------------------------------------------------------------- /notebooks/references/workflow/make-raw.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hackalog/bus_number/43d4062b33515270de5d7e6677847c730390b653/notebooks/references/workflow/make-raw.png -------------------------------------------------------------------------------- /notebooks/references/workflow/make-train.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hackalog/bus_number/43d4062b33515270de5d7e6677847c730390b653/notebooks/references/workflow/make-train.png 
-------------------------------------------------------------------------------- /references/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hackalog/bus_number/43d4062b33515270de5d7e6677847c730390b653/references/README.md -------------------------------------------------------------------------------- /references/bus_number_slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hackalog/bus_number/43d4062b33515270de5d7e6677847c730390b653/references/bus_number_slides.pdf -------------------------------------------------------------------------------- /references/cheat_sheet.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hackalog/bus_number/43d4062b33515270de5d7e6677847c730390b653/references/cheat_sheet.pdf -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # local package 2 | -e . 3 | 4 | # external requirements 5 | 6 | 7 | --------------------------------------------------------------------------------