├── bin
    ├── .gitkeep
    ├── create-conda-env.sh
    ├── create-conda-env.sbatch
    ├── launch-code-server.sbatch
    ├── launch-jupyter-server.sbatch
    ├── launch-nvdashboard-server.srun
    ├── train.sbatch
    ├── launch-jupyter-server.srun
    ├── launch-code-server.srun
    ├── launch-train.sh
    ├── launch-checkpoint-and-resubmit.sh
    └── README.md
├── data
    └── .gitkeep
├── doc
    └── .gitkeep
├── src
    ├── .gitkeep
    ├── train.py
    ├── train-argparse.py
    └── train-checkpoint-restart.py
├── docker
    ├── .gitkeep
    ├── entrypoint.sh
    ├── img
    │   └── creating-dockerhub-repo-screenshot.png
    ├── hooks
    │   └── build
    ├── docker-compose.yml
    ├── Dockerfile
    └── README.md
├── notebooks
    ├── .gitkeep
    ├── 01d-mlp-for-regression-with-pytorch.ipynb
    ├── 02a-building-data-pipelines-in-pytorch.ipynb
    ├── 02b-implementing-minibatch-gradient-descent.ipynb
    ├── introduction-to-pytorch-part-2.ipynb
    ├── 02d-building-an-image-classifier-with-pytorch.ipynb
    ├── 01e-mlp-for-classification-with-pytorch.ipynb
    └── 02c-model-evaluation.ipynb
├── results
    └── .gitkeep
├── requirements.txt
├── environment.yml
├── LICENSE
├── .gitignore
└── README.md


/bin/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/data/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/doc/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/src/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/docker/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/notebooks/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/results/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/docker/entrypoint.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash --login
2 | # Entrypoint: activate the project's Conda environment, then run the given command.
3 | set -e
4 | 
5 | # quote the prefix so a home directory containing whitespace is not word-split
6 | conda activate "$HOME/app/env"
7 | # replace this shell with the requested command, passing all arguments through unchanged
8 | exec "$@"
9 | 


--------------------------------------------------------------------------------
/docker/img/creating-dockerhub-repo-screenshot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidrpugh/introduction-to-deep-learning/master/docker/img/creating-dockerhub-repo-screenshot.png


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | kornia
2 | pandas-bokeh
3 | 
4 | # install NVIDIA DALI
5 | --extra-index-url https://developer.download.nvidia.com/compute/redist
6 | nvidia-dali-cuda110
7 | 


--------------------------------------------------------------------------------
/docker/hooks/build:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | set -ex  # -e: stop at the first failing command; -x: echo each command before running it
 3 | # build the image from ./Dockerfile with the parent directory as build context and tag it $DOCKER_REPO:latest (DOCKER_REPO is not set here; presumably supplied by the DockerHub build environment)
 4 | docker image build \
 5 |     --build-arg username=al-khawarizmi \
 6 |     --build-arg uid=1000 \
 7 |     --build-arg gid=100 \
 8 |     --tag "$DOCKER_REPO:latest" \
 9 |     --file Dockerfile \
10 |     ../
11 | 


--------------------------------------------------------------------------------
/bin/create-conda-env.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash --login
 2 | 
 3 | # entire script fails if a single command fails
 4 | set -e
 5 | 
 6 | # create the conda environment under ./env from ./environment.yml; both paths derive
 7 | # from $PWD, so the script must be run from the project directory
 8 | PROJECT_DIR="$PWD"
 9 | ENV_PREFIX="$PROJECT_DIR"/env
10 | # quote the prefix (like every other path here) so a project path with whitespace is not word-split
11 | mamba env create --prefix "$ENV_PREFIX" --file "$PROJECT_DIR"/environment.yml --force
12 | 


--------------------------------------------------------------------------------
/bin/create-conda-env.sbatch:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH --time=2:00:00
 3 | #SBATCH --cpus-per-task=2
 4 | #SBATCH --mem=8G
 5 | #SBATCH --partition=debug
 6 | #SBATCH --job-name=create-conda-env
 7 | #SBATCH --mail-type=ALL
 8 | #SBATCH --output=bin/%x-%j-slurm.out
 9 | #SBATCH --error=bin/%x-%j-slurm.err
10 | 
11 | # create the conda environment; the script path and the log paths above are relative, so submit this job from the project directory
12 | ./bin/create-conda-env.sh
13 | 


--------------------------------------------------------------------------------
/bin/launch-code-server.sbatch:
--------------------------------------------------------------------------------
 1 | #!/bin/bash --login
 2 | #SBATCH --time=2:00:00
 3 | #SBATCH --nodes=1
 4 | #SBATCH --gpus-per-node=v100:1
 5 | #SBATCH --cpus-per-gpu=6  
 6 | #SBATCH --mem=64G
 7 | #SBATCH --partition=debug 
 8 | #SBATCH --job-name=launch-code-server
 9 | #SBATCH --mail-type=ALL
10 | #SBATCH --output=bin/%x-%j-slurm.out
11 | #SBATCH --error=bin/%x-%j-slurm.err
12 | 
13 | # use srun to launch code server in order to reserve a port; the script path and the log paths above are relative, so submit this job from the project directory
14 | srun --resv-ports=1 ./bin/launch-code-server.srun
15 | 


--------------------------------------------------------------------------------
/bin/launch-jupyter-server.sbatch:
--------------------------------------------------------------------------------
 1 | #!/bin/bash --login
 2 | #SBATCH --time=2:00:00
 3 | #SBATCH --nodes=1
 4 | #SBATCH --gpus-per-node=v100:1
 5 | #SBATCH --cpus-per-gpu=6  
 6 | #SBATCH --mem=64G
 7 | #SBATCH --partition=debug 
 8 | #SBATCH --job-name=launch-jupyter-server
 9 | #SBATCH --mail-type=ALL
10 | #SBATCH --output=bin/%x-%j-slurm.out
11 | #SBATCH --error=bin/%x-%j-slurm.err
12 | 
13 | # use srun to launch Jupyter server in order to reserve a port; the script path and the log paths above are relative, so submit this job from the project directory
14 | srun --resv-ports=1 ./bin/launch-jupyter-server.srun
15 | 


--------------------------------------------------------------------------------
/docker/docker-compose.yml:
--------------------------------------------------------------------------------
 1 | version: "2.3"
 2 | 
 3 | services:
 4 |   jupyterlab-server:
 5 |     build:
 6 |       args:  # USER, UID and GID are substituted by Compose; docker/README.md describes supplying them through a .env file
 7 |         - username=${USER}
 8 |         - uid=${UID}
 9 |         - gid=${GID}
10 |       context: ../  # build context is the directory above this file
11 |       dockerfile: docker/Dockerfile
12 |     ports:
13 |       - "8888:8888"  # host port:container port
14 |     runtime: nvidia
15 |     volumes:  # bind-mount the project sub-directories into /home/${USER}/app inside the container
16 |       - ../bin:/home/${USER}/app/bin
17 |       - ../data:/home/${USER}/app/data
18 |       - ../doc:/home/${USER}/app/doc
19 |       - ../notebooks:/home/${USER}/app/notebooks
20 |       - ../results:/home/${USER}/app/results
21 |       - ../src:/home/${USER}/app/src
22 |     init: true
23 |     stdin_open: true
24 |     tty: true    
25 | 


--------------------------------------------------------------------------------
/bin/launch-nvdashboard-server.srun:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Print ssh tunnelling instructions to stderr, then serve NVDashboard on the
 4 | # port taken from SLURM_STEP_RESV_PORTS.
 5 | IBEX_NODE=$(hostname -s)
 6 | NVDASHBOARD_PORT=$SLURM_STEP_RESV_PORTS
 7 | 
 8 | # unquoted here-document: variables are expanded just as they are inside double quotes
 9 | cat >&2 <<EOF
10 | 
11 | To connect to the compute node ${IBEX_NODE} on Ibex running your NVDashboard server, 
12 | you need to create an ssh tunnel from your local machine to login node on Ibex 
13 | using the following command.
14 | 
15 | ssh -L ${NVDASHBOARD_PORT}:${IBEX_NODE}:${NVDASHBOARD_PORT} ${USER}@glogin.ibex.kaust.edu.sa 
16 | 
17 | Next, you need to copy the url provided below and paste it into the browser on your 
18 | local machine.
19 | 
20 | http://127.0.0.1:${NVDASHBOARD_PORT}
21 | 
22 | EOF
23 | 
24 | # Start the nvdashboard server
25 | python -m jupyterlab_nvdashboard.server $NVDASHBOARD_PORT
26 | 


--------------------------------------------------------------------------------
/bin/train.sbatch:
--------------------------------------------------------------------------------
 1 | #!/bin/bash --login
 2 | #SBATCH --time=2:00:00
 3 | #SBATCH --gpus-per-node=v100:1
 4 | #SBATCH --cpus-per-gpu=4
 5 | #SBATCH --mem=64G
 6 | #SBATCH --partition=batch
 7 | #SBATCH --mail-type=ALL
 8 | #SBATCH --output=results/%x/%j-slurm.out
 9 | #SBATCH --error=results/%x/%j-slurm.err
10 | 
11 | # Usage: sbatch [options] bin/train.sbatch ENV_PREFIX TRAINING_SCRIPT [SCRIPT_ARGS...]
12 | #   $1      Conda environment to activate
13 | #   ${@:2}  everything passed on to python (the training script and its arguments)
14 | 
15 | # entire script fails if single command fails
16 | set -e
17 | 
18 | # activate the conda environment
19 | module purge
20 | conda activate "$1"
21 | 
22 | # use srun to launch NVDashboard server in order to reserve a port
23 | srun --resv-ports=1 ./bin/launch-nvdashboard-server.srun &
24 | NVDASHBOARD_PID=$!
25 | 
26 | # launch the training script
27 | python "${@:2}"
28 | 
29 | # shutdown the NVDashboard server; it may already have exited (e.g. it failed to start),
30 | # and under `set -e` a failing kill would mark a successful training job as failed,
31 | # which in turn cancels any job queued with an afterok dependency on this one
32 | kill "$NVDASHBOARD_PID" 2>/dev/null || true
33 | 


--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
 1 | name: pytorch-env
 2 | 
 3 | channels:
 4 |   - pytorch
 5 |   - conda-forge
 6 |   - defaults
 7 | 
 8 | dependencies:
 9 |   - bokeh
10 |   - captum
11 |   - cudatoolkit=11.1
12 |   - datashader
13 |   - gh
14 |   - git
15 |   - holoviews
16 |   - hvplot
17 |   - ipywidgets
18 |   - jupyterlab
19 |   - jupyterlab-git
20 |   - jupyterlab-nvdashboard
21 |   - jupyterlab-lsp
22 |   - matplotlib
23 |   - numba
24 |   - numpy
25 |   - optuna
26 |   - pandas
27 |   - panel
28 |   - pip
29 |   - pip:
30 |     - -r requirements.txt
31 |   - pyarrow
32 |   - python=3.9
33 |   - python-language-server
34 |   - pytorch=1.9
35 |   - pytorch-lightning
36 |   - pyviz_comms
37 |   - scikit-learn
38 |   - scipy
39 |   - tensorboard
40 |   - torchaudio
41 |   - torchmetrics
42 |   - torchtext
43 |   - torchvision
44 |   - wandb
45 |   - xeus-python
46 | 


--------------------------------------------------------------------------------
/bin/launch-jupyter-server.srun:
--------------------------------------------------------------------------------
 1 | #!/bin/bash --login
 2 | 
 3 | # Print ssh tunnelling instructions to stderr, then start a Jupyter server
 4 | # (JupyterLab unless another front end is given as $1) on the port taken
 5 | # from SLURM_STEP_RESV_PORTS.
 6 | 
 7 | # setup the environment
 8 | module purge
 9 | conda activate ./env
10 | 
11 | # setup ssh tunneling
12 | export XDG_RUNTIME_DIR=/tmp
13 | export IBEX_NODE=$(hostname -s)
14 | KAUST_USER=$(whoami)
15 | JUPYTER_PORT=$SLURM_STEP_RESV_PORTS
16 | 
17 | # unquoted here-document: variables are expanded just as they are inside double quotes
18 | cat >&2 <<EOF
19 | 
20 | To connect to the compute node ${IBEX_NODE} on Ibex running your Jupyter server, 
21 | you need to create an ssh tunnel from your local machine to login node on Ibex 
22 | using the following command.
23 | 
24 | ssh -L ${JUPYTER_PORT}:${IBEX_NODE}:${JUPYTER_PORT} ${KAUST_USER}@glogin.ibex.kaust.edu.sa 
25 | 
26 | Next, you need to copy the second url provided below and paste it into the browser 
27 | on your local machine.
28 | 
29 | EOF
30 | 
31 | # launch jupyter server
32 | jupyter ${1:-lab} --no-browser --port=$JUPYTER_PORT --ip=$IBEX_NODE
33 | 


--------------------------------------------------------------------------------
/bin/launch-code-server.srun:
--------------------------------------------------------------------------------
 1 | #!/bin/bash --login
 2 | 
 3 | set -e
 4 | 
 5 | # setup the environment; paths derive from $PWD, so this must be started from the project directory
 6 | PROJECT_DIR="$PWD"
 7 | ENV_PREFIX="$PROJECT_DIR"/env
 8 | 
 9 | module purge
10 | conda activate "$ENV_PREFIX"
11 | 
12 | # setup ssh tunneling: the port is read from SLURM_STEP_RESV_PORTS and the instructions below are printed to stderr
13 | COMPUTE_NODE=$(hostname -s) 
14 | CODE_SERVER_PORT=$SLURM_STEP_RESV_PORTS
15 | 
16 | echo "
17 | To connect to the compute node ${COMPUTE_NODE} on Ibex running your Code Server, 
18 | you need to create an ssh tunnel from your local machine to login node on Ibex 
19 | using the following command.
20 | 
21 | ssh -L ${CODE_SERVER_PORT}:${COMPUTE_NODE}:${CODE_SERVER_PORT} ${USER}@glogin.ibex.kaust.edu.sa 
22 | 
23 | Next, you need to copy the url provided below and paste it into the browser 
24 | on your local machine.
25 | 
26 | localhost:${CODE_SERVER_PORT}
27 | 
28 | " >&2
29 | 
30 | # launch code server on the project directory; NOTE(review): --auth none together with binding to the node's hostname looks like it leaves the editor reachable without a password — confirm this is acceptable on a shared cluster
31 | code-server --auth none --bind-addr ${COMPUTE_NODE}:${CODE_SERVER_PORT} "$PROJECT_DIR"
32 | 
33 | 
34 | 


--------------------------------------------------------------------------------
/bin/launch-train.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # entire script fails if a single command fails
 4 | set -e
 5 | 
 6 | # script should be run from the project directory
 7 | PROJECT_DIR="$PWD"
 8 | 
 9 | # path to the Conda environment
10 | ENV_PREFIX="$PROJECT_DIR"/env
11 | 
12 | # project should have a data directory
13 | DATA_DIR="$PROJECT_DIR"/data
14 | 
15 | # creates a separate directory for each job
16 | JOB_NAME=example-training-job
17 | JOB_RESULTS_DIR="$PROJECT_DIR"/results/"$JOB_NAME"
18 | mkdir -p "$JOB_RESULTS_DIR"
19 | 
20 | # launch the training job
21 | CPUS_PER_GPU=6
22 | sbatch --job-name "$JOB_NAME" --cpus-per-gpu $CPUS_PER_GPU \
23 |     "$PROJECT_DIR"/bin/train.sbatch "$ENV_PREFIX" \
24 |         "$PROJECT_DIR"/src/train-argparse.py \
25 |             --dataloader-num-workers $CPUS_PER_GPU \
26 |             --data-dir "$DATA_DIR" \
27 |             --num-training-epochs 10 \
28 |             --output-dir "$JOB_RESULTS_DIR" \
29 |             --tqdm-disable
30 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | BSD 3-Clause License
 2 | 
 3 | Copyright (c) [year], [fullname]
 4 | All rights reserved.
 5 | 
 6 | Redistribution and use in source and binary forms, with or without
 7 | modification, are permitted provided that the following conditions are met:
 8 | 
 9 | 1. Redistributions of source code must retain the above copyright notice, this
10 |    list of conditions and the following disclaimer.
11 | 
12 | 2. Redistributions in binary form must reproduce the above copyright notice,
13 |    this list of conditions and the following disclaimer in the documentation
14 |    and/or other materials provided with the distribution.
15 | 
16 | 3. Neither the name of the copyright holder nor the names of its
17 |    contributors may be used to endorse or promote products derived from
18 |    this software without specific prior written permission.
19 | 
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | 


--------------------------------------------------------------------------------
/docker/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM ubuntu:18.04
 2 | 
 3 | LABEL maintainer="pughdr <david.pugh@kaust.edu.sa>"
 4 | 
 5 | # run every RUN instruction in a bash login shell so that ~/.profile (and hence conda) is loaded
 6 | SHELL [ "/bin/bash", "--login", "-c" ]
 7 | 
 8 | RUN apt-get update --fix-missing && \
 9 |     apt-get install -y wget bzip2 curl git && \
10 |     apt-get clean && \
11 |     rm -rf /var/lib/apt/lists/*
12 | 
13 | # Create a non-root user
14 | ARG username=al-khawarizmi
15 | ARG uid=1000
16 | ARG gid=100
17 | ENV USER=$username
18 | ENV UID=$uid
19 | ENV GID=$gid
20 | ENV HOME=/home/$USER
21 | 
22 | RUN adduser --disabled-password \
23 |     --gecos "Non-root user" \
24 |     --uid $UID \
25 |     --gid $GID \
26 |     --home $HOME \
27 |     $USER
28 | 
29 | COPY environment.yml requirements.txt /tmp/
30 | RUN chown $UID:$GID /tmp/environment.yml /tmp/requirements.txt
31 | 
32 | # only the entrypoint is copied: the project ships no postBuild script, and COPY aborts
33 | # the whole build when one of its listed sources does not exist in the build context
34 | COPY docker/entrypoint.sh /usr/local/bin/
35 | RUN chown $UID:$GID /usr/local/bin/entrypoint.sh && \
36 |     chmod u+x /usr/local/bin/entrypoint.sh
37 | 
38 | USER $USER
39 | 
40 | # install miniconda
41 | ENV MINICONDA_VERSION=4.8.2
42 | ENV CONDA_DIR=$HOME/miniconda3
43 | RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-$MINICONDA_VERSION-Linux-x86_64.sh -O ~/miniconda.sh && \
44 |     chmod +x ~/miniconda.sh && \
45 |     ~/miniconda.sh -b -p $CONDA_DIR && \
46 |     rm ~/miniconda.sh
47 | 
48 | # make non-activate conda commands available
49 | ENV PATH=$CONDA_DIR/bin:$PATH
50 | 
51 | # make conda activate command available from /bin/bash --login shells
52 | RUN echo ". $CONDA_DIR/etc/profile.d/conda.sh" >> ~/.profile
53 | 
54 | # make conda activate command available from /bin/bash --interactive shells
55 | RUN conda init bash
56 | 
57 | # create a project directory inside user home
58 | ENV PROJECT_DIR=$HOME/app
59 | RUN mkdir $PROJECT_DIR
60 | WORKDIR $PROJECT_DIR
61 | 
62 | # build the conda environment; --yes keeps the conda update non-interactive, and the
63 | # activation step checks that the freshly created environment can actually be activated
64 | ENV ENV_PREFIX=$PROJECT_DIR/env
65 | RUN conda update --name base --channel defaults --yes conda && \
66 |     conda env create --prefix $ENV_PREFIX --file /tmp/environment.yml --force && \
67 |     conda activate $ENV_PREFIX && \
68 |     conda clean --all --yes
69 | 
70 | # use an entrypoint script to ensure conda environment is properly activated at runtime
71 | ENTRYPOINT [ "/usr/local/bin/entrypoint.sh" ]
72 | 
73 | # default command will be to launch JupyterLab server for development
74 | CMD [ "jupyter", "lab", "--no-browser", "--ip", "0.0.0.0" ]
75 | 


--------------------------------------------------------------------------------
/bin/launch-checkpoint-and-resubmit.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # entire script fails if a single command fails
 4 | set -e
 5 | 
 6 | # script should be run from the project directory
 7 | PROJECT_DIR="$PWD"
 8 | 
 9 | # path to Conda environment
10 | ENV_PREFIX="$PROJECT_DIR"/env
11 | 
12 | # data should be read from a data directory
13 | DATA_DIR="$PROJECT_DIR"/data
14 | 
15 | # creates a separate directory for each job
16 | JOB_NAME=example-training-job
17 | JOB_RESULTS_DIR="$PROJECT_DIR"/results/"$JOB_NAME"
18 | mkdir -p "$JOB_RESULTS_DIR"
19 | 
20 | # create a directory to store the checkpoints
21 | CHECKPOINTS_DIR="$JOB_RESULTS_DIR"/checkpoints
22 | mkdir -p "$CHECKPOINTS_DIR"
23 | 
24 | # use a single file to track intermediate checkpoints
25 | CHECKPOINT_FILEPATH="$CHECKPOINTS_DIR"/checkpoint.pt
26 | 
27 | # define number of training periods and training epochs (per period)
28 | NUM_TRAINING_PERIODS=10
29 | NUM_EPOCHS_PER_PERIOD=1
30 | 
31 | # launch the training job for the initial period
32 | CPUS_PER_GPU=4
33 | TRAIN_JOBID=$(
34 |     sbatch --job-name "$JOB_NAME" --cpus-per-gpu $CPUS_PER_GPU --parsable \
35 |         "$PROJECT_DIR"/bin/train.sbatch "$ENV_PREFIX" \
36 |             "$PROJECT_DIR"/src/train-checkpoint-restart.py \
37 |                 --dataloader-num-workers $CPUS_PER_GPU \
38 |                 --data-dir "$DATA_DIR" \
39 |                 --num-training-epochs $NUM_EPOCHS_PER_PERIOD \
40 |                 --tqdm-disable \
41 |                 --write-checkpoint-to "$CHECKPOINT_FILEPATH" \
42 | )
43 | 
44 | # store the most recent checkpoint
45 | cp "$CHECKPOINT_FILEPATH" "$CHECKPOINTS_DIR"/most-recent-checkpoint.pt
46 | 
47 | # queue the training jobs for the remaining periods
48 | for ((PERIOD=1;PERIOD<$NUM_TRAINING_PERIODS;PERIOD++))
49 | do
50 | 
51 |     TRAIN_JOBID=$(
52 |         sbatch --job-name "$JOB_NAME" --cpus-per-gpu $CPUS_PER_GPU --parsable --dependency=afterok:$TRAIN_JOBID --kill-on-invalid-dep=yes \
53 |             "$PROJECT_DIR"/bin/train.sbatch "$ENV_PREFIX" \
54 |                 "$PROJECT_DIR"/src/train-checkpoint-restart.py \
55 |                     --checkpoint-filepath "$CHECKPOINTS_DIR"/most-recent-checkpoint.pt \
56 |                     --dataloader-num-workers $CPUS_PER_GPU \
57 |                     --data-dir "$DATA_DIR" \
58 |                     --num-training-epochs $NUM_EPOCHS_PER_PERIOD \
59 |                     --tqdm-disable \
60 |                     --write-checkpoint-to "$CHECKPOINT_FILEPATH" \
61 |     )
62 | 
63 |     # store the most recent checkpoint
64 |     cp "$CHECKPOINT_FILEPATH" "$CHECKPOINTS_DIR"/most-recent-checkpoint.pt
65 | 
66 | done
67 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | pip-wheel-metadata/
 24 | share/python-wheels/
 25 | *.egg-info/
 26 | .installed.cfg
 27 | *.egg
 28 | MANIFEST
 29 | 
 30 | # PyInstaller
 31 | #  Usually these files are written by a python script from a template
 32 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 33 | *.manifest
 34 | *.spec
 35 | 
 36 | # Installer logs
 37 | pip-log.txt
 38 | pip-delete-this-directory.txt
 39 | 
 40 | # Unit test / coverage reports
 41 | htmlcov/
 42 | .tox/
 43 | .nox/
 44 | .coverage
 45 | .coverage.*
 46 | .cache
 47 | nosetests.xml
 48 | coverage.xml
 49 | *.cover
 50 | .hypothesis/
 51 | .pytest_cache/
 52 | 
 53 | # Translations
 54 | *.mo
 55 | *.pot
 56 | 
 57 | # Django stuff:
 58 | *.log
 59 | local_settings.py
 60 | db.sqlite3
 61 | db.sqlite3-journal
 62 | 
 63 | # Flask stuff:
 64 | instance/
 65 | .webassets-cache
 66 | 
 67 | # Scrapy stuff:
 68 | .scrapy
 69 | 
 70 | # Sphinx documentation
 71 | docs/_build/
 72 | 
 73 | # PyBuilder
 74 | target/
 75 | 
 76 | # Jupyter Notebook
 77 | .ipynb_checkpoints
 78 | 
 79 | # IPython
 80 | profile_default/
 81 | ipython_config.py
 82 | 
 83 | # pyenv
 84 | .python-version
 85 | 
 86 | # pipenv
 87 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 88 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 89 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 90 | #   install all needed dependencies.
 91 | #Pipfile.lock
 92 | 
 93 | # celery beat schedule file
 94 | celerybeat-schedule
 95 | 
 96 | # SageMath parsed files
 97 | *.sage.py
 98 | 
 99 | # Environments
100 | .env
101 | .venv
102 | env/
103 | venv/
104 | ENV/
105 | env.bak/
106 | venv.bak/
107 | 
108 | # Spyder project settings
109 | .spyderproject
110 | .spyproject
111 | 
112 | # Rope project settings
113 | .ropeproject
114 | 
115 | # mkdocs documentation
116 | /site
117 | 
118 | # mypy
119 | .mypy_cache/
120 | .dmypy.json
121 | dmypy.json
122 | 
123 | # Pyre type checker
124 | .pyre/
125 | 
126 | # ignore Slurm .out .err files for certain jobs
127 | bin/create-conda-env-*-slurm.out
128 | bin/create-conda-env-*-slurm.err
129 | bin/launch-code-server-*-slurm.out
130 | bin/launch-code-server-*-slurm.err
131 | bin/launch-jupyter-server-*-slurm.out
132 | bin/launch-jupyter-server-*-slurm.err
133 | 
134 | # ignore DCGM reports
135 | dcgm/
136 | 
137 | # ignore vscode settings
138 | .vscode/


--------------------------------------------------------------------------------
/src/train.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import pathlib
  3 | 
  4 | from sklearn import metrics
  5 | import torch
  6 | from torch import nn, optim, utils
  7 | from torchvision import datasets, models, transforms
  8 | from tqdm import tqdm
  9 | 
 10 | 
 11 | BATCH_SIZE = 256
 12 | DATA_DIR = pathlib.Path("data/")
 13 | DATALOADER_NUM_WORKERS = 6
 14 | DEVICE = torch.device("cuda")
 15 | NUM_CLASSES = 10
 16 | NUM_TRAIN_EPOCHS = 10
 17 | OPTIMIZER_LEARNING_RATE = 1e-3
 18 | OPTIMIZER_MOMENTUM = 0.9
 19 | OUTPUT_DIR = pathlib.Path("results/example-training-job/")
 20 | OUTPUT_FILENAME = OUTPUT_DIR / "model.pt"
 21 | PREFETCH_FACTOR = 2
 22 | RESIZE_SIZE = 224
 23 | SEED = 42
 24 | TQDM_DISABLE = True
 25 | 
 26 | 
 27 | # create the output directory
 28 | if not OUTPUT_DIR.exists():
 29 |     os.mkdir(OUTPUT_DIR)
 30 | 
 31 | # set seed for reproducibility
 32 | torch.manual_seed(SEED)
 33 | 
 34 | # create the train and test datasets
 35 | _transform = transforms.Compose([
 36 |     transforms.Resize(RESIZE_SIZE),
 37 |     transforms.ToTensor(),
 38 | ])
 39 | train_dataset = datasets.CIFAR10(root=DATA_DIR,
 40 |                                  train=True,
 41 |                                  download=True,
 42 |                                  transform=_transform)
 43 | 
 44 | test_dataset = datasets.CIFAR10(root=DATA_DIR,
 45 |                                 train=False,
 46 |                                 download=True,
 47 |                                 transform=_transform)
 48 | 
 49 | # create the train and test dataloaders
 50 | train_dataloader = (utils.data
 51 |                          .DataLoader(train_dataset,
 52 |                                      batch_size=BATCH_SIZE,
 53 |                                      shuffle=True,
 54 |                                      num_workers=DATALOADER_NUM_WORKERS,
 55 |                                      persistent_workers=True,
 56 |                                      pin_memory=True,
 57 |                                      prefetch_factor=PREFETCH_FACTOR))
 58 | test_dataloader = (utils.data
 59 |                         .DataLoader(test_dataset,
 60 |                                     batch_size=BATCH_SIZE,
 61 |                                     shuffle=False,
 62 |                                     num_workers=DATALOADER_NUM_WORKERS,
 63 |                                     persistent_workers=True,
 64 |                                     pin_memory=True,
 65 |                                     prefetch_factor=PREFETCH_FACTOR))
 66 | 
 67 | # define a model_fn, loss function, and an optimizer
 68 | model_fn = models.resnet50(pretrained=False,
 69 |                            num_classes=NUM_CLASSES)
 70 | model_fn.to(DEVICE)
 71 | loss_fn = nn.CrossEntropyLoss()
 72 | optimizer = optim.SGD(model_fn.parameters(),
 73 |                       lr=OPTIMIZER_LEARNING_RATE,
 74 |                       momentum=OPTIMIZER_MOMENTUM)
 75 | 
 76 | # train the model
 77 | print("Training started...")
 78 | for epoch in range(NUM_TRAIN_EPOCHS):
 79 | 
 80 |     with tqdm(train_dataloader, unit="batch", disable=TQDM_DISABLE) as tepoch:
 81 | 
 82 |         for (features, targets) in tepoch:
 83 |             tepoch.set_description(f"Epoch {epoch}")
 84 | 
 85 |             # zero the parameter gradients
 86 |             optimizer.zero_grad()
 87 | 
 88 |             # forward + backward + optimize
 89 |             predictions = model_fn(features.to(DEVICE))
 90 |             loss = loss_fn(predictions, targets.to(DEVICE))
 91 |             loss.backward()
 92 |             optimizer.step()
 93 | 
 94 | print("...training finished!")
 95 | 
 96 | # save the trained model
 97 | torch.save(model_fn.state_dict(), OUTPUT_FILENAME)
 98 | 
 99 | # compute the predications on the test data
100 | batch_targets = []
101 | batch_predicted_targets = []
102 | 
103 | with torch.no_grad():
104 |     for (features, targets) in test_dataloader:
105 |         predicted_probs = model_fn(features.to(DEVICE))
106 |         predicted_targets = predicted_probs.argmax(axis=1)
107 |         batch_targets.append(targets)
108 |         batch_predicted_targets.append(predicted_targets)
109 | 
110 | # generate a classification report
111 | test_target = (torch.cat(batch_targets)
112 |                     .cpu())
113 | test_predicted_targets = (torch.cat(batch_predicted_targets)
114 |                                .cpu())
115 | 
116 | classification_report = metrics.classification_report(
117 |     test_target,
118 |     test_predicted_targets,
119 | )
120 | print(classification_report)
121 | 


--------------------------------------------------------------------------------
/docker/README.md:
--------------------------------------------------------------------------------
  1 | ## Using the `kaustvl/pytorch-gpu-data-science-project` image
  2 | 
  3 | If you are not adding any additional dependencies to your project's `environment.yml` file, then you can run containers for your project based on the `kaustvl/pytorch-gpu-data-science-project` image hosted on DockerHub. Run the following command within your project's root directory to run a container for your project based on this existing Docker image.
  4 | 
  5 | ```bash
  6 | $ docker container run \
  7 |   --rm \
  8 |   --tty \
  9 |   --volume $(pwd)/bin:/home/$USER/app/bin \
 10 |   --volume $(pwd)/data:/home/$USER/app/data \
 11 |   --volume $(pwd)/doc:/home/$USER/app/doc \
 12 |   --volume $(pwd)/notebooks:/home/$USER/app/notebooks \
 13 |   --volume $(pwd)/results:/home/$USER/app/results \
 14 |   --volume $(pwd)/src:/home/$USER/app/src \
 15 |   --runtime nvidia \
 16 |   --publish 8888:8888 \
 17 |   kaustvl/pytorch-gpu-data-science-project:latest
 18 | ```
 19 | 
 20 | ## Building a new image for your project
 21 | 
 22 | If you wish to add (remove) dependencies in your project's `environment.yml` (or if you wish to have a custom user defined inside the image), then you will need to build a new Docker image for your project. The following command builds a new image for your project with a custom `$USER` (with associated `$UID` and `$GID`) as well as a particular `$IMAGE_NAME` and `$IMAGE_TAG`. This command should be run within the `docker` sub-directory of the project.
 23 | 
 24 | ```bash
 25 | $ docker image build \
 26 |   --build-arg username=$USER \
 27 |   --build-arg uid=$UID \
 28 |   --build-arg gid=$GID \
 29 |   --file Dockerfile \
 30 |   --tag $IMAGE_NAME:$IMAGE_TAG \
 31 |   ../
 32 | ```
 33 | 
 34 | ### Automating the build process with DockerHub
 35 | 
 36 | 1. Create a new (or login to your existing) [DockerHub](https://hub.docker.com) account.
 37 | 2. [Link your GitHub account with your DockerHub account](https://docs.docker.com/docker-hub/builds/link-source/) (if you have not already done so).
 38 | 3. Create a new DockerHub repository.
 39 |    1. Under "Build Settings" click the GitHub logo and then select your project's GitHub repository.
 40 |    2. Select "Click here to customize build settings" and specify the location of the Dockerfile for your build as `docker/Dockerfile`.
 41 |    3. Give the DockerHub repository the same name as your project's GitHub repository.
 42 |    4. Give the DockerHub repository a brief description (something like "Automated builds for $PROJECT" or similar).
 43 |    5. Click the "Create and Build" button.
 44 | 4. Edit the `hooks/build` script with your project's `$USER`, `$UID`, and `$GID` build args in place of the corresponding default values.
 45 | 
 46 | Below is a screenshot which should give you an idea of how the form ought to be filled out prior to clicking "Create and Build".
 47 | 
 48 | ![Creating a new DockerHub repository for your project](./img/creating-dockerhub-repo-screenshot.png)
 49 | 
 50 | DockerHub is now configured to re-build your project's image whenever commits are pushed to your project's GitHub repository! Specifically, whenever you push new commits to your project's GitHub repository, GitHub will notify DockerHub and DockerHub will then run the `./hooks/build` script to re-build your project's image. For more details on the whole process see the [official documentation](https://docs.docker.com/docker-hub/builds/advanced/#build-hook-examples) on advanced DockerHub build options.
 51 | 
 52 | ### Running a container
 53 | 
 54 | Once you have built the image, the following command will run a container based on the image `$IMAGE_NAME:$IMAGE_TAG`. This command should be run from within the project's root directory.
 55 | 
 56 | ```bash
 57 | $ docker container run \
 58 |   --rm \
 59 |   --tty \
 60 |   --volume $(pwd)/bin:/home/$USER/app/bin \
 61 |   --volume $(pwd)/data:/home/$USER/app/data \
 62 |   --volume $(pwd)/doc:/home/$USER/app/doc \
 63 |   --volume $(pwd)/notebooks:/home/$USER/app/notebooks \
 64 |   --volume $(pwd)/results:/home/$USER/app/results \
 65 |   --volume $(pwd)/src:/home/$USER/app/src \
 66 |   --runtime nvidia \
 67 |   --publish 8888:8888 \
 68 |   $IMAGE_NAME:$IMAGE_TAG
 69 | ```
 70 | 
 71 | ### Using Docker Compose
 72 | 
 73 | It is quite easy to make a typo whilst writing the above docker commands by hand; a less error-prone approach is to use [Docker Compose](https://docs.docker.com/compose/). The above docker commands have been encapsulated into the `docker-compose.yml` configuration file. You will need to store your project specific values for `$USER`, `$UID`, and `$GID` in a file called `.env` as follows.
 74 | 
 75 | ```
 76 | USER=$USER
 77 | UID=$UID
 78 | GID=$GID
 79 | ```
 80 | 
 81 | For more details on how variable substitution works with Docker Compose, see the [official documentation](https://docs.docker.com/compose/environment-variables/#the-env-file).
 82 | 
 83 | Note that you can test your `docker-compose.yml` file by running the following command in the `docker` sub-directory of the project.
 84 | 
 85 | ```bash
 86 | $ docker-compose config
 87 | ```
 88 | 
 89 | This command takes the `docker-compose.yml` file and substitutes the values provided in the `.env` file and then returns the result.
 90 | 
 91 | Once you are confident that values in the `.env` file are being substituted properly into the `docker-compose.yml` file, the following command can be used to bring up a container based on your project's Docker image and launch the JupyterLab server. This command should also be run from within the `docker` sub-directory of the project.
 92 | 
 93 | ```bash
 94 | $ docker-compose up --build
 95 | ```
 96 | 
 97 | When you are done developing and have shut down the JupyterLab server, the following command tears down the networking infrastructure for the running container.
 98 | 
 99 | ```bash
100 | $ docker-compose down
101 | ```
102 | 


--------------------------------------------------------------------------------
/src/train-argparse.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import os
  3 | import pathlib
  4 | 
  5 | from sklearn import metrics
  6 | import torch
  7 | from torch import nn, optim, utils
  8 | from torchvision import datasets, models, transforms
  9 | from tqdm import tqdm
 10 | 
 11 | 
 12 | parser = argparse.ArgumentParser()
 13 | parser.add_argument("--batch-size",
 14 |                     default=256,
 15 |                     type=int,
 16 |                     help="Number of training samples per batch.")
 17 | parser.add_argument("--data-dir",
 18 |                     required=True,
 19 |                     type=str,
 20 |                     help="Path to directory containing the train, val, test data.")
 21 | parser.add_argument("--dataloader-num-workers",
 22 |                     required=True,
 23 |                     type=int,
 24 |                     help="Number of workers to use for loading data.")
 25 | parser.add_argument("--dataloader-prefetch-factor",
 26 |                     default=2,
 27 |                     type=int,
 28 |                     help="Number of data batches to prefetch per worker.")
 29 | parser.add_argument("--disable-gpu",
 30 |                     action="store_true",
 31 |                     help="Disable GPU(s) for training and inference.")
 32 | parser.add_argument("--num-training-epochs",
 33 |                     default=1,
 34 |                     type=int,
 35 |                     help="Number of training epochs.")
 36 | parser.add_argument("--optimizer-learning-rate",
 37 |                     default=1e-3,
 38 |                     type=float,
 39 |                     help="Learning rate for optimizer.")
 40 | parser.add_argument("--optimizer-momentum",
 41 |                     default=0.9,
 42 |                     type=float,
 43 |                     help="Momentum for optimizer.")
 44 | parser.add_argument("--output-dir",
 45 |                     required=True,
 46 |                     type=str,
 47 |                     help="Path to directory where output should be written.")
 48 | parser.add_argument("--output-filename",
 49 |                     default="model.pt",
 50 |                     type=str,
 51 |                     help="Filename for model checkpoint.")
 52 | parser.add_argument("--seed",
 53 |                     type=int,
 54 |                     help="Seed used for pseudorandom number generation.")
 55 | parser.add_argument("--tqdm-disable",
 56 |                     action="store_true",
 57 |                     help="Disables the training progress bar.")
 58 | args = parser.parse_args()
 59 | 
 60 | 
 61 | # no need to expose these as command line args
 62 | DATA_DIR = pathlib.Path(args.data_dir)
 63 | DEVICE = torch.device("cpu") if args.disable_gpu else torch.device("cuda")
 64 | NUM_CLASSES = 10
 65 | OUTPUT_DIR = pathlib.Path(args.output_dir)
 66 | OUTPUT_FILEPATH = OUTPUT_DIR / args.output_filename
 67 | RESIZE_SIZE = 224
 68 | 
 69 | 
 70 | # create the output directory
 71 | if not OUTPUT_DIR.exists():
 72 |     os.mkdir(OUTPUT_DIR)
 73 | 
 74 | # set seed for reproducibility
 75 | if args.seed is not None:
 76 |     torch.manual_seed(args.seed)
 77 | 
 78 | # create the train and test datasets
 79 | _transform = transforms.Compose([
 80 |     transforms.Resize(RESIZE_SIZE),
 81 |     transforms.ToTensor(),
 82 | ])
 83 | train_dataset = datasets.CIFAR10(root=DATA_DIR,
 84 |                                  train=True,
 85 |                                  download=True,
 86 |                                  transform=_transform)
 87 | 
 88 | test_dataset = datasets.CIFAR10(root=DATA_DIR,
 89 |                                 train=False,
 90 |                                 download=True,
 91 |                                 transform=_transform)
 92 | 
 93 | # create the train and test dataloaders
 94 | train_dataloader = (utils.data
 95 |                          .DataLoader(train_dataset,
 96 |                                      batch_size=args.batch_size,
 97 |                                      shuffle=True,
 98 |                                      num_workers=args.dataloader_num_workers,
 99 |                                      persistent_workers=True,
100 |                                      pin_memory=True,
101 |                                      prefetch_factor=args.dataloader_prefetch_factor))
102 | test_dataloader = (utils.data
103 |                         .DataLoader(test_dataset,
104 |                                     batch_size=args.batch_size,
105 |                                     shuffle=False,
106 |                                     num_workers=args.dataloader_num_workers,
107 |                                     persistent_workers=True,
108 |                                     pin_memory=True,
109 |                                     prefetch_factor=args.dataloader_prefetch_factor))
110 | 
111 | # define a model_fn, loss function, and an optimizer
112 | model_fn = models.resnet50(pretrained=False,
113 |                            num_classes=NUM_CLASSES)
114 | model_fn.to(DEVICE)
115 | loss_fn = nn.CrossEntropyLoss()
116 | optimizer = optim.SGD(model_fn.parameters(),
117 |                       lr=args.optimizer_learning_rate,
118 |                       momentum=args.optimizer_momentum)
119 | 
120 | # train the model
121 | print("Training started...")
122 | for epoch in range(args.num_training_epochs):
123 | 
124 |     with tqdm(train_dataloader, unit="batch", disable=args.tqdm_disable) as tepoch:
125 | 
126 |         for (features, targets) in tepoch:
127 |             tepoch.set_description(f"Epoch {epoch}")
128 | 
129 |             # zero the parameter gradients
130 |             optimizer.zero_grad()
131 | 
132 |             # forward + backward + optimize
133 |             predictions = model_fn(features.to(DEVICE))
134 |             loss = loss_fn(predictions, targets.to(DEVICE))
135 |             loss.backward()
136 |             optimizer.step()
137 | 
138 | print("...training finished!")
139 | 
140 | # save the trained model
141 | torch.save(model_fn.state_dict(), OUTPUT_FILEPATH)
142 | 
143 | # compute the predications on the test data
144 | batch_targets = []
145 | batch_predicted_targets = []
146 | 
147 | with torch.no_grad():
148 |     for (features, targets) in test_dataloader:
149 |         predicted_probs = model_fn(features.to(DEVICE))
150 |         predicted_targets = predicted_probs.argmax(axis=1)
151 |         batch_targets.append(targets)
152 |         batch_predicted_targets.append(predicted_targets)
153 | 
154 | # generate a classification report
155 | test_target = (torch.cat(batch_targets)
156 |                     .cpu())
157 | test_predicted_targets = (torch.cat(batch_predicted_targets)
158 |                                .cpu())
159 | 
160 | classification_report = metrics.classification_report(
161 |     test_target,
162 |     test_predicted_targets,
163 | )
164 | print(classification_report)
165 | 


--------------------------------------------------------------------------------
/src/train-checkpoint-restart.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import pathlib
  3 | 
  4 | from sklearn import metrics
  5 | import torch
  6 | from torch import nn, optim, utils
  7 | from torchvision import datasets, models, transforms
  8 | from tqdm import tqdm
  9 | 
 10 | 
 11 | parser = argparse.ArgumentParser()
 12 | parser.add_argument("--batch-size",
 13 |                     default=256,
 14 |                     type=int,
 15 |                     help="Number of training samples per batch.")
 16 | parser.add_argument("--checkpoint-filepath",
 17 |                     type=str,
 18 |                     help="Path to a file containing the current checkpoint")
 19 | parser.add_argument("--data-dir",
 20 |                     required=True,
 21 |                     type=str,
 22 |                     help="Path to directory containing the train, val, test data.")
 23 | parser.add_argument("--dataloader-num-workers",
 24 |                     required=True,
 25 |                     type=int,
 26 |                     help="Number of workers to use for loading data.")
 27 | parser.add_argument("--dataloader-prefetch-factor",
 28 |                     default=2,
 29 |                     type=int,
 30 |                     help="Number of data batches to prefetch per worker.")
 31 | parser.add_argument("--disable-gpu",
 32 |                     action="store_true",
 33 |                     help="Disable GPU(s) for training and inference.")
 34 | parser.add_argument("--num-training-epochs",
 35 |                     default=1,
 36 |                     type=int,
 37 |                     help="Number of training epochs.")
 38 | parser.add_argument("--optimizer-learning-rate",
 39 |                     default=1e-3,
 40 |                     type=float,
 41 |                     help="Learning rate for optimizer.")
 42 | parser.add_argument("--optimizer-momentum",
 43 |                     default=0.9,
 44 |                     type=float,
 45 |                     help="Momentum for optimizer.")
 46 | parser.add_argument("--seed",
 47 |                     type=int,
 48 |                     help="Seed used for pseudorandom number generation.")
 49 | parser.add_argument("--tqdm-disable",
 50 |                     action="store_true",
 51 |                     help="Disables the training progress bar.")
 52 | parser.add_argument("--write-checkpoint-to",
 53 |                     type=str,
 54 |                     help="Path to the file where checkpoint should be written")
 55 | args = parser.parse_args()
 56 | 
 57 | 
 58 | # no need to expose these as command line args
 59 | DATA_DIR = pathlib.Path(args.data_dir)
 60 | DEVICE = torch.device("cpu") if args.disable_gpu else torch.device("cuda")
 61 | NUM_CLASSES = 10
 62 | RESIZE_SIZE = 224
 63 | 
 64 | 
 65 | # set up checkpointing
 66 | if args.checkpoint_filepath is not None:
 67 |     CHECKPOINT_FILEPATH = pathlib.Path(args.checkpoint_filepath)
 68 | else:
 69 |     CHECKPOINT_FILEPATH = None
 70 | 
 71 | if args.write_checkpoint_to is not None:
 72 |     WRITE_CHECKPOINT_TO = pathlib.Path(args.write_checkpoint_to)
 73 | else:
 74 |     WRITE_CHECKPOINT_TO = None
 75 | 
 76 | # set seed for reproducibility
 77 | if args.seed is not None:
 78 |     torch.manual_seed(args.seed)
 79 | 
 80 | # create the train and test datasets
 81 | _transform = transforms.Compose([
 82 |     transforms.Resize(RESIZE_SIZE),
 83 |     transforms.ToTensor(),
 84 | ])
 85 | train_dataset = datasets.CIFAR10(root=DATA_DIR,
 86 |                                  train=True,
 87 |                                  download=True,
 88 |                                  transform=_transform)
 89 | 
 90 | test_dataset = datasets.CIFAR10(root=DATA_DIR,
 91 |                                 train=False,
 92 |                                 download=True,
 93 |                                 transform=_transform)
 94 | 
 95 | # create the train and test dataloaders
 96 | train_dataloader = (utils.data
 97 |                          .DataLoader(train_dataset,
 98 |                                      batch_size=args.batch_size,
 99 |                                      shuffle=True,
100 |                                      num_workers=args.dataloader_num_workers,
101 |                                      persistent_workers=True,
102 |                                      pin_memory=True,
103 |                                      prefetch_factor=args.dataloader_prefetch_factor))
104 | test_dataloader = (utils.data
105 |                         .DataLoader(test_dataset,
106 |                                     batch_size=args.batch_size,
107 |                                     shuffle=False,
108 |                                     num_workers=args.dataloader_num_workers,
109 |                                     persistent_workers=True,
110 |                                     pin_memory=True,
111 |                                     prefetch_factor=args.dataloader_prefetch_factor))
112 | 
113 | # define a model_fn, loss function, and an optimizer
114 | model_fn = models.resnet50(pretrained=False,
115 |                            num_classes=NUM_CLASSES)
116 | model_fn.to(DEVICE)
117 | loss_fn = nn.CrossEntropyLoss()
118 | optimizer = optim.SGD(model_fn.parameters(),
119 |                       lr=args.optimizer_learning_rate,
120 |                       momentum=args.optimizer_momentum)
121 | 
122 | # load model checkpoint (if available)
123 | if CHECKPOINT_FILEPATH is not None:
124 |     checkpoint_file = torch.load(CHECKPOINT_FILEPATH)
125 |     model_fn.load_state_dict(checkpoint_file["model_state_dict"])
126 |     optimizer.load_state_dict(checkpoint_file["optimizer_state_dict"])
127 | 
128 | # train the model
129 | print("Training started...")
130 | for epoch in range(args.num_training_epochs):
131 | 
132 |     with tqdm(train_dataloader, unit="batch", disable=args.tqdm_disable) as tepoch:
133 | 
134 |         for (features, targets) in tepoch:
135 |             tepoch.set_description(f"Epoch {epoch}")
136 | 
137 |             # zero the parameter gradients
138 |             optimizer.zero_grad()
139 | 
140 |             # forward + backward + optimize
141 |             predictions = model_fn(features.to(DEVICE))
142 |             loss = loss_fn(predictions, targets.to(DEVICE))
143 |             loss.backward()
144 |             optimizer.step()
145 |     
146 |     if WRITE_CHECKPOINT_TO is not None:
147 |         checkpoint = {
148 |             "model_state_dict": model_fn.state_dict(),
149 |             "optimizer_state_dict": optimizer.state_dict()
150 |         }
151 |         torch.save(checkpoint, WRITE_CHECKPOINT_TO)
152 | 
153 | print("...training finished!")
154 | 
155 | # compute the predications on the test data
156 | batch_targets = []
157 | batch_predicted_targets = []
158 | 
159 | with torch.no_grad():
160 |     for (features, targets) in test_dataloader:
161 |         predicted_probs = model_fn(features.to(DEVICE))
162 |         predicted_targets = predicted_probs.argmax(axis=1)
163 |         batch_targets.append(targets)
164 |         batch_predicted_targets.append(predicted_targets)
165 | 
166 | # generate a classification report
167 | test_target = (torch.cat(batch_targets)
168 |                     .cpu())
169 | test_predicted_targets = (torch.cat(batch_predicted_targets)
170 |                                .cpu())
171 | 
172 | classification_report = metrics.classification_report(
173 |     test_target,
174 |     test_predicted_targets,
175 | )
176 | print(classification_report)
177 | 


--------------------------------------------------------------------------------
/bin/README.md:
--------------------------------------------------------------------------------
  1 | ## Creating the Conda environment
  2 | 
  3 | For your convenience the commands to create the Conda environment have been combined in a shell script. The script should be run from the project root directory as follows. 
  4 | 
  5 | ```bash
  6 | ./bin/create-conda-env.sh
  7 | ```
  8 | 
  9 | ## Launching a job via Slurm to create the Conda environment
 10 | 
 11 | While running the shell script above on a login node will create the Conda environment, you may prefer to launch a job via Slurm
 12 | to create the Conda environment. If you lose your connection to the Ibex login node whilst your Conda environment script is running 
 13 | the environment will be left in an inconsistent state and you will need to start over. Depending on the load on the Ibex login nodes, 
 14 | launching a job via Slurm to create your Conda environment can also be faster.
 15 | 
 16 | For your convenience the commands to launch a job via Slurm to create the Conda environment have been combined into a job script. The script should be run from the project root directory as follows. 
 17 | 
 18 | ```bash
 19 | sbatch ./bin/create-conda-env.sbatch
 20 | ```
 21 | 
 22 | ## Launching a Jupyter server for interactive work
 23 | 
 24 | The job script `launch-jupyter-server.sbatch` launches a Jupyter server for interactive prototyping. To launch a JupyterLab server 
 25 | use `sbatch` to submit the job script by running the following command from the project root directory.
 26 | 
 27 | ```bash
 28 | sbatch ./bin/launch-jupyter-server.sbatch
 29 | ```
 30 | 
 31 | If you prefer the classic Jupyter Notebook interface, then you can launch the Jupyter notebook server with the following command in 
 32 | the project root directory.
 33 | 
 34 | ```bash
 35 | sbatch ./bin/launch-jupyter-server.sbatch notebook
 36 | ```
 37 | 
 38 | Once the job has started, you can inspect the `./bin/launch-jupyter-server-$SLURM_JOB_ID-slurm.err` file where you will find 
 39 | instructions on how to access the server running in your local browser.
 40 | 
 41 | ### SSH tunneling between your local machine and Ibex compute node(s)
 42 | To connect to the compute node on Ibex running your Jupyter server, you need to create an SSH tunnel from your local machine 
 43 | to a login node on Ibex using a command similar to the following.
 44 | 
 45 | ```
 46 | ssh -L ${JUPYTER_PORT}:${IBEX_NODE}:${JUPYTER_PORT} ${USER}@glogin.ibex.kaust.edu.sa
 47 | ```
 48 | 
 49 | The exact command for your job can be copied from the `./bin/launch-jupyter-server-$SLURM_JOB_ID-slurm.err` file.
 50 | 
 51 | ### Accessing the Jupyter server from your local machine
 52 | 
 53 | Once you have set up your SSH tunnel, in order to access the Jupyter server from your local machine you need to copy the 
 54 | second URL provided in the Jupyter server logs in the `launch-jupyter-server-$SLURM_JOB_ID-slurm.err` file and paste it into 
 55 | the browser on your local machine. The URL will look similar to the following.
 56 | 
 57 | ```
 58 | http://127.0.0.1:$JUPYTER_PORT/lab?token=$JUPYTER_TOKEN
 59 | ```
 60 | 
 61 | The exact URL for your job, containing both your assigned `$JUPYTER_PORT` and your specific `$JUPYTER_TOKEN`, can
 62 | be copied from the `launch-jupyter-server-$SLURM_JOB_ID-slurm.err` file.
 63 | 
 64 | ## Launching a VS Code server for development work
 65 | 
 66 | The job script `launch-code-server.sbatch` launches a Microsoft Visual Studio (VS) Code server for development work. In order to 
 67 | use VS Code server, you will first need to install the server package in your Ibex home directory following the instructions 
 68 | provided on [GitHub](https://github.com/kaust-rccl/ibex-code-server-install). Once you have installed VS Code server, you can 
 69 | launch a server by running the following command from the project root directory.
 70 | 
 71 | ```bash
 72 | sbatch ./bin/launch-code-server.sbatch
 73 | ```
 74 | 
 75 | Once the job has started, you can inspect the `./bin/launch-code-server-$SLURM_JOB_ID-slurm.err` file where you will find 
 76 | instructions on how to access the server running in your local browser.
 77 | 
 78 | ### SSH tunneling between your local machine and Ibex compute node(s)
 79 | To connect to the compute node on Ibex running your VS Code server, you need to create an SSH tunnel from your local machine 
 80 | to a login node on Ibex using a command similar to the following.
 81 | 
 82 | ```
 83 | ssh -L ${CODE_SERVER_PORT}:${IBEX_NODE}:${CODE_SERVER_PORT} ${USER}@glogin.ibex.kaust.edu.sa
 84 | ```
 85 | 
 86 | The exact command for your job can be copied from the `./bin/launch-code-server-$SLURM_JOB_ID-slurm.err` file.
 87 | 
 88 | ### Accessing the VS Code server from your local machine
 89 | 
 90 | Once you have set up your SSH tunnel, in order to access the VS Code server from your local machine you need to copy the 
 91 | second URL provided in the `launch-code-server-$SLURM_JOB_ID-slurm.err` file and paste it into the browser on your local 
 92 | machine. The URL will look similar to the following.
 93 | 
 94 | ```
 95 | localhost:$CODE_SERVER_PORT
 96 | ```
 97 | 
 98 | The exact URL for your job containing your assigned `$CODE_SERVER_PORT` can be copied from the
 99 | `launch-code-server-$SLURM_JOB_ID-slurm.err` file.
100 | 
101 | ## Launching a training job via Slurm
102 | 
103 | The `src` directory contains an example training script, `train.py`, that trains a classification pipeline on the 
104 | [CIFAR-10](https://www.cs.toronto.edu/~kriz/cifar.html) dataset. You can launch this training script as a batch 
105 | job on Ibex via Slurm using the following command in the project root directory.
106 | 
107 | ```bash
108 | ./bin/launch-train.sh
109 | ```
110 | 
111 | ### The `./bin/launch-train.sh` script
112 | 
113 | Wrapping your job submission inside a `launch-train.sh` script is an Ibex "best-practice" that will help you automate more 
114 | complex machine learning workflows. In particular, this example script demonstrates how to break up a single training job 
115 | into training periods (where each training period consists of one or more training epochs) and launches a sequence of jobs 
116 | for each training period. Breaking up large training jobs into smaller jobs can dramatically improve your job throughput! 
117 | 
118 | ### The `./bin/train.sbatch` script
119 | 
120 | The script `./bin/train.sbatch` is the actual Slurm job script. This script can be broken down into several parts that 
121 | are common to all machine learning jobs on Ibex.
122 | 
123 | #### Request resources from Slurm
124 | 
125 | You will request resources for your job using Slurm headers. It is important to request a balanced allocation of CPUs, GPUs, 
126 | and CPU memory in order to ensure good overall job performance. You should typically request resources that are roughly
127 | proportional to the amount of GPUs you are requesting. Most of our nodes have 8 V100 GPUs, 48 CPU cores, and 748 GB of CPU 
128 | memory. The headers below request 4 Intel CPU cores per NVIDIA V100 GPU, and 64G of CPU memory for 2 hours.
129 | 
130 | ```bash
131 | #!/bin/bash --login
132 | #SBATCH --time=2:00:00
133 | #SBATCH --gpus-per-node=v100:1
134 | #SBATCH --cpus-per-gpu=4  
135 | #SBATCH --mem=64G
136 | #SBATCH --partition=batch 
137 | #SBATCH --mail-type=ALL
138 | #SBATCH --output=results/%x/%j-slurm.out
139 | #SBATCH --error=results/%x/%j-slurm.err
140 | ```
141 | 
142 | #### Activate the Conda environment
143 | 
144 | Activating the Conda environment is done in the usual way; however, it is critical for the job script to run inside a
145 | login shell in order for the `conda activate` command to work as expected (this is why the first line of the job script 
146 | is `#!/bin/bash --login`). It is also a good practice to purge any modules that you might have loaded prior to launching 
147 | the training job. Note that this script expects the first argument to be the path to the Conda environment.
148 |  
149 | ```bash
150 | module purge
151 | conda activate "$1"
152 | ```
153 | 
154 | #### Starting the NVDashboard monitoring server
155 | 
156 | After activating the Conda environment, but before launching your training script, you should start the 
157 | NVDashboard monitoring server to run in the background using `srun` to reserve a free port. The 
158 | `./bin/launch-nvdashboard-server.srun` script launches the monitoring server and logs out the assigned 
159 | port to the `slurm.err` file for the job.
160 | 
161 | ```bash
162 | # use srun to launch NVDashboard server in order to reserve a port
163 | srun --resv-ports=1 ./bin/launch-nvdashboard-server.srun &
164 | NVDASHBOARD_PID=$!
165 | ```
166 | 
167 | #### Launch a Python training script
168 | 
169 | Finally, you launch the training job! Note that we use the special Bash variable `"${@:2}"` to refer to all of the 
170 | command line arguments (other than the first!) passed to the Slurm job script. This allows you to reuse the same Slurm 
171 | job script for other training jobs!
172 | 
173 | ```bash
174 | python "${@:2}"
175 | ```
176 | 
177 | #### Stopping the NVDashboard monitoring server
178 | 
179 | Once the training script has finished running, you should stop the NVDashboard server so that your job exits. If 
180 | you do not stop the server the job will continue to run until it reaches its time limit (which wastes resources).
181 | 
182 | ```
183 | # shutdown the NVDashboard server
184 | kill $NVDASHBOARD_PID
185 | ```
186 | 


--------------------------------------------------------------------------------
/notebooks/01d-mlp-for-regression-with-pytorch.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |   "nbformat": 4,
  3 |   "nbformat_minor": 0,
  4 |   "metadata": {
  5 |     "colab": {
  6 |       "provenance": [],
  7 |       "authorship_tag": "ABX9TyNRe8u8vlaMDbQIEDjUMg2K",
  8 |       "include_colab_link": true
  9 |     },
 10 |     "kernelspec": {
 11 |       "name": "python3",
 12 |       "display_name": "Python 3"
 13 |     },
 14 |     "language_info": {
 15 |       "name": "python"
 16 |     }
 17 |   },
 18 |   "cells": [
 19 |     {
 20 |       "cell_type": "markdown",
 21 |       "metadata": {
 22 |         "id": "view-in-github",
 23 |         "colab_type": "text"
 24 |       },
 25 |       "source": [
 26 |         "<a href=\"https://colab.research.google.com/github/davidrpugh/introduction-to-deep-learning/blob/master/notebooks/01b-mlp-for-regression-with-pytorch.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
 27 |       ]
 28 |     },
 29 |     {
 30 |       "cell_type": "markdown",
 31 |       "source": [
 32 |         "# Multi-Layer Perceptrons (MLPs) for Regression with PyTorch"
 33 |       ],
 34 |       "metadata": {
 35 |         "id": "PE3OMdA_fdjF"
 36 |       }
 37 |     },
 38 |     {
 39 |       "cell_type": "code",
 40 |       "execution_count": null,
 41 |       "metadata": {
 42 |         "id": "_kEVc3hofLkE"
 43 |       },
 44 |       "outputs": [],
 45 |       "source": [
 46 |         "import numpy as np\n",
 47 |         "import torch\n",
 48 |         "from torch import nn, optim\n",
 49 |         "\n",
 50 |         "from sklearn import compose, datasets, metrics, model_selection\n",
 51 |         "from sklearn import pipeline, preprocessing"
 52 |       ]
 53 |     },
 54 |     {
 55 |       "cell_type": "markdown",
 56 |       "source": [
 57 |         "## Loading the data"
 58 |       ],
 59 |       "metadata": {
 60 |         "id": "TEOj1HDnhGJ_"
 61 |       }
 62 |     },
 63 |     {
 64 |       "cell_type": "code",
 65 |       "source": [
 66 |         "housing_dataset = datasets.fetch_california_housing(\n",
 67 |         "    as_frame=True\n",
 68 |         ")"
 69 |       ],
 70 |       "metadata": {
 71 |         "id": "7AEVE9JgfoXm"
 72 |       },
 73 |       "execution_count": null,
 74 |       "outputs": []
 75 |     },
 76 |     {
 77 |       "cell_type": "code",
 78 |       "source": [
 79 |         "print(housing_dataset[\"DESCR\"])"
 80 |       ],
 81 |       "metadata": {
 82 |         "id": "S01_yy5vf0fr"
 83 |       },
 84 |       "execution_count": null,
 85 |       "outputs": []
 86 |     },
 87 |     {
 88 |       "cell_type": "code",
 89 |       "source": [
 90 |         "housing_features_df = housing_dataset[\"data\"]\n",
 91 |         "median_house_value = housing_dataset[\"target\"]"
 92 |       ],
 93 |       "metadata": {
 94 |         "id": "I3eZZQu4gFRL"
 95 |       },
 96 |       "execution_count": null,
 97 |       "outputs": []
 98 |     },
 99 |     {
100 |       "cell_type": "code",
101 |       "source": [
102 |         "housing_features_df.info()"
103 |       ],
104 |       "metadata": {
105 |         "id": "TQVJir3zgcCi"
106 |       },
107 |       "execution_count": null,
108 |       "outputs": []
109 |     },
110 |     {
111 |       "cell_type": "code",
112 |       "source": [
113 |         "_ = median_house_value.hist()"
114 |       ],
115 |       "metadata": {
116 |         "id": "BmIh3Xx_Abdc"
117 |       },
118 |       "execution_count": null,
119 |       "outputs": []
120 |     },
121 |     {
122 |       "cell_type": "markdown",
123 |       "source": [
124 |         "## Preparing the data"
125 |       ],
126 |       "metadata": {
127 |         "id": "U6XQukjIhrXE"
128 |       }
129 |     },
130 |     {
131 |       "cell_type": "markdown",
132 |       "source": [
133 |         "### Train/Val Split"
134 |       ],
135 |       "metadata": {
136 |         "id": "5Ur2SAGuhKI9"
137 |       }
138 |     },
139 |     {
140 |       "cell_type": "code",
141 |       "source": [
142 |         "RANDOM_STATE = np.random.RandomState(42)\n",
143 |         "\n",
144 |         "\n",
145 |         "train_features_df, val_features_df, train_target, val_target = (\n",
146 |         "    model_selection.train_test_split(\n",
147 |         "        housing_features_df,\n",
148 |         "        median_house_value,\n",
149 |         "        test_size=0.20,\n",
150 |         "        shuffle=True,\n",
151 |         "        random_state=RANDOM_STATE\n",
152 |         "    )\n",
153 |         ")\n"
154 |       ],
155 |       "metadata": {
156 |         "id": "jeoEgiaggdqY"
157 |       },
158 |       "execution_count": null,
159 |       "outputs": []
160 |     },
161 |     {
162 |       "cell_type": "code",
163 |       "source": [
164 |         "train_features_df.info()"
165 |       ],
166 |       "metadata": {
167 |         "id": "606tY3PghXDD"
168 |       },
169 |       "execution_count": null,
170 |       "outputs": []
171 |     },
172 |     {
173 |       "cell_type": "code",
174 |       "source": [
175 |         "val_features_df.info()"
176 |       ],
177 |       "metadata": {
178 |         "id": "ZkmMaJF5hY_d"
179 |       },
180 |       "execution_count": null,
181 |       "outputs": []
182 |     },
183 |     {
184 |       "cell_type": "markdown",
185 |       "source": [
186 |         "### Features and target preparation"
187 |       ],
188 |       "metadata": {
189 |         "id": "U59gZ-kRhFOP"
190 |       }
191 |     },
192 |     {
193 |       "cell_type": "code",
194 |       "source": [
195 |         "def array_to_tensor(arr, dtype=torch.float32):\n",
196 |         "  return torch.tensor(arr, dtype=dtype)\n",
197 |         "\n",
198 |         "\n",
199 |         "def dataframe_to_tensor(df, dtype=torch.float32):\n",
200 |         "    arr = df.to_numpy()\n",
201 |         "    return array_to_tensor(arr, dtype)\n",
202 |         "\n",
203 |         "\n",
204 |         "def series_to_tensor(s, dtype=torch.float32):\n",
205 |         "    df = s.to_frame()\n",
206 |         "    return dataframe_to_tensor(df, dtype)\n",
207 |         "\n",
208 |         "\n",
209 |         "prepare_housing_features = pipeline.make_pipeline(\n",
210 |         "    preprocessing.QuantileTransformer(),\n",
211 |         "    preprocessing.FunctionTransformer(\n",
212 |         "        func=array_to_tensor\n",
213 |         "    )\n",
214 |         ")\n",
215 |         "\n",
216 |         "prepare_housing_target = pipeline.make_pipeline(\n",
217 |         "    preprocessing.FunctionTransformer(\n",
218 |         "        func=series_to_tensor\n",
219 |         "    )\n",
220 |         ")\n",
221 |         "\n"
222 |       ],
223 |       "metadata": {
224 |         "id": "eO5Bp1JBhcee"
225 |       },
226 |       "execution_count": null,
227 |       "outputs": []
228 |     },
229 |     {
230 |       "cell_type": "code",
231 |       "source": [
232 |         "X_train = prepare_housing_features.fit_transform(train_features_df)\n",
233 |         "X_val = prepare_housing_features.transform(val_features_df)\n"
234 |       ],
235 |       "metadata": {
236 |         "id": "hDAQixqqjLj-"
237 |       },
238 |       "execution_count": null,
239 |       "outputs": []
240 |     },
241 |     {
242 |       "cell_type": "code",
243 |       "source": [
244 |         "print(X_train.shape)\n",
245 |         "print(X_val.shape)"
246 |       ],
247 |       "metadata": {
248 |         "id": "NLhcG9D2I3Z4"
249 |       },
250 |       "execution_count": null,
251 |       "outputs": []
252 |     },
253 |     {
254 |       "cell_type": "code",
255 |       "source": [
256 |         "y_train = prepare_housing_target.fit_transform(train_target)\n",
257 |         "y_val = prepare_housing_target.transform(val_target)\n"
258 |       ],
259 |       "metadata": {
260 |         "id": "c4hASNLXjaJk"
261 |       },
262 |       "execution_count": null,
263 |       "outputs": []
264 |     },
265 |     {
266 |       "cell_type": "code",
267 |       "source": [
268 |         "print(y_train.shape)\n",
269 |         "print(y_val.shape)"
270 |       ],
271 |       "metadata": {
272 |         "id": "QT4RhQDTI5jG"
273 |       },
274 |       "execution_count": null,
275 |       "outputs": []
276 |     },
277 |     {
278 |       "cell_type": "markdown",
279 |       "source": [
280 |         "## Implementing an MLP for Regression using nn.Sequential"
281 |       ],
282 |       "metadata": {
283 |         "id": "pAsTcEm-qREN"
284 |       }
285 |     },
286 |     {
287 |       "cell_type": "markdown",
288 |       "source": [
289 |         "[`nn.Sequential`](https://docs.pytorch.org/docs/stable/generated/torch.nn.Sequential.html) in PyTorch is a container module that allows for the sequential execution of a series of neural network layers or modules. It simplifies the process of building neural networks with a linear, feed-forward structure by eliminating the need to explicitly define the forward method for each layer.\n"
290 |       ],
291 |       "metadata": {
292 |         "id": "p8BrAySrrO48"
293 |       }
294 |     },
295 |     {
296 |       "cell_type": "markdown",
297 |       "source": [
298 |         "### Key Characteristics and Use-Cases\n",
299 |         "\n",
300 |         "* **Ordered Container:** `nn.Sequential` takes a list of `nn.Module` instances (layers) and arranges them in the order they are provided.\n",
301 |         "* **Automatic Forward Pass:** When an input tensor is passed to an `nn.Sequential` object, it automatically propagates through each contained module in the defined order, with the output of one module serving as the input to the next.\n",
302 |         "* **Simplified Model Definition:** It offers a concise way to define models, especially for straightforward architectures without complex branching or custom logic within the forward pass.\n",
303 |         "* **Treat as a Single Module:** The entire `nn.Sequential` container can be treated as a single `nn.Module`, allowing for easy integration into larger models or for applying operations like moving to a device (`.to(device)`) or setting training/evaluation mode (`.train()`, `.eval()`)."
304 |       ],
305 |       "metadata": {
306 |         "id": "J6s0od-NhdN_"
307 |       }
308 |     },
309 |     {
310 |       "cell_type": "code",
311 |       "source": [
312 |         "_ = torch.manual_seed(42)\n",
313 |         "\n",
314 |         "_, n_features = X_train.shape\n",
315 |         "housing_model = nn.Sequential(\n",
316 |         "    nn.Linear(\n",
317 |         "        in_features=n_features,\n",
318 |         "        out_features=50,\n",
319 |         "        bias=True,\n",
320 |         "    ),\n",
321 |         "    nn.ReLU(),\n",
322 |         "    nn.Linear(50, 40),\n",
323 |         "    nn.ReLU(),\n",
324 |         "    nn.Linear(40, 1)\n",
325 |         ")\n"
326 |       ],
327 |       "metadata": {
328 |         "id": "Co1xI1kWqxJU"
329 |       },
330 |       "execution_count": null,
331 |       "outputs": []
332 |     },
333 |     {
334 |       "cell_type": "markdown",
335 |       "source": [
336 |         "### Layer-by-Layer Explanation\n",
337 |         "\n",
338 |         "1. **Input layer:**\n",
339 |         "  * Input: Number of inputs (`n_features`)\n",
340 |         "  * Output: Number of neurons in the first hidden layer (50 neurons) → a tunable hyperparameter\n",
341 |         "2. **Activation:**\n",
342 |         "  * [`nn.ReLU`](https://docs.pytorch.org/docs/stable/generated/torch.nn.ReLU.html)\n",
343 |         "  * Applies ReLU elementwise (no parameters, same input/output shape)\n",
344 |         "3. **Second hidden layer:**\n",
345 |         "  * Input: 50 neurons (must match previous output!)\n",
346 |         "  * Output: Number of neurons in the second hidden layer (40 neurons) → a tunable hyperparameter\n",
347 |         "4. **Activation:**\n",
348 |         "  * `nn.ReLU`\n",
349 |         "5. **Output layer:**\n",
350 |         "  * Input: 40 neurons (must match previous output!)\n",
351 |         "  * Output: 1 neuron (must match the regression target dimensionality!)"
352 |       ],
353 |       "metadata": {
354 |         "id": "EGZzwqpJpwxC"
355 |       }
356 |     },
357 |     {
358 |       "cell_type": "markdown",
359 |       "source": [
360 |         "### Loss functions and optimizers"
361 |       ],
362 |       "metadata": {
363 |         "id": "ZepLa-OBoKvx"
364 |       }
365 |     },
366 |     {
367 |       "cell_type": "code",
368 |       "source": [
369 |         "mse_loss = nn.MSELoss()\n",
370 |         "\n",
371 |         "sgd = optim.SGD(\n",
372 |         "    housing_model.parameters(),\n",
373 |         "    lr=1e-2\n",
374 |         ")"
375 |       ],
376 |       "metadata": {
377 |         "id": "g02_w1bToMsD"
378 |       },
379 |       "execution_count": null,
380 |       "outputs": []
381 |     },
382 |     {
383 |       "cell_type": "markdown",
384 |       "source": [
385 |         "### Define a training loop\n"
386 |       ],
387 |       "metadata": {
388 |         "id": "k5BNFJksvm9l"
389 |       }
390 |     },
391 |     {
392 |       "cell_type": "code",
393 |       "source": [
394 |         "def train(\n",
395 |         "    model_fn,\n",
396 |         "    criterion,\n",
397 |         "    optimizer,\n",
398 |         "    X_train,\n",
399 |         "    y_train,\n",
400 |         "    X_val,\n",
401 |         "    y_val,\n",
402 |         "    n_epochs,\n",
403 |         "    log_epochs=100,\n",
404 |         "    ):\n",
405 |         "\n",
406 |         "    for epoch in range(n_epochs):\n",
407 |         "        # forward pass\n",
408 |         "        y_pred = model_fn(X_train)\n",
409 |         "        train_loss = criterion(y_pred, y_train)\n",
410 |         "\n",
411 |         "        # backward pass\n",
412 |         "        train_loss.backward()\n",
413 |         "\n",
414 |         "        # gradient descent step\n",
415 |         "        optimizer.step()\n",
416 |         "        optimizer.zero_grad()\n",
417 |         "\n",
418 |         "        # evaluate using the validation data\n",
419 |         "        with torch.no_grad():\n",
420 |         "            y_pred = model_fn(X_val)\n",
421 |         "            val_loss = criterion(y_pred, y_val)\n",
422 |         "\n",
423 |         "        if (epoch + 1) % log_epochs == 0:\n",
424 |         "            print(f\"Epoch {epoch + 1}/{n_epochs}, Training Loss: {train_loss.item(): .4f}, Val Loss: {val_loss.item(): .4f}\")\n",
425 |         "\n"
426 |       ],
427 |       "metadata": {
428 |         "id": "ftwFGTT9slw3"
429 |       },
430 |       "execution_count": null,
431 |       "outputs": []
432 |     },
433 |     {
434 |       "cell_type": "code",
435 |       "source": [
436 |         "train(\n",
437 |         "    housing_model,\n",
438 |         "    mse_loss,\n",
439 |         "    sgd,\n",
440 |         "    X_train,\n",
441 |         "    y_train,\n",
442 |         "    X_val,\n",
443 |         "    y_val,\n",
444 |         "    n_epochs=1000\n",
445 |         ")"
446 |       ],
447 |       "metadata": {
448 |         "id": "Mm7oF8Zstcf9"
449 |       },
450 |       "execution_count": null,
451 |       "outputs": []
452 |     },
453 |     {
454 |       "cell_type": "code",
455 |       "source": [],
456 |       "metadata": {
457 |         "id": "yXcj3qKcJtPw"
458 |       },
459 |       "execution_count": null,
460 |       "outputs": []
461 |     }
462 |   ]
463 | }


--------------------------------------------------------------------------------
/notebooks/02a-building-data-pipelines-in-pytorch.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |   "cells": [
  3 |     {
  4 |       "cell_type": "markdown",
  5 |       "metadata": {
  6 |         "id": "view-in-github",
  7 |         "colab_type": "text"
  8 |       },
  9 |       "source": [
 10 |         "<a href=\"https://colab.research.google.com/github/davidrpugh/introduction-to-deep-learning/blob/master/notebooks/02a-building-data-pipelines-in-pytorch.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
 11 |       ]
 12 |     },
 13 |     {
 14 |       "cell_type": "markdown",
 15 |       "metadata": {
 16 |         "id": "loakLDv7VYxi"
 17 |       },
 18 |       "source": [
 19 |         "# Building Data Pipelines in PyTorch"
 20 |       ]
 21 |     },
 22 |     {
 23 |       "cell_type": "code",
 24 |       "execution_count": null,
 25 |       "metadata": {
 26 |         "id": "EeUl4L9uVSLB"
 27 |       },
 28 |       "outputs": [],
 29 |       "source": [
 30 |         "import pathlib\n",
 31 |         "\n",
 32 |         "import numpy as np\n",
 33 |         "import pandas as pd\n",
 34 |         "from sklearn import preprocessing, pipeline\n",
 35 |         "import torch\n",
 36 |         "from torch import utils"
 37 |       ]
 38 |     },
 39 |     {
 40 |       "cell_type": "code",
 41 |       "execution_count": null,
 42 |       "metadata": {
 43 |         "id": "K4iPa9cMVX7y"
 44 |       },
 45 |       "outputs": [],
 46 |       "source": [
 47 |         "\n",
 48 |         "print(torch.__version__)"
 49 |       ]
 50 |     },
 51 |     {
 52 |       "cell_type": "markdown",
 53 |       "metadata": {
 54 |         "id": "O9s02l3t4fph"
 55 |       },
 56 |       "source": [
 57 |         "## Creating a DataLoader from existing tensors\n",
 58 |         "\n",
 59 |         "* PyTorch’s `DataLoader` helps efficiently load data in mini-batches.\n",
 60 |         "* Can shuffle data each epoch for better generalization: always set `shuffle=True` when training; can set `shuffle=False` when evaluating.\n",
 61 |         "* Expects a `Dataset` with the following methods:\n",
 62 |         "\n",
 63 |         "  1. `__len__(self)` → returns dataset size\n",
 64 |         "  2. `__getitem__(self, index)` → returns sample and target values for the index into the provided `Dataset`."
 65 |       ]
 66 |     },
 67 |     {
 68 |       "cell_type": "code",
 69 |       "execution_count": null,
 70 |       "metadata": {
 71 |         "id": "qQlgDRzQ6Um_"
 72 |       },
 73 |       "outputs": [],
 74 |       "source": [
 75 |         "utils.data.DataLoader?"
 76 |       ]
 77 |     },
 78 |     {
 79 |       "cell_type": "code",
 80 |       "execution_count": null,
 81 |       "metadata": {
 82 |         "id": "xZ1lN3gA4jwN"
 83 |       },
 84 |       "outputs": [],
 85 |       "source": [
 86 |         "prng = torch.manual_seed(42)\n",
 87 |         "dataset = torch.rand(10, 8, generator=prng)"
 88 |       ]
 89 |     },
 90 |     {
 91 |       "cell_type": "code",
 92 |       "source": [
 93 |         "print(dataset)"
 94 |       ],
 95 |       "metadata": {
 96 |         "id": "wLRChKuPfxy5"
 97 |       },
 98 |       "execution_count": null,
 99 |       "outputs": []
100 |     },
101 |     {
102 |       "cell_type": "code",
103 |       "execution_count": null,
104 |       "metadata": {
105 |         "id": "m2lmCpf24vjM"
106 |       },
107 |       "outputs": [],
108 |       "source": [
109 |         "dataloader = utils.data.DataLoader(dataset)"
110 |       ]
111 |     },
112 |     {
113 |       "cell_type": "code",
114 |       "execution_count": null,
115 |       "metadata": {
116 |         "id": "icJi3HeJ44eO"
117 |       },
118 |       "outputs": [],
119 |       "source": [
120 |         "# default batch_size = 1\n",
121 |         "for i, batch in enumerate(dataloader):\n",
122 |         "    print(batch)"
123 |       ]
124 |     },
125 |     {
126 |       "cell_type": "code",
127 |       "execution_count": null,
128 |       "metadata": {
129 |         "id": "guLaXtSq5RAO"
130 |       },
131 |       "outputs": [],
132 |       "source": [
133 |         "# always manually set the batch_size for your problem!\n",
134 |         "dataloader = utils.data.DataLoader(dataset, batch_size=3)\n",
135 |         "\n",
136 |         "for i, batch in enumerate(dataloader):\n",
137 |         "    print(batch)"
138 |       ]
139 |     },
140 |     {
141 |       "cell_type": "code",
142 |       "execution_count": null,
143 |       "metadata": {
144 |         "id": "qhPQJEuf6YDA"
145 |       },
146 |       "outputs": [],
147 |       "source": [
148 |         "# drop_last=True useful when dataset is not evenly divisible by the batch_size\n",
149 |         "dataloader = utils.data.DataLoader(dataset, batch_size=3, drop_last=True)\n",
150 |         "\n",
151 |         "for i, batch in enumerate(dataloader):\n",
152 |         "    print(batch)"
153 |       ]
154 |     },
155 |     {
156 |       "cell_type": "markdown",
157 |       "metadata": {
158 |         "id": "GjdyzznD6jdV"
159 |       },
160 |       "source": [
161 |         "## Combining two tensors into a joint dataset\n",
162 |         "\n",
163 |         "Often we will have datasets that combine two (or more!) tensors and we want to be able to shuffle and grab batches of all the different tensors and retrieve the results as tuples."
164 |       ]
165 |     },
166 |     {
167 |       "cell_type": "code",
168 |       "execution_count": null,
169 |       "metadata": {
170 |         "id": "n4cGDPRv7gmh"
171 |       },
172 |       "outputs": [],
173 |       "source": [
174 |         "class JointDataset(utils.data.Dataset):\n",
175 |         "    \"\"\"Example of creating a custom dataset.\"\"\"\n",
176 |         "\n",
177 |         "    def __init__(self, d1, d2):\n",
178 |         "        self. _d1 = d1\n",
179 |         "        self._d2 = d2\n",
180 |         "\n",
181 |         "    def __len__(self):\n",
182 |         "        return len(self._d1)\n",
183 |         "\n",
184 |         "    def __getitem__(self, idx):\n",
185 |         "        return self._d1[idx], self._d2[idx]\n",
186 |         "\n"
187 |       ]
188 |     },
189 |     {
190 |       "cell_type": "code",
191 |       "execution_count": null,
192 |       "metadata": {
193 |         "id": "njVEBnX16ik1"
194 |       },
195 |       "outputs": [],
196 |       "source": [
197 |         "features = torch.rand(10, 8, generator=prng)\n",
198 |         "target = torch.rand(10, 1, generator=prng)"
199 |       ]
200 |     },
201 |     {
202 |       "cell_type": "code",
203 |       "execution_count": null,
204 |       "metadata": {
205 |         "id": "2H8xZKhz6e1_"
206 |       },
207 |       "outputs": [],
208 |       "source": [
209 |         "dataset = JointDataset(features, target)"
210 |       ]
211 |     },
212 |     {
213 |       "cell_type": "code",
214 |       "execution_count": null,
215 |       "metadata": {
216 |         "id": "-kg0JFcx8cgt"
217 |       },
218 |       "outputs": [],
219 |       "source": [
220 |         "dataloader = utils.data.DataLoader(dataset, batch_size=3)\n",
221 |         "\n",
222 |         "for i, (feature_batch, target_batch) in enumerate(dataloader):\n",
223 |         "    print(feature_batch, target_batch)"
224 |       ]
225 |     },
226 |     {
227 |       "cell_type": "code",
228 |       "execution_count": null,
229 |       "metadata": {
230 |         "id": "TCtQIz2981si"
231 |       },
232 |       "outputs": [],
233 |       "source": [
234 |         "# this use-case is so common that there is a built-in class supporting it!\n",
235 |         "utils.data.TensorDataset?"
236 |       ]
237 |     },
238 |     {
239 |       "cell_type": "code",
240 |       "execution_count": null,
241 |       "metadata": {
242 |         "id": "PAIoJ5h78jv4"
243 |       },
244 |       "outputs": [],
245 |       "source": [
246 |         "dataset = utils.data.TensorDataset(features, target)\n",
247 |         "dataloader = utils.data.DataLoader(dataset, batch_size=3)\n",
248 |         "\n",
249 |         "for i, (feature_batch, target_batch) in enumerate(dataloader):\n",
250 |         "    print(feature_batch, target_batch)"
251 |       ]
252 |     },
253 |     {
254 |       "cell_type": "markdown",
255 |       "metadata": {
256 |         "id": "xlwzY15-9Kmo"
257 |       },
258 |       "source": [
259 |         "## Shuffle, batch, repeat\n",
260 |         "\n",
261 |         "A key aspect of training neural networks effectively using stochastic gradient descent is to repeatedly sample batches of data from the shuffled dataset."
262 |       ]
263 |     },
264 |     {
265 |       "cell_type": "code",
266 |       "execution_count": null,
267 |       "metadata": {
268 |         "id": "b7r4zmPp9KBJ"
269 |       },
270 |       "outputs": [],
271 |       "source": [
272 |         "dataset = utils.data.TensorDataset(features, target)\n",
273 |         "train_dataloader = utils.data.DataLoader(\n",
274 |         "    dataset,\n",
275 |         "    batch_size=3,\n",
276 |         "    shuffle=True,   # always shuffle during training!\n",
277 |         ")\n"
278 |       ]
279 |     },
280 |     {
281 |       "cell_type": "code",
282 |       "execution_count": null,
283 |       "metadata": {
284 |         "id": "S5SPYU_G8-mS"
285 |       },
286 |       "outputs": [],
287 |       "source": [
288 |         "for i, (feature_batch, target_batch) in enumerate(train_dataloader):\n",
289 |         "    print(feature_batch, target_batch)"
290 |       ]
291 |     },
292 |     {
293 |       "cell_type": "code",
294 |       "execution_count": null,
295 |       "metadata": {
296 |         "id": "0TC-eff49Zil"
297 |       },
298 |       "outputs": [],
299 |       "source": [
300 |         "epochs = 2\n",
301 |         "for epoch in range(epochs):\n",
302 |         "    for i, (X, y) in enumerate(train_dataloader):\n",
303 |         "        print(f\"Epoch {epoch}: features: {X}, target: {y}\")"
304 |       ]
305 |     },
306 |     {
307 |       "cell_type": "markdown",
308 |       "metadata": {
309 |         "id": "ghG-X2Ad-HjX"
310 |       },
311 |       "source": [
312 |         "## Creating a dataset from files on your local disk"
313 |       ]
314 |     },
315 |     {
316 |       "cell_type": "code",
317 |       "execution_count": null,
318 |       "metadata": {
319 |         "id": "ZbCY6vXb-HDW"
320 |       },
321 |       "outputs": [],
322 |       "source": [
323 |         "%%bash\n",
324 |         "\n",
325 |         "ls -l ./sample_data"
326 |       ]
327 |     },
328 |     {
329 |       "cell_type": "markdown",
330 |       "source": [
331 |         "### Loading files into DataFrames"
332 |       ],
333 |       "metadata": {
334 |         "id": "oCQdidNtGdLm"
335 |       }
336 |     },
337 |     {
338 |       "cell_type": "code",
339 |       "execution_count": null,
340 |       "metadata": {
341 |         "id": "RzKLM4ugPSQa"
342 |       },
343 |       "outputs": [],
344 |       "source": [
345 |         "DATA_DIR = pathlib.Path(\"./sample_data\")"
346 |       ]
347 |     },
348 |     {
349 |       "cell_type": "code",
350 |       "execution_count": null,
351 |       "metadata": {
352 |         "id": "prEF6bA092lY"
353 |       },
354 |       "outputs": [],
355 |       "source": [
356 |         "housing_train_df = pd.read_csv(DATA_DIR / \"california_housing_train.csv\")\n",
357 |         "housing_test_df = pd.read_csv(DATA_DIR / \"california_housing_test.csv\")\n"
358 |       ]
359 |     },
360 |     {
361 |       "cell_type": "code",
362 |       "execution_count": null,
363 |       "metadata": {
364 |         "id": "5a8WS4IFCnWL"
365 |       },
366 |       "outputs": [],
367 |       "source": [
368 |         "housing_train_df.info()"
369 |       ]
370 |     },
371 |     {
372 |       "cell_type": "code",
373 |       "execution_count": null,
374 |       "metadata": {
375 |         "id": "wfmG96qvR4zt"
376 |       },
377 |       "outputs": [],
378 |       "source": [
379 |         "housing_train_features_df = housing_train_df.drop(\"median_house_value\", axis=1)\n",
380 |         "housing_train_target = housing_train_df.loc[:, \"median_house_value\"]\n",
381 |         "\n",
382 |         "housing_test_features_df = housing_test_df.drop(\"median_house_value\", axis=1)\n",
383 |         "housing_test_target = housing_test_df.loc[:, \"median_house_value\"]"
384 |       ]
385 |     },
386 |     {
387 |       "cell_type": "markdown",
388 |       "source": [
389 |         "### Converting from DataFrames to Tensors"
390 |       ],
391 |       "metadata": {
392 |         "id": "SiH7ZwoAGQjt"
393 |       }
394 |     },
395 |     {
396 |       "cell_type": "code",
397 |       "execution_count": null,
398 |       "metadata": {
399 |         "id": "pF87niWqRYhZ"
400 |       },
401 |       "outputs": [],
402 |       "source": [
403 |         "def array_to_tensor(arr, dtype=torch.float32):\n",
404 |         "  return torch.tensor(arr, dtype=dtype)\n",
405 |         "\n",
406 |         "\n",
407 |         "def dataframe_to_tensor(df, dtype=torch.float32):\n",
408 |         "    arr = df.to_numpy()\n",
409 |         "    return array_to_tensor(arr, dtype)\n",
410 |         "\n",
411 |         "\n",
412 |         "def series_to_tensor(s, dtype=torch.float32):\n",
413 |         "    arr = s.to_numpy()\n",
414 |         "    return array_to_tensor(arr, dtype)\n"
415 |       ]
416 |     },
417 |     {
418 |       "cell_type": "code",
419 |       "source": [
420 |         "prepare_housing_features = pipeline.make_pipeline(\n",
421 |         "    preprocessing.StandardScaler(),\n",
422 |         "    preprocessing.FunctionTransformer(\n",
423 |         "        func=array_to_tensor\n",
424 |         "    )\n",
425 |         ")\n",
426 |         "\n",
427 |         "prepare_housing_target = pipeline.make_pipeline(\n",
428 |         "    preprocessing.FunctionTransformer(\n",
429 |         "        func=series_to_tensor\n",
430 |         "    ),\n",
431 |         "    preprocessing.FunctionTransformer(\n",
432 |         "        func=torch.unsqueeze,\n",
433 |         "        kw_args={\n",
434 |         "            \"dim\": 1\n",
435 |         "        }\n",
436 |         "    )\n",
437 |         ")\n",
438 |         "\n"
439 |       ],
440 |       "metadata": {
441 |         "id": "wkEgeHigwUSi"
442 |       },
443 |       "execution_count": null,
444 |       "outputs": []
445 |     },
446 |     {
447 |       "cell_type": "code",
448 |       "execution_count": null,
449 |       "metadata": {
450 |         "id": "mtkHuU4ID--U"
451 |       },
452 |       "outputs": [],
453 |       "source": [
454 |         "X_train = prepare_housing_features.fit_transform(housing_train_features_df)\n",
455 |         "y_train = prepare_housing_target.fit_transform(housing_train_target)\n",
456 |         "housing_train_dataset = utils.data.TensorDataset(\n",
457 |         "    X_train,\n",
458 |         "    y_train,\n",
459 |         ")\n",
460 |         "\n",
461 |         "X_test = prepare_housing_features.transform(housing_test_features_df)\n",
462 |         "y_test = prepare_housing_target.transform(housing_test_target)\n",
463 |         "housing_test_dataset = utils.data.TensorDataset(\n",
464 |         "    X_test,\n",
465 |         "    y_test,\n",
466 |         ")"
467 |       ]
468 |     },
469 |     {
470 |       "cell_type": "code",
471 |       "execution_count": null,
472 |       "metadata": {
473 |         "id": "nH9fwAJxBFan"
474 |       },
475 |       "outputs": [],
476 |       "source": [
477 |         "housing_train_dataloader = utils.data.DataLoader(\n",
478 |         "    housing_train_dataset,\n",
479 |         "    batch_size=4,\n",
480 |         "    shuffle=True\n",
481 |         ")\n",
482 |         "\n",
483 |         "for i, (X, y) in enumerate(housing_train_dataloader):\n",
484 |         "    print(X, y)\n",
485 |         "    if i > 4:\n",
486 |         "        break"
487 |       ]
488 |     },
489 |     {
490 |       "cell_type": "code",
491 |       "execution_count": null,
492 |       "metadata": {
493 |         "id": "Fq3p7QEXG-Xf"
494 |       },
495 |       "outputs": [],
496 |       "source": [
497 |         "# TODO: Add example of loading data directly from disk to mimic dataset that doesn't fit in memory!"
498 |       ]
499 |     }
500 |   ],
501 |   "metadata": {
502 |     "colab": {
503 |       "provenance": [],
504 |       "include_colab_link": true
505 |     },
506 |     "kernelspec": {
507 |       "display_name": "Python 3 (ipykernel)",
508 |       "language": "python",
509 |       "name": "python3"
510 |     },
511 |     "language_info": {
512 |       "codemirror_mode": {
513 |         "name": "ipython",
514 |         "version": 3
515 |       },
516 |       "file_extension": ".py",
517 |       "mimetype": "text/x-python",
518 |       "name": "python",
519 |       "nbconvert_exporter": "python",
520 |       "pygments_lexer": "ipython3",
521 |       "version": "3.10.6"
522 |     }
523 |   },
524 |   "nbformat": 4,
525 |   "nbformat_minor": 0
526 | }


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/KAUST-Academy/introduction-to-machine-learning/HEAD)
  2 | 
  3 | # Introduction to Deep Learning
  4 | 
  5 | There is strong demand for deep learning skills and expertise to solve challenging business problems both globally and locally in KSA. This course will help learners build capacity in core DL tools and methods and enable them to develop their own deep learning applications. This course covers the basic theory behind DL algorithms but the majority of the focus is on hands-on examples using [PyTorch](https://pytorch.org/).
  6 | 
  7 | ## Learning Objectives
  8 | 
  9 | The primary learning objective of this course is to provide students with practical, hands-on experience with state-of-the-art machine learning and deep learning tools that are widely used in industry.
 10 | 
 11 | This course covers portions of chapters 10-19 of [Hands-on Machine Learning with Scikit-Learn, Keras, and TensorFlow](https://www.oreilly.com/library/view/hands-on-machine-learning/9781492032632/) and chapters 11-19 of [Machine Learning with PyTorch and Scikit-Learn](https://www.packtpub.com/product/machine-learning-with-pytorch-and-scikit-learn/9781801819312). The following topics will be discussed.
 12 | 
 13 | * Introduction to Artificial Neural Networks (ANNs) 
 14 | * Training Deep Neural Networks (DNNs) 
 15 | * Custom Models and Training with PyTorch and Lightning 
 16 | * Strategies for Loading and Preprocessing Data
 17 | * Training and Deploying PyTorch Models at Scale 
 18 | 
 19 | ## Lectures
 20 | 
 21 | ### Lecture 1: An Introduction to Artificial Neural Networks [(Slides)](https://kaust-my.sharepoint.com/:p:/g/personal/pughdr_kaust_edu_sa/EfRHWqkIFjpBk9j8aL4I3fABUngt5d3uccvxjDuuurtYfA?e=ffNqUj)
 22 | 
 23 | In this lecture we cover the fundamental concepts behind artificial neural networks (ANNs), beginning with the biological inspiration and simple perceptrons, then moving to multi-layer perceptrons (MLPs) and the back-propagation algorithm for training them. The hands-on labs will cover the fundamental concepts of PyTorch and then demonstrate how to implement foundational models for supervised learning problems such as linear and logistic regression, and MLPs from scratch with PyTorch as well as how to train these models using full-batch gradient descent.
 24 | 
 25 | **Suggested reading:**
 26 | 
 27 | * Chapter 9 of [Hands-on Machine Learning with Scikit-Learn and PyTorch](https://www.oreilly.com/library/view/hands-on-machine-learning/9798341607972/) 
 28 | * Chapter 11 of [ML with PyTorch and Sklearn](https://www.oreilly.com/library/view/machine-learning-with/9781801819312/)
 29 | 
 30 | | **Tutorial** | **Open in Google Colab** | **Open in Kaggle** |
 31 | |--------------|:------------------------:|:------------------:|
 32 | | First Steps with PyTorch | [![Google Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/davidrpugh/introduction-to-deep-learning/blob/master/notebooks/01a-first-steps-with-pytorch.ipynb) | [![Kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://kaggle.com/kernels/welcome?src=https://github.com/davidrpugh/introduction-to-deep-learning/blob/master/notebooks/01a-first-steps-with-pytorch.ipynb) |
 33 | | Linear Regression from Scratch with PyTorch | [![Google Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/davidrpugh/introduction-to-deep-learning/blob/master/notebooks/01b-linear-regression-from-scratch-with-pytorch.ipynb) | [![Kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://kaggle.com/kernels/welcome?src=https://github.com/davidrpugh/introduction-to-deep-learning/blob/master/notebooks/01b-linear-regression-from-scratch-with-pytorch.ipynb) |
 34 | | Logistic Regression from Scratch with PyTorch | [![Google Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/davidrpugh/introduction-to-deep-learning/blob/master/notebooks/01c-logistic-regression-from-scratch-with-pytorch.ipynb) | [![Kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://kaggle.com/kernels/welcome?src=https://github.com/davidrpugh/introduction-to-deep-learning/blob/master/notebooks/01c-logistic-regression-from-scratch-with-pytorch.ipynb) |
 35 | | MLPs for Regression with PyTorch | [![Google Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/davidrpugh/introduction-to-deep-learning/blob/master/notebooks/01d-mlp-for-regression-with-pytorch.ipynb) | [![Kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://kaggle.com/kernels/welcome?src=https://github.com/davidrpugh/introduction-to-deep-learning/blob/master/notebooks/01d-mlp-for-regression-with-pytorch.ipynb) | 
 36 | | MLPs for Classification with PyTorch | [![Google Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/davidrpugh/introduction-to-deep-learning/blob/master/notebooks/01e-mlp-for-classification-with-pytorch.ipynb) | [![Kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://kaggle.com/kernels/welcome?src=https://github.com/davidrpugh/introduction-to-deep-learning/blob/master/notebooks/01e-mlp-for-classification-with-pytorch.ipynb) | 
 37 | 
 38 | ### Lecture 2: Building Neural Networks with PyTorch
 39 | 
 40 | In these hands-on labs we will discuss how to build and train multi-layer perceptron (MLP) models using the PyTorch API—choosing the number of layers and neurons, activation functions, loss functions, and optimizers—and illustrate how these networks form the basis of deep learning. Additional labs cover building data pipelines with PyTorch and model training using mini-batch stochastic gradient descent. This lecture emphasizes practical implementation in PyTorch and helps the student move from shallow models to basic deep architectures, preparing the ground for more advanced topics.
 41 | 
 42 | | **Tutorial** | **Open in Google Colab** | **Open in Kaggle** |
 43 | |--------------|:------------------------:|:------------------:|
 44 | | Building Data Pipelines with PyTorch | [![Google Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/davidrpugh/introduction-to-deep-learning/blob/master/notebooks/02a-building-data-pipelines-in-pytorch.ipynb) | [![Kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://kaggle.com/kernels/welcome?src=https://github.com/davidrpugh/introduction-to-deep-learning/blob/master/notebooks/02a-building-data-pipelines-in-pytorch.ipynb) | 
 45 | | Mini-batch Gradient Descent with PyTorch | [![Google Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/davidrpugh/introduction-to-deep-learning/blob/master/notebooks/02b-implementing-minibatch-gradient-descent.ipynb) | [![Kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://kaggle.com/kernels/welcome?src=https://github.com/davidrpugh/introduction-to-deep-learning/blob/master/notebooks/02b-implementing-minibatch-gradient-descent.ipynb) | 
 46 | | Model Evaluation with PyTorch and Torchmetrics | [![Google Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/davidrpugh/introduction-to-deep-learning/blob/master/notebooks/02c-model-evaluation.ipynb) | [![Kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://kaggle.com/kernels/welcome?src=https://github.com/davidrpugh/introduction-to-deep-learning/blob/master/notebooks/02c-model-evaluation.ipynb) | 
 47 | | Building an Image Classifier in PyTorch | [![Google Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/davidrpugh/introduction-to-deep-learning/blob/master/notebooks/02d-building-an-image-classifier-with-pytorch.ipynb) | [![Kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://kaggle.com/kernels/welcome?src=https://github.com/davidrpugh/introduction-to-deep-learning/blob/master/notebooks/02d-building-an-image-classifier-with-pytorch.ipynb) | 
 48 | | Saving and Loading PyTorch Models | [![Google Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/davidrpugh/introduction-to-deep-learning/blob/master/notebooks/02e-saving-and-loading-pytorch-models.ipynb) | [![Kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://kaggle.com/kernels/welcome?src=https://github.com/davidrpugh/introduction-to-deep-learning/blob/master/notebooks/02e-saving-and-loading-pytorch-models.ipynb) |
 49 | 
 50 | ### Lecture 3: Training Neural Networks, Part I: Stochastic Gradient Descent [(Slides)](https://kaust-my.sharepoint.com/:p:/g/personal/pughdr_kaust_edu_sa/ESk6HlHXmytAipnVVIeTCtsBnEVZLQeF4KNpE7A49GZCcA?e=VteHn9)
 51 | 
 52 | TBD
 53 | 
 54 | **Suggested reading:**
 55 | 
 56 | * Chapter 4 of [Hands-on Machine Learning with Scikit-Learn, Keras, and TensorFlow](https://www.oreilly.com/library/view/hands-on-machine-learning/9781492032632/) 
 57 | * Chapter ? of [ML with PyTorch and Sklearn](https://www.oreilly.com/library/view/machine-learning-with/9781801819312/)
 58 | 
 59 | | **Tutorial** | **Open in Google Colab** | **Open in Kaggle** |
 60 | |--------------|:------------------------:|:------------------:|
 61 | | Introduction to PyTorch Lightning | [![Google Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/davidrpugh/introduction-to-deep-learning/blob/master/notebooks/01e-introduction-to-pytorch-lightning.ipynb) | [![Kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://kaggle.com/kernels/welcome?src=https://github.com/davidrpugh/introduction-to-deep-learning/blob/master/notebooks/01e-introduction-to-pytorch-lightning.ipynb) | 
 62 | | Introduction to PyTorch Lightning | [![Google Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/KAUST-Academy/introduction-to-deep-learning/blob/master/notebooks/01e-introduction-to-pytorch-lightning.ipynb) | [![Kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://kaggle.com/kernels/welcome?src=https://github.com/KAUST-Academy/introduction-to-deep-learning/blob/master/notebooks/01e-introduction-to-pytorch-lightning.ipynb) |
 63 | 
 64 | 
 65 | ### Lecture 4: Training Neural Networks, Part II [(Slides)](https://kaust-my.sharepoint.com/:p:/g/personal/pughdr_kaust_edu_sa/EeA0M9ydAEVGmfAWZhzVDEwBWtnOVos3YroubDAkfswWoQ?e=ryFLJy)
 66 | 
 67 | * Consolidation of the previous day's content via Q/A and live coding demonstrations.
 68 | * The morning session will focus on the theory behind neural networks for solving both classification and regression problems by covering chapters 12-13 of [Hands-on Machine Learning with Scikit-Learn, Keras, and TensorFlow](https://www.oreilly.com/library/view/hands-on-machine-learning/9781492032632/) and chapters 12-13 of [Machine Learning with PyTorch and Scikit-Learn](https://www.packtpub.com/product/machine-learning-with-pytorch-and-scikit-learn/9781801819312).  
 69 | * The afternoon session will focus on applying the techniques learned in the morning session using [PyTorch](https://pytorch.org/), followed by a short assessment on the Kaggle data science competition platform.
 70 | 
 71 | | **Tutorial** | **Open in Google Colab** | **Open in Kaggle** |
 72 | |--------------|:------------------------:|:------------------:|
 73 | | Deep Dive into PyTorch | [![Google Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/KAUST-Academy/introduction-to-deep-learning/blob/master/notebooks/01d-deep-dive-into-pytorch.ipynb) | [![Kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://kaggle.com/kernels/welcome?src=https://github.com/KAUST-Academy/introduction-to-deep-learning/blob/master/notebooks/01d-deep-dive-into-pytorch.ipynb) | 
 74 | | Vanishing and Exploding Gradients | [![Google Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/KAUST-Academy/introduction-to-deep-learning/blob/master/notebooks/02c-vanishing-exploding-gradients.ipynb) | [![Kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://kaggle.com/kernels/welcome?src=https://github.com/KAUST-Academy/introduction-to-deep-learning/blob/master/notebooks/02c-vanishing-exploding-gradients.ipynb) | 
 75 | | Transfer Learning and Unsupervised Pre-training | [![Google Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/KAUST-Academy/introduction-to-deep-learning/blob/master/notebooks/02d-transfer-learning-and-unsupervised-pretraining.ipynb) | [![Kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://kaggle.com/kernels/welcome?src=https://github.com/KAUST-Academy/introduction-to-deep-learning/blob/master/notebooks/02d-transfer-learning-and-unsupervised-pretraining.ipynb) | 
 76 | | Faster Optimizers | [![Google Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/KAUST-Academy/introduction-to-deep-learning/blob/master/notebooks/02e-faster-optimizers.ipynb) | [![Kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://kaggle.com/kernels/welcome?src=https://github.com/KAUST-Academy/introduction-to-deep-learning/blob/master/notebooks/02e-faster-optimizers.ipynb) | 
 77 | | Learning Rate Schedulers | [![Google Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/KAUST-Academy/introduction-to-deep-learning/blob/master/notebooks/02f-learning-rate-schedulers.ipynb) | [![Kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://kaggle.com/kernels/welcome?src=https://github.com/KAUST-Academy/introduction-to-deep-learning/blob/master/notebooks/02f-learning-rate-schedulers.ipynb) | 
 78 | | Regularization | [![Google Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/KAUST-Academy/introduction-to-deep-learning/blob/master/notebooks/02g-regularization.ipynb) | [![Kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://kaggle.com/kernels/welcome?src=https://github.com/KAUST-Academy/introduction-to-deep-learning/blob/master/notebooks/02g-regularization.ipynb) | 
 79 | 
 80 | ### Module 3: [Deploying and Scaling PyTorch Models](https://kaust-my.sharepoint.com/:p:/g/personal/pughdr_kaust_edu_sa/EZItqTR1l3VOs0UzJJ7NAVABaDT2FN5kclARg2Rv8cnoxA?e=vyaLcu)
 81 | 
 82 | * Consolidation of the previous day's content via Q/A and live coding demonstrations.  
 83 | * The morning session will focus on various topics related to training and deploying PyTorch models at scale by covering chapter 19 of [Hands-on Machine Learning with Scikit-Learn, Keras, and TensorFlow](https://www.oreilly.com/library/view/hands-on-machine-learning/9781492032632/). 
 84 | * The afternoon session will allow time for a final assessment as well as additional time for learners to complete any of the previous assessments.
 85 | 
 86 | ## Assessment
 87 | 
 88 | Student performance on the course will be assessed through participation in a Kaggle classroom competition. 
 89 | 
 90 | # Repository Organization
 91 | 
 92 | Repository organization is based on ideas from [_Good Enough Practices for Scientific Computing_](https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1005510).
 93 | 
 94 | 1. Put each project in its own directory, which is named after the project.
 95 | 2. Put external scripts or compiled programs in the `bin` directory.
 96 | 3. Put raw data and metadata in a `data` directory.
 97 | 4. Put text documents associated with the project in the `doc` directory.
 98 | 5. Put all Docker related files in the `docker` directory.
 99 | 6. Install the Conda environment into an `env` directory. 
100 | 7. Put all notebooks in the `notebooks` directory.
101 | 8. Put files generated during cleanup and analysis in a `results` directory.
102 | 9. Put project source code in the `src` directory.
103 | 10. Name all files to reflect their content or function.
104 | 
105 | ## Building the Conda environment
106 | 
107 | After adding any necessary dependencies that should be downloaded via `conda` to the 
108 | `environment.yml` file and any dependencies that should be downloaded via `pip` to the 
109 | `requirements.txt` file, you create the Conda environment in a sub-directory `./env` of your project 
110 | directory by running the following commands.
111 | 
112 | ```bash
113 | export ENV_PREFIX=$PWD/env
114 | mamba env create --prefix $ENV_PREFIX --file environment.yml --force
115 | ```
116 | 
117 | Once the new environment has been created you can activate the environment with the following 
118 | command.
119 | 
120 | ```bash
121 | conda activate $ENV_PREFIX
122 | ```
123 | 
124 | Note that the `ENV_PREFIX` directory is *not* under version control as it can always be re-created as 
125 | necessary.
126 | 
127 | For your convenience these commands have been combined in a shell script `./bin/create-conda-env.sh`. 
128 | Running the shell script will create the Conda environment, activate the Conda environment, and build 
129 | JupyterLab with any additional extensions. The script should be run from the project root directory 
130 | as follows. 
131 | 
132 | ```bash
133 | ./bin/create-conda-env.sh
134 | ```
135 | 
136 | ### Ibex
137 | 
138 | The most efficient way to build Conda environments on Ibex is to launch the environment creation script 
139 | as a job on the debug partition via Slurm. For your convenience a Slurm job script 
140 | `./bin/create-conda-env.sbatch` is included. The script should be run from the project root directory 
141 | as follows.
142 | 
143 | ```bash
144 | sbatch ./bin/create-conda-env.sbatch
145 | ```
146 | 
147 | ### Listing the full contents of the Conda environment
148 | 
149 | The explicit dependencies for the project are listed in the `environment.yml` file. To see 
150 | the full list of packages installed into the environment, run the following command.
151 | 
152 | ```bash
153 | conda list --prefix $ENV_PREFIX
154 | ```
155 | 
156 | ### Updating the Conda environment
157 | 
158 | If you add (remove) dependencies to (from) the `environment.yml` file or the `requirements.txt` file 
159 | after the environment has already been created, then you can re-create the environment with the 
160 | following command.
161 | 
162 | ```bash
163 | mamba env create --prefix $ENV_PREFIX --file environment.yml --force
164 | ```
165 | 
166 | ## Using Docker
167 | 
168 | In order to build Docker images for your project and run containers with GPU acceleration you will 
169 | need to install 
170 | [Docker](https://docs.docker.com/install/linux/docker-ce/ubuntu/), 
171 | [Docker Compose](https://docs.docker.com/compose/install/) and the 
172 | [NVIDIA Docker runtime](https://github.com/NVIDIA/nvidia-docker).
173 | 
174 | Detailed instructions for using Docker to build an image and launch containers can be found in 
175 | the `docker/README.md`.
176 | 


--------------------------------------------------------------------------------
/notebooks/02b-implementing-minibatch-gradient-descent.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |   "nbformat": 4,
  3 |   "nbformat_minor": 0,
  4 |   "metadata": {
  5 |     "colab": {
  6 |       "provenance": [],
  7 |       "gpuType": "T4",
  8 |       "authorship_tag": "ABX9TyMMZRs3EgG/N55Ss/f8yKMw",
  9 |       "include_colab_link": true
 10 |     },
 11 |     "kernelspec": {
 12 |       "name": "python3",
 13 |       "display_name": "Python 3"
 14 |     },
 15 |     "language_info": {
 16 |       "name": "python"
 17 |     },
 18 |     "accelerator": "GPU"
 19 |   },
 20 |   "cells": [
 21 |     {
 22 |       "cell_type": "markdown",
 23 |       "metadata": {
 24 |         "id": "view-in-github",
 25 |         "colab_type": "text"
 26 |       },
 27 |       "source": [
 28 |         "<a href=\"https://colab.research.google.com/github/davidrpugh/introduction-to-deep-learning/blob/master/notebooks/02b-implementing-minibatch-gradient-descent.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
 29 |       ]
 30 |     },
 31 |     {
 32 |       "cell_type": "markdown",
 33 |       "source": [
 34 |         "# Implementing Mini-Batch Gradient Descent using DataLoaders"
 35 |       ],
 36 |       "metadata": {
 37 |         "id": "6408e1_DUgNI"
 38 |       }
 39 |     },
 40 |     {
 41 |       "cell_type": "code",
 42 |       "source": [
 43 |         "import numpy as np\n",
 44 |         "from sklearn import compose, datasets, model_selection, pipeline, preprocessing\n",
 45 |         "\n",
 46 |         "import torch\n",
 47 |         "from torch import nn, optim, utils"
 48 |       ],
 49 |       "metadata": {
 50 |         "id": "hAF4bXNEUmwI"
 51 |       },
 52 |       "execution_count": null,
 53 |       "outputs": []
 54 |     },
 55 |     {
 56 |       "cell_type": "code",
 57 |       "source": [
 58 |         "print(np.__version__)"
 59 |       ],
 60 |       "metadata": {
 61 |         "id": "jI-q-B5VVC5t"
 62 |       },
 63 |       "execution_count": null,
 64 |       "outputs": []
 65 |     },
 66 |     {
 67 |       "cell_type": "markdown",
 68 |       "source": [
 69 |         "## Verifying availability of GPU(s)"
 70 |       ],
 71 |       "metadata": {
 72 |         "id": "Jw2YB037gupQ"
 73 |       }
 74 |     },
 75 |     {
 76 |       "cell_type": "code",
 77 |       "source": [
 78 |         "# check that torch version has support for cuda\n",
 79 |         "print(torch.__version__)"
 80 |       ],
 81 |       "metadata": {
 82 |         "id": "WI2Hc3ssVFrp"
 83 |       },
 84 |       "execution_count": null,
 85 |       "outputs": []
 86 |     },
 87 |     {
 88 |       "cell_type": "code",
 89 |       "source": [
 90 |         "%%bash\n",
 91 |         "\n",
 92 |         "# check that GPUs are physically available\n",
 93 |         "nvidia-smi"
 94 |       ],
 95 |       "metadata": {
 96 |         "id": "QVTrHw0Hg3jU"
 97 |       },
 98 |       "execution_count": null,
 99 |       "outputs": []
100 |     },
101 |     {
102 |       "cell_type": "code",
103 |       "source": [
104 |         "# check that PyTorch can find the GPUs\n",
105 |         "print(torch.cuda.is_available())"
106 |       ],
107 |       "metadata": {
108 |         "id": "vWB6iFrZg0dG"
109 |       },
110 |       "execution_count": null,
111 |       "outputs": []
112 |     },
113 |     {
114 |       "cell_type": "code",
115 |       "source": [
116 |         "DEVICE = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")"
117 |       ],
118 |       "metadata": {
119 |         "id": "IvmFtiBRXwYM"
120 |       },
121 |       "execution_count": null,
122 |       "outputs": []
123 |     },
124 |     {
125 |       "cell_type": "code",
126 |       "source": [
127 |         "print(DEVICE)"
128 |       ],
129 |       "metadata": {
130 |         "id": "arEX0TRPX1pM"
131 |       },
132 |       "execution_count": null,
133 |       "outputs": []
134 |     },
135 |     {
136 |       "cell_type": "markdown",
137 |       "source": [
138 |         "## Loading the data"
139 |       ],
140 |       "metadata": {
141 |         "id": "qU14efMhVJjE"
142 |       }
143 |     },
144 |     {
145 |       "cell_type": "code",
146 |       "execution_count": null,
147 |       "metadata": {
148 |         "id": "pi4TvYKpUWAC"
149 |       },
150 |       "outputs": [],
151 |       "source": [
152 |         "covtype_dataset = datasets.fetch_covtype(\n",
153 |         "    as_frame=True\n",
154 |         ")"
155 |       ]
156 |     },
157 |     {
158 |       "cell_type": "code",
159 |       "source": [
160 |         "print(covtype_dataset[\"DESCR\"])"
161 |       ],
162 |       "metadata": {
163 |         "id": "agASIJEGUjX6"
164 |       },
165 |       "execution_count": null,
166 |       "outputs": []
167 |     },
168 |     {
169 |       "cell_type": "code",
170 |       "source": [
171 |         "covtype_features_df = covtype_dataset[\"data\"]\n",
172 |         "covtype_target_df = (\n",
173 |         "    covtype_dataset.get(\"target\")\n",
174 |         "                   .to_frame()\n",
175 |         ")"
176 |       ],
177 |       "metadata": {
178 |         "id": "yQvLcEMhUp6w"
179 |       },
180 |       "execution_count": null,
181 |       "outputs": []
182 |     },
183 |     {
184 |       "cell_type": "markdown",
185 |       "source": [
186 |         "## Preparing the data"
187 |       ],
188 |       "metadata": {
189 |         "id": "PzsiqOoIVgBf"
190 |       }
191 |     },
192 |     {
193 |       "cell_type": "markdown",
194 |       "source": [
195 |         "### Train/Val split"
196 |       ],
197 |       "metadata": {
198 |         "id": "4a4kbLN3U1Wu"
199 |       }
200 |     },
201 |     {
202 |       "cell_type": "code",
203 |       "source": [
204 |         "RANDOM_STATE = np.random.RandomState(42)\n",
205 |         "\n",
206 |         "\n",
207 |         "train_features_df, val_features_df, train_target_df, val_target_df = (\n",
208 |         "    model_selection.train_test_split(\n",
209 |         "        covtype_features_df,\n",
210 |         "        covtype_target_df,\n",
211 |         "        test_size=0.20,\n",
212 |         "        shuffle=True,\n",
213 |         "        stratify=covtype_target_df,\n",
214 |         "        random_state=RANDOM_STATE\n",
215 |         "    )\n",
216 |         ")\n"
217 |       ],
218 |       "metadata": {
219 |         "id": "sjUrsUbmUzEw"
220 |       },
221 |       "execution_count": null,
222 |       "outputs": []
223 |     },
224 |     {
225 |       "cell_type": "markdown",
226 |       "source": [
227 |         "#### Features and target preparation"
228 |       ],
229 |       "metadata": {
230 |         "id": "npDOm1-1WTUU"
231 |       }
232 |     },
233 |     {
234 |       "cell_type": "code",
235 |       "source": [
236 |         "def array_to_tensor(arr, dtype=torch.float32):\n",
237 |         "    return torch.tensor(arr, dtype=dtype)\n",
238 |         "\n",
239 |         "\n",
240 |         "prepare_covtype_features = pipeline.make_pipeline(\n",
241 |         "    compose.make_column_transformer(\n",
242 |         "        (\n",
243 |         "            \"passthrough\",\n",
244 |         "            compose.make_column_selector(\n",
245 |         "                pattern=\"^Wilderness_Area_|^Soil_Type_\"\n",
246 |         "            )\n",
247 |         "        ),\n",
248 |         "        force_int_remainder_cols=False,\n",
249 |         "        n_jobs=-1,\n",
250 |         "        remainder=preprocessing.QuantileTransformer(\n",
251 |         "            output_distribution=\"normal\",\n",
252 |         "            random_state=RANDOM_STATE,\n",
253 |         "        )\n",
254 |         "    ),\n",
255 |         "    preprocessing.FunctionTransformer(\n",
256 |         "        func=array_to_tensor,\n",
257 |         "    )\n",
258 |         ")\n",
259 |         "\n",
260 |         "prepare_covtype_target = pipeline.make_pipeline(\n",
261 |         "    preprocessing.OrdinalEncoder(\n",
262 |         "        categories=[\n",
263 |         "            [1, 2, 3, 4, 5, 6, 7]\n",
264 |         "        ],\n",
265 |         "    ),\n",
266 |         "    preprocessing.FunctionTransformer(\n",
267 |         "        func=array_to_tensor,\n",
268 |         "        kw_args={\n",
269 |         "            \"dtype\": torch.int64\n",
270 |         "        }\n",
271 |         "    ),\n",
272 |         "    preprocessing.FunctionTransformer(\n",
273 |         "        func=torch.squeeze,\n",
274 |         "    )\n",
275 |         ")\n",
276 |         "\n"
277 |       ],
278 |       "metadata": {
279 |         "id": "rkuiF7znVunl"
280 |       },
281 |       "execution_count": null,
282 |       "outputs": []
283 |     },
284 |     {
285 |       "cell_type": "code",
286 |       "source": [
287 |         "X_train = prepare_covtype_features.fit_transform(train_features_df)\n",
288 |         "X_val = prepare_covtype_features.transform(val_features_df)\n"
289 |       ],
290 |       "metadata": {
291 |         "id": "WoEwybKkVPKz"
292 |       },
293 |       "execution_count": null,
294 |       "outputs": []
295 |     },
296 |     {
297 |       "cell_type": "code",
298 |       "source": [
299 |         "print(X_train.shape)\n",
300 |         "print(X_val.shape)"
301 |       ],
302 |       "metadata": {
303 |         "id": "ck0rwCW0WMk4"
304 |       },
305 |       "execution_count": null,
306 |       "outputs": []
307 |     },
308 |     {
309 |       "cell_type": "code",
310 |       "source": [
311 |         "y_train = prepare_covtype_target.fit_transform(train_target_df)\n",
312 |         "y_val = prepare_covtype_target.transform(val_target_df)\n"
313 |       ],
314 |       "metadata": {
315 |         "id": "0VRdZGsZWHhc"
316 |       },
317 |       "execution_count": null,
318 |       "outputs": []
319 |     },
320 |     {
321 |       "cell_type": "code",
322 |       "source": [
323 |         "print(y_train.shape)\n",
324 |         "print(y_val.shape)"
325 |       ],
326 |       "metadata": {
327 |         "id": "x7wl1vR-WMIc"
328 |       },
329 |       "execution_count": null,
330 |       "outputs": []
331 |     },
332 |     {
333 |       "cell_type": "markdown",
334 |       "source": [
335 |         "#### Datasets\n",
336 |         "\n",
337 |         "* Combine features tensor and targets tensor for each split into a single dataset.\n",
338 |         "* Train/val/test tensors are stored in separate datasets.\n"
339 |       ],
340 |       "metadata": {
341 |         "id": "bRRYnaSlWaW-"
342 |       }
343 |     },
344 |     {
345 |       "cell_type": "code",
346 |       "source": [
347 |         "train_dataset = utils.data.TensorDataset(X_train, y_train)\n",
348 |         "val_dataset = utils.data.TensorDataset(X_val, y_val)"
349 |       ],
350 |       "metadata": {
351 |         "id": "RcQc1S2GWPJD"
352 |       },
353 |       "execution_count": null,
354 |       "outputs": []
355 |     },
356 |     {
357 |       "cell_type": "markdown",
358 |       "source": [
359 |         "#### DataLoaders\n",
360 |         "\n",
361 |         "* Use pinned memory, `pin_memory=True`, for faster GPU transfers.\n",
362 |         "* Transfer data batches with `non_blocking=True` to avoid CPU blocking GPUs from making progress during data loading.\n",
363 |         "* Parallelize data loading:\n",
364 |         "  * `num_workers` → number of CPU workers to prefetch data. Typically set to the number of physical CPUs.\n",
365 |         "  * `prefetch_factor` → how many batches each worker preloads. Defaults to 2.\n",
366 |         "  * `persistent_workers=True` → reuse workers across epochs.\n",
367 |         "\n",
368 |         "Tune key parameters such as `batch_size` and `num_workers` based on hardware for optimal performance."
369 |       ],
370 |       "metadata": {
371 |         "id": "3z31QmQ1aH2i"
372 |       }
373 |     },
374 |     {
375 |       "cell_type": "code",
376 |       "source": [
377 |         "train_data_loader = (\n",
378 |         "    utils.data\n",
379 |         "         .DataLoader(\n",
380 |         "             train_dataset,\n",
381 |         "             num_workers=2,\n",
382 |         "             batch_size=128,\n",
383 |         "             shuffle=True,\n",
384 |         "             persistent_workers=True,\n",
385 |         "             pin_memory=True,\n",
386 |         "             prefetch_factor=2,\n",
387 |         "             drop_last=True,\n",
388 |         "         )\n",
389 |         ")\n",
390 |         "\n",
391 |         "val_data_loader = (\n",
392 |         "    utils.data\n",
393 |         "         .DataLoader(\n",
394 |         "             val_dataset,\n",
395 |         "             num_workers=2,\n",
396 |         "             batch_size=128,\n",
397 |         "             shuffle=True,\n",
398 |         "             persistent_workers=True,\n",
399 |         "             pin_memory=True,\n",
400 |         "             prefetch_factor=2,\n",
401 |         "             drop_last=True,\n",
402 |         "         )\n",
403 |         ")"
404 |       ],
405 |       "metadata": {
406 |         "id": "Sw7coNH4WgvT"
407 |       },
408 |       "execution_count": null,
409 |       "outputs": []
410 |     },
411 |     {
412 |       "cell_type": "markdown",
413 |       "source": [
414 |         "## Building a Mini-batch Gradient Descent training loop\n",
415 |         "\n",
416 |         "### The basics\n",
417 |         "\n",
418 |         "* PyTorch does **not** provide a built-in training loop → you write it manually.\n",
419 |         "* `model_fn.train()` sets model to training mode.\n",
420 |         "* Must move tensors to GPU **manually**!\n",
421 |         "* Use `model_fn.eval()` when evaluating or making predictions.\n",
422 |         "* `model_fn.training` → boolean flag for current mode.\n",
423 |         "\n",
424 |         "```python\n",
425 |         "model.train()\n",
426 |         "for epoch in range(n_epochs):\n",
427 |         "    for X_batch, y_batch in train_data_loader:\n",
428 |         "        X_batch = X_batch.to(DEVICE, non_blocking=True)\n",
429 |         "        y_batch = y_batch.to(DEVICE, non_blocking=True)\n",
430 |         "        ...\n",
431 |         "        loss.backward()\n",
432 |         "        optimizer.step()\n",
433 |         "        optimizer.zero_grad()\n",
434 |         "```\n",
435 |         "\n",
436 |         "### Beyond the basics\n",
437 |         "\n",
438 |         "* PyTorch provides flexibility and control making it ideal for research workflows.\n",
439 |         "* For higher-level automation of training features such as multi-GPU support, logging, and checkpointing take a look at libraries like [PyTorch Lightning](https://lightning.ai/docs/pytorch/stable/), [FastAI](https://www.fast.ai/), [Catalyst](https://catalyst-team.com/), or [Keras (v3+)](https://keras.io/)."
440 |       ],
441 |       "metadata": {
442 |         "id": "cq7uxeVnXKN9"
443 |       }
444 |     },
445 |     {
446 |       "cell_type": "code",
447 |       "source": [
448 |         "def train(\n",
449 |         "    model_fn,\n",
450 |         "    criterion,\n",
451 |         "    optimizer,\n",
452 |         "    train_data_loader,\n",
453 |         "    n_epochs,\n",
454 |         "    log_epochs=1,\n",
455 |         "    ):\n",
456 |         "\n",
457 |         "    model_fn.train()\n",
458 |         "    for epoch in range(n_epochs):\n",
459 |         "        total_loss = 0.0\n",
460 |         "        for i, (X_batch, y_batch) in enumerate(train_data_loader):\n",
461 |         "\n",
462 |         "            # move batches to device\n",
463 |         "            X_batch = X_batch.to(DEVICE, non_blocking=True)\n",
464 |         "            y_batch = y_batch.to(DEVICE, non_blocking=True)\n",
465 |         "\n",
466 |         "            # forward pass\n",
467 |         "            y_pred = model_fn(X_batch)\n",
468 |         "            train_loss = criterion(y_pred, y_batch)\n",
469 |         "            total_loss += train_loss.item()\n",
470 |         "\n",
471 |         "            # backward pass\n",
472 |         "            train_loss.backward()\n",
473 |         "\n",
474 |         "            # gradient descent step\n",
475 |         "            optimizer.step()\n",
476 |         "            optimizer.zero_grad()\n",
477 |         "\n",
478 |         "        average_loss = total_loss / len(train_data_loader)\n",
479 |         "\n",
480 |         "        if (epoch + 1) % log_epochs == 0:\n",
481 |         "            print(f\"Epoch {epoch + 1}/{n_epochs}, Training Loss: {average_loss: .4f}\")\n"
482 |       ],
483 |       "metadata": {
484 |         "id": "2L5czsBeXEol"
485 |       },
486 |       "execution_count": null,
487 |       "outputs": []
488 |     },
489 |     {
490 |       "cell_type": "markdown",
491 |       "source": [
492 |         "### Exercise:\n",
493 |         "\n",
494 |         "Implement an MLP using `nn.Sequential` that has three hidden layers with sizes 200, 100, and 50. Use `nn.ReLU` activation functions."
495 |       ],
496 |       "metadata": {
497 |         "id": "z7KUZVmGa90y"
498 |       }
499 |     },
500 |     {
501 |       "cell_type": "code",
502 |       "source": [
503 |         "# INSERT YOUR CODE HERE!"
504 |       ],
505 |       "metadata": {
506 |         "id": "8Ru93HdBcrUR"
507 |       },
508 |       "execution_count": null,
509 |       "outputs": []
510 |     },
511 |     {
512 |       "cell_type": "markdown",
513 |       "source": [
514 |         "### Solution:"
515 |       ],
516 |       "metadata": {
517 |         "id": "FPW8ZSVFctg_"
518 |       }
519 |     },
520 |     {
521 |       "cell_type": "code",
522 |       "source": [
523 |         "_ = torch.manual_seed(42)\n",
524 |         "\n",
525 |         "n_features = X_train.size(1)\n",
526 |         "n_classes = y_train.unique().size(0)\n",
527 |         "\n",
528 |         "covtype_model = nn.Sequential(\n",
529 |         "    nn.Linear(\n",
530 |         "        in_features=n_features,\n",
531 |         "        out_features=200,\n",
532 |         "        bias=True,\n",
533 |         "    ),\n",
534 |         "    nn.ReLU(),\n",
535 |         "    nn.Linear(\n",
536 |         "        in_features=200,\n",
537 |         "        out_features=100,\n",
538 |         "        bias=True,\n",
539 |         "    ),\n",
540 |         "    nn.ReLU(),\n",
541 |         "    nn.Linear(\n",
542 |         "        in_features=100,\n",
543 |         "        out_features=50,\n",
544 |         "        bias=True,\n",
545 |         "    ),\n",
546 |         "    nn.ReLU(),\n",
547 |         "    nn.Linear(\n",
548 |         "        in_features=50,\n",
549 |         "        out_features=n_classes,\n",
550 |         "        bias=True,\n",
551 |         "    ),\n",
552 |         ")\n"
553 |       ],
554 |       "metadata": {
555 |         "id": "JeWCfe_8YQXk"
556 |       },
557 |       "execution_count": null,
558 |       "outputs": []
559 |     },
560 |     {
561 |       "cell_type": "markdown",
562 |       "source": [
563 |         "### Using the GPU for Training\n",
564 |         "\n",
565 |         "* Move model and tensors to GPU for acceleration:\n",
566 |         "`model_fn = model_fn.to(DEVICE)`.\n",
567 |         "* Create the optimizer after moving the model to make sure that the optimizer state is allocated on the same device.\n",
568 |         "* Typical learning rate for mini-batch training is *smaller* than learning rate used for full-batch gradient descent (more on the relationship between learning rates and batch sizes later).\n"
569 |       ],
570 |       "metadata": {
571 |         "id": "Zin17mMhcv4N"
572 |       }
573 |     },
574 |     {
575 |       "cell_type": "code",
576 |       "source": [
577 |         "# move model to the GPU before defining your optimizer!\n",
578 |         "covtype_model.to(DEVICE)"
579 |       ],
580 |       "metadata": {
581 |         "id": "MEboUrpuf-FY"
582 |       },
583 |       "execution_count": null,
584 |       "outputs": []
585 |     },
586 |     {
587 |       "cell_type": "markdown",
588 |       "source": [
589 |         "### Exercise:\n",
590 |         "\n",
591 |         "Define an appropriate loss function and optimizer and train your model for 100 epochs using mini-batch gradient descent.\n"
592 |       ],
593 |       "metadata": {
594 |         "id": "bSjpkOLzoctF"
595 |       }
596 |     },
597 |     {
598 |       "cell_type": "code",
599 |       "source": [
600 |         "# INSERT YOUR CODE HERE!"
601 |       ],
602 |       "metadata": {
603 |         "id": "Rcfin7ZjopRp"
604 |       },
605 |       "execution_count": null,
606 |       "outputs": []
607 |     },
608 |     {
609 |       "cell_type": "markdown",
610 |       "source": [
611 |         "### Solution:"
612 |       ],
613 |       "metadata": {
614 |         "id": "hROYf7i4oq1N"
615 |       }
616 |     },
617 |     {
618 |       "cell_type": "code",
619 |       "source": [
620 |         "cross_entropy_loss = nn.CrossEntropyLoss()\n",
621 |         "\n",
622 |         "# optimizer should be defined after moving model to GPU\n",
623 |         "sgd = optim.SGD(\n",
624 |         "    covtype_model.parameters(),\n",
625 |         "    lr=1e-3\n",
626 |         ")"
627 |       ],
628 |       "metadata": {
629 |         "id": "b9wXDokIotpO"
630 |       },
631 |       "execution_count": null,
632 |       "outputs": []
633 |     },
634 |     {
635 |       "cell_type": "code",
636 |       "source": [
637 |         "train(\n",
638 |         "    covtype_model,\n",
639 |         "    cross_entropy_loss,\n",
640 |         "    sgd,\n",
641 |         "    train_data_loader,\n",
642 |         "    n_epochs=100,\n",
643 |         "    log_epochs=1,\n",
644 |         ")"
645 |       ],
646 |       "metadata": {
647 |         "id": "BYg7gSsSYaRf"
648 |       },
649 |       "execution_count": null,
650 |       "outputs": []
651 |     },
652 |     {
653 |       "cell_type": "code",
654 |       "source": [],
655 |       "metadata": {
656 |         "id": "Ry5JOPjJYdms"
657 |       },
658 |       "execution_count": null,
659 |       "outputs": []
660 |     }
661 |   ]
662 | }


--------------------------------------------------------------------------------
/notebooks/introduction-to-pytorch-part-2.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "import glob\n",
 10 |     "import pathlib\n",
 11 |     "import pickle\n",
 12 |     "import requests\n",
 13 |     "import tarfile\n",
 14 |     "import time\n",
 15 |     "\n",
 16 |     "import joblib\n",
 17 |     "import matplotlib.pyplot as plt\n",
 18 |     "import numpy as np\n",
 19 |     "import pandas as pd\n",
 20 |     "from sklearn import model_selection\n",
 21 |     "import torch\n",
 22 |     "from torch import nn, optim\n",
 23 |     "import torch.nn.functional as F\n",
 24 |     "from torch.utils import data\n",
 25 |     "import torchinfo\n",
 26 |     "import torchmetrics\n",
 27 |     "from torchvision import models, transforms"
 28 |    ]
 29 |   },
 30 |   {
 31 |    "cell_type": "markdown",
 32 |    "metadata": {},
 33 |    "source": [
 34 |     "# Training Deep Neural Networks using GPUs"
 35 |    ]
 36 |   },
 37 |   {
 38 |    "cell_type": "markdown",
 39 |    "metadata": {},
 40 |    "source": [
 41 |     "# Data\n",
 42 |     "\n",
 43 |     "## CIFAR-10 Dataset\n",
 44 |     "\n",
 45 |     "The original [CIFAR-10](http://www.cs.toronto.edu/~kriz/cifar.html) dataset consists of 60000 32x32 colour images in 10 classes, with 6000 images per class. There are 50000 training images and 10000 test images. The dataset is divided into five training batches and one test batch, each with 10000 images. The test batch contains exactly 1000 randomly-selected images from each class. The training batches contain the remaining images in random order, but some training batches may contain more images from one class than another. Between them, the training batches contain exactly 5000 images from each class."
 46 |    ]
 47 |   },
 48 |   {
 49 |    "cell_type": "code",
 50 |    "execution_count": null,
 51 |    "metadata": {},
 52 |    "outputs": [],
 53 |    "source": [
 54 |     "CLASS_LABELS = {\n",
 55 |     "    0: \"airplane\",\n",
 56 |     "    1: \"automobile\",\n",
 57 |     "    2: \"bird\",\n",
 58 |     "    3: \"cat\",\n",
 59 |     "    4: \"deer\",\n",
 60 |     "    5: \"dog\",\n",
 61 |     "    6: \"frog\",\n",
 62 |     "    7: \"horse\",\n",
 63 |     "    8: \"ship\",\n",
 64 |     "    9: \"truck\"\n",
 65 |     "}"
 66 |    ]
 67 |   },
 68 |   {
 69 |    "cell_type": "markdown",
 70 |    "metadata": {},
 71 |    "source": [
 72 |     "### Download and extract the data"
 73 |    ]
 74 |   },
 75 |   {
 76 |    "cell_type": "code",
 77 |    "execution_count": null,
 78 |    "metadata": {},
 79 |    "outputs": [],
 80 |    "source": [
 81 |     "DATA_DIR = pathlib.Path(\"../data/\")\n",
 82 |     "RAW_DATA_DIR = DATA_DIR / \"cifar-10\"\n",
 83 |     "URL = \"https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz\"\n",
 84 |     "\n",
 85 |     "\n",
 86 |     "RAW_DATA_DIR.mkdir(parents=True, exist_ok=True)\n",
 87 |     "\n",
 88 |     "with open(RAW_DATA_DIR / \"cifar-10-python.tar.gz\", \"wb\") as f:\n",
 89 |     "    response = requests.get(URL)\n",
 90 |     "    f.write(response.content)\n",
 91 |     "\n",
 92 |     "with tarfile.open(RAW_DATA_DIR / \"cifar-10-python.tar.gz\", \"r:gz\") as f:\n",
 93 |     "    f.extractall(RAW_DATA_DIR)\n"
 94 |    ]
 95 |   },
 96 |   {
 97 |    "cell_type": "markdown",
 98 |    "metadata": {},
 99 |    "source": [
100 |     "### Load the data\n",
101 |     "\n",
102 |     "We will load the data using the [Pandas](https://pandas.pydata.org/) library. We highly recommend the most recent edition of [*Python for Data Analysis*](https://learning.oreilly.com/library/view/python-for-data/9781491957653/) by Pandas creator Wes McKinney for anyone interested in learning how to use Pandas."
103 |    ]
104 |   },
105 |   {
106 |    "cell_type": "code",
107 |    "execution_count": null,
108 |    "metadata": {},
109 |    "outputs": [],
110 |    "source": [
111 |     "_data = []\n",
112 |     "_labels = []\n",
113 |     "filepaths = glob.glob(\"../data/cifar-10/cifar-10-batches-py/*_batch*\")\n",
114 |     "for filepath in sorted(filepaths):\n",
115 |     "    with open(filepath, \"rb\") as f:\n",
116 |     "        _batch = pickle.load(f, encoding=\"latin1\")\n",
117 |     "        _data.append(_batch[\"data\"])\n",
118 |     "        _labels.extend(_batch[\"labels\"])\n",
119 |     "\n",
120 |     "# each image has 3 channels with height and width of 32 pixels\n",
121 |     "features = pd.DataFrame(\n",
122 |     "    np.vstack(_data),\n",
123 |     "    columns=[f\"p{i}\" for i in range(3 * 32 * 32)],\n",
124 |     "    dtype=\"uint8\",\n",
125 |     ")\n",
126 |     "target = pd.Series(_labels, dtype=\"uint8\", name=\"labels\")"
127 |    ]
128 |   },
129 |   {
130 |    "cell_type": "markdown",
131 |    "metadata": {},
132 |    "source": [
133 |     "### Explore the data"
134 |    ]
135 |   },
136 |   {
137 |    "cell_type": "code",
138 |    "execution_count": null,
139 |    "metadata": {},
140 |    "outputs": [],
141 |    "source": [
142 |     "features.info()"
143 |    ]
144 |   },
145 |   {
146 |    "cell_type": "code",
147 |    "execution_count": null,
148 |    "metadata": {},
149 |    "outputs": [],
150 |    "source": [
151 |     "features.head()"
152 |    ]
153 |   },
154 |   {
155 |    "cell_type": "code",
156 |    "execution_count": null,
157 |    "metadata": {},
158 |    "outputs": [],
159 |    "source": [
160 |     "target.head()"
161 |    ]
162 |   },
163 |   {
164 |    "cell_type": "markdown",
165 |    "metadata": {},
166 |    "source": [
167 |     "### Visualize the data"
168 |    ]
169 |   },
170 |   {
171 |    "cell_type": "code",
172 |    "execution_count": null,
173 |    "metadata": {},
174 |    "outputs": [],
175 |    "source": [
176 |     "fig, axes = plt.subplots(10, 10, sharex=True, sharey=True, figsize=(15, 15))\n",
177 |     "for i in range(10):\n",
178 |     "    for j in range(10):\n",
179 |     "        m, _ = features.shape\n",
180 |     "        k = np.random.randint(m)\n",
181 |     "        img = (features.loc[k, :]\n",
182 |     "                       .to_numpy()\n",
183 |     "                       .reshape((3, 32, 32))\n",
184 |     "                       .transpose(1, 2, 0))\n",
185 |     "        _ = axes[i, j].imshow(img)\n",
186 |     "        _ = axes[i, j].set_title(CLASS_LABELS[target[k]])\n",
187 |     "\n",
188 |     "fig.suptitle(\"Random CIFAR-10 images\", x=0.5, y=1.0, fontsize=25)\n",
189 |     "fig.tight_layout()"
190 |    ]
191 |   },
192 |   {
193 |    "cell_type": "markdown",
194 |    "metadata": {},
195 |    "source": [
196 |     "# Creating Train, Val, and Test Data\n",
197 |     "\n",
198 |     "Before we look at the data any further, we need to create a test set, put it aside, and never look at it (until we are ready to test our trained machine learning model!). Why? We don't want our machine learning model to memorize our dataset (this is called overfitting). Instead we want a model that will generalize well (i.e., make good predictions) for inputs that it didn't see during training. To do this we split our dataset into training and testing datasets. The training dataset will be used to train our machine learning model(s) and the testing dataset will be used to make a final evaluation of our machine learning model(s). We also need to create a validation dataset for tuning hyperparameters and deciding when to stop training.\n",
199 |     "\n",
200 |     "## If you might refresh data in the future...\n",
201 |     "\n",
202 |     "...then you want to use a fixed hashing function (the code below uses CRC32) to compute the hash of a unique identifier for each observation and include the observation in the test set if the resulting hash value is less than some fixed fraction of the maximum possible hash value for that hash function. This way, even if you fetch more data, your test set will never include data that was previously included in the training data."
203 |    ]
204 |   },
205 |   {
206 |    "cell_type": "code",
207 |    "execution_count": null,
208 |    "metadata": {},
209 |    "outputs": [],
210 |    "source": [
211 |     "import zlib\n",
212 |     "\n",
213 |     "\n",
214 |     "def in_holdout_data(identifier, test_size):\n",
215 |     "    _hash = zlib.crc32(bytes(identifier))\n",
216 |     "    return _hash & 0xffffffff < test_size * 2**32\n",
217 |     "\n",
218 |     "\n",
219 |     "def split_data_by_id(data, test_size, id_column):\n",
220 |     "    ids = data[id_column]\n",
221 |     "    in_holdout_set = ids.apply(lambda identifier: in_holdout_data(identifier, test_size))\n",
222 |     "    return data.loc[~in_holdout_set], data.loc[in_holdout_set]\n"
223 |    ]
224 |   },
225 |   {
226 |    "cell_type": "markdown",
227 |    "metadata": {},
228 |    "source": [
229 |     "## If this is all the data you will ever have...\n",
230 |     "\n",
231 |     "...then you can just set a seed for the random number generator and then randomly split the data. Scikit-Learn has a [`model_selection`](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html) module that contains tools for splitting datasets. First, split the dataset into training and testing datasets. Next split the training dataset into training and validation datasets."
232 |    ]
233 |   },
234 |   {
235 |    "cell_type": "code",
236 |    "execution_count": null,
237 |    "metadata": {},
238 |    "outputs": [],
239 |    "source": [
240 |     "SEED = 42\n",
241 |     "SEED_GENERATOR = np.random.RandomState(SEED)\n",
242 |     "\n",
243 |     "\n",
244 |     "def generate_seed():\n",
245 |     "    return SEED_GENERATOR.randint(np.iinfo(\"uint16\").max)"
246 |    ]
247 |   },
248 |   {
249 |    "cell_type": "code",
250 |    "execution_count": null,
251 |    "metadata": {},
252 |    "outputs": [],
253 |    "source": [
254 |     "# split the dataset into training and testing data\n",
255 |     "_seed = generate_seed()\n",
256 |     "_random_state = np.random.RandomState(_seed)\n",
257 |     "_train_features, test_features, _train_target, test_target = model_selection.train_test_split(\n",
258 |     "    features,\n",
259 |     "    target,\n",
260 |     "    test_size=1e-1,\n",
261 |     "    random_state=_random_state\n",
262 |     ")\n",
263 |     "\n",
264 |     "train_features, val_features, train_target, val_target = model_selection.train_test_split(\n",
265 |     "    _train_features,\n",
266 |     "    _train_target,\n",
267 |     "    test_size=1e-1,\n",
268 |     "    random_state=_random_state\n",
269 |     ")"
270 |    ]
271 |   },
272 |   {
273 |    "cell_type": "code",
274 |    "execution_count": null,
275 |    "metadata": {},
276 |    "outputs": [],
277 |    "source": [
278 |     "train_features.info()"
279 |    ]
280 |   },
281 |   {
282 |    "cell_type": "code",
283 |    "execution_count": null,
284 |    "metadata": {},
285 |    "outputs": [],
286 |    "source": [
287 |     "val_features.info()"
288 |    ]
289 |   },
290 |   {
291 |    "cell_type": "code",
292 |    "execution_count": null,
293 |    "metadata": {},
294 |    "outputs": [],
295 |    "source": [
296 |     "test_features.info()"
297 |    ]
298 |   },
299 |   {
300 |    "cell_type": "markdown",
301 |    "metadata": {},
302 |    "source": [
303 |     "# Training a Neural Network\n",
304 |     "\n",
305 |     "When working with GPUs we need to tell PyTorch which device to use when training."
306 |    ]
307 |   },
308 |   {
309 |    "cell_type": "code",
310 |    "execution_count": null,
311 |    "metadata": {},
312 |    "outputs": [],
313 |    "source": [
314 |     "device = torch.device(\"cuda\") if torch.cuda.is_available() else torch.device(\"cpu\")"
315 |    ]
316 |   },
317 |   {
318 |    "cell_type": "markdown",
319 |    "metadata": {},
320 |    "source": [
321 |     "Next we need to define the components of our training loop that we developed in this morning's session."
322 |    ]
323 |   },
324 |   {
325 |    "cell_type": "code",
326 |    "execution_count": null,
327 |    "metadata": {},
328 |    "outputs": [],
329 |    "source": [
330 |     "def accuracy(output, target):\n",
331 |     "    return torchmetrics.functional.accuracy(output, target)\n",
332 |     "\n",
333 |     "\n",
334 |     "def partial_fit(model_fn, loss_fn, X_batch, y_batch, opt):\n",
335 |     "    # forward pass\n",
336 |     "    loss = loss_fn(model_fn(X_batch), y_batch)\n",
337 |     "\n",
338 |     "    # back propagation\n",
339 |     "    loss.backward()\n",
340 |     "    opt.step()\n",
341 |     "    opt.zero_grad() # don't forget to reset the gradient after each batch!\n",
342 |     "    \n",
343 |     "\n",
344 |     "def validate(model_fn, loss_fn, data_loader):\n",
345 |     "    with torch.no_grad():\n",
346 |     "\n",
347 |     "        batch_accs = []\n",
348 |     "        batch_losses = []\n",
349 |     "        \n",
350 |     "        for X, y in data_loader:\n",
351 |     "            batch_accs.append(accuracy(model_fn(X), y))\n",
352 |     "            batch_losses.append(loss_fn(model_fn(X), y))\n",
353 |     "        \n",
354 |     "        avg_accuracy = (torch.stack(batch_accs)\n",
355 |     "                             .mean())\n",
356 |     "        avg_loss = (torch.stack(batch_losses)\n",
357 |     "                         .mean())\n",
358 |     "\n",
359 |     "    return avg_accuracy, avg_loss\n",
360 |     "\n",
361 |     "\n",
362 |     "def fit(model_fn, loss_fn, train_data_loader, opt, lr_scheduler, val_data_loader=None, number_epochs=2):\n",
363 |     "    \n",
364 |     "    for epoch in range(number_epochs):\n",
365 |     "        # train the model\n",
366 |     "        model_fn.train()\n",
367 |     "        for X_batch, y_batch in train_data_loader:\n",
368 |     "            partial_fit(model_fn, loss_fn, X_batch, y_batch, opt)\n",
369 |     "        \n",
370 |     "        # compute validation loss after each training epoch\n",
371 |     "        model_fn.eval()\n",
372 |     "        if val_data_loader is not None:\n",
373 |     "            val_acc, val_loss = validate(model_fn, loss_fn, val_data_loader)\n",
374 |     "        print(f\"Training epoch: {epoch}, Validation accuracy: {val_acc}, Validation loss: {val_loss}\")\n",
375 |     "\n",
376 |     "        # update the learning rate\n",
377 |     "        lr_scheduler.step()"
378 |    ]
379 |   },
380 |   {
381 |    "cell_type": "markdown",
382 |    "metadata": {},
383 |    "source": [
384 |     "In this section we introduce a `CustomDataset` to better encapsulate data preprocessing transformations using PyTorch primitives instead of Scikit-Learn. We also reuse the `LambdaLayer` and the `WrappedDataLoader` classes from this morning's session. However, instead of using the `WrappedDataLoader` to implement data preprocessing steps, we will use the class to send our training data batches from the CPU to the GPU during the training loop."
385 |    ]
386 |   },
387 |   {
388 |    "cell_type": "code",
389 |    "execution_count": null,
390 |    "metadata": {},
391 |    "outputs": [],
392 |    "source": [
393 |     "class CustomDataset(data.Dataset):\n",
394 |     "    \n",
395 |     "    def __init__(self, features, target, transforms = None):\n",
396 |     "        self._data = (features.to_numpy()\n",
397 |     "                              .reshape(-1, 3, 32, 32)\n",
398 |     "                              .transpose(0, 2, 3, 1))\n",
399 |     "        self._target = target.to_numpy()\n",
400 |     "        self._transforms = transforms\n",
401 |     "        \n",
402 |     "    def __getitem__(self, index):\n",
403 |     "        X, y = self._data[index], self._target[index]\n",
404 |     "        return (self._transforms(X), y) if self._transforms is not None else (X, y)\n",
405 |     "        \n",
406 |     "    def __len__(self):\n",
407 |     "        return len(self._data)\n",
408 |     "\n",
409 |     "\n",
410 |     "class LambdaLayer(nn.Module):\n",
411 |     "    \n",
412 |     "    def __init__(self, f):\n",
413 |     "        super().__init__()\n",
414 |     "        self._f = f\n",
415 |     "        \n",
416 |     "    def forward(self, X):\n",
417 |     "        return self._f(X)\n",
418 |     "\n",
419 |     "\n",
420 |     "class WrappedDataLoader:\n",
421 |     "    \n",
422 |     "    def __init__(self, data_loader, f):\n",
423 |     "        self._data_loader = data_loader\n",
424 |     "        self._f = f\n",
425 |     "        \n",
426 |     "    def __len__(self):\n",
427 |     "        return len(self._data_loader)\n",
428 |     "    \n",
429 |     "    def __iter__(self):\n",
430 |     "        for batch in iter(self._data_loader):\n",
431 |     "            yield self._f(*batch)\n"
432 |    ]
433 |   },
434 |   {
435 |    "cell_type": "markdown",
436 |    "metadata": {},
437 |    "source": [
438 |     "## Defining the LeNet-5 architecture"
439 |    ]
440 |   },
441 |   {
442 |    "cell_type": "code",
443 |    "execution_count": null,
444 |    "metadata": {},
445 |    "outputs": [],
446 |    "source": [
447 |     "model_fn = nn.Sequential(\n",
448 |     "    nn.Conv2d(3, 6, kernel_size=5, stride=1, padding=0),\n",
449 |     "    nn.Tanh(),\n",
450 |     "    nn.AvgPool2d(kernel_size=2, stride=2),\n",
451 |     "    nn.Conv2d(6, 16, kernel_size=5, stride=1, padding=0),\n",
452 |     "    nn.Tanh(),\n",
453 |     "    nn.AvgPool2d(kernel_size=2, stride=2),\n",
454 |     "    LambdaLayer(lambda X: X.view(X.size(0), -1)),\n",
455 |     "    nn.Linear(400, 120),\n",
456 |     "    nn.Tanh(),\n",
457 |     "    nn.Linear(120, 84),\n",
458 |     "    nn.Tanh(),\n",
459 |     "    nn.Linear(84, 10)\n",
460 |     ")\n",
461 |     "_ = model_fn.to(device)"
462 |    ]
463 |   },
464 |   {
465 |    "cell_type": "code",
466 |    "execution_count": null,
467 |    "metadata": {},
468 |    "outputs": [],
469 |    "source": [
470 |     "torchinfo.summary(model_fn, input_size=(64, 3, 32, 32))"
471 |    ]
472 |   },
473 |   {
474 |    "cell_type": "markdown",
475 |    "metadata": {},
476 |    "source": [
477 |     "## Train the neural network"
478 |    ]
479 |   },
480 |   {
481 |    "cell_type": "code",
482 |    "execution_count": null,
483 |    "metadata": {},
484 |    "outputs": [],
485 |    "source": [
486 |     "# use same loss function from last time\n",
487 |     "loss_fn = F.cross_entropy\n",
488 |     "\n",
489 |     "# define some preprocessing transforms (done on CPU!)\n",
490 |     "_transforms = transforms.Compose([\n",
491 |     "    transforms.ToTensor(),\n",
492 |     "])\n",
493 |     "\n",
494 |     "# move the tensor from the CPU to the GPU\n",
495 |     "_to_device = lambda X, y: (X.to(device), y.to(device))\n",
496 |     "\n",
497 |     "# define the datasets and dataloaders\n",
498 |     "_train_dataset = CustomDataset(train_features, train_target, _transforms)\n",
499 |     "_train_data_loader = data.DataLoader(_train_dataset, batch_size=64, shuffle=True)\n",
500 |     "train_data_loader = WrappedDataLoader(_train_data_loader, _to_device)\n",
501 |     "\n",
502 |     "_val_dataset = CustomDataset(val_features, val_target, _transforms)\n",
503 |     "_val_data_loader = data.DataLoader(_val_dataset, batch_size=128, shuffle=False)\n",
504 |     "val_data_loader = WrappedDataLoader(_val_data_loader, _to_device)\n",
505 |     "\n",
506 |     "_test_dataset = CustomDataset(test_features, test_target, _transforms)\n",
507 |     "_test_data_loader = data.DataLoader(_test_dataset, batch_size=128, shuffle=False)\n",
508 |     "test_data_loader = WrappedDataLoader(_test_data_loader, _to_device)\n",
509 |     "\n",
510 |     "# define the optimizer and the learning rate scheduler\n",
511 |     "opt = optim.SGD(model_fn.parameters(), lr=1e-2, momentum=0.9)\n",
512 |     "lr_scheduler = optim.lr_scheduler.ExponentialLR(opt, gamma=0.9, verbose=True)"
513 |    ]
514 |   },
515 |   {
516 |    "cell_type": "code",
517 |    "execution_count": null,
518 |    "metadata": {},
519 |    "outputs": [],
520 |    "source": [
521 |     "fit(model_fn,\n",
522 |     "    loss_fn,\n",
523 |     "    train_data_loader,\n",
524 |     "    opt,\n",
525 |     "    lr_scheduler,\n",
526 |     "    val_data_loader,\n",
527 |     "    number_epochs=10)"
528 |    ]
529 |   },
530 |   {
531 |    "cell_type": "code",
532 |    "execution_count": null,
533 |    "metadata": {},
534 |    "outputs": [],
535 |    "source": [
536 |     "average_accuracy, average_loss = validate(model_fn, loss_fn, test_data_loader)"
537 |    ]
538 |   },
539 |   {
540 |    "cell_type": "markdown",
541 |    "metadata": {},
542 |    "source": [
543 |     "### Exercise: Build your own neural network\n",
544 |     "\n",
545 |     "Modify the LeNet-5 architecture as you see fit in order to gain experience building your own neural network."
546 |    ]
547 |   },
548 |   {
549 |    "cell_type": "code",
550 |    "execution_count": null,
551 |    "metadata": {},
552 |    "outputs": [],
553 |    "source": [
554 |     "# insert code here!"
555 |    ]
556 |   },
557 |   {
558 |    "cell_type": "markdown",
559 |    "metadata": {},
560 |    "source": [
561 |     "### Exercise: Experiment with different batch sizes\n",
562 |     "\n",
563 |     "Train your model for 10 epochs with different batch sizes: 1, 4, 16, 64, 256. Do you notice any patterns?"
564 |    ]
565 |   },
566 |   {
567 |    "cell_type": "code",
568 |    "execution_count": null,
569 |    "metadata": {},
570 |    "outputs": [],
571 |    "source": [
572 |     "# insert code here!"
573 |    ]
574 |   },
575 |   {
576 |    "cell_type": "markdown",
577 |    "metadata": {},
578 |    "source": [
579 |     "### Exercise: Experiment with different learning rate schedulers\n",
580 |     "\n",
581 |     "Train your model for 10 epochs with a batch size of 64, but experiment with different learning rate schedulers. Does one learning rate scheduler outperform the others?"
582 |    ]
583 |   },
584 |   {
585 |    "cell_type": "code",
586 |    "execution_count": null,
587 |    "metadata": {},
588 |    "outputs": [],
589 |    "source": [
590 |     "# insert code here!"
591 |    ]
592 |   },
593 |   {
594 |    "cell_type": "markdown",
595 |    "metadata": {},
596 |    "source": [
597 |     "## Experimenting with different architectures\n",
598 |     "\n",
599 |     "In practice, it is unlikely that you will be designing your own neural network architectures from scratch. Instead you will be starting from some pre-existing neural network architecture. The [torchvision](https://pytorch.org/vision/stable/) project contains a number of neural network architectures that have found widespread use in computer vision applications.\n",
600 |     "\n",
601 |     "For the remainder of this notebook we will be using the [ResNet-18](https://arxiv.org/pdf/1512.03385.pdf) model which was developed in 2015. The ResNet family of models were designed to be trained on larger images (224 x 224) and a larger number of classes (1000) so we need to make some small modifications in order to adapt this network for our dataset."
602 |    ]
603 |   },
604 |   {
605 |    "cell_type": "code",
606 |    "execution_count": null,
607 |    "metadata": {},
608 |    "outputs": [],
609 |    "source": [
610 |     "models."
611 |    ]
612 |   },
613 |   {
614 |    "cell_type": "code",
615 |    "execution_count": null,
616 |    "metadata": {},
617 |    "outputs": [],
618 |    "source": [
619 |     "model_fn = models.resnet18(num_classes=10)\n",
620 |     "model_fn.conv1 = nn.Conv2d(3, 64, kernel_size=(3, 3), stride=(1,1), padding=(1,1), bias=False)\n",
621 |     "_ = model_fn.to(device)"
622 |    ]
623 |   },
624 |   {
625 |    "cell_type": "code",
626 |    "execution_count": null,
627 |    "metadata": {},
628 |    "outputs": [],
629 |    "source": [
630 |     "torchinfo.summary(model_fn, input_size=(64, 3, 32, 32))"
631 |    ]
632 |   },
633 |   {
634 |    "cell_type": "code",
635 |    "execution_count": null,
636 |    "metadata": {},
637 |    "outputs": [],
638 |    "source": [
639 |     "# use same loss function from last time\n",
640 |     "loss_fn = F.cross_entropy\n",
641 |     "\n",
642 |     "# define some preprocessing transforms (done on CPU!)\n",
643 |     "_transforms = transforms.Compose([\n",
644 |     "    transforms.ToTensor(),\n",
645 |     "])\n",
646 |     "\n",
647 |     "# move the tensor from the CPU to the GPU\n",
648 |     "_to_device = lambda X, y: (X.to(device), y.to(device))\n",
649 |     "\n",
650 |     "# define the datasets and dataloaders\n",
651 |     "_train_dataset = CustomDataset(train_features, train_target, _transforms)\n",
652 |     "_train_data_loader = data.DataLoader(_train_dataset, batch_size=128, shuffle=True)\n",
653 |     "train_data_loader = WrappedDataLoader(_train_data_loader, _to_device)\n",
654 |     "\n",
655 |     "_val_dataset = CustomDataset(val_features, val_target, _transforms)\n",
656 |     "_val_data_loader = data.DataLoader(_val_dataset, batch_size=256, shuffle=False)\n",
657 |     "val_data_loader = WrappedDataLoader(_val_data_loader, _to_device)\n",
658 |     "\n",
659 |     "_test_dataset = CustomDataset(test_features, test_target, _transforms)\n",
660 |     "_test_data_loader = data.DataLoader(_test_dataset, batch_size=256, shuffle=False)\n",
661 |     "test_data_loader = WrappedDataLoader(_test_data_loader, _to_device)\n",
662 |     "\n",
663 |     "# define the optimizer and the learning rate scheduler\n",
664 |     "opt = optim.SGD(model_fn.parameters(), lr=1e-1, momentum=0.9)\n",
665 |     "lr_scheduler = optim.lr_scheduler.StepLR(opt, step_size=2, gamma=0.1, verbose=True)"
666 |    ]
667 |   },
668 |   {
669 |    "cell_type": "code",
670 |    "execution_count": null,
671 |    "metadata": {},
672 |    "outputs": [],
673 |    "source": [
674 |     "fit(model_fn,\n",
675 |     "    loss_fn,\n",
676 |     "    train_data_loader,\n",
677 |     "    opt,\n",
678 |     "    lr_scheduler,\n",
679 |     "    val_data_loader,\n",
680 |     "    number_epochs=20)"
681 |    ]
682 |   },
683 |   {
684 |    "cell_type": "code",
685 |    "execution_count": null,
686 |    "metadata": {},
687 |    "outputs": [],
688 |    "source": [
689 |     "average_accuracy, average_loss = validate(model_fn, loss_fn, test_data_loader)"
690 |    ]
691 |   },
692 |   {
693 |    "cell_type": "code",
694 |    "execution_count": null,
695 |    "metadata": {},
696 |    "outputs": [],
697 |    "source": [
698 |     "average_accuracy, average_loss"
699 |    ]
700 |   }
701 |  ],
702 |  "metadata": {
703 |   "kernelspec": {
704 |    "display_name": "Python 3 (ipykernel)",
705 |    "language": "python",
706 |    "name": "python3"
707 |   },
708 |   "language_info": {
709 |    "codemirror_mode": {
710 |     "name": "ipython",
711 |     "version": 3
712 |    },
713 |    "file_extension": ".py",
714 |    "mimetype": "text/x-python",
715 |    "name": "python",
716 |    "nbconvert_exporter": "python",
717 |    "pygments_lexer": "ipython3",
718 |    "version": "3.9.12"
719 |   }
720 |  },
721 |  "nbformat": 4,
722 |  "nbformat_minor": 4
723 | }
724 | 


--------------------------------------------------------------------------------
/notebooks/02d-building-an-image-classifier-with-pytorch.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |   "nbformat": 4,
  3 |   "nbformat_minor": 0,
  4 |   "metadata": {
  5 |     "colab": {
  6 |       "provenance": [],
  7 |       "gpuType": "T4",
  8 |       "authorship_tag": "ABX9TyNUxMjNrCeLvqVJq1oqx97A",
  9 |       "include_colab_link": true
 10 |     },
 11 |     "kernelspec": {
 12 |       "name": "python3",
 13 |       "display_name": "Python 3"
 14 |     },
 15 |     "language_info": {
 16 |       "name": "python"
 17 |     },
 18 |     "accelerator": "GPU"
 19 |   },
 20 |   "cells": [
 21 |     {
 22 |       "cell_type": "markdown",
 23 |       "metadata": {
 24 |         "id": "view-in-github",
 25 |         "colab_type": "text"
 26 |       },
 27 |       "source": [
 28 |         "<a href=\"https://colab.research.google.com/github/davidrpugh/introduction-to-deep-learning/blob/master/notebooks/02d-building-an-image-classifier-with-pytorch.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
 29 |       ]
 30 |     },
 31 |     {
 32 |       "cell_type": "markdown",
 33 |       "source": [
 34 |         "# Building an Image Classifier with PyTorch"
 35 |       ],
 36 |       "metadata": {
 37 |         "id": "KsEGsVUYr9RI"
 38 |       }
 39 |     },
 40 |     {
 41 |       "cell_type": "code",
 42 |       "source": [
 43 |         "%%bash\n",
 44 |         "\n",
 45 |         "pip install --upgrade torchmetrics"
 46 |       ],
 47 |       "metadata": {
 48 |         "id": "hpSvDgm4VJsF"
 49 |       },
 50 |       "execution_count": null,
 51 |       "outputs": []
 52 |     },
 53 |     {
 54 |       "cell_type": "code",
 55 |       "source": [
 56 |         "import pathlib\n",
 57 |         "\n",
 58 |         "\n",
 59 |         "import pandas as pd\n",
 60 |         "import torch\n",
 61 |         "from torch import nn, optim, utils\n",
 62 |         "import torchmetrics\n",
 63 |         "import torchvision\n",
 64 |         "import torchvision.transforms.v2 as T\n",
 65 |         "\n",
 66 |         "\n",
 67 |         "# default linewidth is 80 characters\n",
 68 |         "torch.set_printoptions(linewidth=120)\n"
 69 |       ],
 70 |       "metadata": {
 71 |         "id": "An9wEigQthCk"
 72 |       },
 73 |       "execution_count": null,
 74 |       "outputs": []
 75 |     },
 76 |     {
 77 |       "cell_type": "markdown",
 78 |       "source": [
 79 |         "## Verifying availability of GPU(s)"
 80 |       ],
 81 |       "metadata": {
 82 |         "id": "e-PKAkew95Fd"
 83 |       }
 84 |     },
 85 |     {
 86 |       "cell_type": "code",
 87 |       "source": [
 88 |         "print(torch.__version__)"
 89 |       ],
 90 |       "metadata": {
 91 |         "id": "goFR55KH9-GA"
 92 |       },
 93 |       "execution_count": null,
 94 |       "outputs": []
 95 |     },
 96 |     {
 97 |       "cell_type": "code",
 98 |       "source": [
 99 |         "%%bash\n",
100 |         "\n",
101 |         "nvidia-smi"
102 |       ],
103 |       "metadata": {
104 |         "id": "tRlezaJZv1Lw"
105 |       },
106 |       "execution_count": null,
107 |       "outputs": []
108 |     },
109 |     {
110 |       "cell_type": "code",
111 |       "source": [
112 |         "print(torch.cuda.is_available())"
113 |       ],
114 |       "metadata": {
115 |         "id": "iwcTFs9BITaO"
116 |       },
117 |       "execution_count": null,
118 |       "outputs": []
119 |     },
120 |     {
121 |       "cell_type": "code",
122 |       "source": [
123 |         "DEVICE = torch.device(\"cuda\") if torch.cuda.is_available() else torch.device(\"cpu\")"
124 |       ],
125 |       "metadata": {
126 |         "id": "PjWe6YMev5YG"
127 |       },
128 |       "execution_count": null,
129 |       "outputs": []
130 |     },
131 |     {
132 |       "cell_type": "code",
133 |       "source": [
134 |         "print(DEVICE)"
135 |       ],
136 |       "metadata": {
137 |         "id": "xeiUkDPzv9VA"
138 |       },
139 |       "execution_count": null,
140 |       "outputs": []
141 |     },
142 |     {
143 |       "cell_type": "markdown",
144 |       "source": [
145 |         "## Loading the Fashion MNIST dataset using TorchVision\n",
146 |         "\n",
147 |         "[TorchVision](https://docs.pytorch.org/vision/stable/index.html) is a core PyTorch library for computer vision. TorchVision provides:\n",
148 |         "\n",
149 |         "* Tools to download common datasets (e.g., MNIST, FashionMNIST).\n",
150 |         "* Pretrained models for vision tasks.\n",
151 |         "* Image transformations (crop, rotate, resize, etc.).\n",
152 |         "\n",
153 |         "TorchVision is preinstalled on Google Colab and Kaggle, making it easy to use in teaching and research.\n",
154 |         "\n",
155 |         "### Fashion MNIST\n",
156 |         "\n",
157 |         "The Fashion MNIST dataset has the same structure as the familiar MNIST dataset.\n",
158 |         "\n",
159 |         "* 60,000 training images\n",
160 |         "* 10,000 test images\n",
161 |         "* Images are single channel (i.e., grayscale) images with 28 x 28 = 784 pixels.\n",
162 |         "\n",
163 |         "### Image Preprocessing with Transforms\n",
164 |         "\n",
165 |         "* TorchVision datasets accept a `transform` argument for preprocessing.\n",
166 |         "* Common transforms: scaling, normalization, cropping, etc.\n",
167 |         "* Use `Compose` to chain multiple transforms.\n",
168 |         "* `ToImage`: converts input to a Tensor image.\n",
169 |         "* `ToDtype`: converts to float32 and scales pixel values to [0.0, 1.0].\n",
170 |         "\n",
171 |         "**Be sure to use version 2 of the TorchVision transforms (i.e., `torchvision.transforms.v2`)! Version 2 is much faster, has more transforms and features, and is backward-compatible with version 1.**"
172 |       ],
173 |       "metadata": {
174 |         "id": "PF1b4nfNtjFX"
175 |       }
176 |     },
177 |     {
178 |       "cell_type": "code",
179 |       "source": [
180 |         "DATA_DIR = pathlib.Path(\"./sample_data\")\n",
181 |         "\n",
182 |         "\n",
183 |         "to_tensor = T.Compose([\n",
184 |         "    T.ToImage(),\n",
185 |         "    T.ToDtype(torch.float32, scale=True),\n",
186 |         "])\n",
187 |         "\n",
188 |         "\n",
189 |         "train_val_dataset = (\n",
190 |         "    torchvision.datasets\n",
191 |         "               .FashionMNIST(\n",
192 |         "                   DATA_DIR,\n",
193 |         "                   train=True,\n",
194 |         "                   download=True,\n",
195 |         "                   transform=to_tensor\n",
196 |         "               )\n",
197 |         ")\n",
198 |         "\n",
199 |         "test_dataset = (\n",
200 |         "    torchvision.datasets\n",
201 |         "               .FashionMNIST(\n",
202 |         "                   DATA_DIR,\n",
203 |         "                   train=False,\n",
204 |         "                   download=True,\n",
205 |         "                   transform=to_tensor\n",
206 |         "               )\n",
207 |         ")"
208 |       ],
209 |       "metadata": {
210 |         "id": "wtYip6U7r7-F"
211 |       },
212 |       "execution_count": null,
213 |       "outputs": []
214 |     },
215 |     {
216 |       "cell_type": "code",
217 |       "execution_count": null,
218 |       "metadata": {
219 |         "id": "U1eg9hGsr29N"
220 |       },
221 |       "outputs": [],
222 |       "source": [
223 |         "%%bash\n",
224 |         "\n",
225 |         "ls ./sample_data/FashionMNIST/raw"
226 |       ]
227 |     },
228 |     {
229 |       "cell_type": "code",
230 |       "source": [
231 |         "X0, y0 = train_val_dataset[0]\n",
232 |         "print(X0.shape)\n",
233 |         "print(X0.dtype)"
234 |       ],
235 |       "metadata": {
236 |         "id": "Zit8a1s4_7Q1"
237 |       },
238 |       "execution_count": null,
239 |       "outputs": []
240 |     },
241 |     {
242 |       "cell_type": "code",
243 |       "source": [
244 |         "train_val_dataset.classes[y0]"
245 |       ],
246 |       "metadata": {
247 |         "id": "x68p4njAAjQI"
248 |       },
249 |       "execution_count": null,
250 |       "outputs": []
251 |     },
252 |     {
253 |       "cell_type": "markdown",
254 |       "source": [
255 |         "## Prepare the data"
256 |       ],
257 |       "metadata": {
258 |         "id": "_7Svkhhl-0so"
259 |       }
260 |     },
261 |     {
262 |       "cell_type": "markdown",
263 |       "source": [
264 |         "### Train/Val split"
265 |       ],
266 |       "metadata": {
267 |         "id": "2Q5_QPca-36S"
268 |       }
269 |     },
270 |     {
271 |       "cell_type": "code",
272 |       "source": [
273 |         "_ = torch.manual_seed(42)\n",
274 |         "\n",
275 |         "train_dataset, val_dataset = (\n",
276 |         "    utils.data\n",
277 |         "         .random_split(\n",
278 |         "             train_val_dataset,\n",
279 |         "             [55_000, 5_000]\n",
280 |         "         )\n",
281 |         ")"
282 |       ],
283 |       "metadata": {
284 |         "id": "voCEdPoQ-50c"
285 |       },
286 |       "execution_count": null,
287 |       "outputs": []
288 |     },
289 |     {
290 |       "cell_type": "markdown",
291 |       "source": [
292 |         "### Create the DataLoaders"
293 |       ],
294 |       "metadata": {
295 |         "id": "fjrKNGMPuK9M"
296 |       }
297 |     },
298 |     {
299 |       "cell_type": "code",
300 |       "source": [
301 |         "data_loader_kwargs = {\n",
302 |         "    \"batch_size\": 32,\n",
303 |         "    \"num_workers\": 2,            # load data in parallel using multiple workers\n",
304 |         "    \"persistent_workers\": True,  # keep workers around between epochs\n",
305 |         "    \"pin_memory\": True,          # avoid extra copy of data batches\n",
306 |         "    \"prefetch_factor\": 2,        # fetch multiple data batches in advance\n",
307 |         "}\n",
308 |         "\n",
309 |         "\n",
310 |         "train_data_loader = (\n",
311 |         "    utils.data\n",
312 |         "         .DataLoader(\n",
313 |         "             train_dataset,\n",
314 |         "             shuffle=True,\n",
315 |         "             **data_loader_kwargs\n",
316 |         "         )\n",
317 |         ")\n",
318 |         "\n",
319 |         "val_data_loader = (\n",
320 |         "    utils.data\n",
321 |         "         .DataLoader(\n",
322 |         "             val_dataset,\n",
323 |         "             shuffle=False,\n",
324 |         "             **data_loader_kwargs\n",
325 |         "         )\n",
326 |         ")\n",
327 |         "\n",
328 |         "test_data_loader = (\n",
329 |         "    utils.data\n",
330 |         "         .DataLoader(\n",
331 |         "             test_dataset,\n",
332 |         "             shuffle=False,\n",
333 |         "             **data_loader_kwargs\n",
334 |         "         )\n",
335 |         ")"
336 |       ],
337 |       "metadata": {
338 |         "id": "9TdT6yEjt4n3"
339 |       },
340 |       "execution_count": null,
341 |       "outputs": []
342 |     },
343 |     {
344 |       "cell_type": "markdown",
345 |       "source": [
346 |         "### Wrapping our model in a custom module"
347 |       ],
348 |       "metadata": {
349 |         "id": "-2hempAbDzwS"
350 |       }
351 |     },
352 |     {
353 |       "cell_type": "code",
354 |       "source": [
355 |         "class MLPClassifier(nn.Module):\n",
356 |         "\n",
357 |         "    def __init__(self, input_size, hidden_layer_sizes, n_classes):\n",
358 |         "        super().__init__()\n",
359 |         "\n",
360 |         "        # create the hidden layers\n",
361 |         "        modules = nn.ModuleList([nn.Flatten()])\n",
362 |         "        for hidden_layer_size in hidden_layer_sizes:\n",
363 |         "            modules.append(nn.Linear(input_size, hidden_layer_size))\n",
364 |         "            modules.append(nn.ReLU())\n",
365 |         "            input_size = hidden_layer_size\n",
366 |         "\n",
367 |         "        # define the output layer for the classifier\n",
368 |         "        modules.append(nn.Linear(input_size, n_classes))\n",
369 |         "\n",
370 |         "        # create the MLP from the modules\n",
371 |         "        self.mlp = nn.Sequential(*modules)\n",
372 |         "\n",
373 |         "    def forward(self, X):\n",
374 |         "        return self.mlp(X)\n",
375 |         "\n"
376 |       ],
377 |       "metadata": {
378 |         "id": "UoTd13UvA9g4"
379 |       },
380 |       "execution_count": null,
381 |       "outputs": []
382 |     },
383 |     {
384 |       "cell_type": "markdown",
385 |       "source": [
386 |         "## Defining the training and evaluation loop"
387 |       ],
388 |       "metadata": {
389 |         "id": "fOYlBO6jD3Vw"
390 |       }
391 |     },
392 |     {
393 |       "cell_type": "code",
394 |       "source": [
395 |         "def evaluate(model_fn, data_loader, metric):\n",
396 |         "    model_fn.eval()\n",
397 |         "    metric.reset()  # reset the metric at the beginning\n",
398 |         "    with torch.no_grad():\n",
399 |         "        for X_batch, y_batch in data_loader:\n",
400 |         "            X_batch, y_batch = X_batch.to(DEVICE), y_batch.to(DEVICE)\n",
401 |         "            y_pred = model_fn(X_batch)\n",
402 |         "            metric.update(y_pred, y_batch)  # update it at each iteration\n",
403 |         "    return metric.compute()  # compute the final result at the end\n",
404 |         "\n",
405 |         "\n",
406 |         "def train(\n",
407 |         "    model_fn,\n",
408 |         "    criterion,\n",
409 |         "    optimizer,\n",
410 |         "    metric,\n",
411 |         "    train_data_loader,\n",
412 |         "    val_data_loader,\n",
413 |         "    n_epochs,\n",
414 |         "    log_epochs=1,\n",
415 |         "    ):\n",
416 |         "\n",
417 |         "    history = {\n",
418 |         "        \"train_losses\": [],\n",
419 |         "        \"val_losses\": [],\n",
420 |         "        \"train_metrics\": [],\n",
421 |         "        \"val_metrics\": [],\n",
422 |         "    }\n",
423 |         "\n",
424 |         "    for epoch in range(n_epochs):\n",
425 |         "        total_train_loss = 0.0\n",
426 |         "        metric.reset()\n",
427 |         "        for i, (X_batch, y_batch) in enumerate(train_data_loader):\n",
428 |         "            model_fn.train()\n",
429 |         "\n",
430 |         "            # move batches to device\n",
431 |         "            X_batch = X_batch.to(DEVICE, non_blocking=True)\n",
432 |         "            y_batch = y_batch.to(DEVICE, non_blocking=True)\n",
433 |         "\n",
434 |         "            # forward pass\n",
435 |         "            y_pred = model_fn(X_batch)\n",
436 |         "            train_loss = criterion(y_pred, y_batch)\n",
437 |         "            total_train_loss += train_loss.item()\n",
438 |         "\n",
439 |         "            # backward pass\n",
440 |         "            train_loss.backward()\n",
441 |         "\n",
442 |         "            # gradient descent step\n",
443 |         "            optimizer.step()\n",
444 |         "            optimizer.zero_grad()\n",
445 |         "\n",
446 |         "            # update our metric\n",
447 |         "            metric.update(y_pred, y_batch)\n",
448 |         "\n",
449 |         "        # compute the average (across batches!) training loss\n",
450 |         "        average_train_loss = total_train_loss / len(train_data_loader)\n",
451 |         "        history[\"train_losses\"].append(average_train_loss)\n",
452 |         "\n",
453 |         "        # compute the average (across batches!) validation loss\n",
454 |         "        with torch.no_grad():\n",
455 |         "            model_fn.eval()\n",
456 |         "            total_val_loss = 0.0\n",
457 |         "            for X_batch, y_batch in val_data_loader:\n",
458 |         "                X_batch, y_batch = X_batch.to(DEVICE), y_batch.to(DEVICE)\n",
459 |         "                y_pred = model_fn(X_batch)\n",
460 |         "                val_loss = criterion(y_pred, y_batch)\n",
461 |         "                total_val_loss += val_loss.item()\n",
462 |         "            average_val_loss = total_val_loss / len(val_data_loader)\n",
463 |         "            history[\"val_losses\"].append(average_val_loss)\n",
464 |         "\n",
465 |         "        # compute the training metric after each epoch\n",
466 |         "        average_train_metric = (\n",
467 |         "            metric.compute()\n",
468 |         "                  .item()\n",
469 |         "        )\n",
470 |         "        history[\"train_metrics\"].append(average_train_metric)\n",
471 |         "\n",
472 |         "        # compute the validation metric after each epoch\n",
473 |         "        average_val_metric = (\n",
474 |         "            evaluate(\n",
475 |         "              model_fn,\n",
476 |         "              val_data_loader,\n",
477 |         "              metric,\n",
478 |         "            ).item()\n",
479 |         "        )\n",
480 |         "        history[\"val_metrics\"].append(average_val_metric)\n",
481 |         "\n",
482 |         "        if (epoch + 1) % log_epochs == 0:\n",
483 |         "            print(f\"Epoch {epoch + 1}/{n_epochs}, \"\n",
484 |         "                  f\"train loss: {history['train_losses'][-1]:.4f}, \"\n",
485 |         "                  f\"val loss: {history['val_losses'][-1]:.4f}, \"\n",
486 |         "                  f\"train metric: {history['train_metrics'][-1]:.4f}, \"\n",
487 |         "                  f\"val metric: {history['val_metrics'][-1]:.4f}\"\n",
488 |         "            )\n",
489 |         "\n",
490 |         "    return history\n"
491 |       ],
492 |       "metadata": {
493 |         "id": "6n40t0eTwEhD"
494 |       },
495 |       "execution_count": null,
496 |       "outputs": []
497 |     },
498 |     {
499 |       "cell_type": "markdown",
500 |       "source": [
501 |         "## Putting everything together!"
502 |       ],
503 |       "metadata": {
504 |         "id": "JKdEjcs8X3TQ"
505 |       }
506 |     },
507 |     {
508 |       "cell_type": "code",
509 |       "source": [
510 |         "_ = torch.manual_seed(42)\n",
511 |         "\n",
512 |         "# define the model function\n",
513 |         "fashion_mnist_model_fn = MLPClassifier(\n",
514 |         "    input_size=28 * 28,\n",
515 |         "    hidden_layer_sizes=[256, 128],\n",
516 |         "    n_classes=10\n",
517 |         ")\n",
518 |         "fashion_mnist_model_fn = fashion_mnist_model_fn.to(DEVICE)\n",
519 |         "\n",
520 |         "# select loss function\n",
521 |         "cross_entropy_loss = nn.CrossEntropyLoss()\n",
522 |         "\n",
523 |         "# define the optimizer\n",
524 |         "sgd = optim.SGD(\n",
525 |         "    fashion_mnist_model_fn.parameters(),\n",
526 |         "    lr=1e-1\n",
527 |         ")\n",
528 |         "\n",
529 |         "# select a metric\n",
530 |         "accuracy = (\n",
531 |         "    torchmetrics.Accuracy(\n",
532 |         "        task=\"multiclass\",\n",
533 |         "        num_classes=10,\n",
534 |         "    ).to(DEVICE)\n",
535 |         ")\n"
536 |       ],
537 |       "metadata": {
538 |         "id": "AHECaCIfxJ6P"
539 |       },
540 |       "execution_count": null,
541 |       "outputs": []
542 |     },
543 |     {
544 |       "cell_type": "code",
545 |       "source": [
546 |         "%%timeit -n 1 -r 1\n",
547 |         "\n",
548 |         "history = train(\n",
549 |         "    model_fn=fashion_mnist_model_fn,\n",
550 |         "    criterion=cross_entropy_loss,\n",
551 |         "    optimizer=sgd,\n",
552 |         "    metric=accuracy,\n",
553 |         "    train_data_loader=train_data_loader,\n",
554 |         "    val_data_loader=val_data_loader,\n",
555 |         "    n_epochs=20,\n",
556 |         "    log_epochs=1\n",
557 |         ")"
558 |       ],
559 |       "metadata": {
560 |         "id": "A-2ByvXR1S7a"
561 |       },
562 |       "execution_count": null,
563 |       "outputs": []
564 |     },
565 |     {
566 |       "cell_type": "code",
567 |       "source": [
568 |         "history_df = pd.DataFrame.from_dict(\n",
569 |         "    history\n",
570 |         ")\n",
571 |         "\n",
572 |         "_ = history_df.plot(grid=True)"
573 |       ],
574 |       "metadata": {
575 |         "id": "d7i_bq-3HLcn"
576 |       },
577 |       "execution_count": null,
578 |       "outputs": []
579 |     },
580 |     {
581 |       "cell_type": "markdown",
582 |       "source": [
583 |         "## Predicting using the trained model"
584 |       ],
585 |       "metadata": {
586 |         "id": "F-3UXeQqiSh0"
587 |       }
588 |     },
589 |     {
590 |       "cell_type": "markdown",
591 |       "source": [
592 |         "### Predicting class labels"
593 |       ],
594 |       "metadata": {
595 |         "id": "0xteSl-qwJR7"
596 |       }
597 |     },
598 |     {
599 |       "cell_type": "code",
600 |       "source": [
601 |         "def predict(X, model_fn):\n",
602 |         "    model_fn.eval()\n",
603 |         "    with torch.no_grad():\n",
604 |         "        y_pred_logits = model_fn(X)\n",
605 |         "    class_indices = torch.argmax(y_pred_logits, dim=1)\n",
606 |         "    return class_indices"
607 |       ],
608 |       "metadata": {
609 |         "id": "zXESwdCgUF5S"
610 |       },
611 |       "execution_count": null,
612 |       "outputs": []
613 |     },
614 |     {
615 |       "cell_type": "code",
616 |       "source": [
617 |         "X_new, y_new = next(iter(val_data_loader))"
618 |       ],
619 |       "metadata": {
620 |         "id": "NPChGQmdhJep"
621 |       },
622 |       "execution_count": null,
623 |       "outputs": []
624 |     },
625 |     {
626 |       "cell_type": "code",
627 |       "source": [
628 |         "X_new.device"
629 |       ],
630 |       "metadata": {
631 |         "id": "34MxrQ-9hOX1"
632 |       },
633 |       "execution_count": null,
634 |       "outputs": []
635 |     },
636 |     {
637 |       "cell_type": "code",
638 |       "source": [
639 |         "X_new = X_new.to(DEVICE)"
640 |       ],
641 |       "metadata": {
642 |         "id": "vwBamGZDhPIH"
643 |       },
644 |       "execution_count": null,
645 |       "outputs": []
646 |     },
647 |     {
648 |       "cell_type": "code",
649 |       "source": [
650 |         "class_indices = predict(X_new, fashion_mnist_model_fn)\n",
651 |         "print(class_indices)"
652 |       ],
653 |       "metadata": {
654 |         "id": "ukspuABphd8D"
655 |       },
656 |       "execution_count": null,
657 |       "outputs": []
658 |     },
659 |     {
660 |       "cell_type": "code",
661 |       "source": [
662 |         "class_labels = [train_val_dataset.classes[i] for i in class_indices]\n",
663 |         "print(class_labels)"
664 |       ],
665 |       "metadata": {
666 |         "id": "YFozl04IhsfG"
667 |       },
668 |       "execution_count": null,
669 |       "outputs": []
670 |     },
671 |     {
672 |       "cell_type": "markdown",
673 |       "source": [
674 |         "### Predicting class probabilities"
675 |       ],
676 |       "metadata": {
677 |         "id": "P6kNLgmNwOCX"
678 |       }
679 |     },
680 |     {
681 |       "cell_type": "code",
682 |       "source": [
683 |         "def predict_proba(X, model_fn):\n",
684 |         "    model_fn.eval()\n",
685 |         "    with torch.no_grad():\n",
686 |         "        y_pred_logits = model_fn(X)\n",
687 |         "        y_pred_proba = torch.softmax(y_pred_logits, dim=1)\n",
688 |         "    return y_pred_proba\n"
689 |       ],
690 |       "metadata": {
691 |         "id": "hEdcRTDGwSmh"
692 |       },
693 |       "execution_count": null,
694 |       "outputs": []
695 |     },
696 |     {
697 |       "cell_type": "code",
698 |       "source": [
699 |         "class_probas = predict_proba(X_new, fashion_mnist_model_fn)\n",
700 |         "print(class_probas.round(decimals=3))"
701 |       ],
702 |       "metadata": {
703 |         "id": "qBy1xTC0iQxl"
704 |       },
705 |       "execution_count": null,
706 |       "outputs": []
707 |     },
708 |     {
709 |       "cell_type": "markdown",
710 |       "source": [
711 |         "### Top-k predictions"
712 |       ],
713 |       "metadata": {
714 |         "id": "f8grU50gwWO6"
715 |       }
716 |     },
717 |     {
718 |       "cell_type": "code",
719 |       "source": [
720 |         "def predict_topk(X, model_fn, k=3):\n",
721 |         "    model_fn.eval()\n",
722 |         "    with torch.no_grad():\n",
723 |         "        y_pred_logits = model_fn(X)\n",
724 |         "        _, topk_class_indices = torch.topk(y_pred_logits, k=k, dim=1)\n",
725 |         "    return topk_class_indices\n",
726 |         "\n",
727 |         "\n",
728 |         "def predict_topk_proba(X, model_fn, k=3):\n",
729 |         "    model_fn.eval()\n",
730 |         "    with torch.no_grad():\n",
731 |         "        y_pred_logits = model_fn(X)\n",
732 |         "        topk_logits, topk_class_indices = torch.topk(y_pred_logits, k=k, dim=1)\n",
733 |         "        topk_probas = torch.softmax(topk_logits, dim=1)\n",
734 |         "    return topk_probas\n",
735 |         "\n"
736 |       ],
737 |       "metadata": {
738 |         "id": "ZHAU3p-2igm0"
739 |       },
740 |       "execution_count": null,
741 |       "outputs": []
742 |     },
743 |     {
744 |       "cell_type": "code",
745 |       "source": [
746 |         "top3_class_indices = predict_topk(X_new, fashion_mnist_model_fn, k=3)\n",
747 |         "print(top3_class_indices)"
748 |       ],
749 |       "metadata": {
750 |         "id": "K-iQXTylk-MX"
751 |       },
752 |       "execution_count": null,
753 |       "outputs": []
754 |     },
755 |     {
756 |       "cell_type": "code",
757 |       "source": [
758 |         "top3_class_labels = []\n",
759 |         "for class_indices in top3_class_indices:\n",
760 |         "    top3_class_labels.append(\n",
761 |         "        [train_val_dataset.classes[i] for i in class_indices]\n",
762 |         "    )\n",
763 |         "print(top3_class_labels)\n"
764 |       ],
765 |       "metadata": {
766 |         "id": "siUh-f0KlJCC"
767 |       },
768 |       "execution_count": null,
769 |       "outputs": []
770 |     },
771 |     {
772 |       "cell_type": "code",
773 |       "source": [
774 |         "top3_probas = predict_topk_proba(X_new, fashion_mnist_model_fn, k=3)\n",
775 |         "print(top3_probas.round(decimals=3))"
776 |       ],
777 |       "metadata": {
778 |         "id": "rsMG_Qiik0QH"
779 |       },
780 |       "execution_count": null,
781 |       "outputs": []
782 |     },
783 |     {
784 |       "cell_type": "code",
785 |       "source": [],
786 |       "metadata": {
787 |         "id": "Qzjt2rCm3TpC"
788 |       },
789 |       "execution_count": null,
790 |       "outputs": []
791 |     }
792 |   ]
793 | }


--------------------------------------------------------------------------------
/notebooks/01e-mlp-for-classification-with-pytorch.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |   "nbformat": 4,
  3 |   "nbformat_minor": 0,
  4 |   "metadata": {
  5 |     "colab": {
  6 |       "provenance": [],
  7 |       "authorship_tag": "ABX9TyM8qD7J1ewGzyrbrJrI27zw",
  8 |       "include_colab_link": true
  9 |     },
 10 |     "kernelspec": {
 11 |       "name": "python3",
 12 |       "display_name": "Python 3"
 13 |     },
 14 |     "language_info": {
 15 |       "name": "python"
 16 |     }
 17 |   },
 18 |   "cells": [
 19 |     {
 20 |       "cell_type": "markdown",
 21 |       "metadata": {
 22 |         "id": "view-in-github",
 23 |         "colab_type": "text"
 24 |       },
 25 |       "source": [
 26 |         "<a href=\"https://colab.research.google.com/github/davidrpugh/introduction-to-deep-learning/blob/master/notebooks/01e-mlp-for-classification-with-pytorch.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
 27 |       ]
 28 |     },
 29 |     {
 30 |       "cell_type": "markdown",
 31 |       "source": [
 32 |         "# Multi-Layer Perceptrons (MLPs) for Classification with PyTorch"
 33 |       ],
 34 |       "metadata": {
 35 |         "id": "PE3OMdA_fdjF"
 36 |       }
 37 |     },
 38 |     {
 39 |       "cell_type": "code",
 40 |       "execution_count": null,
 41 |       "metadata": {
 42 |         "id": "_kEVc3hofLkE"
 43 |       },
 44 |       "outputs": [],
 45 |       "source": [
 46 |         "import numpy as np\n",
 47 |         "from sklearn import compose, datasets, linear_model, metrics, model_selection\n",
 48 |         "from sklearn import pipeline, preprocessing\n",
 49 |         "\n",
 50 |         "import torch\n",
 51 |         "from torch import nn, optim\n"
 52 |       ]
 53 |     },
 54 |     {
 55 |       "cell_type": "markdown",
 56 |       "source": [
 57 |         "## Binary Classification"
 58 |       ],
 59 |       "metadata": {
 60 |         "id": "k3z3maMMw304"
 61 |       }
 62 |     },
 63 |     {
 64 |       "cell_type": "markdown",
 65 |       "source": [
 66 |         "### Loading the data"
 67 |       ],
 68 |       "metadata": {
 69 |         "id": "CPR-H2FFp8bV"
 70 |       }
 71 |     },
 72 |     {
 73 |       "cell_type": "code",
 74 |       "source": [
 75 |         "breast_cancer_dataset = datasets.load_breast_cancer(\n",
 76 |         "    as_frame=True,\n",
 77 |         ")"
 78 |       ],
 79 |       "metadata": {
 80 |         "id": "eSjzZVmgt2CV"
 81 |       },
 82 |       "execution_count": null,
 83 |       "outputs": []
 84 |     },
 85 |     {
 86 |       "cell_type": "code",
 87 |       "source": [
 88 |         "print(breast_cancer_dataset[\"DESCR\"])"
 89 |       ],
 90 |       "metadata": {
 91 |         "id": "FbzayFYc5gwr"
 92 |       },
 93 |       "execution_count": null,
 94 |       "outputs": []
 95 |     },
 96 |     {
 97 |       "cell_type": "code",
 98 |       "source": [
 99 |         "breast_cancer_features_df = breast_cancer_dataset[\"data\"]\n",
100 |         "breast_cancer_target = breast_cancer_dataset[\"target\"]"
101 |       ],
102 |       "metadata": {
103 |         "id": "l9iBUC-X6MJl"
104 |       },
105 |       "execution_count": null,
106 |       "outputs": []
107 |     },
108 |     {
109 |       "cell_type": "code",
110 |       "source": [
111 |         "breast_cancer_features_df.info()"
112 |       ],
113 |       "metadata": {
114 |         "id": "tJZFubW06UWa"
115 |       },
116 |       "execution_count": null,
117 |       "outputs": []
118 |     },
119 |     {
120 |       "cell_type": "code",
121 |       "source": [
122 |         "breast_cancer_features_df.describe()"
123 |       ],
124 |       "metadata": {
125 |         "id": "5lEqBguNw2wT"
126 |       },
127 |       "execution_count": null,
128 |       "outputs": []
129 |     },
130 |     {
131 |       "cell_type": "code",
132 |       "source": [
133 |         "_ = breast_cancer_target.hist()"
134 |       ],
135 |       "metadata": {
136 |         "id": "GGg_Qd_v9fSd"
137 |       },
138 |       "execution_count": null,
139 |       "outputs": []
140 |     },
141 |     {
142 |       "cell_type": "markdown",
143 |       "source": [
144 |         "### Preparing the data"
145 |       ],
146 |       "metadata": {
147 |         "id": "NFFtOc0cqDwn"
148 |       }
149 |     },
150 |     {
151 |       "cell_type": "markdown",
152 |       "source": [
153 |         "#### Train/Val Split"
154 |       ],
155 |       "metadata": {
156 |         "id": "xmd56sT0xPn3"
157 |       }
158 |     },
159 |     {
160 |       "cell_type": "code",
161 |       "source": [
162 |         "RANDOM_STATE = np.random.RandomState(42)\n",
163 |         "\n",
164 |         "\n",
165 |         "train_features_df, val_features_df, train_target, val_target = (\n",
166 |         "    model_selection.train_test_split(\n",
167 |         "        breast_cancer_features_df,\n",
168 |         "        breast_cancer_target,\n",
169 |         "        random_state=RANDOM_STATE,\n",
170 |         "        shuffle=True,\n",
171 |         "        stratify=breast_cancer_target,\n",
172 |         "        test_size=0.20,\n",
173 |         "    )\n",
174 |         ")\n"
175 |       ],
176 |       "metadata": {
177 |         "id": "6wOz_0q35s24"
178 |       },
179 |       "execution_count": null,
180 |       "outputs": []
181 |     },
182 |     {
183 |       "cell_type": "markdown",
184 |       "source": [
185 |         "#### Features and target preparation"
186 |       ],
187 |       "metadata": {
188 |         "id": "Tn9VUtwtkYLZ"
189 |       }
190 |     },
191 |     {
192 |       "cell_type": "code",
193 |       "source": [
194 |         "def array_to_tensor(arr, dtype=torch.float32):\n",
195 |         "    return torch.tensor(arr, dtype=dtype)\n",
196 |         "\n",
197 |         "\n",
198 |         "def series_to_tensor(s, dtype=torch.float32):\n",
199 |         "    arr = s.to_numpy()\n",
200 |         "    return array_to_tensor(arr, dtype)\n",
201 |         "\n",
202 |         "\n",
203 |         "n_samples, _ = train_features_df.shape\n",
204 |         "prepare_breast_cancer_features = pipeline.make_pipeline(\n",
205 |         "    preprocessing.QuantileTransformer(\n",
206 |         "      n_quantiles=n_samples,\n",
207 |         "      output_distribution=\"normal\",\n",
208 |         "      random_state=RANDOM_STATE\n",
209 |         "    ),\n",
210 |         "    preprocessing.FunctionTransformer(\n",
211 |         "        func=array_to_tensor\n",
212 |         "    )\n",
213 |         ")\n",
214 |         "\n",
215 |         "prepare_breast_cancer_target = pipeline.make_pipeline(\n",
216 |         "    preprocessing.FunctionTransformer(\n",
217 |         "        func=series_to_tensor,\n",
218 |         "    ),\n",
219 |         "    preprocessing.FunctionTransformer(\n",
220 |         "        func=torch.unsqueeze,\n",
221 |         "        kw_args={\n",
222 |         "            \"dim\": 1\n",
223 |         "        }\n",
224 |         "    )\n",
225 |         ")"
226 |       ],
227 |       "metadata": {
228 |         "id": "PyZrDS-16iJT"
229 |       },
230 |       "execution_count": null,
231 |       "outputs": []
232 |     },
233 |     {
234 |       "cell_type": "code",
235 |       "source": [
236 |         "X_train = prepare_breast_cancer_features.fit_transform(train_features_df)\n",
237 |         "X_val = prepare_breast_cancer_features.transform(val_features_df)\n"
238 |       ],
239 |       "metadata": {
240 |         "id": "GFxgQr8O7_wP"
241 |       },
242 |       "execution_count": null,
243 |       "outputs": []
244 |     },
245 |     {
246 |       "cell_type": "code",
247 |       "source": [
248 |         "print(X_train.shape)\n",
249 |         "print(X_val.shape)"
250 |       ],
251 |       "metadata": {
252 |         "id": "ZCb9ga-S8OGx"
253 |       },
254 |       "execution_count": null,
255 |       "outputs": []
256 |     },
257 |     {
258 |       "cell_type": "code",
259 |       "source": [
260 |         "y_train = prepare_breast_cancer_target.fit_transform(train_target)\n",
261 |         "y_val = prepare_breast_cancer_target.transform(val_target)\n"
262 |       ],
263 |       "metadata": {
264 |         "id": "C9SiVZKv7_wQ"
265 |       },
266 |       "execution_count": null,
267 |       "outputs": []
268 |     },
269 |     {
270 |       "cell_type": "code",
271 |       "source": [
272 |         "print(y_train.shape)\n",
273 |         "print(y_val.shape)\n",
274 |         "\n",
275 |         "# for binary classification the targets must be floats (probabilities), not integer labels!\n",
276 |         "print(y_train.dtype)\n",
277 |         "print(y_val.dtype)"
278 |       ],
279 |       "metadata": {
280 |         "id": "t6TVvdtNCuqw"
281 |       },
282 |       "execution_count": null,
283 |       "outputs": []
284 |     },
285 |     {
286 |       "cell_type": "markdown",
287 |       "source": [
288 |         "### Implementing an MLP for Binary Classification using nn.Sequential"
289 |       ],
290 |       "metadata": {
291 |         "id": "LzgXCpjG7MhG"
292 |       }
293 |     },
294 |     {
295 |       "cell_type": "markdown",
296 |       "source": [
297 |         "[`nn.Sequential`](https://docs.pytorch.org/docs/stable/generated/torch.nn.Sequential.html) in PyTorch is a container module that allows for the sequential execution of a series of neural network layers or modules. It simplifies the process of building neural networks with a linear, feed-forward structure by eliminating the need to explicitly define the forward method for each layer.\n",
298 |         "\n",
299 |         "\n",
300 |         "### Key Characteristics and Use-Cases\n",
301 |         "\n",
302 |         "* **Ordered Container:** `nn.Sequential` takes a list of `nn.Module` instances (layers) and arranges them in the order they are provided.\n",
303 |         "* **Automatic Forward Pass:** When an input tensor is passed to an `nn.Sequential` object, it automatically propagates through each contained module in the defined order, with the output of one module serving as the input to the next.\n",
304 |         "* **Simplified Model Definition:** It offers a concise way to define models, especially for straightforward architectures without complex branching or custom logic within the forward pass.\n",
305 |         "* **Treat as a Single Module:** The entire `nn.Sequential` container can be treated as a single `nn.Module`, allowing for easy integration into larger models or for applying operations like moving to a device (`.to(device)`) or setting training/evaluation mode (`.train()`, `.eval()`)."
306 |       ],
307 |       "metadata": {
308 |         "id": "3cTr1l7hqJ1N"
309 |       }
310 |     },
311 |     {
312 |       "cell_type": "code",
313 |       "source": [
314 |         "_ = torch.manual_seed(42)\n",
315 |         "\n",
316 |         "n_features = X_train.size(1)\n",
317 |         "n_outputs = 1\n",
318 |         "\n",
319 |         "\n",
320 |         "# use the 2/3 heuristic for choosing the number of neurons\n",
321 |         "n_hidden = (2 * (n_features + n_outputs) ) // 3\n",
322 |         "\n",
323 |         "\n",
324 |         "breast_cancer_model = nn.Sequential(\n",
325 |         "    nn.Linear(\n",
326 |         "        in_features=n_features,\n",
327 |         "        out_features=n_hidden,\n",
328 |         "        bias=True,\n",
329 |         "    ),\n",
330 |         "    nn.ReLU(),\n",
331 |         "    nn.Linear(\n",
332 |         "        in_features=n_hidden,\n",
333 |         "        out_features=n_outputs,\n",
334 |         "        bias=True,\n",
335 |         "    ),\n",
336 |         ")"
337 |       ],
338 |       "metadata": {
339 |         "id": "BRX-6dg77xgU"
340 |       },
341 |       "execution_count": null,
342 |       "outputs": []
343 |     },
344 |     {
345 |       "cell_type": "markdown",
346 |       "source": [
347 |         "### Loss functions and optimizers"
348 |       ],
349 |       "metadata": {
350 |         "id": "noWDT9PD8bE5"
351 |       }
352 |     },
353 |     {
354 |       "cell_type": "code",
355 |       "source": [
356 |         "# bincount only works with 1D tensors of non-negative integers!\n",
357 |         "n_negative_samples, n_positive_samples = torch.bincount(y_train.squeeze().to(torch.int64))\n",
358 |         "positive_weight = torch.tensor([n_negative_samples / n_positive_samples], dtype=torch.float32)\n",
359 |         "print(positive_weight)"
360 |       ],
361 |       "metadata": {
362 |         "id": "ToTPG5J9mGmP"
363 |       },
364 |       "execution_count": null,
365 |       "outputs": []
366 |     },
367 |     {
368 |       "cell_type": "code",
369 |       "source": [
370 |         "binary_cross_entropy_loss = nn.BCEWithLogitsLoss(\n",
371 |         "    pos_weight=positive_weight\n",
372 |         ")\n",
373 |         "\n",
374 |         "sgd = optim.SGD(\n",
375 |         "    breast_cancer_model.parameters(),\n",
376 |         "    lr=1e-2,\n",
377 |         ")"
378 |       ],
379 |       "metadata": {
380 |         "id": "Pfn6VW1j76jK"
381 |       },
382 |       "execution_count": null,
383 |       "outputs": []
384 |     },
385 |     {
386 |       "cell_type": "code",
387 |       "source": [
388 |         "def train(\n",
389 |         "    model_fn,\n",
390 |         "    criterion,\n",
391 |         "    optimizer,\n",
392 |         "    X_train,\n",
393 |         "    y_train,\n",
394 |         "    X_val,\n",
395 |         "    y_val,\n",
396 |         "    n_epochs,\n",
397 |         "    log_epochs=1,\n",
398 |         "    ):\n",
399 |         "\n",
400 |         "    for epoch in range(n_epochs):\n",
401 |         "        # enable training mode\n",
402 |         "        model_fn.train()\n",
403 |         "\n",
404 |         "        # forward pass\n",
405 |         "        y_pred = model_fn(X_train)\n",
406 |         "        train_loss = criterion(y_pred, y_train)\n",
407 |         "\n",
408 |         "        # backward pass\n",
409 |         "        train_loss.backward()\n",
410 |         "\n",
411 |         "        # gradient descent step\n",
412 |         "        optimizer.step()\n",
413 |         "        optimizer.zero_grad()\n",
414 |         "\n",
415 |         "        # evaluate using the validation data\n",
416 |         "        with torch.no_grad():\n",
417 |         "            model_fn.eval()\n",
418 |         "            y_pred = model_fn(X_val)\n",
419 |         "            val_loss = criterion(y_pred, y_val)\n",
420 |         "\n",
421 |         "        if (epoch + 1) % log_epochs == 0:\n",
422 |         "            print(f\"Epoch {epoch + 1}/{n_epochs}, Training Loss: {train_loss.item(): .4f}, Val Loss: {val_loss.item(): .4f}\")\n"
423 |       ],
424 |       "metadata": {
425 |         "id": "g1khH5hZ8qR_"
426 |       },
427 |       "execution_count": null,
428 |       "outputs": []
429 |     },
430 |     {
431 |       "cell_type": "code",
432 |       "source": [
433 |         "train(\n",
434 |         "    breast_cancer_model,\n",
435 |         "    binary_cross_entropy_loss,\n",
436 |         "    sgd,\n",
437 |         "    X_train,\n",
438 |         "    y_train,\n",
439 |         "    X_val,\n",
440 |         "    y_val,\n",
441 |         "    n_epochs=1000\n",
442 |         ")"
443 |       ],
444 |       "metadata": {
445 |         "id": "aiN8LXAn8Mau"
446 |       },
447 |       "execution_count": null,
448 |       "outputs": []
449 |     },
450 |     {
451 |       "cell_type": "markdown",
452 |       "source": [
453 |         "## Multi-class Classification"
454 |       ],
455 |       "metadata": {
456 |         "id": "1XOg0i_wDL9_"
457 |       }
458 |     },
459 |     {
460 |       "cell_type": "markdown",
461 |       "source": [
462 |         "### Loading the data"
463 |       ],
464 |       "metadata": {
465 |         "id": "TEOj1HDnhGJ_"
466 |       }
467 |     },
468 |     {
469 |       "cell_type": "code",
470 |       "source": [
471 |         "covtype_dataset = datasets.fetch_covtype(\n",
472 |         "    as_frame=True\n",
473 |         ")"
474 |       ],
475 |       "metadata": {
476 |         "id": "7AEVE9JgfoXm"
477 |       },
478 |       "execution_count": null,
479 |       "outputs": []
480 |     },
481 |     {
482 |       "cell_type": "code",
483 |       "source": [
484 |         "print(covtype_dataset[\"DESCR\"])"
485 |       ],
486 |       "metadata": {
487 |         "id": "S01_yy5vf0fr"
488 |       },
489 |       "execution_count": null,
490 |       "outputs": []
491 |     },
492 |     {
493 |       "cell_type": "code",
494 |       "source": [
495 |         "covtype_features_df = covtype_dataset[\"data\"]\n",
496 |         "covtype_target_df = (\n",
497 |         "    covtype_dataset.get(\"target\")\n",
498 |         "                   .to_frame()\n",
499 |         ")"
500 |       ],
501 |       "metadata": {
502 |         "id": "I3eZZQu4gFRL"
503 |       },
504 |       "execution_count": null,
505 |       "outputs": []
506 |     },
507 |     {
508 |       "cell_type": "code",
509 |       "source": [
510 |         "covtype_features_df.info()"
511 |       ],
512 |       "metadata": {
513 |         "id": "TQVJir3zgcCi"
514 |       },
515 |       "execution_count": null,
516 |       "outputs": []
517 |     },
518 |     {
519 |       "cell_type": "code",
520 |       "source": [
521 |         "_ = (\n",
522 |         "    covtype_target_df.loc[:, \"Cover_Type\"]\n",
523 |         "                     .value_counts()\n",
524 |         "                     .sort_index()\n",
525 |         "                     .plot(kind=\"bar\")\n",
526 |         ")"
527 |       ],
528 |       "metadata": {
529 |         "id": "BmIh3Xx_Abdc"
530 |       },
531 |       "execution_count": null,
532 |       "outputs": []
533 |     },
534 |     {
535 |       "cell_type": "markdown",
536 |       "source": [
537 |         "### Preparing the data"
538 |       ],
539 |       "metadata": {
540 |         "id": "2xIyT6Tqk5W7"
541 |       }
542 |     },
543 |     {
544 |       "cell_type": "markdown",
545 |       "source": [
546 |         "#### Train/Val Split"
547 |       ],
548 |       "metadata": {
549 |         "id": "5Ur2SAGuhKI9"
550 |       }
551 |     },
552 |     {
553 |       "cell_type": "code",
554 |       "source": [
555 |         "train_features_df, val_features_df, train_target_df, val_target_df = (\n",
556 |         "    model_selection.train_test_split(\n",
557 |         "        covtype_features_df,\n",
558 |         "        covtype_target_df,\n",
559 |         "        test_size=0.20,\n",
560 |         "        shuffle=True,\n",
561 |         "        stratify=covtype_target_df,\n",
562 |         "        random_state=RANDOM_STATE\n",
563 |         "    )\n",
564 |         ")\n"
565 |       ],
566 |       "metadata": {
567 |         "id": "jeoEgiaggdqY"
568 |       },
569 |       "execution_count": null,
570 |       "outputs": []
571 |     },
572 |     {
573 |       "cell_type": "markdown",
574 |       "source": [
575 |         "#### Features and target preparation"
576 |       ],
577 |       "metadata": {
578 |         "id": "U6XQukjIhrXE"
579 |       }
580 |     },
581 |     {
582 |       "cell_type": "code",
583 |       "source": [
584 |         "prepare_covtype_features = pipeline.make_pipeline(\n",
585 |         "    compose.make_column_transformer(\n",
586 |         "        (\n",
587 |         "            \"passthrough\",\n",
588 |         "            compose.make_column_selector(\n",
589 |         "                pattern=\"^Wilderness_Area_|^Soil_Type_\"\n",
590 |         "            )\n",
591 |         "        ),\n",
592 |         "        force_int_remainder_cols=False,\n",
593 |         "        n_jobs=-1,\n",
594 |         "        remainder=preprocessing.QuantileTransformer(\n",
595 |         "            output_distribution=\"normal\",\n",
596 |         "            random_state=RANDOM_STATE,\n",
597 |         "        )\n",
598 |         "    ),\n",
599 |         "    preprocessing.FunctionTransformer(\n",
600 |         "        func=array_to_tensor,\n",
601 |         "    )\n",
602 |         ")\n",
603 |         "\n",
604 |         "prepare_covtype_target = pipeline.make_pipeline(\n",
605 |         "    preprocessing.OrdinalEncoder(\n",
606 |         "        categories=[\n",
607 |         "            [1, 2, 3, 4, 5, 6, 7]\n",
608 |         "        ],\n",
609 |         "    ),\n",
610 |         "    preprocessing.FunctionTransformer(\n",
611 |         "        func=array_to_tensor,\n",
612 |         "        kw_args={\n",
613 |         "            \"dtype\": torch.int64\n",
614 |         "        }\n",
615 |         "    ),\n",
616 |         "    preprocessing.FunctionTransformer(\n",
617 |         "        func=torch.squeeze,\n",
618 |         "    )\n",
619 |         ")\n",
620 |         "\n"
621 |       ],
622 |       "metadata": {
623 |         "id": "eO5Bp1JBhcee"
624 |       },
625 |       "execution_count": null,
626 |       "outputs": []
627 |     },
628 |     {
629 |       "cell_type": "code",
630 |       "source": [
631 |         "X_train = prepare_covtype_features.fit_transform(train_features_df)\n",
632 |         "X_val = prepare_covtype_features.transform(val_features_df)\n"
633 |       ],
634 |       "metadata": {
635 |         "id": "hDAQixqqjLj-"
636 |       },
637 |       "execution_count": null,
638 |       "outputs": []
639 |     },
640 |     {
641 |       "cell_type": "code",
642 |       "source": [
643 |         "print(X_train.shape)\n",
644 |         "print(X_val.shape)"
645 |       ],
646 |       "metadata": {
647 |         "id": "rOLNPI361Efl"
648 |       },
649 |       "execution_count": null,
650 |       "outputs": []
651 |     },
652 |     {
653 |       "cell_type": "code",
654 |       "source": [
655 |         "y_train = prepare_covtype_target.fit_transform(train_target_df)\n",
656 |         "y_val = prepare_covtype_target.transform(val_target_df)\n"
657 |       ],
658 |       "metadata": {
659 |         "id": "c4hASNLXjaJk"
660 |       },
661 |       "execution_count": null,
662 |       "outputs": []
663 |     },
664 |     {
665 |       "cell_type": "code",
666 |       "source": [
667 |         "# note that for multi-class classification the targets are 1-dimensional tensors of integer class indices!\n",
668 |         "print(y_train.shape)\n",
669 |         "print(y_train.dtype)\n",
670 |         "\n",
671 |         "print(y_val.shape)\n",
672 |         "print(y_val.dtype)"
673 |       ],
674 |       "metadata": {
675 |         "id": "lFxjkHNl1Q5w"
676 |       },
677 |       "execution_count": null,
678 |       "outputs": []
679 |     },
680 |     {
681 |       "cell_type": "markdown",
682 |       "source": [
683 |         "### Exercise:\n",
684 |         "\n",
685 |         "Implement an MLP using `nn.Sequential` that has three hidden layers with sizes 200, 100, and 50. Use `nn.ReLU` activation functions."
686 |       ],
687 |       "metadata": {
688 |         "id": "pAsTcEm-qREN"
689 |       }
690 |     },
691 |     {
692 |       "cell_type": "code",
693 |       "source": [
694 |         "# INSERT YOUR CODE HERE!"
695 |       ],
696 |       "metadata": {
697 |         "id": "YvQaSn7nDcvh"
698 |       },
699 |       "execution_count": null,
700 |       "outputs": []
701 |     },
702 |     {
703 |       "cell_type": "markdown",
704 |       "source": [
705 |         "### Solution:"
706 |       ],
707 |       "metadata": {
708 |         "id": "fSCY5mFzDeka"
709 |       }
710 |     },
711 |     {
712 |       "cell_type": "code",
713 |       "source": [
714 |         "_ = torch.manual_seed(42)\n",
715 |         "\n",
716 |         "n_features = X_train.size(1)\n",
717 |         "n_classes = y_train.unique().size(0)\n",
718 |         "\n",
719 |         "covtype_model = nn.Sequential(\n",
720 |         "    nn.Linear(\n",
721 |         "        in_features=n_features,\n",
722 |         "        out_features=200,\n",
723 |         "        bias=True,\n",
724 |         "    ),\n",
725 |         "    nn.ReLU(),\n",
726 |         "    nn.Linear(\n",
727 |         "        in_features=200,\n",
728 |         "        out_features=100,\n",
729 |         "        bias=True,\n",
730 |         "    ),\n",
731 |         "    nn.ReLU(),\n",
732 |         "    nn.Linear(\n",
733 |         "        in_features=100,\n",
734 |         "        out_features=50,\n",
735 |         "        bias=True,\n",
736 |         "    ),\n",
737 |         "    nn.ReLU(),\n",
738 |         "    nn.Linear(\n",
739 |         "        in_features=50,\n",
740 |         "        out_features=n_classes,\n",
741 |         "        bias=True,\n",
742 |         "    ),\n",
743 |         ")\n"
744 |       ],
745 |       "metadata": {
746 |         "id": "Co1xI1kWqxJU"
747 |       },
748 |       "execution_count": null,
749 |       "outputs": []
750 |     },
751 |     {
752 |       "cell_type": "markdown",
753 |       "source": [
754 |         "### Exercise:\n",
755 |         "\n",
756 |         "Train your MLP for 100 epochs to minimize `nn.CrossEntropyLoss` using plain vanilla `optim.SGD` with a learning rate of 1."
757 |       ],
758 |       "metadata": {
759 |         "id": "pUH91w6gDlud"
760 |       }
761 |     },
762 |     {
763 |       "cell_type": "code",
764 |       "source": [
765 |         "# INSERT YOUR CODE HERE!"
766 |       ],
767 |       "metadata": {
768 |         "id": "fmVtpyn3DjzD"
769 |       },
770 |       "execution_count": null,
771 |       "outputs": []
772 |     },
773 |     {
774 |       "cell_type": "markdown",
775 |       "source": [
776 |         "### Solution:"
777 |       ],
778 |       "metadata": {
779 |         "id": "_CiT2WF3D2qd"
780 |       }
781 |     },
782 |     {
783 |       "cell_type": "code",
784 |       "source": [
785 |         "# rare classes get larger weight when calculating the loss\n",
786 |         "n_samples = y_train.size(0)\n",
787 |         "weights = n_samples / (n_classes * torch.bincount(y_train))\n",
788 |         "normalized_weights = weights / weights.sum()\n",
789 |         "\n",
790 |         "cross_entropy_loss = nn.CrossEntropyLoss(\n",
791 |         "    weight=normalized_weights\n",
792 |         ")\n",
793 |         "\n",
794 |         "sgd = optim.SGD(\n",
795 |         "    covtype_model.parameters(),\n",
796 |         "    lr=1e0\n",
797 |         ")"
798 |       ],
799 |       "metadata": {
800 |         "id": "eJiTci5rD4aL"
801 |       },
802 |       "execution_count": null,
803 |       "outputs": []
804 |     },
805 |     {
806 |       "cell_type": "code",
807 |       "source": [
808 |         "train(\n",
809 |         "    covtype_model,\n",
810 |         "    cross_entropy_loss,\n",
811 |         "    sgd,\n",
812 |         "    X_train,\n",
813 |         "    y_train,\n",
814 |         "    X_val,\n",
815 |         "    y_val,\n",
816 |         "    n_epochs=100,\n",
817 |         "    log_epochs=1,\n",
818 |         ")"
819 |       ],
820 |       "metadata": {
821 |         "id": "Mm7oF8Zstcf9"
822 |       },
823 |       "execution_count": null,
824 |       "outputs": []
825 |     },
826 |     {
827 |       "cell_type": "code",
828 |       "source": [],
829 |       "metadata": {
830 |         "id": "eiBHG_f8Ho19"
831 |       },
832 |       "execution_count": null,
833 |       "outputs": []
834 |     }
835 |   ]
836 | }


--------------------------------------------------------------------------------
/notebooks/02c-model-evaluation.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |   "nbformat": 4,
  3 |   "nbformat_minor": 0,
  4 |   "metadata": {
  5 |     "colab": {
  6 |       "provenance": [],
  7 |       "gpuType": "T4",
  8 |       "authorship_tag": "ABX9TyMgxatMVJPw18cxDVXXNFaB",
  9 |       "include_colab_link": true
 10 |     },
 11 |     "kernelspec": {
 12 |       "name": "python3",
 13 |       "display_name": "Python 3"
 14 |     },
 15 |     "language_info": {
 16 |       "name": "python"
 17 |     },
 18 |     "accelerator": "GPU"
 19 |   },
 20 |   "cells": [
 21 |     {
 22 |       "cell_type": "markdown",
 23 |       "metadata": {
 24 |         "id": "view-in-github",
 25 |         "colab_type": "text"
 26 |       },
 27 |       "source": [
 28 |         "<a href=\"https://colab.research.google.com/github/davidrpugh/introduction-to-deep-learning/blob/master/notebooks/02c-model-evaluation.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
 29 |       ]
 30 |     },
 31 |     {
 32 |       "cell_type": "markdown",
 33 |       "source": [
 34 |         "# Model Evaluation\n",
 35 |         "\n"
 36 |       ],
 37 |       "metadata": {
 38 |         "id": "6408e1_DUgNI"
 39 |       }
 40 |     },
 41 |     {
 42 |       "cell_type": "code",
 43 |       "source": [
 44 |         "%%bash\n",
 45 |         "\n",
 46 |         "pip install --upgrade torchmetrics"
 47 |       ],
 48 |       "metadata": {
 49 |         "id": "ep8fETzUw6jI"
 50 |       },
 51 |       "execution_count": null,
 52 |       "outputs": []
 53 |     },
 54 |     {
 55 |       "cell_type": "code",
 56 |       "source": [
 57 |         "import numpy as np\n",
 58 |         "from sklearn import compose, datasets, model_selection, pipeline, preprocessing\n",
 59 |         "\n",
 60 |         "import torch\n",
 61 |         "from torch import nn, optim, utils\n",
 62 |         "import torchmetrics"
 63 |       ],
 64 |       "metadata": {
 65 |         "id": "hAF4bXNEUmwI"
 66 |       },
 67 |       "execution_count": null,
 68 |       "outputs": []
 69 |     },
 70 |     {
 71 |       "cell_type": "markdown",
 72 |       "source": [
 73 |         "## Verifying availability of GPU(s)"
 74 |       ],
 75 |       "metadata": {
 76 |         "id": "Jw2YB037gupQ"
 77 |       }
 78 |     },
 79 |     {
 80 |       "cell_type": "code",
 81 |       "source": [
 82 |         "# check that torch version has support for cuda\n",
 83 |         "print(torch.__version__)"
 84 |       ],
 85 |       "metadata": {
 86 |         "id": "WI2Hc3ssVFrp"
 87 |       },
 88 |       "execution_count": null,
 89 |       "outputs": []
 90 |     },
 91 |     {
 92 |       "cell_type": "code",
 93 |       "source": [
 94 |         "%%bash\n",
 95 |         "\n",
 96 |         "# check that GPUs are physically available\n",
 97 |         "nvidia-smi"
 98 |       ],
 99 |       "metadata": {
100 |         "id": "QVTrHw0Hg3jU"
101 |       },
102 |       "execution_count": null,
103 |       "outputs": []
104 |     },
105 |     {
106 |       "cell_type": "code",
107 |       "source": [
108 |         "# check that PyTorch can find the GPUs\n",
109 |         "print(torch.cuda.is_available())"
110 |       ],
111 |       "metadata": {
112 |         "id": "vWB6iFrZg0dG"
113 |       },
114 |       "execution_count": null,
115 |       "outputs": []
116 |     },
117 |     {
118 |       "cell_type": "code",
119 |       "source": [
120 |         "DEVICE = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")"
121 |       ],
122 |       "metadata": {
123 |         "id": "IvmFtiBRXwYM"
124 |       },
125 |       "execution_count": null,
126 |       "outputs": []
127 |     },
128 |     {
129 |       "cell_type": "code",
130 |       "source": [
131 |         "print(DEVICE)"
132 |       ],
133 |       "metadata": {
134 |         "id": "arEX0TRPX1pM"
135 |       },
136 |       "execution_count": null,
137 |       "outputs": []
138 |     },
139 |     {
140 |       "cell_type": "markdown",
141 |       "source": [
142 |         "## Loading the data"
143 |       ],
144 |       "metadata": {
145 |         "id": "qU14efMhVJjE"
146 |       }
147 |     },
148 |     {
149 |       "cell_type": "code",
150 |       "execution_count": null,
151 |       "metadata": {
152 |         "id": "pi4TvYKpUWAC"
153 |       },
154 |       "outputs": [],
155 |       "source": [
156 |         "covtype_dataset = datasets.fetch_covtype(\n",
157 |         "    as_frame=True\n",
158 |         ")"
159 |       ]
160 |     },
161 |     {
162 |       "cell_type": "code",
163 |       "source": [
164 |         "print(covtype_dataset[\"DESCR\"])"
165 |       ],
166 |       "metadata": {
167 |         "id": "agASIJEGUjX6"
168 |       },
169 |       "execution_count": null,
170 |       "outputs": []
171 |     },
172 |     {
173 |       "cell_type": "code",
174 |       "source": [
175 |         "covtype_features_df = covtype_dataset[\"data\"]\n",
176 |         "covtype_target_df = (\n",
177 |         "    covtype_dataset.get(\"target\")\n",
178 |         "                   .to_frame()\n",
179 |         ")"
180 |       ],
181 |       "metadata": {
182 |         "id": "yQvLcEMhUp6w"
183 |       },
184 |       "execution_count": null,
185 |       "outputs": []
186 |     },
187 |     {
188 |       "cell_type": "markdown",
189 |       "source": [
190 |         "## Preparing the data"
191 |       ],
192 |       "metadata": {
193 |         "id": "PzsiqOoIVgBf"
194 |       }
195 |     },
196 |     {
197 |       "cell_type": "markdown",
198 |       "source": [
199 |         "### Train/Val split"
200 |       ],
201 |       "metadata": {
202 |         "id": "4a4kbLN3U1Wu"
203 |       }
204 |     },
205 |     {
206 |       "cell_type": "code",
207 |       "source": [
208 |         "RANDOM_STATE = np.random.RandomState(42)\n",
209 |         "\n",
210 |         "\n",
211 |         "train_features_df, val_features_df, train_target_df, val_target_df = (\n",
212 |         "    model_selection.train_test_split(\n",
213 |         "        covtype_features_df,\n",
214 |         "        covtype_target_df,\n",
215 |         "        test_size=0.20,\n",
216 |         "        shuffle=True,\n",
217 |         "        stratify=covtype_target_df,\n",
218 |         "        random_state=RANDOM_STATE\n",
219 |         "    )\n",
220 |         ")\n"
221 |       ],
222 |       "metadata": {
223 |         "id": "sjUrsUbmUzEw"
224 |       },
225 |       "execution_count": null,
226 |       "outputs": []
227 |     },
228 |     {
229 |       "cell_type": "markdown",
230 |       "source": [
231 |         "### Features and target preparation"
232 |       ],
233 |       "metadata": {
234 |         "id": "npDOm1-1WTUU"
235 |       }
236 |     },
237 |     {
238 |       "cell_type": "code",
239 |       "source": [
240 |         "def array_to_tensor(arr, dtype=torch.float32):\n",
241 |         "    return torch.tensor(arr, dtype=dtype)\n",
242 |         "\n",
243 |         "\n",
244 |         "prepare_covtype_features = pipeline.make_pipeline(\n",
245 |         "    compose.make_column_transformer(\n",
246 |         "        (\n",
247 |         "            \"passthrough\",\n",
248 |         "            compose.make_column_selector(\n",
249 |         "                pattern=\"^Wilderness_Area_|^Soil_Type_\"\n",
250 |         "            )\n",
251 |         "        ),\n",
252 |         "        force_int_remainder_cols=False,\n",
253 |         "        n_jobs=-1,\n",
254 |         "        remainder=preprocessing.QuantileTransformer(\n",
255 |         "            output_distribution=\"normal\",\n",
256 |         "            random_state=RANDOM_STATE,\n",
257 |         "        )\n",
258 |         "    ),\n",
259 |         "    preprocessing.FunctionTransformer(\n",
260 |         "        func=array_to_tensor,\n",
261 |         "    )\n",
262 |         ")\n",
263 |         "\n",
264 |         "prepare_covtype_target = pipeline.make_pipeline(\n",
265 |         "    preprocessing.OrdinalEncoder(\n",
266 |         "        categories=[\n",
267 |         "            [1, 2, 3, 4, 5, 6, 7]\n",
268 |         "        ],\n",
269 |         "    ),\n",
270 |         "    preprocessing.FunctionTransformer(\n",
271 |         "        func=array_to_tensor,\n",
272 |         "        kw_args={\n",
273 |         "            \"dtype\": torch.int64\n",
274 |         "        }\n",
275 |         "    ),\n",
276 |         "    preprocessing.FunctionTransformer(\n",
277 |         "        func=torch.squeeze,\n",
278 |         "    )\n",
279 |         ")\n",
280 |         "\n"
281 |       ],
282 |       "metadata": {
283 |         "id": "rkuiF7znVunl"
284 |       },
285 |       "execution_count": null,
286 |       "outputs": []
287 |     },
288 |     {
289 |       "cell_type": "code",
290 |       "source": [
291 |         "X_train = prepare_covtype_features.fit_transform(train_features_df)\n",
292 |         "X_val = prepare_covtype_features.transform(val_features_df)\n"
293 |       ],
294 |       "metadata": {
295 |         "id": "WoEwybKkVPKz"
296 |       },
297 |       "execution_count": null,
298 |       "outputs": []
299 |     },
300 |     {
301 |       "cell_type": "code",
302 |       "source": [
303 |         "print(X_train.shape)\n",
304 |         "print(X_val.shape)"
305 |       ],
306 |       "metadata": {
307 |         "id": "ck0rwCW0WMk4"
308 |       },
309 |       "execution_count": null,
310 |       "outputs": []
311 |     },
312 |     {
313 |       "cell_type": "code",
314 |       "source": [
315 |         "y_train = prepare_covtype_target.fit_transform(train_target_df)\n",
316 |         "y_val = prepare_covtype_target.transform(val_target_df)\n"
317 |       ],
318 |       "metadata": {
319 |         "id": "0VRdZGsZWHhc"
320 |       },
321 |       "execution_count": null,
322 |       "outputs": []
323 |     },
324 |     {
325 |       "cell_type": "code",
326 |       "source": [
327 |         "print(y_train.shape)\n",
328 |         "print(y_val.shape)"
329 |       ],
330 |       "metadata": {
331 |         "id": "x7wl1vR-WMIc"
332 |       },
333 |       "execution_count": null,
334 |       "outputs": []
335 |     },
336 |     {
337 |       "cell_type": "markdown",
338 |       "source": [
339 |         "### Datasets"
340 |       ],
341 |       "metadata": {
342 |         "id": "bRRYnaSlWaW-"
343 |       }
344 |     },
345 |     {
346 |       "cell_type": "code",
347 |       "source": [
348 |         "train_dataset = utils.data.TensorDataset(X_train, y_train)\n",
349 |         "val_dataset = utils.data.TensorDataset(X_val, y_val)"
350 |       ],
351 |       "metadata": {
352 |         "id": "RcQc1S2GWPJD"
353 |       },
354 |       "execution_count": null,
355 |       "outputs": []
356 |     },
357 |     {
358 |       "cell_type": "markdown",
359 |       "source": [
360 |         "### DataLoaders"
361 |       ],
362 |       "metadata": {
363 |         "id": "3z31QmQ1aH2i"
364 |       }
365 |     },
366 |     {
367 |       "cell_type": "code",
368 |       "source": [
369 |         "train_data_loader = (\n",
370 |         "    utils.data\n",
371 |         "         .DataLoader(\n",
372 |         "             train_dataset,\n",
373 |         "             num_workers=2,\n",
374 |         "             batch_size=128,\n",
375 |         "             shuffle=True,\n",
376 |         "             persistent_workers=True,\n",
377 |         "             pin_memory=True,\n",
378 |         "             prefetch_factor=2,\n",
379 |         "             drop_last=True,\n",
380 |         "         )\n",
381 |         ")\n",
382 |         "\n",
383 |         "val_data_loader = (\n",
384 |         "    utils.data\n",
385 |         "         .DataLoader(\n",
386 |         "             val_dataset,\n",
387 |         "             num_workers=2,\n",
388 |         "             batch_size=128,\n",
389 |         "             shuffle=True,\n",
390 |         "             persistent_workers=True,\n",
391 |         "             pin_memory=True,\n",
392 |         "             prefetch_factor=2,\n",
393 |         "             drop_last=True,\n",
394 |         "         )\n",
395 |         ")"
396 |       ],
397 |       "metadata": {
398 |         "id": "Sw7coNH4WgvT"
399 |       },
400 |       "execution_count": null,
401 |       "outputs": []
402 |     },
403 |     {
404 |       "cell_type": "markdown",
405 |       "source": [
406 |         "## Defining a training loop\n"
407 |       ],
408 |       "metadata": {
409 |         "id": "cq7uxeVnXKN9"
410 |       }
411 |     },
412 |     {
413 |       "cell_type": "code",
414 |       "source": [
415 |         "def train(\n",
416 |         "    model_fn,\n",
417 |         "    criterion,\n",
418 |         "    optimizer,\n",
419 |         "    train_data_loader,\n",
420 |         "    n_epochs,\n",
421 |         "    log_epochs=1,\n",
422 |         "    ):\n",
423 |         "\n",
424 |         "    model_fn.train()\n",
425 |         "    for epoch in range(n_epochs):\n",
426 |         "        total_loss = 0.0\n",
427 |         "        for i, (X_batch, y_batch) in enumerate(train_data_loader):\n",
428 |         "\n",
429 |         "            # move batches to device\n",
430 |         "            X_batch = X_batch.to(DEVICE, non_blocking=True)\n",
431 |         "            y_batch = y_batch.to(DEVICE, non_blocking=True)\n",
432 |         "\n",
433 |         "            # forward pass\n",
434 |         "            y_pred = model_fn(X_batch)\n",
435 |         "            train_loss = criterion(y_pred, y_batch)\n",
436 |         "            total_loss += train_loss.item()\n",
437 |         "\n",
438 |         "            # backward pass\n",
439 |         "            train_loss.backward()\n",
440 |         "\n",
441 |         "            # gradient descent step\n",
442 |         "            optimizer.step()\n",
443 |         "            optimizer.zero_grad()\n",
444 |         "\n",
445 |         "        average_loss = total_loss / len(train_data_loader)\n",
446 |         "\n",
447 |         "        if (epoch + 1) % log_epochs == 0:\n",
448 |         "            print(f\"Epoch {epoch + 1}/{n_epochs}, Training Loss: {average_loss: .4f}\")\n"
449 |       ],
450 |       "metadata": {
451 |         "id": "2L5czsBeXEol"
452 |       },
453 |       "execution_count": null,
454 |       "outputs": []
455 |     },
456 |     {
457 |       "cell_type": "markdown",
458 |       "source": [
459 |         "### Defining a model"
460 |       ],
461 |       "metadata": {
462 |         "id": "FPW8ZSVFctg_"
463 |       }
464 |     },
465 |     {
466 |       "cell_type": "code",
467 |       "source": [
468 |         "_ = torch.manual_seed(42)\n",
469 |         "\n",
470 |         "n_features = X_train.size(1)\n",
471 |         "n_classes = y_train.unique().size(0)\n",
472 |         "\n",
473 |         "covtype_model = nn.Sequential(\n",
474 |         "    nn.Linear(\n",
475 |         "        in_features=n_features,\n",
476 |         "        out_features=200,\n",
477 |         "        bias=True,\n",
478 |         "    ),\n",
479 |         "    nn.ReLU(),\n",
480 |         "    nn.Linear(\n",
481 |         "        in_features=200,\n",
482 |         "        out_features=100,\n",
483 |         "        bias=True,\n",
484 |         "    ),\n",
485 |         "    nn.ReLU(),\n",
486 |         "    nn.Linear(\n",
487 |         "        in_features=100,\n",
488 |         "        out_features=50,\n",
489 |         "        bias=True,\n",
490 |         "    ),\n",
491 |         "    nn.ReLU(),\n",
492 |         "    nn.Linear(\n",
493 |         "        in_features=50,\n",
494 |         "        out_features=n_classes,\n",
495 |         "        bias=True,\n",
496 |         "    ),\n",
497 |         ")\n",
498 |         "\n",
499 |         "# move model to the GPU before defining your optimizer!\n",
500 |         "covtype_model = covtype_model.to(DEVICE)"
501 |       ],
502 |       "metadata": {
503 |         "id": "JeWCfe_8YQXk"
504 |       },
505 |       "execution_count": null,
506 |       "outputs": []
507 |     },
508 |     {
509 |       "cell_type": "markdown",
510 |       "source": [
511 |         "### Defining a loss function and optimizer"
512 |       ],
513 |       "metadata": {
514 |         "id": "hROYf7i4oq1N"
515 |       }
516 |     },
517 |     {
518 |       "cell_type": "code",
519 |       "source": [
520 |         "cross_entropy_loss = nn.CrossEntropyLoss()\n",
521 |         "\n",
522 |         "# optimizer should be defined after moving model to GPU\n",
523 |         "sgd = optim.SGD(\n",
524 |         "    covtype_model.parameters(),\n",
525 |         "    lr=1e-3\n",
526 |         ")"
527 |       ],
528 |       "metadata": {
529 |         "id": "b9wXDokIotpO"
530 |       },
531 |       "execution_count": null,
532 |       "outputs": []
533 |     },
534 |     {
535 |       "cell_type": "markdown",
536 |       "source": [
537 |         "### Training the model"
538 |       ],
539 |       "metadata": {
540 |         "id": "ogxsMdXtvoX-"
541 |       }
542 |     },
543 |     {
544 |       "cell_type": "code",
545 |       "source": [
546 |         "train(\n",
547 |         "    covtype_model,\n",
548 |         "    cross_entropy_loss,\n",
549 |         "    sgd,\n",
550 |         "    train_data_loader,\n",
551 |         "    n_epochs=10,\n",
552 |         "    log_epochs=1,\n",
553 |         ")"
554 |       ],
555 |       "metadata": {
556 |         "id": "BYg7gSsSYaRf"
557 |       },
558 |       "execution_count": null,
559 |       "outputs": []
560 |     },
561 |     {
562 |       "cell_type": "markdown",
563 |       "source": [
564 |         "## Evaluating trained model performance"
565 |       ],
566 |       "metadata": {
567 |         "id": "7LdQ_etAup2u"
568 |       }
569 |     },
570 |     {
571 |       "cell_type": "code",
572 |       "source": [
573 |         "def evaluate(model_fn, data_loader, metric_fn, aggregate_fn=torch.mean):\n",
574 |         "    model_fn.eval()\n",
575 |         "    metrics = []\n",
576 |         "    with torch.no_grad():\n",
577 |         "        for X_batch, y_batch in data_loader:\n",
578 |         "            X_batch, y_batch = X_batch.to(DEVICE), y_batch.to(DEVICE)\n",
579 |         "            y_pred = model_fn(X_batch)\n",
580 |         "            metric = metric_fn(y_pred, y_batch)\n",
581 |         "            metrics.append(metric)\n",
582 |         "    return aggregate_fn(torch.stack(metrics))\n"
583 |       ],
584 |       "metadata": {
585 |         "id": "5fMSBgDOuv36"
586 |       },
587 |       "execution_count": null,
588 |       "outputs": []
589 |     },
590 |     {
591 |       "cell_type": "code",
592 |       "source": [
593 |         "average_loss = evaluate(\n",
594 |         "    covtype_model,\n",
595 |         "    val_data_loader,\n",
596 |         "    cross_entropy_loss,\n",
597 |         "    aggregate_fn=torch.mean,\n",
598 |         ")\n",
599 |         "print(f\"Validation Loss {average_loss: .4f}\")"
600 |       ],
601 |       "metadata": {
602 |         "id": "Ry5JOPjJYdms"
603 |       },
604 |       "execution_count": null,
605 |       "outputs": []
606 |     },
607 |     {
608 |       "cell_type": "markdown",
609 |       "source": [
610 |         "## Using TorchMetrics\n",
611 |         "\n",
612 |         "[TorchMetrics](https://lightning.ai/docs/torchmetrics/stable/index.html) is an open-source library designed to provide a comprehensive and standardized collection of machine learning metrics for PyTorch. It is developed by [Lightning AI](https://lightning.ai/docs/pytorch/stable/) and offers a wide range of functional and module-based metrics for evaluating model performance.\n",
613 |         "\n",
614 |         "### Key features and benefits of TorchMetrics:\n",
615 |         "\n",
616 |         "* **Extensive Metric Collection:** It offers over 100 pre-built metric implementations covering various domains like classification, regression, object detection, segmentation, and NLP. Examples include Accuracy, Precision, Recall, F1-Score, AUROC, RMSE, R², BLEU, and more.\n",
617 |         "* **Standardized Interface:** Provides a consistent API for metric computation, reducing boilerplate code and enhancing reproducibility across different projects and models.\n",
618 |         "* **Distributed Training Compatibility:** Metrics are designed to work seamlessly with distributed training setups, including PyTorch's DistributedDataParallel (DDP), ensuring correct and efficient metric aggregation across multiple devices.\n",
619 |         "* **Incremental Computation:** Metrics can be updated incrementally with new batches of data, which is crucial for handling large datasets that might not fit into memory and for efficient computation within training loops.\n",
620 |         "* **Custom Metric Creation:** Offers an easy-to-use API for creating custom metrics tailored to specific needs, allowing users to extend the library's functionality.\n",
621 |         "* **Integration with PyTorch Lightning:** While usable with native PyTorch, TorchMetrics has full integration with PyTorch Lightning, simplifying metric logging and management within Lightning's training and validation loops.\n",
622 |         "* **Performance Optimization:** Designed with performance in mind, minimizing synchronization points between CPU and GPU during metric collection to avoid performance bottlenecks.\n",
623 |         "* **Visualization Capabilities:** Includes features for quickly visualizing metric performance, aiding in model analysis and debugging."
624 |       ],
625 |       "metadata": {
626 |         "id": "zJMrlhbpw0Ic"
627 |       }
628 |     },
629 |     {
630 |       "cell_type": "code",
631 |       "source": [
632 |         "def evaluate_tm(model_fn, data_loader, metric):\n",
633 |         "    model_fn.eval()\n",
634 |         "    metric.reset()  # reset the metric at the beginning\n",
635 |         "    with torch.no_grad():\n",
636 |         "        for X_batch, y_batch in data_loader:\n",
637 |         "            X_batch, y_batch = X_batch.to(DEVICE), y_batch.to(DEVICE)\n",
638 |         "            y_pred = model_fn(X_batch)\n",
639 |         "            metric.update(y_pred, y_batch)  # update it at each iteration\n",
640 |         "    return metric.compute()  # compute the final result at the end"
641 |       ],
642 |       "metadata": {
643 |         "id": "YeNlqBaEwYyA"
644 |       },
645 |       "execution_count": null,
646 |       "outputs": []
647 |     },
648 |     {
649 |       "cell_type": "code",
650 |       "source": [
651 |         "torchmetrics.Accuracy?"
652 |       ],
653 |       "metadata": {
654 |         "id": "RmVUaJH7y6Ie"
655 |       },
656 |       "execution_count": null,
657 |       "outputs": []
658 |     },
659 |     {
660 |       "cell_type": "code",
661 |       "source": [
662 |         "accuracy = (\n",
663 |         "    torchmetrics.Accuracy(\n",
664 |         "        num_classes=n_classes,\n",
665 |         "        task=\"multiclass\",\n",
666 |         "    ).to(DEVICE)\n",
667 |         ")\n",
668 |         "\n",
669 |         "average_accuracy = evaluate_tm(\n",
670 |         "    covtype_model,\n",
671 |         "    val_data_loader,\n",
672 |         "    accuracy,\n",
673 |         ")\n",
674 |         "print(f\"Validation Accuracy {average_accuracy: .4f}\")"
675 |       ],
676 |       "metadata": {
677 |         "id": "PP53MfeZyP7i"
678 |       },
679 |       "execution_count": null,
680 |       "outputs": []
681 |     },
682 |     {
683 |       "cell_type": "markdown",
684 |       "source": [
685 |         "## Modifying our training loop"
686 |       ],
687 |       "metadata": {
688 |         "id": "r_O0-nvlzSrt"
689 |       }
690 |     },
691 |     {
692 |       "cell_type": "code",
693 |       "source": [
694 |         "def train(\n",
695 |         "    model_fn,\n",
696 |         "    criterion,\n",
697 |         "    optimizer,\n",
698 |         "    metric,\n",
699 |         "    train_data_loader,\n",
700 |         "    val_data_loader,\n",
701 |         "    n_epochs,\n",
702 |         "    log_epochs=1,\n",
703 |         "    ):\n",
704 |         "\n",
705 |         "    history = {\n",
706 |         "        \"train_losses\": [],\n",
707 |         "        \"val_losses\": [],\n",
708 |         "        \"train_metrics\": [],\n",
709 |         "        \"val_metrics\": [],\n",
710 |         "    }\n",
711 |         "\n",
712 |         "    for epoch in range(n_epochs):\n",
713 |         "        total_train_loss = 0.0\n",
714 |         "        metric.reset()\n",
715 |         "        for i, (X_batch, y_batch) in enumerate(train_data_loader):\n",
716 |         "            model_fn.train()\n",
717 |         "\n",
718 |         "            # move batches to device\n",
719 |         "            X_batch = X_batch.to(DEVICE, non_blocking=True)\n",
720 |         "            y_batch = y_batch.to(DEVICE, non_blocking=True)\n",
721 |         "\n",
722 |         "            # forward pass\n",
723 |         "            y_pred = model_fn(X_batch)\n",
724 |         "            train_loss = criterion(y_pred, y_batch)\n",
725 |         "            total_train_loss += train_loss.item()\n",
726 |         "\n",
727 |         "            # backward pass\n",
728 |         "            train_loss.backward()\n",
729 |         "\n",
730 |         "            # gradient descent step\n",
731 |         "            optimizer.step()\n",
732 |         "            optimizer.zero_grad()\n",
733 |         "\n",
734 |         "            # update our metric\n",
735 |         "            metric.update(y_pred, y_batch)\n",
736 |         "\n",
737 |         "        # compute the average (across batches!) training loss\n",
738 |         "        average_train_loss = total_train_loss / len(train_data_loader)\n",
739 |         "        history[\"train_losses\"].append(average_train_loss)\n",
740 |         "\n",
741 |         "        # compute the average (across batches!) validation loss\n",
742 |         "        with torch.no_grad():\n",
743 |         "            model_fn.eval()\n",
744 |         "            total_val_loss = 0.0\n",
745 |         "            for X_batch, y_batch in val_data_loader:\n",
746 |         "                X_batch, y_batch = X_batch.to(DEVICE), y_batch.to(DEVICE)\n",
747 |         "                y_pred = model_fn(X_batch)\n",
748 |         "                val_loss = criterion(y_pred, y_batch)\n",
749 |         "                total_val_loss += val_loss.item()\n",
750 |         "            average_val_loss = total_val_loss / len(val_data_loader)\n",
751 |         "            history[\"val_losses\"].append(average_val_loss)\n",
752 |         "\n",
753 |         "        # compute the training metric after each epoch\n",
754 |         "        average_train_metric = (\n",
755 |         "            metric.compute()\n",
756 |         "                  .item()\n",
757 |         "        )\n",
758 |         "        history[\"train_metrics\"].append(average_train_metric)\n",
759 |         "\n",
760 |         "        # compute the validation metric after each epoch\n",
761 |         "        average_val_metric = evaluate_tm(\n",
762 |         "            model_fn,\n",
763 |         "            val_data_loader,\n",
764 |         "            metric,\n",
765 |         "        )\n",
766 |         "        history[\"val_metrics\"].append(average_val_metric)\n",
767 |         "\n",
768 |         "        if (epoch + 1) % log_epochs == 0:\n",
769 |         "            print(f\"Epoch {epoch + 1}/{n_epochs}, \"\n",
770 |         "                  f\"train loss: {history['train_losses'][-1]:.4f}, \"\n",
771 |         "                  f\"val loss: {history['val_losses'][-1]:.4f}, \"\n",
772 |         "                  f\"train metric: {history['train_metrics'][-1]:.4f}, \"\n",
773 |         "                  f\"val metric: {history['val_metrics'][-1]:.4f}\"\n",
774 |         "            )\n",
775 |         "\n",
776 |         "    return history\n",
777 |         "\n"
778 |       ],
779 |       "metadata": {
780 |         "id": "wyeDagaOywgs"
781 |       },
782 |       "execution_count": null,
783 |       "outputs": []
784 |     },
785 |     {
786 |       "cell_type": "markdown",
787 |       "source": [
788 |         "## Combining training and evaluation"
789 |       ],
790 |       "metadata": {
791 |         "id": "YLNcmbPv6MQJ"
792 |       }
793 |     },
794 |     {
795 |       "cell_type": "code",
796 |       "source": [
797 |         "_ = torch.manual_seed(42)\n",
798 |         "\n",
799 |         "n_features = X_train.size(1)\n",
800 |         "n_classes = y_train.unique().size(0)\n",
801 |         "\n",
802 |         "covtype_model = nn.Sequential(\n",
803 |         "    nn.Linear(\n",
804 |         "        in_features=n_features,\n",
805 |         "        out_features=200,\n",
806 |         "        bias=True,\n",
807 |         "    ),\n",
808 |         "    nn.ReLU(),\n",
809 |         "    nn.Linear(\n",
810 |         "        in_features=200,\n",
811 |         "        out_features=100,\n",
812 |         "        bias=True,\n",
813 |         "    ),\n",
814 |         "    nn.ReLU(),\n",
815 |         "    nn.Linear(\n",
816 |         "        in_features=100,\n",
817 |         "        out_features=50,\n",
818 |         "        bias=True,\n",
819 |         "    ),\n",
820 |         "    nn.ReLU(),\n",
821 |         "    nn.Linear(\n",
822 |         "        in_features=50,\n",
823 |         "        out_features=n_classes,\n",
824 |         "        bias=True,\n",
825 |         "    ),\n",
826 |         ")\n",
827 |         "\n",
828 |         "# move model to the GPU before defining your optimizer!\n",
829 |         "covtype_model = covtype_model.to(DEVICE)\n",
830 |         "\n",
831 |         "cross_entropy_loss = nn.CrossEntropyLoss()\n",
832 |         "\n",
833 |         "# optimizer should be defined after moving model to GPU\n",
834 |         "sgd = optim.SGD(\n",
835 |         "    covtype_model.parameters(),\n",
836 |         "    lr=1e-3\n",
837 |         ")\n",
838 |         "\n",
839 |         "# define metric\n",
840 |         "accuracy = (\n",
841 |         "    torchmetrics.Accuracy(\n",
842 |         "        num_classes=n_classes,\n",
843 |         "        task=\"multiclass\",\n",
844 |         "    ).to(DEVICE)\n",
845 |         ")\n",
846 |         "\n",
847 |         "history = train(\n",
848 |         "    covtype_model,\n",
849 |         "    cross_entropy_loss,\n",
850 |         "    sgd,\n",
851 |         "    accuracy,\n",
852 |         "    train_data_loader,\n",
853 |         "    val_data_loader,\n",
854 |         "    n_epochs=10,\n",
855 |         "    log_epochs=1,\n",
856 |         ")"
857 |       ],
858 |       "metadata": {
859 |         "id": "gD2xr1Px3W6Z"
860 |       },
861 |       "execution_count": null,
862 |       "outputs": []
863 |     },
864 |     {
865 |       "cell_type": "code",
866 |       "source": [],
867 |       "metadata": {
868 |         "id": "Ekv-tHhG37dq"
869 |       },
870 |       "execution_count": null,
871 |       "outputs": []
872 |     }
873 |   ]
874 | }


--------------------------------------------------------------------------------