├── LICENSE
├── README.md
├── after
│   ├── conf
│   │   ├── config.yaml
│   │   └── files
│   │       └── mnist.yaml
│   ├── config.py
│   ├── ds
│   │   ├── __init__.py
│   │   ├── dataset.py
│   │   ├── load_data.py
│   │   ├── metrics.py
│   │   ├── models.py
│   │   ├── runner.py
│   │   ├── tensorboard.py
│   │   ├── tracking.py
│   │   └── utils.py
│   ├── main.py
│   └── parse_raw_data.py
├── before
│   ├── ds
│   │   ├── __init__.py
│   │   ├── dataset.py
│   │   ├── load_data.py
│   │   ├── metrics.py
│   │   ├── models.py
│   │   ├── runner.py
│   │   ├── tensorboard.py
│   │   ├── tracking.py
│   │   └── utils.py
│   ├── main.py
│   ├── parse_raw_data.py
│   └── requirements.txt
├── data
│   ├── README.md
│   └── raw
│       ├── t10k-images-idx3-ubyte.gz
│       ├── t10k-labels-idx1-ubyte.gz
│       ├── train-images-idx3-ubyte.gz
│       └── train-labels-idx1-ubyte.gz
└── requirements.txt
/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 ArjanCodes and Mark Todisco 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Configuration Management For Data Science Made Easy With Hydra 2 | 3 | This repository contains the example code for the video on Hydra and configuration management. Here's the link to the video: https://youtu.be/tEsPyYnzt8s. 
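As a quick reference (not part of the original code), Hydra lets you override any value from `after/conf/config.yaml` on the command line. A minimal sketch, assuming you run from the `after` folder with the requirements installed:

```bash
# Run with the defaults from conf/config.yaml
python main.py

# Override individual hyperparameters for a single run
python main.py params.lr=1e-4 params.batch_size=64

# Sweep over several values with Hydra's multirun mode
python main.py --multirun params.lr=1e-3,1e-5
```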
4 | -------------------------------------------------------------------------------- /after/conf/config.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - files: mnist 3 | - _self_ 4 | paths: 5 | log: ./runs 6 | data: ${hydra:runtime.cwd}/../data/raw 7 | params: 8 | epoch_count: 20 9 | lr: 5e-5 10 | batch_size: 128 11 | -------------------------------------------------------------------------------- /after/conf/files/mnist.yaml: -------------------------------------------------------------------------------- 1 | test_data: t10k-images-idx3-ubyte.gz 2 | test_labels: t10k-labels-idx1-ubyte.gz 3 | train_data: train-images-idx3-ubyte.gz 4 | train_labels: train-labels-idx1-ubyte.gz 5 | -------------------------------------------------------------------------------- /after/config.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | 4 | @dataclass 5 | class Paths: 6 | log: str 7 | data: str 8 | 9 | 10 | @dataclass 11 | class Files: 12 | train_data: str 13 | train_labels: str 14 | test_data: str 15 | test_labels: str 16 | 17 | 18 | @dataclass 19 | class Params: 20 | epoch_count: int 21 | lr: float 22 | batch_size: int 23 | 24 | 25 | @dataclass 26 | class MNISTConfig: 27 | paths: Paths 28 | files: Files 29 | params: Params 30 | -------------------------------------------------------------------------------- /after/ds/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArjanCodes/2021-config/7c2c3babb0fb66d69eac81590356fae512c5e784/after/ds/__init__.py -------------------------------------------------------------------------------- /after/ds/dataset.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Any 3 | 4 | import numpy as np 5 | import torch 6 | from torch.utils.data import DataLoader, Dataset 7 | 8 | from ds.load_data import load_image_data, load_label_data 9 | 10 | 11 | class MNIST(Dataset[Any]): 12 | idx: int # requested data index 13 | x: torch.Tensor 14 | y: torch.Tensor 15 | 16 | TRAIN_MAX = 255.0 17 | TRAIN_NORMALIZED_MEAN = 0.1306604762738429 18 | TRAIN_NORMALIZED_STDEV = 0.3081078038564622 19 | 20 | def __init__(self, data: np.ndarray, targets: np.ndarray): 21 | if len(data) != len(targets): 22 | raise ValueError( 23 | "data and targets must be the same length. 
" 24 | f"{len(data)} != {len(targets)}" 25 | ) 26 | 27 | self.data = data 28 | self.targets = targets 29 | 30 | def __len__(self): 31 | return len(self.data) 32 | 33 | def __getitem__(self, idx: int) -> tuple[torch.Tensor, torch.Tensor]: 34 | x = self.get_x(idx) 35 | y = self.get_y(idx) 36 | return x, y 37 | 38 | def get_x(self, idx: int): 39 | self.idx = idx 40 | self.preprocess_x() 41 | return self.x 42 | 43 | def preprocess_x(self): 44 | self.x = self.data[self.idx].copy().astype(np.float64) 45 | self.x /= self.TRAIN_MAX 46 | self.x -= self.TRAIN_NORMALIZED_MEAN 47 | self.x /= self.TRAIN_NORMALIZED_STDEV 48 | self.x = self.x.astype(np.float32) 49 | self.x = torch.from_numpy(self.x) 50 | self.x = self.x.unsqueeze(0) 51 | 52 | def get_y(self, idx: int): 53 | self.idx = idx 54 | self.preprocess_y() 55 | return self.y 56 | 57 | def preprocess_y(self): 58 | self.y = self.targets[self.idx] 59 | self.y = torch.tensor(self.y, dtype=torch.long) 60 | 61 | 62 | def create_dataloader( 63 | batch_size: int, 64 | root_path: str, 65 | data_file: str, 66 | label_file: str, 67 | shuffle: bool = True, 68 | ) -> DataLoader[Any]: 69 | data_path = Path(f"{root_path}/{data_file}") 70 | label_path = Path(f"{root_path}/{label_file}") 71 | data = load_image_data(data_path) 72 | label_data = load_label_data(label_path) 73 | return DataLoader( 74 | dataset=MNIST(data, label_data), 75 | batch_size=batch_size, 76 | shuffle=shuffle, 77 | num_workers=0, 78 | ) 79 | -------------------------------------------------------------------------------- /after/ds/load_data.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import struct 3 | from pathlib import Path 4 | 5 | import numpy as np 6 | 7 | ALLOWED_TYPES = { 8 | "UNSIGNED_BYTE": b"\x08", 9 | "SIGNED_BYTE": b"\x09", 10 | "SHORT": b"\x0B", 11 | "INT": b"\x0C", 12 | "SINGLE": b"\x0D", 13 | "DOUBLE": b"\x0E", 14 | } 15 | 16 | 17 | def load_image_data(file_path: Path) -> np.ndarray: 18 | with gzip.open(file_path, "rb") as fp: 19 | _ = struct.unpack(">H", fp.read(2)) # dump padding bytes 20 | 21 | (data_type,) = struct.unpack(">c", fp.read(1)) 22 | assert data_type == ALLOWED_TYPES["UNSIGNED_BYTE"] 23 | 24 | number_of_dimensions = ord(struct.unpack(">c", fp.read(1))[0]) 25 | assert number_of_dimensions == 3 26 | 27 | (num_images,) = struct.unpack(">I", fp.read(4)) 28 | (num_rows,) = struct.unpack(">I", fp.read(4)) 29 | (num_cols,) = struct.unpack(">I", fp.read(4)) 30 | 31 | raw = fp.read() 32 | assert len(raw) == num_images * num_rows * num_cols 33 | 34 | data: np.ndarray = np.frombuffer(raw, dtype=np.dtype(np.uint8).newbyteorder(">")) 35 | data = data.reshape((num_images, num_rows, num_cols)) 36 | return data 37 | 38 | 39 | def load_label_data(file_path: Path) -> np.ndarray: 40 | with gzip.open(file_path, "rb") as fp: 41 | _ = struct.unpack(">H", fp.read(2)) # dump padding bytes 42 | 43 | (data_type,) = struct.unpack(">c", fp.read(1)) 44 | assert data_type == ALLOWED_TYPES["UNSIGNED_BYTE"] 45 | 46 | number_of_dimensions = ord(struct.unpack(">c", fp.read(1))[0]) 47 | assert number_of_dimensions == 1 48 | 49 | (num_images,) = struct.unpack(">I", fp.read(4)) 50 | 51 | raw = fp.read() 52 | assert len(raw) == num_images 53 | 54 | data = np.frombuffer(raw, dtype=np.dtype(np.uint8).newbyteorder(">")) 55 | return data 56 | -------------------------------------------------------------------------------- /after/ds/metrics.py: -------------------------------------------------------------------------------- 1 | from dataclasses 
import dataclass, field 2 | 3 | 4 | @dataclass 5 | class Metric: 6 | values: list[float] = field(default_factory=list) 7 | running_total: float = 0.0 8 | num_updates: float = 0.0 9 | average: float = 0.0 10 | 11 | def update(self, value: float, batch_size: int): 12 | self.values.append(value) 13 | self.running_total += value * batch_size 14 | self.num_updates += batch_size 15 | self.average = self.running_total / self.num_updates 16 | -------------------------------------------------------------------------------- /after/ds/models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class LinearNet(torch.nn.Module): 5 | def __init__(self): 6 | super().__init__() 7 | 8 | self.network = torch.nn.Sequential( 9 | torch.nn.Flatten(), 10 | torch.nn.Linear(in_features=28 * 28, out_features=32), 11 | torch.nn.ReLU(), 12 | torch.nn.Linear(in_features=32, out_features=10), 13 | torch.nn.Softmax(dim=1), 14 | ) 15 | 16 | def forward(self, x: torch.Tensor) -> torch.Tensor: 17 | return self.network(x) 18 | -------------------------------------------------------------------------------- /after/ds/runner.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Optional 2 | 3 | import numpy as np 4 | import torch 5 | from sklearn.metrics import accuracy_score 6 | from torch.utils.data.dataloader import DataLoader 7 | from tqdm import tqdm 8 | 9 | from ds.metrics import Metric 10 | from ds.tracking import ExperimentTracker, Stage 11 | 12 | 13 | class Runner: 14 | def __init__( 15 | self, 16 | loader: DataLoader[Any], 17 | model: torch.nn.Module, 18 | optimizer: Optional[torch.optim.Optimizer] = None, 19 | ) -> None: 20 | self.run_count = 0 21 | self.loader = loader 22 | self.accuracy_metric = Metric() 23 | self.model = model 24 | self.optimizer = optimizer 25 | # Objective (loss) function 26 | self.compute_loss = torch.nn.CrossEntropyLoss(reduction="mean") 27 | self.y_true_batches: list[list[Any]] = [] 28 | self.y_pred_batches: list[list[Any]] = [] 29 | # Assume Stage based on presence of optimizer 30 | self.stage = Stage.VAL if optimizer is None else Stage.TRAIN 31 | 32 | @property 33 | def avg_accuracy(self): 34 | return self.accuracy_metric.average 35 | 36 | def run(self, desc: str, experiment: ExperimentTracker): 37 | self.model.train(self.stage is Stage.TRAIN) 38 | 39 | for x, y in tqdm(self.loader, desc=desc, ncols=80): 40 | loss, batch_accuracy = self._run_single(x, y) 41 | 42 | experiment.add_batch_metric("accuracy", batch_accuracy, self.run_count) 43 | 44 | if self.optimizer: 45 | # Reverse-mode AutoDiff (backpropagation) 46 | self.optimizer.zero_grad() 47 | loss.backward() 48 | self.optimizer.step() 49 | 50 | def _run_single(self, x: Any, y: Any): 51 | self.run_count += 1 52 | batch_size: int = x.shape[0] 53 | prediction = self.model(x) 54 | loss = self.compute_loss(prediction, y) 55 | 56 | # Compute Batch Validation Metrics 57 | y_np = y.detach().numpy() 58 | y_prediction_np = np.argmax(prediction.detach().numpy(), axis=1) 59 | batch_accuracy: float = accuracy_score(y_np, y_prediction_np) 60 | self.accuracy_metric.update(batch_accuracy, batch_size) 61 | 62 | self.y_true_batches += [y_np] 63 | self.y_pred_batches += [y_prediction_np] 64 | return loss, batch_accuracy 65 | 66 | def reset(self): 67 | self.accuracy_metric = Metric() 68 | self.y_true_batches = [] 69 | self.y_pred_batches = [] 70 | 71 | 72 | def run_epoch( 73 | test_runner: Runner, 74 | train_runner: Runner, 75 | 
experiment: ExperimentTracker, 76 | epoch_id: int, 77 | ): 78 | # Training Loop 79 | experiment.set_stage(Stage.TRAIN) 80 | train_runner.run("Train Batches", experiment) 81 | 82 | # Log Training Epoch Metrics 83 | experiment.add_epoch_metric("accuracy", train_runner.avg_accuracy, epoch_id) 84 | 85 | # Testing Loop 86 | experiment.set_stage(Stage.VAL) 87 | test_runner.run("Validation Batches", experiment) 88 | 89 | # Log Validation Epoch Metrics 90 | experiment.add_epoch_metric("accuracy", test_runner.avg_accuracy, epoch_id) 91 | experiment.add_epoch_confusion_matrix( 92 | test_runner.y_true_batches, test_runner.y_pred_batches, epoch_id 93 | ) 94 | -------------------------------------------------------------------------------- /after/ds/tensorboard.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import numpy as np 4 | from matplotlib import pyplot as plt 5 | from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix 6 | from torch.utils.tensorboard import SummaryWriter 7 | 8 | from ds.tracking import Stage 9 | from ds.utils import create_experiment_log_dir 10 | 11 | 12 | class TensorboardExperiment: 13 | def __init__(self, log_path: str, create: bool = True): 14 | 15 | log_dir = create_experiment_log_dir(root=log_path) 16 | self.stage = Stage.TRAIN 17 | self._validate_log_dir(log_dir, create=create) 18 | self._writer = SummaryWriter(log_dir=log_dir) 19 | plt.ioff() 20 | 21 | def set_stage(self, stage: Stage): 22 | self.stage = stage 23 | 24 | def flush(self): 25 | self._writer.flush() 26 | 27 | @staticmethod 28 | def _validate_log_dir(log_dir: str, create: bool = True): 29 | log_path = Path(log_dir).resolve() 30 | if log_path.exists(): 31 | return 32 | elif not log_path.exists() and create: 33 | log_path.mkdir(parents=True) 34 | else: 35 | raise NotADirectoryError(f"log_dir {log_dir} does not exist.") 36 | 37 | def add_batch_metric(self, name: str, value: float, step: int): 38 | tag = f"{self.stage.name}/batch/{name}" 39 | self._writer.add_scalar(tag, value, step) 40 | 41 | def add_epoch_metric(self, name: str, value: float, step: int): 42 | tag = f"{self.stage.name}/epoch/{name}" 43 | self._writer.add_scalar(tag, value, step) 44 | 45 | def add_epoch_confusion_matrix( 46 | self, y_true: list[np.array], y_pred: list[np.array], step: int 47 | ): 48 | y_true, y_pred = self.collapse_batches(y_true, y_pred) 49 | fig = self.create_confusion_matrix(y_true, y_pred, step) 50 | tag = f"{self.stage.name}/epoch/confusion_matrix" 51 | self._writer.add_figure(tag, fig, step) 52 | 53 | @staticmethod 54 | def collapse_batches( 55 | y_true: list[np.array], y_pred: list[np.array] 56 | ) -> tuple[np.ndarray, np.ndarray]: 57 | return np.concatenate(y_true), np.concatenate(y_pred) 58 | 59 | def create_confusion_matrix( 60 | self, y_true: list[np.array], y_pred: list[np.array], step: int 61 | ) -> plt.Figure: 62 | cm = ConfusionMatrixDisplay(confusion_matrix(y_true, y_pred)).plot(cmap="Blues") 63 | cm.ax_.set_title(f"{self.stage.name} Epoch: {step}") 64 | return cm.figure_ 65 | -------------------------------------------------------------------------------- /after/ds/tracking.py: -------------------------------------------------------------------------------- 1 | from enum import Enum, auto 2 | from pathlib import Path 3 | from typing import Protocol 4 | 5 | import numpy as np 6 | from matplotlib import pyplot as plt 7 | from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix 8 | from torch.utils.tensorboard import 
SummaryWriter 9 | 10 | from ds.utils import create_experiment_log_dir 11 | 12 | 13 | class Stage(Enum): 14 | TRAIN = auto() 15 | TEST = auto() 16 | VAL = auto() 17 | 18 | 19 | class ExperimentTracker(Protocol): 20 | def set_stage(self, stage: Stage): 21 | """Sets the current stage of the experiment.""" 22 | 23 | def add_batch_metric(self, name: str, value: float, step: int): 24 | """Implements logging a batch-level metric.""" 25 | 26 | def add_epoch_metric(self, name: str, value: float, step: int): 27 | """Implements logging an epoch-level metric.""" 28 | 29 | def add_epoch_confusion_matrix( 30 | self, y_true: list[np.array], y_pred: list[np.array], step: int 31 | ): 32 | """Implements logging a confusion matrix at the epoch level.""" 33 | 34 | 35 | class TensorboardExperiment: 36 | def __init__(self, log_path: str, create: bool = True): 37 | self.stage = Stage.TRAIN 38 | self._writer = SummaryWriter( 39 | log_dir=create_experiment_log_dir(log_path, parents=True) 40 | ) 41 | plt.ioff() 42 | 43 | def set_stage(self, stage: Stage): 44 | self.stage = stage 45 | 46 | def flush(self): 47 | self._writer.flush() 48 | 49 | @staticmethod 50 | def _validate_log_dir(log_dir: str, create: bool = True): 51 | log_path = Path(log_dir).resolve() 52 | if log_path.exists(): 53 | return 54 | elif not log_path.exists() and create: 55 | log_path.mkdir(parents=True) 56 | else: 57 | raise NotADirectoryError(f"log_dir {log_dir} does not exist.") 58 | 59 | def add_batch_metric(self, name: str, value: float, step: int): 60 | tag = f"{self.stage.name}/batch/{name}" 61 | self._writer.add_scalar(tag, value, step) 62 | 63 | def add_epoch_metric(self, name: str, value: float, step: int): 64 | tag = f"{self.stage.name}/epoch/{name}" 65 | self._writer.add_scalar(tag, value, step) 66 | 67 | def add_epoch_confusion_matrix( 68 | self, y_true: list[np.array], y_pred: list[np.array], step: int 69 | ): 70 | y_true, y_pred = self.collapse_batches(y_true, y_pred) 71 | fig = self.create_confusion_matrix(y_true, y_pred, step) 72 | tag = f"{self.stage.name}/epoch/confusion_matrix" 73 | self._writer.add_figure(tag, fig, step) 74 | 75 | @staticmethod 76 | def collapse_batches( 77 | y_true: list[np.array], y_pred: list[np.array] 78 | ) -> tuple[np.ndarray, np.ndarray]: 79 | return np.concatenate(y_true), np.concatenate(y_pred) 80 | 81 | def create_confusion_matrix( 82 | self, y_true: list[np.array], y_pred: list[np.array], step: int 83 | ) -> plt.Figure: 84 | cm = ConfusionMatrixDisplay(confusion_matrix(y_true, y_pred)).plot(cmap="Blues") 85 | cm.ax_.set_title(f"{self.stage.name} Epoch: {step}") 86 | return cm.figure_ 87 | -------------------------------------------------------------------------------- /after/ds/utils.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | 3 | 4 | def create_experiment_log_dir(root: str, parents: bool = True) -> str: 5 | root_path = pathlib.Path(root).resolve() 6 | child = ( 7 | create_from_missing(root_path) 8 | if not root_path.exists() 9 | else create_from_existing(root_path) 10 | ) 11 | child.mkdir(parents=parents) 12 | return child.as_posix() 13 | 14 | 15 | def create_from_missing(root: pathlib.Path) -> pathlib.Path: 16 | return root / "0" 17 | 18 | 19 | def create_from_existing(root: pathlib.Path) -> pathlib.Path: 20 | children = [ 21 | int(c.name) for c in root.glob("*") 22 | if (c.is_dir() and c.name.isnumeric()) 23 | ] 24 | if is_first_experiment(children): 25 | child = create_from_missing(root) 26 | else: 27 | child = root / 
increment_experiment_number(children) 28 | return child 29 | 30 | 31 | def is_first_experiment(children: list[int]) -> bool: 32 | return len(children) == 0 33 | 34 | 35 | def increment_experiment_number(children: list[int]) -> str: 36 | return str(max(children) + 1) 37 | -------------------------------------------------------------------------------- /after/main.py: -------------------------------------------------------------------------------- 1 | import hydra 2 | import torch 3 | from hydra.core.config_store import ConfigStore 4 | from omegaconf import OmegaConf 5 | 6 | from config import MNISTConfig 7 | from ds.dataset import create_dataloader 8 | from ds.models import LinearNet 9 | from ds.runner import Runner, run_epoch 10 | from ds.tracking import TensorboardExperiment 11 | 12 | cs = ConfigStore.instance() 13 | cs.store(name="mnist_config", node=MNISTConfig) 14 | 15 | 16 | @hydra.main(config_path="conf", config_name="config") 17 | def main(cfg: MNISTConfig) -> None: 18 | print(OmegaConf.to_yaml(cfg)) 19 | 20 | # Model and Optimizer 21 | model = LinearNet() 22 | optimizer = torch.optim.Adam(model.parameters(), lr=cfg.params.lr) 23 | 24 | # Create the data loaders 25 | 26 | test_loader = create_dataloader( 27 | batch_size=cfg.params.batch_size, 28 | root_path=cfg.paths.data, 29 | data_file=cfg.files.test_data, 30 | label_file=cfg.files.test_labels, 31 | ) 32 | train_loader = create_dataloader( 33 | batch_size=cfg.params.batch_size, 34 | root_path=cfg.paths.data, 35 | data_file=cfg.files.train_data, 36 | label_file=cfg.files.train_labels, 37 | ) 38 | 39 | # Create the runners 40 | test_runner = Runner(test_loader, model) 41 | train_runner = Runner(train_loader, model, optimizer) 42 | 43 | # Set up the experiment tracker 44 | tracker = TensorboardExperiment(log_path=cfg.paths.log) 45 | 46 | # Run the epochs 47 | for epoch_id in range(cfg.params.epoch_count): 48 | run_epoch(test_runner, train_runner, tracker, epoch_id) 49 | 50 | # Compute Average Epoch Metrics 51 | summary = ", ".join( 52 | [ 53 | f"[Epoch: {epoch_id + 1}/{cfg.params.epoch_count}]", 54 | f"Test Accuracy: {test_runner.avg_accuracy: 0.4f}", 55 | f"Train Accuracy: {train_runner.avg_accuracy: 0.4f}", 56 | ] 57 | ) 58 | print("\n" + summary + "\n") 59 | 60 | # Reset the runners 61 | train_runner.reset() 62 | test_runner.reset() 63 | 64 | # Flush the tracker after every epoch for live updates 65 | tracker.flush() 66 | 67 | 68 | if __name__ == "__main__": 69 | main() 70 | -------------------------------------------------------------------------------- /after/parse_raw_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import pathlib 4 | from typing import Iterable, Tuple 5 | 6 | import numpy as np 7 | from PIL import Image 8 | from tqdm import tqdm 9 | 10 | from ds.load_data import load_image_data, load_label_data 11 | 12 | RAW_DATA = "./data/raw" 13 | TEST_DATA_RAW = pathlib.Path(f"{RAW_DATA}/t10k-images-idx3-ubyte.gz") 14 | TEST_LABELS_RAW = pathlib.Path(f"{RAW_DATA}/t10k-labels-idx1-ubyte.gz") 15 | TRAIN_DATA_RAW = pathlib.Path(f"{RAW_DATA}/train-images-idx3-ubyte.gz") 16 | TRAIN_LABELS_RAW = pathlib.Path(f"{RAW_DATA}/train-labels-idx1-ubyte.gz") 17 | 18 | PROCESSED_DATA = './data/processed' 19 | TEST_DIR_PROCESSED = pathlib.Path(f"{PROCESSED_DATA}/test") 20 | TRAIN_DIR_PROCESSED = pathlib.Path(f"{PROCESSED_DATA}/train") 21 | 22 | 23 | def main(): 24 | make_tree(TRAIN_DIR_PROCESSED, reset=True) 25 | make_tree(TEST_DIR_PROCESSED, reset=True) 26 | 27 | 
save_dataset_to_png( 28 | TRAIN_DIR_PROCESSED, 29 | zip(load_image_data(TRAIN_DATA_RAW), load_label_data(TRAIN_LABELS_RAW)) 30 | ) 31 | save_dataset_to_png( 32 | TEST_DIR_PROCESSED, 33 | zip(load_image_data(TEST_DATA_RAW), load_label_data(TEST_LABELS_RAW)) 34 | ) 35 | 36 | 37 | def make_tree(root: pathlib.Path, reset: bool = False) -> None: 38 | if reset: 39 | reset_tree(root) 40 | for child in range(10): 41 | child = pathlib.Path(str(child)) 42 | if not (root / child).exists(): 43 | os.makedirs(root / child) 44 | 45 | 46 | def reset_tree(root: pathlib.Path) -> None: 47 | print('Resetting tree.') 48 | shutil.rmtree(root, ignore_errors=True) 49 | 50 | 51 | def save_dataset_to_png( 52 | root: pathlib.Path, array: Iterable[Tuple[np.ndarray, np.ndarray]] 53 | ) -> None: 54 | for i, xy in enumerate(tqdm(tuple(array), ncols=80)): 55 | save_xy_to_png(root, xy, str(i)) 56 | 57 | 58 | def save_xy_to_png( 59 | root: pathlib.Path, xy: Tuple[np.ndarray, np.ndarray], name: str 60 | ) -> None: 61 | x, y = xy 62 | Image.fromarray(x).save(root / str(int(y)) / f'{name}.png') 63 | 64 | 65 | if __name__ == '__main__': 66 | main() 67 | -------------------------------------------------------------------------------- /before/ds/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArjanCodes/2021-config/7c2c3babb0fb66d69eac81590356fae512c5e784/before/ds/__init__.py -------------------------------------------------------------------------------- /before/ds/dataset.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Any 3 | 4 | import numpy as np 5 | import torch 6 | from torch.utils.data import DataLoader, Dataset 7 | 8 | from ds.load_data import load_image_data, load_label_data 9 | 10 | 11 | class MNIST(Dataset[Any]): 12 | idx: int # requested data index 13 | x: torch.Tensor 14 | y: torch.Tensor 15 | 16 | TRAIN_MAX = 255.0 17 | TRAIN_NORMALIZED_MEAN = 0.1306604762738429 18 | TRAIN_NORMALIZED_STDEV = 0.3081078038564622 19 | 20 | def __init__(self, data: np.ndarray, targets: np.ndarray): 21 | if len(data) != len(targets): 22 | raise ValueError( 23 | "data and targets must be the same length. 
" 24 | f"{len(data)} != {len(targets)}" 25 | ) 26 | 27 | self.data = data 28 | self.targets = targets 29 | 30 | def __len__(self): 31 | return len(self.data) 32 | 33 | def __getitem__(self, idx: int) -> tuple[torch.Tensor, torch.Tensor]: 34 | x = self.get_x(idx) 35 | y = self.get_y(idx) 36 | return x, y 37 | 38 | def get_x(self, idx: int): 39 | self.idx = idx 40 | self.preprocess_x() 41 | return self.x 42 | 43 | def preprocess_x(self): 44 | self.x = self.data[self.idx].copy().astype(np.float64) 45 | self.x /= self.TRAIN_MAX 46 | self.x -= self.TRAIN_NORMALIZED_MEAN 47 | self.x /= self.TRAIN_NORMALIZED_STDEV 48 | self.x = self.x.astype(np.float32) 49 | self.x = torch.from_numpy(self.x) 50 | self.x = self.x.unsqueeze(0) 51 | 52 | def get_y(self, idx: int): 53 | self.idx = idx 54 | self.preprocess_y() 55 | return self.y 56 | 57 | def preprocess_y(self): 58 | self.y = self.targets[self.idx] 59 | self.y = torch.tensor(self.y, dtype=torch.long) 60 | 61 | 62 | def create_dataloader( 63 | batch_size: int, data_path: Path, label_path: Path, shuffle: bool = True 64 | ) -> DataLoader[Any]: 65 | data = load_image_data(data_path) 66 | label_data = load_label_data(label_path) 67 | return DataLoader( 68 | dataset=MNIST(data, label_data), 69 | batch_size=batch_size, 70 | shuffle=shuffle, 71 | num_workers=0, 72 | ) 73 | -------------------------------------------------------------------------------- /before/ds/load_data.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import struct 3 | from pathlib import Path 4 | 5 | import numpy as np 6 | 7 | ALLOWED_TYPES = { 8 | "UNSIGNED_BYTE": b"\x08", 9 | "SIGNED_BYTE": b"\x09", 10 | "SHORT": b"\x0B", 11 | "INT": b"\x0C", 12 | "SINGLE": b"\x0D", 13 | "DOUBLE": b"\x0E", 14 | } 15 | 16 | 17 | def load_image_data(file_path: Path) -> np.ndarray: 18 | with gzip.open(file_path, "rb") as fp: 19 | _ = struct.unpack(">H", fp.read(2)) # dump padding bytes 20 | 21 | (data_type,) = struct.unpack(">c", fp.read(1)) 22 | assert data_type == ALLOWED_TYPES["UNSIGNED_BYTE"] 23 | 24 | number_of_dimensions = ord(struct.unpack(">c", fp.read(1))[0]) 25 | assert number_of_dimensions == 3 26 | 27 | (num_images,) = struct.unpack(">I", fp.read(4)) 28 | (num_rows,) = struct.unpack(">I", fp.read(4)) 29 | (num_cols,) = struct.unpack(">I", fp.read(4)) 30 | 31 | raw = fp.read() 32 | assert len(raw) == num_images * num_rows * num_cols 33 | 34 | data: np.ndarray = np.frombuffer(raw, dtype=np.dtype(np.uint8).newbyteorder(">")) 35 | data = data.reshape((num_images, num_rows, num_cols)) 36 | return data 37 | 38 | 39 | def load_label_data(file_path: Path) -> np.ndarray: 40 | with gzip.open(file_path, "rb") as fp: 41 | _ = struct.unpack(">H", fp.read(2)) # dump padding bytes 42 | 43 | (data_type,) = struct.unpack(">c", fp.read(1)) 44 | assert data_type == ALLOWED_TYPES["UNSIGNED_BYTE"] 45 | 46 | number_of_dimensions = ord(struct.unpack(">c", fp.read(1))[0]) 47 | assert number_of_dimensions == 1 48 | 49 | (num_images,) = struct.unpack(">I", fp.read(4)) 50 | 51 | raw = fp.read() 52 | assert len(raw) == num_images 53 | 54 | data = np.frombuffer(raw, dtype=np.dtype(np.uint8).newbyteorder(">")) 55 | return data 56 | -------------------------------------------------------------------------------- /before/ds/metrics.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | 3 | 4 | @dataclass 5 | class Metric: 6 | values: list[float] = field(default_factory=list) 7 | running_total: 
float = 0.0 8 | num_updates: float = 0.0 9 | average: float = 0.0 10 | 11 | def update(self, value: float, batch_size: int): 12 | self.values.append(value) 13 | self.running_total += value * batch_size 14 | self.num_updates += batch_size 15 | self.average = self.running_total / self.num_updates 16 | -------------------------------------------------------------------------------- /before/ds/models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class LinearNet(torch.nn.Module): 5 | def __init__(self): 6 | super().__init__() 7 | 8 | self.network = torch.nn.Sequential( 9 | torch.nn.Flatten(), 10 | torch.nn.Linear(in_features=28 * 28, out_features=32), 11 | torch.nn.ReLU(), 12 | torch.nn.Linear(in_features=32, out_features=10), 13 | torch.nn.Softmax(dim=1), 14 | ) 15 | 16 | def forward(self, x: torch.Tensor) -> torch.Tensor: 17 | return self.network(x) 18 | -------------------------------------------------------------------------------- /before/ds/runner.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Optional 2 | 3 | import numpy as np 4 | import torch 5 | from sklearn.metrics import accuracy_score 6 | from torch.utils.data.dataloader import DataLoader 7 | from tqdm import tqdm 8 | 9 | from ds.metrics import Metric 10 | from ds.tracking import ExperimentTracker, Stage 11 | 12 | 13 | class Runner: 14 | def __init__( 15 | self, 16 | loader: DataLoader[Any], 17 | model: torch.nn.Module, 18 | optimizer: Optional[torch.optim.Optimizer] = None, 19 | ) -> None: 20 | self.run_count = 0 21 | self.loader = loader 22 | self.accuracy_metric = Metric() 23 | self.model = model 24 | self.optimizer = optimizer 25 | # Objective (loss) function 26 | self.compute_loss = torch.nn.CrossEntropyLoss(reduction="mean") 27 | self.y_true_batches: list[list[Any]] = [] 28 | self.y_pred_batches: list[list[Any]] = [] 29 | # Assume Stage based on presence of optimizer 30 | self.stage = Stage.VAL if optimizer is None else Stage.TRAIN 31 | 32 | @property 33 | def avg_accuracy(self): 34 | return self.accuracy_metric.average 35 | 36 | def run(self, desc: str, experiment: ExperimentTracker): 37 | self.model.train(self.stage is Stage.TRAIN) 38 | 39 | for x, y in tqdm(self.loader, desc=desc, ncols=80): 40 | loss, batch_accuracy = self._run_single(x, y) 41 | 42 | experiment.add_batch_metric("accuracy", batch_accuracy, self.run_count) 43 | 44 | if self.optimizer: 45 | # Reverse-mode AutoDiff (backpropagation) 46 | self.optimizer.zero_grad() 47 | loss.backward() 48 | self.optimizer.step() 49 | 50 | def _run_single(self, x: Any, y: Any): 51 | self.run_count += 1 52 | batch_size: int = x.shape[0] 53 | prediction = self.model(x) 54 | loss = self.compute_loss(prediction, y) 55 | 56 | # Compute Batch Validation Metrics 57 | y_np = y.detach().numpy() 58 | y_prediction_np = np.argmax(prediction.detach().numpy(), axis=1) 59 | batch_accuracy: float = accuracy_score(y_np, y_prediction_np) 60 | self.accuracy_metric.update(batch_accuracy, batch_size) 61 | 62 | self.y_true_batches += [y_np] 63 | self.y_pred_batches += [y_prediction_np] 64 | return loss, batch_accuracy 65 | 66 | def reset(self): 67 | self.accuracy_metric = Metric() 68 | self.y_true_batches = [] 69 | self.y_pred_batches = [] 70 | 71 | 72 | def run_epoch( 73 | test_runner: Runner, 74 | train_runner: Runner, 75 | experiment: ExperimentTracker, 76 | epoch_id: int, 77 | ): 78 | # Training Loop 79 | experiment.set_stage(Stage.TRAIN) 80 | 
train_runner.run("Train Batches", experiment) 81 | 82 | # Log Training Epoch Metrics 83 | experiment.add_epoch_metric("accuracy", train_runner.avg_accuracy, epoch_id) 84 | 85 | # Testing Loop 86 | experiment.set_stage(Stage.VAL) 87 | test_runner.run("Validation Batches", experiment) 88 | 89 | # Log Validation Epoch Metrics 90 | experiment.add_epoch_metric("accuracy", test_runner.avg_accuracy, epoch_id) 91 | experiment.add_epoch_confusion_matrix( 92 | test_runner.y_true_batches, test_runner.y_pred_batches, epoch_id 93 | ) 94 | -------------------------------------------------------------------------------- /before/ds/tensorboard.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import numpy as np 4 | from matplotlib import pyplot as plt 5 | from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix 6 | from torch.utils.tensorboard import SummaryWriter 7 | 8 | from ds.tracking import Stage 9 | from ds.utils import create_experiment_log_dir 10 | 11 | 12 | class TensorboardExperiment: 13 | def __init__(self, log_path: str, create: bool = True): 14 | 15 | log_dir = create_experiment_log_dir(root=log_path) 16 | self.stage = Stage.TRAIN 17 | self._validate_log_dir(log_dir, create=create) 18 | self._writer = SummaryWriter(log_dir=log_dir) 19 | plt.ioff() 20 | 21 | def set_stage(self, stage: Stage): 22 | self.stage = stage 23 | 24 | def flush(self): 25 | self._writer.flush() 26 | 27 | @staticmethod 28 | def _validate_log_dir(log_dir: str, create: bool = True): 29 | log_path = Path(log_dir).resolve() 30 | if log_path.exists(): 31 | return 32 | elif not log_path.exists() and create: 33 | log_path.mkdir(parents=True) 34 | else: 35 | raise NotADirectoryError(f"log_dir {log_dir} does not exist.") 36 | 37 | def add_batch_metric(self, name: str, value: float, step: int): 38 | tag = f"{self.stage.name}/batch/{name}" 39 | self._writer.add_scalar(tag, value, step) 40 | 41 | def add_epoch_metric(self, name: str, value: float, step: int): 42 | tag = f"{self.stage.name}/epoch/{name}" 43 | self._writer.add_scalar(tag, value, step) 44 | 45 | def add_epoch_confusion_matrix( 46 | self, y_true: list[np.array], y_pred: list[np.array], step: int 47 | ): 48 | y_true, y_pred = self.collapse_batches(y_true, y_pred) 49 | fig = self.create_confusion_matrix(y_true, y_pred, step) 50 | tag = f"{self.stage.name}/epoch/confusion_matrix" 51 | self._writer.add_figure(tag, fig, step) 52 | 53 | @staticmethod 54 | def collapse_batches( 55 | y_true: list[np.array], y_pred: list[np.array] 56 | ) -> tuple[np.ndarray, np.ndarray]: 57 | return np.concatenate(y_true), np.concatenate(y_pred) 58 | 59 | def create_confusion_matrix( 60 | self, y_true: list[np.array], y_pred: list[np.array], step: int 61 | ) -> plt.Figure: 62 | cm = ConfusionMatrixDisplay(confusion_matrix(y_true, y_pred)).plot(cmap="Blues") 63 | cm.ax_.set_title(f"{self.stage.name} Epoch: {step}") 64 | return cm.figure_ 65 | -------------------------------------------------------------------------------- /before/ds/tracking.py: -------------------------------------------------------------------------------- 1 | from enum import Enum, auto 2 | from pathlib import Path 3 | from typing import Protocol 4 | 5 | import numpy as np 6 | from matplotlib import pyplot as plt 7 | from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix 8 | from torch.utils.tensorboard import SummaryWriter 9 | 10 | from ds.utils import create_experiment_log_dir 11 | 12 | 13 | class Stage(Enum): 14 | TRAIN = 
auto() 15 | TEST = auto() 16 | VAL = auto() 17 | 18 | 19 | class ExperimentTracker(Protocol): 20 | def set_stage(self, stage: Stage): 21 | """Sets the current stage of the experiment.""" 22 | 23 | def add_batch_metric(self, name: str, value: float, step: int): 24 | """Implements logging a batch-level metric.""" 25 | 26 | def add_epoch_metric(self, name: str, value: float, step: int): 27 | """Implements logging an epoch-level metric.""" 28 | 29 | def add_epoch_confusion_matrix( 30 | self, y_true: list[np.array], y_pred: list[np.array], step: int 31 | ): 32 | """Implements logging a confusion matrix at the epoch level.""" 33 | 34 | 35 | class TensorboardExperiment: 36 | def __init__(self, log_path: str, create: bool = True): 37 | self.stage = Stage.TRAIN 38 | self._writer = SummaryWriter( 39 | log_dir=create_experiment_log_dir(log_path, parents=True) 40 | ) 41 | plt.ioff() 42 | 43 | def set_stage(self, stage: Stage): 44 | self.stage = stage 45 | 46 | def flush(self): 47 | self._writer.flush() 48 | 49 | @staticmethod 50 | def _validate_log_dir(log_dir: str, create: bool = True): 51 | log_path = Path(log_dir).resolve() 52 | if log_path.exists(): 53 | return 54 | elif not log_path.exists() and create: 55 | log_path.mkdir(parents=True) 56 | else: 57 | raise NotADirectoryError(f"log_dir {log_dir} does not exist.") 58 | 59 | def add_batch_metric(self, name: str, value: float, step: int): 60 | tag = f"{self.stage.name}/batch/{name}" 61 | self._writer.add_scalar(tag, value, step) 62 | 63 | def add_epoch_metric(self, name: str, value: float, step: int): 64 | tag = f"{self.stage.name}/epoch/{name}" 65 | self._writer.add_scalar(tag, value, step) 66 | 67 | def add_epoch_confusion_matrix( 68 | self, y_true: list[np.array], y_pred: list[np.array], step: int 69 | ): 70 | y_true, y_pred = self.collapse_batches(y_true, y_pred) 71 | fig = self.create_confusion_matrix(y_true, y_pred, step) 72 | tag = f"{self.stage.name}/epoch/confusion_matrix" 73 | self._writer.add_figure(tag, fig, step) 74 | 75 | @staticmethod 76 | def collapse_batches( 77 | y_true: list[np.array], y_pred: list[np.array] 78 | ) -> tuple[np.ndarray, np.ndarray]: 79 | return np.concatenate(y_true), np.concatenate(y_pred) 80 | 81 | def create_confusion_matrix( 82 | self, y_true: list[np.array], y_pred: list[np.array], step: int 83 | ) -> plt.Figure: 84 | cm = ConfusionMatrixDisplay(confusion_matrix(y_true, y_pred)).plot(cmap="Blues") 85 | cm.ax_.set_title(f"{self.stage.name} Epoch: {step}") 86 | return cm.figure_ 87 | -------------------------------------------------------------------------------- /before/ds/utils.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | 3 | 4 | def create_experiment_log_dir(root: str, parents: bool = True) -> str: 5 | root_path = pathlib.Path(root).resolve() 6 | child = ( 7 | create_from_missing(root_path) 8 | if not root_path.exists() 9 | else create_from_existing(root_path) 10 | ) 11 | child.mkdir(parents=parents) 12 | return child.as_posix() 13 | 14 | 15 | def create_from_missing(root: pathlib.Path) -> pathlib.Path: 16 | return root / "0" 17 | 18 | 19 | def create_from_existing(root: pathlib.Path) -> pathlib.Path: 20 | children = [ 21 | int(c.name) for c in root.glob("*") 22 | if (c.is_dir() and c.name.isnumeric()) 23 | ] 24 | if is_first_experiment(children): 25 | child = create_from_missing(root) 26 | else: 27 | child = root / increment_experiment_number(children) 28 | return child 29 | 30 | 31 | def is_first_experiment(children: list[int]) -> bool: 32 | return 
len(children) == 0 33 | 34 | 35 | def increment_experiment_number(children: list[int]) -> str: 36 | return str(max(children) + 1) 37 | -------------------------------------------------------------------------------- /before/main.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | 3 | import torch 4 | 5 | from ds.dataset import create_dataloader 6 | from ds.models import LinearNet 7 | from ds.runner import Runner, run_epoch 8 | from ds.tracking import TensorboardExperiment 9 | 10 | # Hyperparameters 11 | EPOCH_COUNT = 20 12 | LR = 5e-5 13 | BATCH_SIZE = 128 14 | LOG_PATH = "./runs" 15 | 16 | # Data configuration 17 | DATA_DIR = "../data/raw" 18 | TEST_DATA = pathlib.Path(f"{DATA_DIR}/t10k-images-idx3-ubyte.gz") 19 | TEST_LABELS = pathlib.Path(f"{DATA_DIR}/t10k-labels-idx1-ubyte.gz") 20 | TRAIN_DATA = pathlib.Path(f"{DATA_DIR}/train-images-idx3-ubyte.gz") 21 | TRAIN_LABELS = pathlib.Path(f"{DATA_DIR}/train-labels-idx1-ubyte.gz") 22 | 23 | 24 | def main(): 25 | 26 | # Model and Optimizer 27 | model = LinearNet() 28 | optimizer = torch.optim.Adam(model.parameters(), lr=LR) 29 | 30 | # Create the data loaders 31 | test_loader = create_dataloader(BATCH_SIZE, TEST_DATA, TEST_LABELS) 32 | train_loader = create_dataloader(BATCH_SIZE, TRAIN_DATA, TRAIN_LABELS) 33 | 34 | # Create the runners 35 | test_runner = Runner(test_loader, model) 36 | train_runner = Runner(train_loader, model, optimizer) 37 | 38 | # Set up the experiment tracker 39 | tracker = TensorboardExperiment(log_path=LOG_PATH) 40 | 41 | # Run the epochs 42 | for epoch_id in range(EPOCH_COUNT): 43 | run_epoch(test_runner, train_runner, tracker, epoch_id) 44 | 45 | # Compute Average Epoch Metrics 46 | summary = ", ".join( 47 | [ 48 | f"[Epoch: {epoch_id + 1}/{EPOCH_COUNT}]", 49 | f"Test Accuracy: {test_runner.avg_accuracy: 0.4f}", 50 | f"Train Accuracy: {train_runner.avg_accuracy: 0.4f}", 51 | ] 52 | ) 53 | print("\n" + summary + "\n") 54 | 55 | # Reset the runners 56 | train_runner.reset() 57 | test_runner.reset() 58 | 59 | # Flush the tracker after every epoch for live updates 60 | tracker.flush() 61 | 62 | 63 | if __name__ == "__main__": 64 | main() 65 | -------------------------------------------------------------------------------- /before/parse_raw_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import pathlib 4 | from typing import Iterable, Tuple 5 | 6 | import numpy as np 7 | from PIL import Image 8 | from tqdm import tqdm 9 | 10 | from ds.load_data import load_image_data, load_label_data 11 | 12 | RAW_DATA = "./data/raw" 13 | TEST_DATA_RAW = pathlib.Path(f"{RAW_DATA}/t10k-images-idx3-ubyte.gz") 14 | TEST_LABELS_RAW = pathlib.Path(f"{RAW_DATA}/t10k-labels-idx1-ubyte.gz") 15 | TRAIN_DATA_RAW = pathlib.Path(f"{RAW_DATA}/train-images-idx3-ubyte.gz") 16 | TRAIN_LABELS_RAW = pathlib.Path(f"{RAW_DATA}/train-labels-idx1-ubyte.gz") 17 | 18 | PROCESSED_DATA = './data/processed' 19 | TEST_DIR_PROCESSED = pathlib.Path(f"{PROCESSED_DATA}/test") 20 | TRAIN_DIR_PROCESSED = pathlib.Path(f"{PROCESSED_DATA}/train") 21 | 22 | 23 | def main(): 24 | make_tree(TRAIN_DIR_PROCESSED, reset=True) 25 | make_tree(TEST_DIR_PROCESSED, reset=True) 26 | 27 | save_dataset_to_png( 28 | TRAIN_DIR_PROCESSED, 29 | zip(load_image_data(TRAIN_DATA_RAW), load_label_data(TRAIN_LABELS_RAW)) 30 | ) 31 | save_dataset_to_png( 32 | TEST_DIR_PROCESSED, 33 | zip(load_image_data(TEST_DATA_RAW), load_label_data(TEST_LABELS_RAW)) 34 | ) 35 | 36 | 37 
| def make_tree(root: pathlib.Path, reset: bool = False) -> None: 38 | if reset: 39 | reset_tree(root) 40 | for child in range(10): 41 | child = pathlib.Path(str(child)) 42 | if not (root / child).exists(): 43 | os.makedirs(root / child) 44 | 45 | 46 | def reset_tree(root: pathlib.Path) -> None: 47 | print('Resetting tree.') 48 | shutil.rmtree(root, ignore_errors=True) 49 | 50 | 51 | def save_dataset_to_png( 52 | root: pathlib.Path, array: Iterable[Tuple[np.ndarray, np.ndarray]] 53 | ) -> None: 54 | for i, xy in enumerate(tqdm(tuple(array), ncols=80)): 55 | save_xy_to_png(root, xy, str(i)) 56 | 57 | 58 | def save_xy_to_png( 59 | root: pathlib.Path, xy: Tuple[np.ndarray, np.ndarray], name: str 60 | ) -> None: 61 | x, y = xy 62 | Image.fromarray(x).save(root / str(int(y)) / f'{name}.png') 63 | 64 | 65 | if __name__ == '__main__': 66 | main() 67 | -------------------------------------------------------------------------------- /before/requirements.txt: -------------------------------------------------------------------------------- 1 | # Python: 3.9.6 2 | torch 3 | numpy 4 | pandas 5 | scikit-learn 6 | matplotlib 7 | tensorboard 8 | mlflow 9 | tqdm 10 | pillow -------------------------------------------------------------------------------- /data/README.md: -------------------------------------------------------------------------------- 1 | The `processed` folder will contain PNG images of the MNIST dataset after running [parse_raw_data.py](../parse_raw_data.py). The images are not directly committed to GitHub. Rather, the script should be executed to create them. 2 | 3 | --- 4 | 5 | The `raw` folder contains the raw MNIST data files, which can be downloaded [here](http://yann.lecun.com/exdb/mnist/). Below is some information about the file format of the MNIST dataset needed to create the processed images. 6 | 7 | --- 8 | 9 | header\ 10 | size in dimension 0\ 11 | size in dimension 1\ 12 | size in dimension 2\ 13 | .....\ 14 | size in dimension N\ 15 | data 16 | 17 | The magic number is an integer (MSB first). The first 2 bytes are always 0. 18 | 19 | The third byte codes the type of the data:\ 20 | 0x08: unsigned byte\ 21 | 0x09: signed byte\ 22 | 0x0B: short (2 bytes)\ 23 | 0x0C: int (4 bytes)\ 24 | 0x0D: float (4 bytes)\ 25 | 0x0E: double (8 bytes) 26 | 27 | The 4-th byte codes the number of dimensions of the vector/matrix: 1 for vectors, 2 for matrices.... 28 | 29 | The sizes in each dimension are 4-byte integers (MSB first, high endian, like in most non-Intel processors). 30 | 31 | The data is stored like in a C array, i.e. the index in the last dimension changes the fastest. 32 | 33 | TEST SET LABEL FILE (t10k-labels-idx1-ubyte):\ 34 | [offset] [type] [value] [description]\ 35 | 0000 32 bit integer 0x00000801(2049) magic number (MSB first)\ 36 | 0004 32 bit integer 10000 number of items\ 37 | 0008 unsigned byte ?? label\ 38 | 0009 unsigned byte ?? label\ 39 | ........\ 40 | xxxx unsigned byte ?? label\ 41 | The label values are 0 to 9. 
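As a cross-check of the header layout above, here is a minimal Python sketch (independent of `ds/load_data.py`, which does the full parse) that decodes the magic number and dimension sizes; the file path assumes it is run from this `data` folder:

```python
import gzip
import struct

# Decode the IDX header described above: 2 zero bytes, a type code,
# a dimension count, then one 4-byte MSB-first size per dimension.
with gzip.open("raw/train-images-idx3-ubyte.gz", "rb") as fp:
    zeros, type_code, ndim = struct.unpack(">HBB", fp.read(4))
    assert zeros == 0 and type_code == 0x08  # unsigned byte data
    dims = struct.unpack(f">{ndim}I", fp.read(4 * ndim))
    print(dims)  # expected: (60000, 28, 28) for the training images
```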
42 | -------------------------------------------------------------------------------- /data/raw/t10k-images-idx3-ubyte.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArjanCodes/2021-config/7c2c3babb0fb66d69eac81590356fae512c5e784/data/raw/t10k-images-idx3-ubyte.gz -------------------------------------------------------------------------------- /data/raw/t10k-labels-idx1-ubyte.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArjanCodes/2021-config/7c2c3babb0fb66d69eac81590356fae512c5e784/data/raw/t10k-labels-idx1-ubyte.gz -------------------------------------------------------------------------------- /data/raw/train-images-idx3-ubyte.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArjanCodes/2021-config/7c2c3babb0fb66d69eac81590356fae512c5e784/data/raw/train-images-idx3-ubyte.gz -------------------------------------------------------------------------------- /data/raw/train-labels-idx1-ubyte.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArjanCodes/2021-config/7c2c3babb0fb66d69eac81590356fae512c5e784/data/raw/train-labels-idx1-ubyte.gz -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # Python: 3.9.9 2 | torch 3 | numpy 4 | pandas 5 | scikit-learn 6 | matplotlib 7 | tensorboard 8 | mlflow 9 | tqdm 10 | pillow 11 | hydra-core --------------------------------------------------------------------------------