├── LICENSE
├── README.md
├── after
│   ├── conf
│   │   ├── config.yaml
│   │   └── files
│   │       └── mnist.yaml
│   ├── config.py
│   ├── ds
│   │   ├── __init__.py
│   │   ├── dataset.py
│   │   ├── load_data.py
│   │   ├── metrics.py
│   │   ├── models.py
│   │   ├── runner.py
│   │   ├── tensorboard.py
│   │   ├── tracking.py
│   │   └── utils.py
│   ├── main.py
│   └── parse_raw_data.py
├── before
│   ├── ds
│   │   ├── __init__.py
│   │   ├── dataset.py
│   │   ├── load_data.py
│   │   ├── metrics.py
│   │   ├── models.py
│   │   ├── runner.py
│   │   ├── tensorboard.py
│   │   ├── tracking.py
│   │   └── utils.py
│   ├── main.py
│   ├── parse_raw_data.py
│   └── requirements.txt
├── data
│   ├── README.md
│   └── raw
│       ├── t10k-images-idx3-ubyte.gz
│       ├── t10k-labels-idx1-ubyte.gz
│       ├── train-images-idx3-ubyte.gz
│       └── train-labels-idx1-ubyte.gz
└── requirements.txt
/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 ArjanCodes and Mark Todisco 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Configuration Management For Data Science Made Easy With Hydra 2 | 3 | This repository contains the example code for the video on Hydra and configuration management. Here's the link to the video: https://youtu.be/tEsPyYnzt8s. 
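As a quick reference (not part of the original code), Hydra lets you override any value from `after/conf/config.yaml` on the command line. A minimal sketch, assuming you run from the `after` folder with the requirements installed:

```bash
# Run with the defaults from conf/config.yaml
python main.py

# Override individual hyperparameters for a single run
python main.py params.lr=1e-4 params.batch_size=64

# Sweep over several values with Hydra's multirun mode
python main.py --multirun params.lr=1e-3,1e-5
```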
4 | -------------------------------------------------------------------------------- /after/conf/config.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - files: mnist 3 | - _self_ 4 | paths: 5 | log: ./runs 6 | data: ${hydra:runtime.cwd}/../data/raw 7 | params: 8 | epoch_count: 20 9 | lr: 5e-5 10 | batch_size: 128 11 | -------------------------------------------------------------------------------- /after/conf/files/mnist.yaml: -------------------------------------------------------------------------------- 1 | test_data: t10k-images-idx3-ubyte.gz 2 | test_labels: t10k-labels-idx1-ubyte.gz 3 | train_data: train-images-idx3-ubyte.gz 4 | train_labels: train-labels-idx1-ubyte.gz 5 | -------------------------------------------------------------------------------- /after/config.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | 4 | @dataclass 5 | class Paths: 6 | log: str 7 | data: str 8 | 9 | 10 | @dataclass 11 | class Files: 12 | train_data: str 13 | train_labels: str 14 | test_data: str 15 | test_labels: str 16 | 17 | 18 | @dataclass 19 | class Params: 20 | epoch_count: int 21 | lr: float 22 | batch_size: int 23 | 24 | 25 | @dataclass 26 | class MNISTConfig: 27 | paths: Paths 28 | files: Files 29 | params: Params 30 | -------------------------------------------------------------------------------- /after/ds/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArjanCodes/2021-config/7c2c3babb0fb66d69eac81590356fae512c5e784/after/ds/__init__.py -------------------------------------------------------------------------------- /after/ds/dataset.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Any 3 | 4 | import numpy as np 5 | import torch 6 | from torch.utils.data import DataLoader, Dataset 7 | 8 | from ds.load_data import load_image_data, load_label_data 9 | 10 | 11 | class MNIST(Dataset[Any]): 12 | idx: int # requested data index 13 | x: torch.Tensor 14 | y: torch.Tensor 15 | 16 | TRAIN_MAX = 255.0 17 | TRAIN_NORMALIZED_MEAN = 0.1306604762738429 18 | TRAIN_NORMALIZED_STDEV = 0.3081078038564622 19 | 20 | def __init__(self, data: np.ndarray, targets: np.ndarray): 21 | if len(data) != len(targets): 22 | raise ValueError( 23 | "data and targets must be the same length. 
" 24 | f"{len(data)} != {len(targets)}" 25 | ) 26 | 27 | self.data = data 28 | self.targets = targets 29 | 30 | def __len__(self): 31 | return len(self.data) 32 | 33 | def __getitem__(self, idx: int) -> tuple[torch.Tensor, torch.Tensor]: 34 | x = self.get_x(idx) 35 | y = self.get_y(idx) 36 | return x, y 37 | 38 | def get_x(self, idx: int): 39 | self.idx = idx 40 | self.preprocess_x() 41 | return self.x 42 | 43 | def preprocess_x(self): 44 | self.x = self.data[self.idx].copy().astype(np.float64) 45 | self.x /= self.TRAIN_MAX 46 | self.x -= self.TRAIN_NORMALIZED_MEAN 47 | self.x /= self.TRAIN_NORMALIZED_STDEV 48 | self.x = self.x.astype(np.float32) 49 | self.x = torch.from_numpy(self.x) 50 | self.x = self.x.unsqueeze(0) 51 | 52 | def get_y(self, idx: int): 53 | self.idx = idx 54 | self.preprocess_y() 55 | return self.y 56 | 57 | def preprocess_y(self): 58 | self.y = self.targets[self.idx] 59 | self.y = torch.tensor(self.y, dtype=torch.long) 60 | 61 | 62 | def create_dataloader( 63 | batch_size: int, 64 | root_path: str, 65 | data_file: str, 66 | label_file: str, 67 | shuffle: bool = True, 68 | ) -> DataLoader[Any]: 69 | data_path = Path(f"{root_path}/{data_file}") 70 | label_path = Path(f"{root_path}/{label_file}") 71 | data = load_image_data(data_path) 72 | label_data = load_label_data(label_path) 73 | return DataLoader( 74 | dataset=MNIST(data, label_data), 75 | batch_size=batch_size, 76 | shuffle=shuffle, 77 | num_workers=0, 78 | ) 79 | -------------------------------------------------------------------------------- /after/ds/load_data.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import struct 3 | from pathlib import Path 4 | 5 | import numpy as np 6 | 7 | ALLOWED_TYPES = { 8 | "UNSIGNED_BYTE": b"\x08", 9 | "SIGNED_BYTE": b"\x09", 10 | "SHORT": b"\x0B", 11 | "INT": b"\x0C", 12 | "SINGLE": b"\x0D", 13 | "DOUBLE": b"\x0E", 14 | } 15 | 16 | 17 | def load_image_data(file_path: Path) -> np.ndarray: 18 | with gzip.open(file_path, "rb") as fp: 19 | _ = struct.unpack(">H", fp.read(2)) # dump padding bytes 20 | 21 | (data_type,) = struct.unpack(">c", fp.read(1)) 22 | assert data_type == ALLOWED_TYPES["UNSIGNED_BYTE"] 23 | 24 | number_of_dimensions = ord(struct.unpack(">c", fp.read(1))[0]) 25 | assert number_of_dimensions == 3 26 | 27 | (num_images,) = struct.unpack(">I", fp.read(4)) 28 | (num_rows,) = struct.unpack(">I", fp.read(4)) 29 | (num_cols,) = struct.unpack(">I", fp.read(4)) 30 | 31 | raw = fp.read() 32 | assert len(raw) == num_images * num_rows * num_cols 33 | 34 | data: np.ndarray = np.frombuffer(raw, dtype=np.dtype(np.uint8).newbyteorder(">")) 35 | data = data.reshape((num_images, num_rows, num_cols)) 36 | return data 37 | 38 | 39 | def load_label_data(file_path: Path) -> np.ndarray: 40 | with gzip.open(file_path, "rb") as fp: 41 | _ = struct.unpack(">H", fp.read(2)) # dump padding bytes 42 | 43 | (data_type,) = struct.unpack(">c", fp.read(1)) 44 | assert data_type == ALLOWED_TYPES["UNSIGNED_BYTE"] 45 | 46 | number_of_dimensions = ord(struct.unpack(">c", fp.read(1))[0]) 47 | assert number_of_dimensions == 1 48 | 49 | (num_images,) = struct.unpack(">I", fp.read(4)) 50 | 51 | raw = fp.read() 52 | assert len(raw) == num_images 53 | 54 | data = np.frombuffer(raw, dtype=np.dtype(np.uint8).newbyteorder(">")) 55 | return data 56 | -------------------------------------------------------------------------------- /after/ds/metrics.py: -------------------------------------------------------------------------------- 1 | from dataclasses 
import dataclass, field 2 | 3 | 4 | @dataclass 5 | class Metric: 6 | values: list[float] = field(default_factory=list) 7 | running_total: float = 0.0 8 | num_updates: float = 0.0 9 | average: float = 0.0 10 | 11 | def update(self, value: float, batch_size: int): 12 | self.values.append(value) 13 | self.running_total += value * batch_size 14 | self.num_updates += batch_size 15 | self.average = self.running_total / self.num_updates 16 | -------------------------------------------------------------------------------- /after/ds/models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class LinearNet(torch.nn.Module): 5 | def __init__(self): 6 | super().__init__() 7 | 8 | self.network = torch.nn.Sequential( 9 | torch.nn.Flatten(), 10 | torch.nn.Linear(in_features=28 * 28, out_features=32), 11 | torch.nn.ReLU(), 12 | torch.nn.Linear(in_features=32, out_features=10), 13 | torch.nn.Softmax(dim=1), 14 | ) 15 | 16 | def forward(self, x: torch.Tensor) -> torch.Tensor: 17 | return self.network(x) 18 | -------------------------------------------------------------------------------- /after/ds/runner.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Optional 2 | 3 | import numpy as np 4 | import torch 5 | from sklearn.metrics import accuracy_score 6 | from torch.utils.data.dataloader import DataLoader 7 | from tqdm import tqdm 8 | 9 | from ds.metrics import Metric 10 | from ds.tracking import ExperimentTracker, Stage 11 | 12 | 13 | class Runner: 14 | def __init__( 15 | self, 16 | loader: DataLoader[Any], 17 | model: torch.nn.Module, 18 | optimizer: Optional[torch.optim.Optimizer] = None, 19 | ) -> None: 20 | self.run_count = 0 21 | self.loader = loader 22 | self.accuracy_metric = Metric() 23 | self.model = model 24 | self.optimizer = optimizer 25 | # Objective (loss) function 26 | self.compute_loss = torch.nn.CrossEntropyLoss(reduction="mean") 27 | self.y_true_batches: list[list[Any]] = [] 28 | self.y_pred_batches: list[list[Any]] = [] 29 | # Assume Stage based on presence of optimizer 30 | self.stage = Stage.VAL if optimizer is None else Stage.TRAIN 31 | 32 | @property 33 | def avg_accuracy(self): 34 | return self.accuracy_metric.average 35 | 36 | def run(self, desc: str, experiment: ExperimentTracker): 37 | self.model.train(self.stage is Stage.TRAIN) 38 | 39 | for x, y in tqdm(self.loader, desc=desc, ncols=80): 40 | loss, batch_accuracy = self._run_single(x, y) 41 | 42 | experiment.add_batch_metric("accuracy", batch_accuracy, self.run_count) 43 | 44 | if self.optimizer: 45 | # Reverse-mode AutoDiff (backpropagation) 46 | self.optimizer.zero_grad() 47 | loss.backward() 48 | self.optimizer.step() 49 | 50 | def _run_single(self, x: Any, y: Any): 51 | self.run_count += 1 52 | batch_size: int = x.shape[0] 53 | prediction = self.model(x) 54 | loss = self.compute_loss(prediction, y) 55 | 56 | # Compute Batch Validation Metrics 57 | y_np = y.detach().numpy() 58 | y_prediction_np = np.argmax(prediction.detach().numpy(), axis=1) 59 | batch_accuracy: float = accuracy_score(y_np, y_prediction_np) 60 | self.accuracy_metric.update(batch_accuracy, batch_size) 61 | 62 | self.y_true_batches += [y_np] 63 | self.y_pred_batches += [y_prediction_np] 64 | return loss, batch_accuracy 65 | 66 | def reset(self): 67 | self.accuracy_metric = Metric() 68 | self.y_true_batches = [] 69 | self.y_pred_batches = [] 70 | 71 | 72 | def run_epoch( 73 | test_runner: Runner, 74 | train_runner: Runner, 75 | 
experiment: ExperimentTracker, 76 | epoch_id: int, 77 | ): 78 | # Training Loop 79 | experiment.set_stage(Stage.TRAIN) 80 | train_runner.run("Train Batches", experiment) 81 | 82 | # Log Training Epoch Metrics 83 | experiment.add_epoch_metric("accuracy", train_runner.avg_accuracy, epoch_id) 84 | 85 | # Testing Loop 86 | experiment.set_stage(Stage.VAL) 87 | test_runner.run("Validation Batches", experiment) 88 | 89 | # Log Validation Epoch Metrics 90 | experiment.add_epoch_metric("accuracy", test_runner.avg_accuracy, epoch_id) 91 | experiment.add_epoch_confusion_matrix( 92 | test_runner.y_true_batches, test_runner.y_pred_batches, epoch_id 93 | ) 94 | -------------------------------------------------------------------------------- /after/ds/tensorboard.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import numpy as np 4 | from matplotlib import pyplot as plt 5 | from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix 6 | from torch.utils.tensorboard import SummaryWriter 7 | 8 | from ds.tracking import Stage 9 | from ds.utils import create_experiment_log_dir 10 | 11 | 12 | class TensorboardExperiment: 13 | def __init__(self, log_path: str, create: bool = True): 14 | 15 | log_dir = create_experiment_log_dir(root=log_path) 16 | self.stage = Stage.TRAIN 17 | self._validate_log_dir(log_dir, create=create) 18 | self._writer = SummaryWriter(log_dir=log_dir) 19 | plt.ioff() 20 | 21 | def set_stage(self, stage: Stage): 22 | self.stage = stage 23 | 24 | def flush(self): 25 | self._writer.flush() 26 | 27 | @staticmethod 28 | def _validate_log_dir(log_dir: str, create: bool = True): 29 | log_path = Path(log_dir).resolve() 30 | if log_path.exists(): 31 | return 32 | elif not log_path.exists() and create: 33 | log_path.mkdir(parents=True) 34 | else: 35 | raise NotADirectoryError(f"log_dir {log_dir} does not exist.") 36 | 37 | def add_batch_metric(self, name: str, value: float, step: int): 38 | tag = f"{self.stage.name}/batch/{name}" 39 | self._writer.add_scalar(tag, value, step) 40 | 41 | def add_epoch_metric(self, name: str, value: float, step: int): 42 | tag = f"{self.stage.name}/epoch/{name}" 43 | self._writer.add_scalar(tag, value, step) 44 | 45 | def add_epoch_confusion_matrix( 46 | self, y_true: list[np.array], y_pred: list[np.array], step: int 47 | ): 48 | y_true, y_pred = self.collapse_batches(y_true, y_pred) 49 | fig = self.create_confusion_matrix(y_true, y_pred, step) 50 | tag = f"{self.stage.name}/epoch/confusion_matrix" 51 | self._writer.add_figure(tag, fig, step) 52 | 53 | @staticmethod 54 | def collapse_batches( 55 | y_true: list[np.array], y_pred: list[np.array] 56 | ) -> tuple[np.ndarray, np.ndarray]: 57 | return np.concatenate(y_true), np.concatenate(y_pred) 58 | 59 | def create_confusion_matrix( 60 | self, y_true: list[np.array], y_pred: list[np.array], step: int 61 | ) -> plt.Figure: 62 | cm = ConfusionMatrixDisplay(confusion_matrix(y_true, y_pred)).plot(cmap="Blues") 63 | cm.ax_.set_title(f"{self.stage.name} Epoch: {step}") 64 | return cm.figure_ 65 | -------------------------------------------------------------------------------- /after/ds/tracking.py: -------------------------------------------------------------------------------- 1 | from enum import Enum, auto 2 | from pathlib import Path 3 | from typing import Protocol 4 | 5 | import numpy as np 6 | from matplotlib import pyplot as plt 7 | from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix 8 | from torch.utils.tensorboard import 
SummaryWriter 9 | 10 | from ds.utils import create_experiment_log_dir 11 | 12 | 13 | class Stage(Enum): 14 | TRAIN = auto() 15 | TEST = auto() 16 | VAL = auto() 17 | 18 | 19 | class ExperimentTracker(Protocol): 20 | def set_stage(self, stage: Stage): 21 | """Sets the current stage of the experiment.""" 22 | 23 | def add_batch_metric(self, name: str, value: float, step: int): 24 | """Implements logging a batch-level metric.""" 25 | 26 | def add_epoch_metric(self, name: str, value: float, step: int): 27 | """Implements logging an epoch-level metric.""" 28 | 29 | def add_epoch_confusion_matrix( 30 | self, y_true: list[np.array], y_pred: list[np.array], step: int 31 | ): 32 | """Implements logging a confusion matrix at the epoch level.""" 33 | 34 | 35 | class TensorboardExperiment: 36 | def __init__(self, log_path: str, create: bool = True): 37 | self.stage = Stage.TRAIN 38 | self._writer = SummaryWriter( 39 | log_dir=create_experiment_log_dir(log_path, parents=True) 40 | ) 41 | plt.ioff() 42 | 43 | def set_stage(self, stage: Stage): 44 | self.stage = stage 45 | 46 | def flush(self): 47 | self._writer.flush() 48 | 49 | @staticmethod 50 | def _validate_log_dir(log_dir: str, create: bool = True): 51 | log_path = Path(log_dir).resolve() 52 | if log_path.exists(): 53 | return 54 | elif not log_path.exists() and create: 55 | log_path.mkdir(parents=True) 56 | else: 57 | raise NotADirectoryError(f"log_dir {log_dir} does not exist.") 58 | 59 | def add_batch_metric(self, name: str, value: float, step: int): 60 | tag = f"{self.stage.name}/batch/{name}" 61 | self._writer.add_scalar(tag, value, step) 62 | 63 | def add_epoch_metric(self, name: str, value: float, step: int): 64 | tag = f"{self.stage.name}/epoch/{name}" 65 | self._writer.add_scalar(tag, value, step) 66 | 67 | def add_epoch_confusion_matrix( 68 | self, y_true: list[np.array], y_pred: list[np.array], step: int 69 | ): 70 | y_true, y_pred = self.collapse_batches(y_true, y_pred) 71 | fig = self.create_confusion_matrix(y_true, y_pred, step) 72 | tag = f"{self.stage.name}/epoch/confusion_matrix" 73 | self._writer.add_figure(tag, fig, step) 74 | 75 | @staticmethod 76 | def collapse_batches( 77 | y_true: list[np.array], y_pred: list[np.array] 78 | ) -> tuple[np.ndarray, np.ndarray]: 79 | return np.concatenate(y_true), np.concatenate(y_pred) 80 | 81 | def create_confusion_matrix( 82 | self, y_true: list[np.array], y_pred: list[np.array], step: int 83 | ) -> plt.Figure: 84 | cm = ConfusionMatrixDisplay(confusion_matrix(y_true, y_pred)).plot(cmap="Blues") 85 | cm.ax_.set_title(f"{self.stage.name} Epoch: {step}") 86 | return cm.figure_ 87 | -------------------------------------------------------------------------------- /after/ds/utils.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | 3 | 4 | def create_experiment_log_dir(root: str, parents: bool = True) -> str: 5 | root_path = pathlib.Path(root).resolve() 6 | child = ( 7 | create_from_missing(root_path) 8 | if not root_path.exists() 9 | else create_from_existing(root_path) 10 | ) 11 | child.mkdir(parents=parents) 12 | return child.as_posix() 13 | 14 | 15 | def create_from_missing(root: pathlib.Path) -> pathlib.Path: 16 | return root / "0" 17 | 18 | 19 | def create_from_existing(root: pathlib.Path) -> pathlib.Path: 20 | children = [ 21 | int(c.name) for c in root.glob("*") 22 | if (c.is_dir() and c.name.isnumeric()) 23 | ] 24 | if is_first_experiment(children): 25 | child = create_from_missing(root) 26 | else: 27 | child = root / 
increment_experiment_number(children) 28 | return child 29 | 30 | 31 | def is_first_experiment(children: list[int]) -> bool: 32 | return len(children) == 0 33 | 34 | 35 | def increment_experiment_number(children: list[int]) -> str: 36 | return str(max(children) + 1) 37 | -------------------------------------------------------------------------------- /after/main.py: -------------------------------------------------------------------------------- 1 | import hydra 2 | import torch 3 | from hydra.core.config_store import ConfigStore 4 | from omegaconf import OmegaConf 5 | 6 | from config import MNISTConfig 7 | from ds.dataset import create_dataloader 8 | from ds.models import LinearNet 9 | from ds.runner import Runner, run_epoch 10 | from ds.tracking import TensorboardExperiment 11 | 12 | cs = ConfigStore.instance() 13 | cs.store(name="mnist_config", node=MNISTConfig) 14 | 15 | 16 | @hydra.main(config_path="conf", config_name="config") 17 | def main(cfg: MNISTConfig) -> None: 18 | print(OmegaConf.to_yaml(cfg)) 19 | 20 | # Model and Optimizer 21 | model = LinearNet() 22 | optimizer = torch.optim.Adam(model.parameters(), lr=cfg.params.lr) 23 | 24 | # Create the data loaders 25 | 26 | test_loader = create_dataloader( 27 | batch_size=cfg.params.batch_size, 28 | root_path=cfg.paths.data, 29 | data_file=cfg.files.test_data, 30 | label_file=cfg.files.test_labels, 31 | ) 32 | train_loader = create_dataloader( 33 | batch_size=cfg.params.batch_size, 34 | root_path=cfg.paths.data, 35 | data_file=cfg.files.train_data, 36 | label_file=cfg.files.train_labels, 37 | ) 38 | 39 | # Create the runners 40 | test_runner = Runner(test_loader, model) 41 | train_runner = Runner(train_loader, model, optimizer) 42 | 43 | # Set up the experiment tracker 44 | tracker = TensorboardExperiment(log_path=cfg.paths.log) 45 | 46 | # Run the epochs 47 | for epoch_id in range(cfg.params.epoch_count): 48 | run_epoch(test_runner, train_runner, tracker, epoch_id) 49 | 50 | # Compute Average Epoch Metrics 51 | summary = ", ".join( 52 | [ 53 | f"[Epoch: {epoch_id + 1}/{cfg.params.epoch_count}]", 54 | f"Test Accuracy: {test_runner.avg_accuracy: 0.4f}", 55 | f"Train Accuracy: {train_runner.avg_accuracy: 0.4f}", 56 | ] 57 | ) 58 | print("\n" + summary + "\n") 59 | 60 | # Reset the runners 61 | train_runner.reset() 62 | test_runner.reset() 63 | 64 | # Flush the tracker after every epoch for live updates 65 | tracker.flush() 66 | 67 | 68 | if __name__ == "__main__": 69 | main() 70 | -------------------------------------------------------------------------------- /after/parse_raw_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import pathlib 4 | from typing import Iterable, Tuple 5 | 6 | import numpy as np 7 | from PIL import Image 8 | from tqdm import tqdm 9 | 10 | from ds.load_data import load_image_data, load_label_data 11 | 12 | RAW_DATA = "./data/raw" 13 | TEST_DATA_RAW = pathlib.Path(f"{RAW_DATA}/t10k-images-idx3-ubyte.gz") 14 | TEST_LABELS_RAW = pathlib.Path(f"{RAW_DATA}/t10k-labels-idx1-ubyte.gz") 15 | TRAIN_DATA_RAW = pathlib.Path(f"{RAW_DATA}/train-images-idx3-ubyte.gz") 16 | TRAIN_LABELS_RAW = pathlib.Path(f"{RAW_DATA}/train-labels-idx1-ubyte.gz") 17 | 18 | PROCESSED_DATA = './data/processed' 19 | TEST_DIR_PROCESSED = pathlib.Path(f"{PROCESSED_DATA}/test") 20 | TRAIN_DIR_PROCESSED = pathlib.Path(f"{PROCESSED_DATA}/train") 21 | 22 | 23 | def main(): 24 | make_tree(TRAIN_DIR_PROCESSED, reset=True) 25 | make_tree(TEST_DIR_PROCESSED, reset=True) 26 | 27 | 
save_dataset_to_png( 28 | TRAIN_DIR_PROCESSED, 29 | zip(load_image_data(TRAIN_DATA_RAW), load_label_data(TRAIN_LABELS_RAW)) 30 | ) 31 | save_dataset_to_png( 32 | TEST_DIR_PROCESSED, 33 | zip(load_image_data(TEST_DATA_RAW), load_label_data(TEST_LABELS_RAW)) 34 | ) 35 | 36 | 37 | def make_tree(root: pathlib.Path, reset: bool = False) -> None: 38 | if reset: 39 | reset_tree(root) 40 | for child in range(10): 41 | child = pathlib.Path(str(child)) 42 | if not (root / child).exists(): 43 | os.makedirs(root / child) 44 | 45 | 46 | def reset_tree(root: pathlib.Path) -> None: 47 | print('Resetting tree.') 48 | shutil.rmtree(root, ignore_errors=True) 49 | 50 | 51 | def save_dataset_to_png( 52 | root: pathlib.Path, array: Iterable[Tuple[np.ndarray, np.ndarray]] 53 | ) -> None: 54 | for i, xy in enumerate(tqdm(tuple(array), ncols=80)): 55 | save_xy_to_png(root, xy, str(i)) 56 | 57 | 58 | def save_xy_to_png( 59 | root: pathlib.Path, xy: Tuple[np.ndarray, np.ndarray], name: str 60 | ) -> None: 61 | x, y = xy 62 | Image.fromarray(x).save(root / str(int(y)) / f'{name}.png') 63 | 64 | 65 | if __name__ == '__main__': 66 | main() 67 | -------------------------------------------------------------------------------- /before/ds/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArjanCodes/2021-config/7c2c3babb0fb66d69eac81590356fae512c5e784/before/ds/__init__.py -------------------------------------------------------------------------------- /before/ds/dataset.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Any 3 | 4 | import numpy as np 5 | import torch 6 | from torch.utils.data import DataLoader, Dataset 7 | 8 | from ds.load_data import load_image_data, load_label_data 9 | 10 | 11 | class MNIST(Dataset[Any]): 12 | idx: int # requested data index 13 | x: torch.Tensor 14 | y: torch.Tensor 15 | 16 | TRAIN_MAX = 255.0 17 | TRAIN_NORMALIZED_MEAN = 0.1306604762738429 18 | TRAIN_NORMALIZED_STDEV = 0.3081078038564622 19 | 20 | def __init__(self, data: np.ndarray, targets: np.ndarray): 21 | if len(data) != len(targets): 22 | raise ValueError( 23 | "data and targets must be the same length. 
" 24 | f"{len(data)} != {len(targets)}" 25 | ) 26 | 27 | self.data = data 28 | self.targets = targets 29 | 30 | def __len__(self): 31 | return len(self.data) 32 | 33 | def __getitem__(self, idx: int) -> tuple[torch.Tensor, torch.Tensor]: 34 | x = self.get_x(idx) 35 | y = self.get_y(idx) 36 | return x, y 37 | 38 | def get_x(self, idx: int): 39 | self.idx = idx 40 | self.preprocess_x() 41 | return self.x 42 | 43 | def preprocess_x(self): 44 | self.x = self.data[self.idx].copy().astype(np.float64) 45 | self.x /= self.TRAIN_MAX 46 | self.x -= self.TRAIN_NORMALIZED_MEAN 47 | self.x /= self.TRAIN_NORMALIZED_STDEV 48 | self.x = self.x.astype(np.float32) 49 | self.x = torch.from_numpy(self.x) 50 | self.x = self.x.unsqueeze(0) 51 | 52 | def get_y(self, idx: int): 53 | self.idx = idx 54 | self.preprocess_y() 55 | return self.y 56 | 57 | def preprocess_y(self): 58 | self.y = self.targets[self.idx] 59 | self.y = torch.tensor(self.y, dtype=torch.long) 60 | 61 | 62 | def create_dataloader( 63 | batch_size: int, data_path: Path, label_path: Path, shuffle: bool = True 64 | ) -> DataLoader[Any]: 65 | data = load_image_data(data_path) 66 | label_data = load_label_data(label_path) 67 | return DataLoader( 68 | dataset=MNIST(data, label_data), 69 | batch_size=batch_size, 70 | shuffle=shuffle, 71 | num_workers=0, 72 | ) 73 | -------------------------------------------------------------------------------- /before/ds/load_data.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import struct 3 | from pathlib import Path 4 | 5 | import numpy as np 6 | 7 | ALLOWED_TYPES = { 8 | "UNSIGNED_BYTE": b"\x08", 9 | "SIGNED_BYTE": b"\x09", 10 | "SHORT": b"\x0B", 11 | "INT": b"\x0C", 12 | "SINGLE": b"\x0D", 13 | "DOUBLE": b"\x0E", 14 | } 15 | 16 | 17 | def load_image_data(file_path: Path) -> np.ndarray: 18 | with gzip.open(file_path, "rb") as fp: 19 | _ = struct.unpack(">H", fp.read(2)) # dump padding bytes 20 | 21 | (data_type,) = struct.unpack(">c", fp.read(1)) 22 | assert data_type == ALLOWED_TYPES["UNSIGNED_BYTE"] 23 | 24 | number_of_dimensions = ord(struct.unpack(">c", fp.read(1))[0]) 25 | assert number_of_dimensions == 3 26 | 27 | (num_images,) = struct.unpack(">I", fp.read(4)) 28 | (num_rows,) = struct.unpack(">I", fp.read(4)) 29 | (num_cols,) = struct.unpack(">I", fp.read(4)) 30 | 31 | raw = fp.read() 32 | assert len(raw) == num_images * num_rows * num_cols 33 | 34 | data: np.ndarray = np.frombuffer(raw, dtype=np.dtype(np.uint8).newbyteorder(">")) 35 | data = data.reshape((num_images, num_rows, num_cols)) 36 | return data 37 | 38 | 39 | def load_label_data(file_path: Path) -> np.ndarray: 40 | with gzip.open(file_path, "rb") as fp: 41 | _ = struct.unpack(">H", fp.read(2)) # dump padding bytes 42 | 43 | (data_type,) = struct.unpack(">c", fp.read(1)) 44 | assert data_type == ALLOWED_TYPES["UNSIGNED_BYTE"] 45 | 46 | number_of_dimensions = ord(struct.unpack(">c", fp.read(1))[0]) 47 | assert number_of_dimensions == 1 48 | 49 | (num_images,) = struct.unpack(">I", fp.read(4)) 50 | 51 | raw = fp.read() 52 | assert len(raw) == num_images 53 | 54 | data = np.frombuffer(raw, dtype=np.dtype(np.uint8).newbyteorder(">")) 55 | return data 56 | -------------------------------------------------------------------------------- /before/ds/metrics.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | 3 | 4 | @dataclass 5 | class Metric: 6 | values: list[float] = field(default_factory=list) 7 | running_total: 
float = 0.0 8 | num_updates: float = 0.0 9 | average: float = 0.0 10 | 11 | def update(self, value: float, batch_size: int): 12 | self.values.append(value) 13 | self.running_total += value * batch_size 14 | self.num_updates += batch_size 15 | self.average = self.running_total / self.num_updates 16 | -------------------------------------------------------------------------------- /before/ds/models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class LinearNet(torch.nn.Module): 5 | def __init__(self): 6 | super().__init__() 7 | 8 | self.network = torch.nn.Sequential( 9 | torch.nn.Flatten(), 10 | torch.nn.Linear(in_features=28 * 28, out_features=32), 11 | torch.nn.ReLU(), 12 | torch.nn.Linear(in_features=32, out_features=10), 13 | torch.nn.Softmax(dim=1), 14 | ) 15 | 16 | def forward(self, x: torch.Tensor) -> torch.Tensor: 17 | return self.network(x) 18 | -------------------------------------------------------------------------------- /before/ds/runner.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Optional 2 | 3 | import numpy as np 4 | import torch 5 | from sklearn.metrics import accuracy_score 6 | from torch.utils.data.dataloader import DataLoader 7 | from tqdm import tqdm 8 | 9 | from ds.metrics import Metric 10 | from ds.tracking import ExperimentTracker, Stage 11 | 12 | 13 | class Runner: 14 | def __init__( 15 | self, 16 | loader: DataLoader[Any], 17 | model: torch.nn.Module, 18 | optimizer: Optional[torch.optim.Optimizer] = None, 19 | ) -> None: 20 | self.run_count = 0 21 | self.loader = loader 22 | self.accuracy_metric = Metric() 23 | self.model = model 24 | self.optimizer = optimizer 25 | # Objective (loss) function 26 | self.compute_loss = torch.nn.CrossEntropyLoss(reduction="mean") 27 | self.y_true_batches: list[list[Any]] = [] 28 | self.y_pred_batches: list[list[Any]] = [] 29 | # Assume Stage based on presence of optimizer 30 | self.stage = Stage.VAL if optimizer is None else Stage.TRAIN 31 | 32 | @property 33 | def avg_accuracy(self): 34 | return self.accuracy_metric.average 35 | 36 | def run(self, desc: str, experiment: ExperimentTracker): 37 | self.model.train(self.stage is Stage.TRAIN) 38 | 39 | for x, y in tqdm(self.loader, desc=desc, ncols=80): 40 | loss, batch_accuracy = self._run_single(x, y) 41 | 42 | experiment.add_batch_metric("accuracy", batch_accuracy, self.run_count) 43 | 44 | if self.optimizer: 45 | # Reverse-mode AutoDiff (backpropagation) 46 | self.optimizer.zero_grad() 47 | loss.backward() 48 | self.optimizer.step() 49 | 50 | def _run_single(self, x: Any, y: Any): 51 | self.run_count += 1 52 | batch_size: int = x.shape[0] 53 | prediction = self.model(x) 54 | loss = self.compute_loss(prediction, y) 55 | 56 | # Compute Batch Validation Metrics 57 | y_np = y.detach().numpy() 58 | y_prediction_np = np.argmax(prediction.detach().numpy(), axis=1) 59 | batch_accuracy: float = accuracy_score(y_np, y_prediction_np) 60 | self.accuracy_metric.update(batch_accuracy, batch_size) 61 | 62 | self.y_true_batches += [y_np] 63 | self.y_pred_batches += [y_prediction_np] 64 | return loss, batch_accuracy 65 | 66 | def reset(self): 67 | self.accuracy_metric = Metric() 68 | self.y_true_batches = [] 69 | self.y_pred_batches = [] 70 | 71 | 72 | def run_epoch( 73 | test_runner: Runner, 74 | train_runner: Runner, 75 | experiment: ExperimentTracker, 76 | epoch_id: int, 77 | ): 78 | # Training Loop 79 | experiment.set_stage(Stage.TRAIN) 80 | 
train_runner.run("Train Batches", experiment) 81 | 82 | # Log Training Epoch Metrics 83 | experiment.add_epoch_metric("accuracy", train_runner.avg_accuracy, epoch_id) 84 | 85 | # Testing Loop 86 | experiment.set_stage(Stage.VAL) 87 | test_runner.run("Validation Batches", experiment) 88 | 89 | # Log Validation Epoch Metrics 90 | experiment.add_epoch_metric("accuracy", test_runner.avg_accuracy, epoch_id) 91 | experiment.add_epoch_confusion_matrix( 92 | test_runner.y_true_batches, test_runner.y_pred_batches, epoch_id 93 | ) 94 | -------------------------------------------------------------------------------- /before/ds/tensorboard.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import numpy as np 4 | from matplotlib import pyplot as plt 5 | from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix 6 | from torch.utils.tensorboard import SummaryWriter 7 | 8 | from ds.tracking import Stage 9 | from ds.utils import create_experiment_log_dir 10 | 11 | 12 | class TensorboardExperiment: 13 | def __init__(self, log_path: str, create: bool = True): 14 | 15 | log_dir = create_experiment_log_dir(root=log_path) 16 | self.stage = Stage.TRAIN 17 | self._validate_log_dir(log_dir, create=create) 18 | self._writer = SummaryWriter(log_dir=log_dir) 19 | plt.ioff() 20 | 21 | def set_stage(self, stage: Stage): 22 | self.stage = stage 23 | 24 | def flush(self): 25 | self._writer.flush() 26 | 27 | @staticmethod 28 | def _validate_log_dir(log_dir: str, create: bool = True): 29 | log_path = Path(log_dir).resolve() 30 | if log_path.exists(): 31 | return 32 | elif not log_path.exists() and create: 33 | log_path.mkdir(parents=True) 34 | else: 35 | raise NotADirectoryError(f"log_dir {log_dir} does not exist.") 36 | 37 | def add_batch_metric(self, name: str, value: float, step: int): 38 | tag = f"{self.stage.name}/batch/{name}" 39 | self._writer.add_scalar(tag, value, step) 40 | 41 | def add_epoch_metric(self, name: str, value: float, step: int): 42 | tag = f"{self.stage.name}/epoch/{name}" 43 | self._writer.add_scalar(tag, value, step) 44 | 45 | def add_epoch_confusion_matrix( 46 | self, y_true: list[np.array], y_pred: list[np.array], step: int 47 | ): 48 | y_true, y_pred = self.collapse_batches(y_true, y_pred) 49 | fig = self.create_confusion_matrix(y_true, y_pred, step) 50 | tag = f"{self.stage.name}/epoch/confusion_matrix" 51 | self._writer.add_figure(tag, fig, step) 52 | 53 | @staticmethod 54 | def collapse_batches( 55 | y_true: list[np.array], y_pred: list[np.array] 56 | ) -> tuple[np.ndarray, np.ndarray]: 57 | return np.concatenate(y_true), np.concatenate(y_pred) 58 | 59 | def create_confusion_matrix( 60 | self, y_true: list[np.array], y_pred: list[np.array], step: int 61 | ) -> plt.Figure: 62 | cm = ConfusionMatrixDisplay(confusion_matrix(y_true, y_pred)).plot(cmap="Blues") 63 | cm.ax_.set_title(f"{self.stage.name} Epoch: {step}") 64 | return cm.figure_ 65 | -------------------------------------------------------------------------------- /before/ds/tracking.py: -------------------------------------------------------------------------------- 1 | from enum import Enum, auto 2 | from pathlib import Path 3 | from typing import Protocol 4 | 5 | import numpy as np 6 | from matplotlib import pyplot as plt 7 | from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix 8 | from torch.utils.tensorboard import SummaryWriter 9 | 10 | from ds.utils import create_experiment_log_dir 11 | 12 | 13 | class Stage(Enum): 14 | TRAIN = 
auto() 15 | TEST = auto() 16 | VAL = auto() 17 | 18 | 19 | class ExperimentTracker(Protocol): 20 | def set_stage(self, stage: Stage): 21 | """Sets the current stage of the experiment.""" 22 | 23 | def add_batch_metric(self, name: str, value: float, step: int): 24 | """Implements logging a batch-level metric.""" 25 | 26 | def add_epoch_metric(self, name: str, value: float, step: int): 27 | """Implements logging an epoch-level metric.""" 28 | 29 | def add_epoch_confusion_matrix( 30 | self, y_true: list[np.array], y_pred: list[np.array], step: int 31 | ): 32 | """Implements logging a confusion matrix at the epoch level.""" 33 | 34 | 35 | class TensorboardExperiment: 36 | def __init__(self, log_path: str, create: bool = True): 37 | self.stage = Stage.TRAIN 38 | self._writer = SummaryWriter( 39 | log_dir=create_experiment_log_dir(log_path, parents=True) 40 | ) 41 | plt.ioff() 42 | 43 | def set_stage(self, stage: Stage): 44 | self.stage = stage 45 | 46 | def flush(self): 47 | self._writer.flush() 48 | 49 | @staticmethod 50 | def _validate_log_dir(log_dir: str, create: bool = True): 51 | log_path = Path(log_dir).resolve() 52 | if log_path.exists(): 53 | return 54 | elif not log_path.exists() and create: 55 | log_path.mkdir(parents=True) 56 | else: 57 | raise NotADirectoryError(f"log_dir {log_dir} does not exist.") 58 | 59 | def add_batch_metric(self, name: str, value: float, step: int): 60 | tag = f"{self.stage.name}/batch/{name}" 61 | self._writer.add_scalar(tag, value, step) 62 | 63 | def add_epoch_metric(self, name: str, value: float, step: int): 64 | tag = f"{self.stage.name}/epoch/{name}" 65 | self._writer.add_scalar(tag, value, step) 66 | 67 | def add_epoch_confusion_matrix( 68 | self, y_true: list[np.array], y_pred: list[np.array], step: int 69 | ): 70 | y_true, y_pred = self.collapse_batches(y_true, y_pred) 71 | fig = self.create_confusion_matrix(y_true, y_pred, step) 72 | tag = f"{self.stage.name}/epoch/confusion_matrix" 73 | self._writer.add_figure(tag, fig, step) 74 | 75 | @staticmethod 76 | def collapse_batches( 77 | y_true: list[np.array], y_pred: list[np.array] 78 | ) -> tuple[np.ndarray, np.ndarray]: 79 | return np.concatenate(y_true), np.concatenate(y_pred) 80 | 81 | def create_confusion_matrix( 82 | self, y_true: list[np.array], y_pred: list[np.array], step: int 83 | ) -> plt.Figure: 84 | cm = ConfusionMatrixDisplay(confusion_matrix(y_true, y_pred)).plot(cmap="Blues") 85 | cm.ax_.set_title(f"{self.stage.name} Epoch: {step}") 86 | return cm.figure_ 87 | -------------------------------------------------------------------------------- /before/ds/utils.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | 3 | 4 | def create_experiment_log_dir(root: str, parents: bool = True) -> str: 5 | root_path = pathlib.Path(root).resolve() 6 | child = ( 7 | create_from_missing(root_path) 8 | if not root_path.exists() 9 | else create_from_existing(root_path) 10 | ) 11 | child.mkdir(parents=parents) 12 | return child.as_posix() 13 | 14 | 15 | def create_from_missing(root: pathlib.Path) -> pathlib.Path: 16 | return root / "0" 17 | 18 | 19 | def create_from_existing(root: pathlib.Path) -> pathlib.Path: 20 | children = [ 21 | int(c.name) for c in root.glob("*") 22 | if (c.is_dir() and c.name.isnumeric()) 23 | ] 24 | if is_first_experiment(children): 25 | child = create_from_missing(root) 26 | else: 27 | child = root / increment_experiment_number(children) 28 | return child 29 | 30 | 31 | def is_first_experiment(children: list[int]) -> bool: 32 | return 
len(children) == 0 33 | 34 | 35 | def increment_experiment_number(children: list[int]) -> str: 36 | return str(max(children) + 1) 37 | -------------------------------------------------------------------------------- /before/main.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | 3 | import torch 4 | 5 | from ds.dataset import create_dataloader 6 | from ds.models import LinearNet 7 | from ds.runner import Runner, run_epoch 8 | from ds.tracking import TensorboardExperiment 9 | 10 | # Hyperparameters 11 | EPOCH_COUNT = 20 12 | LR = 5e-5 13 | BATCH_SIZE = 128 14 | LOG_PATH = "./runs" 15 | 16 | # Data configuration 17 | DATA_DIR = "../data/raw" 18 | TEST_DATA = pathlib.Path(f"{DATA_DIR}/t10k-images-idx3-ubyte.gz") 19 | TEST_LABELS = pathlib.Path(f"{DATA_DIR}/t10k-labels-idx1-ubyte.gz") 20 | TRAIN_DATA = pathlib.Path(f"{DATA_DIR}/train-images-idx3-ubyte.gz") 21 | TRAIN_LABELS = pathlib.Path(f"{DATA_DIR}/train-labels-idx1-ubyte.gz") 22 | 23 | 24 | def main(): 25 | 26 | # Model and Optimizer 27 | model = LinearNet() 28 | optimizer = torch.optim.Adam(model.parameters(), lr=LR) 29 | 30 | # Create the data loaders 31 | test_loader = create_dataloader(BATCH_SIZE, TEST_DATA, TEST_LABELS) 32 | train_loader = create_dataloader(BATCH_SIZE, TRAIN_DATA, TRAIN_LABELS) 33 | 34 | # Create the runners 35 | test_runner = Runner(test_loader, model) 36 | train_runner = Runner(train_loader, model, optimizer) 37 | 38 | # Set up the experiment tracker 39 | tracker = TensorboardExperiment(log_path=LOG_PATH) 40 | 41 | # Run the epochs 42 | for epoch_id in range(EPOCH_COUNT): 43 | run_epoch(test_runner, train_runner, tracker, epoch_id) 44 | 45 | # Compute Average Epoch Metrics 46 | summary = ", ".join( 47 | [ 48 | f"[Epoch: {epoch_id + 1}/{EPOCH_COUNT}]", 49 | f"Test Accuracy: {test_runner.avg_accuracy: 0.4f}", 50 | f"Train Accuracy: {train_runner.avg_accuracy: 0.4f}", 51 | ] 52 | ) 53 | print("\n" + summary + "\n") 54 | 55 | # Reset the runners 56 | train_runner.reset() 57 | test_runner.reset() 58 | 59 | # Flush the tracker after every epoch for live updates 60 | tracker.flush() 61 | 62 | 63 | if __name__ == "__main__": 64 | main() 65 | -------------------------------------------------------------------------------- /before/parse_raw_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import pathlib 4 | from typing import Iterable, Tuple 5 | 6 | import numpy as np 7 | from PIL import Image 8 | from tqdm import tqdm 9 | 10 | from ds.load_data import load_image_data, load_label_data 11 | 12 | RAW_DATA = "./data/raw" 13 | TEST_DATA_RAW = pathlib.Path(f"{RAW_DATA}/t10k-images-idx3-ubyte.gz") 14 | TEST_LABELS_RAW = pathlib.Path(f"{RAW_DATA}/t10k-labels-idx1-ubyte.gz") 15 | TRAIN_DATA_RAW = pathlib.Path(f"{RAW_DATA}/train-images-idx3-ubyte.gz") 16 | TRAIN_LABELS_RAW = pathlib.Path(f"{RAW_DATA}/train-labels-idx1-ubyte.gz") 17 | 18 | PROCESSED_DATA = './data/processed' 19 | TEST_DIR_PROCESSED = pathlib.Path(f"{PROCESSED_DATA}/test") 20 | TRAIN_DIR_PROCESSED = pathlib.Path(f"{PROCESSED_DATA}/train") 21 | 22 | 23 | def main(): 24 | make_tree(TRAIN_DIR_PROCESSED, reset=True) 25 | make_tree(TEST_DIR_PROCESSED, reset=True) 26 | 27 | save_dataset_to_png( 28 | TRAIN_DIR_PROCESSED, 29 | zip(load_image_data(TRAIN_DATA_RAW), load_label_data(TRAIN_LABELS_RAW)) 30 | ) 31 | save_dataset_to_png( 32 | TEST_DIR_PROCESSED, 33 | zip(load_image_data(TEST_DATA_RAW), load_label_data(TEST_LABELS_RAW)) 34 | ) 35 | 36 | 37 
| def make_tree(root: pathlib.Path, reset: bool = False) -> None: 38 | if reset: 39 | reset_tree(root) 40 | for child in range(10): 41 | child = pathlib.Path(str(child)) 42 | if not (root / child).exists(): 43 | os.makedirs(root / child) 44 | 45 | 46 | def reset_tree(root: pathlib.Path) -> None: 47 | print('Resetting tree.') 48 | shutil.rmtree(root, ignore_errors=True) 49 | 50 | 51 | def save_dataset_to_png( 52 | root: pathlib.Path, array: Iterable[Tuple[np.ndarray, np.ndarray]] 53 | ) -> None: 54 | for i, xy in enumerate(tqdm(tuple(array), ncols=80)): 55 | save_xy_to_png(root, xy, str(i)) 56 | 57 | 58 | def save_xy_to_png( 59 | root: pathlib.Path, xy: Tuple[np.ndarray, np.ndarray], name: str 60 | ) -> None: 61 | x, y = xy 62 | Image.fromarray(x).save(root / str(int(y)) / f'{name}.png') 63 | 64 | 65 | if __name__ == '__main__': 66 | main() 67 | -------------------------------------------------------------------------------- /before/requirements.txt: -------------------------------------------------------------------------------- 1 | # Python: 3.9.6 2 | torch 3 | numpy 4 | pandas 5 | scikit-learn 6 | matplotlib 7 | tensorboard 8 | mlflow 9 | tqdm 10 | pillow -------------------------------------------------------------------------------- /data/README.md: -------------------------------------------------------------------------------- 1 | The `processed` folder will contain PNG images of the MNIST dataset after running [parse_raw_data.py](../parse_raw_data.py). The images are not directly committed to GitHub. Rather, the script should be executed to create them. 2 | 3 | --- 4 | 5 | The `raw` folder contains the raw MNIST data files, which can be downloaded [here](http://yann.lecun.com/exdb/mnist/). Below is some information about the file format of the MNIST dataset needed to create the processed images. 6 | 7 | --- 8 | 9 | header\ 10 | size in dimension 0\ 11 | size in dimension 1\ 12 | size in dimension 2\ 13 | .....\ 14 | size in dimension N\ 15 | data 16 | 17 | The magic number is an integer (MSB first). The first 2 bytes are always 0. 18 | 19 | The third byte codes the type of the data:\ 20 | 0x08: unsigned byte\ 21 | 0x09: signed byte\ 22 | 0x0B: short (2 bytes)\ 23 | 0x0C: int (4 bytes)\ 24 | 0x0D: float (4 bytes)\ 25 | 0x0E: double (8 bytes) 26 | 27 | The 4-th byte codes the number of dimensions of the vector/matrix: 1 for vectors, 2 for matrices.... 28 | 29 | The sizes in each dimension are 4-byte integers (MSB first, high endian, like in most non-Intel processors). 30 | 31 | The data is stored like in a C array, i.e. the index in the last dimension changes the fastest. 32 | 33 | TEST SET LABEL FILE (t10k-labels-idx1-ubyte):\ 34 | [offset] [type] [value] [description]\ 35 | 0000 32 bit integer 0x00000801(2049) magic number (MSB first)\ 36 | 0004 32 bit integer 10000 number of items\ 37 | 0008 unsigned byte ?? label\ 38 | 0009 unsigned byte ?? label\ 39 | ........\ 40 | xxxx unsigned byte ?? label\ 41 | The label values are 0 to 9. 
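As a cross-check of the header layout above, here is a minimal Python sketch (independent of `ds/load_data.py`, which does the full parse) that decodes the magic number and dimension sizes; the file path assumes it is run from this `data` folder:

```python
import gzip
import struct

# Decode the IDX header described above: 2 zero bytes, a type code,
# a dimension count, then one 4-byte MSB-first size per dimension.
with gzip.open("raw/train-images-idx3-ubyte.gz", "rb") as fp:
    zeros, type_code, ndim = struct.unpack(">HBB", fp.read(4))
    assert zeros == 0 and type_code == 0x08  # unsigned byte data
    dims = struct.unpack(f">{ndim}I", fp.read(4 * ndim))
    print(dims)  # expected: (60000, 28, 28) for the training images
```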
42 | -------------------------------------------------------------------------------- /data/raw/t10k-images-idx3-ubyte.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArjanCodes/2021-config/7c2c3babb0fb66d69eac81590356fae512c5e784/data/raw/t10k-images-idx3-ubyte.gz -------------------------------------------------------------------------------- /data/raw/t10k-labels-idx1-ubyte.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArjanCodes/2021-config/7c2c3babb0fb66d69eac81590356fae512c5e784/data/raw/t10k-labels-idx1-ubyte.gz -------------------------------------------------------------------------------- /data/raw/train-images-idx3-ubyte.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArjanCodes/2021-config/7c2c3babb0fb66d69eac81590356fae512c5e784/data/raw/train-images-idx3-ubyte.gz -------------------------------------------------------------------------------- /data/raw/train-labels-idx1-ubyte.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArjanCodes/2021-config/7c2c3babb0fb66d69eac81590356fae512c5e784/data/raw/train-labels-idx1-ubyte.gz -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # Python: 3.9.9 2 | torch 3 | numpy 4 | pandas 5 | scikit-learn 6 | matplotlib 7 | tensorboard 8 | mlflow 9 | tqdm 10 | pillow 11 | hydra-core --------------------------------------------------------------------------------