├── .gitignore
├── .pre-commit-config.yaml
├── Dockerfile
├── README.md
├── assets
    ├── banner.jpg
    ├── cifar10-dataset.png
    ├── docker-build.png
    ├── k3d-clsuter-info.png
    ├── k3d-image-import.png
    ├── kubectl-get-nodes.png
    └── torch-version.png
├── data_ingestion.py
├── distributed-training.py
├── inference-request.py
├── inference-service.yaml
├── message-dumper.yaml
├── mnist-input.json
├── model-selection.py
├── model-selection.yaml
├── predict-service.py
├── predict-service.yaml
├── pvc.yaml
├── requirements-dev.txt
├── requirements.txt
├── tfjob.yaml
└── workflow.yaml


/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | *$py.class
 5 | 
 6 | # Jupyter Notebook
 7 | .ipynb_checkpoints
 8 | 
 9 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
10 | __pypackages__/
11 | 
12 | # Environments
13 | .env
14 | .venv
15 | env/
16 | venv/
17 | ENV/
18 | env.bak/
19 | venv.bak/
20 | 
21 | # datasets
22 | data/
23 | 
24 | # ruff
25 | .ruff_cache


--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
 1 | repos:
 2 |   # a set of useful Python-based pre-commit hooks
 3 |   - repo: https://github.com/pre-commit/pre-commit-hooks
 4 |     rev: v4.6.0
 5 |     hooks:
 6 |       # list of definitions and supported hooks: https://pre-commit.com/hooks.html
 7 |       - id: trailing-whitespace # removes any whitespace at the ends of lines
 8 |       - id: check-toml # check toml syntax by loading all toml files
 9 |       - id: check-yaml # check yaml syntax by loading all yaml files
10 |       - id: check-json # check-json syntax by loading all json files
11 |       - id: check-merge-conflict # check for files with merge conflict strings
12 |         args: ["--assume-in-merge"] #  and run this check even when not explicitly in a merge
13 |       - id: check-added-large-files # check that no "large" files have been added
14 |         args: ["--maxkb=10240"] #  where large means 10MB+, as in Hugging Face's git server
15 |       - id: debug-statements # check for python debug statements (import pdb, breakpoint, etc.)
16 |       - id: detect-private-key # checks for private keys (BEGIN X PRIVATE KEY, etc.)
17 |       - id: pretty-format-json # cleaner json files
18 |         args: ["--autofix"] # automatically
19 | 
20 |   # black for python autoformatting
21 |   - repo: https://github.com/psf/black
22 |     rev: 24.4.2
23 |     hooks:
24 |       - id: black
25 | 
26 |   # ruff for Python linting
27 |   - repo: https://github.com/charliermarsh/ruff-pre-commit
28 |     # Ruff version.
29 |     rev: "v0.5.3"
30 |     hooks:
31 |       - id: ruff


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM python:3.9
 2 | 
 3 | # Install necessary dependencies
 4 | RUN apt-get update && apt-get install -y libhdf5-dev
 5 | 
 6 | RUN pip install tensorflow==2.12.0 tensorflow_datasets==4.9.2
 7 | 
 8 | COPY data_ingestion.py /
 9 | COPY predict-service.py /
10 | COPY model-selection.py /
11 | COPY distributed-training.py /


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
   1 | # Distributed Machine Learning System
   2 | 
   3 | ![torch](<assets/banner.jpg>)
   4 | 
   5 | Building a distributed machine learning system in this modern era of deep learning is necessary. Every company that uses machine learning wants to serve their customers at scale. Models are becoming huge and the datasets required to train these models are also increasing. This trend will continue in future as well because of the [scaling laws](https://arxiv.org/abs/2203.15556). On top of that GPUs are very expensive. So keeping these GPUs idle can cost you a lot. Using multi-GPU training and optimizing inference can save costs and improve user experience.
   6 | 
   7 | I'm working on this project to get a deeper understanding of distributed machine learning systems. I'm using Kubernetes, Kubeflow, Tensorflow and Argo.
   8 | 
   9 | 
  10 | - [Distributed Machine Learning System](#distributed-machine-learning-system)
  11 |   - [Background](#background)
  12 |   - [Setup](#setup)
  13 |   - [System Architecture](#system-architecture)
  14 |   - [Data Ingestion](#data-ingestion)
  15 |     - [Create a simple pipeline](#create-a-simple-pipeline)
  16 |     - [Create a distributed data pipeline](#create-a-distributed-data-pipeline)
  17 |       - [What happens actually under the hood when this strategy is used?](#what-happens-actually-under-the-hood-when-this-strategy-is-used)
  18 |   - [Create a simple neural net](#create-a-simple-neural-net)
  19 |     - [Create a distributed model training workflow](#create-a-distributed-model-training-workflow)
  20 |   - [Model saving](#model-saving)
  21 |   - [Containerization](#containerization)
  22 |   - [Persistent volume](#persistent-volume)
  23 |   - [TFJob](#tfjob)
  24 |   - [Model selection](#model-selection)
  25 |   - [Inference](#inference)
  26 |     - [Create a single server model inference service](#create-a-single-server-model-inference-service)
  27 |     - [Distributed model inference](#distributed-model-inference)
  28 |   - [Replicated model servers inference](#replicated-model-servers-inference)
  29 |   - [End-to-end Workflow](#end-to-end-workflow)
  30 |   - [Logger](#logger)
  31 |   - [Monitoring](#monitoring)
  32 |   - [Summary](#summary)
  33 |   - [Next Steps](#next-steps)
  34 |   - [References](#references)
  35 | 
  36 | 
  37 | ## Background
  38 | 
  39 | > Distributed systems are a group of nodes that talk to each other to achieve a specific task, such as streaming movies across devices, search engines, etc. - Understanding distributed systems
  40 | 
  41 | These systems handle massive amounts of data across multiple clusters, use automation tools, and benefit from hardware accelerations.
  42 | 
  43 | This repository includes code and references to implement a scalable and reliable machine learning system. I'm constructing all the components including data ingestion, training, serving, and monitoring these workloads.
  44 | 
  45 | I'm building an image classification end-to-end system.
  46 | 
  47 | The steps involved are:
  48 | 1. Setup
  49 | 2. Data Ingestion
  50 | 3. Distributed Training
  51 | 4. Evaluation
  52 | 5. Serving
  53 | 6. End-to-End Workflow
  54 | 
  55 | 
  56 | ## Setup
  57 | 
  58 | I'm using a MacBook. These systems are generally built on the cloud. I'm using conda as the package manager. I also use Homebrew for installations.
  59 | 
  60 | [1] Let's install TensorFlow and TensorFlow Datasets for data processing, model building and evaluation workflows.
  61 | 
  62 | ```bash
  63 | conda install tensorflow tensorflow_datasets
  64 | ```
  65 | 
  66 | ![torch](<assets/torch-version.png>)
  67 | 
  68 | 
  69 | [2] `Docker` is required to create single- or multi-node clusters. I'm installing docker desktop.
  70 | 
  71 | [3] Install a popular CLI utility called `kubectl`.
  72 | 
  73 | ```bash
  74 | brew install kubectl
  75 | ```
  76 | 
  77 | [4] To use Kubernetes on the local machine, install k3d, a lightweight wrapper to run k8s. There's minikube, kind and other distributions as well but I find k3d lean, memory efficient and simpler.
  78 | 
  79 | ```bash
  80 | wget -q -O - https://raw.githubusercontent.com/rancher/k3d/main/install.sh | bash
  81 | ```
  82 | 
  83 | Create a single-node cluster. You can create a multi-server cluster as well by specifying `--servers 3`.
  84 | 
  85 | ```bash
  86 | k3d cluster create dist-ml --image rancher/k3s:v1.25.3-k3s1
  87 | ```
  88 | 
  89 | You can see the cluster info using command
  90 | 
  91 | ```bash
  92 | kubectl cluster-info
  93 | ```
  94 | 
  95 | ![k3d-cluster-info](assets/k3d-clsuter-info.png)
  96 | 
  97 | Let's see which nodes are created using
  98 | 
  99 | ```bash
 100 | kubectl get nodes
 101 | ```
 102 | 
 103 | ![kubectl-get-nodes](assets/kubectl-get-nodes.png)
 104 | 
 105 | [5] Install `kubectx` to easily switch between clusters and `kubens` for namespaces. This is a very handy utility.
 106 | 
 107 | ```bash
 108 | brew install kubectx
 109 | ```
 110 | 
 111 | Using kubens you can switch between namespaces easily.
 112 | 
 113 | [6] Next, we install kubeflow training operator that allows us to train large models effectively.
 114 | 
 115 | But first create a new namespace.
 116 | 
 117 | ```bash
 118 | kubectl create namespace kubeflow
 119 | 
 120 | kubens kubeflow
 121 | ```
 122 | 
 123 | Now install the operator.
 124 | 
 125 | ```bash
 126 | kubectl apply -k "github.com/kubeflow/training-operator.git/manifests/overlays/standalone?ref=v1.7.0"
 127 | ```
 128 | 
 129 | [7] To create an end-to-end workflow we need argo workflows.
 130 | 
 131 | ```bash
 132 | kubectl create namespace argo
 133 | kubectl apply -n argo -f https://github.com/argoproj/argo-workflows/releases/download/v3.5.8/install.yaml
 134 | ```
 135 | 
 136 | [8] For experiment tracking, install MLFlow.
 137 | 
 138 | ```bash
 139 | helm install dist-mlflow community-charts/mlflow
 140 | ```
 141 | 
 142 | ## System Architecture
 143 | 
 144 | There are multiple design patterns which can be used to create an ML system. In this project, I'm sticking to the easiest one. It has a data ingestion component. Once data is available you can schedule the pipeline to download the data and store it somewhere (e.g. S3). We then train multiple models on the same dataset in parallel. Once the models are available, we can pick the best model and create a scalable inference service.
 145 | 
 146 | <img width="1075" alt="Screenshot 2024-06-17 at 3 28 42 PM" src="https://github.com/aniket-mish/distributed-ml-system/assets/71699313/635143bb-0952-4578-99cd-6d40d1172a33">
 147 | 
 148 | ## Data Ingestion
 149 | 
 150 | I'm using the cifar10 dataset that has 60,000 images (50,000 for training and 10,000 for testing). It has 10 different categories and each image is a low-resolution color image of 32x32 px.
 151 | 
 152 | ![cifar10](assets/cifar10-dataset.png)
 153 | 
 154 | ### Create a simple pipeline
 155 | 
 156 | The `tf.data` API enables you to build complex input pipelines from simple, reusable pieces. It's very efficient and enables handling large amounts of data, reading from different data formats, and performing complex transformations.
 157 | 
 158 | I'm loading the dataset into a `tf.data.Dataset` object and cast the images to float32. Next, I'm normalizing the image pixel values from the [0, 255] to the [0, 1] range. These are some standard practices. I'm keeping an *in-memory cache* to improve performance. Let's also shuffle the training data to add some randomness.
 159 | 
 160 | ```python
 161 | import Tensorflow_datasets as tfds
 162 | import Tensorflow as tf
 163 | 
 164 | def get_dataset():
 165 |     BUFFER_SIZE = 10000
 166 |     def scale(image, label):
 167 |         image = tf.cast(image, tf.float32)
 168 |         image /= 255
 169 |         return image, label
 170 |     datasets, info = tfds.load(name='mnist', with_info=True, as_supervised=True)
 171 |     train = datasets['train']
 172 |     return train.map(scale).cache().shuffle(BUFFER_SIZE)
 173 | ```
 174 | 
 175 | I'm using Tensorflow datasets module to load the dataset. The above piece of code gives a shuffled dataset where each element consists of images and labels.
 176 | 
 177 | ### Create a distributed data pipeline
 178 | 
 179 | To consume a large dataset(>PBs), we need to use a distributed approach. We can do that with some tweaks to the same function that we created.
 180 | 
 181 | For distributed data ingestion, just increase the batch size to use the extra computing power effectively.
 182 | 
 183 | > [!TIP]
 184 | > Use the largest batch size that fits the GPU memory
 185 | 
 186 | There are several strategies in-built into Tensorflow library. There is a `MirroredStrategy()` that can be used to train on a single machine with multiple GPUs but if you want to distribute training on multiple machines in a cluster/s(recommended and my goal), then `MultiWorkerMirroredStrategy()` strategy is a way to go.
 187 | 
 188 | ```python
 189 | strategy = tf.distribute.MultiWorkerMirroredStrategy()
 190 | 
 191 | BATCH_SIZE_PER_REPLICA = 64
 192 | BATCH_SIZE = BATCH_SIZE_PER_REPLICA * strategy.num_replicas_in_sync
 193 | ```
 194 | 
 195 | The `num_replicas_in_sync` equals the number of devices that are used in the **all-reduce** operation. Use the `tf.distribute.MultiWorkerMirroredStrategy` API and with the help of this strategy, a keras model that was designed to run on a single worker can seamlessly work on multiple workers with minimal code changes.
 196 | 
 197 | #### What happens actually under the hood when this strategy is used?
 198 | 
 199 | 1. Each GPU performs the forward pass on a different slice of the input data and computes the loss
 200 | 
 201 | 2. Next, each GPU computes the gradients based on the loss
 202 | 
 203 | 3. These gradients are then aggregated across all of the devices(using an all-reduce algorithm)
 204 | 
 205 | 4. The optimizer updates the weights using the reduced gradients thereby keeping the devices in sync
 206 | 
 207 | > [!NOTE]
 208 | > PyTorch has [DDP](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html) and [FSDP](https://pytorch.org/docs/stable/fsdp.html)(more popular and useful)
 209 | 
 210 | I'm enabling automatic data sharding across workers by setting `tf.data.experimental.AutoShardPolicy` to `AutoShardPolicy.DATA`. This setting is needed to ensure convergence and performance. The concept of [sharding](https://www.Tensorflow.org/api_docs/python/tf/data/experimental/DistributeOptions) means handing each worker a subset of the entire dataset.
 211 | 
 212 | Now the final training workflow can be written below
 213 | 
 214 | ```python
 215 | with strategy.scope():
 216 |     dataset = get_dataset().batch(BATCH_SIZE).repeat()
 217 |     options = tf.data.Options()
 218 |     options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.DATA
 219 |     dataset = dataset.with_options(options)
 220 |     model = build_and_compile_cnn_model()
 221 | 
 222 | model.fit(dataset, epochs=5, steps_per_epoch=70)
 223 | ```
 224 | 
 225 | ## Create a simple neural net
 226 | 
 227 | A simple neural net with `Adam` optimizer and `SparseCategoricalCrossentropy` loss as we have 10 categories to predict from.
 228 | 
 229 | ```python
 230 | def build_and_compile_cnn_model():
 231 |     """
 232 |     Build and compile a simple cnn model
 233 |     """
 234 |     print("Training a simple cnn model")
 235 |     model = tf.keras.models.Sequential()
 236 |     model.add(tf.keras.layers.Input(shape=(28, 28, 1), name="image_bytes"))
 237 |     model.add(tf.keras.layers.Conv2D(32, (3, 3), activation="relu"))
 238 |     model.add(tf.keras.layers.MaxPooling2D((2, 2)))
 239 |     model.add(tf.keras.layers.Flatten())
 240 |     model.add(tf.keras.layers.Dense(64, activation="relu"))
 241 |     model.add(tf.keras.layers.Dense(10, activation="softmax"))
 242 | 
 243 |     model.summary()
 244 | 
 245 |     model.compile(
 246 |         optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
 247 |     )
 248 | 
 249 |     return model
 250 | ```
 251 | 
 252 | Let's define some necessary callbacks that will be executed during model training.
 253 | 
 254 | 1. Checkpointing saves model weights at some frequency(use `save_freq`). We use `tf.keras.callbacks.ModelCheckpoint` for checkpointing.
 255 | 
 256 | ```python
 257 | checkpoint_dir = './training_checkpoints'
 258 | checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")
 259 | ```
 260 | 
 261 | I define the checkpoint directory to store the checkpoints and the names of the files. Checkpoints are important to restore the weights if the model training stops due to some issues.
 262 | 
 263 | 2. `tf.keras.callbacks.TensorBoard` writes a log for TensorBoard, which allows you to visualize the graphs.
 264 | 
 265 | 3. `tf.keras.callbacks.LearningRateScheduler` schedules the learning rate to change after, for example, every epoch/batch.
 266 | 
 267 | ```python
 268 | def decay(epoch):
 269 |     if epoch < 3:
 270 |         return 1e-3
 271 |     elif epoch >= 3 and epoch < 7:
 272 |         return 1e-4
 273 |     else:
 274 |         return 1e-5
 275 | ```
 276 | 
 277 | 4. PrintLR prints the learning rate at the end of each epoch.
 278 | 
 279 | ```python
 280 | class PrintLR(tf.keras.callbacks.Callback):
 281 |     def on_epoch_end(self, epoch, logs=None):
 282 |         print('\nLearning rate for epoch {} is {}'.format(        epoch + 1, model.optimizer.lr.numpy()))
 283 | ```
 284 | 
 285 | Now put all the components together.
 286 | 
 287 | ```python
 288 | callbacks = [
 289 |     tf.keras.callbacks.TensorBoard(log_dir='./logs'),
 290 |     tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_prefix, save_weights_only=True),
 291 |     tf.keras.callbacks.LearningRateScheduler(decay),
 292 |     PrintLR()
 293 | ]
 294 | ```
 295 | 
 296 | Now every piece is in its correct place.
 297 | 
 298 | Next, train the model.
 299 | 
 300 | ```python
 301 | model = build_and_compile_cnn_model()
 302 | model.fit(dataset, epochs=3, steps_per_epoch=70, callbacks=callbacks)
 303 | ```
 304 | 
 305 | I'm getting an accuracy of 94% on the training data. I'm not spending much time on increasing the accuracy as it's not our end goal.
 306 | 
 307 | > [!NOTE]
 308 | > I'm doing these experiments in a colab notebook. Later I will copy this code to python scripts.
 309 | 
 310 | ### Create a distributed model training workflow
 311 | 
 312 | I've already discussed the strategy to use here. Just a recap - for distributed training on multiple workers, use the `MultiWorkerMirroredStrategy` with Keras (tf as backend).
 313 | 
 314 | There are different ways to do distributed training and `data parallelism` is the most common one. There are two common ways to do [distributed training with data parallelism](https://www.Tensorflow.org/tutorials/distribute/multi_worker_with_keras):
 315 | 
 316 | 1. *Synchronous* training, where the steps of training are synced across the workers and replicas. All workers train over different slices of input data in sync, aggregating gradients at each step.
 317 | 
 318 | 2. *Asynchronous* training, where the training steps are not strictly synced. All workers independently train over the input data and update variables asynchronously. See [parameter server training](https://www.Tensorflow.org/tutorials/distribute/parameter_server_training).
 319 | 
 320 | I'm using the `MultiWorkerMirroredStrategy` that does synchronous distributed training across multiple workers, each with potentially multiple GPUs. It replicates all variables and computations to each local device and uses distributed collective implementation (e.g. all-reduce) so that multiple workers can work together.
 321 | 
 322 | Let's initiate the distributed input data pipeline and the model inside the strategy scope but hold on we need to save these models somewhere so that they can then be fetched for inference.
 323 | 
 324 | ## Model saving
 325 | 
 326 | To save the model we can use `model.save`, but since there are multiple workers training in parallel, the saving destinations need to be different for each worker.
 327 | 
 328 | One approach is:
 329 | - For worker nodes, save the model to a temporary directory
 330 | - For the master node, save the model to the provided directory
 331 | 
 332 | The temporary directories of the workers need to be unique to prevent errors. The model saved in the directories will be identical, and only the model saved by the master should be referenced for restoring or serving.
 333 | 
 334 | I'm not saving the model to temporary directories as doing this will waste my laptop's computing resources and memory. I'm determining which worker node is the master and saving its model only.
 335 | 
 336 | To determine if the worker node is the master or not, use the environment variable `TF_CONFIG`.
 337 | 
 338 | An example configuration looks like below:
 339 | ```python
 340 | tf_config = {
 341 |     'cluster': {
 342 |         'worker': ['localhost:12345', 'localhost:23456']
 343 |     },
 344 |     'task': {'type': 'worker', 'index': 0}
 345 | }
 346 | ```
 347 | 
 348 | The `_is_worker_master` function checks the task index read from `TF_CONFIG` and returns `True` if the worker is the master (index 0) and `False` otherwise.
 349 | 
 350 | ```python
 351 | def _is_worker_master():
 352 |     return TASK_INDEX == 0
 353 | 
 354 | tf_config = json.loads(os.environ.get('TF_CONFIG') or '{}')
 355 | TASK_INDEX = tf_config['task']['index']
 356 | 
 357 | if _is_worker_master():
 358 |     model_path = args.saved_model_dir
 359 | else:
 360 |     model_path = args.saved_model_dir + '/worker_tmp_' + str(TASK_INDEX)
 361 | 
 362 | multi_worker_model.save(model_path)
 363 | ```
 364 | 
 365 | 
 366 | ## Containerization
 367 | 
 368 | To containerize I have a Python script called `distributed-training.py` that has all the three models.
 369 | 
 370 | Let's create a `Dockerfile`
 371 | 
 372 | ```dockerfile
 373 | FROM python:3.9
 374 | RUN pip install Tensorflow Tensorflow_datasets
 375 | COPY distributed-training.py /
 376 | ```
 377 | 
 378 | Next, build the docker image
 379 | 
 380 | ```bash
 381 | docker build -f Dockerfile -t kubeflow/distributed-training-strategy:v0.1 .
 382 | ```
 383 | 
 384 | ![docker-build](assets/docker-build.png)
 385 | 
 386 | 
 387 | I need to import the image to the k3d cluster as it cannot access the image registry.
 388 | 
 389 | ```bash
 390 | k3d image import kubeflow/distributed-training-strategy:v0.1 --cluster dist-ml
 391 | ```
 392 | 
 393 | ![k3d-image-import](assets/k3d-image-import.png)
 394 | 
 395 | 
 396 | ## Persistent volume
 397 | 
 398 | When training a model in respective pods, if the operations/computations are completed/failed, the files in the pod are recycled/deleted by the garbage collector. This means that all the model checkpoints are lost, that means now we don't have a model for serving.
 399 | 
 400 | To avoid this we have to use `PersistentVolume` and `PersistentVolumeClaim`.
 401 | 
 402 | A *_PersistentVolume(PV)_* is a piece of storage in the cluster that has been provisioned by an administrator or dynamically provisioned. It is a resource in the cluster just like a node is a cluster resource. PVs are volume plugins like volumes but have a lifecycle independent of any individual Pod that uses the PV, that means the storage will persist and live even when the pods are deleted.
 403 | 
 404 | A *_PersistentVolumeClaim (PVC)_* is a request for storage by a user. Pods consume node resources and PVCs consume PV resources. Pods can request specific levels of resources (CPU and Memory). Claims can request specific size and access modes (e.g., they can be mounted ReadWriteOnce, ReadOnlyMany, or ReadWriteMany).
 405 | 
 406 | Next, create a PVC to submit a request for storage that will be used in worker pods to store the trained model. I'm requesting 1GB of storage with `ReadWriteOnce` mode.
 407 | 
 408 | ```yaml
 409 | apiVersion: v1
 410 | kind: PersistentVolumeClaim
 411 | metadata:
 412 |   name: strategy-volume
 413 | spec:
 414 |   accessModes: [ "ReadWriteOnce" ]
 415 |   volumeMode: Filesystem
 416 |   resources:
 417 |     requests:
 418 |       storage: 1Gi
 419 | ```
 420 | 
 421 | ```bash
 422 | kubectl apply -f pvc.yaml
 423 | ```
 424 | 
 425 | ## TFJob
 426 | 
 427 | Next, define a TFJob spec which helps distributed model training.
 428 | 
 429 | > [!NOTE]
 430 | > There's a concept of deployments and the main difference between deployments and jobs is how they handle a pod that is terminated. a deployment is intended to be a "service",
 431 | > e.g. it should be up and running, so it will try to restart the pods it's managing, to match the desired number of replicas. while a job is intended to execute and successfully
 432 | > terminate.
 433 | 
 434 | ```yaml
 435 | apiVersion: kubeflow.org/v1
 436 | kind: TFJob
 437 | metadata:
 438 |   name: training
 439 | spec:
 440 |   runPolicy:
 441 |     cleanPodPolicy: None
 442 |   tfReplicaSpecs:
 443 |     Worker:
 444 |       replicas: 2
 445 |       restartPolicy: Never
 446 |       template:
 447 |         spec:
 448 |           containers:
 449 |             - name: tensorflow  # must be lowercase; the training operator expects this container name
 450 |               image: kubeflow/distributed-training-strategy:v0.1
 451 |               imagePullPolicy: IfNotPresent
 452 |               command: ["python", "/distributed-training.py", "--saved_model_dir", "/trained_model/saved_model_versions/2/", "--checkpoint_dir", "/trained_model/checkpoint", "--model_type", "cnn"]
 453 |               volumeMounts:
 454 |                 - mountPath: /trained_model
 455 |                   name: training
 456 |               resources:
 457 |                 limits:
 458 |                   cpu: 500m
 459 |           volumes:
 460 |             - name: training
 461 |               persistentVolumeClaim:
 462 |                 claimName: strategy-volume
 463 | ```
 464 | 
 465 | You can pass `saved_model_dir` and `checkpoint_dir` to the container.
 466 | 
 467 | The `volumes` field specifies the persistent volume claim and `volumeMounts` field specifies what folder to mount the files. The `CleanPodPolicy` in the TFJob spec controls the deletion of pods when a job terminates. The `restartPolicy` determines whether pods will be restarted when they exit.
 468 | 
 469 | Submit the TFJob to our cluster and start our distributed model training.
 470 | 
 471 | ```bash
 472 | kubectl apply -f tfjob.yaml
 473 | ```
 474 | 
 475 | I can see two pods running our distributed training as we've specified `2` workers.
 476 | 
 477 | 1. training-worker-0
 478 | 2. training-worker-1
 479 | 
 480 | Let's see the logs from the pod `training-worker-0`
 481 | 
 482 | ```bash
 483 | kubectl logs training-worker-0
 484 | ```
 485 | 
 486 | #TODO
 487 | 
 488 | 
 489 | While training the model, I'm storing it in the `/trained_model/saved_model_versions/2/` path.
 490 | 
 491 | > [!NOTE]
 492 | > We can edit/update the code and resubmit the job. Just delete the running job, rebuild the docker image, import it, and resubmit the job. These are the steps to remember every
 493 | > time we change the code.
 494 | 
 495 | ```bash
 496 | kubectl delete tfjob --all
 497 | docker build -f Dockerfile -t kubeflow/distributed-training-strategy:v0.1 .
 498 | k3d image import kubeflow/distributed-training-strategy:v0.1 --cluster dist-ml
 499 | kubectl apply -f tfjob.yaml
 500 | ```
 501 | 
 502 | Voila! model training's done.
 503 | 
 504 | Next, evaluate the model's performance.
 505 | 
 506 | ```bash
 507 | kubectl apply -f predict-service.yaml
 508 | ```
 509 | 
 510 | Finally, I have a trained model stored in the file path `trained_model/saved_model_versions/2/`.
 511 | 
 512 | ```bash
 513 | kubectl exec --stdin --tty predict-service -- bin/bash
 514 | ```
 515 | 
 516 | To see the evals, you can exec into a running container `predict-service`.
 517 | 
 518 | #TODO
 519 | 
 520 | Execute `predict-service.py` which takes the trained model and evaluates it on the `test` dataset.
 521 | 
 522 | ## Model selection
 523 | 
 524 | I'm training multiple models and let's pick the best one and use it for the inference.
 525 | 
 526 | I've a deep neural network with batch norm layers.
 527 | 
 528 | ```python
 529 | def build_and_compile_cnn_model_with_batch_norm():
 530 |     print("Training CNN model with batch normalization")
 531 |     model = models.Sequential()
 532 |     model.add(layers.Input(shape=(28, 28, 1), name='image_bytes'))
 533 |     model.add(layers.Conv2D(32, (3, 3), activation='relu'))
 534 |     model.add(layers.BatchNormalization())
 535 |     model.add(layers.Activation('sigmoid'))
 536 |     model.add(layers.MaxPooling2D((2, 2)))
 537 |     model.add(layers.Conv2D(64, (3, 3), activation='relu'))
 538 |     model.add(layers.BatchNormalization())
 539 |     model.add(layers.Activation('sigmoid'))
 540 |     model.add(layers.MaxPooling2D((2, 2)))
 541 |     model.add(layers.Conv2D(64, (3, 3), activation='relu'))
 542 |     model.add(layers.Flatten())
 543 |     model.add(layers.Dense(64, activation='relu'))
 544 |     model.add(layers.Dense(10, activation='softmax'))
 545 | 
 546 |     model.summary()
 547 | 
 548 |     model.compile(optimizer='adam',
 549 |                   loss='sparse_categorical_crossentropy',
 550 |                   metrics=['accuracy'])
 551 | 
 552 |     return model
 553 | ```
 554 | 
 555 | Let's create one more nn with dropout.
 556 | 
 557 | ```python
 558 | def build_and_compile_cnn_model_with_dropout():
 559 |     print("Training CNN model with dropout")
 560 |     model = models.Sequential()
 561 |     model.add(layers.Input(shape=(28, 28, 1), name='image_bytes'))
 562 |     model.add(layers.Conv2D(32, (3, 3), activation='relu'))
 563 |     model.add(layers.MaxPooling2D((2, 2)))
 564 |     model.add(layers.Conv2D(64, (3, 3), activation='relu'))
 565 |     model.add(layers.MaxPooling2D((2, 2)))
 566 |     model.add(layers.Dropout(0.5))
 567 |     model.add(layers.Conv2D(64, (3, 3), activation='relu'))
 568 |     model.add(layers.Flatten())
 569 |     model.add(layers.Dense(64, activation='relu'))
 570 |     model.add(layers.Dense(10, activation='softmax'))
 571 | 
 572 |     model.summary()
 573 | 
 574 |     model.compile(optimizer='adam',
 575 |                   loss='sparse_categorical_crossentropy',
 576 |                   metrics=['accuracy'])
 577 | 
 578 |     return model
 579 | ```
 580 | 
 581 | Let's train these models by submitting three different `TFJob`s with arguments `--model_type` and `--saved_model_dir`.
 582 | 
 583 | ```bash
 584 | kubectl apply -f tfjob.yaml
 585 | ```
 586 | 
 587 | Next, evaluate the performance of all the models. The model with the highest accuracy score can be moved to a different folder and then used for serving.
 588 | 
 589 | ```python
 590 | best_model_path = ""
 591 | best_accuracy = 0
 592 | 
 593 | for i in range(3):
 594 | 
 595 |     model_path = "trained_models/saved_model_versions/" + str(i)
 596 |     model = tf.keras.models.load_model(model_path)
 597 | 
 598 |     datasets, info = tfds.load(name='mnist', with_info=True, as_supervised=True)
 599 | 
 600 |     test = datasets['test']
 601 | 
 602 |     ds = mnist_test.map(scale).cache().shuffle(BUFFER_SIZE).batch(64)
 603 | 
 604 |     loss, accuracy = model.evaluate(ds)
 605 | 
 606 |     if accuracy > best_accuracy:
 607 |       best_accuracy = accuracy
 608 |       best_model_path = model_path
 609 | 
 610 | dest = "trained_model/saved_model_versions/3"
 611 | shutil.copytree(best_model_path, dest)
 612 | ```
 613 | 
 614 | You can run the model-selection.py.
 615 | 
 616 | ```yaml
 617 | apiVersion: v1
 618 | kind: Pod
 619 | metadata:
 620 |   name: model-selection
 621 | spec:
 622 |   containers:
 623 |   - name: predict
 624 |     image: kubeflow/distributed-training-strategy:v0.1
 625 |     command: ["python", "/model-selection.py"]
 626 |     volumeMounts:
 627 |     - name: model
 628 |       mountPath: /trained_model
 629 |   volumes:
 630 |   - name: model
 631 |     persistentVolumeClaim:
 632 |       claimName: strategy-volume
 633 | ```
 634 | 
 635 | #TODO
 636 | 
 637 | ## Inference
 638 | 
 639 | I've implemented distributed training and model selection components. Next I'm creating a model serving component. This component takes the trained model from `trained_model/saved_model_versions/3`.
 640 | 
 641 | The inference service should be highly performant and robust. I'm not considering cost at this moment.
 642 | 
 643 | ### Create a single server model inference service
 644 | 
 645 | ```python
 646 | model_path = "trained_models/saved_model_versions/3"
 647 | model = tf.keras.models.load_model(model_path)
 648 | datasets, info = tfds.load(name='mnist', with_info=True, as_supervised=True)
 649 | mnist_test = datasets['test']
 650 | ds = mnist_test.map(scale).cache().shuffle(BUFFER_SIZE).batch(64)
 651 | loss, accuracy = model.predict(ds)
 652 | ```
 653 | 
 654 | I'm using `TFServing` to expose the model as an endpoint service.
 655 | 
 656 | ```bash
 657 | # Base directory holding the numbered model versions; TF Serving loads the highest one (3)
 658 | export MODEL_PATH="$(pwd)/trained_model/saved_model_versions"
 659 | 
 660 | nohup tensorflow_model_server \
 661 |   --port=8500 \
 662 |   --rest_api_port=8501 \
 663 |   --model_name=model \
 664 |   --model_base_path="$MODEL_PATH"
 665 | ```
 666 | 
 667 | _Nohup, short for "no hang-up", is a command in Linux systems that keeps processes running even after exiting the shell or terminal._
 668 | 
 669 | 
 670 | ### Distributed model inference
 671 | 
 672 | The method mentioned above works great if we're only experimenting locally. There are more efficient ways for distributed model serving.
 673 | 
 674 | TensorFlow models contain a signature definition (SignatureDef) that defines the signature of a computation supported in a TensorFlow graph. SignatureDefs aim to provide generic support to identify the inputs and outputs of a function. We can modify this input layer with a preprocessing function so that clients can send base64-encoded images, which is a standard way of sending images through RESTful APIs. To do that, we’ll save the model with new serving signatures. The new signatures use Python functions to handle preprocessing the image from a JPEG to a Tensor. See [this guide](https://cloud.google.com/blog/topics/developers-practitioners/add-preprocessing-functions-tensorflow-models-and-deploy-vertex-ai) for details.
 675 | 
 676 | ```python
 677 | def _preprocess(bytes_inputs):
 678 |     decoded = tf.io.decode_jpeg(bytes_inputs, channels=1)
 679 |     resized = tf.image.resize(decoded, size=(28, 28))
 680 |     return tf.cast(resized, dtype=tf.uint8)
 681 | 
 682 | def _get_serve_image_fn(model):
 683 |     @tf.function(input_signature=[tf.TensorSpec([None], dtype=tf.string, name='image_bytes')])
 684 |     def serve_image_fn(bytes_inputs):
 685 |         decoded_images = tf.map_fn(_preprocess, bytes_inputs, dtype=tf.uint8)
 686 |         return model(decoded_images)
 687 |     return serve_image_fn
 688 | 
 689 | signatures = {
 690 |     "serving_default": _get_serve_image_fn(model).get_concrete_function(
 691 |         tf.TensorSpec(shape=[None], dtype=tf.string, name='image_bytes')
 692 |     )
 693 | }
 694 | 
 695 | tf.saved_model.save(multi_worker_model, model_path, signatures=signatures)
 696 | ```
 697 | 
 698 | Now that we have updated the training script, we should rebuild the image and retrain the model.
 699 | 
 700 | Next, we will use KServe for inference service. [KServe](https://www.kubeflow.org/docs/external-add-ons/kserve/kserve/) enables serverless inferencing on Kubernetes and provides performant, high-abstraction interfaces for common machine learning (ML) frameworks like TensorFlow, PyTorch, etc. [Refer](https://kserve.github.io/website/0.11/modelserving/v1beta1/tensorflow/).
 701 | 
 702 | We create an [InferenceService](https://kserve.github.io/website/0.11/get_started/first_isvc/#run-your-first-inferenceservice) YAML manifest, which specifies the model format `tensorflow` and a `storageUri` that points to a saved TensorFlow model.
 703 | 
 704 | ```yaml
 705 | apiVersion: "serving.kserve.io/v1beta1"  # KServe InferenceService that serves the trained model
 706 | kind: InferenceService
 707 | metadata:
 708 |   name: tf-mnist  # also the model name used in the /v1/models/tf-mnist:predict URL
 709 | spec:
 710 |   predictor:
 711 |     model:
 712 |       modelFormat:
 713 |         name: tensorflow
 714 |       storageUri: "pvc://strategy-volume/saved_model_versions"  # strategy-volume claim, saved_model_versions directory
 715 | ```
 716 | 
 717 | Install KServe.
 718 | 
 719 | ```bash
 720 | curl -s "https://raw.githubusercontent.com/kserve/kserve/release-0.11/hack/quick_install.sh" | bash
 721 | ```
 722 | 
 723 | Next, apply the inference-service.yaml to create the InferenceService. By default, it exposes an HTTP/REST endpoint.
 724 | 
 725 | ```bash
 726 | kubectl apply -f inference-service.yaml
 727 | ```
 728 | 
 729 | Wait for the InferenceService to be in a ready state.
 730 | 
 731 | ```bash
 732 | kubectl get isvc tf-mnist
 733 | ```
 734 | 
 735 | Next, we run the prediction. But first, we need to determine and set the INGRESS_HOST and INGRESS_PORT. An ingress gateway is like an API gateway that load-balances requests. To test it locally we have to do `Port Forward`.
 736 | 
 737 | ```bash
 738 | INGRESS_GATEWAY_SERVICE=$(kubectl get svc --namespace istio-system --selector="app=istio-ingressgateway" --output jsonpath='{.items[0].metadata.name}')
 739 | kubectl port-forward --namespace istio-system svc/${INGRESS_GATEWAY_SERVICE} 8080:80
 740 | ```
 741 | 
 742 | Then do the following in a different terminal window.
 743 | 
 744 | ```bash
 745 | export INGRESS_HOST=localhost
 746 | export INGRESS_PORT=8080
 747 | ```
 748 | 
 749 | We can send a sample request to our inference service using curl.
 750 | 
 751 | ```bash
 752 | MODEL_NAME=tf-mnist
 753 | INPUT_PATH=@./mnist-input.json
 754 | SERVICE_HOSTNAME=$(kubectl get inferenceservice $MODEL_NAME -n kubeflow -o jsonpath='{.status.url}' | cut -d "/" -f 3)
 755 | curl -v -H "Host: ${SERVICE_HOSTNAME}" http://${INGRESS_HOST}:${INGRESS_PORT}/v1/models/$MODEL_NAME:predict -d $INPUT_PATH
 756 | ```
 757 | 
 758 | or we use the requests library.
 759 | 
 760 | ```python
 761 | input_path = "mnist-input.json"  # request payload file
 762 | 
 763 | with open(input_path) as json_file:
 764 |     data = json.load(json_file)
 765 | 
 766 | response = requests.post(
 767 |     url="http://localhost:8080/v1/models/tf-mnist:predict",  # port-forwarded ingress gateway
 768 |     data=json.dumps(data),
 769 |     headers={"Host": "tf-mnist.kubeflow.example.com"},  # service hostname, sent as Host like the curl example
 770 | )
 771 | print(response.text)
 772 | ```
 773 | 
 774 | #TODO
 775 | 
 776 | Our inference service is working as expected.
 777 | 
 778 | 
 779 | ## Replicated model servers inference
 780 | 
 781 | Next, I want to have multiple model servers to handle large amounts of traffic. KServe can autoscale based on the requests. The autoscaler can scale down to zero if the application is receiving no traffic, or we can specify a minimum number of replicas that must always be running. The `autoscaling.knative.dev/target` annotation sets a soft limit. Other settings such as `minReplicas`, `containerConcurrency`, and `scaleMetric` can also be configured.
 782 | 
 783 | ```yaml
 784 | apiVersion: "serving.kserve.io/v1beta1"  # same InferenceService, now with an autoscaling target
 785 | kind: InferenceService
 786 | metadata:
 787 |   name: tf-mnist
 788 |   annotations:
 789 |     autoscaling.knative.dev/target: "1"  # soft scaling target of 1; quoted because annotation values are strings
 790 | spec:
 791 |   predictor:
 792 |     model:
 793 |       modelFormat:
 794 |         name: tensorflow
 795 |       storageUri: "pvc://strategy-volume/saved_model_versions"
 796 | ```
 797 | 
 798 | Next, I install [Hey](https://github.com/rakyll/hey), a tiny program that sends some load to a web application. Hey sends the given number of requests at the given concurrency level and prints stats.
 799 | 
 800 | ```bash
 801 | # https://github.com/rakyll/hey
 802 | brew install hey
 803 | kubectl apply -f inference-service.yaml
 804 | 
 805 | hey -z 30s -c 5 -m POST -host ${SERVICE_HOSTNAME} -D mnist-input.json "http://${INGRESS_HOST}:${INGRESS_PORT}/v1/models/tf-mnist:predict"
 806 | ```
 807 | 
 808 | #TODO
 809 | 
 810 | I'm sending traffic for 30 seconds with 5 concurrent requests. As the scaling target is set to 1 and we load the service with 5 concurrent requests, the autoscaler tries scaling up to 5 pods. There will be a cold start time initially to spawn pods. It may take longer (to pull the Docker image) if it is not cached on the node.
 811 | 
 812 | 
 813 | ## End-to-end Workflow
 814 | 
 815 | It's time to connect all the parts. I'm using Argo Workflows to orchestrate the jobs we executed before in an end-to-end fashion. We can build a CI/CD workflow as a DAG (very similar to GitLab CI/CD) on Kubernetes. Argo is the de facto engine for orchestration on Kubernetes.
 816 | 
 817 | We will start by installing argo workflows in a different namespace.
 818 | 
 819 | ```bash
 820 | kubectl create namespace argo
 821 | kubectl apply -n argo -f https://github.com/argoproj/argo-workflows/releases/download/v3.4.11/install.yaml
 822 | ```
 823 | 
 824 | I'm creating an end-to-end workflow with 4 steps:
 825 | 1. Data Ingestion
 826 | 2. Distributed Training
 827 | 3. Model Selection
 828 | 4. Model Serving
 829 | 
 830 | ```yaml
 831 | apiVersion: argoproj.io/v1alpha1
 832 | kind: Workflow                  # new type of k8s spec
 833 | metadata:
 834 |   generateName: tfjob-wf-    # prefix for the generated workflow name
 835 | spec:
 836 |   entrypoint: tfjob-wf          # invoke the tfjob template
 837 |   templates:
 838 |   - name: tfjob-wf
 839 |     steps:
 840 |     - - name: data-ingestion-step
 841 |         template: data-ingestion-step
 842 |     - - name: distributed-tf-training-steps
 843 |         template: distributed-tf-training-steps
 844 |     - - name: model-selection-step
 845 |         template: model-selection-step
 846 |     - - name: create-model-serving-service
 847 |         template: create-model-serving-service
 848 |   podGC:                        # workflow-level settings belong under spec
 849 |     strategy: OnPodSuccess
 850 |   volumes:
 851 |   - name: model
 852 |     persistentVolumeClaim:
 853 |       claimName: strategy-volume
 854 | ```
 855 | 
 856 | This is a multi-step workflow where all the steps are executed sequentially (each step is introduced by a double dash, `- -`). `podGC` describes how completed pods are deleted; deleting them frees up resources. I'm also using persistent storage to store the dataset and the trained models.
 857 | 
 858 | The first step is data ingestion. We have added a `memoize` spec to cache the output of this step. Memoization reduces cost and execution time. Since we do not want to download the data every time, we can cache it using a ConfigMap. We have to specify the `key` and the name of the ConfigMap used for the cache. I have also set `maxAge` to `1h`, which defines how long the cache should be considered valid.
 859 | 
 860 | ```yaml
 861 | - name: data-ingestion-step
 862 |   serviceAccountName: argo
 863 |   memoize:
 864 |     key: "data-ingestion-cache"  # cache key identifying this step's memoized result
 865 |     maxAge: "1h"                 # how long a cached result is considered valid
 866 |     cache:
 867 |       configMap:
 868 |         name: data-ingestion-config  # ConfigMap that stores the cache entries
 869 |   container:
 870 |     image: kubeflow/distributed-training-strategy:v0.1
 871 |     imagePullPolicy: IfNotPresent
 872 |     command: ["python", "/data-ingestion.py"]
 873 | ```
 874 | 
 875 | Next, we execute the model training steps in parallel.
 876 | 
 877 | 
 878 | ```yaml
 879 | - name: distributed-tf-training-steps  # must match the template name referenced by the tfjob-wf entrypoint
 880 |   steps:
 881 |   - - name: cnn-model  # all three entries share one outer list item, so they run in parallel
 882 |       template: cnn-model
 883 |     - name: cnn-model-with-dropout
 884 |       template: cnn-model-with-dropout
 885 |     - name: cnn-model-with-batch-norm
 886 |       template: cnn-model-with-batch-norm
 887 | ```
 888 | 
 889 | Next, we create a step to run distributed training with the CNN model. To create the TFJob, we include the manifest we created before. We also add the `successCondition` and `failureCondition` to tell Argo when the job has succeeded or failed. Here we are storing the trained model in a different folder. We create similar steps for the other two models.
 890 | 
 891 | 
 892 | ```yaml
 893 | - name: cnn-model
 894 |   serviceAccountName: training-operator
 895 |   resource:
 896 |     action: create
 897 |     setOwnerReference: true
 898 |     successCondition: status.replicaStatuses.Worker.succeeded = 2  # both workers (replicas: 2) finished
 899 |     failureCondition: status.replicaStatuses.Worker.failed > 0
 900 |     manifest: |
 901 |       apiVersion: kubeflow.org/v1
 902 |       kind: TFJob
 903 |       metadata:
 904 |         generateName: training-
 905 |       spec:
 906 |         runPolicy:
 907 |           cleanPodPolicy: None
 908 |         tfReplicaSpecs:
 909 |           Worker:
 910 |             replicas: 2
 911 |             restartPolicy: Never
 912 |             template:
 913 |               spec:
 914 |                 containers:
 915 |                   - name: tensorflow
 916 |                     image: kubeflow/distributed-training-strategy:v0.1
 917 |                     imagePullPolicy: IfNotPresent
 918 |                     command: ["python", "/distributed-training.py", "--saved_model_dir", "/trained_model/saved_model_versions/1/", "--checkpoint_dir", "/trained_model/checkpoint", "--model_type", "cnn"]
 919 |                     volumeMounts:
 920 |                       - mountPath: /trained_model
 921 |                         name: training
 922 |                     resources:
 923 |                       limits:
 924 |                         cpu: 500m
 925 |                 volumes:
 926 |                   - name: training
 927 |                     persistentVolumeClaim:
 928 |                       claimName: strategy-volume
 929 | ```
 930 | 
 931 | Next, we add the model selection step. It is similar to `model-selection.yaml` we created earlier.
 932 | 
 933 | ```yaml
 934 | - name: model-selection-step  # container template that runs the model-selection script
 935 |   serviceAccountName: argo
 936 |   container:
 937 |     image: kubeflow/distributed-training-strategy:v0.1
 938 |     imagePullPolicy: IfNotPresent
 939 |     command: ["python", "/model-selection.py"]
 940 |     volumeMounts:
 941 |     - name: model  # refers to the `model` volume declared in the workflow manifest
 942 |       mountPath: /trained_model
 943 | ```
 944 | 
 945 | The last step of the workflow is the model serving.
 946 | 
 947 | ```yaml
 948 | - name: create-model-serving-service
 949 |   serviceAccountName: training-operator
 950 |   resource:
 951 |     action: create
 952 |     setOwnerReference: true
 953 |     successCondition: status.modelStatus.transitionStatus = UpToDate  # condition is evaluated on the created resource
 954 |     manifest: |
 955 |       apiVersion: "serving.kserve.io/v1beta1"
 956 |       kind: InferenceService
 957 |       metadata:
 958 |         name: tf-mnist
 959 |       spec:
 960 |         predictor:
 961 |           model:
 962 |             modelFormat:
 963 |               name: tensorflow
 964 |             storageUri: "pvc://strategy-volume/saved_model_versions"
 965 | ```
 966 | 
 967 | Next, run the workflow.
 968 | 
 969 | ```bash
 970 | kubectl create -f workflow.yaml
 971 | ```
 972 | 
 973 | ## Logger
 974 | 
 975 | Logging is an essential component of the machine learning system. It helps debug issues, analyze performance, troubleshoot errors, gather insights, and implement a feedback loop. Fortunately, KServe makes it easy to create a service called message-dumper. It logs the request and the response. It has a unique identifier for the request and the response.
 976 | 
 977 | ```yaml
 978 | apiVersion: serving.knative.dev/v1  # Knative Service used as the sink for inference logs
 979 | kind: Service
 980 | metadata:
 981 |   name: message-dumper
 982 | spec:
 983 |   template:
 984 |     spec:
 985 |       containers:
 986 |       - image: gcr.io/knative-releases/knative.dev/eventing-contrib/cmd/event_display  # no tag is pinned
 987 | ```
 988 | 
 989 | ```bash
 990 | kubectl apply -f message-dumper.yaml
 991 | ```
 992 | 
 993 | Next, we include the logger which points to the message dumper url in the InferenceService predictor.
 994 | 
 995 | ```yaml
 996 | logger:  # goes under spec.predictor of the InferenceService
 997 |   mode: all  # log both the request and the response
 998 |   url: http://message-dumper.default/  # the message-dumper service
 999 | ```
1000 | 
1001 | You can read about the inference logger [here](https://kserve.github.io/website/0.8/modelserving/logger/logger/#create-an-inferenceservice-with-logger).
1002 | 
1003 | 
1004 | ## Summary
1005 | 
1006 | 1. A distributed machine learning system is designed to train machine learning models on large datasets that cannot be processed on a single machine. There is a need to distribute the computation or training process to train complex models with millions or rather billions of parameters.
1007 | 2. Kubernetes is a popular choice for building such complex distributed systems. We can build scalable and highly available systems using K8s.
1008 | 3. Tensorflow provides a number of strategies for distributed training. We have used `MultiWorkerMirroredStrategy` here.
1009 | 4. We have used KServe for building an Inference Service which can be autoscaled based on the traffic.
1010 | 5. Argo Workflows is helpful for building CI/CD pipelines on Kubernetes.
1011 | 
1012 | 
1013 | ## Next Steps
1014 | 
1015 | - ~add a new dataset cifar10~
1016 | - update tf code to pytorch
1017 | - add mlflow to track experiments
1018 | - monitoring the metrics with prometheus and grafana
1019 | - deploy the app to aws eks
1020 | - add ci using github actions
1021 | - add gitops for cd
1022 | 
1023 | 
1024 | ## References
1025 | 
1026 | [1] [Distributed Machine Learning Patterns](https://www.manning.com/books/distributed-machine-learning-patterns?utm_source=terrytangyuan&utm_medium=affiliate&utm_campaign=book_tang_distributed_6_10_21&a_aid=terrytangyuan&a_bid=9b134929)
1027 | 
1028 | [2] [Machine Learning with PyTorch and Scikit-Learn](https://sebastianraschka.com/blog/2022/ml-pytorch-book.html)
1029 | 


--------------------------------------------------------------------------------
/assets/banner.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aniket-mish/distributed-ml-system/a0e791d344aa3a42c16dd84e591789a295457f59/assets/banner.jpg


--------------------------------------------------------------------------------
/assets/cifar10-dataset.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aniket-mish/distributed-ml-system/a0e791d344aa3a42c16dd84e591789a295457f59/assets/cifar10-dataset.png


--------------------------------------------------------------------------------
/assets/docker-build.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aniket-mish/distributed-ml-system/a0e791d344aa3a42c16dd84e591789a295457f59/assets/docker-build.png


--------------------------------------------------------------------------------
/assets/k3d-clsuter-info.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aniket-mish/distributed-ml-system/a0e791d344aa3a42c16dd84e591789a295457f59/assets/k3d-clsuter-info.png


--------------------------------------------------------------------------------
/assets/k3d-image-import.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aniket-mish/distributed-ml-system/a0e791d344aa3a42c16dd84e591789a295457f59/assets/k3d-image-import.png


--------------------------------------------------------------------------------
/assets/kubectl-get-nodes.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aniket-mish/distributed-ml-system/a0e791d344aa3a42c16dd84e591789a295457f59/assets/kubectl-get-nodes.png


--------------------------------------------------------------------------------
/assets/torch-version.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aniket-mish/distributed-ml-system/a0e791d344aa3a42c16dd84e591789a295457f59/assets/torch-version.png


--------------------------------------------------------------------------------
/data_ingestion.py:
--------------------------------------------------------------------------------
 1 | import tensorflow_datasets as tfds
 2 | import tensorflow as tf
 3 | 
 4 | 
 5 | def get_dataset():
 6 |     """
 7 |     Download the dataset
 8 |     """
 9 |     BUFFER_SIZE = 10000
10 | 
11 |     # Scale the MNIST data from [0, 255] range to [0, 1] range
12 |     def scale(image, label):
13 |         image = tf.cast(image, tf.float32)
14 |         image /= 255
15 |         return image, label
16 | 
17 |     # Download the fashion mnist dataset
18 |     datasets, info = tfds.load(name="fashion_mnist", with_info=True, as_supervised=True)
19 |     train = datasets["train"]
20 | 
21 |     return train.map(scale).cache().shuffle(BUFFER_SIZE)
22 | 


--------------------------------------------------------------------------------
/distributed-training.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import json
  3 | import os
  4 | import tensorflow as tf
  5 | 
  6 | from data_ingestion import get_dataset
  7 | 
  8 | 
  9 | # Learning rate decay
 10 | def decay(epoch):
 11 |     if epoch < 3:
 12 |         return 1e-3
 13 |     elif epoch >= 3 and epoch < 7:
 14 |         return 1e-4
 15 |     else:
 16 |         return 1e-5
 17 | 
 18 | 
 19 | def build_and_compile_cnn_model():
 20 |     """
 21 |     Build and compile a simple cnn model
 22 |     """
 23 |     print("Training a simple cnn model")
 24 |     model = tf.keras.models.Sequential()
 25 |     model.add(tf.keras.layers.Input(shape=(28, 28, 1), name="image_bytes"))
 26 |     model.add(tf.keras.layers.Conv2D(32, (3, 3), activation="relu"))
 27 |     model.add(tf.keras.layers.MaxPooling2D((2, 2)))
 28 |     model.add(tf.keras.layers.Flatten())
 29 |     model.add(tf.keras.layers.Dense(64, activation="relu"))
 30 |     model.add(tf.keras.layers.Dense(10, activation="softmax"))
 31 | 
 32 |     model.summary()
 33 | 
 34 |     model.compile(
 35 |         optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
 36 |     )
 37 | 
 38 |     return model
 39 | 
 40 | 
 41 | def build_and_compile_cnn_model_with_batch_norm():
 42 |     """
 43 |     Build and compile a cnn model with batch normalization
 44 |     """
 45 |     print("Training a cnn model with batch normalization")
 46 |     model = tf.keras.models.Sequential()
 47 |     model.add(tf.keras.layers.Input(shape=(28, 28, 1), name="image_bytes"))
 48 |     model.add(tf.keras.layers.Conv2D(32, (3, 3), activation="relu"))
 49 |     model.add(tf.keras.layers.BatchNormalization())
 50 |     model.add(tf.keras.layers.Activation("sigmoid"))
 51 |     model.add(tf.keras.layers.MaxPooling2D((2, 2)))
 52 |     model.add(tf.keras.layers.Conv2D(64, (3, 3), activation="relu"))
 53 |     model.add(tf.keras.layers.BatchNormalization())
 54 |     model.add(tf.keras.layers.Activation("sigmoid"))
 55 |     model.add(tf.keras.layers.MaxPooling2D((2, 2)))
 56 |     model.add(tf.keras.layers.Conv2D(64, (3, 3), activation="relu"))
 57 |     model.add(tf.keras.layers.Flatten())
 58 |     model.add(tf.keras.layers.Dense(64, activation="relu"))
 59 |     model.add(tf.keras.layers.Dense(10, activation="softmax"))
 60 | 
 61 |     model.summary()
 62 | 
 63 |     model.compile(
 64 |         optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
 65 |     )
 66 | 
 67 |     return model
 68 | 
 69 | 
 70 | def build_and_compile_cnn_model_with_dropout():
 71 |     """
 72 |     Build and compile a cnn model with dropout
 73 |     """
 74 |     print("Training a cnn model with dropout")
 75 |     model = tf.keras.models.Sequential()
 76 |     model.add(tf.keras.layers.Input(shape=(28, 28, 1), name="image_bytes"))
 77 |     model.add(tf.keras.layers.Conv2D(32, (3, 3), activation="relu"))
 78 |     model.add(tf.keras.layers.MaxPooling2D((2, 2)))
 79 |     model.add(tf.keras.layers.Conv2D(64, (3, 3), activation="relu"))
 80 |     model.add(tf.keras.layers.MaxPooling2D((2, 2)))
 81 |     model.add(tf.keras.layers.Dropout(0.5))
 82 |     model.add(tf.keras.layers.Conv2D(64, (3, 3), activation="relu"))
 83 |     model.add(tf.keras.layers.Flatten())
 84 |     model.add(tf.keras.layers.Dense(64, activation="relu"))
 85 |     model.add(tf.keras.layers.Dense(10, activation="softmax"))
 86 | 
 87 |     model.summary()
 88 | 
 89 |     model.compile(
 90 |         optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
 91 |     )
 92 | 
 93 |     return model
 94 | 
 95 | 
 96 | def main(args):
 97 | 
 98 |     strategy = tf.distribute.MultiWorkerMirroredStrategy()
 99 |     BATCH_SIZE_PER_REPLICA = 64
100 |     BATCH_SIZE = BATCH_SIZE_PER_REPLICA * strategy.num_replicas_in_sync
101 | 
102 |     with strategy.scope():
103 |         dataset = get_dataset().batch(BATCH_SIZE).repeat()
104 | 
105 |         options = tf.data.Options()
106 |         options.experimental_distribute.auto_shard_policy = (
107 |             tf.data.experimental.AutoShardPolicy.DATA
108 |         )
109 |         dataset = dataset.with_options(options)
110 | 
111 |         model_type = args.model_type
112 | 
113 |         if model_type == "cnn":
114 |             multi_worker_model = build_and_compile_cnn_model()
115 |         elif model_type == "cnn_batchnorm":
116 |             multi_worker_model = build_and_compile_cnn_model_with_batch_norm()
117 |         elif model_type == "cnn_dropout":
118 |             multi_worker_model = build_and_compile_cnn_model_with_dropout()
119 |         else:
120 |             Exception(f"Entered {model_type} is not supported")
121 | 
122 |     def _preprocess(bytes_inputs):
123 |         decoded = tf.io.decode_jpeg(bytes_inputs, channels=1)
124 |         resized = tf.image.resize(decoded, size=(28, 28))
125 |         return tf.cast(resized, dtype=tf.uint8)
126 | 
127 |     def _get_serve_image_fn(model):
128 |         @tf.function(
129 |             input_signature=[tf.TensorSpec([None], dtype=tf.string, name="image_bytes")]
130 |         )
131 |         def serve_image_fn(bytes_inputs):
132 |             decoded_images = tf.map_fn(_preprocess, bytes_inputs, dtype=tf.uint8)
133 |             return model(decoded_images)
134 | 
135 |         return serve_image_fn
136 | 
137 |     # Define the checkpoint directory to store the checkpoints
138 |     checkpoint_dir = args.checkpoint_dir
139 | 
140 |     # Name of the checkpoint files
141 |     checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")
142 | 
143 |     class PrintLR(tf.keras.callbacks.Callback):
144 |         def on_epoch_end(self, epoch, logs=None):
145 |             print(
146 |                 "\nLearning rate for epoch {} is {}".format(
147 |                     epoch + 1, multi_worker_model.optimizer.lr.numpy()
148 |                 )
149 |             )
150 | 
151 |     callbacks = [
152 |         tf.keras.callbacks.TensorBoard(log_dir="./logs"),
153 |         tf.keras.callbacks.ModelCheckpoint(
154 |             filepath=checkpoint_prefix, save_weights_only=True
155 |         ),
156 |         tf.keras.callbacks.LearningRateScheduler(decay),
157 |         PrintLR(),
158 |     ]
159 | 
160 |     multi_worker_model.fit(dataset, epochs=1, steps_per_epoch=70, callbacks=callbacks)
161 | 
162 |     def _is_chief():
163 |         return TASK_INDEX == 0
164 | 
165 |     if _is_chief():
166 |         model_path = args.saved_model_dir
167 | 
168 |     else:
169 |         # Save to a path that is unique across workers.
170 |         model_path = args.saved_model_dir + "/worker_tmp_" + str(TASK_INDEX)
171 | 
172 |     multi_worker_model.save(model_path)
173 | 
174 |     signatures = {
175 |         "serving_default": _get_serve_image_fn(
176 |             multi_worker_model
177 |         ).get_concrete_function(
178 |             tf.TensorSpec(shape=[None], dtype=tf.string, name="image_bytes")
179 |         )
180 |     }
181 | 
182 |     tf.saved_model.save(multi_worker_model, model_path, signatures=signatures)
183 | 
184 | 
185 | if __name__ == "__main__":
186 |     tf_config = json.loads(os.environ.get("TF_CONFIG") or "{}")
187 |     TASK_INDEX = tf_config["task"]["index"]
188 | 
189 |     parser = argparse.ArgumentParser()
190 |     parser.add_argument(
191 |         "--saved_model_dir", type=str, required=True, help="Tensorflow export directory"
192 |     )
193 | 
194 |     parser.add_argument(
195 |         "--checkpoint_dir",
196 |         type=str,
197 |         required=True,
198 |         help="Tensorflow checkpoint directory",
199 |     )
200 | 
201 |     parser.add_argument("--model_type", type=str, required=True, help="Model type")
202 | 
203 |     parsed_args = parser.parse_args()
204 |     main(parsed_args)
205 | 


--------------------------------------------------------------------------------
/inference-request.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | import json
 3 | 
 4 | input_path = "fmnist-input.json"
 5 | 
 6 | with open(input_path) as json_file:
 7 |     print(f"Loading data from {input_path}")
 8 |     data = json.load(json_file)
 9 | 
10 | response = requests.post(
11 |     url="http://localhost:8080/v1/models/tf-fmnist:predict",
12 |     data=json.dumps(data),
13 |     headers={"Host": "tf-fmnist.kubeflow.example.com"},
14 | )
15 | print(response.text)
16 | 


--------------------------------------------------------------------------------
/inference-service.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: "serving.kserve.io/v1beta1"  # KServe InferenceService with autoscaling target and request logging
 2 | kind: InferenceService
 3 | metadata:
 4 |   name: tf-mnist
 5 |   annotations:
 6 |     autoscaling.knative.dev/target: "1"  # soft scaling target of 1; quoted because annotation values are strings
 7 | spec:
 8 |   predictor:
 9 |     logger:
10 |       mode: all  # log both requests and responses
11 |       url: http://message-dumper.default/  # the message-dumper service from message-dumper.yaml
12 |     model:
13 |       modelFormat:
14 |         name: tensorflow
15 |       image: "emacski/tensorflow-serving:2.6.0"  # explicit serving image; NOTE(review): reason for this image is not stated here
16 |       storageUri: "pvc://strategy-volume/saved_model_versions"  # strategy-volume claim, saved_model_versions directory
17 | 


--------------------------------------------------------------------------------
/message-dumper.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: serving.knative.dev/v1  # Knative Service that inference-service.yaml's logger posts to
 2 | kind: Service
 3 | metadata:
 4 |   name: message-dumper
 5 | spec:
 6 |   template:
 7 |     spec:
 8 |       containers:
 9 |       - image: gcr.io/knative-releases/knative.dev/eventing-contrib/cmd/event_display  # no tag is pinned
10 | 


--------------------------------------------------------------------------------
/mnist-input.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "instances": [
 3 |     {
 4 |       "image_bytes": {
 5 |         "b64": "/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBwcJCQgKDBQNDAsLDBkSEw8UHRofHh0aHBwgJC4nICIsIxwcKDcpLDAxNDQ0Hyc5PTgyPC4zNDL/2wBDAQkJCQwLDBgNDRgyIRwhMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjL/wAARCAErASsDASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/8QAHwEAAwEBAQEBAQEBAQAAAAAAAAECAwQFBgcICQoL/8QAtREAAgECBAQDBAcFBAQAAQJ3AAECAxEEBSExBhJBUQdhcRMiMoEIFEKRobHBCSMzUvAVYnLRChYkNOEl8RcYGRomJygpKjU2Nzg5OkNERUZHSElKU1RVVldYWVpjZGVmZ2hpanN0dXZ3eHl6goOEhYaHiImKkpOUlZaXmJmaoqOkpaanqKmqsrO0tba3uLm6wsPExcbHyMnK0tPU1dbX2Nna4uPk5ebn6Onq8vP09fb3+Pn6/9oADAMBAAIRAxEAPwC9A42ir9vA0nOOKxYJhkDqe1bNvO0ZAYdaIsC8LLjOwH60yWDAwY1/75qzDcDAz0qfhl55BqxGE1pCzZwVPt0qJ7MgZQbh7da1Z7bncBVQgoaVhlGFvKlBIwOhqxPFxkdKmdY5xiQYP94daaqtGPKkO5P4X/pU2AoKMMQatWv+tAPXpTJ4ipyBTVYqwYHBFTezA1ivHNRsuRU1tOlymOBIOo9aVoyGNaCIEHanEEEMKXbg07BAx2NICXO5KrvwcVPEcDFRyD5qTYDYhuPuKnA4waitxmQj1FWGX9Ka2ArODzUXU5qxIM81AODzUtjGzHMfvVRcl6mmOMio4V3PSAtwjBUd60l+6DVCMAzH2q6D8v0qo7CIJ3xmsqQ8kmtC5YAVmyctntSbGRkDOT0qWMFyABUWNzD0q5EuxM9zQgJQAqgCkJxS9vemMasA3c8CpFPHNRBgBkinBvSpuBMGxRnPWo1561IOlMBQMEU2R8DFKW2rk1XdsmgCN+TmqskuHIqeUhVNZMkoZyckZqQILTi5UntzWtHMOVbpWQh2zCr6jIBpRGzUjl2jBPHY1chuSODyKx4pOzdKnVyh68VYjbDBlyvSq88G4bhVeG4Kkc8HrV3eGUEVQjLkUr+FRmQgYzV+aMODxzWdIpU0mMerh1wahdCpPvTN21gQamB3jB+qn1rOQDI5GjcMvBFbdvMt1FkfeHWsJhzU1pcG3nDZ4PWlGVgNd4+MigL8uKscMgdeVNRsAORVsRGFwc1G45qfKg/MM/U0jLG3RQPxNS2BCh2OG9DVxwM57GqxRQc8j9asp80I5zjiiIyu64zVdhxVtwMVVak2BUlOTUlumATTXXmpPux0r6AS2vLv7GrLNtFVbM/K596knbgGqT0AqXLZeqbgsRU8x96hJzgCk2A+JPmA61PA4mUSL9wk7fcetULtmEMdvGSJrltgI6hQMsfwH6kVqRIsUaqgAVQAAOwFUgEJ7UwDOc1Ky55/OmtgcCi4EZ6UqqSc0Hk4p46igB44pQaaM5NI7hVx3qkA2V8nHaoAdzE9hTZHOMd6ZczfZoQq/fNDArahcgAxLyf4iKzs0OxJ5696ZUDQP97NaVsdyg+1IPszHlFzU8SRg4jGB6VSQh3linp02mpQm5enNJs
9aoBoynfirMFwVOD0qADjDUn3W9qANIsGGQeKqXCK3PekjlIOCeKfJyN1AGXIMZFNik6xscc5U+hqxMgbPrVFwVas2BezvXOMOPvCo2GD7UyOXOG/iHX3p8hGzdn6Vm0M0rG8ZLYxtzz8pp0lyx/iNZUMpzzVkturURKZGP8AEfzpRMw6Nmq5HvTMspz1pAaUVzzhjiptxjPmRnjuKyBNzzxU8NwUbDcqaXoBreYJU3L+VVn5zTEcRvkHKNUjcE4qZdwITyabK3yGpG4GaqzN+7qG9ALNicwn/eNE75UgU2zb/RQfc0krY4rS+gFZgcc0iKM+9Kc81FcI727QxnDyjbu9AepoWrAZpv8Apt7NqB5jA8mAeig/MfxOPyFa4HFQ20KW8KQxrhEUKB7VYXFWAvlkhSDx1PvUchwSAKlD7Uyep6CoS3UnrU9QGHg0DJ5xSb8mjdjvVAOZ9oqs75JOaJX3Hg1GBmmAKRuLt0Xms24lMshbt2qxezgDyEPuxqkxpNjImo4pGOOarmbk0gJvMINWIbp42BB6VBLC0Z9qjVsGjYDqrWVJ4w6n6j0qcxbh71ztndtbyBlPHcetdLayx3Me+Ns+o7irTuJkDRDvwaYVIODWg0IYc9agkgBGDTEUyCv0pwc4wac8ZTg9KjYFRSAil6ZxVOYZFXGPBBqpKKljIFJB61KzFlqJhTkbPBrO+oxysR0qwrkjk1Wxg1IoPBJ4PpSc7BYkOfU0zzHXvke9WNuFBHQ1A/BrKVRoaQm/dweDShyOOtRZB+tAPHNSq6HYv203/LNuh6VeVvkweq1ioxB+lacUm5Nw7jBrVTUoktEsp+SqNw2F4q1I3yCqM5yQKwchpF62Yi0/GkcHgmkh4gAoOSK1UxEfVuKdbKZH8zseF+nrTGO0qo6scVciXgYrWG1xDxwcUm/GQKc3FVS3J5qmwHmUkknoKYXJGaZuBzTd1JMB4PBzxUUkhPApWcnpTFUk1SAdGrOcVW1S/TTbbIwZW4Rff1rQXbEmSefSqC6bHPdNd3Q86U/dDfdQegFUBkWYurnlI2YseWPetSPSZW5llC+yjJrUVABjt6CnHihJICkulWqcsGf/AHjUwt4QMCGMD/cFSM4HWm+YKq4HO/aAww3NRuqtypxTNpFIMisrjFDFDg1ctbySCQSRsQf5/Wqm4MOaT6GlewHY2OrQXWFciOT0J4NaDRq68/nXBLIRwa0bTVLi2wBIWX0bmqU+4rHSSQcFW5HY1TkiKZBGRUtrq8NyNr8N6VaYJIvysCPar0YjGdMDOeKrSL6ng1pXFuUJxyDVCSMgH0qGMqMvao84NTOp61XYkHmspDRYXBxnpSFjG2DyKbEdwK96eR5ilT95elZSKRYglBHlseD0NJKpU4NUlk2nB6VcjlEq7HPzDofWsJTvox2KzcZojbccGnSrgkVCpIkFcdaorblJFg8VctHzlapBs1Ztf9bTw+K01CUS07dvaqMh3TqKulSSTjPHWqCgGdnzkKOtYLGxbDkZoxnEYqR4j5e4HPqKrwncAO2M1X1jVRp+lXFxn7inHu3QCu6E3JKzIaC1lF3qk+05jtgIh/vnlj+AwPzrXUYArnvB0LLoUMshJknZpXJ7kmuhLAZxXcpIgimYjPNVd3HWpJnLHFQgcYp81xjht7U4AGkGFpDPt4QZNaIQ9YiRnoPemng4Tk+tIFdzmRjj0p5dEHFWmAqRfxOeakyAOwquZ93Sk5PJouBM0uKjMrHpUeRn5qQydhRcB3Pc0vHrUYJanbT6UwMX6ikKelafA/gX8qRljPWJfwzScAMsxjNJtx9K0Ht42HykqffkVA8DpzjI9RzUOLQ7lXbijDdRU2zIppX3qRipJ0z1rQttQkj4Ylh655FZjLzQMryDRewHUwXyTphiDnof8aZcQYO5eRXPw3BQ5U4Na1tf5Xa3I9KtSuTYgkjK5x0qpIpwM9K2HRJlLIefSqEsRGR+lRJAUVcxuCO1WpDlROn
4iqknTp0p1tPtYo3KNXJOdtGWlcllUMBIvRuv1oGVOD1FOWIiRIz91zgH0qS6VoNwaMgKeJQMgfWvExGK960TeMRpPmqMnD4796JIQsf7s5lGAQT61YeSOS8SFwFkH3WHRuKq6fFJdajMZ3aMRtgqByT269q4nUnJXb2KskMjUiIS7sl32hfTtWhaArIQ42kdaaY/7PvzG6+dFMSWfHC91+nvS2e+Sab7QORNsUZwG7jn0xmpc3a6FYnDpC4lDEl/4G6DtVa5UqRDEADKTj2Hqas3Nkz6hGZGxCFJLL/FzwKluIykkQgQtC53N3Kn0+maxUrNMvRoiEZWPapAJ71keItPkvdPYiNnjg+cQjOZT/8AW61uOY1cGUcryFzUgkZ0JVsDtXfDGWmm9kZuGhX0qJLXT4IACuyMfKe1WZJQFOKiKlSCWycfnUUjjua9ali41PhZk42EbLHNISFFM83nA5pVXPLV3Ql2IaDBf2FOykQycUySUIMDrVZi0h5NbJiJnuGc4WkVWP3qYo29OtSKrt34q0wHhgo4GaCx/CnLCw561OkJP8NWIrBd/apVtj3qysI74FSZjTimBElsO1S+QKQzf3cCk3v60AUiqEcVG0R7DNPIZT92kLY5yRWgFd0I7UwMyGrLHPYGoXQHkVLAYYUlGUwj/oarMhRyrqQam3FDU6yRzoEl7dG7ioaTAzyuR0phQ+tXJreSLn7yHow6GoNhas2h3K5UA5zSrIUI5qQx5qIoBwRUMZehvMEZbB7GrS3Mdy/lMQk3YH+L6HvWJnHHpSs0c0ZinBKeoOCp9Qe1Tz2CxcvY2jY8VXgYMrhhhf73pTY728geO3ukW9tXztuPusvfDe9aECQmMqE3I/OD1FeVmFaKVlua049yNLooVjLD5MMcjORmrZN3LfmHzVjg2bn3LkMvp9TVeS6htbNXSNTk5RiPSrdhdx3ds0lztIcHJHHHt714cr/FY2JTbwGzDJ80kBym48/5xTZHzLFLLEygryw43+lFvJa/YZF52AZEh5IIpkN6k2yCcOcHdtIxzUaktl10gup02zMqlcFRyM0omRElhk25QYGBzu9qqLav9rl+ySII+q7mzz6UxHiXzYbkL9oHzGQHnd2x7VLj0Fc0Fml+y5kjbY2OT/CakaRSEMY/eE7do71krqBkgWNllCzMOdp/P0q35aQPEYHd9wKlSMnjvUuFtykyeW1ju51WQsjIm38e1V5LWRSiGcDy+Hx/ETzmpxK0kgl2ERYwzHru+lJe2pn8trc4kzyC3BHrSTa0ZRG7IkQIbcvTJ65qsUeZ2H3VXqT6065kkSfyIrZiQMFmHHHU06TKWyq7ZbGSfU16GB91uUnotTOYxIVU/eyaSRivFAxgHOaGw3yn8K9+hVhWjeBi01uQFWY05YSe9L5wXjbUiTqeq4+ldcSWSRwkdRmp1RVGW4qITf3TxSg7jljWqJJRJGvTrSGZj0OBTfK3Hini3brV2Ab5jNxTgCetSCE9+DR5ZxinYBAqjmjzVpPKOOtHlU7AVRLwOhoyHFZyuw6VMsz9aq4FghajYYHBpPNzzRvDD0pMCB8HtioiQOhqV1Peq0kZBz2qWBbgu2i44ZT1U9DVwWttdjdAxjfvH/hWGWZDx0qWO4dCGBII7ip5h2L82nlTgvtP+0KrPZSjkbH+jVft9VhnTyrtQQeN4qO7025C+bp06TL/AM8pDj8mH9RSaT2AyJo2X7yEH3FVJJFUdeadN4hlsJvK1Kxnt+cbiNy/nTzf6XerkBTnupK1yVdFcpDdME8935SofKP3s9vetcmCxTZcDcm4/Oh5APrVWK4kuA0dtHjbznODTzAbiaGV0EinO5Dkfn+NfN4ibqTvLRHTHRE8losVspkKT23VY+4z3z9Ka8FvayRyQiT7G/OGHCmp44omsnW5LbsHdGrcLjgfXtQdQVdOkhYeblNqoozx9K57vYTIbmWAT2zoGUOSGUDCk9jS3Fwl7fpsuFjZUAd
gM5PSobm/hmsIbZnVmaQEJjgge/arN1FHdxWoh8qBgx3N/s/TuelVta5DZWN79gmliMgaVMkcEBqnhube4s5FmTMjjcGbruPpisvVIn0u5jE9wkz3PAcLjaKkb7Pp8kEkL7lkByHP3SMcj3q+RWTW7Iua0OpLJYzQyZ+VMFAMMD2xn3plnfva3Dw3aeXMg4UntjrVC9vXk8m+ijXbG20kA7m96fNqCSz29y6EmL5Hcp0zjGah0tNtylI14dTVpZHPIz90jk1Na5aLzvNBc/MExwB6fWq7zW63cExVBI8fGO/PFSXksUcgMZbMv+s2jhPeudxWyNUyO7vfOuo0BYKAGc+vtUVwr3upCFNywxrukfH8q02mha0kAjUqqZB6Z46VAl15kGyFQCVOBn+tVSqcmtgavoQtDDgorlSOnOahkzCu6QjaOd3bFPQmGVEeAmM43yZ6+uKknaPa8IAdG4OfSvewlanJ6JL0MZJmb9ttWOVcyeyKT/TFC3Fy5/0fTZCv964kWIfhjcf0qyMRjCKFHsMU1mYmvWi0ZsehuMfvPIiOekbGT9SF/lUhl2+hqDDkU3Yx5NbJiLH2kjocU8XLf3jVUITRsNXcRdE7H+Ol85/7xqmEYdDThuHencC557g/epftL+oqllqTLU7gQKvNPCkHBNOVkIG5amVI2Aw2KYEITqKTYQfSrnkHqMGlMB/u8UgKTKfc00oSORV4REdRQbfnikwMxoh0IqJodp+WtY24I55NNNvj+GpaAx2hYcg0+C5ntmzHIw9uoNXmtjk4FQSQbe1ZO62KRZTVbe7Qw30KAHuRlTUN5YlSj2RhEWeU8sFT9DVCZAB2qvDcSQMfLkIH90nINcmJqtQa6lRjqaUTNNI1rtWJ8E7umfUVIoffHapcA7Ry5GCB/WoLC4juhKZGjSUHaoc4/HPTFVPImsbo3VyywhMjy2OSwPpivm3FuTT3OnoXLuAxTC3FzuWYFtxGDwelN06eHTmfjYe4cckVMyiOSO4nlWSXd8qfwjIqS6uYZb+1V9nmxhjz9OKm+ljNmJetBJqdw8mYlAHl4UqPekt5tQnuVeOESW6ggEN6VY8Tajb6nHBYxnE5bJbHQVDpMdxbXCWECmYjJGDwfqa6Uv3d2tfP8yGJBqcWpTvHdw7Y0G1S33vfH5VB/ZN5GrtegPDJ8lsQ3IPbP1qxc6PEbGW4Fw0FxHlnjK8euKqy67P/AGfBEkb/ALyQYJGFBHaqjr/D2/IhmlYvcaVcpHeRqpC5A/hI+vrVy1vree/vgPnWQj91jqMAdPrVKG6XVZYoZ2ZPJ+YHg7varl8sVlqdvNZkuZ1KMMZIA57VzySbs9xok0/ZCstvdQsuCRtk6qvbBrQWxKwFldjC7fOCfnVewqnA1trLkytiZMojdCh9SO/NT28k8pksN+LhfldsHGPUf0rCd73+81iaBitksnOPlRSWUnhlxVKxaJnCR2hjTqrO2SfpT2tJlb7MzLJCQCW3YJGf8aa8aWDhZWdl6RcYI+prJbWuakrNiCZdwfa3y4+g/wDr1EkYI5/OkjCG1OxtoLDrT9siDBU49ua9fApaNGM77DvIHrxSiFD1wKZ5rZ9KUEsc170DFj/JGeBSi3p6c8ZqYLnoea3RJWNsMcUwwe1XQhPUU7y+MVYGeYcUhirQ8oDjrSGIdSKYGb5dJ5RrQMIPQU3yPaqAykjPpU6R1l29zOhAT5h/d61s28jSLl49n1NNO4CqpHQ08zGPgcmplVT3psiLTsIbHdIxw6496tCNSuQAR6g1mSpg8Ypsd3JC3ysfpSGanlD0pjBV6kVB9viuE2Sh4mP/AC0jP9Kgk02WXm3vkm/2WO1v8Kl+QE8kkXr+VVnngHVAfqaqTaZqEWS8Dkf7Jz/KqLiQZDRupHqprCU5LoNIvy3cfRUT8qpl43kG9FC9ziqbS7eM0eflSvXNefi5TlHQ1glcnu1DeXFbhDvPDHt70+5toZWG26jeeDBUyc+Zj2H
P6UkMUJieGEM08gyAR04/SqcNtDYXsVzPdr52SPKAzgnjqK8OPrt+JsTBvPuRDdRPEoG5se/Sori2jsZZFeYusg3K5649KtXLPeX0QikChMh3xnI9qy9ctZY9RiWdzJbsu5SOBmrpq7tsZyNEHTbfRZF8tRJtIEnVmPaq1ldtpzjdJkS4IYDkN6fSqul6XaXBkMrtuJIQZ4X3qndLKLuWISGUQpuDIMFe3Pr2rRQjJuN7kGhrerx3LyNEMoRtlYfxN7Cqtiw1eeGzm/dQRfOXHXPQAVGLL9y7yFEeNcqAfvZ9ahluvs9ukcKmK5XBwBwRnk1pGKS5Ybg0dJZG0jtpIZo8BWPzg859aS2ubi2vY5bmMIJlzC56FQeh9D7VmWIE9rJO75kzkN0GRWhLNeajosyx2jTBfmIf5SPcVzyjrZiHyXTx628kUQ8iYgGQdA2Oa2pIUWKO5tpHNxwjlv4/rWNp0M2raaFtisYUhgX7MK0tPiluoj9omEc8ZZQo6bgec1hVVvloaRZZmWW1jFzNIhiYbWIJ+Q1E01rqN0oeSR1UcBBhc/jz+lWbWKa8jIZAsQOJFc8v7AVS02IxFHkCozE+WhOSR3rFWs31Rqi7A0LRSLblWBGACw+U++Kqi4uI5WRAzhTjIU4NTzzWUEx/erHu6j3Hf9azl1GVh8x4J4NdeCS573aImaS3Of8AWwMPcCp0MMnKPj2NZi3b/wB6p470j70aNX0tKatqc7RpCNlORyPapApzmq0N1bN/ejPvyKuo6EcSK1dSVyRytinja3UUmFz1FL8o6EVdgHgKKXYDzTMgdx+dAkIosA4xDsKPJpPN9qPNHrTA5tBs4Xge1So5HrTTG5HCmmiKQ1QF2OUHgnFSk8etU44znk4NW1QbeTzTEV5Bmqrrz3rSdFxxUDL3xUtDKBB7UbmXoSKtNGOuKYYx6VLAE1C5i+5Mw9qe2r3OPnWN/qtRGIdqY0Xqal3AWXUYHH72yjPuKoSzacTuEBQg9hT5oevFZ8sLHPFcde7VmXEn/eeYtzbdeQDnFPu7KVZo5UkhLxsCD94OfbHWqUbSxgx5zGex7VYa2l/s6NxM0pY/u1QZI+mK+dqRcJnQndGhePGbQO2yMR/MvOM//rrOtymqTML5CsbMNik42iq0cTx2ciXAledJVJjbkIvX86YJUmuw5Lquwgdt/wBPWkoct7feTIfYQ20cs+/zwsbsTsU4I7HNJLo9zY28+oROHSVsuO6qff1ratLiA6O0cpHkiPGdwB4//VVW0uWmsxazo/lzZBcnoMfzo9pK7ZNjKvbeC3KusjzRsnC9dvp+FVHhj+zRsvF2W2kdSR3/AAxW42nyaXL9ktla5gnXO6Q8r+fasW3MqXbwsoM7AgZ4xj6/Wt4Surp3GQ6fFK8skUrEKW+6p4BroLLUpo7l7JciUJkuT8uK5mO21CPUGO8tKnLFDnIqzHdOmpSOJCAVG5mHJp1aanfroTY6K0uDpz/ZCWIGXDp3z6+9WLF7i9vLohkiBfKt6ZHSsHQonvdRmubySRguAFJxx9PSukhsltdRk+yN8kxBCHsTXHWSi2upcS3LMdJkiSWT5X4STt+NUY1k/tF4CVBtiWjdTxIjc8ep57U3Uzf7lR7fdCGyCg3dK0LuaKysYVcDzCwWM7eVz3rFaLu2aIpzWUcTPHdReZFIN6S9Gz3z71Sm0+SNRJC5mg6hh1H1FS3LzvcrNNcJKpGFIOFGDyMdj0qJFubWZpbdv3bHJj6r9R6V6OC3tIzkuw1M8c1MrleCaso0F4uWType4Hr/AFpklnJGN33k/vCvbhDsYtiLLz1qZZcdDVbbkcCnAHtW8SS6s7DoxqZZz6ms0PjrUqyVqmwNETE9zThKezGqKy89alDe9UmIti5cd81ILs+gqiGpd4qrgSC4hI+9SfaYecGssI1OCMO2adwNDz4ienNPE47CqSIT2q1FCfwpgTGTd3xTTz3qVIgKlEQ7UxFXy/SnCLP
arax47VMsakdKVgKItQad9i9av4A6CgjNS4gZUlkDxjNUbizCg5wAOSa1r6+t7NP3jZbsg5JrltR1Ca+yp/dxdkHf61hU5UtSkmV5prWRinmYj5yw74qKUyCNYbW4KKRwu7H41UWFmc47dfap7aJ5bwY+Zdp3c8189iYWnzNnTF6WHWt3BZwtbvJvn5J2Atkn1Nal49tdaKN6JuhT5Gz0z1Hsax4IBaXztIuUYbd392kupLaK4SRJVZUO5152nH9a5nFSkmhstyWMFxpyQwnDMR5f/wBeoL++utPgt4ZIkw7ArKG+X3xVgSpNJBLYoZTgsQPlO08d6ytZvTq9qbeFNkdq4355PPAIPpnj64rSlFylaW35EMty31ydUTYyOHUDviMU24kuBDJbG0mdo23m5Vcqozyc9/oKct5ZyaD5EETmXAwEUhg44zmtXSdSh/stLedlLbPLkXqzk55x1NKT5FdR2AyIMLcuYZwUCgs55JaibRp49MM7zI4yGYAc8ntV2Hw1DFpzm2u9kj/vA7DgL6GmWp1P+yWHA2pmNxxkgcAj86PaXd4PqFjVleCLT2n2YaNRtK8MQKp6XqM020mPczyEluwH8+P6UzTFuDp9159s8t42XiMg4ZSMEfzqTSrhNTsrqyMUdtK6HDdAD/jWDgkmnrqWhZXv4tUMrMwRslGU5GPatO5gklaK4wJjt+cDnI6jHrWZps88McNnKMCJXRyeQw7Y/WpVvbqynIIV4WO5UIxtB7D6UKnzTUU7D5rIsX1vFqESmJlSc/dbOAf9k+lYkTy20xjdWRgcMp4wa12tbbUpfPspzbXf8SP91/qP6ipJrZpcRX0RhnAwsvUH8e4r2sNhfZxtuYylcpiXOD1PrV63vcEBzj3qhJDLaNslXGfusOhpEkB4rtjeJD1NkxRycj5Se46GoXt2U5x+VVIrhounK+ladtcLKOPxU9RXRFpk2KZiI5IyKaU9K1WgBGV6GoXg9sVaQijgqc1IrVIY8cVGVwaYDw2aN3tUfIOMUZx1FMColwelTLcetY8EzMil12MRyKtI/vVJjNVJ6sJcZ4BrIV8fxVMk2OlO4jYS4Gcc5qdJWx/jWRHcE9MCrUcjN1ancDSDnuRUitVISKg3OwAHcnFQTauqjbANx/vEcUm0hWNZ5khj3yMFHqayLzV5HBW2XaP75/pVCSaS5cF2Lv2H/wBanraTN94rEPVzj9Kzcm9h2KMgLOWclmPJJ5NMispbs/IpCeta6WllEMyFrh/TotLPMzx7eEjHRF4FZOn3Hcx5beCBcMd2OiL0/E96zri6k6Q/JngBa0bqMYJNUU3JcBYY98jDCrjrXFXg+iNIsqbbrbHGznjgA9s9/rVu6sopIo7bcFt4z8zL1Y+/tV2a0EUCNcYeQDkZ4qqLh50MBUfLli/oo7f0ryqlKpFq+5opJk8NjLqdvLcRwxRAjYrg4woHU+grHmt1tdFuJIiRbtIPNbPMuD8oX2J5/Crbx3Oo6WzjfHHI5IROFYD19aFtfK0R/MkEqRfejJ/iPQD/AD61nB8ujfXYe5HGFn8PGHTS0JmfLGQ8r0OM1c02EmTUXhnie7eMJFt5wcYYj86ppNqEcuLe2BtLlVBQj7uM9PSrSSHS7iK8MLbZplXy2Od2Rg49MD9cU5t2aXXX/hwSGH7Tp2gW0UluxYytG28cBc5x+PrWxFdb7dmtoZjE6kR5GSB0/Qg1nWLtNrV2ZhvhY/JC/IxngAU2Rv7S+aKV7ezAbyVX+IBiCD7ZH61lOKk7P1v6jSJbfUdUknaARIG4UkKScDv+NXLCG2mvJpEjLZdmDdmGc7x7+vvVH+0jZazCiZaBIkDKRyD3x+OKtIrmUgYJD70QDgxtnHPfP9KicdNrXGMuruKe/jmRCPLYDPTI71PehhJluR2rLinxNIjLlCxwO4rbhxc2a5O5lG0n+tehhsLrcznIz1GDuGQ3Wtez1QhPJuR5kXTJ61mvGUbHSmjj8K9SneB
k9TpPscFzbkQuHjP8BPT8e1Yl5pz2bFgCY/UjkfWi3uXhbKMVP1rUi1NpF2yBXHQ5711WjJE6owgeeePrT1ZlYEZBHStZ7fT5udjQk/3Dx+VQPpqE/u7kH03Cj2bWwXLNleiQhJCFY/ka0Gi4rDNjMvRkb6HFalhcvgW9wCrjhWPetI32YmDwioHgz0rTePNQMuKuwjKeEjmm+Wa0mjB7VEYOaLAciBTwxA60pX0oCVBYqyN3qQSEdqjwT7U4KR0paiJ0uGHQVMLyY8LxVUDHWpFzjHXPQDvRdgSmRmO6Qlj6ntVy2s2lAd/kQ9B3NPtLEIFknAz2X0q8TnhapR6sVyEQiJSEG0d/ekEZ/CpxGep6UpPbFOwiuI6jljwOetWSCBmq0uM5zxUtDKM0TSuscalmY4AFaVvYRWEJPDSsPmf/AD2qzY2/lp50g+dhx7CmXj449qhxS1C5g6g5lc/3RU2n6YptHkuB8snUHuOwqWO0Nzcqh+71b6VpXK7k2Lwo4Fc6pJtzY79DnL28nEj+S+yMcAYqsqg2gnmJkVW3yKeuR0rQvbUD5QKqTxFNKmI/vL/hXmYjDato1jIhsZDc2s0wfypSSsag446ULp8MtzBh2PkoN3s3p/WmwxsbSJVGGLFVA75rXMH2BYY4xnAO73NcyoVHzOJfMjOa+hkQpLGY9pZS6KeG7HNPsw9ksdtKo+yWzM5P94MD/ImoJ4AZSSOSckGtJIftGnvGRmRUIX8ulNYZuNkg5tSOaWOe6ilt412ttQkjtmogkmnak7x/xZHPpUGmyAEQsTw4Zfz6V02pWAY7wPxrqw2D91qREp6nLSoRLkjk81oadP5MoVj8j8GmXMJB5FQbSGx2rtjHkehO5vXNvuGQOaznQqeRWjpt2J0+zSn94B8p/vCn3FqRniuvlT1RBkinq5HtTniKE8UzGDQlYLlpZg3B4NSiXFUgTUqscVohFoSZ6GpEmI4PI9KqA5pwY1SYG/DOJUAPWnOtZNvcGNhnpWj52UDKMjvVCGOtR/jUxkVqaVGaAOU8s96UR+lTDHfmjAzUjItgHWkK1IRTcjtSGN2/hWpY2ohAlkHznoD2qCxtwzea3IXpn1rRxk04oQ8ksetSLGKRIwe9TFcAAVYhp4HFAJHbmpQnA9ajkzyAaTArvhs+lMihEtwoxlRyaGHBPartpEIodzfebk1G4EsjhEz7cVkykuxY9e1Xbh9zEdhTLWIPJvP3V6fWolq7DC2hMMOMfvG+8fT2qb7P/e5qdU4J6AcCpjHhQcU7CMO9h3MTj2rLvYtunMv96RR/M/0robtADjHasfUF/dwJ/tFz/L/GuepHdlIj0Wx82VXYfLDkj6mtOa2Lv0qzodvt0/eRy7E/0q48YBAxzVwpJQSBvU5O8tSr9Kdakoy1sX1sOpFZJTyx+NT7NJhczr60MF84TgN86H612cDC+0+Gbj50BP17/rXP3sXnWaTj70R2t/unp+tX/DlySklqT935l+h61UI2k13B7Fe+ttkmMcVjSxlJOK7O8tw65xmudurYhjjtVSgCZQjchgQcMDkH3ro7O5W+t/mx5q/eHr71ze3axJFWLad7aVZU7dR6iiGjBmvPb5BwOaz5IipzitsMlxCJU6NVaaAOpwMGtrEmTjnpSqKlkRgQccios880rDJR0oBpEp7DPIpgOU81agnaM8niqPSnq5HemgNcbW+YdDUm32rNhmKnrxVoS8fepgYGQBTS5zxUXmUbsnrSAk6n1qSOMuwUVEpArRtY9ibm+81CVwJ0QKqqowBxVhIwelMQAnHep1wBgVQiRUxxT9nPJpiMfTJpxbPTigBXGBwaquCRkmpJGO04NVmY49qhgPiTzZVU9Op+lXZZAM1UtPlV3IxuOB9KJZAX46Ck9EBHIST3rQt4wkap36tVS2jMjbyOAePetSKPb1/GiMeoMcibm56CpXXI9qcigCkm4iNNrQDKusFyfwrJvQDPjsigVryYZxnp1rJ
5mlyesj5/WsZrSw0dLYw+XYQr6IKc6j0qaMbVC+gpjjn8a3toIpTx74mB61h3URDV00iA8isq8h+UsB14qXEDNs2RmaCU/u5QUb2z3/CqVpI+n34Zlw0TlJB7dDVhl8t8Gm36b9lwOS/yv7kDr+VZtdSkdaCs0QKkFSMg1kXdttdsUnh673I1q55TlPpWpcxZG/HNarVXJOTnt9ueOtVNpXiuimgDBhisua1OCR2qXEdxdNvTbS7HP7p+vsfWtiYY5HQ965sqy9a1dNvRIn2aU8j7pP8AKqi+giSRBIMj7wqm6ZJBGDVyVdrHtULFXODwabGVFJXqKnVtw5pjrtYhuaFyOnSkBIycVHjHFSqcikZc9KYDQeafvPrUJ4NLvNFwME3IHemG9iTlpAPxrg/7QnbrNIf+BGpraR5ZQoyWY4FZ8wHoWnTx3cp2HKJyT2rdjOTWJpUC2tskI6jlj6mtpGAFaoCynB4HNSphScjJqBHwcipN/HuaYibzMdOtG4EYqEMCPencAHFIBJXAXHaoDlwOwpxy30pBjIHYVLGSb9sfHbpUagyuFHUmo5JMtgdBVqyjyTIe/A+lK12BegQAAAcAcVaUHNMRMKKnXGMitCSUDAqG5bjFTZGCap3DDGaljKFw2I5Gz2x+dUrRN17AuONwqzcnMeP7xqPThnUY/QAn/P51k1eSGjpM4HvUTHrTlPHNROeSexrckc/K8duaguIg8RHerIGUP0pjr8ucdKAOcuYjux3zUGzzIJIT1IyPqK1b2Hdll6is4fLKDjvUNDRSs7g2t3FMP4WGfp3rtjiSMHqCK4adNk7pjjJxXV6Ncefp6AnLINppQ7DYyaHDHaao+ScsOtbE6dGFUnXkkVpYkyprXcM4rPeF4XyMjB4I7V0YUEGopLVXBBHWpcRmfDdi4jCScSAfnUchwxp8thhuOMVE0Mu3ruxQMBJng80dDkdKrsxQ8jFOWQ9c0XAtA57Yp3eoVcHvUm71oAR1zUW2pjg0m2lYDxFQc1saGmdQiz25rPEfNa+iri9U+xrGO4Hd2bYArSjO41j2rHArSSTA4rdMC8JNowKcCTjmqqNuFTqcDmmBMCQODQWLcA8UzOeO1Ix7CgALYXimF9qZNVp5xvWFD8xPzH0FEkmSBUtgSpmSRVHVjW7bxhVAHQcVj6aheZpP7vArbT5acUJkwFTj2qBOmTUobkYqhDyflNUJznirjHGRVC4btSYylcHlR7Zo00/6cfZD/MUydssx7dKTTmH21vdD/MVmviGdErfJULHAIzxmlDfKKimPHHrWxJaTlCO9SDBGCO1QxN8n4VMBwPWgDPuo8ZI6VkSRgOCOhreuANprGuEKscdM8UhmZqCbLkH+8oNaHh6fbO8RPDDIqnqfIhbvgjNRafN5F9C3bdistpDOycZ47VQkADlfyrQPIyOtVLlAQHHUVsSVhgZp+3IBFRZwwJ6Gp044HQ0DIZo8jP51QkXYT7VrlQciqVxH19aGBlzqMhu3Q1WeLbyvFXJB1WoFPGKzkNFfJU09Zh3pzpmoHSlcZZ81fWk89fWqmD2pMUcwWPMQvNXrBjHcRkddwFUhVqDggjrWC3A7W2YgYrRjY4rMtTmNT32itJOgreIFtGx0qdSe/Sq0fT8amXk1YibJPPaqVzfAApFye7Ut8zARqCQrZyB3rO6jmonK2g0iS3bMpYntUzP3qCH+OpO4qUNnQadH5dqmep5NaAJ4qvB9xfpU461siCdTlQTUhOce1Rp92pOxoAbI2Kz5z8w/OtB+1Ztx95vpSYFKU5TNRWLldQUeqGpJfun6VBZf8hNf90/yrNbjOkV8pmoJXOCD1Bp0X+pqK4+6a2EXbdsoAatRnjPeqNt91aux0IRDMODWTcja49DWvN1rJuvvGgZl34zAmezGs8NtYEdjWhf/AOpH1/xrPHU1jLcpHbW0omtY3B4YCkccFT0NVNGJOmLn3q3J0rZbEmbJmOQoenUVLBJztqO+6p9
aZGTvBpAaQ+YfSoJUzk1Knf6UjfcNUBiXSFTVHcUc+hrTvvu/hWY4+UVEhkoIYUx0psR+apm6VBRUZcUYFSuKZgUAf//Z"
 6 |       }
 7 |     }
 8 |   ]
 9 | }
10 | 


--------------------------------------------------------------------------------
/model-selection.py:
--------------------------------------------------------------------------------
 1 | import tensorflow as tf
 2 | from tensorflow import keras
 3 | import tensorflow_datasets as tfds
 4 | import shutil
 5 | import os
 6 | 
 7 | 
 8 | # Scale the data from range [0, 255] to range [0, 1]
 9 | def scale(image, label):
10 |     """
11 |     Scale image
12 |     """
13 |     image = tf.cast(image, tf.float32)
14 |     image /= 255
15 |     return image, label
16 | 
17 | 
18 | # Variable to track the best model
19 | best_model_path = ""
20 | best_accuracy = 0
21 | 
22 | 
23 | for i in range(1, 4):
24 |     model_path = "trained_model/saved_model_versions/" + str(i)
25 |     model = keras.models.load_model(model_path)
26 |     datasets, info = tfds.load(name="fashion_mnist", with_info=True, as_supervised=True)
27 |     mnist_test = datasets["test"]
28 |     ds = mnist_test.map(scale).cache().shuffle(10000).batch(64)
29 |     loss, accuracy = model.evaluate(ds)
30 | 
31 |     if accuracy > best_accuracy:
32 |         best_accuracy = accuracy
33 |         best_model_path = model_path
34 | 
35 | destination = "trained_model/saved_model_versions/4"
36 | if os.path.exists(destination):
37 |     shutil.rmtree(destination)
38 | 
39 | shutil.copytree(best_model_path, destination)
40 | print(f"Best model with accuracy {best_accuracy} is copied to {destination}")
41 | 


--------------------------------------------------------------------------------
/model-selection.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: Pod
 3 | metadata:
 4 |   name: model-selection
 5 | spec:
 6 |   # One-shot script: do not restart the container after it exits. The Pod
 7 |   # default (Always) would re-run model selection over and over.
 8 |   restartPolicy: Never
 9 |   containers:
10 |   - name: predict
11 |     image: kubeflow/distributed-training-strategy:v0.1
12 |     command: ["python", "/model-selection.py"]
13 |     volumeMounts:
14 |     # The strategy-volume claim is mounted here as /trained_model.
15 |     - name: model
16 |       mountPath: /trained_model
17 |   volumes:
18 |   - name: model
19 |     persistentVolumeClaim:
20 |       claimName: strategy-volume
21 | 


--------------------------------------------------------------------------------
/predict-service.py:
--------------------------------------------------------------------------------
 1 | import tensorflow as tf
 2 | from tensorflow import keras
 3 | import tensorflow_datasets as tfds
 4 | 
 5 | model = keras.models.load_model("trained_model/saved_model_versions/1")
 6 | 
 7 | 
 8 | # Scaling mnist data from (0, 255] to (0., 1.]
 9 | def scale(image, label):
10 |     """
11 |     Scale image
12 |     """
13 |     image = tf.cast(image, tf.float32)
14 |     image /= 255
15 |     return image, label
16 | 
17 | 
18 | datasets, info = tfds.load(name="fashion_mnist", with_info=True, as_supervised=True)
19 | 
20 | ds = datasets["test"].map(scale).cache().shuffle(10000).batch(64)
21 | 
22 | # Evaluation
23 | test_loss, test_acc = model.evaluate(ds)
24 | 
25 | # Print
26 | print(f"Test loss: {test_loss} and Test accuracy: {test_acc}")
27 | 


--------------------------------------------------------------------------------
/predict-service.yaml:
--------------------------------------------------------------------------------
 1 | # Pod that idles on `sleep infinity` with the model volume mounted.
 2 | apiVersion: v1
 3 | kind: Pod
 4 | metadata:
 5 |   name: predict-service
 6 | spec:
 7 |   containers:
 8 |   - name: predict
 9 |     image: kubeflow/distributed-training-strategy:v0.1
10 |     # Keeps the container running; presumably predict-service.py is then
11 |     # started by hand (e.g. via kubectl exec) - confirm against the README.
12 |     command:
13 |     - "sleep"
14 |     - "infinity"
15 |     volumeMounts:
16 |     - name: model
17 |       mountPath: /trained_model
18 |   volumes:
19 |   # Backed by the strategy-volume claim.
20 |   - name: model
21 |     persistentVolumeClaim:
22 |       claimName: strategy-volume
23 | 


--------------------------------------------------------------------------------
/pvc.yaml:
--------------------------------------------------------------------------------
 1 | # Claim named strategy-volume requesting 1Gi of storage; the training,
 2 | # model-selection and prediction manifests all reference it by this name.
 3 | apiVersion: v1
 4 | kind: PersistentVolumeClaim
 5 | metadata:
 6 |   name: strategy-volume
 7 | spec:
 8 |   accessModes:
 9 |   - "ReadWriteOnce"
10 |   resources:
11 |     requests:
12 |       storage: 1Gi
13 | 


--------------------------------------------------------------------------------
/requirements-dev.txt:
--------------------------------------------------------------------------------
1 | black
2 | pre-commit
3 | ruff


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | tensorflow==2.13
2 | tensorflow_datasets==4.9.2


--------------------------------------------------------------------------------
/tfjob.yaml:
--------------------------------------------------------------------------------
 1 | # TFJob that runs /distributed-training.py on two Worker replicas.
 2 | apiVersion: kubeflow.org/v1
 3 | kind: TFJob
 4 | metadata:
 5 |   name: training
 6 | spec:
 7 |   runPolicy:
 8 |     cleanPodPolicy: None
 9 |   tfReplicaSpecs:
10 |     Worker:
11 |       replicas: 2
12 |       restartPolicy: Never
13 |       template:
14 |         spec:
15 |           containers:
16 |             - name: tensorflow
17 |               image: kubeflow/distributed-training-strategy:v0.1
18 |               imagePullPolicy: IfNotPresent
19 |               # Script arguments: saved-model directory (version 1),
20 |               # checkpoint directory and model type.
21 |               command:
22 |                 - "python"
23 |                 - "/distributed-training.py"
24 |                 - "--saved_model_dir"
25 |                 - "/trained_model/saved_model_versions/1/"
26 |                 - "--checkpoint_dir"
27 |                 - "/trained_model/checkpoint"
28 |                 - "--model_type"
29 |                 - "cnn"
30 |               volumeMounts:
31 |                 - name: training
32 |                   mountPath: /trained_model
33 |               resources:
34 |                 limits:
35 |                   cpu: 500m
36 |           volumes:
37 |             # Backed by the strategy-volume claim.
38 |             - name: training
39 |               persistentVolumeClaim:
40 |                 claimName: strategy-volume
41 | 


--------------------------------------------------------------------------------
/workflow.yaml:
--------------------------------------------------------------------------------
  1 | apiVersion: argoproj.io/v1alpha1
  2 | kind: Workflow
  3 | metadata:
  4 |   generateName: tfjob-wf-  # name prefix; a unique suffix is appended on creation
  5 |   namespace: kubeflow
  6 | spec:
  7 |   entrypoint: tfjob-wf  # template that the workflow starts from
  8 |   podGC:
  9 |     strategy: OnPodSuccess  # clean up each pod once it has succeeded
 10 |   volumes:
 11 |   - name: model  # mounted by model-selection-step at /trained_model
 12 |     persistentVolumeClaim:
 13 |       claimName: strategy-volume
 14 | 
 15 |   templates:
 16 |   - name: tfjob-wf  # entrypoint: four step groups, run one after another
 17 |     steps:
 18 |     - - name: data-ingestion-step
 19 |         template: data-ingestion-step
 20 |     - - name: distributed-tf-training-steps  # trains the three candidate models
 21 |         template: distributed-tf-training-steps
 22 |     - - name: model-selection-step  # runs /model-selection.py
 23 |         template: model-selection-step
 24 |     - - name: create-model-serving-service  # creates the InferenceService
 25 |         template: create-model-serving-service
 26 | 
 27 |   - name: data-ingestion-step  # runs the data-ingestion script in a container
 28 |     serviceAccountName: argo
 29 |     memoize:  # reuse a cached result of this step for up to maxAge
 30 |       key: "step-cache"  # constant key, so every run shares one cache entry
 31 |       maxAge: "1h"
 32 |       cache:
 33 |         configMap:  # ConfigMap that stores the cache entry
 34 |           name: my-config
 35 |           key: step-cache
 36 |     container:
 37 |       image: kubeflow/distributed-training-strategy:v0.1
 38 |       imagePullPolicy: IfNotPresent
 39 |       command: ["python", "/data-ingestion.py"]  # NOTE(review): repo file is data_ingestion.py - confirm the path inside the image
 40 | 
 41 |   - name: distributed-tf-training-steps  # one step group holding three training steps
 42 |     steps:
 43 |     - - name: cnn-model  # all three entries share this group, so they run in parallel
 44 |         template: cnn-model
 45 |       - name: cnn-model-with-dropout
 46 |         template: cnn-model-with-dropout
 47 |       - name: cnn-model-with-batch-norm
 48 |         template: cnn-model-with-batch-norm
 49 | 
 50 |   - name: cnn-model  # creates a TFJob: model_type "cnn", saved under version 1
 51 |     serviceAccountName: training-operator
 52 |     resource:
 53 |       action: create
 54 |       setOwnerReference: true
 55 |       successCondition: status.replicaStatuses.Worker.succeeded = 2  # both workers succeeded
 56 |       failureCondition: status.replicaStatuses.Worker.failed > 0  # any worker failed
 57 |       manifest: |
 58 |         apiVersion: kubeflow.org/v1
 59 |         kind: TFJob
 60 |         metadata:
 61 |           generateName: multi-worker-training-
 62 |         spec:
 63 |           runPolicy:
 64 |             cleanPodPolicy: None
 65 |           tfReplicaSpecs:
 66 |             Worker:
 67 |               replicas: 2
 68 |               restartPolicy: Never
 69 |               template:
 70 |                 spec:
 71 |                   containers:
 72 |                     - name: tensorflow
 73 |                       image: kubeflow/distributed-training-strategy:v0.1
 74 |                       imagePullPolicy: IfNotPresent
 75 |                       command: ["python", "/distributed-training.py", "--saved_model_dir", "/trained_model/saved_model_versions/1/", "--checkpoint_dir", "/trained_model/checkpoint", "--model_type", "cnn"]
 76 |                       volumeMounts:
 77 |                         - mountPath: /trained_model
 78 |                           name: training
 79 |                       resources:
 80 |                         limits:
 81 |                           cpu: 500m
 82 |                   volumes:
 83 |                     - name: training
 84 |                       persistentVolumeClaim:
 85 |                         claimName: strategy-volume
 86 | 
 87 |   - name: cnn-model-with-dropout  # creates a TFJob: model_type "cnn_dropout", saved under version 2
 88 |     serviceAccountName: training-operator
 89 |     resource:
 90 |       action: create
 91 |       setOwnerReference: true
 92 |       successCondition: status.replicaStatuses.Worker.succeeded = 2  # both workers succeeded
 93 |       failureCondition: status.replicaStatuses.Worker.failed > 0  # any worker failed
 94 |       manifest: |
 95 |         apiVersion: kubeflow.org/v1
 96 |         kind: TFJob
 97 |         metadata:
 98 |           generateName: multi-worker-training-
 99 |         spec:
100 |           runPolicy:
101 |             cleanPodPolicy: None
102 |           tfReplicaSpecs:
103 |             Worker:
104 |               replicas: 2
105 |               restartPolicy: Never
106 |               template:
107 |                 spec:
108 |                   containers:
109 |                     - name: tensorflow
110 |                       image: kubeflow/distributed-training-strategy:v0.1
111 |                       imagePullPolicy: IfNotPresent
112 |                       command: ["python", "/distributed-training.py", "--saved_model_dir", "/trained_model/saved_model_versions/2/", "--checkpoint_dir", "/trained_model/checkpoint", "--model_type", "cnn_dropout"]
113 |                       volumeMounts:
114 |                         - mountPath: /trained_model
115 |                           name: training
116 |                       resources:
117 |                         limits:
118 |                           cpu: 500m
119 |                   volumes:
120 |                     - name: training
121 |                       persistentVolumeClaim:
122 |                         claimName: strategy-volume
123 | 
124 |   - name: cnn-model-with-batch-norm  # creates a TFJob: model_type "cnn_batchnorm", saved under version 3
125 |     serviceAccountName: training-operator
126 |     resource:
127 |       action: create
128 |       setOwnerReference: true
129 |       successCondition: status.replicaStatuses.Worker.succeeded = 2  # both workers succeeded
130 |       failureCondition: status.replicaStatuses.Worker.failed > 0  # any worker failed
131 |       manifest: |
132 |         apiVersion: kubeflow.org/v1
133 |         kind: TFJob
134 |         metadata:
135 |           generateName: multi-worker-training-
136 |         spec:
137 |           runPolicy:
138 |             cleanPodPolicy: None
139 |           tfReplicaSpecs:
140 |             Worker:
141 |               replicas: 2
142 |               restartPolicy: Never
143 |               template:
144 |                 spec:
145 |                   containers:
146 |                     - name: tensorflow
147 |                       image: kubeflow/distributed-training-strategy:v0.1
148 |                       imagePullPolicy: IfNotPresent
149 |                       command: ["python", "/distributed-training.py", "--saved_model_dir", "/trained_model/saved_model_versions/3/", "--checkpoint_dir", "/trained_model/checkpoint", "--model_type", "cnn_batchnorm"]
150 |                       volumeMounts:
151 |                         - mountPath: /trained_model
152 |                           name: training
153 |                       resources:
154 |                         limits:
155 |                           cpu: 500m
156 |                   volumes:
157 |                     - name: training
158 |                       persistentVolumeClaim:
159 |                         claimName: strategy-volume
160 | 
161 |   - name: model-selection-step  # runs /model-selection.py with the model volume mounted
162 |     serviceAccountName: argo
163 |     container:
164 |       image: kubeflow/distributed-training-strategy:v0.1
165 |       imagePullPolicy: IfNotPresent
166 |       command: ["python", "/model-selection.py"]
167 |       volumeMounts:
168 |       - name: model  # refers to the workflow-level volume named "model"
169 |         mountPath: /trained_model
170 | 
171 |   - name: create-model-serving-service
172 |     serviceAccountName: training-operator
173 |     resource:
174 |       action: create
175 |       setOwnerReference: true
176 |       # Conditions are fields of `resource`; a template-level successCondition
177 |       # is not part of the Argo schema. In KServe's status, transitionStatus
178 |       # sits directly under modelStatus (not under modelStatus.states).
179 |       successCondition: status.modelStatus.transitionStatus = UpToDate
180 |       # End the step as failed instead of waiting forever when loading fails.
181 |       failureCondition: status.modelStatus.transitionStatus in (BlockedByFailedLoad, InvalidSpec)
182 |       manifest: |
183 |         apiVersion: serving.kserve.io/v1beta1
184 |         kind: InferenceService
185 |         metadata:
186 |           name: tf-mnist
187 |           annotations:
188 |             autoscaling.knative.dev/target: "1"
189 |         spec:
190 |           predictor:
191 |             model:
192 |               modelFormat:
193 |                 name: tensorflow
194 |               image: "emacski/tensorflow-serving:2.6.0"
195 |               storageUri: "pvc://strategy-volume/saved_model_versions"


--------------------------------------------------------------------------------