├── .gitignore
├── LICENSE
├── README.md
├── _config.yml
├── code
├── README.md
├── chapter-2
│ ├── listing-2-1.py
│ ├── listing-2-2.py
│ ├── listing-2-3.py
│ ├── listing-2-4.py
│ ├── listing-2-5.txt
│ ├── listing-2-6.txt
│ ├── listing-2-7.txt
│ └── listing-2-8.txt
└── project
│ ├── .gitignore
│ ├── README.md
│ ├── basics
│ ├── argo-coinflip.yaml
│ ├── argo-dag-diamond.yaml
│ ├── argo-hello-world.yaml
│ ├── argo-resource-template.yaml
│ ├── argo-script-template.yaml
│ ├── hello-world.yaml
│ └── tfjob.yaml
│ ├── code
│ ├── Dockerfile
│ ├── README.md
│ ├── access-model.yaml
│ ├── autoscaled-inference-service.yaml
│ ├── data-ingestion.py
│ ├── http-inference-request.py
│ ├── inference-client.py
│ ├── inference-input.json
│ ├── inference-service.yaml
│ ├── model-selection.py
│ ├── model-selection.yaml
│ ├── multi-worker-distributed-training.py
│ ├── multi-worker-pvc.yaml
│ ├── multi-worker-tfjob.yaml
│ ├── predict-service.py
│ ├── predict-service.yaml
│ └── workflow.yaml
│ └── manifests
│ ├── argo-workflows
│ ├── kustomization.yaml
│ └── rbac-patch.yaml
│ ├── kubeflow-training
│ ├── cluster-role-binding.yaml
│ ├── cluster-role.yaml
│ ├── crds
│ │ ├── kubeflow.org_mxjobs.yaml
│ │ ├── kubeflow.org_pytorchjobs.yaml
│ │ ├── kubeflow.org_tfjobs.yaml
│ │ ├── kubeflow.org_xgboostjobs.yaml
│ │ └── kustomization.yaml
│ ├── deployment.yaml
│ ├── kustomization.yaml
│ ├── service-account.yaml
│ └── service.yaml
│ └── kustomization.yaml
└── images
├── chinese-cover.pdf
├── english-front-cover.png
├── korean-cover-clean.png
├── korean-cover-white.jpg
└── korean-cover.jpg
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright 2021 Yuan Tang
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Distributed Machine Learning Patterns
2 |
3 | [](https://www.linkedin.com/in/terrytangyuan)
4 | [](https://bsky.app/profile/terrytangyuan.xyz)
5 | [](https://github.com/terrytangyuan)
6 | [](https://twitter.com/TerryTangYuan)
7 |
8 |
9 |
10 | This repository contains references and code for the book *Distributed Machine Learning Patterns* from [Manning Publications](https://bit.ly/2RKv8Zo) by [Yuan Tang](https://github.com/terrytangyuan).
11 |
12 | :fire: **[Korean](images/korean-cover.jpg) and [Chinese](images/chinese-cover.pdf) versions are available from Tsinghua University Press and Hanbit Media!**
13 |
14 | [Manning](https://bit.ly/2RKv8Zo), [Amazon](https://www.amazon.com/dp/1617299022/), [Barnes & Noble](https://www.barnesandnoble.com/w/distributed-machine-learning-patterns-yuan-tang/1140209010), [Powell’s]( https://www.powells.com/book/distributed-machine-learning-patterns-9781617299025), [Bookshop](https://bookshop.org/p/books/distributed-machine-learning-patterns-yuan-tang/17491200)
15 |
16 |
17 | In *Distributed Machine Learning Patterns* you will learn how to:
18 |
19 | * Apply patterns to build scalable and reliable machine learning systems.
20 | * Construct machine learning pipelines with data ingestion, distributed training, model serving, and more.
21 | * Automate machine learning tasks with [Kubernetes](https://kubernetes.io/), [TensorFlow](https://www.tensorflow.org/), [Kubeflow](https://www.kubeflow.org/), and [Argo Workflows](https://argoproj.github.io/argo-workflows/).
22 | * Make trade-off decisions between different patterns and approaches.
23 | * Manage and monitor machine learning workloads at scale.
24 |
25 | This book teaches you how to take machine learning models from your personal laptop to large distributed clusters. You’ll explore key concepts and patterns behind successful distributed machine learning systems, and learn technologies like TensorFlow, Kubernetes, Kubeflow, and Argo Workflows directly from a key maintainer and contributor. Real-world scenarios, hands-on projects, and clear, practical DevOps techniques let you easily launch, manage, and monitor cloud-native distributed machine learning pipelines.
26 |
27 | ## About the topic
28 |
29 | Scaling up models from personal devices to large distributed clusters is one of the biggest challenges faced by modern machine learning practitioners. Distributed machine learning systems allow developers to handle extremely large datasets across multiple clusters, take advantage of automation tools, and benefit from hardware acceleration. In this book, Yuan Tang shares patterns, techniques, and experience gained from years spent building and managing cutting-edge distributed machine learning infrastructure.
30 |
31 | ## About the book
32 |
33 | *Distributed Machine Learning Patterns* is filled with practical patterns for running machine learning systems on distributed Kubernetes clusters in the cloud. Each pattern is designed to help solve common challenges faced when building distributed machine learning systems, including supporting distributed model training, handling unexpected failures, and dynamic model serving traffic. Real-world scenarios provide clear examples of how to apply each pattern, alongside the potential trade-offs for each approach. Once you’ve mastered these cutting-edge techniques, you’ll put them all into practice and finish up by building a comprehensive distributed machine learning system.
34 |
35 | ## About the reader
36 |
37 | For data analysts, data scientists, and software engineers familiar with the basics of machine learning algorithms and running machine learning in production. Readers should be familiar with the basics of Bash, Python, and Docker.
38 |
39 | ## About the author
40 |
41 | Yuan is a principal software engineer at [Red Hat](https://www.redhat.com/), working on [OpenShift AI](https://www.redhat.com/en/technologies/cloud-computing/openshift/openshift-ai). Previously, he has led AI infrastructure and platform teams at various companies. He holds leadership positions in open source projects, including [Argo](https://argoproj.github.io/), [Kubeflow](https://github.com/kubeflow), and [Kubernetes](https://github.com/kubernetes/community/tree/master/wg-serving). He's also a maintainer and author of many popular [open source projects](https://github.com/sponsors/terrytangyuan). In addition, Yuan [authored](https://terrytangyuan.github.io/cv#publications) three technical books and published numerous impactful papers. He's a regular [conference speaker](https://terrytangyuan.github.io/cv#talks), technical advisor, leader, and mentor at [various organizations](https://terrytangyuan.github.io/cv#services).
42 |
43 | ## Supporting Quotes
44 |
45 | *"This is a wonderful book for those wanting to understand how to be more effective with Machine Learning at scale, explained clearly and from first principles!"*
46 |
47 | **-- Laurence Moroney, AI Developer Relations Lead at Google**
48 |
49 | *"This book is an exceptionally timely and comprehensive guide to developing, running, and managing machine learning systems in a distributed environment. It covers essential topics such as data partitioning, ingestion, model training, serving, and workflow management. What truly sets this book apart is its discussion of these topics from a pattern perspective, accompanied by real-world examples and widely adopted systems like Kubernetes, Kubeflow, and Argo. I highly recommend it!"*
50 |
51 | **-- Yuan Chen, Principal Software Engineer at Apple**
52 |
53 |
54 | *"This book provides a high-level understanding of patterns with practical code examples needed for all MLOps engineering tasks. This is a must-read for anyone in the field."*
55 |
56 | **-- Brian Ray, Global Head of Data Science and Artificial Intelligence at Eviden**
57 |
58 |
59 | *"This book weaves together concepts from distributed systems, machine learning, and site reliability engineering in a way that’s approachable for beginners and that’ll excite and inspire experienced practitioners. As soon as I finished reading, I was ready to start building."*
60 |
61 | **-- James Lamb, Staff Data Engineer at SpotHero**
62 |
63 |
64 | *"Whatever your role is in the data ecosystem (scientist, analyst, or engineer), if you are looking to take your knowledge and skills to the next level, then this book is for you. This book is an amazing guide to the concepts and state-of-the-art when it comes to designing resilient and scalable ML systems for both training and serving models. Regardless of what platform you may be working with, this book teaches you the patterns you should be familiar with when trying to scale out your systems."*
65 |
66 | **-- Ryan Russon, Senior Manager of Model Training at Capital One**
67 |
68 |
69 | *"AI is the new electricity, and distributed systems is the new power grid. Whether you are a research scientist, engineer, or product developer, you will find the best practices and recipes in this book to scale up your greatest endeavors."*
70 |
71 | **-- Linxi "Jim" Fan, Senior AI Research Scientist at NVIDIA, Stanford PhD**
72 |
73 | *"This book discusses various architectural approaches to tackle common data science problems such as scaling machine learning processes and building robust workflows and pipelines. It serves as an excellent introduction to the world of MLOps for data scientists and ML engineers who want to enhance their knowledge in this field."*
74 |
75 | **-- Rami Krispin, Senior Data Science and Engineering Manager**
76 |
77 | *"无论是新手还是专家,这本书都将引领你构建强大的机器学习系统,进而掌握分布式机器学习、自动化工具和大规模工作负载管理的要点。让你的机器学习之旅更上一层楼!"*
78 |
79 | **-- 高策,TensorChord CEO,Kubeflow 社区维护者**
80 |
81 | *"这是一本关于在分布式环境下开发、运行和管理机器学习系统的全面手册。作者详尽地阐述了从数据分区、采集、模型训练到服务和工作流程管理等一系列关键主题。通过使用现实世界中的案例,本书深入浅出地讲解了人工智能与机器学习领域用到的核心软件、系统和平台,涵盖了 PyTorch、TensorFlow、Kubeflow、Argo Workflows 和 Kubernetes 等。无论是算法工程师、系统工程师还是架构师,都能从中获得开发和维护分布式机器学习系统所需的全方位知识。我将此书极力推荐给所有对机器学习有着浓厚兴趣和实践需求的专业人士!"*
82 |
83 | **-- 陈源,NVIDIA 主任工程师**
84 |
85 | *"很高兴看到这本书能在国内出版。随着 ChatGPT 等工具和技术的爆火,AI技术迎来了又一波爆发期。与此同时,Kubernetes 等云原生技术作为基础设施的事实标准也再次在本轮技术热潮中成为首选项。这本书介绍了很多结合云原生和分布式技术进行机器学习的方法和案例,推荐对这方面感兴趣的读者进行阅读。"*
86 |
87 | **-- 张晋涛,Kong Inc., Microsoft MVP, CNCF Ambassador**
88 |
--------------------------------------------------------------------------------
/_config.yml:
--------------------------------------------------------------------------------
1 | theme: jekyll-theme-architect
2 | plugins:
3 | - jekyll-relative-links
4 | relative_links:
5 | enabled: true
6 | collections: true
7 | include:
8 | - README.md
9 |
--------------------------------------------------------------------------------
/code/README.md:
--------------------------------------------------------------------------------
1 | ## Installation
2 |
3 | * Install Python.
4 | * Run the following to install the necessary Python packages:
5 |
6 | ```bash
7 | pip install tensorflow tensorflow_io
8 | ```
9 |
10 | ## Instructions
11 |
12 | * All code snippets are organized by chapter and listing title. For example, `chapter-2/listing-2-1.py` is for Listing 2.1 in Chapter 2.
13 | * Files with `*.py` extension can be executed via `python *.py`.
14 | * Files with `*.txt` extension are pseudo-code and are not meant to be executed.
15 |
--------------------------------------------------------------------------------
/code/chapter-2/listing-2-1.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf #A
2 |
3 | train, test = tf.keras.datasets.fashion_mnist.load_data() #B
4 |
5 | # Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-labels-idx1-ubyte.gz
6 | # 32768/29515 [=================================] - 0s 0us/step
7 | # Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-images-idx3-ubyte.gz
8 | # 26427392/26421880 [==============================] - 0s 0us/step
9 | # Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-labels-idx1-ubyte.gz
10 | # 8192/5148 [===============================================] - 0s 0us/step
11 | # Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-images-idx3-ubyte.gz
12 | # 4423680/4422102 [==============================] - 0s 0us/step
13 |
14 | #A Load TensorFlow library.
15 | #B Download the Fashion-MNIST dataset and then load it into memory.
16 |
--------------------------------------------------------------------------------
/code/chapter-2/listing-2-2.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 |
3 | images, labels = train #A
4 | images = images/255 #B
5 |
6 | dataset = tf.data.Dataset.from_tensor_slices((images, labels)) #C
7 | dataset #D
8 | #
9 |
10 | #A Split the training dataset object into images and labels.
11 | #B Normalize the images.
12 | #C Load in-memory array representation into a tf.data.Dataset object that will make it easier to use for training in TensorFlow.
13 | #D Take a look at the information of the dataset such as shapes and data types.
14 |
--------------------------------------------------------------------------------
/code/chapter-2/listing-2-3.py:
--------------------------------------------------------------------------------
1 | import tensorflow_io as tfio #A
2 |
3 | d_train = tfio.IODataset.from_mnist( #B
4 | 'http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz',
5 | 'http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz')
6 |
7 | #A Load TensorFlow I/O library.
8 | #B Load the MNIST dataset from a URL to access dataset files directly without downloading via HTTP file system support.
9 |
--------------------------------------------------------------------------------
/code/chapter-2/listing-2-4.py:
--------------------------------------------------------------------------------
1 | import os #A
2 | import tensorflow_io as tfio #B
3 |
4 | endpoint="postgresql://{}:{}@{}?port={}&dbname={}".format( #C
5 | os.environ['TFIO_DEMO_DATABASE_USER'],
6 | os.environ['TFIO_DEMO_DATABASE_PASS'],
7 | os.environ['TFIO_DEMO_DATABASE_HOST'],
8 | os.environ['TFIO_DEMO_DATABASE_PORT'],
9 | os.environ['TFIO_DEMO_DATABASE_NAME'],
10 | )
11 |
12 | dataset = tfio.experimental.IODataset.from_sql( #D
13 | query="SELECT co, pt08s1 FROM AirQualityUCI;",
14 | endpoint=endpoint)
15 | print(dataset.element_spec) #E
16 | # {'co': TensorSpec(shape=(), dtype=tf.float32, name=None), 'pt08s1': TensorSpec(shape=(), dtype=tf.int32, name=None)}
17 |
18 | #A Load Python’s built-in OS library for loading environment variables related to the PostgreSQL database.
19 | #B Load TensorFlow I/O library.
20 | #C Construct the endpoint for accessing the PostgreSQL database.
21 | #D Select two columns from the AirQualityUCI table in the database and instantiate a tf.data.Dataset object.
22 | #E Inspect the specification of the dataset such as the shape and data type for each column.
23 |
--------------------------------------------------------------------------------
/code/chapter-2/listing-2-5.txt:
--------------------------------------------------------------------------------
1 | batch = read_next_batch(dataset) #A
2 | while batch is not None:
3 | model.train(batch) #B
4 | batch = read_next_batch(dataset) #C
5 |
6 | #A Read the next batch in the dataset.
7 | #B Train the model with this batch.
8 | #C Read the next batch once we are done training with the current batch.
9 |
--------------------------------------------------------------------------------
/code/chapter-2/listing-2-6.txt:
--------------------------------------------------------------------------------
1 | if get_worker_rank() == 0: #A
2 | create_and_send_shards(dataset) #A
3 | shard = read_next_shard_locally() #B
4 | while shard is not None:
5 | model.train(shard) #C
6 | shard = read_next_shard_locally() #D
7 |
8 | #A Create and send shards to all other worker machines from the worker machine with rank 0.
9 | #B Read the next shard available locally in this worker machine.
10 | #C Train the model using the shard we just read from the worker machine locally.
11 | #D Read the next shard once we are done training with the current shard.
12 |
--------------------------------------------------------------------------------
/code/chapter-2/listing-2-7.txt:
--------------------------------------------------------------------------------
1 | batch = read_next_batch(dataset) #A
2 | cache = initialize_cache(batch) #B
3 | while batch is not None: #C
4 | model.train(batch) #C
5 | cache.append(batch) #C
6 | batch = read_next_batch(dataset)
7 | while current_epoch() <= total_epochs: #D
8 | batch = cache.read_next_batch() #D
9 | model.train(batch) #D
10 |
11 | #A Read the next batch of the dataset.
12 | #B Initialize the cache for this batch.
13 | #C Train the model by iterating through the batches.
14 | #D Train the model for additional epochs using the batches that were cached previously.
15 |
--------------------------------------------------------------------------------
/code/chapter-2/listing-2-8.txt:
--------------------------------------------------------------------------------
1 | batch = read_next_batch(dataset)
2 | cache = initialize_cache(preprocess(batch)) #A
3 | while batch is not None:
4 | batch = preprocess(batch)
5 | model.train(batch)
6 | cache.append(batch)
7 | batch = read_next_batch(dataset)
8 | while current_epoch() <= total_epochs:
9 | processed_batch = cache.read_next_batch() #B
10 | model.train(processed_batch) #B
11 |
12 | #A Initialize the cache with the preprocessed batch.
13 | #B Retrieve the processed batch from the cache and use it for model training.
14 |
--------------------------------------------------------------------------------
/code/project/.gitignore:
--------------------------------------------------------------------------------
1 | trained_model/
2 | istio-*
3 |
--------------------------------------------------------------------------------
/code/project/README.md:
--------------------------------------------------------------------------------
1 | # Project Setup
2 |
3 | ## Cluster
4 |
5 | ```
6 | cd project/
7 | ```
8 |
9 | Via `kind`:
10 |
11 | ```
12 | go install sigs.k8s.io/kind@v0.17.0
13 | kind create cluster --name distml --image kindest/node:v1.25.3
14 | ```
15 |
16 | Or via `k3d`:
17 |
18 | ```
19 | k3d cluster create distml --image rancher/k3s:v1.25.3-k3s1
20 | ```
21 |
22 |
23 | ```
24 | kubectl create ns kubeflow
25 | kubectl config set-context --current --namespace=kubeflow
26 | kubectl kustomize manifests | kubectl apply -f -
27 | ```
28 |
29 | # Run Workflow
30 |
31 | See instructions [here](https://github.com/terrytangyuan/distributed-ml-patterns/blob/main/code/project/code/README.md).
32 |
33 | # Clean-up
34 |
35 | ```
36 | k3d cluster delete distml
37 | kind delete cluster --name distml
38 | ```
39 |
--------------------------------------------------------------------------------
/code/project/basics/argo-coinflip.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: argoproj.io/v1alpha1
2 | kind: Workflow
3 | metadata:
4 | generateName: coinflip-
5 | spec:
6 | serviceAccountName: argo
7 | entrypoint: coinflip
8 | templates:
9 | - name: coinflip
10 | steps:
11 | - - name: flip-coin
12 | template: flip-coin
13 | - - name: heads
14 | template: heads
15 | when: "{{steps.flip-coin.outputs.result}} == heads"
16 | - name: tails
17 | template: tails
18 | when: "{{steps.flip-coin.outputs.result}} == tails"
19 |
20 | - name: flip-coin
21 | script:
22 | image: python:alpine3.6
23 | command: [python]
24 | source: |
25 | import random
26 | result = "heads" if random.randint(0,1) == 0 else "tails"
27 | print(result)
28 |
29 | - name: heads
30 | container:
31 | image: alpine:3.6
32 | command: [sh, -c]
33 | args: ["echo \"it was heads\""]
34 |
35 | - name: tails
36 | container:
37 | image: alpine:3.6
38 | command: [sh, -c]
39 | args: ["echo \"it was tails\""]
40 |
--------------------------------------------------------------------------------
/code/project/basics/argo-dag-diamond.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: argoproj.io/v1alpha1
2 | kind: Workflow
3 | metadata:
4 | generateName: dag-diamond-
5 | spec:
6 | serviceAccountName: argo
7 | entrypoint: diamond
8 | templates:
9 | - name: echo
10 | inputs:
11 | parameters:
12 | - name: message
13 | container:
14 | image: alpine:3.7
15 | command: [echo, "{{inputs.parameters.message}}"]
16 | - name: diamond
17 | dag:
18 | tasks:
19 | - name: A
20 | template: echo
21 | arguments:
22 | parameters: [{name: message, value: A}]
23 | - name: B
24 | dependencies: [A]
25 | template: echo
26 | arguments:
27 | parameters: [{name: message, value: B}]
28 | - name: C
29 | dependencies: [A]
30 | template: echo
31 | arguments:
32 | parameters: [{name: message, value: C}]
33 | - name: D
34 | dependencies: [B, C]
35 | template: echo
36 | arguments:
37 | parameters: [{name: message, value: D}]
38 |
--------------------------------------------------------------------------------
/code/project/basics/argo-hello-world.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: argoproj.io/v1alpha1
2 | kind: Workflow
3 | metadata:
4 | generateName: hello-world-
5 | spec:
6 | entrypoint: whalesay
7 | serviceAccountName: argo
8 | templates:
9 | - name: whalesay
10 | container:
11 | image: docker/whalesay
12 | command: [cowsay]
13 | args: ["hello world"]
14 |
--------------------------------------------------------------------------------
/code/project/basics/argo-resource-template.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: argoproj.io/v1alpha1
2 | kind: Workflow
3 | metadata:
4 | generateName: k8s-resource-
5 | spec:
6 | entrypoint: k8s-resource
7 | serviceAccountName: argo
8 | templates:
9 | - name: k8s-resource
10 | resource:
11 | action: create
12 | manifest: |
13 | apiVersion: v1
14 | kind: ConfigMap
15 | metadata:
16 | name: cm-example
17 | data:
18 | some: value
19 |
--------------------------------------------------------------------------------
/code/project/basics/argo-script-template.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: argoproj.io/v1alpha1
2 | kind: Workflow
3 | metadata:
4 | generateName: script-tmpl-
5 | spec:
6 | entrypoint: gen-random-int
7 | serviceAccountName: argo
8 | templates:
9 | - name: gen-random-int
10 | script:
11 | image: python:alpine3.6
12 | command: [python]
13 | source: |
14 | import random
15 | i = random.randint(1, 100)
16 | print(i)
17 |
--------------------------------------------------------------------------------
/code/project/basics/hello-world.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Pod
3 | metadata:
4 | name: whalesay
5 | spec:
6 | containers:
7 | - name: whalesay
8 | image: docker/whalesay:latest
9 | command: [cowsay]
10 | args: ["hello world"]
11 |
--------------------------------------------------------------------------------
/code/project/basics/tfjob.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: kubeflow.org/v1
2 | kind: TFJob
3 | metadata:
4 | namespace: kubeflow
5 | generateName: distributed-tfjob-
6 | spec:
7 | tfReplicaSpecs:
8 | Worker:
9 | replicas: 2
10 | restartPolicy: OnFailure
11 | template:
12 | spec:
13 | containers:
14 | - name: tensorflow
15 | image: gcr.io/kubeflow-ci/tf-mnist-with-summaries:1.0
16 | command:
17 | - "python"
18 | - "/var/tf_mnist/mnist_with_summaries.py"
19 | - "--log_dir=/train/metrics"
20 | - "--learning_rate=0.01"
21 | - "--batch_size=100"
22 |
--------------------------------------------------------------------------------
/code/project/code/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.9
2 |
3 | RUN pip install tensorflow==2.11.0 tensorflow_datasets==4.7.0
4 |
5 | COPY data-ingestion.py /
6 | COPY predict-service.py /
7 | COPY model-selection.py /
8 | COPY multi-worker-distributed-training.py /
9 |
--------------------------------------------------------------------------------
/code/project/code/README.md:
--------------------------------------------------------------------------------
1 | # Multi-worker Distributed Training
2 |
3 | ## Setup
4 |
5 | ```
6 | cd project/code
7 | ```
8 |
9 | Build the image
10 | ```
11 | docker build -f Dockerfile -t kubeflow/multi-worker-strategy:v0.1 .
12 | # If using k3d
13 | k3d image import kubeflow/multi-worker-strategy:v0.1 --cluster distml
14 | # If using kind
15 | kind load docker-image kubeflow/multi-worker-strategy:v0.1 --name distml
16 | ```
17 |
18 | Switch to "kubeflow" namespace:
19 | ```
20 | kubectl config set-context --current --namespace=kubeflow
21 | ```
22 |
23 | Specify your storageClassName and create a persistent volume claim to save
24 | models and checkpoints
25 | ```
26 | kubectl create -f multi-worker-pvc.yaml
27 | ```
28 |
29 | ## Submitting Training Job
30 |
31 | Create a TFJob:
32 | ```
33 | kubectl create -f multi-worker-tfjob.yaml
34 | ```
35 |
36 | After making code changes, run the following to resubmit the job:
37 | ```
38 | kubectl delete tfjob --all; docker build -f Dockerfile -t kubeflow/multi-worker-strategy:v0.1 .; kind load docker-image kubeflow/multi-worker-strategy:v0.1 --name distml; kubectl create -f multi-worker-tfjob.yaml
39 | ```
40 |
41 | ## Model loading & prediction
42 |
43 | ```
44 | kubectl create -f predict-service.yaml
45 | kubectl exec --stdin --tty predict-service -- /bin/bash
46 | python3 /predict-service.py
47 | ```
48 |
49 | ## Model selection
50 |
51 | ```
52 | python3 /model-selection.py
53 | ```
54 |
55 | ## Model serving
56 |
57 | ```
58 | # Install KServe
59 | curl -s "https://raw.githubusercontent.com/kserve/kserve/v0.10.0-rc1/hack/quick_install.sh" | bash
60 |
61 | # Create inference service
62 | kubectl create -f inference-service.yaml
63 |
64 | # https://kserve.github.io/website/master/get_started/first_isvc/#4-determine-the-ingress-ip-and-ports
65 | INGRESS_GATEWAY_SERVICE=$(kubectl get svc --namespace istio-system --selector="app=istio-ingressgateway" --output jsonpath='{.items[0].metadata.name}')
66 | kubectl port-forward --namespace istio-system svc/${INGRESS_GATEWAY_SERVICE} 8080:80
67 | # start another terminal
68 | export INGRESS_HOST=localhost
69 | export INGRESS_PORT=8080
70 |
71 | MODEL_NAME=flower-sample
72 | INPUT_PATH=@./inference-input.json
73 | SERVICE_HOSTNAME=$(kubectl get inferenceservice ${MODEL_NAME} -o jsonpath='{.status.url}' | cut -d "/" -f 3)
74 | curl -v -H "Host: ${SERVICE_HOSTNAME}" "http://${INGRESS_HOST}:${INGRESS_PORT}/v1/models/$MODEL_NAME:predict" -d $INPUT_PATH
75 |
76 | ## TODO: gRPC serving. Not working yet
77 | # Client-side requirements
78 | python3 -m pip install tensorflow-metal
79 | python3 -m pip install tensorflow-macos==2.11.0
80 | python3 -m pip install tensorflow-serving-api==2.11.0
81 | ```
82 |
83 | Autoscaled inference service:
84 | ```
85 | # https://github.com/rakyll/hey
86 | brew install hey
87 | kubectl create -f autoscaled-inference-service.yaml
88 |
89 | hey -z 30s -c 5 -m POST -host ${SERVICE_HOSTNAME} -D inference-input.json "http://${INGRESS_HOST}:${INGRESS_PORT}/v1/models/$MODEL_NAME:predict"
90 | ```
91 |
92 | ## Workflow
93 |
94 | ```
95 | kubectl create -f workflow.yaml
96 | ```
97 |
98 | ## Debugging
99 |
100 | Access the trained model
101 | ```
102 | kubectl create -f access-model.yaml
103 | kubectl exec --stdin --tty access-model -- ls /trained_model
104 | # Manually copy
105 | # kubectl cp trained_model access-model:/pv/trained_model -c model-storage
106 | ```
107 |
108 | Run TFServing commands in the KServe container:
109 | ```
110 | kubectl exec --stdin --tty flower-sample-predictor-default-00001-deployment-84759dfc5f6wfj -c kserve-container -- /usr/bin/tensorflow_model_server --model_name=flower-sample \
111 | --port=9000 \
112 | --rest_api_port=8080 \
113 | --model_base_path=/mnt \
114 | --rest_api_timeout_in_ms=60000
115 | ```
116 |
117 | ## Cleanup
118 |
119 | ```
120 | kubectl delete tfjob --all
121 | kubectl delete wf --all
122 | kubectl delete inferenceservice flower-sample
123 | kubectl delete pods --selector=app=flower-sample-predictor-default-00001 --force --grace-period=0
124 | kubectl delete pod access-model --force --grace-period=0
125 | kubectl delete pod predict-service --force --grace-period=0
126 | kubectl delete pvc strategy-volume
127 | ```
128 |
129 |
--------------------------------------------------------------------------------
/code/project/code/access-model.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Pod
3 | metadata:
4 | name: access-model
5 | spec:
6 | containers:
7 | - name: model-storage
8 | image: alpine:latest
9 | command: ['sleep', 'infinity']
10 | volumeMounts:
11 | - name: model
12 | mountPath: /trained_model
13 | volumes:
14 | - name: model
15 | persistentVolumeClaim:
16 | claimName: strategy-volume
17 |
--------------------------------------------------------------------------------
/code/project/code/autoscaled-inference-service.yaml:
--------------------------------------------------------------------------------
1 | # https://kserve.github.io/website/master/modelserving/autoscaling/autoscaling/#create-inferenceservice
2 | apiVersion: serving.kserve.io/v1beta1
3 | kind: InferenceService
4 | metadata:
5 | name: flower-sample
6 | spec:
7 | predictor:
8 | # https://kserve.github.io/website/master/reference/api/#serving.kserve.io/v1beta1.ComponentExtensionSpec
9 | scaleTarget: 1
10 | scaleMetric: concurrency
11 | model:
12 | modelFormat:
13 | name: tensorflow
14 | # This is only needed on Mac M1
15 | image: "emacski/tensorflow-serving:2.6.0"
16 | storageUri: "pvc://strategy-volume/saved_model_versions"
17 |
--------------------------------------------------------------------------------
/code/project/code/data-ingestion.py:
--------------------------------------------------------------------------------
1 | import tensorflow_datasets as tfds
2 |
3 | datasets, _ = tfds.load(name='fashion_mnist', with_info=True, as_supervised=True)
4 |
--------------------------------------------------------------------------------
/code/project/code/http-inference-request.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import json
3 |
4 | input_path = "inference-input.json"
5 |
6 | with open(input_path) as json_file:
7 | data = json.load(json_file)
8 |
9 | r = requests.post(url="http://localhost:8080/v1/models/flower-sample:predict", data=json.dumps(data), headers={'Host': 'flower-sample.kubeflow.example.com'})
10 | print(r.text)
11 |
--------------------------------------------------------------------------------
/code/project/code/inference-client.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import json
3 | import base64
4 | import grpc
5 |
6 | from tensorflow import make_tensor_proto
7 | from tensorflow_serving.apis import predict_pb2
8 | from tensorflow_serving.apis import prediction_service_pb2_grpc
9 |
10 |
11 | def predict(host, port, hostname, model, signature_name, input_path):
12 | # If hostname not set, we assume the host is a valid knative dns.
13 | if hostname:
14 | host_option = (('grpc.ssl_target_name_override', hostname,),)
15 | else:
16 | host_option = None
17 | channel = grpc.insecure_channel(target='{host}:{port}'.format(host=host, port=port), options=host_option)
18 | stub = prediction_service_pb2_grpc.PredictionServiceStub(channel)
19 | with open(input_path) as json_file:
20 | data = json.load(json_file)
21 | image = data['instances'][0]['image_bytes']['b64']
22 | key = data['instances'][0]['key']
23 |
24 | # Call classification model to make prediction
25 | request = predict_pb2.PredictRequest()
26 | request.model_spec.name = model
27 | request.model_spec.signature_name = signature_name
28 | image = base64.b64decode(image)
29 | request.inputs['image_bytes'].CopyFrom(
30 | make_tensor_proto(image, shape=[1]))
31 | request.inputs['key'].CopyFrom(make_tensor_proto(key, shape=[1]))
32 |
33 | result = stub.Predict(request, 10.0)
34 | print(result)
35 |
36 |
37 | if __name__ == '__main__':
38 | parser = argparse.ArgumentParser()
39 | parser.add_argument('--host', help='Ingress Host Name', default='localhost', type=str)
40 | parser.add_argument('--port', help='Ingress Port', default=80, type=int)
41 | parser.add_argument('--model', help='TensorFlow Model Name', type=str)
42 | parser.add_argument('--signature_name', help='Signature name of saved TensorFlow model',
43 | default='serving_default', type=str)
44 | parser.add_argument('--hostname', help='Service Host Name', default='', type=str)
45 | parser.add_argument('--input_path', help='Prediction data input path', default='./input.json', type=str)
46 |
47 | args = parser.parse_args()
48 | predict(args.host, args.port, args.hostname, args.model, args.signature_name, args.input_path)
49 |
--------------------------------------------------------------------------------
/code/project/code/inference-input.json:
--------------------------------------------------------------------------------
1 | {
2 | "instances":[
3 | {
4 | "image_bytes":{
5 | "b64":"/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBwcJCQgKDBQNDAsLDBkSEw8UHRofHh0aHBwgJC4nICIsIxwcKDcpLDAxNDQ0Hyc5PTgyPC4zNDL/2wBDAQkJCQwLDBgNDRgyIRwhMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjL/wAARCAErASsDASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/8QAHwEAAwEBAQEBAQEBAQAAAAAAAAECAwQFBgcICQoL/8QAtREAAgECBAQDBAcFBAQAAQJ3AAECAxEEBSExBhJBUQdhcRMiMoEIFEKRobHBCSMzUvAVYnLRChYkNOEl8RcYGRomJygpKjU2Nzg5OkNERUZHSElKU1RVVldYWVpjZGVmZ2hpanN0dXZ3eHl6goOEhYaHiImKkpOUlZaXmJmaoqOkpaanqKmqsrO0tba3uLm6wsPExcbHyMnK0tPU1dbX2Nna4uPk5ebn6Onq8vP09fb3+Pn6/9oADAMBAAIRAxEAPwC9A42ir9vA0nOOKxYJhkDqe1bNvO0ZAYdaIsC8LLjOwH60yWDAwY1/75qzDcDAz0qfhl55BqxGE1pCzZwVPt0qJ7MgZQbh7da1Z7bncBVQgoaVhlGFvKlBIwOhqxPFxkdKmdY5xiQYP94daaqtGPKkO5P4X/pU2AoKMMQatWv+tAPXpTJ4ipyBTVYqwYHBFTezA1ivHNRsuRU1tOlymOBIOo9aVoyGNaCIEHanEEEMKXbg07BAx2NICXO5KrvwcVPEcDFRyD5qTYDYhuPuKnA4waitxmQj1FWGX9Ka2ArODzUXU5qxIM81AODzUtjGzHMfvVRcl6mmOMio4V3PSAtwjBUd60l+6DVCMAzH2q6D8v0qo7CIJ3xmsqQ8kmtC5YAVmyctntSbGRkDOT0qWMFyABUWNzD0q5EuxM9zQgJQAqgCkJxS9vemMasA3c8CpFPHNRBgBkinBvSpuBMGxRnPWo1561IOlMBQMEU2R8DFKW2rk1XdsmgCN+TmqskuHIqeUhVNZMkoZyckZqQILTi5UntzWtHMOVbpWQh2zCr6jIBpRGzUjl2jBPHY1chuSODyKx4pOzdKnVyh68VYjbDBlyvSq88G4bhVeG4Kkc8HrV3eGUEVQjLkUr+FRmQgYzV+aMODxzWdIpU0mMerh1wahdCpPvTN21gQamB3jB+qn1rOQDI5GjcMvBFbdvMt1FkfeHWsJhzU1pcG3nDZ4PWlGVgNd4+MigL8uKscMgdeVNRsAORVsRGFwc1G45qfKg/MM/U0jLG3RQPxNS2BCh2OG9DVxwM57GqxRQc8j9asp80I5zjiiIyu64zVdhxVtwMVVak2BUlOTUlumATTXXmpPux0r6AS2vLv7GrLNtFVbM/K596knbgGqT0AqXLZeqbgsRU8x96hJzgCk2A+JPmA61PA4mUSL9wk7fcetULtmEMdvGSJrltgI6hQMsfwH6kVqRIsUaqgAVQAAOwFUgEJ7UwDOc1Ky55/OmtgcCi4EZ6UqqSc0Hk4p46igB44pQaaM5NI7hVx3qkA2V8nHaoAdzE9hTZHOMd6ZczfZoQq/fNDArahcgAxLyf4iKzs0OxJ5696ZUDQP97NaVsdyg+1IPszHlFzU8SRg4jGB6VSQh3linp02mpQm5enNJs9aoBoynfir
MFwVOD0qADjDUn3W9qANIsGGQeKqXCK3PekjlIOCeKfJyN1AGXIMZFNik6xscc5U+hqxMgbPrVFwVas2BezvXOMOPvCo2GD7UyOXOG/iHX3p8hGzdn6Vm0M0rG8ZLYxtzz8pp0lyx/iNZUMpzzVkturURKZGP8AEfzpRMw6Nmq5HvTMspz1pAaUVzzhjiptxjPmRnjuKyBNzzxU8NwUbDcqaXoBreYJU3L+VVn5zTEcRvkHKNUjcE4qZdwITyabK3yGpG4GaqzN+7qG9ALNicwn/eNE75UgU2zb/RQfc0krY4rS+gFZgcc0iKM+9Kc81FcI727QxnDyjbu9AepoWrAZpv8Apt7NqB5jA8mAeig/MfxOPyFa4HFQ20KW8KQxrhEUKB7VYXFWAvlkhSDx1PvUchwSAKlD7Uyep6CoS3UnrU9QGHg0DJ5xSb8mjdjvVAOZ9oqs75JOaJX3Hg1GBmmAKRuLt0Xms24lMshbt2qxezgDyEPuxqkxpNjImo4pGOOarmbk0gJvMINWIbp42BB6VBLC0Z9qjVsGjYDqrWVJ4w6n6j0qcxbh71ztndtbyBlPHcetdLayx3Me+Ns+o7irTuJkDRDvwaYVIODWg0IYc9agkgBGDTEUyCv0pwc4wac8ZTg9KjYFRSAil6ZxVOYZFXGPBBqpKKljIFJB61KzFlqJhTkbPBrO+oxysR0qwrkjk1Wxg1IoPBJ4PpSc7BYkOfU0zzHXvke9WNuFBHQ1A/BrKVRoaQm/dweDShyOOtRZB+tAPHNSq6HYv203/LNuh6VeVvkweq1ioxB+lacUm5Nw7jBrVTUoktEsp+SqNw2F4q1I3yCqM5yQKwchpF62Yi0/GkcHgmkh4gAoOSK1UxEfVuKdbKZH8zseF+nrTGO0qo6scVciXgYrWG1xDxwcUm/GQKc3FVS3J5qmwHmUkknoKYXJGaZuBzTd1JMB4PBzxUUkhPApWcnpTFUk1SAdGrOcVW1S/TTbbIwZW4Rff1rQXbEmSefSqC6bHPdNd3Q86U/dDfdQegFUBkWYurnlI2YseWPetSPSZW5llC+yjJrUVABjt6CnHihJICkulWqcsGf/AHjUwt4QMCGMD/cFSM4HWm+YKq4HO/aAww3NRuqtypxTNpFIMisrjFDFDg1ctbySCQSRsQf5/Wqm4MOaT6GlewHY2OrQXWFciOT0J4NaDRq68/nXBLIRwa0bTVLi2wBIWX0bmqU+4rHSSQcFW5HY1TkiKZBGRUtrq8NyNr8N6VaYJIvysCPar0YjGdMDOeKrSL6ng1pXFuUJxyDVCSMgH0qGMqMvao84NTOp61XYkHmspDRYXBxnpSFjG2DyKbEdwK96eR5ilT95elZSKRYglBHlseD0NJKpU4NUlk2nB6VcjlEq7HPzDofWsJTvox2KzcZojbccGnSrgkVCpIkFcdaorblJFg8VctHzlapBs1Ztf9bTw+K01CUS07dvaqMh3TqKulSSTjPHWqCgGdnzkKOtYLGxbDkZoxnEYqR4j5e4HPqKrwncAO2M1X1jVRp+lXFxn7inHu3QCu6E3JKzIaC1lF3qk+05jtgIh/vnlj+AwPzrXUYArnvB0LLoUMshJknZpXJ7kmuhLAZxXcpIgimYjPNVd3HWpJnLHFQgcYp81xjht7U4AGkGFpDPt4QZNaIQ9YiRnoPemng4Tk+tIFdzmRjj0p5dEHFWmAqRfxOeakyAOwquZ93Sk5PJouBM0uKjMrHpUeRn5qQydhRcB3Pc0vHrUYJanbT6UwMX6ikKelafA/gX8qRljPWJfwzScAMsxjNJtx9K0Ht42HykqffkVA8DpzjI9RzUOLQ7lXbijDdRU2zIppX3qRipJ0z1rQttQkj4Ylh655FZjLzQMryDRewHUwXyTphiDnof8aZcQYO5eRXPw3BQ5U4Na1tf5Xa3I9KtSuTYgkjK5x0qpIpwM9K2HRJlLIefSqEsRGR+lRJAUVcxuCO1WpDlROn4iqknTp0p1
tPtYo3KNXJOdtGWlcllUMBIvRuv1oGVOD1FOWIiRIz91zgH0qS6VoNwaMgKeJQMgfWvExGK960TeMRpPmqMnD4796JIQsf7s5lGAQT61YeSOS8SFwFkH3WHRuKq6fFJdajMZ3aMRtgqByT269q4nUnJXb2KskMjUiIS7sl32hfTtWhaArIQ42kdaaY/7PvzG6+dFMSWfHC91+nvS2e+Sab7QORNsUZwG7jn0xmpc3a6FYnDpC4lDEl/4G6DtVa5UqRDEADKTj2Hqas3Nkz6hGZGxCFJLL/FzwKluIykkQgQtC53N3Kn0+maxUrNMvRoiEZWPapAJ71keItPkvdPYiNnjg+cQjOZT/8AW61uOY1cGUcryFzUgkZ0JVsDtXfDGWmm9kZuGhX0qJLXT4IACuyMfKe1WZJQFOKiKlSCWycfnUUjjua9ali41PhZk42EbLHNISFFM83nA5pVXPLV3Ql2IaDBf2FOykQycUySUIMDrVZi0h5NbJiJnuGc4WkVWP3qYo29OtSKrt34q0wHhgo4GaCx/CnLCw561OkJP8NWIrBd/apVtj3qysI74FSZjTimBElsO1S+QKQzf3cCk3v60AUiqEcVG0R7DNPIZT92kLY5yRWgFd0I7UwMyGrLHPYGoXQHkVLAYYUlGUwj/oarMhRyrqQam3FDU6yRzoEl7dG7ioaTAzyuR0phQ+tXJreSLn7yHow6GoNhas2h3K5UA5zSrIUI5qQx5qIoBwRUMZehvMEZbB7GrS3Mdy/lMQk3YH+L6HvWJnHHpSs0c0ZinBKeoOCp9Qe1Tz2CxcvY2jY8VXgYMrhhhf73pTY728geO3ukW9tXztuPusvfDe9aECQmMqE3I/OD1FeVmFaKVlua049yNLooVjLD5MMcjORmrZN3LfmHzVjg2bn3LkMvp9TVeS6htbNXSNTk5RiPSrdhdx3ds0lztIcHJHHHt714cr/FY2JTbwGzDJ80kBym48/5xTZHzLFLLEygryw43+lFvJa/YZF52AZEh5IIpkN6k2yCcOcHdtIxzUaktl10gup02zMqlcFRyM0omRElhk25QYGBzu9qqLav9rl+ySII+q7mzz6UxHiXzYbkL9oHzGQHnd2x7VLj0Fc0Fml+y5kjbY2OT/CakaRSEMY/eE7do71krqBkgWNllCzMOdp/P0q35aQPEYHd9wKlSMnjvUuFtykyeW1ju51WQsjIm38e1V5LWRSiGcDy+Hx/ETzmpxK0kgl2ERYwzHru+lJe2pn8trc4kzyC3BHrSTa0ZRG7IkQIbcvTJ65qsUeZ2H3VXqT6065kkSfyIrZiQMFmHHHU06TKWyq7ZbGSfU16GB91uUnotTOYxIVU/eyaSRivFAxgHOaGw3yn8K9+hVhWjeBi01uQFWY05YSe9L5wXjbUiTqeq4+ldcSWSRwkdRmp1RVGW4qITf3TxSg7jljWqJJRJGvTrSGZj0OBTfK3Hini3brV2Ab5jNxTgCetSCE9+DR5ZxinYBAqjmjzVpPKOOtHlU7AVRLwOhoyHFZyuw6VMsz9aq4FghajYYHBpPNzzRvDD0pMCB8HtioiQOhqV1Peq0kZBz2qWBbgu2i44ZT1U9DVwWttdjdAxjfvH/hWGWZDx0qWO4dCGBII7ip5h2L82nlTgvtP+0KrPZSjkbH+jVft9VhnTyrtQQeN4qO7025C+bp06TL/AM8pDj8mH9RSaT2AyJo2X7yEH3FVJJFUdeadN4hlsJvK1Kxnt+cbiNy/nTzf6XerkBTnupK1yVdFcpDdME8935SofKP3s9vetcmCxTZcDcm4/Oh5APrVWK4kuA0dtHjbznODTzAbiaGV0EinO5Dkfn+NfN4ibqTvLRHTHRE8losVspkKT23VY+4z3z9Ka8FvayRyQiT7G/OGHCmp44omsnW5LbsHdGrcLjgfXtQdQVdOkhYeblNqoozx9K57vYTIbmWAT2zoGUOSGUDCk9jS3Fwl7fpsuFjZUAdgM5PSobm/h
msIbZnVmaQEJjgge/arN1FHdxWoh8qBgx3N/s/TuelVta5DZWN79gmliMgaVMkcEBqnhube4s5FmTMjjcGbruPpisvVIn0u5jE9wkz3PAcLjaKkb7Pp8kEkL7lkByHP3SMcj3q+RWTW7Iua0OpLJYzQyZ+VMFAMMD2xn3plnfva3Dw3aeXMg4UntjrVC9vXk8m+ijXbG20kA7m96fNqCSz29y6EmL5Hcp0zjGah0tNtylI14dTVpZHPIz90jk1Na5aLzvNBc/MExwB6fWq7zW63cExVBI8fGO/PFSXksUcgMZbMv+s2jhPeudxWyNUyO7vfOuo0BYKAGc+vtUVwr3upCFNywxrukfH8q02mha0kAjUqqZB6Z46VAl15kGyFQCVOBn+tVSqcmtgavoQtDDgorlSOnOahkzCu6QjaOd3bFPQmGVEeAmM43yZ6+uKknaPa8IAdG4OfSvewlanJ6JL0MZJmb9ttWOVcyeyKT/TFC3Fy5/0fTZCv964kWIfhjcf0qyMRjCKFHsMU1mYmvWi0ZsehuMfvPIiOekbGT9SF/lUhl2+hqDDkU3Yx5NbJiLH2kjocU8XLf3jVUITRsNXcRdE7H+Ol85/7xqmEYdDThuHencC557g/epftL+oqllqTLU7gQKvNPCkHBNOVkIG5amVI2Aw2KYEITqKTYQfSrnkHqMGlMB/u8UgKTKfc00oSORV4REdRQbfnikwMxoh0IqJodp+WtY24I55NNNvj+GpaAx2hYcg0+C5ntmzHIw9uoNXmtjk4FQSQbe1ZO62KRZTVbe7Qw30KAHuRlTUN5YlSj2RhEWeU8sFT9DVCZAB2qvDcSQMfLkIH90nINcmJqtQa6lRjqaUTNNI1rtWJ8E7umfUVIoffHapcA7Ry5GCB/WoLC4juhKZGjSUHaoc4/HPTFVPImsbo3VyywhMjy2OSwPpivm3FuTT3OnoXLuAxTC3FzuWYFtxGDwelN06eHTmfjYe4cckVMyiOSO4nlWSXd8qfwjIqS6uYZb+1V9nmxhjz9OKm+ljNmJetBJqdw8mYlAHl4UqPekt5tQnuVeOESW6ggEN6VY8Tajb6nHBYxnE5bJbHQVDpMdxbXCWECmYjJGDwfqa6Uv3d2tfP8yGJBqcWpTvHdw7Y0G1S33vfH5VB/ZN5GrtegPDJ8lsQ3IPbP1qxc6PEbGW4Fw0FxHlnjK8euKqy67P/AGfBEkb/ALyQYJGFBHaqjr/D2/IhmlYvcaVcpHeRqpC5A/hI+vrVy1vree/vgPnWQj91jqMAdPrVKG6XVZYoZ2ZPJ+YHg7varl8sVlqdvNZkuZ1KMMZIA57VzySbs9xok0/ZCstvdQsuCRtk6qvbBrQWxKwFldjC7fOCfnVewqnA1trLkytiZMojdCh9SO/NT28k8pksN+LhfldsHGPUf0rCd73+81iaBitksnOPlRSWUnhlxVKxaJnCR2hjTqrO2SfpT2tJlb7MzLJCQCW3YJGf8aa8aWDhZWdl6RcYI+prJbWuakrNiCZdwfa3y4+g/wDr1EkYI5/OkjCG1OxtoLDrT9siDBU49ua9fApaNGM77DvIHrxSiFD1wKZ5rZ9KUEsc170DFj/JGeBSi3p6c8ZqYLnoea3RJWNsMcUwwe1XQhPUU7y+MVYGeYcUhirQ8oDjrSGIdSKYGb5dJ5RrQMIPQU3yPaqAykjPpU6R1l29zOhAT5h/d61s28jSLl49n1NNO4CqpHQ08zGPgcmplVT3psiLTsIbHdIxw6496tCNSuQAR6g1mSpg8Ypsd3JC3ysfpSGanlD0pjBV6kVB9viuE2Sh4mP/AC0jP9Kgk02WXm3vkm/2WO1v8Kl+QE8kkXr+VVnngHVAfqaqTaZqEWS8Dkf7Jz/KqLiQZDRupHqprCU5LoNIvy3cfRUT8qpl43kG9FC9ziqbS7eM0eflSvXNefi5TlHQ1glcnu1DeXFbhDvPDHt70+5toZWG26jeeDBUyc+Zj2HP6UkMUJieG
EM08gyAR04/SqcNtDYXsVzPdr52SPKAzgnjqK8OPrt+JsTBvPuRDdRPEoG5se/Sori2jsZZFeYusg3K5649KtXLPeX0QikChMh3xnI9qy9ctZY9RiWdzJbsu5SOBmrpq7tsZyNEHTbfRZF8tRJtIEnVmPaq1ldtpzjdJkS4IYDkN6fSqul6XaXBkMrtuJIQZ4X3qndLKLuWISGUQpuDIMFe3Pr2rRQjJuN7kGhrerx3LyNEMoRtlYfxN7Cqtiw1eeGzm/dQRfOXHXPQAVGLL9y7yFEeNcqAfvZ9ahluvs9ukcKmK5XBwBwRnk1pGKS5Ybg0dJZG0jtpIZo8BWPzg859aS2ubi2vY5bmMIJlzC56FQeh9D7VmWIE9rJO75kzkN0GRWhLNeajosyx2jTBfmIf5SPcVzyjrZiHyXTx628kUQ8iYgGQdA2Oa2pIUWKO5tpHNxwjlv4/rWNp0M2raaFtisYUhgX7MK0tPiluoj9omEc8ZZQo6bgec1hVVvloaRZZmWW1jFzNIhiYbWIJ+Q1E01rqN0oeSR1UcBBhc/jz+lWbWKa8jIZAsQOJFc8v7AVS02IxFHkCozE+WhOSR3rFWs31Rqi7A0LRSLblWBGACw+U++Kqi4uI5WRAzhTjIU4NTzzWUEx/erHu6j3Hf9azl1GVh8x4J4NdeCS573aImaS3Of8AWwMPcCp0MMnKPj2NZi3b/wB6p470j70aNX0tKatqc7RpCNlORyPapApzmq0N1bN/ejPvyKuo6EcSK1dSVyRytinja3UUmFz1FL8o6EVdgHgKKXYDzTMgdx+dAkIosA4xDsKPJpPN9qPNHrTA5tBs4Xge1So5HrTTG5HCmmiKQ1QF2OUHgnFSk8etU44znk4NW1QbeTzTEV5Bmqrrz3rSdFxxUDL3xUtDKBB7UbmXoSKtNGOuKYYx6VLAE1C5i+5Mw9qe2r3OPnWN/qtRGIdqY0Xqal3AWXUYHH72yjPuKoSzacTuEBQg9hT5oevFZ8sLHPFcde7VmXEn/eeYtzbdeQDnFPu7KVZo5UkhLxsCD94OfbHWqUbSxgx5zGex7VYa2l/s6NxM0pY/u1QZI+mK+dqRcJnQndGhePGbQO2yMR/MvOM//rrOtymqTML5CsbMNik42iq0cTx2ciXAledJVJjbkIvX86YJUmuw5Lquwgdt/wBPWkoct7feTIfYQ20cs+/zwsbsTsU4I7HNJLo9zY28+oROHSVsuO6qff1ratLiA6O0cpHkiPGdwB4//VVW0uWmsxazo/lzZBcnoMfzo9pK7ZNjKvbeC3KusjzRsnC9dvp+FVHhj+zRsvF2W2kdSR3/AAxW42nyaXL9ktla5gnXO6Q8r+fasW3MqXbwsoM7AgZ4xj6/Wt4Surp3GQ6fFK8skUrEKW+6p4BroLLUpo7l7JciUJkuT8uK5mO21CPUGO8tKnLFDnIqzHdOmpSOJCAVG5mHJp1aanfroTY6K0uDpz/ZCWIGXDp3z6+9WLF7i9vLohkiBfKt6ZHSsHQonvdRmubySRguAFJxx9PSukhsltdRk+yN8kxBCHsTXHWSi2upcS3LMdJkiSWT5X4STt+NUY1k/tF4CVBtiWjdTxIjc8ep57U3Uzf7lR7fdCGyCg3dK0LuaKysYVcDzCwWM7eVz3rFaLu2aIpzWUcTPHdReZFIN6S9Gz3z71Sm0+SNRJC5mg6hh1H1FS3LzvcrNNcJKpGFIOFGDyMdj0qJFubWZpbdv3bHJj6r9R6V6OC3tIzkuw1M8c1MrleCaso0F4uWType4Hr/AFpklnJGN33k/vCvbhDsYtiLLz1qZZcdDVbbkcCnAHtW8SS6s7DoxqZZz6ms0PjrUqyVqmwNETE9zThKezGqKy89alDe9UmIti5cd81ILs+gqiGpd4qrgSC4hI+9SfaYecGssI1OCMO2adwNDz4ienNPE47CqSIT2q1FCfwpgTGTd3xTTz3qVIgKlEQ7UxFXy/SnCLParax47VMsa
kdKVgKItQad9i9av4A6CgjNS4gZUlkDxjNUbizCg5wAOSa1r6+t7NP3jZbsg5JrltR1Ca+yp/dxdkHf61hU5UtSkmV5prWRinmYj5yw74qKUyCNYbW4KKRwu7H41UWFmc47dfap7aJ5bwY+Zdp3c8189iYWnzNnTF6WHWt3BZwtbvJvn5J2Atkn1Nal49tdaKN6JuhT5Gz0z1Hsax4IBaXztIuUYbd392kupLaK4SRJVZUO5152nH9a5nFSkmhstyWMFxpyQwnDMR5f/wBeoL++utPgt4ZIkw7ArKG+X3xVgSpNJBLYoZTgsQPlO08d6ytZvTq9qbeFNkdq4355PPAIPpnj64rSlFylaW35EMty31ydUTYyOHUDviMU24kuBDJbG0mdo23m5Vcqozyc9/oKct5ZyaD5EETmXAwEUhg44zmtXSdSh/stLedlLbPLkXqzk55x1NKT5FdR2AyIMLcuYZwUCgs55JaibRp49MM7zI4yGYAc8ntV2Hw1DFpzm2u9kj/vA7DgL6GmWp1P+yWHA2pmNxxkgcAj86PaXd4PqFjVleCLT2n2YaNRtK8MQKp6XqM020mPczyEluwH8+P6UzTFuDp9159s8t42XiMg4ZSMEfzqTSrhNTsrqyMUdtK6HDdAD/jWDgkmnrqWhZXv4tUMrMwRslGU5GPatO5gklaK4wJjt+cDnI6jHrWZps88McNnKMCJXRyeQw7Y/WpVvbqynIIV4WO5UIxtB7D6UKnzTUU7D5rIsX1vFqESmJlSc/dbOAf9k+lYkTy20xjdWRgcMp4wa12tbbUpfPspzbXf8SP91/qP6ipJrZpcRX0RhnAwsvUH8e4r2sNhfZxtuYylcpiXOD1PrV63vcEBzj3qhJDLaNslXGfusOhpEkB4rtjeJD1NkxRycj5Se46GoXt2U5x+VVIrhounK+ladtcLKOPxU9RXRFpk2KZiI5IyKaU9K1WgBGV6GoXg9sVaQijgqc1IrVIY8cVGVwaYDw2aN3tUfIOMUZx1FMColwelTLcetY8EzMil12MRyKtI/vVJjNVJ6sJcZ4BrIV8fxVMk2OlO4jYS4Gcc5qdJWx/jWRHcE9MCrUcjN1ancDSDnuRUitVISKg3OwAHcnFQTauqjbANx/vEcUm0hWNZ5khj3yMFHqayLzV5HBW2XaP75/pVCSaS5cF2Lv2H/wBanraTN94rEPVzj9Kzcm9h2KMgLOWclmPJJ5NMispbs/IpCeta6WllEMyFrh/TotLPMzx7eEjHRF4FZOn3Hcx5beCBcMd2OiL0/E96zri6k6Q/JngBa0bqMYJNUU3JcBYY98jDCrjrXFXg+iNIsqbbrbHGznjgA9s9/rVu6sopIo7bcFt4z8zL1Y+/tV2a0EUCNcYeQDkZ4qqLh50MBUfLli/oo7f0ryqlKpFq+5opJk8NjLqdvLcRwxRAjYrg4woHU+grHmt1tdFuJIiRbtIPNbPMuD8oX2J5/Crbx3Oo6WzjfHHI5IROFYD19aFtfK0R/MkEqRfejJ/iPQD/AD61nB8ujfXYe5HGFn8PGHTS0JmfLGQ8r0OM1c02EmTUXhnie7eMJFt5wcYYj86ppNqEcuLe2BtLlVBQj7uM9PSrSSHS7iK8MLbZplXy2Od2Rg49MD9cU5t2aXXX/hwSGH7Tp2gW0UluxYytG28cBc5x+PrWxFdb7dmtoZjE6kR5GSB0/Qg1nWLtNrV2ZhvhY/JC/IxngAU2Rv7S+aKV7ezAbyVX+IBiCD7ZH61lOKk7P1v6jSJbfUdUknaARIG4UkKScDv+NXLCG2mvJpEjLZdmDdmGc7x7+vvVH+0jZazCiZaBIkDKRyD3x+OKtIrmUgYJD70QDgxtnHPfP9KicdNrXGMuruKe/jmRCPLYDPTI71PehhJluR2rLinxNIjLlCxwO4rbhxc2a5O5lG0n+tehhsLrcznIz1GDuGQ3Wtez1QhPJuR5kXTJ61mvGUbHSmjj8K9SneBk9TpPscFzb
kQuHjP8BPT8e1Yl5pz2bFgCY/UjkfWi3uXhbKMVP1rUi1NpF2yBXHQ5711WjJE6owgeeePrT1ZlYEZBHStZ7fT5udjQk/3Dx+VQPpqE/u7kH03Cj2bWwXLNleiQhJCFY/ka0Gi4rDNjMvRkb6HFalhcvgW9wCrjhWPetI32YmDwioHgz0rTePNQMuKuwjKeEjmm+Wa0mjB7VEYOaLAciBTwxA60pX0oCVBYqyN3qQSEdqjwT7U4KR0paiJ0uGHQVMLyY8LxVUDHWpFzjHXPQDvRdgSmRmO6Qlj6ntVy2s2lAd/kQ9B3NPtLEIFknAz2X0q8TnhapR6sVyEQiJSEG0d/ekEZ/CpxGep6UpPbFOwiuI6jljwOetWSCBmq0uM5zxUtDKM0TSuscalmY4AFaVvYRWEJPDSsPmf/AD2qzY2/lp50g+dhx7CmXj449qhxS1C5g6g5lc/3RU2n6YptHkuB8snUHuOwqWO0Nzcqh+71b6VpXK7k2Lwo4Fc6pJtzY79DnL28nEj+S+yMcAYqsqg2gnmJkVW3yKeuR0rQvbUD5QKqTxFNKmI/vL/hXmYjDato1jIhsZDc2s0wfypSSsag446ULp8MtzBh2PkoN3s3p/WmwxsbSJVGGLFVA75rXMH2BYY4xnAO73NcyoVHzOJfMjOa+hkQpLGY9pZS6KeG7HNPsw9ksdtKo+yWzM5P94MD/ImoJ4AZSSOSckGtJIftGnvGRmRUIX8ulNYZuNkg5tSOaWOe6ilt412ttQkjtmogkmnak7x/xZHPpUGmyAEQsTw4Zfz6V02pWAY7wPxrqw2D91qREp6nLSoRLkjk81oadP5MoVj8j8GmXMJB5FQbSGx2rtjHkehO5vXNvuGQOaznQqeRWjpt2J0+zSn94B8p/vCn3FqRniuvlT1RBkinq5HtTniKE8UzGDQlYLlpZg3B4NSiXFUgTUqscVohFoSZ6GpEmI4PI9KqA5pwY1SYG/DOJUAPWnOtZNvcGNhnpWj52UDKMjvVCGOtR/jUxkVqaVGaAOU8s96UR+lTDHfmjAzUjItgHWkK1IRTcjtSGN2/hWpY2ohAlkHznoD2qCxtwzea3IXpn1rRxk04oQ8ksetSLGKRIwe9TFcAAVYhp4HFAJHbmpQnA9ajkzyAaTArvhs+lMihEtwoxlRyaGHBPartpEIodzfebk1G4EsjhEz7cVkykuxY9e1Xbh9zEdhTLWIPJvP3V6fWolq7DC2hMMOMfvG+8fT2qb7P/e5qdU4J6AcCpjHhQcU7CMO9h3MTj2rLvYtunMv96RR/M/0robtADjHasfUF/dwJ/tFz/L/GuepHdlIj0Wx82VXYfLDkj6mtOa2Lv0qzodvt0/eRy7E/0q48YBAxzVwpJQSBvU5O8tSr9Kdakoy1sX1sOpFZJTyx+NT7NJhczr60MF84TgN86H612cDC+0+Gbj50BP17/rXP3sXnWaTj70R2t/unp+tX/DlySklqT935l+h61UI2k13B7Fe+ttkmMcVjSxlJOK7O8tw65xmudurYhjjtVSgCZQjchgQcMDkH3ro7O5W+t/mx5q/eHr71ze3axJFWLad7aVZU7dR6iiGjBmvPb5BwOaz5IipzitsMlxCJU6NVaaAOpwMGtrEmTjnpSqKlkRgQccios880rDJR0oBpEp7DPIpgOU81agnaM8niqPSnq5HemgNcbW+YdDUm32rNhmKnrxVoS8fepgYGQBTS5zxUXmUbsnrSAk6n1qSOMuwUVEpArRtY9ibm+81CVwJ0QKqqowBxVhIwelMQAnHep1wBgVQiRUxxT9nPJpiMfTJpxbPTigBXGBwaquCRkmpJGO04NVmY49qhgPiTzZVU9Op+lXZZAM1UtPlV3IxuOB9KJZAX46Ck9EBHIST3rQt4wkap36tVS2jMjbyOAePetSKPb1/GiMeoMcibm56CpXXI9qcigCkm4iNNrQDKusFyfwrJvQDPjsigVryYZxnp1rJ5mlyesj5/W
sZrSw0dLYw+XYQr6IKc6j0qaMbVC+gpjjn8a3toIpTx74mB61h3URDV00iA8isq8h+UsB14qXEDNs2RmaCU/u5QUb2z3/CqVpI+n34Zlw0TlJB7dDVhl8t8Gm36b9lwOS/yv7kDr+VZtdSkdaCs0QKkFSMg1kXdttdsUnh673I1q55TlPpWpcxZG/HNarVXJOTnt9ueOtVNpXiuimgDBhisua1OCR2qXEdxdNvTbS7HP7p+vsfWtiYY5HQ965sqy9a1dNvRIn2aU8j7pP8AKqi+giSRBIMj7wqm6ZJBGDVyVdrHtULFXODwabGVFJXqKnVtw5pjrtYhuaFyOnSkBIycVHjHFSqcikZc9KYDQeafvPrUJ4NLvNFwME3IHemG9iTlpAPxrg/7QnbrNIf+BGpraR5ZQoyWY4FZ8wHoWnTx3cp2HKJyT2rdjOTWJpUC2tskI6jlj6mtpGAFaoCynB4HNSphScjJqBHwcipN/HuaYibzMdOtG4EYqEMCPencAHFIBJXAXHaoDlwOwpxy30pBjIHYVLGSb9sfHbpUagyuFHUmo5JMtgdBVqyjyTIe/A+lK12BegQAAAcAcVaUHNMRMKKnXGMitCSUDAqG5bjFTZGCap3DDGaljKFw2I5Gz2x+dUrRN17AuONwqzcnMeP7xqPThnUY/QAn/P51k1eSGjpM4HvUTHrTlPHNROeSexrckc/K8duaguIg8RHerIGUP0pjr8ucdKAOcuYjux3zUGzzIJIT1IyPqK1b2Hdll6is4fLKDjvUNDRSs7g2t3FMP4WGfp3rtjiSMHqCK4adNk7pjjJxXV6Ncefp6AnLINppQ7DYyaHDHaao+ScsOtbE6dGFUnXkkVpYkyprXcM4rPeF4XyMjB4I7V0YUEGopLVXBBHWpcRmfDdi4jCScSAfnUchwxp8thhuOMVE0Mu3ruxQMBJng80dDkdKrsxQ8jFOWQ9c0XAtA57Yp3eoVcHvUm71oAR1zUW2pjg0m2lYDxFQc1saGmdQiz25rPEfNa+iri9U+xrGO4Hd2bYArSjO41j2rHArSSTA4rdMC8JNowKcCTjmqqNuFTqcDmmBMCQODQWLcA8UzOeO1Ix7CgALYXimF9qZNVp5xvWFD8xPzH0FEkmSBUtgSpmSRVHVjW7bxhVAHQcVj6aheZpP7vArbT5acUJkwFTj2qBOmTUobkYqhDyflNUJznirjHGRVC4btSYylcHlR7Zo00/6cfZD/MUydssx7dKTTmH21vdD/MVmviGdErfJULHAIzxmlDfKKimPHHrWxJaTlCO9SDBGCO1QxN8n4VMBwPWgDPuo8ZI6VkSRgOCOhreuANprGuEKscdM8UhmZqCbLkH+8oNaHh6fbO8RPDDIqnqfIhbvgjNRafN5F9C3bdistpDOycZ47VQkADlfyrQPIyOtVLlAQHHUVsSVhgZp+3IBFRZwwJ6Gp044HQ0DIZo8jP51QkXYT7VrlQciqVxH19aGBlzqMhu3Q1WeLbyvFXJB1WoFPGKzkNFfJU09Zh3pzpmoHSlcZZ81fWk89fWqmD2pMUcwWPMQvNXrBjHcRkddwFUhVqDggjrWC3A7W2YgYrRjY4rMtTmNT32itJOgreIFtGx0qdSe/Sq0fT8amXk1YibJPPaqVzfAApFye7Ut8zARqCQrZyB3rO6jmonK2g0iS3bMpYntUzP3qCH+OpO4qUNnQadH5dqmep5NaAJ4qvB9xfpU461siCdTlQTUhOce1Rp92pOxoAbI2Kz5z8w/OtB+1Ztx95vpSYFKU5TNRWLldQUeqGpJfun6VBZf8hNf90/yrNbjOkV8pmoJXOCD1Bp0X+pqK4+6a2EXbdsoAatRnjPeqNt91aux0IRDMODWTcja49DWvN1rJuvvGgZl34zAmezGs8NtYEdjWhf/AOpH1/xrPHU1jLcpHbW0omtY3B4YCkccFT0NVNGJOmLn3q3J0rZbEmbJmOQoenUVLBJztqO+6p9aZGTvBpAaQ
+YfSoJUzk1Knf6UjfcNUBiXSFTVHcUc+hrTvvu/hWY4+UVEhkoIYUx0psR+apm6VBRUZcUYFSuKZgUAf//Z"
6 | }
7 | }
8 | ]
9 | }
--------------------------------------------------------------------------------
/code/project/code/inference-service.yaml:
--------------------------------------------------------------------------------
1 | # https://kserve.github.io/website/modelserving/v1beta1/tensorflow/
2 | apiVersion: serving.kserve.io/v1beta1
3 | kind: InferenceService
4 | metadata:
5 | name: flower-sample
6 | spec:
7 | predictor:
8 | model:
9 | modelFormat:
10 | name: tensorflow
11 | # This is only needed on Mac M1
12 | image: "emacski/tensorflow-serving:2.6.0"
13 | # https://kserve.github.io/website/modelserving/storage/pvc/pvc/
14 | # Note that we are skipping `mountPath: /trained_model`
15 | storageUri: "pvc://strategy-volume/saved_model_versions"
16 |
--------------------------------------------------------------------------------
/code/project/code/model-selection.py:
--------------------------------------------------------------------------------
import numpy as np
import tensorflow as tf
from tensorflow import keras
import tensorflow_datasets as tfds
import shutil
import os


# Scaling MNIST data from (0, 255] to (0., 1.]
def scale(image, label):
    """Cast an image tensor to float32 and rescale pixel values into [0, 1]."""
    image = tf.cast(image, tf.float32)
    image /= 255
    return image, label

# Build the Fashion-MNIST test pipeline ONCE. The original re-ran
# tfds.load() and rebuilt the map/cache/shuffle/batch pipeline inside the
# loop for every candidate model, which is pure repeated work.
datasets, _ = tfds.load(name='fashion_mnist', with_info=True, as_supervised=True)
ds = datasets['test'].map(scale).cache().shuffle(10000).batch(64)

# Evaluate the three candidate models (saved under versions 1..3) and
# remember the one with the highest test accuracy.
best_model_path = ""
best_accuracy = 0
for i in range(1, 4):
    model_path = "trained_model/saved_model_versions/" + str(i)
    model = keras.models.load_model(model_path)
    _, accuracy = model.evaluate(ds)
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model_path = model_path

# Publish the winner as version 4, replacing any previous selection so
# the serving layer always finds the latest pick at a fixed path.
destination = "trained_model/saved_model_versions/4"
if os.path.exists(destination):
    shutil.rmtree(destination)

shutil.copytree(best_model_path, destination)
print("Best model with accuracy %f is copied to %s" % (best_accuracy, destination))
--------------------------------------------------------------------------------
/code/project/code/model-selection.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Pod
3 | metadata:
4 | name: model-selection
5 | spec:
6 | containers:
7 | - name: predict
8 | image: kubeflow/multi-worker-strategy:v0.1
9 | command: ["python", "/model-selection.py"]
10 | volumeMounts:
11 | - name: model
12 | mountPath: /trained_model
13 | volumes:
14 | - name: model
15 | persistentVolumeClaim:
16 | claimName: strategy-volume
17 |
--------------------------------------------------------------------------------
/code/project/code/multi-worker-distributed-training.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import, division, print_function
2 |
3 | import argparse
4 | import json
5 | import os
6 |
7 | import tensorflow_datasets as tfds
8 | import tensorflow as tf
9 | from tensorflow.keras import layers, models
10 |
11 |
def make_datasets_unbatched():
  """Build the unbatched Fashion-MNIST training pipeline.

  Returns the 'train' split with pixel values rescaled from (0, 255]
  into [0, 1], cached in memory and shuffled with a 10000-element buffer.
  """
  shuffle_buffer = 10000

  def rescale(image, label):
    # Scaling MNIST data from (0, 255] to (0., 1.]
    return tf.cast(image, tf.float32) / 255, label

  # Use Fashion-MNIST: https://www.tensorflow.org/datasets/catalog/fashion_mnist
  loaded, _ = tfds.load(name='fashion_mnist', with_info=True, as_supervised=True)
  train_split = loaded['train']
  return train_split.map(rescale).cache().shuffle(shuffle_buffer)
24 |
25 |
def build_and_compile_cnn_model():
  """Create and compile the baseline CNN classifier.

  The input layer takes a 28x28x1 image and is named 'image_bytes' so the
  serving signature can bind to it; the output is a 10-way softmax.
  """
  print("Training CNN model")
  model = models.Sequential([
      layers.Input(shape=(28, 28, 1), name='image_bytes'),
      layers.Conv2D(32, (3, 3), activation='relu'),
      layers.MaxPooling2D((2, 2)),
      layers.Conv2D(64, (3, 3), activation='relu'),
      layers.MaxPooling2D((2, 2)),
      layers.Conv2D(64, (3, 3), activation='relu'),
      layers.Flatten(),
      layers.Dense(64, activation='relu'),
      layers.Dense(10, activation='softmax'),
  ])

  model.summary()

  # Labels are integer class ids, hence the sparse cross-entropy loss.
  model.compile(optimizer='adam',
                loss='sparse_categorical_crossentropy',
                metrics=['accuracy'])

  return model
47 |
# https://d2l.ai/chapter_convolutional-modern/batch-norm.html#concise-implementation
def build_and_compile_cnn_model_with_batch_norm():
  """Create and compile the CNN variant with batch normalization.

  Same layout as the baseline CNN, except each of the first two Conv2D
  layers is followed by BatchNormalization and a sigmoid Activation.
  """
  print("Training CNN model with batch normalization")
  model = models.Sequential([
      layers.Input(shape=(28, 28, 1), name='image_bytes'),
      layers.Conv2D(32, (3, 3), activation='relu'),
      layers.BatchNormalization(),
      layers.Activation('sigmoid'),
      layers.MaxPooling2D((2, 2)),
      layers.Conv2D(64, (3, 3), activation='relu'),
      layers.BatchNormalization(),
      layers.Activation('sigmoid'),
      layers.MaxPooling2D((2, 2)),
      layers.Conv2D(64, (3, 3), activation='relu'),
      layers.Flatten(),
      layers.Dense(64, activation='relu'),
      layers.Dense(10, activation='softmax'),
  ])

  model.summary()

  model.compile(optimizer='adam',
                loss='sparse_categorical_crossentropy',
                metrics=['accuracy'])

  return model
74 |
# https://d2l.ai/chapter_convolutional-modern/alexnet.html
def build_and_compile_cnn_model_with_dropout():
  """Create and compile the CNN variant with a Dropout(0.5) layer.

  Same layout as the baseline CNN, plus dropout after the second pooling
  layer to regularize training.
  """
  print("Training CNN model with dropout")
  model = models.Sequential([
      layers.Input(shape=(28, 28, 1), name='image_bytes'),
      layers.Conv2D(32, (3, 3), activation='relu'),
      layers.MaxPooling2D((2, 2)),
      layers.Conv2D(64, (3, 3), activation='relu'),
      layers.MaxPooling2D((2, 2)),
      layers.Dropout(0.5),
      layers.Conv2D(64, (3, 3), activation='relu'),
      layers.Flatten(),
      layers.Dense(64, activation='relu'),
      layers.Dense(10, activation='softmax'),
  ])

  model.summary()

  model.compile(optimizer='adam',
                loss='sparse_categorical_crossentropy',
                metrics=['accuracy'])

  return model
98 |
99 |
def decay(epoch):
  """Step learning-rate schedule: 1e-3 for epochs 0-2, 1e-4 for 3-6, 1e-5 after."""
  if epoch >= 7:
    return 1e-5
  if epoch >= 3:
    return 1e-4
  return 1e-3
106 |
# https://cloud.google.com/blog/topics/developers-practitioners/add-preprocessing-functions-tensorflow-models-and-deploy-vertex-ai
def _preprocess(bytes_inputs):
  """Decode a JPEG byte string into a single-channel 28x28 uint8 image."""
  image = tf.io.decode_jpeg(bytes_inputs, channels=1)
  image = tf.image.resize(image, size=(28, 28))
  return tf.cast(image, dtype=tf.uint8)
112 |
def _get_serve_image_fn(model):
  # Wrap `model` in a serving function that accepts a 1-D string tensor of
  # raw JPEG bytes (named 'image_bytes') and returns the model's output for
  # the decoded images. Used to build the SavedModel serving signature.
  @tf.function(input_signature=[tf.TensorSpec([None], dtype=tf.string, name='image_bytes')])
  def serve_image_fn(bytes_inputs):
    # Decode/resize each element via _preprocess; result is a uint8 batch.
    decoded_images = tf.map_fn(_preprocess, bytes_inputs, dtype=tf.uint8)
    return model(decoded_images)
  return serve_image_fn
119 |
120 |
def main(args):
  """Train the selected model with MultiWorkerMirroredStrategy and export it.

  args: parsed CLI namespace providing saved_model_dir, checkpoint_dir and
  model_type (one of "cnn", "dropout", "batch_norm").
  """

  # MultiWorkerMirroredStrategy creates copies of all variables in the model's
  # layers on each device across all workers
  # if your GPUs don't support NCCL, replace "communication" with another
  # https://www.tensorflow.org/tutorials/distribute/keras
  strategy = tf.distribute.MultiWorkerMirroredStrategy(
      communication_options=tf.distribute.experimental.CommunicationOptions(implementation=tf.distribute.experimental.CollectiveCommunication.AUTO))

  # Global batch size scales with the replica count so each replica still
  # processes 64 examples per step.
  BATCH_SIZE_PER_REPLICA = 64
  BATCH_SIZE = BATCH_SIZE_PER_REPLICA * strategy.num_replicas_in_sync

  with strategy.scope():
    ds_train = make_datasets_unbatched().batch(BATCH_SIZE).repeat()
    options = tf.data.Options()
    # https://www.tensorflow.org/tutorials/distribute/input
    options.experimental_distribute.auto_shard_policy = \
        tf.data.experimental.AutoShardPolicy.DATA
    ds_train = ds_train.with_options(options)
    # Model building/compiling need to be within `strategy.scope()`.
    if args.model_type == "cnn":
      multi_worker_model = build_and_compile_cnn_model()
    elif args.model_type == "dropout":
      multi_worker_model = build_and_compile_cnn_model_with_dropout()
    elif args.model_type == "batch_norm":
      multi_worker_model = build_and_compile_cnn_model_with_batch_norm()
    else:
      raise Exception("Unsupported model type: %s" % args.model_type)

  # Define the checkpoint directory to store the checkpoints
  checkpoint_dir = args.checkpoint_dir

  # Name of the checkpoint files
  checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

  # Function for decaying the learning rate.
  # You can define any decay function you need.
  # Callback for printing the LR at the end of each epoch.
  class PrintLR(tf.keras.callbacks.Callback):

    def on_epoch_end(self, epoch, logs=None): #pylint: disable=no-self-use
      print('\nLearning rate for epoch {} is {}'.format(
        epoch + 1, multi_worker_model.optimizer.lr.numpy()))

  # TensorBoard logging, per-epoch weight checkpoints, LR scheduling
  # (module-level `decay`), and LR printing.
  callbacks = [
    tf.keras.callbacks.TensorBoard(log_dir='./logs'),
    tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_prefix,
                                       save_weights_only=True),
    tf.keras.callbacks.LearningRateScheduler(decay),
    PrintLR()
  ]

  # Keras' `model.fit()` trains the model with specified number of epochs and
  # number of steps per epoch. Note that the numbers here are for demonstration
  # purposes only and may not sufficiently produce a model with good quality.
  multi_worker_model.fit(ds_train,
                         epochs=1,
                         steps_per_epoch=70,
                         callbacks=callbacks)

  # Saving a model
  # Let `is_chief` be a utility function that inspects the cluster spec and
  # current task type and returns True if the worker is the chief and False
  # otherwise.
  def is_chief():
    # TASK_INDEX is a module-level global set from TF_CONFIG in __main__.
    return TASK_INDEX == 0

  if is_chief():
    model_path = args.saved_model_dir

  else:
    # Save to a path that is unique across workers.
    model_path = args.saved_model_dir + '/worker_tmp_' + str(TASK_INDEX)

  multi_worker_model.save(model_path)


  # Attach a serving signature that accepts raw JPEG byte strings so the
  # deployed model can be queried with encoded images (see _get_serve_image_fn).
  signatures = {
      "serving_default": _get_serve_image_fn(multi_worker_model).get_concrete_function(
          tf.TensorSpec(shape=[None], dtype=tf.string, name='image_bytes')
      )
  }

  # https://www.tensorflow.org/api_docs/python/tf/saved_model/save
  tf.saved_model.save(multi_worker_model, model_path, signatures=signatures)
206 |
207 |
if __name__ == '__main__':
  os.environ['NCCL_DEBUG'] = 'INFO'

  tfds.disable_progress_bar()

  # To decide if a worker is chief, get TASK_INDEX from the cluster info in
  # TF_CONFIG. Fall back to index 0 (chief) when TF_CONFIG is absent or has
  # no task entry, so the script can also run as a single local worker —
  # the original `tf_config['task']['index']` raised KeyError on the very
  # `'{}'` default it supplied.
  tf_config = json.loads(os.environ.get('TF_CONFIG') or '{}')
  TASK_INDEX = tf_config.get('task', {}).get('index', 0)

  parser = argparse.ArgumentParser()
  parser.add_argument('--saved_model_dir',
                      type=str,
                      required=True,
                      help='Tensorflow export directory.')

  parser.add_argument('--checkpoint_dir',
                      type=str,
                      required=True,
                      help='Tensorflow checkpoint directory.')

  parser.add_argument('--model_type',
                      type=str,
                      required=True,
                      help='Type of model to train.')

  parsed_args = parser.parse_args()
  main(parsed_args)
235 |
--------------------------------------------------------------------------------
/code/project/code/multi-worker-pvc.yaml:
--------------------------------------------------------------------------------
1 | kind: PersistentVolumeClaim
2 | apiVersion: v1
3 | metadata:
4 | name: strategy-volume
5 | spec:
6 | accessModes: [ "ReadWriteOnce" ]
7 | resources:
8 | requests:
9 | storage: 1Gi
10 |
--------------------------------------------------------------------------------
/code/project/code/multi-worker-tfjob.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: kubeflow.org/v1
2 | kind: TFJob
3 | metadata:
4 | name: multi-worker-training
5 | spec:
6 | runPolicy:
7 | cleanPodPolicy: None
8 | tfReplicaSpecs:
9 | Worker:
10 | replicas: 2
11 | restartPolicy: Never
12 | template:
13 | spec:
14 | containers:
15 | - name: tensorflow
16 | image: kubeflow/multi-worker-strategy:v0.1
17 | imagePullPolicy: IfNotPresent
18 | command: ["python", "/multi-worker-distributed-training.py", "--saved_model_dir", "/trained_model/saved_model_versions/2/", "--checkpoint_dir", "/trained_model/checkpoint", "--model_type", "cnn"]
19 | volumeMounts:
20 | - mountPath: /trained_model
21 | name: training
22 | resources:
23 | limits:
24 | cpu: 500m
25 | volumes:
26 | - name: training
27 | persistentVolumeClaim:
28 | claimName: strategy-volume
29 |
--------------------------------------------------------------------------------
/code/project/code/predict-service.py:
--------------------------------------------------------------------------------
import numpy as np
import tensorflow as tf
from tensorflow import keras
import tensorflow_datasets as tfds


# Load the exported model from the shared volume.
# NOTE(review): this path is the parent folder holding the numbered version
# directories (1/, 2/, ... — see model-selection.py), not a specific version
# like "saved_model_versions/4" — confirm this resolves to the intended model.
model = keras.models.load_model("trained_model/saved_model_versions")

# Scaling MNIST data from (0, 255] to (0., 1.]
def scale(image, label):
    image = tf.cast(image, tf.float32)
    image /= 255
    return image, label

# Build the Fashion-MNIST test pipeline: rescale, cache, shuffle, batch.
datasets, _ = tfds.load(name='fashion_mnist', with_info=True, as_supervised=True)

ds = datasets['test'].map(scale).cache().shuffle(10000).batch(64)

# TODO: Visualize the images and compare with the classified result
model.predict(ds)
--------------------------------------------------------------------------------
/code/project/code/predict-service.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Pod
3 | metadata:
4 | name: predict-service
5 | spec:
6 | containers:
7 | - name: predict
8 | image: kubeflow/multi-worker-strategy:v0.1
9 | command: ['sleep', 'infinity']
10 | volumeMounts:
11 | - name: model
12 | mountPath: /trained_model
13 | volumes:
14 | - name: model
15 | persistentVolumeClaim:
16 | claimName: strategy-volume
17 |
--------------------------------------------------------------------------------
/code/project/code/workflow.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: argoproj.io/v1alpha1
2 | kind: Workflow
3 | metadata:
4 | generateName: tfjob-wf-
5 | namespace: kubeflow
6 | spec:
7 | entrypoint: tfjob-wf
8 | podGC:
9 | strategy: OnPodSuccess
10 | volumes:
11 | - name: model
12 | persistentVolumeClaim:
13 | claimName: strategy-volume
14 |
15 | templates:
16 | - name: tfjob-wf
17 | steps:
18 | - - name: data-ingestion-step
19 | template: data-ingestion-step
20 | - - name: distributed-tf-training-steps
21 | template: distributed-tf-training-steps
22 | - - name: model-selection-step
23 | template: model-selection-step
24 | - - name: create-model-serving-service
25 | template: create-model-serving-service
26 |
27 | - name: data-ingestion-step
28 | serviceAccountName: argo
29 | memoize:
30 | key: "step-cache"
31 | maxAge: "1h"
32 | cache:
33 | configMap:
34 | name: my-config
35 | key: step-cache
36 | container:
37 | image: kubeflow/multi-worker-strategy:v0.1
38 | imagePullPolicy: IfNotPresent
39 | command: ["python", "/data-ingestion.py"]
40 |
41 | - name: distributed-tf-training-steps
42 | steps:
43 | - - name: cnn-model
44 | template: cnn-model
45 | - name: cnn-model-with-dropout
46 | template: cnn-model-with-dropout
47 | - name: cnn-model-with-batch-norm
48 | template: cnn-model-with-batch-norm
49 |
50 | - name: cnn-model
51 | serviceAccountName: training-operator
52 | resource:
53 | action: create
54 | setOwnerReference: true
55 | successCondition: status.replicaStatuses.Worker.succeeded = 2
56 | failureCondition: status.replicaStatuses.Worker.failed > 0
57 | manifest: |
58 | apiVersion: kubeflow.org/v1
59 | kind: TFJob
60 | metadata:
61 | generateName: multi-worker-training-
62 | spec:
63 | runPolicy:
64 | cleanPodPolicy: None
65 | tfReplicaSpecs:
66 | Worker:
67 | replicas: 2
68 | restartPolicy: Never
69 | template:
70 | spec:
71 | containers:
72 | - name: tensorflow
73 | image: kubeflow/multi-worker-strategy:v0.1
74 | imagePullPolicy: IfNotPresent
75 | command: ["python", "/multi-worker-distributed-training.py", "--saved_model_dir", "/trained_model/saved_model_versions/1/", "--checkpoint_dir", "/trained_model/checkpoint", "--model_type", "cnn"]
76 | volumeMounts:
77 | - mountPath: /trained_model
78 | name: training
79 | resources:
80 | limits:
81 | cpu: 500m
82 | volumes:
83 | - name: training
84 | persistentVolumeClaim:
85 | claimName: strategy-volume
86 |
87 | - name: cnn-model-with-dropout
88 | serviceAccountName: training-operator
89 | resource:
90 | action: create
91 | setOwnerReference: true
92 | successCondition: status.replicaStatuses.Worker.succeeded = 2
93 | failureCondition: status.replicaStatuses.Worker.failed > 0
94 | manifest: |
95 | apiVersion: kubeflow.org/v1
96 | kind: TFJob
97 | metadata:
98 | generateName: multi-worker-training-
99 | spec:
100 | runPolicy:
101 | cleanPodPolicy: None
102 | tfReplicaSpecs:
103 | Worker:
104 | replicas: 2
105 | restartPolicy: Never
106 | template:
107 | spec:
108 | containers:
109 | - name: tensorflow
110 | image: kubeflow/multi-worker-strategy:v0.1
111 | imagePullPolicy: IfNotPresent
112 | command: ["python", "/multi-worker-distributed-training.py", "--saved_model_dir", "/trained_model/saved_model_versions/2/", "--checkpoint_dir", "/trained_model/checkpoint", "--model_type", "dropout"]
113 | volumeMounts:
114 | - mountPath: /trained_model
115 | name: training
116 | resources:
117 | limits:
118 | cpu: 500m
119 | volumes:
120 | - name: training
121 | persistentVolumeClaim:
122 | claimName: strategy-volume
123 |
124 | - name: cnn-model-with-batch-norm
125 | serviceAccountName: training-operator
126 | resource:
127 | action: create
128 | setOwnerReference: true
129 | successCondition: status.replicaStatuses.Worker.succeeded = 2
130 | failureCondition: status.replicaStatuses.Worker.failed > 0
131 | manifest: |
132 | apiVersion: kubeflow.org/v1
133 | kind: TFJob
134 | metadata:
135 | generateName: multi-worker-training-
136 | spec:
137 | runPolicy:
138 | cleanPodPolicy: None
139 | tfReplicaSpecs:
140 | Worker:
141 | replicas: 2
142 | restartPolicy: Never
143 | template:
144 | spec:
145 | containers:
146 | - name: tensorflow
147 | image: kubeflow/multi-worker-strategy:v0.1
148 | imagePullPolicy: IfNotPresent
          command: ["python", "/multi-worker-distributed-training.py", "--saved_model_dir", "/trained_model/saved_model_versions/3/", "--checkpoint_dir", "/trained_model/checkpoint", "--model_type", "batch_norm"]
150 | volumeMounts:
151 | - mountPath: /trained_model
152 | name: training
153 | resources:
154 | limits:
155 | cpu: 500m
156 | volumes:
157 | - name: training
158 | persistentVolumeClaim:
159 | claimName: strategy-volume
160 |
161 | - name: model-selection-step
162 | serviceAccountName: argo
163 | container:
164 | image: kubeflow/multi-worker-strategy:v0.1
165 | imagePullPolicy: IfNotPresent
166 | command: ["python", "/model-selection.py"]
167 | volumeMounts:
168 | - name: model
169 | mountPath: /trained_model
170 |
171 | - name: create-model-serving-service
172 | serviceAccountName: training-operator
    resource:
      action: create
      setOwnerReference: true
      successCondition: status.modelStatus.states.transitionStatus = UpToDate
177 | manifest: |
178 | apiVersion: serving.kserve.io/v1beta1
179 | kind: InferenceService
180 | metadata:
181 | name: flower-sample
182 | spec:
183 | predictor:
184 | model:
185 | modelFormat:
186 | name: tensorflow
187 | image: "emacski/tensorflow-serving:2.6.0"
188 | storageUri: "pvc://strategy-volume/saved_model_versions"
189 |
--------------------------------------------------------------------------------
/code/project/manifests/argo-workflows/kustomization.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: kustomize.config.k8s.io/v1beta1
2 | kind: Kustomization
3 | namespace: kubeflow
4 |
5 | resources:
6 | - https://github.com/argoproj/argo-workflows/releases/download/v3.4.0/install.yaml
7 |
8 | patchesStrategicMerge:
9 | - rbac-patch.yaml
10 |
--------------------------------------------------------------------------------
/code/project/manifests/argo-workflows/rbac-patch.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: rbac.authorization.k8s.io/v1
2 | kind: ClusterRole
3 | metadata:
4 | name: argo-cluster-role
5 | rules:
6 | - apiGroups:
7 | - ""
8 | resources:
9 | - pods
10 | - pods/exec
11 | verbs:
12 | - create
13 | - get
14 | - list
15 | - watch
16 | - update
17 | - patch
18 | - delete
19 | - apiGroups:
20 | - ""
21 | resources:
22 | - configmaps
23 | verbs:
24 | # Note(terrytangyuan): "create" and "update" are additional RBAC needed to use memoization cache.
25 | - create
26 | - update
27 | - get
28 | - watch
29 | - list
30 | - apiGroups:
31 | - ""
32 | resources:
33 | - persistentvolumeclaims
34 | - persistentvolumeclaims/finalizers
35 | verbs:
36 | - create
37 | - update
38 | - delete
39 | - get
40 | - apiGroups:
41 | - argoproj.io
42 | resources:
43 | - workflows
44 | - workflows/finalizers
45 | - workflowtasksets
46 | - workflowtasksets/finalizers
47 | - workflowartifactgctasks
48 | verbs:
49 | - get
50 | - list
51 | - watch
52 | - update
53 | - patch
54 | - delete
55 | - create
56 | - apiGroups:
57 | - argoproj.io
58 | resources:
59 | - workflowtemplates
60 | - workflowtemplates/finalizers
61 | - clusterworkflowtemplates
62 | - clusterworkflowtemplates/finalizers
63 | verbs:
64 | - get
65 | - list
66 | - watch
67 | - apiGroups:
68 | - argoproj.io
69 | resources:
70 | - workflowtaskresults
71 | verbs:
72 | - list
73 | - watch
74 | - deletecollection
75 | - apiGroups:
76 | - ""
77 | resources:
78 | - serviceaccounts
79 | verbs:
80 | - get
81 | - list
82 | - apiGroups:
83 | - argoproj.io
84 | resources:
85 | - cronworkflows
86 | - cronworkflows/finalizers
87 | verbs:
88 | - get
89 | - list
90 | - watch
91 | - update
92 | - patch
93 | - delete
94 | - apiGroups:
95 | - ""
96 | resources:
97 | - events
98 | verbs:
99 | - create
100 | - patch
101 | - apiGroups:
102 | - policy
103 | resources:
104 | - poddisruptionbudgets
105 | verbs:
106 | - create
107 | - get
108 | - delete
--------------------------------------------------------------------------------
/code/project/manifests/kubeflow-training/cluster-role-binding.yaml:
--------------------------------------------------------------------------------
1 | ---
2 | apiVersion: rbac.authorization.k8s.io/v1
3 | kind: ClusterRoleBinding
4 | metadata:
5 | labels:
6 | app: training-operator
7 | name: training-operator
8 | roleRef:
9 | apiGroup: rbac.authorization.k8s.io
10 | kind: ClusterRole
11 | name: training-operator
12 | subjects:
13 | - kind: ServiceAccount
14 | name: training-operator
15 |
--------------------------------------------------------------------------------
/code/project/manifests/kubeflow-training/cluster-role.yaml:
--------------------------------------------------------------------------------
1 | ---
2 | apiVersion: rbac.authorization.k8s.io/v1
3 | kind: ClusterRole
4 | metadata:
5 | labels:
6 | app: training-operator
7 | name: training-operator
8 | rules:
9 | - apiGroups:
10 | - serving.kserve.io
11 | resources:
12 | - inferenceservices
13 | verbs:
14 | - "*"
15 | - apiGroups:
16 | - kubeflow.org
17 | resources:
18 | - tfjobs
19 | - mxjobs
20 | - pytorchjobs
21 | - xgboostjobs
22 | - tfjobs/status
23 | - pytorchjobs/status
24 | - mxjobs/status
25 | - xgboostjobs/status
26 | verbs:
27 | - create
28 | - delete
29 | - get
30 | - list
31 | - patch
32 | - update
33 | - watch
34 | - apiGroups:
35 | - ""
36 | resources:
37 | - pods
38 | - services
39 | - endpoints
40 | - events
41 | verbs:
42 | - "*"
43 | - apiGroups:
44 | - apps
45 | - extensions
46 | resources:
47 | - deployments
48 | verbs:
49 | - "*"
50 | - apiGroups:
51 | - scheduling.volcano.sh
52 | resources:
53 | - podgroups
54 | verbs:
55 | - "*"
56 |
--------------------------------------------------------------------------------
/code/project/manifests/kubeflow-training/crds/kustomization.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: kustomize.config.k8s.io/v1beta1
2 | kind: Kustomization
3 | resources:
4 | - kubeflow.org_tfjobs.yaml
5 | - kubeflow.org_mxjobs.yaml
6 | - kubeflow.org_pytorchjobs.yaml
7 | - kubeflow.org_xgboostjobs.yaml
8 |
--------------------------------------------------------------------------------
/code/project/manifests/kubeflow-training/deployment.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: apps/v1
2 | kind: Deployment
3 | metadata:
4 | name: training-operator
5 | labels:
6 | control-plane: kubeflow-training-operator
7 | spec:
8 | selector:
9 | matchLabels:
10 | control-plane: kubeflow-training-operator
11 | replicas: 1
12 | template:
13 | metadata:
14 | labels:
15 | control-plane: kubeflow-training-operator
16 | annotations:
17 | sidecar.istio.io/inject: "false"
18 | spec:
19 | containers:
20 | - command:
21 | - /manager
22 | image: kubeflow/training-operator
23 | name: training-operator
24 | env:
25 | - name: MY_POD_NAMESPACE
26 | valueFrom:
27 | fieldRef:
28 | fieldPath: metadata.namespace
29 | - name: MY_POD_NAME
30 | valueFrom:
31 | fieldRef:
32 | fieldPath: metadata.name
33 | securityContext:
34 | allowPrivilegeEscalation: false
35 | livenessProbe:
36 | httpGet:
37 | path: /healthz
38 | port: 8081
39 | initialDelaySeconds: 15
40 | periodSeconds: 20
41 | readinessProbe:
42 | httpGet:
43 | path: /readyz
44 | port: 8081
45 | initialDelaySeconds: 5
46 | periodSeconds: 10
47 | resources:
48 | limits:
49 | cpu: 100m
50 | memory: 30Mi
51 | requests:
52 | cpu: 100m
53 | memory: 20Mi
54 | serviceAccountName: training-operator
55 | terminationGracePeriodSeconds: 10
56 |
--------------------------------------------------------------------------------
/code/project/manifests/kubeflow-training/kustomization.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: kustomize.config.k8s.io/v1beta1
2 | kind: Kustomization
3 | namespace: kubeflow
4 | resources:
5 | - crds/
6 | - cluster-role-binding.yaml
7 | - cluster-role.yaml
8 | - service-account.yaml
9 | - service.yaml
10 | - deployment.yaml
11 | images:
12 | - name: kubeflow/training-operator
13 | newName: public.ecr.aws/j1r0q0g6/training/training-operator
14 | newTag: "5ef6c405df2bb1bf1d3ede988cd43433eff2e956"
15 |
--------------------------------------------------------------------------------
/code/project/manifests/kubeflow-training/service-account.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: ServiceAccount
3 | metadata:
4 | labels:
5 | app: training-operator
6 | name: training-operator
7 |
--------------------------------------------------------------------------------
/code/project/manifests/kubeflow-training/service.yaml:
--------------------------------------------------------------------------------
1 | ---
2 | apiVersion: v1
3 | kind: Service
4 | metadata:
5 | annotations:
6 | prometheus.io/path: /metrics
7 | prometheus.io/scrape: "true"
8 | prometheus.io/port: "8443"
9 | labels:
10 | app: training-operator
11 | name: training-operator
12 | spec:
13 | ports:
14 | - name: monitoring-port
15 | port: 8443
16 | targetPort: 8443
17 | selector:
18 | name: training-operator
19 | type: ClusterIP
20 |
--------------------------------------------------------------------------------
/code/project/manifests/kustomization.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: kustomize.config.k8s.io/v1beta1
2 | kind: Kustomization
3 | namespace: kubeflow
4 |
5 | resources:
6 | - argo-workflows/
7 | - kubeflow-training/
8 |
--------------------------------------------------------------------------------
/images/chinese-cover.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/terrytangyuan/distributed-ml-patterns/0c64a653ef4a3d51dab1ab98294730dc260d4a37/images/chinese-cover.pdf
--------------------------------------------------------------------------------
/images/english-front-cover.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/terrytangyuan/distributed-ml-patterns/0c64a653ef4a3d51dab1ab98294730dc260d4a37/images/english-front-cover.png
--------------------------------------------------------------------------------
/images/korean-cover-clean.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/terrytangyuan/distributed-ml-patterns/0c64a653ef4a3d51dab1ab98294730dc260d4a37/images/korean-cover-clean.png
--------------------------------------------------------------------------------
/images/korean-cover-white.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/terrytangyuan/distributed-ml-patterns/0c64a653ef4a3d51dab1ab98294730dc260d4a37/images/korean-cover-white.jpg
--------------------------------------------------------------------------------
/images/korean-cover.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/terrytangyuan/distributed-ml-patterns/0c64a653ef4a3d51dab1ab98294730dc260d4a37/images/korean-cover.jpg
--------------------------------------------------------------------------------