├── .gitignore ├── LICENSE ├── README.md ├── _config.yml ├── code ├── README.md ├── chapter-2 │ ├── listing-2-1.py │ ├── listing-2-2.py │ ├── listing-2-3.py │ ├── listing-2-4.py │ ├── listing-2-5.txt │ ├── listing-2-6.txt │ ├── listing-2-7.txt │ └── listing-2-8.txt └── project │ ├── .gitignore │ ├── README.md │ ├── basics │ ├── argo-coinflip.yaml │ ├── argo-dag-diamond.yaml │ ├── argo-hello-world.yaml │ ├── argo-resource-template.yaml │ ├── argo-script-template.yaml │ ├── hello-world.yaml │ └── tfjob.yaml │ ├── code │ ├── Dockerfile │ ├── README.md │ ├── access-model.yaml │ ├── autoscaled-inference-service.yaml │ ├── data-ingestion.py │ ├── http-inference-request.py │ ├── inference-client.py │ ├── inference-input.json │ ├── inference-service.yaml │ ├── model-selection.py │ ├── model-selection.yaml │ ├── multi-worker-distributed-training.py │ ├── multi-worker-pvc.yaml │ ├── multi-worker-tfjob.yaml │ ├── predict-service.py │ ├── predict-service.yaml │ └── workflow.yaml │ └── manifests │ ├── argo-workflows │ ├── kustomization.yaml │ └── rbac-patch.yaml │ ├── kubeflow-training │ ├── cluster-role-binding.yaml │ ├── cluster-role.yaml │ ├── crds │ │ ├── kubeflow.org_mxjobs.yaml │ │ ├── kubeflow.org_pytorchjobs.yaml │ │ ├── kubeflow.org_tfjobs.yaml │ │ ├── kubeflow.org_xgboostjobs.yaml │ │ └── kustomization.yaml │ ├── deployment.yaml │ ├── kustomization.yaml │ ├── service-account.yaml │ └── service.yaml │ └── kustomization.yaml └── images ├── chinese-cover.pdf ├── english-front-cover.png ├── korean-cover-clean.png ├── korean-cover-white.jpg └── korean-cover.jpg /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | 
pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. 
You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. 
(Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2021 Yuan Tang 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Distributed Machine Learning Patterns 2 | 3 | [![LinkedIn](https://raw.githubusercontent.com/terrytangyuan/terrytangyuan/master/imgs/linkedin.svg)](https://www.linkedin.com/in/terrytangyuan) 4 | [![Bluesky](https://raw.githubusercontent.com/terrytangyuan/terrytangyuan/master/imgs/bluesky.svg)](https://bsky.app/profile/terrytangyuan.xyz) 5 | [![GitHub](https://raw.githubusercontent.com/terrytangyuan/terrytangyuan/master/imgs/github.svg)](https://github.com/terrytangyuan) 6 | [![Twitter](https://raw.githubusercontent.com/terrytangyuan/terrytangyuan/master/imgs/twitter.svg)](https://twitter.com/TerryTangYuan) 7 | 8 | book-front-cover 9 | 10 | This repository contains references and code for the book *Distributed Machine Learning Patterns* from [Manning Publications](https://bit.ly/2RKv8Zo) by [Yuan Tang](https://github.com/terrytangyuan). 
11 | 12 | :fire: **[Korean](images/korean-cover.jpg) and [Chinese](images/chinese-cover.pdf) versions are available from Tsinghua University Press and Hanbit Media!** 13 | 14 | [Manning](https://bit.ly/2RKv8Zo), [Amazon](https://www.amazon.com/dp/1617299022/), [Barnes & Noble](https://www.barnesandnoble.com/w/distributed-machine-learning-patterns-yuan-tang/1140209010), [Powell’s]( https://www.powells.com/book/distributed-machine-learning-patterns-9781617299025), [Bookshop](https://bookshop.org/p/books/distributed-machine-learning-patterns-yuan-tang/17491200) 15 | 16 | 17 | In *Distributed Machine Learning Patterns* you will learn how to: 18 | 19 | * Apply patterns to build scalable and reliable machine learning systems. 20 | * Construct machine learning pipelines with data ingestion, distributed training, model serving, and more. 21 | * Automate machine learning tasks with [Kubernetes](https://kubernetes.io/), [TensorFlow](https://www.tensorflow.org/), [Kubeflow](https://www.kubeflow.org/), and [Argo Workflows](https://argoproj.github.io/argo-workflows/). 22 | * Make trade-off decisions between different patterns and approaches. 23 | * Manage and monitor machine learning workloads at scale. 24 | 25 | This book teaches you how to take machine learning models from your personal laptop to large distributed clusters. You’ll explore key concepts and patterns behind successful distributed machine learning systems, and learn technologies like TensorFlow, Kubernetes, Kubeflow, and Argo Workflows directly from a key maintainer and contributor. Real-world scenarios, hands-on projects, and clear, practical DevOps techniques let you easily launch, manage, and monitor cloud-native distributed machine learning pipelines. 26 | 27 | ## About the topic 28 | 29 | Scaling up models from personal devices to large distributed clusters is one of the biggest challenges faced by modern machine learning practitioners. 
Distributing machine learning systems allow developers to handle extremely large datasets across multiple clusters, take advantage of automation tools, and benefit from hardware accelerations. In this book, Yuan Tang shares patterns, techniques, and experience gained from years spent building and managing cutting-edge distributed machine learning infrastructure. 30 | 31 | ## About the book 32 | 33 | *Distributed Machine Learning Patterns* is filled with practical patterns for running machine learning systems on distributed Kubernetes clusters in the cloud. Each pattern is designed to help solve common challenges faced when building distributed machine learning systems, including supporting distributed model training, handling unexpected failures, and dynamic model serving traffic. Real-world scenarios provide clear examples of how to apply each pattern, alongside the potential trade-offs for each approach. Once you’ve mastered these cutting-edge techniques, you’ll put them all into practice and finish up by building a comprehensive distributed machine learning system. 34 | 35 | ## About the reader 36 | 37 | For data analysts, data scientists, and software engineers familiar with the basics of machine learning algorithms and running machine learning in production. Readers should be familiar with the basics of Bash, Python, and Docker. 38 | 39 | ## About the author 40 | 41 | Yuan is a principal software engineer at [Red Hat](https://www.redhat.com/), working on [OpenShift AI](https://www.redhat.com/en/technologies/cloud-computing/openshift/openshift-ai). Previously, he has led AI infrastructure and platform teams at various companies. He holds leadership positions in open source projects, including [Argo](https://argoproj.github.io/), [Kubeflow](https://github.com/kubeflow), and [Kubernetes](https://github.com/kubernetes/community/tree/master/wg-serving). 
He's also a maintainer and author of many popular [open source projects](https://github.com/sponsors/terrytangyuan). In addition, Yuan [authored](https://terrytangyuan.github.io/cv#publications) three technical books and published numerous impactful papers. He's a regular [conference speaker](https://terrytangyuan.github.io/cv#talks), technical advisor, leader, and mentor at [various organizations](https://terrytangyuan.github.io/cv#services). 42 | 43 | ## Supporting Quotes 44 | 45 | *"This is a wonderful book for those wanting to understand how to be more effective with Machine Learning at scale, explained clearly and from first principles!"* 46 | 47 | **-- Laurence Moroney, AI Developer Relations Lead at Google** 48 | 49 | *"This book is an exceptionally timely and comprehensive guide to developing, running, and managing machine learning systems in a distributed environment. It covers essential topics such as data partitioning, ingestion, model training, serving, and workflow management. What truly sets this book apart is its discussion of these topics from a pattern perspective, accompanied by real-world examples and widely adopted systems like Kubernetes, Kubeflow, and Argo. I highly recommend it!"* 50 | 51 | **-- Yuan Chen, Principal Software Engineer at Apple** 52 | 53 | 54 | *"This book provides a high-level understanding of patterns with practical code examples needed for all MLOps engineering tasks. This is a must-read for anyone in the field."* 55 | 56 | **-- Brian Ray, Global Head of Data Science and Artificial Intelligence at Eviden** 57 | 58 | 59 | *"This book weaves together concepts from distributed systems, machine learning, and site reliability engineering in a way that’s approachable for beginners and that’ll excite and inspire experienced practitioners. 
As soon as I finished reading, I was ready to start building."* 60 | 61 | **-- James Lamb, Staff Data Engineer at SpotHero** 62 | 63 | 64 | *"Whatever your role is in the data ecosystem (scientist, analyst, or engineer), if you are looking to take your knowledge and skills to the next level, then this book is for you. This book is an amazing guide to the concepts and state-of-the-art when it comes to designing resilient and scalable, ML systems for both training and serving models. Regardless of what platform you may be working with, this book teaches you the patterns you should be familiar with when trying to scale out your systems."* 65 | 66 | **-- Ryan Russon, Senior Manager of Model Training at Capital One** 67 | 68 | 69 | *"AI is the new electricity, and distributed systems is the new power grid. Whether you are a research scientist, engineer, or product developer, you will find the best practices and recipes in this book to scale up your greatest endeavors."* 70 | 71 | **-- Linxi "Jim" Fan, Senior AI Research Scientist at NVIDIA, Stanford PhD** 72 | 73 | *"This book discusses various architectural approaches to tackle common data science problems such as scaling machine learning processes and building robust workflows and pipelines. 
It serves as an excellent introduction to the world of MLOps for data scientists and ML engineers who want to enhance their knowledge in this field."* 74 | 75 | **-- Rami Krispin, Senior Data Science and Engineering Manager** 76 | 77 | *"无论是新手还是专家,这本书都将引领你构建强大的机器学习系统,进而掌握分布式机器学习、自动化工具和大规模工作负载管理的要点。让你的机器学习之旅更上一层楼!"* 78 | 79 | **-- 高策,TensorChord CEO,Kubeflow 社区维护者** 80 | 81 | *"这是一本关于在分布式环境下开发、运行和管理机器学习系统的全面手册。作者详尽地阐述了从数据分区、采集、模型训练到服务和工作流程管理等一系列关键主题。通过使用现实世界中的案例,本书深入浅出地讲解了人工智能与机器学习领域用到的核心软件、系统和平台,涵盖了 PyTorch、TensorFlow、Kubeflow、Argo Workflows 和 Kubernetes 等。无论是算法工程师、系统工程师还是架构师,都能从中获得开发和维护分布式机器学习系统所需的全方位知识。我将此书极力推荐给所有对机器学习有着浓厚兴趣和实践需求的专业人士!"* 82 | 83 | **-- 陈源,NVIDIA 主任工程师** 84 | 85 | *"很高兴看到这本书能在国内出版。随着 ChatGPT 等工具和技术的爆火,AI技术迎来了又一波爆发期。与此同时,Kubernetes 等云原生技术作为基础设施的事实标准也再次在本轮技术热潮中成为首选项。这本书介绍了很多结合云原生和分布式技术进行机器学习的方法和案例,推荐对这方面感兴趣的读者进行阅读。"* 86 | 87 | **-- 张晋涛,Kong Inc., Microsoft MVP, CNCF Ambassador** 88 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-architect 2 | plugins: 3 | - jekyll-relative-links 4 | relative_links: 5 | enabled: true 6 | collections: true 7 | include: 8 | - README.md 9 | -------------------------------------------------------------------------------- /code/README.md: -------------------------------------------------------------------------------- 1 | ## Installation 2 | 3 | * Install Python. 4 | * Run the following to install the necessary Python packages: 5 | 6 | ```bash 7 | pip install tensorflow tensorflow_io 8 | ``` 9 | 10 | ## Instructions 11 | 12 | * All code snippets are organized by chapters and the listing title. For example, `chapter-2/listing-2-1.py` is for Listing 2.1 in Chapter 2. 13 | * Files with `*.py` extension can be executed via `python *.py`. 14 | * Files with `*.txt` extension are pseudo-code and are not meant to be executed. 
15 | -------------------------------------------------------------------------------- /code/chapter-2/listing-2-1.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf #A 2 | 3 | train, test = tf.keras.datasets.fashion_mnist.load_data() #B 4 | 5 | # Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-labels-idx1-ubyte.gz 6 | # 32768/29515 [=================================] - 0s 0us/step 7 | # Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-images-idx3-ubyte.gz 8 | # 26427392/26421880 [==============================] - 0s 0us/step 9 | # Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-labels-idx1-ubyte.gz 10 | # 8192/5148 [===============================================] - 0s 0us/step 11 | # Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-images-idx3-ubyte.gz 12 | # 4423680/4422102 [==============================] - 0s 0us/step 13 | 14 | #A Load TensorFlow library. 15 | #B Download the Fashion-MNIST dataset and then load it into memory. 16 | -------------------------------------------------------------------------------- /code/chapter-2/listing-2-2.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | images, labels = train #A 4 | images = images/255 #B 5 | 6 | dataset = tf.data.Dataset.from_tensor_slices((images, labels)) #C 7 | dataset #D 8 | # 9 | 10 | #A Split the training dataset object into images and labels. 11 | #B Normalize the images. 12 | #C Load in-memory array representation into a tf.data.Dataset object that will make it easier to use for training in TensorFlow. 13 | #D Take a look at the information of the dataset such as shapes and data types. 
14 | -------------------------------------------------------------------------------- /code/chapter-2/listing-2-3.py: -------------------------------------------------------------------------------- 1 | import tensorflow_io as tfio #A 2 | 3 | d_train = tfio.IODataset.from_mnist( #B 4 | 'http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz', 5 | 'http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz') 6 | 7 | #A Load TensorFlow I/O library. 8 | #B Load the MNIST dataset from a URL to access dataset files directly without downloading via HTTP file system support. 9 | -------------------------------------------------------------------------------- /code/chapter-2/listing-2-4.py: -------------------------------------------------------------------------------- 1 | import os #A 2 | import tensorflow_io as tfio #B 3 | 4 | endpoint="postgresql://{}:{}@{}?port={}&dbname={}".format( #C 5 | os.environ['TFIO_DEMO_DATABASE_USER'], 6 | os.environ['TFIO_DEMO_DATABASE_PASS'], 7 | os.environ['TFIO_DEMO_DATABASE_HOST'], 8 | os.environ['TFIO_DEMO_DATABASE_PORT'], 9 | os.environ['TFIO_DEMO_DATABASE_NAME'], 10 | ) 11 | 12 | dataset = tfio.experimental.IODataset.from_sql( #D 13 | query="SELECT co, pt08s1 FROM AirQualityUCI;", 14 | endpoint=endpoint) 15 | print(dataset.element_spec) #E 16 | # {'co': TensorSpec(shape=(), dtype=tf.float32, name=None), 'pt08s1': TensorSpec(shape=(), dtype=tf.int32, name=None)} 17 | 18 | #A Load Python’s built-in OS library for loading environment variables related to the PostgreSQL database. 19 | #B Load TensorFlow I/O library. 20 | #C Construct the endpoint for accessing the PostgreSQL database. 21 | #D Select two columns from the AirQualityUCI table in the database and instantiate a tf.data.Dataset object. 22 | #E Inspect the specification of the dataset such as the shape and data type for each column. 
23 | -------------------------------------------------------------------------------- /code/chapter-2/listing-2-5.txt: -------------------------------------------------------------------------------- 1 | batch = read_next_batch(dataset) #A 2 | while batch is not None: 3 | model.train(batch) #B 4 | batch = read_next_batch(dataset) #C 5 | 6 | #A Read the next batch in the dataset. 7 | #B Train the model with this batch. 8 | #C Read the next batch once we are done training with the current batch. 9 | -------------------------------------------------------------------------------- /code/chapter-2/listing-2-6.txt: -------------------------------------------------------------------------------- 1 | if get_worker_rank() == 0: #A 2 | create_and_send_shards(dataset) #A 3 | shard = read_next_shard_locally() #B 4 | while shard is not None: 5 | model.train(shard) #C 6 | shard = read_next_shard_locally() #D 7 | 8 | #A Create and send shards to all other worker machines from the worker machine with rank 0. 9 | #B Read the next shard available locally in this worker machine. 10 | #C Train the model using the shard we just read from the worker machine locally. 11 | #D Read the next shard once we are done training with the current shard. 12 | -------------------------------------------------------------------------------- /code/chapter-2/listing-2-7.txt: -------------------------------------------------------------------------------- 1 | batch = read_next_batch(dataset) #A 2 | cache = initialize_cache(batch) #B 3 | while batch is not None: #C 4 | model.train(batch) #C 5 | cache.append(batch) #C 6 | batch = read_next_batch(dataset) 7 | while current_epoch() <= total_epochs: #D 8 | batch = cache.read_next_batch() #D 9 | model.train(batch) #D 10 | 11 | #A Read the next batch of the dataset. 12 | #B Initialize the cache for this batch. 13 | #C Train the model by iterating through the batches. 14 | #D Train the model for additional epochs using the batches that were cached previously. 
15 | -------------------------------------------------------------------------------- /code/chapter-2/listing-2-8.txt: -------------------------------------------------------------------------------- 1 | batch = read_next_batch(dataset) 2 | cache = initialize_cache(preprocess(batch)) #A 3 | while batch is not None: 4 | batch = preprocess(batch) 5 | model.train(batch) 6 | cache.append(batch) 7 | batch = read_next_batch(dataset) 8 | while current_epoch() <= total_epochs: 9 | processed_batch = cache.read_next_batch() #B 10 | model.train(processed_batch) #B 11 | 12 | #A Initialize the cache with the preprocessed batch. 13 | #B Retrieve the processed batch from the cache and use it for model training. 14 | -------------------------------------------------------------------------------- /code/project/.gitignore: -------------------------------------------------------------------------------- 1 | trained_model/ 2 | istio-* 3 | -------------------------------------------------------------------------------- /code/project/README.md: -------------------------------------------------------------------------------- 1 | # Project Setup 2 | 3 | ## Cluster 4 | 5 | ``` 6 | cd project/ 7 | ``` 8 | 9 | Via `kind`: 10 | 11 | ``` 12 | go install sigs.k8s.io/kind@v0.17.0 13 | kind create cluster --name distml --image kindest/node:v1.25.3 14 | ``` 15 | 16 | Or via `k3d`: 17 | 18 | ``` 19 | k3d cluster create distml --image rancher/k3s:v1.25.3-k3s1 20 | ``` 21 | 22 | 23 | ``` 24 | kubectl create ns kubeflow 25 | kns kubeflow 26 | kubectl kustomize manifests | kubectl apply -f - 27 | ``` 28 | 29 | # Run Workflow 30 | 31 | See instructions [here](https://github.com/terrytangyuan/distributed-ml-patterns/blob/main/code/project/code/README.md). 
32 | 33 | # Clean-up 34 | 35 | ``` 36 | k3d cluster delete distml 37 | kind delete cluster --name distml 38 | ``` 39 | -------------------------------------------------------------------------------- /code/project/basics/argo-coinflip.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: argoproj.io/v1alpha1 2 | kind: Workflow 3 | metadata: 4 | generateName: coinflip- 5 | spec: 6 | serviceAccountName: argo 7 | entrypoint: coinflip 8 | templates: 9 | - name: coinflip 10 | steps: 11 | - - name: flip-coin 12 | template: flip-coin 13 | - - name: heads 14 | template: heads 15 | when: "{{steps.flip-coin.outputs.result}} == heads" 16 | - name: tails 17 | template: tails 18 | when: "{{steps.flip-coin.outputs.result}} == tails" 19 | 20 | - name: flip-coin 21 | script: 22 | image: python:alpine3.6 23 | command: [python] 24 | source: | 25 | import random 26 | result = "heads" if random.randint(0,1) == 0 else "tails" 27 | print(result) 28 | 29 | - name: heads 30 | container: 31 | image: alpine:3.6 32 | command: [sh, -c] 33 | args: ["echo \"it was heads\""] 34 | 35 | - name: tails 36 | container: 37 | image: alpine:3.6 38 | command: [sh, -c] 39 | args: ["echo \"it was tails\""] 40 | -------------------------------------------------------------------------------- /code/project/basics/argo-dag-diamond.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: argoproj.io/v1alpha1 2 | kind: Workflow 3 | metadata: 4 | generateName: dag-diamond- 5 | spec: 6 | serviceAccountName: argo 7 | entrypoint: diamond 8 | templates: 9 | - name: echo 10 | inputs: 11 | parameters: 12 | - name: message 13 | container: 14 | image: alpine:3.7 15 | command: [echo, "{{inputs.parameters.message}}"] 16 | - name: diamond 17 | dag: 18 | tasks: 19 | - name: A 20 | template: echo 21 | arguments: 22 | parameters: [{name: message, value: A}] 23 | - name: B 24 | dependencies: [A] 25 | template: echo 26 | arguments: 27 |
parameters: [{name: message, value: B}] 28 | - name: C 29 | dependencies: [A] 30 | template: echo 31 | arguments: 32 | parameters: [{name: message, value: C}] 33 | - name: D 34 | dependencies: [B, C] 35 | template: echo 36 | arguments: 37 | parameters: [{name: message, value: D}] 38 | -------------------------------------------------------------------------------- /code/project/basics/argo-hello-world.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: argoproj.io/v1alpha1 2 | kind: Workflow 3 | metadata: 4 | generateName: hello-world- 5 | spec: 6 | entrypoint: whalesay 7 | serviceAccountName: argo 8 | templates: 9 | - name: whalesay 10 | container: 11 | image: docker/whalesay 12 | command: [cowsay] 13 | args: ["hello world"] 14 | -------------------------------------------------------------------------------- /code/project/basics/argo-resource-template.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: argoproj.io/v1alpha1 2 | kind: Workflow 3 | metadata: 4 | generateName: k8s-resource- 5 | spec: 6 | entrypoint: k8s-resource 7 | serviceAccountName: argo 8 | templates: 9 | - name: k8s-resource 10 | resource: 11 | action: create 12 | manifest: | 13 | apiVersion: v1 14 | kind: ConfigMap 15 | metadata: 16 | name: cm-example 17 | data: 18 | some: value 19 | -------------------------------------------------------------------------------- /code/project/basics/argo-script-template.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: argoproj.io/v1alpha1 2 | kind: Workflow 3 | metadata: 4 | generateName: script-tmpl- 5 | spec: 6 | entrypoint: gen-random-int 7 | serviceAccountName: argo 8 | templates: 9 | - name: gen-random-int 10 | script: 11 | image: python:alpine3.6 12 | command: [python] 13 | source: | 14 | import random 15 | i = random.randint(1, 100) 16 | print(i) 17 | 
-------------------------------------------------------------------------------- /code/project/basics/hello-world.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: whalesay 5 | spec: 6 | containers: 7 | - name: whalesay 8 | image: docker/whalesay:latest 9 | command: [cowsay] 10 | args: ["hello world"] 11 | -------------------------------------------------------------------------------- /code/project/basics/tfjob.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kubeflow.org/v1 2 | kind: TFJob 3 | metadata: 4 | namespace: kubeflow 5 | generateName: distributed-tfjob- 6 | spec: 7 | tfReplicaSpecs: 8 | Worker: 9 | replicas: 2 10 | restartPolicy: OnFailure 11 | template: 12 | spec: 13 | containers: 14 | - name: tensorflow 15 | image: gcr.io/kubeflow-ci/tf-mnist-with-summaries:1.0 16 | command: 17 | - "python" 18 | - "/var/tf_mnist/mnist_with_summaries.py" 19 | - "--log_dir=/train/metrics" 20 | - "--learning_rate=0.01" 21 | - "--batch_size=100" 22 | -------------------------------------------------------------------------------- /code/project/code/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9 2 | 3 | RUN pip install tensorflow==2.11.0 tensorflow_datasets==4.7.0 4 | 5 | COPY data-ingestion.py / 6 | COPY predict-service.py / 7 | COPY model-selection.py / 8 | COPY multi-worker-distributed-training.py / 9 | -------------------------------------------------------------------------------- /code/project/code/README.md: -------------------------------------------------------------------------------- 1 | # Multi-worker Distributed Training 2 | 3 | ## Setup 4 | 5 | ``` 6 | cd project/code 7 | ``` 8 | 9 | Build the image 10 | ``` 11 | docker build -f Dockerfile -t kubeflow/multi-worker-strategy:v0.1 . 
12 | # If using k3d 13 | k3d image import kubeflow/multi-worker-strategy:v0.1 --cluster distml 14 | # If using kind 15 | kind load docker-image kubeflow/multi-worker-strategy:v0.1 --name distml 16 | ``` 17 | 18 | Switch to "kubeflow" namespace: 19 | ``` 20 | kubectl config set-context --current --namespace=kubeflow 21 | ``` 22 | 23 | Specify your storageClassName and create a persistent volume claim to save 24 | models and checkpoints 25 | ``` 26 | kubectl create -f multi-worker-pvc.yaml 27 | ``` 28 | 29 | ## Submitting Training Job 30 | 31 | Create a TFJob: 32 | ``` 33 | kubectl create -f multi-worker-tfjob.yaml 34 | ``` 35 | 36 | After making code changes, run the following to resubmit the job: 37 | ``` 38 | kubectl delete tfjob --all; docker build -f Dockerfile -t kubeflow/multi-worker-strategy:v0.1 .; kind load docker-image kubeflow/multi-worker-strategy:v0.1 --name distml; kubectl create -f multi-worker-tfjob.yaml 39 | ``` 40 | 41 | ## Model loading & prediction 42 | 43 | ``` 44 | kubectl create -f predict-service.yaml 45 | kubectl exec --stdin --tty predict-service -- /bin/bash 46 | python3 /predict-service.py 47 | ``` 48 | 49 | ## Model selection 50 | 51 | ``` 52 | python3 /model-selection.py 53 | ``` 54 | 55 | ## Model serving 56 | 57 | ``` 58 | # Install KServe 59 | curl -s "https://raw.githubusercontent.com/kserve/kserve/v0.10.0-rc1/hack/quick_install.sh" | bash 60 | 61 | # Create inference service 62 | kubectl create -f inference-service.yaml 63 | 64 | # https://kserve.github.io/website/master/get_started/first_isvc/#4-determine-the-ingress-ip-and-ports 65 | INGRESS_GATEWAY_SERVICE=$(kubectl get svc --namespace istio-system --selector="app=istio-ingressgateway" --output jsonpath='{.items[0].metadata.name}') 66 | kubectl port-forward --namespace istio-system svc/${INGRESS_GATEWAY_SERVICE} 8080:80 67 | # start another terminal 68 | export INGRESS_HOST=localhost 69 | export INGRESS_PORT=8080 70 | 71 | MODEL_NAME=flower-sample 72 |
INPUT_PATH=@./inference-input.json 73 | SERVICE_HOSTNAME=$(kubectl get inferenceservice ${MODEL_NAME} -o jsonpath='{.status.url}' | cut -d "/" -f 3) 74 | curl -v -H "Host: ${SERVICE_HOSTNAME}" "http://${INGRESS_HOST}:${INGRESS_PORT}/v1/models/$MODEL_NAME:predict" -d $INPUT_PATH 75 | 76 | ## TODO: gRPC serving. Not working yet 77 | # Client-side requirements 78 | python3 -m pip install tensorflow-metal 79 | python3 -m pip install tensorflow-macos==2.11.0 80 | python3 -m pip install tensorflow-serving-api==2.11.0 81 | ``` 82 | 83 | Autoscaled inference service: 84 | ``` 85 | # https://github.com/rakyll/hey 86 | brew install hey 87 | kubectl create -f autoscaled-inference-service.yaml 88 | 89 | hey -z 30s -c 5 -m POST -host ${SERVICE_HOSTNAME} -D inference-input.json "http://${INGRESS_HOST}:${INGRESS_PORT}/v1/models/$MODEL_NAME:predict" 90 | ``` 91 | 92 | ## Workflow 93 | 94 | ``` 95 | kubectl create -f workflow.yaml 96 | ``` 97 | 98 | ## Debugging 99 | 100 | Access the trained model 101 | ``` 102 | kubectl create -f access-model.yaml 103 | kubectl exec --stdin --tty access-model -- ls /trained_model 104 | # Manually copy 105 | # kubectl cp trained_model access-model:/pv/trained_model -c model-storage 106 | ``` 107 | 108 | Run TFServing commands in the KServe container: 109 | ``` 110 | kubectl exec --stdin --tty flower-sample-predictor-default-00001-deployment-84759dfc5f6wfj -c kserve-container -- /usr/bin/tensorflow_model_server --model_name=flower-sample \ 111 | --port=9000 \ 112 | --rest_api_port=8080 \ 113 | --model_base_path=/mnt \ 114 | --rest_api_timeout_in_ms=60000 115 | ``` 116 | 117 | ## Cleanup 118 | 119 | ``` 120 | kubectl delete tfjob --all 121 | kubectl delete wf --all 122 | kubectl delete inferenceservice flower-sample 123 | kubectl delete pods --selector=app=flower-sample-predictor-default-00001 --force --grace-period=0 124 | kubectl delete pod access-model --force --grace-period=0 125 | kubectl delete pod predict-service --force --grace-period=0 126 | 
kubectl delete pvc strategy-volume 127 | ``` 128 | 129 | -------------------------------------------------------------------------------- /code/project/code/access-model.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: access-model 5 | spec: 6 | containers: 7 | - name: model-storage 8 | image: alpine:latest 9 | command: ['sleep', 'infinity'] 10 | volumeMounts: 11 | - name: model 12 | mountPath: /trained_model 13 | volumes: 14 | - name: model 15 | persistentVolumeClaim: 16 | claimName: strategy-volume 17 | -------------------------------------------------------------------------------- /code/project/code/autoscaled-inference-service.yaml: -------------------------------------------------------------------------------- 1 | # https://kserve.github.io/website/master/modelserving/autoscaling/autoscaling/#create-inferenceservice 2 | apiVersion: serving.kserve.io/v1beta1 3 | kind: InferenceService 4 | metadata: 5 | name: flower-sample 6 | spec: 7 | predictor: 8 | # https://kserve.github.io/website/master/reference/api/#serving.kserve.io/v1beta1.ComponentExtensionSpec 9 | scaleTarget: 1 10 | scaleMetric: concurrency 11 | model: 12 | modelFormat: 13 | name: tensorflow 14 | # This is only needed on Mac M1 15 | image: "emacski/tensorflow-serving:2.6.0" 16 | storageUri: "pvc://strategy-volume/saved_model_versions" 17 | -------------------------------------------------------------------------------- /code/project/code/data-ingestion.py: -------------------------------------------------------------------------------- 1 | import tensorflow_datasets as tfds 2 | 3 | datasets, _ = tfds.load(name='fashion_mnist', with_info=True, as_supervised=True) 4 | -------------------------------------------------------------------------------- /code/project/code/http-inference-request.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import 
json 3 | 4 | input_path = "inference-input.json" 5 | 6 | with open(input_path) as json_file: 7 | data = json.load(json_file) 8 | 9 | r = requests.post(url="http://localhost:8080/v1/models/flower-sample:predict", data=json.dumps(data), headers={'Host': 'flower-sample.kubeflow.example.com'}) 10 | print(r.text) 11 | -------------------------------------------------------------------------------- /code/project/code/inference-client.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import base64 4 | import grpc 5 | 6 | from tensorflow.contrib.util import make_tensor_proto 7 | from tensorflow_serving.apis import predict_pb2 8 | from tensorflow_serving.apis import prediction_service_pb2_grpc 9 | 10 | 11 | def predict(host, port, hostname, model, signature_name, input_path): 12 | # If hostname not set, we assume the host is a valid knative dns. 13 | if hostname: 14 | host_option = (('grpc.ssl_target_name_override', hostname,),) 15 | else: 16 | host_option = None 17 | channel = grpc.insecure_channel(target='{host}:{port}'.format(host=host, port=port), options=host_option) 18 | stub = prediction_service_pb2_grpc.PredictionServiceStub(channel) 19 | with open(input_path) as json_file: 20 | data = json.load(json_file) 21 | image = data['instances'][0]['image_bytes']['b64'] 22 | key = data['instances'][0]['key'] 23 | 24 | # Call classification model to make prediction 25 | request = predict_pb2.PredictRequest() 26 | request.model_spec.name = model 27 | request.model_spec.signature_name = signature_name 28 | image = base64.b64decode(image) 29 | request.inputs['image_bytes'].CopyFrom( 30 | make_tensor_proto(image, shape=[1])) 31 | request.inputs['key'].CopyFrom(make_tensor_proto(key, shape=[1])) 32 | 33 | result = stub.Predict(request, 10.0) 34 | print(result) 35 | 36 | 37 | if __name__ == '__main__': 38 | parser = argparse.ArgumentParser() 39 | parser.add_argument('--host', help='Ingress Host Name', 
default='localhost', type=str) 40 | parser.add_argument('--port', help='Ingress Port', default=80, type=int) 41 | parser.add_argument('--model', help='TensorFlow Model Name', type=str) 42 | parser.add_argument('--signature_name', help='Signature name of saved TensorFlow model', 43 | default='serving_default', type=str) 44 | parser.add_argument('--hostname', help='Service Host Name', default='', type=str) 45 | parser.add_argument('--input_path', help='Prediction data input path', default='./input.json', type=str) 46 | 47 | args = parser.parse_args() 48 | predict(args.host, args.port, args.hostname, args.model, args.signature_name, args.input_path) 49 | -------------------------------------------------------------------------------- /code/project/code/inference-input.json: -------------------------------------------------------------------------------- 1 | { 2 | "instances":[ 3 | { 4 | "image_bytes":{ 5 | "b64":"/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBwcJCQgKDBQNDAsLDBkSEw8UHRofHh0aHBwgJC4nICIsIxwcKDcpLDAxNDQ0Hyc5PTgyPC4zNDL/2wBDAQkJCQwLDBgNDRgyIRwhMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjL/wAARCAErASsDASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/8QAHwEAAwEBAQEBAQEBAQAAAAAAAAECAwQFBgcICQoL/8QAtREAAgECBAQDBAcFBAQAAQJ3AAECAxEEBSExBhJBUQdhcRMiMoEIFEKRobHBCSMzUvAVYnLRChYkNOEl8RcYGRomJygpKjU2Nzg5OkNERUZHSElKU1RVVldYWVpjZGVmZ2hpanN0dXZ3eHl6goOEhYaHiImKkpOUlZaXmJmaoqOkpaanqKmqsrO0tba3uLm6wsPExcbHyMnK0tPU1dbX2Nna4uPk5ebn6Onq8vP09fb3+Pn6/9oADAMBAAIRAxEAPwC9A42ir9vA0nOOKxYJhkDqe1bNvO0ZAYdaIsC8LLjOwH60yWDAwY1/75qzDcDAz0qfhl55BqxGE1pCzZwVPt0qJ7MgZQbh7da1Z7bncBVQgoaVhlGFvKlBIwOhqxPFxkdKmdY5xiQYP94daaqtGPKkO5P4X/pU2AoKMMQatWv+tAPXpTJ4ipyBTVYqwYHBFTezA1ivHNRsuRU1tOlymOBIOo9aVoyGNaCIEHanEEEMKXbg07BAx2NI
CXO5KrvwcVPEcDFRyD5qTYDYhuPuKnA4waitxmQj1FWGX9Ka2ArODzUXU5qxIM81AODzUtjGzHMfvVRcl6mmOMio4V3PSAtwjBUd60l+6DVCMAzH2q6D8v0qo7CIJ3xmsqQ8kmtC5YAVmyctntSbGRkDOT0qWMFyABUWNzD0q5EuxM9zQgJQAqgCkJxS9vemMasA3c8CpFPHNRBgBkinBvSpuBMGxRnPWo1561IOlMBQMEU2R8DFKW2rk1XdsmgCN+TmqskuHIqeUhVNZMkoZyckZqQILTi5UntzWtHMOVbpWQh2zCr6jIBpRGzUjl2jBPHY1chuSODyKx4pOzdKnVyh68VYjbDBlyvSq88G4bhVeG4Kkc8HrV3eGUEVQjLkUr+FRmQgYzV+aMODxzWdIpU0mMerh1wahdCpPvTN21gQamB3jB+qn1rOQDI5GjcMvBFbdvMt1FkfeHWsJhzU1pcG3nDZ4PWlGVgNd4+MigL8uKscMgdeVNRsAORVsRGFwc1G45qfKg/MM/U0jLG3RQPxNS2BCh2OG9DVxwM57GqxRQc8j9asp80I5zjiiIyu64zVdhxVtwMVVak2BUlOTUlumATTXXmpPux0r6AS2vLv7GrLNtFVbM/K596knbgGqT0AqXLZeqbgsRU8x96hJzgCk2A+JPmA61PA4mUSL9wk7fcetULtmEMdvGSJrltgI6hQMsfwH6kVqRIsUaqgAVQAAOwFUgEJ7UwDOc1Ky55/OmtgcCi4EZ6UqqSc0Hk4p46igB44pQaaM5NI7hVx3qkA2V8nHaoAdzE9hTZHOMd6ZczfZoQq/fNDArahcgAxLyf4iKzs0OxJ5696ZUDQP97NaVsdyg+1IPszHlFzU8SRg4jGB6VSQh3linp02mpQm5enNJs9aoBoynfirMFwVOD0qADjDUn3W9qANIsGGQeKqXCK3PekjlIOCeKfJyN1AGXIMZFNik6xscc5U+hqxMgbPrVFwVas2BezvXOMOPvCo2GD7UyOXOG/iHX3p8hGzdn6Vm0M0rG8ZLYxtzz8pp0lyx/iNZUMpzzVkturURKZGP8AEfzpRMw6Nmq5HvTMspz1pAaUVzzhjiptxjPmRnjuKyBNzzxU8NwUbDcqaXoBreYJU3L+VVn5zTEcRvkHKNUjcE4qZdwITyabK3yGpG4GaqzN+7qG9ALNicwn/eNE75UgU2zb/RQfc0krY4rS+gFZgcc0iKM+9Kc81FcI727QxnDyjbu9AepoWrAZpv8Apt7NqB5jA8mAeig/MfxOPyFa4HFQ20KW8KQxrhEUKB7VYXFWAvlkhSDx1PvUchwSAKlD7Uyep6CoS3UnrU9QGHg0DJ5xSb8mjdjvVAOZ9oqs75JOaJX3Hg1GBmmAKRuLt0Xms24lMshbt2qxezgDyEPuxqkxpNjImo4pGOOarmbk0gJvMINWIbp42BB6VBLC0Z9qjVsGjYDqrWVJ4w6n6j0qcxbh71ztndtbyBlPHcetdLayx3Me+Ns+o7irTuJkDRDvwaYVIODWg0IYc9agkgBGDTEUyCv0pwc4wac8ZTg9KjYFRSAil6ZxVOYZFXGPBBqpKKljIFJB61KzFlqJhTkbPBrO+oxysR0qwrkjk1Wxg1IoPBJ4PpSc7BYkOfU0zzHXvke9WNuFBHQ1A/BrKVRoaQm/dweDShyOOtRZB+tAPHNSq6HYv203/LNuh6VeVvkweq1ioxB+lacUm5Nw7jBrVTUoktEsp+SqNw2F4q1I3yCqM5yQKwchpF62Yi0/GkcHgmkh4gAoOSK1UxEfVuKdbKZH8zseF+nrTGO0qo6scVciXgYrWG1xDxwcUm/GQKc3FVS3J5qmwHmUkknoKYXJGaZuBzTd1JMB4PBzxUUkhPApWcnpTFUk1SAdGrOcVW1S/TTbbIwZW4Rff1rQXbEmSefSqC6bHPdNd3Q86U/dDfdQegFUBkWYurnlI2YseWPetSPSZW5llC+yjJrUVABjt6CnHihJICku
lWqcsGf/AHjUwt4QMCGMD/cFSM4HWm+YKq4HO/aAww3NRuqtypxTNpFIMisrjFDFDg1ctbySCQSRsQf5/Wqm4MOaT6GlewHY2OrQXWFciOT0J4NaDRq68/nXBLIRwa0bTVLi2wBIWX0bmqU+4rHSSQcFW5HY1TkiKZBGRUtrq8NyNr8N6VaYJIvysCPar0YjGdMDOeKrSL6ng1pXFuUJxyDVCSMgH0qGMqMvao84NTOp61XYkHmspDRYXBxnpSFjG2DyKbEdwK96eR5ilT95elZSKRYglBHlseD0NJKpU4NUlk2nB6VcjlEq7HPzDofWsJTvox2KzcZojbccGnSrgkVCpIkFcdaorblJFg8VctHzlapBs1Ztf9bTw+K01CUS07dvaqMh3TqKulSSTjPHWqCgGdnzkKOtYLGxbDkZoxnEYqR4j5e4HPqKrwncAO2M1X1jVRp+lXFxn7inHu3QCu6E3JKzIaC1lF3qk+05jtgIh/vnlj+AwPzrXUYArnvB0LLoUMshJknZpXJ7kmuhLAZxXcpIgimYjPNVd3HWpJnLHFQgcYp81xjht7U4AGkGFpDPt4QZNaIQ9YiRnoPemng4Tk+tIFdzmRjj0p5dEHFWmAqRfxOeakyAOwquZ93Sk5PJouBM0uKjMrHpUeRn5qQydhRcB3Pc0vHrUYJanbT6UwMX6ikKelafA/gX8qRljPWJfwzScAMsxjNJtx9K0Ht42HykqffkVA8DpzjI9RzUOLQ7lXbijDdRU2zIppX3qRipJ0z1rQttQkj4Ylh655FZjLzQMryDRewHUwXyTphiDnof8aZcQYO5eRXPw3BQ5U4Na1tf5Xa3I9KtSuTYgkjK5x0qpIpwM9K2HRJlLIefSqEsRGR+lRJAUVcxuCO1WpDlROn4iqknTp0p1tPtYo3KNXJOdtGWlcllUMBIvRuv1oGVOD1FOWIiRIz91zgH0qS6VoNwaMgKeJQMgfWvExGK960TeMRpPmqMnD4796JIQsf7s5lGAQT61YeSOS8SFwFkH3WHRuKq6fFJdajMZ3aMRtgqByT269q4nUnJXb2KskMjUiIS7sl32hfTtWhaArIQ42kdaaY/7PvzG6+dFMSWfHC91+nvS2e+Sab7QORNsUZwG7jn0xmpc3a6FYnDpC4lDEl/4G6DtVa5UqRDEADKTj2Hqas3Nkz6hGZGxCFJLL/FzwKluIykkQgQtC53N3Kn0+maxUrNMvRoiEZWPapAJ71keItPkvdPYiNnjg+cQjOZT/8AW61uOY1cGUcryFzUgkZ0JVsDtXfDGWmm9kZuGhX0qJLXT4IACuyMfKe1WZJQFOKiKlSCWycfnUUjjua9ali41PhZk42EbLHNISFFM83nA5pVXPLV3Ql2IaDBf2FOykQycUySUIMDrVZi0h5NbJiJnuGc4WkVWP3qYo29OtSKrt34q0wHhgo4GaCx/CnLCw561OkJP8NWIrBd/apVtj3qysI74FSZjTimBElsO1S+QKQzf3cCk3v60AUiqEcVG0R7DNPIZT92kLY5yRWgFd0I7UwMyGrLHPYGoXQHkVLAYYUlGUwj/oarMhRyrqQam3FDU6yRzoEl7dG7ioaTAzyuR0phQ+tXJreSLn7yHow6GoNhas2h3K5UA5zSrIUI5qQx5qIoBwRUMZehvMEZbB7GrS3Mdy/lMQk3YH+L6HvWJnHHpSs0c0ZinBKeoOCp9Qe1Tz2CxcvY2jY8VXgYMrhhhf73pTY728geO3ukW9tXztuPusvfDe9aECQmMqE3I/OD1FeVmFaKVlua049yNLooVjLD5MMcjORmrZN3LfmHzVjg2bn3LkMvp9TVeS6htbNXSNTk5RiPSrdhdx3ds0lztIcHJHHHt714cr/FY2JTbwGzDJ80kBym48/5xTZHzLFLLEygryw43+lFvJa/YZF52AZEh5IIpkN6k2yCcOcHdtIxzUaktl10gup02zMqlcFRyM0omRElhk25QYGBzu9qqLav9rl+
ySII+q7mzz6UxHiXzYbkL9oHzGQHnd2x7VLj0Fc0Fml+y5kjbY2OT/CakaRSEMY/eE7do71krqBkgWNllCzMOdp/P0q35aQPEYHd9wKlSMnjvUuFtykyeW1ju51WQsjIm38e1V5LWRSiGcDy+Hx/ETzmpxK0kgl2ERYwzHru+lJe2pn8trc4kzyC3BHrSTa0ZRG7IkQIbcvTJ65qsUeZ2H3VXqT6065kkSfyIrZiQMFmHHHU06TKWyq7ZbGSfU16GB91uUnotTOYxIVU/eyaSRivFAxgHOaGw3yn8K9+hVhWjeBi01uQFWY05YSe9L5wXjbUiTqeq4+ldcSWSRwkdRmp1RVGW4qITf3TxSg7jljWqJJRJGvTrSGZj0OBTfK3Hini3brV2Ab5jNxTgCetSCE9+DR5ZxinYBAqjmjzVpPKOOtHlU7AVRLwOhoyHFZyuw6VMsz9aq4FghajYYHBpPNzzRvDD0pMCB8HtioiQOhqV1Peq0kZBz2qWBbgu2i44ZT1U9DVwWttdjdAxjfvH/hWGWZDx0qWO4dCGBII7ip5h2L82nlTgvtP+0KrPZSjkbH+jVft9VhnTyrtQQeN4qO7025C+bp06TL/AM8pDj8mH9RSaT2AyJo2X7yEH3FVJJFUdeadN4hlsJvK1Kxnt+cbiNy/nTzf6XerkBTnupK1yVdFcpDdME8935SofKP3s9vetcmCxTZcDcm4/Oh5APrVWK4kuA0dtHjbznODTzAbiaGV0EinO5Dkfn+NfN4ibqTvLRHTHRE8losVspkKT23VY+4z3z9Ka8FvayRyQiT7G/OGHCmp44omsnW5LbsHdGrcLjgfXtQdQVdOkhYeblNqoozx9K57vYTIbmWAT2zoGUOSGUDCk9jS3Fwl7fpsuFjZUAdgM5PSobm/hmsIbZnVmaQEJjgge/arN1FHdxWoh8qBgx3N/s/TuelVta5DZWN79gmliMgaVMkcEBqnhube4s5FmTMjjcGbruPpisvVIn0u5jE9wkz3PAcLjaKkb7Pp8kEkL7lkByHP3SMcj3q+RWTW7Iua0OpLJYzQyZ+VMFAMMD2xn3plnfva3Dw3aeXMg4UntjrVC9vXk8m+ijXbG20kA7m96fNqCSz29y6EmL5Hcp0zjGah0tNtylI14dTVpZHPIz90jk1Na5aLzvNBc/MExwB6fWq7zW63cExVBI8fGO/PFSXksUcgMZbMv+s2jhPeudxWyNUyO7vfOuo0BYKAGc+vtUVwr3upCFNywxrukfH8q02mha0kAjUqqZB6Z46VAl15kGyFQCVOBn+tVSqcmtgavoQtDDgorlSOnOahkzCu6QjaOd3bFPQmGVEeAmM43yZ6+uKknaPa8IAdG4OfSvewlanJ6JL0MZJmb9ttWOVcyeyKT/TFC3Fy5/0fTZCv964kWIfhjcf0qyMRjCKFHsMU1mYmvWi0ZsehuMfvPIiOekbGT9SF/lUhl2+hqDDkU3Yx5NbJiLH2kjocU8XLf3jVUITRsNXcRdE7H+Ol85/7xqmEYdDThuHencC557g/epftL+oqllqTLU7gQKvNPCkHBNOVkIG5amVI2Aw2KYEITqKTYQfSrnkHqMGlMB/u8UgKTKfc00oSORV4REdRQbfnikwMxoh0IqJodp+WtY24I55NNNvj+GpaAx2hYcg0+C5ntmzHIw9uoNXmtjk4FQSQbe1ZO62KRZTVbe7Qw30KAHuRlTUN5YlSj2RhEWeU8sFT9DVCZAB2qvDcSQMfLkIH90nINcmJqtQa6lRjqaUTNNI1rtWJ8E7umfUVIoffHapcA7Ry5GCB/WoLC4juhKZGjSUHaoc4/HPTFVPImsbo3VyywhMjy2OSwPpivm3FuTT3OnoXLuAxTC3FzuWYFtxGDwelN06eHTmfjYe4cckVMyiOSO4nlWSXd8qfwjIqS6uYZb+1V9nmxhjz9OKm+ljNmJetBJqdw8mYlAHl4UqPekt5tQnuVeOESW6ggEN6VY8Tajb6
nHBYxnE5bJbHQVDpMdxbXCWECmYjJGDwfqa6Uv3d2tfP8yGJBqcWpTvHdw7Y0G1S33vfH5VB/ZN5GrtegPDJ8lsQ3IPbP1qxc6PEbGW4Fw0FxHlnjK8euKqy67P/AGfBEkb/ALyQYJGFBHaqjr/D2/IhmlYvcaVcpHeRqpC5A/hI+vrVy1vree/vgPnWQj91jqMAdPrVKG6XVZYoZ2ZPJ+YHg7varl8sVlqdvNZkuZ1KMMZIA57VzySbs9xok0/ZCstvdQsuCRtk6qvbBrQWxKwFldjC7fOCfnVewqnA1trLkytiZMojdCh9SO/NT28k8pksN+LhfldsHGPUf0rCd73+81iaBitksnOPlRSWUnhlxVKxaJnCR2hjTqrO2SfpT2tJlb7MzLJCQCW3YJGf8aa8aWDhZWdl6RcYI+prJbWuakrNiCZdwfa3y4+g/wDr1EkYI5/OkjCG1OxtoLDrT9siDBU49ua9fApaNGM77DvIHrxSiFD1wKZ5rZ9KUEsc170DFj/JGeBSi3p6c8ZqYLnoea3RJWNsMcUwwe1XQhPUU7y+MVYGeYcUhirQ8oDjrSGIdSKYGb5dJ5RrQMIPQU3yPaqAykjPpU6R1l29zOhAT5h/d61s28jSLl49n1NNO4CqpHQ08zGPgcmplVT3psiLTsIbHdIxw6496tCNSuQAR6g1mSpg8Ypsd3JC3ysfpSGanlD0pjBV6kVB9viuE2Sh4mP/AC0jP9Kgk02WXm3vkm/2WO1v8Kl+QE8kkXr+VVnngHVAfqaqTaZqEWS8Dkf7Jz/KqLiQZDRupHqprCU5LoNIvy3cfRUT8qpl43kG9FC9ziqbS7eM0eflSvXNefi5TlHQ1glcnu1DeXFbhDvPDHt70+5toZWG26jeeDBUyc+Zj2HP6UkMUJieGEM08gyAR04/SqcNtDYXsVzPdr52SPKAzgnjqK8OPrt+JsTBvPuRDdRPEoG5se/Sori2jsZZFeYusg3K5649KtXLPeX0QikChMh3xnI9qy9ctZY9RiWdzJbsu5SOBmrpq7tsZyNEHTbfRZF8tRJtIEnVmPaq1ldtpzjdJkS4IYDkN6fSqul6XaXBkMrtuJIQZ4X3qndLKLuWISGUQpuDIMFe3Pr2rRQjJuN7kGhrerx3LyNEMoRtlYfxN7Cqtiw1eeGzm/dQRfOXHXPQAVGLL9y7yFEeNcqAfvZ9ahluvs9ukcKmK5XBwBwRnk1pGKS5Ybg0dJZG0jtpIZo8BWPzg859aS2ubi2vY5bmMIJlzC56FQeh9D7VmWIE9rJO75kzkN0GRWhLNeajosyx2jTBfmIf5SPcVzyjrZiHyXTx628kUQ8iYgGQdA2Oa2pIUWKO5tpHNxwjlv4/rWNp0M2raaFtisYUhgX7MK0tPiluoj9omEc8ZZQo6bgec1hVVvloaRZZmWW1jFzNIhiYbWIJ+Q1E01rqN0oeSR1UcBBhc/jz+lWbWKa8jIZAsQOJFc8v7AVS02IxFHkCozE+WhOSR3rFWs31Rqi7A0LRSLblWBGACw+U++Kqi4uI5WRAzhTjIU4NTzzWUEx/erHu6j3Hf9azl1GVh8x4J4NdeCS573aImaS3Of8AWwMPcCp0MMnKPj2NZi3b/wB6p470j70aNX0tKatqc7RpCNlORyPapApzmq0N1bN/ejPvyKuo6EcSK1dSVyRytinja3UUmFz1FL8o6EVdgHgKKXYDzTMgdx+dAkIosA4xDsKPJpPN9qPNHrTA5tBs4Xge1So5HrTTG5HCmmiKQ1QF2OUHgnFSk8etU44znk4NW1QbeTzTEV5Bmqrrz3rSdFxxUDL3xUtDKBB7UbmXoSKtNGOuKYYx6VLAE1C5i+5Mw9qe2r3OPnWN/qtRGIdqY0Xqal3AWXUYHH72yjPuKoSzacTuEBQg9hT5oevFZ8sLHPFcde7VmXEn/eeYtzbdeQDnFPu7KVZo5UkhLxsCD94OfbHWqUbSxgx5zGex7VYa2l/s6NxM0pY/
u1QZI+mK+dqRcJnQndGhePGbQO2yMR/MvOM//rrOtymqTML5CsbMNik42iq0cTx2ciXAledJVJjbkIvX86YJUmuw5Lquwgdt/wBPWkoct7feTIfYQ20cs+/zwsbsTsU4I7HNJLo9zY28+oROHSVsuO6qff1ratLiA6O0cpHkiPGdwB4//VVW0uWmsxazo/lzZBcnoMfzo9pK7ZNjKvbeC3KusjzRsnC9dvp+FVHhj+zRsvF2W2kdSR3/AAxW42nyaXL9ktla5gnXO6Q8r+fasW3MqXbwsoM7AgZ4xj6/Wt4Surp3GQ6fFK8skUrEKW+6p4BroLLUpo7l7JciUJkuT8uK5mO21CPUGO8tKnLFDnIqzHdOmpSOJCAVG5mHJp1aanfroTY6K0uDpz/ZCWIGXDp3z6+9WLF7i9vLohkiBfKt6ZHSsHQonvdRmubySRguAFJxx9PSukhsltdRk+yN8kxBCHsTXHWSi2upcS3LMdJkiSWT5X4STt+NUY1k/tF4CVBtiWjdTxIjc8ep57U3Uzf7lR7fdCGyCg3dK0LuaKysYVcDzCwWM7eVz3rFaLu2aIpzWUcTPHdReZFIN6S9Gz3z71Sm0+SNRJC5mg6hh1H1FS3LzvcrNNcJKpGFIOFGDyMdj0qJFubWZpbdv3bHJj6r9R6V6OC3tIzkuw1M8c1MrleCaso0F4uWType4Hr/AFpklnJGN33k/vCvbhDsYtiLLz1qZZcdDVbbkcCnAHtW8SS6s7DoxqZZz6ms0PjrUqyVqmwNETE9zThKezGqKy89alDe9UmIti5cd81ILs+gqiGpd4qrgSC4hI+9SfaYecGssI1OCMO2adwNDz4ienNPE47CqSIT2q1FCfwpgTGTd3xTTz3qVIgKlEQ7UxFXy/SnCLParax47VMsakdKVgKItQad9i9av4A6CgjNS4gZUlkDxjNUbizCg5wAOSa1r6+t7NP3jZbsg5JrltR1Ca+yp/dxdkHf61hU5UtSkmV5prWRinmYj5yw74qKUyCNYbW4KKRwu7H41UWFmc47dfap7aJ5bwY+Zdp3c8189iYWnzNnTF6WHWt3BZwtbvJvn5J2Atkn1Nal49tdaKN6JuhT5Gz0z1Hsax4IBaXztIuUYbd392kupLaK4SRJVZUO5152nH9a5nFSkmhstyWMFxpyQwnDMR5f/wBeoL++utPgt4ZIkw7ArKG+X3xVgSpNJBLYoZTgsQPlO08d6ytZvTq9qbeFNkdq4355PPAIPpnj64rSlFylaW35EMty31ydUTYyOHUDviMU24kuBDJbG0mdo23m5Vcqozyc9/oKct5ZyaD5EETmXAwEUhg44zmtXSdSh/stLedlLbPLkXqzk55x1NKT5FdR2AyIMLcuYZwUCgs55JaibRp49MM7zI4yGYAc8ntV2Hw1DFpzm2u9kj/vA7DgL6GmWp1P+yWHA2pmNxxkgcAj86PaXd4PqFjVleCLT2n2YaNRtK8MQKp6XqM020mPczyEluwH8+P6UzTFuDp9159s8t42XiMg4ZSMEfzqTSrhNTsrqyMUdtK6HDdAD/jWDgkmnrqWhZXv4tUMrMwRslGU5GPatO5gklaK4wJjt+cDnI6jHrWZps88McNnKMCJXRyeQw7Y/WpVvbqynIIV4WO5UIxtB7D6UKnzTUU7D5rIsX1vFqESmJlSc/dbOAf9k+lYkTy20xjdWRgcMp4wa12tbbUpfPspzbXf8SP91/qP6ipJrZpcRX0RhnAwsvUH8e4r2sNhfZxtuYylcpiXOD1PrV63vcEBzj3qhJDLaNslXGfusOhpEkB4rtjeJD1NkxRycj5Se46GoXt2U5x+VVIrhounK+ladtcLKOPxU9RXRFpk2KZiI5IyKaU9K1WgBGV6GoXg9sVaQijgqc1IrVIY8cVGVwaYDw2aN3tUfIOMUZx1FMColwelTLcetY8EzMil12MRyKtI/vVJjNVJ6sJcZ4BrIV8fxVMk2OlO4jYS4Gcc5qdJ
Wx/jWRHcE9MCrUcjN1ancDSDnuRUitVISKg3OwAHcnFQTauqjbANx/vEcUm0hWNZ5khj3yMFHqayLzV5HBW2XaP75/pVCSaS5cF2Lv2H/wBanraTN94rEPVzj9Kzcm9h2KMgLOWclmPJJ5NMispbs/IpCeta6WllEMyFrh/TotLPMzx7eEjHRF4FZOn3Hcx5beCBcMd2OiL0/E96zri6k6Q/JngBa0bqMYJNUU3JcBYY98jDCrjrXFXg+iNIsqbbrbHGznjgA9s9/rVu6sopIo7bcFt4z8zL1Y+/tV2a0EUCNcYeQDkZ4qqLh50MBUfLli/oo7f0ryqlKpFq+5opJk8NjLqdvLcRwxRAjYrg4woHU+grHmt1tdFuJIiRbtIPNbPMuD8oX2J5/Crbx3Oo6WzjfHHI5IROFYD19aFtfK0R/MkEqRfejJ/iPQD/AD61nB8ujfXYe5HGFn8PGHTS0JmfLGQ8r0OM1c02EmTUXhnie7eMJFt5wcYYj86ppNqEcuLe2BtLlVBQj7uM9PSrSSHS7iK8MLbZplXy2Od2Rg49MD9cU5t2aXXX/hwSGH7Tp2gW0UluxYytG28cBc5x+PrWxFdb7dmtoZjE6kR5GSB0/Qg1nWLtNrV2ZhvhY/JC/IxngAU2Rv7S+aKV7ezAbyVX+IBiCD7ZH61lOKk7P1v6jSJbfUdUknaARIG4UkKScDv+NXLCG2mvJpEjLZdmDdmGc7x7+vvVH+0jZazCiZaBIkDKRyD3x+OKtIrmUgYJD70QDgxtnHPfP9KicdNrXGMuruKe/jmRCPLYDPTI71PehhJluR2rLinxNIjLlCxwO4rbhxc2a5O5lG0n+tehhsLrcznIz1GDuGQ3Wtez1QhPJuR5kXTJ61mvGUbHSmjj8K9SneBk9TpPscFzbkQuHjP8BPT8e1Yl5pz2bFgCY/UjkfWi3uXhbKMVP1rUi1NpF2yBXHQ5711WjJE6owgeeePrT1ZlYEZBHStZ7fT5udjQk/3Dx+VQPpqE/u7kH03Cj2bWwXLNleiQhJCFY/ka0Gi4rDNjMvRkb6HFalhcvgW9wCrjhWPetI32YmDwioHgz0rTePNQMuKuwjKeEjmm+Wa0mjB7VEYOaLAciBTwxA60pX0oCVBYqyN3qQSEdqjwT7U4KR0paiJ0uGHQVMLyY8LxVUDHWpFzjHXPQDvRdgSmRmO6Qlj6ntVy2s2lAd/kQ9B3NPtLEIFknAz2X0q8TnhapR6sVyEQiJSEG0d/ekEZ/CpxGep6UpPbFOwiuI6jljwOetWSCBmq0uM5zxUtDKM0TSuscalmY4AFaVvYRWEJPDSsPmf/AD2qzY2/lp50g+dhx7CmXj449qhxS1C5g6g5lc/3RU2n6YptHkuB8snUHuOwqWO0Nzcqh+71b6VpXK7k2Lwo4Fc6pJtzY79DnL28nEj+S+yMcAYqsqg2gnmJkVW3yKeuR0rQvbUD5QKqTxFNKmI/vL/hXmYjDato1jIhsZDc2s0wfypSSsag446ULp8MtzBh2PkoN3s3p/WmwxsbSJVGGLFVA75rXMH2BYY4xnAO73NcyoVHzOJfMjOa+hkQpLGY9pZS6KeG7HNPsw9ksdtKo+yWzM5P94MD/ImoJ4AZSSOSckGtJIftGnvGRmRUIX8ulNYZuNkg5tSOaWOe6ilt412ttQkjtmogkmnak7x/xZHPpUGmyAEQsTw4Zfz6V02pWAY7wPxrqw2D91qREp6nLSoRLkjk81oadP5MoVj8j8GmXMJB5FQbSGx2rtjHkehO5vXNvuGQOaznQqeRWjpt2J0+zSn94B8p/vCn3FqRniuvlT1RBkinq5HtTniKE8UzGDQlYLlpZg3B4NSiXFUgTUqscVohFoSZ6GpEmI4PI9KqA5pwY1SYG/DOJUAPWnOtZNvcGNhnpWj52UDKMjvVCGOtR/jUxkVqaVGaAOU8s96UR+lTDHfmjAzUjItgHWkK1IRTcjtSGN2/hWpY2ohAlkHznoD2
qCxtwzea3IXpn1rRxk04oQ8ksetSLGKRIwe9TFcAAVYhp4HFAJHbmpQnA9ajkzyAaTArvhs+lMihEtwoxlRyaGHBPartpEIodzfebk1G4EsjhEz7cVkykuxY9e1Xbh9zEdhTLWIPJvP3V6fWolq7DC2hMMOMfvG+8fT2qb7P/e5qdU4J6AcCpjHhQcU7CMO9h3MTj2rLvYtunMv96RR/M/0robtADjHasfUF/dwJ/tFz/L/GuepHdlIj0Wx82VXYfLDkj6mtOa2Lv0qzodvt0/eRy7E/0q48YBAxzVwpJQSBvU5O8tSr9Kdakoy1sX1sOpFZJTyx+NT7NJhczr60MF84TgN86H612cDC+0+Gbj50BP17/rXP3sXnWaTj70R2t/unp+tX/DlySklqT935l+h61UI2k13B7Fe+ttkmMcVjSxlJOK7O8tw65xmudurYhjjtVSgCZQjchgQcMDkH3ro7O5W+t/mx5q/eHr71ze3axJFWLad7aVZU7dR6iiGjBmvPb5BwOaz5IipzitsMlxCJU6NVaaAOpwMGtrEmTjnpSqKlkRgQccios880rDJR0oBpEp7DPIpgOU81agnaM8niqPSnq5HemgNcbW+YdDUm32rNhmKnrxVoS8fepgYGQBTS5zxUXmUbsnrSAk6n1qSOMuwUVEpArRtY9ibm+81CVwJ0QKqqowBxVhIwelMQAnHep1wBgVQiRUxxT9nPJpiMfTJpxbPTigBXGBwaquCRkmpJGO04NVmY49qhgPiTzZVU9Op+lXZZAM1UtPlV3IxuOB9KJZAX46Ck9EBHIST3rQt4wkap36tVS2jMjbyOAePetSKPb1/GiMeoMcibm56CpXXI9qcigCkm4iNNrQDKusFyfwrJvQDPjsigVryYZxnp1rJ5mlyesj5/WsZrSw0dLYw+XYQr6IKc6j0qaMbVC+gpjjn8a3toIpTx74mB61h3URDV00iA8isq8h+UsB14qXEDNs2RmaCU/u5QUb2z3/CqVpI+n34Zlw0TlJB7dDVhl8t8Gm36b9lwOS/yv7kDr+VZtdSkdaCs0QKkFSMg1kXdttdsUnh673I1q55TlPpWpcxZG/HNarVXJOTnt9ueOtVNpXiuimgDBhisua1OCR2qXEdxdNvTbS7HP7p+vsfWtiYY5HQ965sqy9a1dNvRIn2aU8j7pP8AKqi+giSRBIMj7wqm6ZJBGDVyVdrHtULFXODwabGVFJXqKnVtw5pjrtYhuaFyOnSkBIycVHjHFSqcikZc9KYDQeafvPrUJ4NLvNFwME3IHemG9iTlpAPxrg/7QnbrNIf+BGpraR5ZQoyWY4FZ8wHoWnTx3cp2HKJyT2rdjOTWJpUC2tskI6jlj6mtpGAFaoCynB4HNSphScjJqBHwcipN/HuaYibzMdOtG4EYqEMCPencAHFIBJXAXHaoDlwOwpxy30pBjIHYVLGSb9sfHbpUagyuFHUmo5JMtgdBVqyjyTIe/A+lK12BegQAAAcAcVaUHNMRMKKnXGMitCSUDAqG5bjFTZGCap3DDGaljKFw2I5Gz2x+dUrRN17AuONwqzcnMeP7xqPThnUY/QAn/P51k1eSGjpM4HvUTHrTlPHNROeSexrckc/K8duaguIg8RHerIGUP0pjr8ucdKAOcuYjux3zUGzzIJIT1IyPqK1b2Hdll6is4fLKDjvUNDRSs7g2t3FMP4WGfp3rtjiSMHqCK4adNk7pjjJxXV6Ncefp6AnLINppQ7DYyaHDHaao+ScsOtbE6dGFUnXkkVpYkyprXcM4rPeF4XyMjB4I7V0YUEGopLVXBBHWpcRmfDdi4jCScSAfnUchwxp8thhuOMVE0Mu3ruxQMBJng80dDkdKrsxQ8jFOWQ9c0XAtA57Yp3eoVcHvUm71oAR1zUW2pjg0m2lYDxFQc1saGmdQiz25rPEfNa+iri9U+xrGO4Hd2bYArSjO41j2rHArSSTA4rdMC8JNowKcCTjmqqNuFTqcDmmB
MCQODQWLcA8UzOeO1Ix7CgALYXimF9qZNVp5xvWFD8xPzH0FEkmSBUtgSpmSRVHVjW7bxhVAHQcVj6aheZpP7vArbT5acUJkwFTj2qBOmTUobkYqhDyflNUJznirjHGRVC4btSYylcHlR7Zo00/6cfZD/MUydssx7dKTTmH21vdD/MVmviGdErfJULHAIzxmlDfKKimPHHrWxJaTlCO9SDBGCO1QxN8n4VMBwPWgDPuo8ZI6VkSRgOCOhreuANprGuEKscdM8UhmZqCbLkH+8oNaHh6fbO8RPDDIqnqfIhbvgjNRafN5F9C3bdistpDOycZ47VQkADlfyrQPIyOtVLlAQHHUVsSVhgZp+3IBFRZwwJ6Gp044HQ0DIZo8jP51QkXYT7VrlQciqVxH19aGBlzqMhu3Q1WeLbyvFXJB1WoFPGKzkNFfJU09Zh3pzpmoHSlcZZ81fWk89fWqmD2pMUcwWPMQvNXrBjHcRkddwFUhVqDggjrWC3A7W2YgYrRjY4rMtTmNT32itJOgreIFtGx0qdSe/Sq0fT8amXk1YibJPPaqVzfAApFye7Ut8zARqCQrZyB3rO6jmonK2g0iS3bMpYntUzP3qCH+OpO4qUNnQadH5dqmep5NaAJ4qvB9xfpU461siCdTlQTUhOce1Rp92pOxoAbI2Kz5z8w/OtB+1Ztx95vpSYFKU5TNRWLldQUeqGpJfun6VBZf8hNf90/yrNbjOkV8pmoJXOCD1Bp0X+pqK4+6a2EXbdsoAatRnjPeqNt91aux0IRDMODWTcja49DWvN1rJuvvGgZl34zAmezGs8NtYEdjWhf/AOpH1/xrPHU1jLcpHbW0omtY3B4YCkccFT0NVNGJOmLn3q3J0rZbEmbJmOQoenUVLBJztqO+6p9aZGTvBpAaQ+YfSoJUzk1Knf6UjfcNUBiXSFTVHcUc+hrTvvu/hWY4+UVEhkoIYUx0psR+apm6VBRUZcUYFSuKZgUAf//Z" 6 | } 7 | } 8 | ] 9 | } -------------------------------------------------------------------------------- /code/project/code/inference-service.yaml: -------------------------------------------------------------------------------- 1 | # https://kserve.github.io/website/modelserving/v1beta1/tensorflow/ 2 | apiVersion: serving.kserve.io/v1beta1 3 | kind: InferenceService 4 | metadata: 5 | name: flower-sample 6 | spec: 7 | predictor: 8 | model: 9 | modelFormat: 10 | name: tensorflow 11 | # This is only needed on Mac M1 12 | image: "emacski/tensorflow-serving:2.6.0" 13 | # https://kserve.github.io/website/modelserving/storage/pvc/pvc/ 14 | # Note that we are skipping `mountPath: /trained_model` 15 | storageUri: "pvc://strategy-volume/saved_model_versions" 16 | -------------------------------------------------------------------------------- /code/project/code/model-selection.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import 
tensorflow as tf 3 | from tensorflow import keras 4 | import tensorflow_datasets as tfds 5 | import shutil 6 | import os 7 | 8 | 9 | # Scaling MNIST data from (0, 255] to (0., 1.] 10 | def scale(image, label): 11 | image = tf.cast(image, tf.float32) 12 | image /= 255 13 | return image, label 14 | 15 | best_model_path = "" 16 | best_accuracy = 0 17 | for i in range(1, 4): 18 | model_path = "trained_model/saved_model_versions/" + str(i) 19 | model = keras.models.load_model(model_path) 20 | datasets, _ = tfds.load(name='fashion_mnist', with_info=True, as_supervised=True) 21 | ds = datasets['test'].map(scale).cache().shuffle(10000).batch(64) 22 | _, accuracy = model.evaluate(ds) 23 | if accuracy > best_accuracy: 24 | best_accuracy = accuracy 25 | best_model_path = model_path 26 | 27 | destination = "trained_model/saved_model_versions/4" 28 | if os.path.exists(destination): 29 | shutil.rmtree(destination) 30 | 31 | shutil.copytree(best_model_path, destination) 32 | print("Best model with accuracy %f is copied to %s" % (best_accuracy, destination)) 33 | -------------------------------------------------------------------------------- /code/project/code/model-selection.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: model-selection 5 | spec: 6 | containers: 7 | - name: predict 8 | image: kubeflow/multi-worker-strategy:v0.1 9 | command: ["python", "/model-selection.py"] 10 | volumeMounts: 11 | - name: model 12 | mountPath: /trained_model 13 | volumes: 14 | - name: model 15 | persistentVolumeClaim: 16 | claimName: strategy-volume 17 | -------------------------------------------------------------------------------- /code/project/code/multi-worker-distributed-training.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | 3 | import argparse 4 | import json 5 | import os 6 | 7 | 
import tensorflow_datasets as tfds 8 | import tensorflow as tf 9 | from tensorflow.keras import layers, models 10 | 11 | 12 | def make_datasets_unbatched(): 13 | BUFFER_SIZE = 10000 14 | 15 | # Scaling MNIST data from (0, 255] to (0., 1.] 16 | def scale(image, label): 17 | image = tf.cast(image, tf.float32) 18 | image /= 255 19 | return image, label 20 | # Use Fashion-MNIST: https://www.tensorflow.org/datasets/catalog/fashion_mnist 21 | datasets, _ = tfds.load(name='fashion_mnist', with_info=True, as_supervised=True) 22 | 23 | return datasets['train'].map(scale).cache().shuffle(BUFFER_SIZE) 24 | 25 | 26 | def build_and_compile_cnn_model(): 27 | print("Training CNN model") 28 | model = models.Sequential() 29 | model.add(layers.Input(shape=(28, 28, 1), name='image_bytes')) 30 | model.add( 31 | layers.Conv2D(32, (3, 3), activation='relu')) 32 | model.add(layers.MaxPooling2D((2, 2))) 33 | model.add(layers.Conv2D(64, (3, 3), activation='relu')) 34 | model.add(layers.MaxPooling2D((2, 2))) 35 | model.add(layers.Conv2D(64, (3, 3), activation='relu')) 36 | model.add(layers.Flatten()) 37 | model.add(layers.Dense(64, activation='relu')) 38 | model.add(layers.Dense(10, activation='softmax')) 39 | 40 | model.summary() 41 | 42 | model.compile(optimizer='adam', 43 | loss='sparse_categorical_crossentropy', 44 | metrics=['accuracy']) 45 | 46 | return model 47 | 48 | # https://d2l.ai/chapter_convolutional-modern/batch-norm.html#concise-implementation 49 | def build_and_compile_cnn_model_with_batch_norm(): 50 | print("Training CNN model with batch normalization") 51 | model = models.Sequential() 52 | model.add(layers.Input(shape=(28, 28, 1), name='image_bytes')) 53 | model.add( 54 | layers.Conv2D(32, (3, 3), activation='relu')) 55 | model.add(layers.BatchNormalization()) 56 | model.add(layers.Activation('sigmoid')) 57 | model.add(layers.MaxPooling2D((2, 2))) 58 | model.add(layers.Conv2D(64, (3, 3), activation='relu')) 59 | model.add(layers.BatchNormalization()) 60 | 
model.add(layers.Activation('sigmoid')) 61 | model.add(layers.MaxPooling2D((2, 2))) 62 | model.add(layers.Conv2D(64, (3, 3), activation='relu')) 63 | model.add(layers.Flatten()) 64 | model.add(layers.Dense(64, activation='relu')) 65 | model.add(layers.Dense(10, activation='softmax')) 66 | 67 | model.summary() 68 | 69 | model.compile(optimizer='adam', 70 | loss='sparse_categorical_crossentropy', 71 | metrics=['accuracy']) 72 | 73 | return model 74 | 75 | # https://d2l.ai/chapter_convolutional-modern/alexnet.html 76 | def build_and_compile_cnn_model_with_dropout(): 77 | print("Training CNN model with dropout") 78 | model = models.Sequential() 79 | model.add(layers.Input(shape=(28, 28, 1), name='image_bytes')) 80 | model.add( 81 | layers.Conv2D(32, (3, 3), activation='relu')) 82 | model.add(layers.MaxPooling2D((2, 2))) 83 | model.add(layers.Conv2D(64, (3, 3), activation='relu')) 84 | model.add(layers.MaxPooling2D((2, 2))) 85 | model.add(layers.Dropout(0.5)) 86 | model.add(layers.Conv2D(64, (3, 3), activation='relu')) 87 | model.add(layers.Flatten()) 88 | model.add(layers.Dense(64, activation='relu')) 89 | model.add(layers.Dense(10, activation='softmax')) 90 | 91 | model.summary() 92 | 93 | model.compile(optimizer='adam', 94 | loss='sparse_categorical_crossentropy', 95 | metrics=['accuracy']) 96 | 97 | return model 98 | 99 | 100 | def decay(epoch): 101 | if epoch < 3: 102 | return 1e-3 103 | if 3 <= epoch < 7: 104 | return 1e-4 105 | return 1e-5 106 | 107 | # https://cloud.google.com/blog/topics/developers-practitioners/add-preprocessing-functions-tensorflow-models-and-deploy-vertex-ai 108 | def _preprocess(bytes_inputs): 109 | decoded = tf.io.decode_jpeg(bytes_inputs, channels=1) 110 | resized = tf.image.resize(decoded, size=(28, 28)) 111 | return tf.cast(resized, dtype=tf.uint8) 112 | 113 | def _get_serve_image_fn(model): 114 | @tf.function(input_signature=[tf.TensorSpec([None], dtype=tf.string, name='image_bytes')]) 115 | def serve_image_fn(bytes_inputs): 116 | 
decoded_images = tf.map_fn(_preprocess, bytes_inputs, dtype=tf.uint8) 117 | return model(decoded_images) 118 | return serve_image_fn 119 | 120 | 121 | def main(args): 122 | 123 | # MultiWorkerMirroredStrategy creates copies of all variables in the model's 124 | # layers on each device across all workers 125 | # if your GPUs don't support NCCL, replace "communication" with another 126 | # https://www.tensorflow.org/tutorials/distribute/keras 127 | strategy = tf.distribute.MultiWorkerMirroredStrategy( 128 | communication_options=tf.distribute.experimental.CommunicationOptions(implementation=tf.distribute.experimental.CollectiveCommunication.AUTO)) 129 | 130 | BATCH_SIZE_PER_REPLICA = 64 131 | BATCH_SIZE = BATCH_SIZE_PER_REPLICA * strategy.num_replicas_in_sync 132 | 133 | with strategy.scope(): 134 | ds_train = make_datasets_unbatched().batch(BATCH_SIZE).repeat() 135 | options = tf.data.Options() 136 | # https://www.tensorflow.org/tutorials/distribute/input 137 | options.experimental_distribute.auto_shard_policy = \ 138 | tf.data.experimental.AutoShardPolicy.DATA 139 | ds_train = ds_train.with_options(options) 140 | # Model building/compiling need to be within `strategy.scope()`. 141 | if args.model_type == "cnn": 142 | multi_worker_model = build_and_compile_cnn_model() 143 | elif args.model_type == "dropout": 144 | multi_worker_model = build_and_compile_cnn_model_with_dropout() 145 | elif args.model_type == "batch_norm": 146 | multi_worker_model = build_and_compile_cnn_model_with_batch_norm() 147 | else: 148 | raise Exception("Unsupported model type: %s" % args.model_type) 149 | 150 | # Define the checkpoint directory to store the checkpoints 151 | checkpoint_dir = args.checkpoint_dir 152 | 153 | # Name of the checkpoint files 154 | checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}") 155 | 156 | # Function for decaying the learning rate. 157 | # You can define any decay function you need. 158 | # Callback for printing the LR at the end of each epoch. 
159 | class PrintLR(tf.keras.callbacks.Callback): 160 | 161 | def on_epoch_end(self, epoch, logs=None): #pylint: disable=no-self-use 162 | print('\nLearning rate for epoch {} is {}'.format( 163 | epoch + 1, multi_worker_model.optimizer.lr.numpy())) 164 | 165 | callbacks = [ 166 | tf.keras.callbacks.TensorBoard(log_dir='./logs'), 167 | tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_prefix, 168 | save_weights_only=True), 169 | tf.keras.callbacks.LearningRateScheduler(decay), 170 | PrintLR() 171 | ] 172 | 173 | # Keras' `model.fit()` trains the model with specified number of epochs and 174 | # number of steps per epoch. Note that the numbers here are for demonstration 175 | # purposes only and may not sufficiently produce a model with good quality. 176 | multi_worker_model.fit(ds_train, 177 | epochs=1, 178 | steps_per_epoch=70, 179 | callbacks=callbacks) 180 | 181 | # Saving a model 182 | # Let `is_chief` be a utility function that inspects the cluster spec and 183 | # current task type and returns True if the worker is the chief and False 184 | # otherwise. 185 | def is_chief(): 186 | return TASK_INDEX == 0 187 | 188 | if is_chief(): 189 | model_path = args.saved_model_dir 190 | 191 | else: 192 | # Save to a path that is unique across workers. 
193 | model_path = args.saved_model_dir + '/worker_tmp_' + str(TASK_INDEX) 194 | 195 | multi_worker_model.save(model_path) 196 | 197 | 198 | signatures = { 199 | "serving_default": _get_serve_image_fn(multi_worker_model).get_concrete_function( 200 | tf.TensorSpec(shape=[None], dtype=tf.string, name='image_bytes') 201 | ) 202 | } 203 | 204 | # https://www.tensorflow.org/api_docs/python/tf/saved_model/save 205 | tf.saved_model.save(multi_worker_model, model_path, signatures=signatures) 206 | 207 | 208 | if __name__ == '__main__': 209 | os.environ['NCCL_DEBUG'] = 'INFO' 210 | 211 | tfds.disable_progress_bar() 212 | 213 | # to decide if a worker is chief, get TASK_INDEX in Cluster info 214 | tf_config = json.loads(os.environ.get('TF_CONFIG') or '{}') 215 | TASK_INDEX = tf_config['task']['index'] 216 | 217 | parser = argparse.ArgumentParser() 218 | parser.add_argument('--saved_model_dir', 219 | type=str, 220 | required=True, 221 | help='Tensorflow export directory.') 222 | 223 | parser.add_argument('--checkpoint_dir', 224 | type=str, 225 | required=True, 226 | help='Tensorflow checkpoint directory.') 227 | 228 | parser.add_argument('--model_type', 229 | type=str, 230 | required=True, 231 | help='Type of model to train.') 232 | 233 | parsed_args = parser.parse_args() 234 | main(parsed_args) 235 | -------------------------------------------------------------------------------- /code/project/code/multi-worker-pvc.yaml: -------------------------------------------------------------------------------- 1 | kind: PersistentVolumeClaim 2 | apiVersion: v1 3 | metadata: 4 | name: strategy-volume 5 | spec: 6 | accessModes: [ "ReadWriteOnce" ] 7 | resources: 8 | requests: 9 | storage: 1Gi 10 | -------------------------------------------------------------------------------- /code/project/code/multi-worker-tfjob.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kubeflow.org/v1 2 | kind: TFJob 3 | metadata: 4 | name: 
multi-worker-training 5 | spec: 6 | runPolicy: 7 | cleanPodPolicy: None 8 | tfReplicaSpecs: 9 | Worker: 10 | replicas: 2 11 | restartPolicy: Never 12 | template: 13 | spec: 14 | containers: 15 | - name: tensorflow 16 | image: kubeflow/multi-worker-strategy:v0.1 17 | imagePullPolicy: IfNotPresent 18 | command: ["python", "/multi-worker-distributed-training.py", "--saved_model_dir", "/trained_model/saved_model_versions/2/", "--checkpoint_dir", "/trained_model/checkpoint", "--model_type", "cnn"] 19 | volumeMounts: 20 | - mountPath: /trained_model 21 | name: training 22 | resources: 23 | limits: 24 | cpu: 500m 25 | volumes: 26 | - name: training 27 | persistentVolumeClaim: 28 | claimName: strategy-volume 29 | -------------------------------------------------------------------------------- /code/project/code/predict-service.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from tensorflow import keras 4 | import tensorflow_datasets as tfds 5 | 6 | 7 | model = keras.models.load_model("trained_model/saved_model_versions") 8 | 9 | # Scaling MNIST data from (0, 255] to (0., 1.] 
10 | def scale(image, label): 11 | image = tf.cast(image, tf.float32) 12 | image /= 255 13 | return image, label 14 | 15 | datasets, _ = tfds.load(name='fashion_mnist', with_info=True, as_supervised=True) 16 | 17 | ds = datasets['test'].map(scale).cache().shuffle(10000).batch(64) 18 | 19 | # TODO: Visualize the images and compare with the classified result 20 | model.predict(ds) 21 | -------------------------------------------------------------------------------- /code/project/code/predict-service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: predict-service 5 | spec: 6 | containers: 7 | - name: predict 8 | image: kubeflow/multi-worker-strategy:v0.1 9 | command: ['sleep', 'infinity'] 10 | volumeMounts: 11 | - name: model 12 | mountPath: /trained_model 13 | volumes: 14 | - name: model 15 | persistentVolumeClaim: 16 | claimName: strategy-volume 17 | -------------------------------------------------------------------------------- /code/project/code/workflow.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: argoproj.io/v1alpha1 2 | kind: Workflow 3 | metadata: 4 | generateName: tfjob-wf- 5 | namespace: kubeflow 6 | spec: 7 | entrypoint: tfjob-wf 8 | podGC: 9 | strategy: OnPodSuccess 10 | volumes: 11 | - name: model 12 | persistentVolumeClaim: 13 | claimName: strategy-volume 14 | 15 | templates: 16 | - name: tfjob-wf 17 | steps: 18 | - - name: data-ingestion-step 19 | template: data-ingestion-step 20 | - - name: distributed-tf-training-steps 21 | template: distributed-tf-training-steps 22 | - - name: model-selection-step 23 | template: model-selection-step 24 | - - name: create-model-serving-service 25 | template: create-model-serving-service 26 | 27 | - name: data-ingestion-step 28 | serviceAccountName: argo 29 | memoize: 30 | key: "step-cache" 31 | maxAge: "1h" 32 | cache: 33 | configMap: 34 | name: my-config 35 | key: 
step-cache 36 | container: 37 | image: kubeflow/multi-worker-strategy:v0.1 38 | imagePullPolicy: IfNotPresent 39 | command: ["python", "/data-ingestion.py"] 40 | 41 | - name: distributed-tf-training-steps 42 | steps: 43 | - - name: cnn-model 44 | template: cnn-model 45 | - name: cnn-model-with-dropout 46 | template: cnn-model-with-dropout 47 | - name: cnn-model-with-batch-norm 48 | template: cnn-model-with-batch-norm 49 | 50 | - name: cnn-model 51 | serviceAccountName: training-operator 52 | resource: 53 | action: create 54 | setOwnerReference: true 55 | successCondition: status.replicaStatuses.Worker.succeeded = 2 56 | failureCondition: status.replicaStatuses.Worker.failed > 0 57 | manifest: | 58 | apiVersion: kubeflow.org/v1 59 | kind: TFJob 60 | metadata: 61 | generateName: multi-worker-training- 62 | spec: 63 | runPolicy: 64 | cleanPodPolicy: None 65 | tfReplicaSpecs: 66 | Worker: 67 | replicas: 2 68 | restartPolicy: Never 69 | template: 70 | spec: 71 | containers: 72 | - name: tensorflow 73 | image: kubeflow/multi-worker-strategy:v0.1 74 | imagePullPolicy: IfNotPresent 75 | command: ["python", "/multi-worker-distributed-training.py", "--saved_model_dir", "/trained_model/saved_model_versions/1/", "--checkpoint_dir", "/trained_model/checkpoint", "--model_type", "cnn"] 76 | volumeMounts: 77 | - mountPath: /trained_model 78 | name: training 79 | resources: 80 | limits: 81 | cpu: 500m 82 | volumes: 83 | - name: training 84 | persistentVolumeClaim: 85 | claimName: strategy-volume 86 | 87 | - name: cnn-model-with-dropout 88 | serviceAccountName: training-operator 89 | resource: 90 | action: create 91 | setOwnerReference: true 92 | successCondition: status.replicaStatuses.Worker.succeeded = 2 93 | failureCondition: status.replicaStatuses.Worker.failed > 0 94 | manifest: | 95 | apiVersion: kubeflow.org/v1 96 | kind: TFJob 97 | metadata: 98 | generateName: multi-worker-training- 99 | spec: 100 | runPolicy: 101 | cleanPodPolicy: None 102 | tfReplicaSpecs: 103 | Worker: 
104 |               replicas: 2
105 |               restartPolicy: Never
106 |               template:
107 |                 spec:
108 |                   containers:
109 |                     - name: tensorflow
110 |                       image: kubeflow/multi-worker-strategy:v0.1
111 |                       imagePullPolicy: IfNotPresent
112 |                       command: ["python", "/multi-worker-distributed-training.py", "--saved_model_dir", "/trained_model/saved_model_versions/2/", "--checkpoint_dir", "/trained_model/checkpoint", "--model_type", "dropout"]
113 |                       volumeMounts:
114 |                         - mountPath: /trained_model
115 |                           name: training
116 |                       resources:
117 |                         limits:
118 |                           cpu: 500m
119 |                   volumes:
120 |                     - name: training
121 |                       persistentVolumeClaim:
122 |                         claimName: strategy-volume
123 | 
124 |   - name: cnn-model-with-batch-norm
125 |     serviceAccountName: training-operator
126 |     resource:
127 |       action: create
128 |       setOwnerReference: true
129 |       successCondition: status.replicaStatuses.Worker.succeeded = 2
130 |       failureCondition: status.replicaStatuses.Worker.failed > 0
131 |       manifest: |
132 |         apiVersion: kubeflow.org/v1
133 |         kind: TFJob
134 |         metadata:
135 |           generateName: multi-worker-training-
136 |         spec:
137 |           runPolicy:
138 |             cleanPodPolicy: None
139 |           tfReplicaSpecs:
140 |             Worker:
141 |               replicas: 2
142 |               restartPolicy: Never
143 |               template:
144 |                 spec:
145 |                   containers:
146 |                     - name: tensorflow
147 |                       image: kubeflow/multi-worker-strategy:v0.1
148 |                       imagePullPolicy: IfNotPresent
149 |                       command: ["python", "/multi-worker-distributed-training.py", "--saved_model_dir", "/trained_model/saved_model_versions/3/", "--checkpoint_dir", "/trained_model/checkpoint", "--model_type", "batch_norm"] # fixed: was "dropout", which trained a second dropout model into version 3 instead of the batch-norm variant
150 |                       volumeMounts:
151 |                         - mountPath: /trained_model
152 |                           name: training
153 |                       resources:
154 |                         limits:
155 |                           cpu: 500m
156 |                   volumes:
157 |                     - name: training
158 |                       persistentVolumeClaim:
159 |                         claimName: strategy-volume
160 | 
161 |   - name: model-selection-step
162 |     serviceAccountName: argo
163 |     container:
164 |       image: kubeflow/multi-worker-strategy:v0.1
165 |       imagePullPolicy: IfNotPresent
166 |       command: ["python", "/model-selection.py"]
167 | 
volumeMounts: 168 | - name: model 169 | mountPath: /trained_model 170 | 171 | - name: create-model-serving-service 172 | serviceAccountName: training-operator 173 | successCondition: status.modelStatus.states.transitionStatus = UpToDate 174 | resource: 175 | action: create 176 | setOwnerReference: true 177 | manifest: | 178 | apiVersion: serving.kserve.io/v1beta1 179 | kind: InferenceService 180 | metadata: 181 | name: flower-sample 182 | spec: 183 | predictor: 184 | model: 185 | modelFormat: 186 | name: tensorflow 187 | image: "emacski/tensorflow-serving:2.6.0" 188 | storageUri: "pvc://strategy-volume/saved_model_versions" 189 | -------------------------------------------------------------------------------- /code/project/manifests/argo-workflows/kustomization.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kustomize.config.k8s.io/v1beta1 2 | kind: Kustomization 3 | namespace: kubeflow 4 | 5 | resources: 6 | - https://github.com/argoproj/argo-workflows/releases/download/v3.4.0/install.yaml 7 | 8 | patchesStrategicMerge: 9 | - rbac-patch.yaml 10 | -------------------------------------------------------------------------------- /code/project/manifests/argo-workflows/rbac-patch.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRole 3 | metadata: 4 | name: argo-cluster-role 5 | rules: 6 | - apiGroups: 7 | - "" 8 | resources: 9 | - pods 10 | - pods/exec 11 | verbs: 12 | - create 13 | - get 14 | - list 15 | - watch 16 | - update 17 | - patch 18 | - delete 19 | - apiGroups: 20 | - "" 21 | resources: 22 | - configmaps 23 | verbs: 24 | # Note(terrytangyuan): "create" and "update" are additional RBAC needed to use memoization cache. 
25 | - create 26 | - update 27 | - get 28 | - watch 29 | - list 30 | - apiGroups: 31 | - "" 32 | resources: 33 | - persistentvolumeclaims 34 | - persistentvolumeclaims/finalizers 35 | verbs: 36 | - create 37 | - update 38 | - delete 39 | - get 40 | - apiGroups: 41 | - argoproj.io 42 | resources: 43 | - workflows 44 | - workflows/finalizers 45 | - workflowtasksets 46 | - workflowtasksets/finalizers 47 | - workflowartifactgctasks 48 | verbs: 49 | - get 50 | - list 51 | - watch 52 | - update 53 | - patch 54 | - delete 55 | - create 56 | - apiGroups: 57 | - argoproj.io 58 | resources: 59 | - workflowtemplates 60 | - workflowtemplates/finalizers 61 | - clusterworkflowtemplates 62 | - clusterworkflowtemplates/finalizers 63 | verbs: 64 | - get 65 | - list 66 | - watch 67 | - apiGroups: 68 | - argoproj.io 69 | resources: 70 | - workflowtaskresults 71 | verbs: 72 | - list 73 | - watch 74 | - deletecollection 75 | - apiGroups: 76 | - "" 77 | resources: 78 | - serviceaccounts 79 | verbs: 80 | - get 81 | - list 82 | - apiGroups: 83 | - argoproj.io 84 | resources: 85 | - cronworkflows 86 | - cronworkflows/finalizers 87 | verbs: 88 | - get 89 | - list 90 | - watch 91 | - update 92 | - patch 93 | - delete 94 | - apiGroups: 95 | - "" 96 | resources: 97 | - events 98 | verbs: 99 | - create 100 | - patch 101 | - apiGroups: 102 | - policy 103 | resources: 104 | - poddisruptionbudgets 105 | verbs: 106 | - create 107 | - get 108 | - delete -------------------------------------------------------------------------------- /code/project/manifests/kubeflow-training/cluster-role-binding.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: ClusterRoleBinding 4 | metadata: 5 | labels: 6 | app: training-operator 7 | name: training-operator 8 | roleRef: 9 | apiGroup: rbac.authorization.k8s.io 10 | kind: ClusterRole 11 | name: training-operator 12 | subjects: 13 | - kind: ServiceAccount 14 | name: 
training-operator 15 | -------------------------------------------------------------------------------- /code/project/manifests/kubeflow-training/cluster-role.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: ClusterRole 4 | metadata: 5 | labels: 6 | app: training-operator 7 | name: training-operator 8 | rules: 9 | - apiGroups: 10 | - serving.kserve.io 11 | resources: 12 | - inferenceservices 13 | verbs: 14 | - "*" 15 | - apiGroups: 16 | - kubeflow.org 17 | resources: 18 | - tfjobs 19 | - mxjobs 20 | - pytorchjobs 21 | - xgboostjobs 22 | - tfjobs/status 23 | - pytorchjobs/status 24 | - mxjobs/status 25 | - xgboostjobs/status 26 | verbs: 27 | - create 28 | - delete 29 | - get 30 | - list 31 | - patch 32 | - update 33 | - watch 34 | - apiGroups: 35 | - "" 36 | resources: 37 | - pods 38 | - services 39 | - endpoints 40 | - events 41 | verbs: 42 | - "*" 43 | - apiGroups: 44 | - apps 45 | - extensions 46 | resources: 47 | - deployments 48 | verbs: 49 | - "*" 50 | - apiGroups: 51 | - scheduling.volcano.sh 52 | resources: 53 | - podgroups 54 | verbs: 55 | - "*" 56 | -------------------------------------------------------------------------------- /code/project/manifests/kubeflow-training/crds/kustomization.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kustomize.config.k8s.io/v1beta1 2 | kind: Kustomization 3 | resources: 4 | - kubeflow.org_tfjobs.yaml 5 | - kubeflow.org_mxjobs.yaml 6 | - kubeflow.org_pytorchjobs.yaml 7 | - kubeflow.org_xgboostjobs.yaml 8 | -------------------------------------------------------------------------------- /code/project/manifests/kubeflow-training/deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: training-operator 5 | labels: 6 | control-plane: kubeflow-training-operator 7 | 
spec: 8 | selector: 9 | matchLabels: 10 | control-plane: kubeflow-training-operator 11 | replicas: 1 12 | template: 13 | metadata: 14 | labels: 15 | control-plane: kubeflow-training-operator 16 | annotations: 17 | sidecar.istio.io/inject: "false" 18 | spec: 19 | containers: 20 | - command: 21 | - /manager 22 | image: kubeflow/training-operator 23 | name: training-operator 24 | env: 25 | - name: MY_POD_NAMESPACE 26 | valueFrom: 27 | fieldRef: 28 | fieldPath: metadata.namespace 29 | - name: MY_POD_NAME 30 | valueFrom: 31 | fieldRef: 32 | fieldPath: metadata.name 33 | securityContext: 34 | allowPrivilegeEscalation: false 35 | livenessProbe: 36 | httpGet: 37 | path: /healthz 38 | port: 8081 39 | initialDelaySeconds: 15 40 | periodSeconds: 20 41 | readinessProbe: 42 | httpGet: 43 | path: /readyz 44 | port: 8081 45 | initialDelaySeconds: 5 46 | periodSeconds: 10 47 | resources: 48 | limits: 49 | cpu: 100m 50 | memory: 30Mi 51 | requests: 52 | cpu: 100m 53 | memory: 20Mi 54 | serviceAccountName: training-operator 55 | terminationGracePeriodSeconds: 10 56 | -------------------------------------------------------------------------------- /code/project/manifests/kubeflow-training/kustomization.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kustomize.config.k8s.io/v1beta1 2 | kind: Kustomization 3 | namespace: kubeflow 4 | resources: 5 | - crds/ 6 | - cluster-role-binding.yaml 7 | - cluster-role.yaml 8 | - service-account.yaml 9 | - service.yaml 10 | - deployment.yaml 11 | images: 12 | - name: kubeflow/training-operator 13 | newName: public.ecr.aws/j1r0q0g6/training/training-operator 14 | newTag: "5ef6c405df2bb1bf1d3ede988cd43433eff2e956" 15 | -------------------------------------------------------------------------------- /code/project/manifests/kubeflow-training/service-account.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | 
metadata: 4 | labels: 5 | app: training-operator 6 | name: training-operator 7 | -------------------------------------------------------------------------------- /code/project/manifests/kubeflow-training/service.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: v1 3 | kind: Service 4 | metadata: 5 | annotations: 6 | prometheus.io/path: /metrics 7 | prometheus.io/scrape: "true" 8 | prometheus.io/port: "8443" 9 | labels: 10 | app: training-operator 11 | name: training-operator 12 | spec: 13 | ports: 14 | - name: monitoring-port 15 | port: 8443 16 | targetPort: 8443 17 | selector: 18 | name: training-operator 19 | type: ClusterIP 20 | -------------------------------------------------------------------------------- /code/project/manifests/kustomization.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kustomize.config.k8s.io/v1beta1 2 | kind: Kustomization 3 | namespace: kubeflow 4 | 5 | resources: 6 | - argo-workflows/ 7 | - kubeflow-training/ 8 | -------------------------------------------------------------------------------- /images/chinese-cover.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/terrytangyuan/distributed-ml-patterns/0c64a653ef4a3d51dab1ab98294730dc260d4a37/images/chinese-cover.pdf -------------------------------------------------------------------------------- /images/english-front-cover.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/terrytangyuan/distributed-ml-patterns/0c64a653ef4a3d51dab1ab98294730dc260d4a37/images/english-front-cover.png -------------------------------------------------------------------------------- /images/korean-cover-clean.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/terrytangyuan/distributed-ml-patterns/0c64a653ef4a3d51dab1ab98294730dc260d4a37/images/korean-cover-clean.png -------------------------------------------------------------------------------- /images/korean-cover-white.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/terrytangyuan/distributed-ml-patterns/0c64a653ef4a3d51dab1ab98294730dc260d4a37/images/korean-cover-white.jpg -------------------------------------------------------------------------------- /images/korean-cover.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/terrytangyuan/distributed-ml-patterns/0c64a653ef4a3d51dab1ab98294730dc260d4a37/images/korean-cover.jpg --------------------------------------------------------------------------------