├── .gitignore ├── LICENSE ├── README.md ├── _config.yml ├── code ├── README.md ├── chapter-2 │ ├── listing-2-1.py │ ├── listing-2-2.py │ ├── listing-2-3.py │ ├── listing-2-4.py │ ├── listing-2-5.txt │ ├── listing-2-6.txt │ ├── listing-2-7.txt │ └── listing-2-8.txt └── project │ ├── .gitignore │ ├── README.md │ ├── basics │ ├── argo-coinflip.yaml │ ├── argo-dag-diamond.yaml │ ├── argo-hello-world.yaml │ ├── argo-resource-template.yaml │ ├── argo-script-template.yaml │ ├── hello-world.yaml │ └── tfjob.yaml │ ├── code │ ├── Dockerfile │ ├── README.md │ ├── access-model.yaml │ ├── autoscaled-inference-service.yaml │ ├── data-ingestion.py │ ├── http-inference-request.py │ ├── inference-client.py │ ├── inference-input.json │ ├── inference-service.yaml │ ├── model-selection.py │ ├── model-selection.yaml │ ├── multi-worker-distributed-training.py │ ├── multi-worker-pvc.yaml │ ├── multi-worker-tfjob.yaml │ ├── predict-service.py │ ├── predict-service.yaml │ └── workflow.yaml │ └── manifests │ ├── argo-workflows │ ├── kustomization.yaml │ └── rbac-patch.yaml │ ├── kubeflow-training │ ├── cluster-role-binding.yaml │ ├── cluster-role.yaml │ ├── crds │ │ ├── kubeflow.org_mxjobs.yaml │ │ ├── kubeflow.org_pytorchjobs.yaml │ │ ├── kubeflow.org_tfjobs.yaml │ │ ├── kubeflow.org_xgboostjobs.yaml │ │ └── kustomization.yaml │ ├── deployment.yaml │ ├── kustomization.yaml │ ├── service-account.yaml │ └── service.yaml │ └── kustomization.yaml └── images ├── chinese-cover.pdf ├── english-front-cover.png ├── korean-cover-clean.png ├── korean-cover-white.jpg └── korean-cover.jpg /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | 
pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. 
You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. 
(Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2021 Yuan Tang 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Distributed Machine Learning Patterns 2 | 3 | [![LinkedIn](https://raw.githubusercontent.com/terrytangyuan/terrytangyuan/master/imgs/linkedin.svg)](https://www.linkedin.com/in/terrytangyuan) 4 | [![Bluesky](https://raw.githubusercontent.com/terrytangyuan/terrytangyuan/master/imgs/bluesky.svg)](https://bsky.app/profile/terrytangyuan.xyz) 5 | [![GitHub](https://raw.githubusercontent.com/terrytangyuan/terrytangyuan/master/imgs/github.svg)](https://github.com/terrytangyuan) 6 | [![Twitter](https://raw.githubusercontent.com/terrytangyuan/terrytangyuan/master/imgs/twitter.svg)](https://twitter.com/TerryTangYuan) 7 | 8 | book-front-cover 9 | 10 | This repository contains references and code for the book *Distributed Machine Learning Patterns* from [Manning Publications](https://bit.ly/2RKv8Zo) by [Yuan Tang](https://github.com/terrytangyuan). 
11 | 12 | :fire: **[Korean](images/korean-cover.jpg) and [Chinese](images/chinese-cover.pdf) versions are available from Tsinghua University Press and Hanbit Media!** 13 | 14 | [Manning](https://bit.ly/2RKv8Zo), [Amazon](https://www.amazon.com/dp/1617299022/), [Barnes & Noble](https://www.barnesandnoble.com/w/distributed-machine-learning-patterns-yuan-tang/1140209010), [Powell’s]( https://www.powells.com/book/distributed-machine-learning-patterns-9781617299025), [Bookshop](https://bookshop.org/p/books/distributed-machine-learning-patterns-yuan-tang/17491200) 15 | 16 | 17 | In *Distributed Machine Learning Patterns* you will learn how to: 18 | 19 | * Apply patterns to build scalable and reliable machine learning systems. 20 | * Construct machine learning pipelines with data ingestion, distributed training, model serving, and more. 21 | * Automate machine learning tasks with [Kubernetes](https://kubernetes.io/), [TensorFlow](https://www.tensorflow.org/), [Kubeflow](https://www.kubeflow.org/), and [Argo Workflows](https://argoproj.github.io/argo-workflows/). 22 | * Make trade-off decisions between different patterns and approaches. 23 | * Manage and monitor machine learning workloads at scale. 24 | 25 | This book teaches you how to take machine learning models from your personal laptop to large distributed clusters. You’ll explore key concepts and patterns behind successful distributed machine learning systems, and learn technologies like TensorFlow, Kubernetes, Kubeflow, and Argo Workflows directly from a key maintainer and contributor. Real-world scenarios, hands-on projects, and clear, practical DevOps techniques let you easily launch, manage, and monitor cloud-native distributed machine learning pipelines. 26 | 27 | ## About the topic 28 | 29 | Scaling up models from personal devices to large distributed clusters is one of the biggest challenges faced by modern machine learning practitioners. 
Distributing machine learning systems allow developers to handle extremely large datasets across multiple clusters, take advantage of automation tools, and benefit from hardware accelerations. In this book, Yuan Tang shares patterns, techniques, and experience gained from years spent building and managing cutting-edge distributed machine learning infrastructure. 30 | 31 | ## About the book 32 | 33 | *Distributed Machine Learning Patterns* is filled with practical patterns for running machine learning systems on distributed Kubernetes clusters in the cloud. Each pattern is designed to help solve common challenges faced when building distributed machine learning systems, including supporting distributed model training, handling unexpected failures, and dynamic model serving traffic. Real-world scenarios provide clear examples of how to apply each pattern, alongside the potential trade-offs for each approach. Once you’ve mastered these cutting-edge techniques, you’ll put them all into practice and finish up by building a comprehensive distributed machine learning system. 34 | 35 | ## About the reader 36 | 37 | For data analysts, data scientists, and software engineers familiar with the basics of machine learning algorithms and running machine learning in production. Readers should be familiar with the basics of Bash, Python, and Docker. 38 | 39 | ## About the author 40 | 41 | Yuan is a principal software engineer at [Red Hat](https://www.redhat.com/), working on [OpenShift AI](https://www.redhat.com/en/technologies/cloud-computing/openshift/openshift-ai). Previously, he has led AI infrastructure and platform teams at various companies. He holds leadership positions in open source projects, including [Argo](https://argoproj.github.io/), [Kubeflow](https://github.com/kubeflow), and [Kubernetes](https://github.com/kubernetes/community/tree/master/wg-serving). 
He's also a maintainer and author of many popular [open source projects](https://github.com/sponsors/terrytangyuan). In addition, Yuan [authored](https://terrytangyuan.github.io/cv#publications) three technical books and published numerous impactful papers. He's a regular [conference speaker](https://terrytangyuan.github.io/cv#talks), technical advisor, leader, and mentor at [various organizations](https://terrytangyuan.github.io/cv#services). 42 | 43 | ## Supporting Quotes 44 | 45 | *"This is a wonderful book for those wanting to understand how to be more effective with Machine Learning at scale, explained clearly and from first principles!"* 46 | 47 | **-- Laurence Moroney, AI Developer Relations Lead at Google** 48 | 49 | *"This book is an exceptionally timely and comprehensive guide to developing, running, and managing machine learning systems in a distributed environment. It covers essential topics such as data partitioning, ingestion, model training, serving, and workflow management. What truly sets this book apart is its discussion of these topics from a pattern perspective, accompanied by real-world examples and widely adopted systems like Kubernetes, Kubeflow, and Argo. I highly recommend it!"* 50 | 51 | **-- Yuan Chen, Principal Software Engineer at Apple** 52 | 53 | 54 | *"This book provides a high-level understanding of patterns with practical code examples needed for all MLOps engineering tasks. This is a must-read for anyone in the field."* 55 | 56 | **-- Brian Ray, Global Head of Data Science and Artificial Intelligence at Eviden** 57 | 58 | 59 | *"This book weaves together concepts from distributed systems, machine learning, and site reliability engineering in a way that’s approachable for beginners and that’ll excite and inspire experienced practitioners. 
As soon as I finished reading, I was ready to start building."* 60 | 61 | **-- James Lamb, Staff Data Engineer at SpotHero** 62 | 63 | 64 | *"Whatever your role is in the data ecosystem (scientist, analyst, or engineer), if you are looking to take your knowledge and skills to the next level, then this book is for you. This book is an amazing guide to the concepts and state-of-the-art when it comes to designing resilient and scalable, ML systems for both training and serving models. Regardless of what platform you may be working with, this book teaches you the patterns you should be familiar with when trying to scale out your systems."* 65 | 66 | **-- Ryan Russon, Senior Manager of Model Training at Capital One** 67 | 68 | 69 | *"AI is the new electricity, and distributed systems is the new power grid. Whether you are a research scientist, engineer, or product developer, you will find the best practices and recipes in this book to scale up your greatest endeavors."* 70 | 71 | **-- Linxi "Jim" Fan, Senior AI Research Scientist at NVIDIA, Stanford PhD** 72 | 73 | *"This book discusses various architectural approaches to tackle common data science problems such as scaling machine learning processes and building robust workflows and pipelines. 
It serves as an excellent introduction to the world of MLOps for data scientists and ML engineers who want to enhance their knowledge in this field."* 74 | 75 | **-- Rami Krispin, Senior Data Science and Engineering Manager** 76 | 77 | *"无论是新手还是专家,这本书都将引领你构建强大的机器学习系统,进而掌握分布式机器学习、自动化工具和大规模工作负载管理的要点。让你的机器学习之旅更上一层楼!"* 78 | 79 | **-- 高策,TensorChord CEO,Kubeflow 社区维护者** 80 | 81 | *"这是一本关于在分布式环境下开发、运行和管理机器学习系统的全面手册。作者详尽地阐述了从数据分区、采集、模型训练到服务和工作流程管理等一系列关键主题。通过使用现实世界中的案例,本书深入浅出地讲解了人工智能与机器学习领域用到的核心软件、系统和平台,涵盖了 PyTorch、TensorFlow、Kubeflow、Argo Workflows 和 Kubernetes 等。无论是算法工程师、系统工程师还是架构师,都能从中获得开发和维护分布式机器学习系统所需的全方位知识。我将此书极力推荐给所有对机器学习有着浓厚兴趣和实践需求的专业人士!"* 82 | 83 | **-- 陈源,NVIDIA 主任工程师** 84 | 85 | *"很高兴看到这本书能在国内出版。随着 ChatGPT 等工具和技术的爆火,AI技术迎来了又一波爆发期。与此同时,Kubernetes 等云原生技术作为基础设施的事实标准也再次在本轮技术热潮中成为首选项。这本书介绍了很多结合云原生和分布式技术进行机器学习的方法和案例,推荐对这方面感兴趣的读者进行阅读。"* 86 | 87 | **-- 张晋涛,Kong Inc., Microsoft MVP, CNCF Ambassador** 88 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-architect 2 | plugins: 3 | - jekyll-relative-links 4 | relative_links: 5 | enabled: true 6 | collections: true 7 | include: 8 | - README.md 9 | -------------------------------------------------------------------------------- /code/README.md: -------------------------------------------------------------------------------- 1 | ## Installation 2 | 3 | * Install Python. 4 | * Run the following to install the necessary Python packages: 5 | 6 | ```bash 7 | pip install tensorflow tensorflow_io 8 | ``` 9 | 10 | ## Instructions 11 | 12 | * All code snippets are organized by chapters and the listing title. For example, `chapter-2/listing-2-1.py` is for Listing 2.1 in Chapter 2. 13 | * Files with `*.py` extension can be executed via `python *.py`. 14 | * Files with `*.txt` extension are pseudo-code and are not meant to be executed. 
15 | -------------------------------------------------------------------------------- /code/chapter-2/listing-2-1.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf #A 2 | 3 | train, test = tf.keras.datasets.fashion_mnist.load_data() #B 4 | 5 | # Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-labels-idx1-ubyte.gz 6 | # 32768/29515 [=================================] - 0s 0us/step 7 | # Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-images-idx3-ubyte.gz 8 | # 26427392/26421880 [==============================] - 0s 0us/step 9 | # Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-labels-idx1-ubyte.gz 10 | # 8192/5148 [===============================================] - 0s 0us/step 11 | # Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-images-idx3-ubyte.gz 12 | # 4423680/4422102 [==============================] - 0s 0us/step 13 | 14 | #A Load TensorFlow library. 15 | #B Download the Fashion-MNIST dataset and then load it into memory. 16 | -------------------------------------------------------------------------------- /code/chapter-2/listing-2-2.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | images, labels = train #A 4 | images = images/255 #B 5 | 6 | dataset = tf.data.Dataset.from_tensor_slices((images, labels)) #C 7 | dataset #D 8 | # 9 | 10 | #A Split the training dataset object into images and labels. 11 | #B Normalize the images. 12 | #C Load in-memory array representation into a tf.data.Dataset object that will make it easier to use for training in TensorFlow. 13 | #D Take a look at the information of the dataset such as shapes and data types. 
14 | -------------------------------------------------------------------------------- /code/chapter-2/listing-2-3.py: -------------------------------------------------------------------------------- 1 | import tensorflow_io as tfio #A 2 | 3 | d_train = tfio.IODataset.from_mnist( #B 4 | 'http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz', 5 | 'http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz') 6 | 7 | #A Load TensorFlow I/O library. 8 | #B Load the MNIST dataset from a URL to access dataset files directly without downloading via HTTP file system support. 9 | -------------------------------------------------------------------------------- /code/chapter-2/listing-2-4.py: -------------------------------------------------------------------------------- 1 | import os #A 2 | import tensorflow_io as tfio #B 3 | 4 | endpoint="postgresql://{}:{}@{}?port={}&dbname={}".format( #C 5 | os.environ['TFIO_DEMO_DATABASE_USER'], 6 | os.environ['TFIO_DEMO_DATABASE_PASS'], 7 | os.environ['TFIO_DEMO_DATABASE_HOST'], 8 | os.environ['TFIO_DEMO_DATABASE_PORT'], 9 | os.environ['TFIO_DEMO_DATABASE_NAME'], 10 | ) 11 | 12 | dataset = tfio.experimental.IODataset.from_sql( #D 13 | query="SELECT co, pt08s1 FROM AirQualityUCI;", 14 | endpoint=endpoint) 15 | print(dataset.element_spec) #E 16 | # {'co': TensorSpec(shape=(), dtype=tf.float32, name=None), 'pt08s1': TensorSpec(shape=(), dtype=tf.int32, name=None)} 17 | 18 | #A Load Python’s built-in OS library for loading environment variables related to the PostgreSQL database. 19 | #B Load TensorFlow I/O library. 20 | #C Construct the endpoint for accessing the PostgreSQL database. 21 | #D Select two columns from the AirQualityUCI table in the database and instantiate a tf.data.Dataset object. 22 | #E Inspect the specification of the dataset such as the shape and data type for each column. 
23 | -------------------------------------------------------------------------------- /code/chapter-2/listing-2-5.txt: -------------------------------------------------------------------------------- 1 | batch = read_next_batch(dataset) #A 2 | while batch is not None: 3 | model.train(batch) #B 4 | batch = read_next_batch(dataset) #C 5 | 6 | #A Read the next batch in the dataset. 7 | #B Train the model with this batch. 8 | #C Read the next batch once we are done training with the current batch. 9 | -------------------------------------------------------------------------------- /code/chapter-2/listing-2-6.txt: -------------------------------------------------------------------------------- 1 | if get_worker_rank() == 0: #A 2 | create_and_send_shards(dataset) #A 3 | shard = read_next_shard_locally() #B 4 | while shard is not None: 5 | model.train(shard) #C 6 | shard = read_next_shard_locally() #D 7 | 8 | #A Create and send shards to all other worker machines from the worker machine with rank 0. 9 | #B Read the next shard available locally in this worker machine. 10 | #C Train the model using the shard we just read from the worker machine locally. 11 | #D Read the next shard once we are done training with the current shard. 12 | -------------------------------------------------------------------------------- /code/chapter-2/listing-2-7.txt: -------------------------------------------------------------------------------- 1 | batch = read_next_batch(dataset) #A 2 | cache = initialize_cache(batch) #B 3 | while batch is not None: #C 4 | model.train(batch) #C 5 | cache.append(batch) #C 6 | batch = read_next_batch(dataset) 7 | while current_epoch() <= total_epochs: #D 8 | batch = cache.read_next_batch() #D 9 | model.train(batch) #D 10 | 11 | #A Read the next batch of the dataset. 12 | #B Initialize the cache for this batch. 13 | #C Train the model by iterating through the batches. 14 | #D Train the model for additional epochs using the batches that were cached previously. 
15 | -------------------------------------------------------------------------------- /code/chapter-2/listing-2-8.txt: -------------------------------------------------------------------------------- 1 | batch = read_next_batch(dataset) 2 | cache = initialize_cache(preprocess(batch)) #A 3 | while batch is not None: 4 | batch = preprocess(batch) 5 | model.train(batch) 6 | cache.append(batch) 7 | batch = read_next_batch(dataset) 8 | while current_epoch() <= total_epochs: 9 | processed_batch = cache.read_next_batch() #B 10 | model.train(processed_batch) #B 11 | 12 | #A Initialize the cache with the preprocessed batch. 13 | #B Retrieve the processed batch from the cache and use it for model training. 14 | -------------------------------------------------------------------------------- /code/project/.gitignore: -------------------------------------------------------------------------------- 1 | trained_model/ 2 | istio-* 3 | -------------------------------------------------------------------------------- /code/project/README.md: -------------------------------------------------------------------------------- 1 | # Project Setup 2 | 3 | ## Cluster 4 | 5 | ``` 6 | cd project/ 7 | ``` 8 | 9 | Via `kind`: 10 | 11 | ``` 12 | go install sigs.k8s.io/kind@v0.17.0 13 | kind create cluster --name distml --image kindest/node:v1.25.3 14 | ``` 15 | 16 | Or via `k3d`: 17 | 18 | ``` 19 | k3d cluster create distml --image rancher/k3s:v1.25.3-k3s1 20 | ``` 21 | 22 | 23 | ``` 24 | kubectl create ns kubeflow 25 | kns kubeflow 26 | kubectl kustomize manifests | kubectl apply -f - 27 | ``` 28 | 29 | # Run Workflow 30 | 31 | See instructions [here](https://github.com/terrytangyuan/distributed-ml-patterns/blob/main/code/project/code/README.md). 
32 | 33 | # Clean-up 34 | 35 | ``` 36 | k3d cluster delete distml 37 | kind delete cluster --name distml 38 | ``` 39 | -------------------------------------------------------------------------------- /code/project/basics/argo-coinflip.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: argoproj.io/v1alpha1 2 | kind: Workflow 3 | metadata: 4 | generateName: coinflip- 5 | spec: 6 | serviceAccountName: argo 7 | entrypoint: coinflip 8 | templates: 9 | - name: coinflip 10 | steps: 11 | - - name: flip-coin 12 | template: flip-coin 13 | - - name: heads 14 | template: heads 15 | when: "{{steps.flip-coin.outputs.result}} == heads" 16 | - name: tails 17 | template: tails 18 | when: "{{steps.flip-coin.outputs.result}} == tails" 19 | 20 | - name: flip-coin 21 | script: 22 | image: python:alpine3.6 23 | command: [python] 24 | source: | 25 | import random 26 | result = "heads" if random.randint(0,1) == 0 else "tails" 27 | print(result) 28 | 29 | - name: heads 30 | container: 31 | image: alpine:3.6 32 | command: [sh, -c] 33 | args: ["echo \"it was heads\""] 34 | 35 | - name: tails 36 | container: 37 | image: alpine:3.6 38 | command: [sh, -c] 39 | args: ["echo \"it was tails\""] 40 | -------------------------------------------------------------------------------- /code/project/basics/argo-dag-diamond.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: argoproj.io/v1alpha1 2 | kind: Workflow 3 | metadata: 4 | generateName: dag-diamond- 5 | spec: 6 | serviceAccountName: argo 7 | entrypoint: diamond 8 | templates: 9 | - name: echo 10 | inputs: 11 | parameters: 12 | - name: message 13 | container: 14 | image: alpine:3.7 15 | command: [echo, "{{inputs.parameters.message}}"] 16 | - name: diamond 17 | dag: 18 | tasks: 19 | - name: A 20 | template: echo 21 | arguments: 22 | parameters: [{name: message, value: A}] 23 | - name: B 24 | dependencies: [A] 25 | template: echo 26 | arguments: 27 |
parameters: [{name: message, value: B}] 28 | - name: C 29 | dependencies: [A] 30 | template: echo 31 | arguments: 32 | parameters: [{name: message, value: C}] 33 | - name: D 34 | dependencies: [B, C] 35 | template: echo 36 | arguments: 37 | parameters: [{name: message, value: D}] 38 | -------------------------------------------------------------------------------- /code/project/basics/argo-hello-world.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: argoproj.io/v1alpha1 2 | kind: Workflow 3 | metadata: 4 | generateName: hello-world- 5 | spec: 6 | entrypoint: whalesay 7 | serviceAccountName: argo 8 | templates: 9 | - name: whalesay 10 | container: 11 | image: docker/whalesay 12 | command: [cowsay] 13 | args: ["hello world"] 14 | -------------------------------------------------------------------------------- /code/project/basics/argo-resource-template.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: argoproj.io/v1alpha1 2 | kind: Workflow 3 | metadata: 4 | generateName: k8s-resource- 5 | spec: 6 | entrypoint: k8s-resource 7 | serviceAccountName: argo 8 | templates: 9 | - name: k8s-resource 10 | resource: 11 | action: create 12 | manifest: | 13 | apiVersion: v1 14 | kind: ConfigMap 15 | metadata: 16 | name: cm-example 17 | data: 18 | some: value 19 | -------------------------------------------------------------------------------- /code/project/basics/argo-script-template.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: argoproj.io/v1alpha1 2 | kind: Workflow 3 | metadata: 4 | generateName: script-tmpl- 5 | spec: 6 | entrypoint: gen-random-int 7 | serviceAccountName: argo 8 | templates: 9 | - name: gen-random-int 10 | script: 11 | image: python:alpine3.6 12 | command: [python] 13 | source: | 14 | import random 15 | i = random.randint(1, 100) 16 | print(i) 17 | 
-------------------------------------------------------------------------------- /code/project/basics/hello-world.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: whalesay 5 | spec: 6 | containers: 7 | - name: whalesay 8 | image: docker/whalesay:latest 9 | command: [cowsay] 10 | args: ["hello world"] 11 | -------------------------------------------------------------------------------- /code/project/basics/tfjob.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kubeflow.org/v1 2 | kind: TFJob 3 | metadata: 4 | namespace: kubeflow 5 | generateName: distributed-tfjob- 6 | spec: 7 | tfReplicaSpecs: 8 | Worker: 9 | replicas: 2 10 | restartPolicy: OnFailure 11 | template: 12 | spec: 13 | containers: 14 | - name: tensorflow 15 | image: gcr.io/kubeflow-ci/tf-mnist-with-summaries:1.0 16 | command: 17 | - "python" 18 | - "/var/tf_mnist/mnist_with_summaries.py" 19 | - "--log_dir=/train/metrics" 20 | - "--learning_rate=0.01" 21 | - "--batch_size=100" 22 | -------------------------------------------------------------------------------- /code/project/code/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9 2 | 3 | RUN pip install tensorflow==2.11.0 tensorflow_datasets==4.7.0 4 | 5 | COPY data-ingestion.py / 6 | COPY predict-service.py / 7 | COPY model-selection.py / 8 | COPY multi-worker-distributed-training.py / 9 | -------------------------------------------------------------------------------- /code/project/code/README.md: -------------------------------------------------------------------------------- 1 | # Multi-worker Distributed Training 2 | 3 | ## Setup 4 | 5 | ``` 6 | cd project/code 7 | ``` 8 | 9 | Build the image 10 | ``` 11 | docker build -f Dockerfile -t kubeflow/multi-worker-strategy:v0.1 . 
12 | # If using k3d 13 | k3d image import kubeflow/multi-worker-strategy:v0.1 --cluster distml 14 | # If using kind 15 | kind load docker-image kubeflow/multi-worker-strategy:v0.1 --name distml 16 | ``` 17 | 18 | Switch to "kubeflow" namespace: 19 | ``` 20 | kubectl config set-context --current --namespace=kubeflow 21 | ``` 22 | 23 | Specify your storageClassName and create a persistent volume claim to save 24 | models and checkpoints 25 | ``` 26 | kubectl create -f multi-worker-pvc.yaml 27 | ``` 28 | 29 | ## Submitting Training Job 30 | 31 | Create a TFJob: 32 | ``` 33 | kubectl create -f multi-worker-tfjob.yaml 34 | ``` 35 | 36 | After making code changes, run the following to resubmit the job: 37 | ``` 38 | kubectl delete tfjob --all; docker build -f Dockerfile -t kubeflow/multi-worker-strategy:v0.1 .; kind load docker-image kubeflow/multi-worker-strategy:v0.1 --name distml; kubectl create -f multi-worker-tfjob.yaml 39 | ``` 40 | 41 | ## Model loading & prediction 42 | 43 | ``` 44 | kubectl create -f predict-service.yaml 45 | kubectl exec --stdin --tty predict-service -- /bin/bash 46 | python3 /predict-service.py 47 | ``` 48 | 49 | ## Model selection 50 | 51 | ``` 52 | python3 /model-selection.py 53 | ``` 54 | 55 | ## Model serving 56 | 57 | ``` 58 | # Install KServe 59 | curl -s "https://raw.githubusercontent.com/kserve/kserve/v0.10.0-rc1/hack/quick_install.sh" | bash 60 | 61 | # Create inference service 62 | kubectl create -f inference-service.yaml 63 | 64 | # https://kserve.github.io/website/master/get_started/first_isvc/#4-determine-the-ingress-ip-and-ports 65 | INGRESS_GATEWAY_SERVICE=$(kubectl get svc --namespace istio-system --selector="app=istio-ingressgateway" --output jsonpath='{.items[0].metadata.name}') 66 | kubectl port-forward --namespace istio-system svc/${INGRESS_GATEWAY_SERVICE} 8080:80 67 | # start another terminal 68 | export INGRESS_HOST=localhost 69 | export INGRESS_PORT=8080 70 | 71 | MODEL_NAME=flower-sample 72 |
INPUT_PATH=@./inference-input.json 73 | SERVICE_HOSTNAME=$(kubectl get inferenceservice ${MODEL_NAME} -o jsonpath='{.status.url}' | cut -d "/" -f 3) 74 | curl -v -H "Host: ${SERVICE_HOSTNAME}" "http://${INGRESS_HOST}:${INGRESS_PORT}/v1/models/$MODEL_NAME:predict" -d $INPUT_PATH 75 | 76 | ## TODO: gRPC serving. Not working yet 77 | # Client-side requirements 78 | python3 -m pip install tensorflow-metal 79 | python3 -m pip install tensorflow-macos==2.11.0 80 | python3 -m pip install tensorflow-serving-api==2.11.0 81 | ``` 82 | 83 | Autoscaled inference service: 84 | ``` 85 | # https://github.com/rakyll/hey 86 | brew install hey 87 | kubectl create -f autoscaled-inference-service.yaml 88 | 89 | hey -z 30s -c 5 -m POST -host ${SERVICE_HOSTNAME} -D inference-input.json "http://${INGRESS_HOST}:${INGRESS_PORT}/v1/models/$MODEL_NAME:predict" 90 | ``` 91 | 92 | ## Workflow 93 | 94 | ``` 95 | kubectl create -f workflow.yaml 96 | ``` 97 | 98 | ## Debugging 99 | 100 | Access the trained model 101 | ``` 102 | kubectl create -f access-model.yaml 103 | kubectl exec --stdin --tty access-model -- ls /trained_model 104 | # Manually copy 105 | # kubectl cp trained_model access-model:/pv/trained_model -c model-storage 106 | ``` 107 | 108 | Run TFServing commands in the KServe container: 109 | ``` 110 | kubectl exec --stdin --tty flower-sample-predictor-default-00001-deployment-84759dfc5f6wfj -c kserve-container -- /usr/bin/tensorflow_model_server --model_name=flower-sample \ 111 | --port=9000 \ 112 | --rest_api_port=8080 \ 113 | --model_base_path=/mnt \ 114 | --rest_api_timeout_in_ms=60000 115 | ``` 116 | 117 | ## Cleanup 118 | 119 | ``` 120 | kubectl delete tfjob --all 121 | kubectl delete wf --all 122 | kubectl delete inferenceservice flower-sample 123 | kubectl delete pods --selector=app=flower-sample-predictor-default-00001 --force --grace-period=0 124 | kubectl delete pod access-model --force --grace-period=0 125 | kubectl delete pod predict-service --force --grace-period=0 126 | 
kubectl delete pvc strategy-volume 127 | ``` 128 | 129 | -------------------------------------------------------------------------------- /code/project/code/access-model.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: access-model 5 | spec: 6 | containers: 7 | - name: model-storage 8 | image: alpine:latest 9 | command: ['sleep', 'infinity'] 10 | volumeMounts: 11 | - name: model 12 | mountPath: /trained_model 13 | volumes: 14 | - name: model 15 | persistentVolumeClaim: 16 | claimName: strategy-volume 17 | -------------------------------------------------------------------------------- /code/project/code/autoscaled-inference-service.yaml: -------------------------------------------------------------------------------- 1 | # https://kserve.github.io/website/master/modelserving/autoscaling/autoscaling/#create-inferenceservice 2 | apiVersion: serving.kserve.io/v1beta1 3 | kind: InferenceService 4 | metadata: 5 | name: flower-sample 6 | spec: 7 | predictor: 8 | # https://kserve.github.io/website/master/reference/api/#serving.kserve.io/v1beta1.ComponentExtensionSpec 9 | scaleTarget: 1 10 | scaleMetric: concurrency 11 | model: 12 | modelFormat: 13 | name: tensorflow 14 | # This is only needed on Mac M1 15 | image: "emacski/tensorflow-serving:2.6.0" 16 | storageUri: "pvc://strategy-volume/saved_model_versions" 17 | -------------------------------------------------------------------------------- /code/project/code/data-ingestion.py: -------------------------------------------------------------------------------- 1 | import tensorflow_datasets as tfds 2 | 3 | datasets, _ = tfds.load(name='fashion_mnist', with_info=True, as_supervised=True) 4 | -------------------------------------------------------------------------------- /code/project/code/http-inference-request.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import 
json 3 | 4 | input_path = "inference-input.json" 5 | 6 | with open(input_path) as json_file: 7 | data = json.load(json_file) 8 | 9 | r = requests.post(url="http://localhost:8080/v1/models/flower-sample:predict", data=json.dumps(data), headers={'Host': 'flower-sample.kubeflow.example.com'}) 10 | print(r.text) 11 | -------------------------------------------------------------------------------- /code/project/code/inference-client.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import base64 4 | import grpc 5 | 6 | from tensorflow.contrib.util import make_tensor_proto 7 | from tensorflow_serving.apis import predict_pb2 8 | from tensorflow_serving.apis import prediction_service_pb2_grpc 9 | 10 | 11 | def predict(host, port, hostname, model, signature_name, input_path): 12 | # If hostname not set, we assume the host is a valid knative dns. 13 | if hostname: 14 | host_option = (('grpc.ssl_target_name_override', hostname,),) 15 | else: 16 | host_option = None 17 | channel = grpc.insecure_channel(target='{host}:{port}'.format(host=host, port=port), options=host_option) 18 | stub = prediction_service_pb2_grpc.PredictionServiceStub(channel) 19 | with open(input_path) as json_file: 20 | data = json.load(json_file) 21 | image = data['instances'][0]['image_bytes']['b64'] 22 | key = data['instances'][0]['key'] 23 | 24 | # Call classification model to make prediction 25 | request = predict_pb2.PredictRequest() 26 | request.model_spec.name = model 27 | request.model_spec.signature_name = signature_name 28 | image = base64.b64decode(image) 29 | request.inputs['image_bytes'].CopyFrom( 30 | make_tensor_proto(image, shape=[1])) 31 | request.inputs['key'].CopyFrom(make_tensor_proto(key, shape=[1])) 32 | 33 | result = stub.Predict(request, 10.0) 34 | print(result) 35 | 36 | 37 | if __name__ == '__main__': 38 | parser = argparse.ArgumentParser() 39 | parser.add_argument('--host', help='Ingress Host Name', 
default='localhost', type=str) 40 | parser.add_argument('--port', help='Ingress Port', default=80, type=int) 41 | parser.add_argument('--model', help='TensorFlow Model Name', type=str) 42 | parser.add_argument('--signature_name', help='Signature name of saved TensorFlow model', 43 | default='serving_default', type=str) 44 | parser.add_argument('--hostname', help='Service Host Name', default='', type=str) 45 | parser.add_argument('--input_path', help='Prediction data input path', default='./input.json', type=str) 46 | 47 | args = parser.parse_args() 48 | predict(args.host, args.port, args.hostname, args.model, args.signature_name, args.input_path) 49 | -------------------------------------------------------------------------------- /code/project/code/inference-input.json: -------------------------------------------------------------------------------- 1 | { 2 | "instances":[ 3 | { 4 | "image_bytes":{ 5 | "b64":"/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBwcJCQgKDBQNDAsLDBkSEw8UHRofHh0aHBwgJC4nICIsIxwcKDcpLDAxNDQ0Hyc5PTgyPC4zNDL/2wBDAQkJCQwLDBgNDRgyIRwhMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjL/wAARCAErASsDASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/8QAHwEAAwEBAQEBAQEBAQAAAAAAAAECAwQFBgcICQoL/8QAtREAAgECBAQDBAcFBAQAAQJ3AAECAxEEBSExBhJBUQdhcRMiMoEIFEKRobHBCSMzUvAVYnLRChYkNOEl8RcYGRomJygpKjU2Nzg5OkNERUZHSElKU1RVVldYWVpjZGVmZ2hpanN0dXZ3eHl6goOEhYaHiImKkpOUlZaXmJmaoqOkpaanqKmqsrO0tba3uLm6wsPExcbHyMnK0tPU1dbX2Nna4uPk5ebn6Onq8vP09fb3+Pn6/9oADAMBAAIRAxEAPwC9A42ir9vA0nOOKxYJhkDqe1bNvO0ZAYdaIsC8LLjOwH60yWDAwY1/75qzDcDAz0qfhl55BqxGE1pCzZwVPt0qJ7MgZQbh7da1Z7bncBVQgoaVhlGFvKlBIwOhqxPFxkdKmdY5xiQYP94daaqtGPKkO5P4X/pU2AoKMMQatWv+tAPXpTJ4ipyBTVYqwYHBFTezA1ivHNRsuRU1tOlymOBIOo9aVoyGNaCIEHanEEEMKXbg07BAx2NI
CXO5KrvwcVPEcDFRyD5qTYDYhuPuKnA4waitxmQj1FWGX9Ka2ArODzUXU5qxIM81AODzUtjGzHMfvVRcl6mmOMio4V3PSAtwjBUd60l+6DVCMAzH2q6D8v0qo7CIJ3xmsqQ8kmtC5YAVmyctntSbGRkDOT0qWMFyABUWNzD0q5EuxM9zQgJQAqgCkJxS9vemMasA3c8CpFPHNRBgBkinBvSpuBMGxRnPWo1561IOlMBQMEU2R8DFKW2rk1XdsmgCN+TmqskuHIqeUhVNZMkoZyckZqQILTi5UntzWtHMOVbpWQh2zCr6jIBpRGzUjl2jBPHY1chuSODyKx4pOzdKnVyh68VYjbDBlyvSq88G4bhVeG4Kkc8HrV3eGUEVQjLkUr+FRmQgYzV+aMODxzWdIpU0mMerh1wahdCpPvTN21gQamB3jB+qn1rOQDI5GjcMvBFbdvMt1FkfeHWsJhzU1pcG3nDZ4PWlGVgNd4+MigL8uKscMgdeVNRsAORVsRGFwc1G45qfKg/MM/U0jLG3RQPxNS2BCh2OG9DVxwM57GqxRQc8j9asp80I5zjiiIyu64zVdhxVtwMVVak2BUlOTUlumATTXXmpPux0r6AS2vLv7GrLNtFVbM/K596knbgGqT0AqXLZeqbgsRU8x96hJzgCk2A+JPmA61PA4mUSL9wk7fcetULtmEMdvGSJrltgI6hQMsfwH6kVqRIsUaqgAVQAAOwFUgEJ7UwDOc1Ky55/OmtgcCi4EZ6UqqSc0Hk4p46igB44pQaaM5NI7hVx3qkA2V8nHaoAdzE9hTZHOMd6ZczfZoQq/fNDArahcgAxLyf4iKzs0OxJ5696ZUDQP97NaVsdyg+1IPszHlFzU8SRg4jGB6VSQh3linp02mpQm5enNJs9aoBoynfirMFwVOD0qADjDUn3W9qANIsGGQeKqXCK3PekjlIOCeKfJyN1AGXIMZFNik6xscc5U+hqxMgbPrVFwVas2BezvXOMOPvCo2GD7UyOXOG/iHX3p8hGzdn6Vm0M0rG8ZLYxtzz8pp0lyx/iNZUMpzzVkturURKZGP8AEfzpRMw6Nmq5HvTMspz1pAaUVzzhjiptxjPmRnjuKyBNzzxU8NwUbDcqaXoBreYJU3L+VVn5zTEcRvkHKNUjcE4qZdwITyabK3yGpG4GaqzN+7qG9ALNicwn/eNE75UgU2zb/RQfc0krY4rS+gFZgcc0iKM+9Kc81FcI727QxnDyjbu9AepoWrAZpv8Apt7NqB5jA8mAeig/MfxOPyFa4HFQ20KW8KQxrhEUKB7VYXFWAvlkhSDx1PvUchwSAKlD7Uyep6CoS3UnrU9QGHg0DJ5xSb8mjdjvVAOZ9oqs75JOaJX3Hg1GBmmAKRuLt0Xms24lMshbt2qxezgDyEPuxqkxpNjImo4pGOOarmbk0gJvMINWIbp42BB6VBLC0Z9qjVsGjYDqrWVJ4w6n6j0qcxbh71ztndtbyBlPHcetdLayx3Me+Ns+o7irTuJkDRDvwaYVIODWg0IYc9agkgBGDTEUyCv0pwc4wac8ZTg9KjYFRSAil6ZxVOYZFXGPBBqpKKljIFJB61KzFlqJhTkbPBrO+oxysR0qwrkjk1Wxg1IoPBJ4PpSc7BYkOfU0zzHXvke9WNuFBHQ1A/BrKVRoaQm/dweDShyOOtRZB+tAPHNSq6HYv203/LNuh6VeVvkweq1ioxB+lacUm5Nw7jBrVTUoktEsp+SqNw2F4q1I3yCqM5yQKwchpF62Yi0/GkcHgmkh4gAoOSK1UxEfVuKdbKZH8zseF+nrTGO0qo6scVciXgYrWG1xDxwcUm/GQKc3FVS3J5qmwHmUkknoKYXJGaZuBzTd1JMB4PBzxUUkhPApWcnpTFUk1SAdGrOcVW1S/TTbbIwZW4Rff1rQXbEmSefSqC6bHPdNd3Q86U/dDfdQegFUBkWYurnlI2YseWPetSPSZW5llC+yjJrUVABjt6CnHihJICku
lWqcsGf/AHjUwt4QMCGMD/cFSM4HWm+YKq4HO/aAww3NRuqtypxTNpFIMisrjFDFDg1ctbySCQSRsQf5/Wqm4MOaT6GlewHY2OrQXWFciOT0J4NaDRq68/nXBLIRwa0bTVLi2wBIWX0bmqU+4rHSSQcFW5HY1TkiKZBGRUtrq8NyNr8N6VaYJIvysCPar0YjGdMDOeKrSL6ng1pXFuUJxyDVCSMgH0qGMqMvao84NTOp61XYkHmspDRYXBxnpSFjG2DyKbEdwK96eR5ilT95elZSKRYglBHlseD0NJKpU4NUlk2nB6VcjlEq7HPzDofWsJTvox2KzcZojbccGnSrgkVCpIkFcdaorblJFg8VctHzlapBs1Ztf9bTw+K01CUS07dvaqMh3TqKulSSTjPHWqCgGdnzkKOtYLGxbDkZoxnEYqR4j5e4HPqKrwncAO2M1X1jVRp+lXFxn7inHu3QCu6E3JKzIaC1lF3qk+05jtgIh/vnlj+AwPzrXUYArnvB0LLoUMshJknZpXJ7kmuhLAZxXcpIgimYjPNVd3HWpJnLHFQgcYp81xjht7U4AGkGFpDPt4QZNaIQ9YiRnoPemng4Tk+tIFdzmRjj0p5dEHFWmAqRfxOeakyAOwquZ93Sk5PJouBM0uKjMrHpUeRn5qQydhRcB3Pc0vHrUYJanbT6UwMX6ikKelafA/gX8qRljPWJfwzScAMsxjNJtx9K0Ht42HykqffkVA8DpzjI9RzUOLQ7lXbijDdRU2zIppX3qRipJ0z1rQttQkj4Ylh655FZjLzQMryDRewHUwXyTphiDnof8aZcQYO5eRXPw3BQ5U4Na1tf5Xa3I9KtSuTYgkjK5x0qpIpwM9K2HRJlLIefSqEsRGR+lRJAUVcxuCO1WpDlROn4iqknTp0p1tPtYo3KNXJOdtGWlcllUMBIvRuv1oGVOD1FOWIiRIz91zgH0qS6VoNwaMgKeJQMgfWvExGK960TeMRpPmqMnD4796JIQsf7s5lGAQT61YeSOS8SFwFkH3WHRuKq6fFJdajMZ3aMRtgqByT269q4nUnJXb2KskMjUiIS7sl32hfTtWhaArIQ42kdaaY/7PvzG6+dFMSWfHC91+nvS2e+Sab7QORNsUZwG7jn0xmpc3a6FYnDpC4lDEl/4G6DtVa5UqRDEADKTj2Hqas3Nkz6hGZGxCFJLL/FzwKluIykkQgQtC53N3Kn0+maxUrNMvRoiEZWPapAJ71keItPkvdPYiNnjg+cQjOZT/8AW61uOY1cGUcryFzUgkZ0JVsDtXfDGWmm9kZuGhX0qJLXT4IACuyMfKe1WZJQFOKiKlSCWycfnUUjjua9ali41PhZk42EbLHNISFFM83nA5pVXPLV3Ql2IaDBf2FOykQycUySUIMDrVZi0h5NbJiJnuGc4WkVWP3qYo29OtSKrt34q0wHhgo4GaCx/CnLCw561OkJP8NWIrBd/apVtj3qysI74FSZjTimBElsO1S+QKQzf3cCk3v60AUiqEcVG0R7DNPIZT92kLY5yRWgFd0I7UwMyGrLHPYGoXQHkVLAYYUlGUwj/oarMhRyrqQam3FDU6yRzoEl7dG7ioaTAzyuR0phQ+tXJreSLn7yHow6GoNhas2h3K5UA5zSrIUI5qQx5qIoBwRUMZehvMEZbB7GrS3Mdy/lMQk3YH+L6HvWJnHHpSs0c0ZinBKeoOCp9Qe1Tz2CxcvY2jY8VXgYMrhhhf73pTY728geO3ukW9tXztuPusvfDe9aECQmMqE3I/OD1FeVmFaKVlua049yNLooVjLD5MMcjORmrZN3LfmHzVjg2bn3LkMvp9TVeS6htbNXSNTk5RiPSrdhdx3ds0lztIcHJHHHt714cr/FY2JTbwGzDJ80kBym48/5xTZHzLFLLEygryw43+lFvJa/YZF52AZEh5IIpkN6k2yCcOcHdtIxzUaktl10gup02zMqlcFRyM0omRElhk25QYGBzu9qqLav9rl+
ySII+q7mzz6UxHiXzYbkL9oHzGQHnd2x7VLj0Fc0Fml+y5kjbY2OT/CakaRSEMY/eE7do71krqBkgWNllCzMOdp/P0q35aQPEYHd9wKlSMnjvUuFtykyeW1ju51WQsjIm38e1V5LWRSiGcDy+Hx/ETzmpxK0kgl2ERYwzHru+lJe2pn8trc4kzyC3BHrSTa0ZRG7IkQIbcvTJ65qsUeZ2H3VXqT6065kkSfyIrZiQMFmHHHU06TKWyq7ZbGSfU16GB91uUnotTOYxIVU/eyaSRivFAxgHOaGw3yn8K9+hVhWjeBi01uQFWY05YSe9L5wXjbUiTqeq4+ldcSWSRwkdRmp1RVGW4qITf3TxSg7jljWqJJRJGvTrSGZj0OBTfK3Hini3brV2Ab5jNxTgCetSCE9+DR5ZxinYBAqjmjzVpPKOOtHlU7AVRLwOhoyHFZyuw6VMsz9aq4FghajYYHBpPNzzRvDD0pMCB8HtioiQOhqV1Peq0kZBz2qWBbgu2i44ZT1U9DVwWttdjdAxjfvH/hWGWZDx0qWO4dCGBII7ip5h2L82nlTgvtP+0KrPZSjkbH+jVft9VhnTyrtQQeN4qO7025C+bp06TL/AM8pDj8mH9RSaT2AyJo2X7yEH3FVJJFUdeadN4hlsJvK1Kxnt+cbiNy/nTzf6XerkBTnupK1yVdFcpDdME8935SofKP3s9vetcmCxTZcDcm4/Oh5APrVWK4kuA0dtHjbznODTzAbiaGV0EinO5Dkfn+NfN4ibqTvLRHTHRE8losVspkKT23VY+4z3z9Ka8FvayRyQiT7G/OGHCmp44omsnW5LbsHdGrcLjgfXtQdQVdOkhYeblNqoozx9K57vYTIbmWAT2zoGUOSGUDCk9jS3Fwl7fpsuFjZUAdgM5PSobm/hmsIbZnVmaQEJjgge/arN1FHdxWoh8qBgx3N/s/TuelVta5DZWN79gmliMgaVMkcEBqnhube4s5FmTMjjcGbruPpisvVIn0u5jE9wkz3PAcLjaKkb7Pp8kEkL7lkByHP3SMcj3q+RWTW7Iua0OpLJYzQyZ+VMFAMMD2xn3plnfva3Dw3aeXMg4UntjrVC9vXk8m+ijXbG20kA7m96fNqCSz29y6EmL5Hcp0zjGah0tNtylI14dTVpZHPIz90jk1Na5aLzvNBc/MExwB6fWq7zW63cExVBI8fGO/PFSXksUcgMZbMv+s2jhPeudxWyNUyO7vfOuo0BYKAGc+vtUVwr3upCFNywxrukfH8q02mha0kAjUqqZB6Z46VAl15kGyFQCVOBn+tVSqcmtgavoQtDDgorlSOnOahkzCu6QjaOd3bFPQmGVEeAmM43yZ6+uKknaPa8IAdG4OfSvewlanJ6JL0MZJmb9ttWOVcyeyKT/TFC3Fy5/0fTZCv964kWIfhjcf0qyMRjCKFHsMU1mYmvWi0ZsehuMfvPIiOekbGT9SF/lUhl2+hqDDkU3Yx5NbJiLH2kjocU8XLf3jVUITRsNXcRdE7H+Ol85/7xqmEYdDThuHencC557g/epftL+oqllqTLU7gQKvNPCkHBNOVkIG5amVI2Aw2KYEITqKTYQfSrnkHqMGlMB/u8UgKTKfc00oSORV4REdRQbfnikwMxoh0IqJodp+WtY24I55NNNvj+GpaAx2hYcg0+C5ntmzHIw9uoNXmtjk4FQSQbe1ZO62KRZTVbe7Qw30KAHuRlTUN5YlSj2RhEWeU8sFT9DVCZAB2qvDcSQMfLkIH90nINcmJqtQa6lRjqaUTNNI1rtWJ8E7umfUVIoffHapcA7Ry5GCB/WoLC4juhKZGjSUHaoc4/HPTFVPImsbo3VyywhMjy2OSwPpivm3FuTT3OnoXLuAxTC3FzuWYFtxGDwelN06eHTmfjYe4cckVMyiOSO4nlWSXd8qfwjIqS6uYZb+1V9nmxhjz9OKm+ljNmJetBJqdw8mYlAHl4UqPekt5tQnuVeOESW6ggEN6VY8Tajb6
nHBYxnE5bJbHQVDpMdxbXCWECmYjJGDwfqa6Uv3d2tfP8yGJBqcWpTvHdw7Y0G1S33vfH5VB/ZN5GrtegPDJ8lsQ3IPbP1qxc6PEbGW4Fw0FxHlnjK8euKqy67P/AGfBEkb/ALyQYJGFBHaqjr/D2/IhmlYvcaVcpHeRqpC5A/hI+vrVy1vree/vgPnWQj91jqMAdPrVKG6XVZYoZ2ZPJ+YHg7varl8sVlqdvNZkuZ1KMMZIA57VzySbs9xok0/ZCstvdQsuCRtk6qvbBrQWxKwFldjC7fOCfnVewqnA1trLkytiZMojdCh9SO/NT28k8pksN+LhfldsHGPUf0rCd73+81iaBitksnOPlRSWUnhlxVKxaJnCR2hjTqrO2SfpT2tJlb7MzLJCQCW3YJGf8aa8aWDhZWdl6RcYI+prJbWuakrNiCZdwfa3y4+g/wDr1EkYI5/OkjCG1OxtoLDrT9siDBU49ua9fApaNGM77DvIHrxSiFD1wKZ5rZ9KUEsc170DFj/JGeBSi3p6c8ZqYLnoea3RJWNsMcUwwe1XQhPUU7y+MVYGeYcUhirQ8oDjrSGIdSKYGb5dJ5RrQMIPQU3yPaqAykjPpU6R1l29zOhAT5h/d61s28jSLl49n1NNO4CqpHQ08zGPgcmplVT3psiLTsIbHdIxw6496tCNSuQAR6g1mSpg8Ypsd3JC3ysfpSGanlD0pjBV6kVB9viuE2Sh4mP/AC0jP9Kgk02WXm3vkm/2WO1v8Kl+QE8kkXr+VVnngHVAfqaqTaZqEWS8Dkf7Jz/KqLiQZDRupHqprCU5LoNIvy3cfRUT8qpl43kG9FC9ziqbS7eM0eflSvXNefi5TlHQ1glcnu1DeXFbhDvPDHt70+5toZWG26jeeDBUyc+Zj2HP6UkMUJieGEM08gyAR04/SqcNtDYXsVzPdr52SPKAzgnjqK8OPrt+JsTBvPuRDdRPEoG5se/Sori2jsZZFeYusg3K5649KtXLPeX0QikChMh3xnI9qy9ctZY9RiWdzJbsu5SOBmrpq7tsZyNEHTbfRZF8tRJtIEnVmPaq1ldtpzjdJkS4IYDkN6fSqul6XaXBkMrtuJIQZ4X3qndLKLuWISGUQpuDIMFe3Pr2rRQjJuN7kGhrerx3LyNEMoRtlYfxN7Cqtiw1eeGzm/dQRfOXHXPQAVGLL9y7yFEeNcqAfvZ9ahluvs9ukcKmK5XBwBwRnk1pGKS5Ybg0dJZG0jtpIZo8BWPzg859aS2ubi2vY5bmMIJlzC56FQeh9D7VmWIE9rJO75kzkN0GRWhLNeajosyx2jTBfmIf5SPcVzyjrZiHyXTx628kUQ8iYgGQdA2Oa2pIUWKO5tpHNxwjlv4/rWNp0M2raaFtisYUhgX7MK0tPiluoj9omEc8ZZQo6bgec1hVVvloaRZZmWW1jFzNIhiYbWIJ+Q1E01rqN0oeSR1UcBBhc/jz+lWbWKa8jIZAsQOJFc8v7AVS02IxFHkCozE+WhOSR3rFWs31Rqi7A0LRSLblWBGACw+U++Kqi4uI5WRAzhTjIU4NTzzWUEx/erHu6j3Hf9azl1GVh8x4J4NdeCS573aImaS3Of8AWwMPcCp0MMnKPj2NZi3b/wB6p470j70aNX0tKatqc7RpCNlORyPapApzmq0N1bN/ejPvyKuo6EcSK1dSVyRytinja3UUmFz1FL8o6EVdgHgKKXYDzTMgdx+dAkIosA4xDsKPJpPN9qPNHrTA5tBs4Xge1So5HrTTG5HCmmiKQ1QF2OUHgnFSk8etU44znk4NW1QbeTzTEV5Bmqrrz3rSdFxxUDL3xUtDKBB7UbmXoSKtNGOuKYYx6VLAE1C5i+5Mw9qe2r3OPnWN/qtRGIdqY0Xqal3AWXUYHH72yjPuKoSzacTuEBQg9hT5oevFZ8sLHPFcde7VmXEn/eeYtzbdeQDnFPu7KVZo5UkhLxsCD94OfbHWqUbSxgx5zGex7VYa2l/s6NxM0pY/
u1QZI+mK+dqRcJnQndGhePGbQO2yMR/MvOM//rrOtymqTML5CsbMNik42iq0cTx2ciXAledJVJjbkIvX86YJUmuw5Lquwgdt/wBPWkoct7feTIfYQ20cs+/zwsbsTsU4I7HNJLo9zY28+oROHSVsuO6qff1ratLiA6O0cpHkiPGdwB4//VVW0uWmsxazo/lzZBcnoMfzo9pK7ZNjKvbeC3KusjzRsnC9dvp+FVHhj+zRsvF2W2kdSR3/AAxW42nyaXL9ktla5gnXO6Q8r+fasW3MqXbwsoM7AgZ4xj6/Wt4Surp3GQ6fFK8skUrEKW+6p4BroLLUpo7l7JciUJkuT8uK5mO21CPUGO8tKnLFDnIqzHdOmpSOJCAVG5mHJp1aanfroTY6K0uDpz/ZCWIGXDp3z6+9WLF7i9vLohkiBfKt6ZHSsHQonvdRmubySRguAFJxx9PSukhsltdRk+yN8kxBCHsTXHWSi2upcS3LMdJkiSWT5X4STt+NUY1k/tF4CVBtiWjdTxIjc8ep57U3Uzf7lR7fdCGyCg3dK0LuaKysYVcDzCwWM7eVz3rFaLu2aIpzWUcTPHdReZFIN6S9Gz3z71Sm0+SNRJC5mg6hh1H1FS3LzvcrNNcJKpGFIOFGDyMdj0qJFubWZpbdv3bHJj6r9R6V6OC3tIzkuw1M8c1MrleCaso0F4uWType4Hr/AFpklnJGN33k/vCvbhDsYtiLLz1qZZcdDVbbkcCnAHtW8SS6s7DoxqZZz6ms0PjrUqyVqmwNETE9zThKezGqKy89alDe9UmIti5cd81ILs+gqiGpd4qrgSC4hI+9SfaYecGssI1OCMO2adwNDz4ienNPE47CqSIT2q1FCfwpgTGTd3xTTz3qVIgKlEQ7UxFXy/SnCLParax47VMsakdKVgKItQad9i9av4A6CgjNS4gZUlkDxjNUbizCg5wAOSa1r6+t7NP3jZbsg5JrltR1Ca+yp/dxdkHf61hU5UtSkmV5prWRinmYj5yw74qKUyCNYbW4KKRwu7H41UWFmc47dfap7aJ5bwY+Zdp3c8189iYWnzNnTF6WHWt3BZwtbvJvn5J2Atkn1Nal49tdaKN6JuhT5Gz0z1Hsax4IBaXztIuUYbd392kupLaK4SRJVZUO5152nH9a5nFSkmhstyWMFxpyQwnDMR5f/wBeoL++utPgt4ZIkw7ArKG+X3xVgSpNJBLYoZTgsQPlO08d6ytZvTq9qbeFNkdq4355PPAIPpnj64rSlFylaW35EMty31ydUTYyOHUDviMU24kuBDJbG0mdo23m5Vcqozyc9/oKct5ZyaD5EETmXAwEUhg44zmtXSdSh/stLedlLbPLkXqzk55x1NKT5FdR2AyIMLcuYZwUCgs55JaibRp49MM7zI4yGYAc8ntV2Hw1DFpzm2u9kj/vA7DgL6GmWp1P+yWHA2pmNxxkgcAj86PaXd4PqFjVleCLT2n2YaNRtK8MQKp6XqM020mPczyEluwH8+P6UzTFuDp9159s8t42XiMg4ZSMEfzqTSrhNTsrqyMUdtK6HDdAD/jWDgkmnrqWhZXv4tUMrMwRslGU5GPatO5gklaK4wJjt+cDnI6jHrWZps88McNnKMCJXRyeQw7Y/WpVvbqynIIV4WO5UIxtB7D6UKnzTUU7D5rIsX1vFqESmJlSc/dbOAf9k+lYkTy20xjdWRgcMp4wa12tbbUpfPspzbXf8SP91/qP6ipJrZpcRX0RhnAwsvUH8e4r2sNhfZxtuYylcpiXOD1PrV63vcEBzj3qhJDLaNslXGfusOhpEkB4rtjeJD1NkxRycj5Se46GoXt2U5x+VVIrhounK+ladtcLKOPxU9RXRFpk2KZiI5IyKaU9K1WgBGV6GoXg9sVaQijgqc1IrVIY8cVGVwaYDw2aN3tUfIOMUZx1FMColwelTLcetY8EzMil12MRyKtI/vVJjNVJ6sJcZ4BrIV8fxVMk2OlO4jYS4Gcc5qdJ
Wx/jWRHcE9MCrUcjN1ancDSDnuRUitVISKg3OwAHcnFQTauqjbANx/vEcUm0hWNZ5khj3yMFHqayLzV5HBW2XaP75/pVCSaS5cF2Lv2H/wBanraTN94rEPVzj9Kzcm9h2KMgLOWclmPJJ5NMispbs/IpCeta6WllEMyFrh/TotLPMzx7eEjHRF4FZOn3Hcx5beCBcMd2OiL0/E96zri6k6Q/JngBa0bqMYJNUU3JcBYY98jDCrjrXFXg+iNIsqbbrbHGznjgA9s9/rVu6sopIo7bcFt4z8zL1Y+/tV2a0EUCNcYeQDkZ4qqLh50MBUfLli/oo7f0ryqlKpFq+5opJk8NjLqdvLcRwxRAjYrg4woHU+grHmt1tdFuJIiRbtIPNbPMuD8oX2J5/Crbx3Oo6WzjfHHI5IROFYD19aFtfK0R/MkEqRfejJ/iPQD/AD61nB8ujfXYe5HGFn8PGHTS0JmfLGQ8r0OM1c02EmTUXhnie7eMJFt5wcYYj86ppNqEcuLe2BtLlVBQj7uM9PSrSSHS7iK8MLbZplXy2Od2Rg49MD9cU5t2aXXX/hwSGH7Tp2gW0UluxYytG28cBc5x+PrWxFdb7dmtoZjE6kR5GSB0/Qg1nWLtNrV2ZhvhY/JC/IxngAU2Rv7S+aKV7ezAbyVX+IBiCD7ZH61lOKk7P1v6jSJbfUdUknaARIG4UkKScDv+NXLCG2mvJpEjLZdmDdmGc7x7+vvVH+0jZazCiZaBIkDKRyD3x+OKtIrmUgYJD70QDgxtnHPfP9KicdNrXGMuruKe/jmRCPLYDPTI71PehhJluR2rLinxNIjLlCxwO4rbhxc2a5O5lG0n+tehhsLrcznIz1GDuGQ3Wtez1QhPJuR5kXTJ61mvGUbHSmjj8K9SneBk9TpPscFzbkQuHjP8BPT8e1Yl5pz2bFgCY/UjkfWi3uXhbKMVP1rUi1NpF2yBXHQ5711WjJE6owgeeePrT1ZlYEZBHStZ7fT5udjQk/3Dx+VQPpqE/u7kH03Cj2bWwXLNleiQhJCFY/ka0Gi4rDNjMvRkb6HFalhcvgW9wCrjhWPetI32YmDwioHgz0rTePNQMuKuwjKeEjmm+Wa0mjB7VEYOaLAciBTwxA60pX0oCVBYqyN3qQSEdqjwT7U4KR0paiJ0uGHQVMLyY8LxVUDHWpFzjHXPQDvRdgSmRmO6Qlj6ntVy2s2lAd/kQ9B3NPtLEIFknAz2X0q8TnhapR6sVyEQiJSEG0d/ekEZ/CpxGep6UpPbFOwiuI6jljwOetWSCBmq0uM5zxUtDKM0TSuscalmY4AFaVvYRWEJPDSsPmf/AD2qzY2/lp50g+dhx7CmXj449qhxS1C5g6g5lc/3RU2n6YptHkuB8snUHuOwqWO0Nzcqh+71b6VpXK7k2Lwo4Fc6pJtzY79DnL28nEj+S+yMcAYqsqg2gnmJkVW3yKeuR0rQvbUD5QKqTxFNKmI/vL/hXmYjDato1jIhsZDc2s0wfypSSsag446ULp8MtzBh2PkoN3s3p/WmwxsbSJVGGLFVA75rXMH2BYY4xnAO73NcyoVHzOJfMjOa+hkQpLGY9pZS6KeG7HNPsw9ksdtKo+yWzM5P94MD/ImoJ4AZSSOSckGtJIftGnvGRmRUIX8ulNYZuNkg5tSOaWOe6ilt412ttQkjtmogkmnak7x/xZHPpUGmyAEQsTw4Zfz6V02pWAY7wPxrqw2D91qREp6nLSoRLkjk81oadP5MoVj8j8GmXMJB5FQbSGx2rtjHkehO5vXNvuGQOaznQqeRWjpt2J0+zSn94B8p/vCn3FqRniuvlT1RBkinq5HtTniKE8UzGDQlYLlpZg3B4NSiXFUgTUqscVohFoSZ6GpEmI4PI9KqA5pwY1SYG/DOJUAPWnOtZNvcGNhnpWj52UDKMjvVCGOtR/jUxkVqaVGaAOU8s96UR+lTDHfmjAzUjItgHWkK1IRTcjtSGN2/hWpY2ohAlkHznoD2
qCxtwzea3IXpn1rRxk04oQ8ksetSLGKRIwe9TFcAAVYhp4HFAJHbmpQnA9ajkzyAaTArvhs+lMihEtwoxlRyaGHBPartpEIodzfebk1G4EsjhEz7cVkykuxY9e1Xbh9zEdhTLWIPJvP3V6fWolq7DC2hMMOMfvG+8fT2qb7P/e5qdU4J6AcCpjHhQcU7CMO9h3MTj2rLvYtunMv96RR/M/0robtADjHasfUF/dwJ/tFz/L/GuepHdlIj0Wx82VXYfLDkj6mtOa2Lv0qzodvt0/eRy7E/0q48YBAxzVwpJQSBvU5O8tSr9Kdakoy1sX1sOpFZJTyx+NT7NJhczr60MF84TgN86H612cDC+0+Gbj50BP17/rXP3sXnWaTj70R2t/unp+tX/DlySklqT935l+h61UI2k13B7Fe+ttkmMcVjSxlJOK7O8tw65xmudurYhjjtVSgCZQjchgQcMDkH3ro7O5W+t/mx5q/eHr71ze3axJFWLad7aVZU7dR6iiGjBmvPb5BwOaz5IipzitsMlxCJU6NVaaAOpwMGtrEmTjnpSqKlkRgQccios880rDJR0oBpEp7DPIpgOU81agnaM8niqPSnq5HemgNcbW+YdDUm32rNhmKnrxVoS8fepgYGQBTS5zxUXmUbsnrSAk6n1qSOMuwUVEpArRtY9ibm+81CVwJ0QKqqowBxVhIwelMQAnHep1wBgVQiRUxxT9nPJpiMfTJpxbPTigBXGBwaquCRkmpJGO04NVmY49qhgPiTzZVU9Op+lXZZAM1UtPlV3IxuOB9KJZAX46Ck9EBHIST3rQt4wkap36tVS2jMjbyOAePetSKPb1/GiMeoMcibm56CpXXI9qcigCkm4iNNrQDKusFyfwrJvQDPjsigVryYZxnp1rJ5mlyesj5/WsZrSw0dLYw+XYQr6IKc6j0qaMbVC+gpjjn8a3toIpTx74mB61h3URDV00iA8isq8h+UsB14qXEDNs2RmaCU/u5QUb2z3/CqVpI+n34Zlw0TlJB7dDVhl8t8Gm36b9lwOS/yv7kDr+VZtdSkdaCs0QKkFSMg1kXdttdsUnh673I1q55TlPpWpcxZG/HNarVXJOTnt9ueOtVNpXiuimgDBhisua1OCR2qXEdxdNvTbS7HP7p+vsfWtiYY5HQ965sqy9a1dNvRIn2aU8j7pP8AKqi+giSRBIMj7wqm6ZJBGDVyVdrHtULFXODwabGVFJXqKnVtw5pjrtYhuaFyOnSkBIycVHjHFSqcikZc9KYDQeafvPrUJ4NLvNFwME3IHemG9iTlpAPxrg/7QnbrNIf+BGpraR5ZQoyWY4FZ8wHoWnTx3cp2HKJyT2rdjOTWJpUC2tskI6jlj6mtpGAFaoCynB4HNSphScjJqBHwcipN/HuaYibzMdOtG4EYqEMCPencAHFIBJXAXHaoDlwOwpxy30pBjIHYVLGSb9sfHbpUagyuFHUmo5JMtgdBVqyjyTIe/A+lK12BegQAAAcAcVaUHNMRMKKnXGMitCSUDAqG5bjFTZGCap3DDGaljKFw2I5Gz2x+dUrRN17AuONwqzcnMeP7xqPThnUY/QAn/P51k1eSGjpM4HvUTHrTlPHNROeSexrckc/K8duaguIg8RHerIGUP0pjr8ucdKAOcuYjux3zUGzzIJIT1IyPqK1b2Hdll6is4fLKDjvUNDRSs7g2t3FMP4WGfp3rtjiSMHqCK4adNk7pjjJxXV6Ncefp6AnLINppQ7DYyaHDHaao+ScsOtbE6dGFUnXkkVpYkyprXcM4rPeF4XyMjB4I7V0YUEGopLVXBBHWpcRmfDdi4jCScSAfnUchwxp8thhuOMVE0Mu3ruxQMBJng80dDkdKrsxQ8jFOWQ9c0XAtA57Yp3eoVcHvUm71oAR1zUW2pjg0m2lYDxFQc1saGmdQiz25rPEfNa+iri9U+xrGO4Hd2bYArSjO41j2rHArSSTA4rdMC8JNowKcCTjmqqNuFTqcDmmB
MCQODQWLcA8UzOeO1Ix7CgALYXimF9qZNVp5xvWFD8xPzH0FEkmSBUtgSpmSRVHVjW7bxhVAHQcVj6aheZpP7vArbT5acUJkwFTj2qBOmTUobkYqhDyflNUJznirjHGRVC4btSYylcHlR7Zo00/6cfZD/MUydssx7dKTTmH21vdD/MVmviGdErfJULHAIzxmlDfKKimPHHrWxJaTlCO9SDBGCO1QxN8n4VMBwPWgDPuo8ZI6VkSRgOCOhreuANprGuEKscdM8UhmZqCbLkH+8oNaHh6fbO8RPDDIqnqfIhbvgjNRafN5F9C3bdistpDOycZ47VQkADlfyrQPIyOtVLlAQHHUVsSVhgZp+3IBFRZwwJ6Gp044HQ0DIZo8jP51QkXYT7VrlQciqVxH19aGBlzqMhu3Q1WeLbyvFXJB1WoFPGKzkNFfJU09Zh3pzpmoHSlcZZ81fWk89fWqmD2pMUcwWPMQvNXrBjHcRkddwFUhVqDggjrWC3A7W2YgYrRjY4rMtTmNT32itJOgreIFtGx0qdSe/Sq0fT8amXk1YibJPPaqVzfAApFye7Ut8zARqCQrZyB3rO6jmonK2g0iS3bMpYntUzP3qCH+OpO4qUNnQadH5dqmep5NaAJ4qvB9xfpU461siCdTlQTUhOce1Rp92pOxoAbI2Kz5z8w/OtB+1Ztx95vpSYFKU5TNRWLldQUeqGpJfun6VBZf8hNf90/yrNbjOkV8pmoJXOCD1Bp0X+pqK4+6a2EXbdsoAatRnjPeqNt91aux0IRDMODWTcja49DWvN1rJuvvGgZl34zAmezGs8NtYEdjWhf/AOpH1/xrPHU1jLcpHbW0omtY3B4YCkccFT0NVNGJOmLn3q3J0rZbEmbJmOQoenUVLBJztqO+6p9aZGTvBpAaQ+YfSoJUzk1Knf6UjfcNUBiXSFTVHcUc+hrTvvu/hWY4+UVEhkoIYUx0psR+apm6VBRUZcUYFSuKZgUAf//Z" 6 | } 7 | } 8 | ] 9 | } -------------------------------------------------------------------------------- /code/project/code/inference-service.yaml: -------------------------------------------------------------------------------- 1 | # https://kserve.github.io/website/modelserving/v1beta1/tensorflow/ 2 | apiVersion: serving.kserve.io/v1beta1 3 | kind: InferenceService 4 | metadata: 5 | name: flower-sample 6 | spec: 7 | predictor: 8 | model: 9 | modelFormat: 10 | name: tensorflow 11 | # This is only needed on Mac M1 12 | image: "emacski/tensorflow-serving:2.6.0" 13 | # https://kserve.github.io/website/modelserving/storage/pvc/pvc/ 14 | # Note that we are skipping `mountPath: /trained_model` 15 | storageUri: "pvc://strategy-volume/saved_model_versions" 16 | -------------------------------------------------------------------------------- /code/project/code/model-selection.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import 
tensorflow as tf 3 | from tensorflow import keras 4 | import tensorflow_datasets as tfds 5 | import shutil 6 | import os 7 | 8 | 9 | # Scaling MNIST data from (0, 255] to (0., 1.] 10 | def scale(image, label): 11 | image = tf.cast(image, tf.float32) 12 | image /= 255 13 | return image, label 14 | 15 | best_model_path = "" 16 | best_accuracy = 0 17 | for i in range(1, 4): 18 | model_path = "trained_model/saved_model_versions/" + str(i) 19 | model = keras.models.load_model(model_path) 20 | datasets, _ = tfds.load(name='fashion_mnist', with_info=True, as_supervised=True) 21 | ds = datasets['test'].map(scale).cache().shuffle(10000).batch(64) 22 | _, accuracy = model.evaluate(ds) 23 | if accuracy > best_accuracy: 24 | best_accuracy = accuracy 25 | best_model_path = model_path 26 | 27 | destination = "trained_model/saved_model_versions/4" 28 | if os.path.exists(destination): 29 | shutil.rmtree(destination) 30 | 31 | shutil.copytree(best_model_path, destination) 32 | print("Best model with accuracy %f is copied to %s" % (best_accuracy, destination)) 33 | -------------------------------------------------------------------------------- /code/project/code/model-selection.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: model-selection 5 | spec: 6 | containers: 7 | - name: predict 8 | image: kubeflow/multi-worker-strategy:v0.1 9 | command: ["python", "/model-selection.py"] 10 | volumeMounts: 11 | - name: model 12 | mountPath: /trained_model 13 | volumes: 14 | - name: model 15 | persistentVolumeClaim: 16 | claimName: strategy-volume 17 | -------------------------------------------------------------------------------- /code/project/code/multi-worker-distributed-training.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | 3 | import argparse 4 | import json 5 | import os 6 | 7 | 
import tensorflow_datasets as tfds 8 | import tensorflow as tf 9 | from tensorflow.keras import layers, models 10 | 11 | 12 | def make_datasets_unbatched(): 13 | BUFFER_SIZE = 10000 14 | 15 | # Scaling MNIST data from (0, 255] to (0., 1.] 16 | def scale(image, label): 17 | image = tf.cast(image, tf.float32) 18 | image /= 255 19 | return image, label 20 | # Use Fashion-MNIST: https://www.tensorflow.org/datasets/catalog/fashion_mnist 21 | datasets, _ = tfds.load(name='fashion_mnist', with_info=True, as_supervised=True) 22 | 23 | return datasets['train'].map(scale).cache().shuffle(BUFFER_SIZE) 24 | 25 | 26 | def build_and_compile_cnn_model(): 27 | print("Training CNN model") 28 | model = models.Sequential() 29 | model.add(layers.Input(shape=(28, 28, 1), name='image_bytes')) 30 | model.add( 31 | layers.Conv2D(32, (3, 3), activation='relu')) 32 | model.add(layers.MaxPooling2D((2, 2))) 33 | model.add(layers.Conv2D(64, (3, 3), activation='relu')) 34 | model.add(layers.MaxPooling2D((2, 2))) 35 | model.add(layers.Conv2D(64, (3, 3), activation='relu')) 36 | model.add(layers.Flatten()) 37 | model.add(layers.Dense(64, activation='relu')) 38 | model.add(layers.Dense(10, activation='softmax')) 39 | 40 | model.summary() 41 | 42 | model.compile(optimizer='adam', 43 | loss='sparse_categorical_crossentropy', 44 | metrics=['accuracy']) 45 | 46 | return model 47 | 48 | # https://d2l.ai/chapter_convolutional-modern/batch-norm.html#concise-implementation 49 | def build_and_compile_cnn_model_with_batch_norm(): 50 | print("Training CNN model with batch normalization") 51 | model = models.Sequential() 52 | model.add(layers.Input(shape=(28, 28, 1), name='image_bytes')) 53 | model.add( 54 | layers.Conv2D(32, (3, 3), activation='relu')) 55 | model.add(layers.BatchNormalization()) 56 | model.add(layers.Activation('sigmoid')) 57 | model.add(layers.MaxPooling2D((2, 2))) 58 | model.add(layers.Conv2D(64, (3, 3), activation='relu')) 59 | model.add(layers.BatchNormalization()) 60 | 
model.add(layers.Activation('sigmoid')) 61 | model.add(layers.MaxPooling2D((2, 2))) 62 | model.add(layers.Conv2D(64, (3, 3), activation='relu')) 63 | model.add(layers.Flatten()) 64 | model.add(layers.Dense(64, activation='relu')) 65 | model.add(layers.Dense(10, activation='softmax')) 66 | 67 | model.summary() 68 | 69 | model.compile(optimizer='adam', 70 | loss='sparse_categorical_crossentropy', 71 | metrics=['accuracy']) 72 | 73 | return model 74 | 75 | # https://d2l.ai/chapter_convolutional-modern/alexnet.html 76 | def build_and_compile_cnn_model_with_dropout(): 77 | print("Training CNN model with dropout") 78 | model = models.Sequential() 79 | model.add(layers.Input(shape=(28, 28, 1), name='image_bytes')) 80 | model.add( 81 | layers.Conv2D(32, (3, 3), activation='relu')) 82 | model.add(layers.MaxPooling2D((2, 2))) 83 | model.add(layers.Conv2D(64, (3, 3), activation='relu')) 84 | model.add(layers.MaxPooling2D((2, 2))) 85 | model.add(layers.Dropout(0.5)) 86 | model.add(layers.Conv2D(64, (3, 3), activation='relu')) 87 | model.add(layers.Flatten()) 88 | model.add(layers.Dense(64, activation='relu')) 89 | model.add(layers.Dense(10, activation='softmax')) 90 | 91 | model.summary() 92 | 93 | model.compile(optimizer='adam', 94 | loss='sparse_categorical_crossentropy', 95 | metrics=['accuracy']) 96 | 97 | return model 98 | 99 | 100 | def decay(epoch): 101 | if epoch < 3: 102 | return 1e-3 103 | if 3 <= epoch < 7: 104 | return 1e-4 105 | return 1e-5 106 | 107 | # https://cloud.google.com/blog/topics/developers-practitioners/add-preprocessing-functions-tensorflow-models-and-deploy-vertex-ai 108 | def _preprocess(bytes_inputs): 109 | decoded = tf.io.decode_jpeg(bytes_inputs, channels=1) 110 | resized = tf.image.resize(decoded, size=(28, 28)) 111 | return tf.cast(resized, dtype=tf.uint8) 112 | 113 | def _get_serve_image_fn(model): 114 | @tf.function(input_signature=[tf.TensorSpec([None], dtype=tf.string, name='image_bytes')]) 115 | def serve_image_fn(bytes_inputs): 116 | 
decoded_images = tf.map_fn(_preprocess, bytes_inputs, dtype=tf.uint8) 117 | return model(decoded_images) 118 | return serve_image_fn 119 | 120 | 121 | def main(args): 122 | 123 | # MultiWorkerMirroredStrategy creates copies of all variables in the model's 124 | # layers on each device across all workers 125 | # if your GPUs don't support NCCL, replace "communication" with another 126 | # https://www.tensorflow.org/tutorials/distribute/keras 127 | strategy = tf.distribute.MultiWorkerMirroredStrategy( 128 | communication_options=tf.distribute.experimental.CommunicationOptions(implementation=tf.distribute.experimental.CollectiveCommunication.AUTO)) 129 | 130 | BATCH_SIZE_PER_REPLICA = 64 131 | BATCH_SIZE = BATCH_SIZE_PER_REPLICA * strategy.num_replicas_in_sync 132 | 133 | with strategy.scope(): 134 | ds_train = make_datasets_unbatched().batch(BATCH_SIZE).repeat() 135 | options = tf.data.Options() 136 | # https://www.tensorflow.org/tutorials/distribute/input 137 | options.experimental_distribute.auto_shard_policy = \ 138 | tf.data.experimental.AutoShardPolicy.DATA 139 | ds_train = ds_train.with_options(options) 140 | # Model building/compiling need to be within `strategy.scope()`. 141 | if args.model_type == "cnn": 142 | multi_worker_model = build_and_compile_cnn_model() 143 | elif args.model_type == "dropout": 144 | multi_worker_model = build_and_compile_cnn_model_with_dropout() 145 | elif args.model_type == "batch_norm": 146 | multi_worker_model = build_and_compile_cnn_model_with_batch_norm() 147 | else: 148 | raise Exception("Unsupported model type: %s" % args.model_type) 149 | 150 | # Define the checkpoint directory to store the checkpoints 151 | checkpoint_dir = args.checkpoint_dir 152 | 153 | # Name of the checkpoint files 154 | checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}") 155 | 156 | # Function for decaying the learning rate. 157 | # You can define any decay function you need. 158 | # Callback for printing the LR at the end of each epoch. 
159 | class PrintLR(tf.keras.callbacks.Callback): 160 | 161 | def on_epoch_end(self, epoch, logs=None): #pylint: disable=no-self-use 162 | print('\nLearning rate for epoch {} is {}'.format( 163 | epoch + 1, multi_worker_model.optimizer.lr.numpy())) 164 | 165 | callbacks = [ 166 | tf.keras.callbacks.TensorBoard(log_dir='./logs'), 167 | tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_prefix, 168 | save_weights_only=True), 169 | tf.keras.callbacks.LearningRateScheduler(decay), 170 | PrintLR() 171 | ] 172 | 173 | # Keras' `model.fit()` trains the model with specified number of epochs and 174 | # number of steps per epoch. Note that the numbers here are for demonstration 175 | # purposes only and may not sufficiently produce a model with good quality. 176 | multi_worker_model.fit(ds_train, 177 | epochs=1, 178 | steps_per_epoch=70, 179 | callbacks=callbacks) 180 | 181 | # Saving a model 182 | # Let `is_chief` be a utility function that inspects the cluster spec and 183 | # current task type and returns True if the worker is the chief and False 184 | # otherwise. 185 | def is_chief(): 186 | return TASK_INDEX == 0 187 | 188 | if is_chief(): 189 | model_path = args.saved_model_dir 190 | 191 | else: 192 | # Save to a path that is unique across workers. 
193 | model_path = args.saved_model_dir + '/worker_tmp_' + str(TASK_INDEX) 194 | 195 | multi_worker_model.save(model_path) 196 | 197 | 198 | signatures = { 199 | "serving_default": _get_serve_image_fn(multi_worker_model).get_concrete_function( 200 | tf.TensorSpec(shape=[None], dtype=tf.string, name='image_bytes') 201 | ) 202 | } 203 | 204 | # https://www.tensorflow.org/api_docs/python/tf/saved_model/save 205 | tf.saved_model.save(multi_worker_model, model_path, signatures=signatures) 206 | 207 | 208 | if __name__ == '__main__': 209 | os.environ['NCCL_DEBUG'] = 'INFO' 210 | 211 | tfds.disable_progress_bar() 212 | 213 | # to decide if a worker is chief, get TASK_INDEX in Cluster info 214 | tf_config = json.loads(os.environ.get('TF_CONFIG') or '{}') 215 | TASK_INDEX = tf_config['task']['index'] 216 | 217 | parser = argparse.ArgumentParser() 218 | parser.add_argument('--saved_model_dir', 219 | type=str, 220 | required=True, 221 | help='Tensorflow export directory.') 222 | 223 | parser.add_argument('--checkpoint_dir', 224 | type=str, 225 | required=True, 226 | help='Tensorflow checkpoint directory.') 227 | 228 | parser.add_argument('--model_type', 229 | type=str, 230 | required=True, 231 | help='Type of model to train.') 232 | 233 | parsed_args = parser.parse_args() 234 | main(parsed_args) 235 | -------------------------------------------------------------------------------- /code/project/code/multi-worker-pvc.yaml: -------------------------------------------------------------------------------- 1 | kind: PersistentVolumeClaim 2 | apiVersion: v1 3 | metadata: 4 | name: strategy-volume 5 | spec: 6 | accessModes: [ "ReadWriteOnce" ] 7 | resources: 8 | requests: 9 | storage: 1Gi 10 | -------------------------------------------------------------------------------- /code/project/code/multi-worker-tfjob.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kubeflow.org/v1 2 | kind: TFJob 3 | metadata: 4 | name: 
multi-worker-training 5 | spec: 6 | runPolicy: 7 | cleanPodPolicy: None 8 | tfReplicaSpecs: 9 | Worker: 10 | replicas: 2 11 | restartPolicy: Never 12 | template: 13 | spec: 14 | containers: 15 | - name: tensorflow 16 | image: kubeflow/multi-worker-strategy:v0.1 17 | imagePullPolicy: IfNotPresent 18 | command: ["python", "/multi-worker-distributed-training.py", "--saved_model_dir", "/trained_model/saved_model_versions/2/", "--checkpoint_dir", "/trained_model/checkpoint", "--model_type", "cnn"] 19 | volumeMounts: 20 | - mountPath: /trained_model 21 | name: training 22 | resources: 23 | limits: 24 | cpu: 500m 25 | volumes: 26 | - name: training 27 | persistentVolumeClaim: 28 | claimName: strategy-volume 29 | -------------------------------------------------------------------------------- /code/project/code/predict-service.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from tensorflow import keras 4 | import tensorflow_datasets as tfds 5 | 6 | 7 | model = keras.models.load_model("trained_model/saved_model_versions") 8 | 9 | # Scaling MNIST data from (0, 255] to (0., 1.] 
10 | def scale(image, label): 11 | image = tf.cast(image, tf.float32) 12 | image /= 255 13 | return image, label 14 | 15 | datasets, _ = tfds.load(name='fashion_mnist', with_info=True, as_supervised=True) 16 | 17 | ds = datasets['test'].map(scale).cache().shuffle(10000).batch(64) 18 | 19 | # TODO: Visualize the images and compare with the classified result 20 | model.predict(ds) 21 | -------------------------------------------------------------------------------- /code/project/code/predict-service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: predict-service 5 | spec: 6 | containers: 7 | - name: predict 8 | image: kubeflow/multi-worker-strategy:v0.1 9 | command: ['sleep', 'infinity'] 10 | volumeMounts: 11 | - name: model 12 | mountPath: /trained_model 13 | volumes: 14 | - name: model 15 | persistentVolumeClaim: 16 | claimName: strategy-volume 17 | -------------------------------------------------------------------------------- /code/project/code/workflow.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: argoproj.io/v1alpha1 2 | kind: Workflow 3 | metadata: 4 | generateName: tfjob-wf- 5 | namespace: kubeflow 6 | spec: 7 | entrypoint: tfjob-wf 8 | podGC: 9 | strategy: OnPodSuccess 10 | volumes: 11 | - name: model 12 | persistentVolumeClaim: 13 | claimName: strategy-volume 14 | 15 | templates: 16 | - name: tfjob-wf 17 | steps: 18 | - - name: data-ingestion-step 19 | template: data-ingestion-step 20 | - - name: distributed-tf-training-steps 21 | template: distributed-tf-training-steps 22 | - - name: model-selection-step 23 | template: model-selection-step 24 | - - name: create-model-serving-service 25 | template: create-model-serving-service 26 | 27 | - name: data-ingestion-step 28 | serviceAccountName: argo 29 | memoize: 30 | key: "step-cache" 31 | maxAge: "1h" 32 | cache: 33 | configMap: 34 | name: my-config 35 | key: 
step-cache 36 | container: 37 | image: kubeflow/multi-worker-strategy:v0.1 38 | imagePullPolicy: IfNotPresent 39 | command: ["python", "/data-ingestion.py"] 40 | 41 | - name: distributed-tf-training-steps 42 | steps: 43 | - - name: cnn-model 44 | template: cnn-model 45 | - name: cnn-model-with-dropout 46 | template: cnn-model-with-dropout 47 | - name: cnn-model-with-batch-norm 48 | template: cnn-model-with-batch-norm 49 | 50 | - name: cnn-model 51 | serviceAccountName: training-operator 52 | resource: 53 | action: create 54 | setOwnerReference: true 55 | successCondition: status.replicaStatuses.Worker.succeeded = 2 56 | failureCondition: status.replicaStatuses.Worker.failed > 0 57 | manifest: | 58 | apiVersion: kubeflow.org/v1 59 | kind: TFJob 60 | metadata: 61 | generateName: multi-worker-training- 62 | spec: 63 | runPolicy: 64 | cleanPodPolicy: None 65 | tfReplicaSpecs: 66 | Worker: 67 | replicas: 2 68 | restartPolicy: Never 69 | template: 70 | spec: 71 | containers: 72 | - name: tensorflow 73 | image: kubeflow/multi-worker-strategy:v0.1 74 | imagePullPolicy: IfNotPresent 75 | command: ["python", "/multi-worker-distributed-training.py", "--saved_model_dir", "/trained_model/saved_model_versions/1/", "--checkpoint_dir", "/trained_model/checkpoint", "--model_type", "cnn"] 76 | volumeMounts: 77 | - mountPath: /trained_model 78 | name: training 79 | resources: 80 | limits: 81 | cpu: 500m 82 | volumes: 83 | - name: training 84 | persistentVolumeClaim: 85 | claimName: strategy-volume 86 | 87 | - name: cnn-model-with-dropout 88 | serviceAccountName: training-operator 89 | resource: 90 | action: create 91 | setOwnerReference: true 92 | successCondition: status.replicaStatuses.Worker.succeeded = 2 93 | failureCondition: status.replicaStatuses.Worker.failed > 0 94 | manifest: | 95 | apiVersion: kubeflow.org/v1 96 | kind: TFJob 97 | metadata: 98 | generateName: multi-worker-training- 99 | spec: 100 | runPolicy: 101 | cleanPodPolicy: None 102 | tfReplicaSpecs: 103 | Worker: 
104 |               replicas: 2
105 |               restartPolicy: Never
106 |               template:
107 |                 spec:
108 |                   containers:
109 |                     - name: tensorflow
110 |                       image: kubeflow/multi-worker-strategy:v0.1
111 |                       imagePullPolicy: IfNotPresent
112 |                       command: ["python", "/multi-worker-distributed-training.py", "--saved_model_dir", "/trained_model/saved_model_versions/2/", "--checkpoint_dir", "/trained_model/checkpoint", "--model_type", "dropout"]
113 |                       volumeMounts:
114 |                         - mountPath: /trained_model
115 |                           name: training
116 |                       resources:
117 |                         limits:
118 |                           cpu: 500m
119 |                   volumes:
120 |                     - name: training
121 |                       persistentVolumeClaim:
122 |                         claimName: strategy-volume
123 | 
124 |   - name: cnn-model-with-batch-norm
125 |     serviceAccountName: training-operator
126 |     resource:
127 |       action: create
128 |       setOwnerReference: true
129 |       successCondition: status.replicaStatuses.Worker.succeeded = 2
130 |       failureCondition: status.replicaStatuses.Worker.failed > 0
131 |       manifest: |
132 |         apiVersion: kubeflow.org/v1
133 |         kind: TFJob
134 |         metadata:
135 |           generateName: multi-worker-training-
136 |         spec:
137 |           runPolicy:
138 |             cleanPodPolicy: None
139 |           tfReplicaSpecs:
140 |             Worker:
141 |               replicas: 2
142 |               restartPolicy: Never
143 |               template:
144 |                 spec:
145 |                   containers:
146 |                     - name: tensorflow
147 |                       image: kubeflow/multi-worker-strategy:v0.1
148 |                       imagePullPolicy: IfNotPresent
149 |                       command: ["python", "/multi-worker-distributed-training.py", "--saved_model_dir", "/trained_model/saved_model_versions/3/", "--checkpoint_dir", "/trained_model/checkpoint", "--model_type", "batch_norm"] # fixed: was "dropout", which trained a second dropout model into version 3 instead of the batch-norm variant
150 |                       volumeMounts:
151 |                         - mountPath: /trained_model
152 |                           name: training
153 |                       resources:
154 |                         limits:
155 |                           cpu: 500m
156 |                   volumes:
157 |                     - name: training
158 |                       persistentVolumeClaim:
159 |                         claimName: strategy-volume
160 | 
161 |   - name: model-selection-step
162 |     serviceAccountName: argo
163 |     container:
164 |       image: kubeflow/multi-worker-strategy:v0.1
165 |       imagePullPolicy: IfNotPresent
166 |       command: ["python", "/model-selection.py"]
167 | 
volumeMounts: 168 | - name: model 169 | mountPath: /trained_model 170 | 171 | - name: create-model-serving-service 172 | serviceAccountName: training-operator 173 | successCondition: status.modelStatus.states.transitionStatus = UpToDate 174 | resource: 175 | action: create 176 | setOwnerReference: true 177 | manifest: | 178 | apiVersion: serving.kserve.io/v1beta1 179 | kind: InferenceService 180 | metadata: 181 | name: flower-sample 182 | spec: 183 | predictor: 184 | model: 185 | modelFormat: 186 | name: tensorflow 187 | image: "emacski/tensorflow-serving:2.6.0" 188 | storageUri: "pvc://strategy-volume/saved_model_versions" 189 | -------------------------------------------------------------------------------- /code/project/manifests/argo-workflows/kustomization.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kustomize.config.k8s.io/v1beta1 2 | kind: Kustomization 3 | namespace: kubeflow 4 | 5 | resources: 6 | - https://github.com/argoproj/argo-workflows/releases/download/v3.4.0/install.yaml 7 | 8 | patchesStrategicMerge: 9 | - rbac-patch.yaml 10 | -------------------------------------------------------------------------------- /code/project/manifests/argo-workflows/rbac-patch.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRole 3 | metadata: 4 | name: argo-cluster-role 5 | rules: 6 | - apiGroups: 7 | - "" 8 | resources: 9 | - pods 10 | - pods/exec 11 | verbs: 12 | - create 13 | - get 14 | - list 15 | - watch 16 | - update 17 | - patch 18 | - delete 19 | - apiGroups: 20 | - "" 21 | resources: 22 | - configmaps 23 | verbs: 24 | # Note(terrytangyuan): "create" and "update" are additional RBAC needed to use memoization cache. 
25 | - create 26 | - update 27 | - get 28 | - watch 29 | - list 30 | - apiGroups: 31 | - "" 32 | resources: 33 | - persistentvolumeclaims 34 | - persistentvolumeclaims/finalizers 35 | verbs: 36 | - create 37 | - update 38 | - delete 39 | - get 40 | - apiGroups: 41 | - argoproj.io 42 | resources: 43 | - workflows 44 | - workflows/finalizers 45 | - workflowtasksets 46 | - workflowtasksets/finalizers 47 | - workflowartifactgctasks 48 | verbs: 49 | - get 50 | - list 51 | - watch 52 | - update 53 | - patch 54 | - delete 55 | - create 56 | - apiGroups: 57 | - argoproj.io 58 | resources: 59 | - workflowtemplates 60 | - workflowtemplates/finalizers 61 | - clusterworkflowtemplates 62 | - clusterworkflowtemplates/finalizers 63 | verbs: 64 | - get 65 | - list 66 | - watch 67 | - apiGroups: 68 | - argoproj.io 69 | resources: 70 | - workflowtaskresults 71 | verbs: 72 | - list 73 | - watch 74 | - deletecollection 75 | - apiGroups: 76 | - "" 77 | resources: 78 | - serviceaccounts 79 | verbs: 80 | - get 81 | - list 82 | - apiGroups: 83 | - argoproj.io 84 | resources: 85 | - cronworkflows 86 | - cronworkflows/finalizers 87 | verbs: 88 | - get 89 | - list 90 | - watch 91 | - update 92 | - patch 93 | - delete 94 | - apiGroups: 95 | - "" 96 | resources: 97 | - events 98 | verbs: 99 | - create 100 | - patch 101 | - apiGroups: 102 | - policy 103 | resources: 104 | - poddisruptionbudgets 105 | verbs: 106 | - create 107 | - get 108 | - delete -------------------------------------------------------------------------------- /code/project/manifests/kubeflow-training/cluster-role-binding.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: ClusterRoleBinding 4 | metadata: 5 | labels: 6 | app: training-operator 7 | name: training-operator 8 | roleRef: 9 | apiGroup: rbac.authorization.k8s.io 10 | kind: ClusterRole 11 | name: training-operator 12 | subjects: 13 | - kind: ServiceAccount 14 | name: 
training-operator 15 | -------------------------------------------------------------------------------- /code/project/manifests/kubeflow-training/cluster-role.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: ClusterRole 4 | metadata: 5 | labels: 6 | app: training-operator 7 | name: training-operator 8 | rules: 9 | - apiGroups: 10 | - serving.kserve.io 11 | resources: 12 | - inferenceservices 13 | verbs: 14 | - "*" 15 | - apiGroups: 16 | - kubeflow.org 17 | resources: 18 | - tfjobs 19 | - mxjobs 20 | - pytorchjobs 21 | - xgboostjobs 22 | - tfjobs/status 23 | - pytorchjobs/status 24 | - mxjobs/status 25 | - xgboostjobs/status 26 | verbs: 27 | - create 28 | - delete 29 | - get 30 | - list 31 | - patch 32 | - update 33 | - watch 34 | - apiGroups: 35 | - "" 36 | resources: 37 | - pods 38 | - services 39 | - endpoints 40 | - events 41 | verbs: 42 | - "*" 43 | - apiGroups: 44 | - apps 45 | - extensions 46 | resources: 47 | - deployments 48 | verbs: 49 | - "*" 50 | - apiGroups: 51 | - scheduling.volcano.sh 52 | resources: 53 | - podgroups 54 | verbs: 55 | - "*" 56 | -------------------------------------------------------------------------------- /code/project/manifests/kubeflow-training/crds/kustomization.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kustomize.config.k8s.io/v1beta1 2 | kind: Kustomization 3 | resources: 4 | - kubeflow.org_tfjobs.yaml 5 | - kubeflow.org_mxjobs.yaml 6 | - kubeflow.org_pytorchjobs.yaml 7 | - kubeflow.org_xgboostjobs.yaml 8 | -------------------------------------------------------------------------------- /code/project/manifests/kubeflow-training/deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: training-operator 5 | labels: 6 | control-plane: kubeflow-training-operator 7 | 
spec: 8 | selector: 9 | matchLabels: 10 | control-plane: kubeflow-training-operator 11 | replicas: 1 12 | template: 13 | metadata: 14 | labels: 15 | control-plane: kubeflow-training-operator 16 | annotations: 17 | sidecar.istio.io/inject: "false" 18 | spec: 19 | containers: 20 | - command: 21 | - /manager 22 | image: kubeflow/training-operator 23 | name: training-operator 24 | env: 25 | - name: MY_POD_NAMESPACE 26 | valueFrom: 27 | fieldRef: 28 | fieldPath: metadata.namespace 29 | - name: MY_POD_NAME 30 | valueFrom: 31 | fieldRef: 32 | fieldPath: metadata.name 33 | securityContext: 34 | allowPrivilegeEscalation: false 35 | livenessProbe: 36 | httpGet: 37 | path: /healthz 38 | port: 8081 39 | initialDelaySeconds: 15 40 | periodSeconds: 20 41 | readinessProbe: 42 | httpGet: 43 | path: /readyz 44 | port: 8081 45 | initialDelaySeconds: 5 46 | periodSeconds: 10 47 | resources: 48 | limits: 49 | cpu: 100m 50 | memory: 30Mi 51 | requests: 52 | cpu: 100m 53 | memory: 20Mi 54 | serviceAccountName: training-operator 55 | terminationGracePeriodSeconds: 10 56 | -------------------------------------------------------------------------------- /code/project/manifests/kubeflow-training/kustomization.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kustomize.config.k8s.io/v1beta1 2 | kind: Kustomization 3 | namespace: kubeflow 4 | resources: 5 | - crds/ 6 | - cluster-role-binding.yaml 7 | - cluster-role.yaml 8 | - service-account.yaml 9 | - service.yaml 10 | - deployment.yaml 11 | images: 12 | - name: kubeflow/training-operator 13 | newName: public.ecr.aws/j1r0q0g6/training/training-operator 14 | newTag: "5ef6c405df2bb1bf1d3ede988cd43433eff2e956" 15 | -------------------------------------------------------------------------------- /code/project/manifests/kubeflow-training/service-account.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | 
metadata: 4 | labels: 5 | app: training-operator 6 | name: training-operator 7 | -------------------------------------------------------------------------------- /code/project/manifests/kubeflow-training/service.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: v1 3 | kind: Service 4 | metadata: 5 | annotations: 6 | prometheus.io/path: /metrics 7 | prometheus.io/scrape: "true" 8 | prometheus.io/port: "8443" 9 | labels: 10 | app: training-operator 11 | name: training-operator 12 | spec: 13 | ports: 14 | - name: monitoring-port 15 | port: 8443 16 | targetPort: 8443 17 | selector: 18 | name: training-operator 19 | type: ClusterIP 20 | -------------------------------------------------------------------------------- /code/project/manifests/kustomization.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kustomize.config.k8s.io/v1beta1 2 | kind: Kustomization 3 | namespace: kubeflow 4 | 5 | resources: 6 | - argo-workflows/ 7 | - kubeflow-training/ 8 | -------------------------------------------------------------------------------- /images/chinese-cover.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/terrytangyuan/distributed-ml-patterns/0c64a653ef4a3d51dab1ab98294730dc260d4a37/images/chinese-cover.pdf -------------------------------------------------------------------------------- /images/english-front-cover.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/terrytangyuan/distributed-ml-patterns/0c64a653ef4a3d51dab1ab98294730dc260d4a37/images/english-front-cover.png -------------------------------------------------------------------------------- /images/korean-cover-clean.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/terrytangyuan/distributed-ml-patterns/0c64a653ef4a3d51dab1ab98294730dc260d4a37/images/korean-cover-clean.png -------------------------------------------------------------------------------- /images/korean-cover-white.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/terrytangyuan/distributed-ml-patterns/0c64a653ef4a3d51dab1ab98294730dc260d4a37/images/korean-cover-white.jpg -------------------------------------------------------------------------------- /images/korean-cover.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/terrytangyuan/distributed-ml-patterns/0c64a653ef4a3d51dab1ab98294730dc260d4a37/images/korean-cover.jpg --------------------------------------------------------------------------------