├── .github ├── CODEOWNERS └── workflows │ ├── publish-docs.yaml │ └── validate-docs.yaml ├── .gitignore ├── LICENSE ├── Makefile ├── README.rst ├── VERSION ├── _extra └── robots.txt ├── _static ├── SystemsApproachLogoURL.png ├── bridge.ico ├── cover.jpg ├── css │ └── rtd_theme_mods.css └── fonts │ ├── Inconsolata-Bold.ttf │ └── Inconsolata-Regular.ttf ├── arch.rst ├── authors.rst ├── code ├── build.sh ├── cluster-edge_val.tfvars ├── cluster-gcp_val.tfvars ├── log.ascii ├── log.json ├── main-rke.tf ├── prometheus-rule.yaml ├── provider.tf ├── roc-api-tests.groovy ├── template.yang ├── trigger-event.yaml ├── trigger-time.yaml └── uptime.yaml ├── conf.py ├── control.rst ├── dict.txt ├── figures.pptx ├── figures ├── Slide1.png ├── Slide10.png ├── Slide11.png ├── Slide12.png ├── Slide13.png ├── Slide14.png ├── Slide15.png ├── Slide16.png ├── Slide17.png ├── Slide18.png ├── Slide19.png ├── Slide2.png ├── Slide20.png ├── Slide21.png ├── Slide22.png ├── Slide23.png ├── Slide24.png ├── Slide25.png ├── Slide26.png ├── Slide27.png ├── Slide3.png ├── Slide4.png ├── Slide5.png ├── Slide6.png ├── Slide7.png ├── Slide8.png ├── Slide9.png ├── ace_dash.png ├── cable_list.png ├── es_dash.png ├── gui1.png ├── gui2.png ├── pronto_logical_diagram.png ├── pronto_logical_diagram.svg ├── rack_diagram.png └── upf_dash.png ├── foreword.rst ├── index.rst ├── intro.rst ├── latest.rst ├── lifecycle.rst ├── monitor.rst ├── preface.rst ├── print.rst ├── provision.rst └── requirements.txt /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | #require review 2 | * @llpeterson @drbruced12 3 | -------------------------------------------------------------------------------- /.github/workflows/publish-docs.yaml: -------------------------------------------------------------------------------- 1 | name: Publish Docs Workflow 2 | run-name: ${{ github.actor }} is publishing document artifacts 🚀 3 | on: 4 | push: 5 | branches: 6 | - master 7 | 8 | # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages 9 | permissions: 10 | contents: read 11 | pages: write 12 | id-token: write 13 | 14 | # Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued. 15 | # However, do NOT cancel in-progress runs as we want to allow these production deployments to complete. 16 | concurrency: 17 | group: "pages" 18 | cancel-in-progress: false 19 | 20 | jobs: 21 | # Single deploy job since we're just deploying 22 | deploy: 23 | environment: 24 | name: github-pages 25 | url: ${{ steps.deployment.outputs.page_url }} 26 | runs-on: ubuntu-latest 27 | steps: 28 | - name: Checkout 29 | uses: actions/checkout@v4 30 | - name: Setup Pages 31 | uses: actions/configure-pages@v4 32 | - name: Build html 33 | run: make html 34 | - name: Upload artifact 35 | uses: actions/upload-pages-artifact@v3 36 | with: 37 | # Upload build repository 38 | path: './_build/html' 39 | - name: Deploy to GitHub Pages 40 | id: deployment 41 | uses: actions/deploy-pages@v4 42 | 43 | 44 | - run: echo "🍏 This job's status is ${{ job.status }}." 
45 | -------------------------------------------------------------------------------- /.github/workflows/validate-docs.yaml: -------------------------------------------------------------------------------- 1 | name: Validate Docs Workflow 2 | run-name: ${{ github.actor }} is validating document source 3 | on: [pull_request, workflow_dispatch] 4 | jobs: 5 | Validate_Docs: 6 | runs-on: ubuntu-latest 7 | steps: 8 | - run: echo "🎉 The job was automatically triggered by a ${{ github.event_name }} event." 9 | - run: echo "🐧 This job is now running on a ${{ runner.os }} server hosted by GitHub!" 10 | - run: echo "🔎 The name of your branch is ${{ github.ref }} and your repository is ${{ github.repository }}." 11 | - name: Check out repo 12 | uses: actions/checkout@v4 13 | - name: Validate source 14 | run: make test 15 | - name: Build html 16 | run: make html 17 | - name: List built files 18 | run: | 19 | ls ${{ github.workspace }}/_build/html 20 | 21 | - run: echo "🍏 This job's status is ${{ job.status }}." 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pdf 2 | *.tex 3 | venv-docs 4 | .DS_Store 5 | */.DS_Store 6 | figures-pdf/ 7 | figures-hi_res/ 8 | figures-low_res/ 9 | private/ 10 | local/ 11 | scripts/ 12 | _build/ 13 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Creative Commons Attribution 4.0 International Public License 2 | 3 | By exercising the Licensed Rights (defined below), You accept and 4 | agree to be bound by the terms and conditions of this Creative Commons 5 | Attribution-NonCommercial-NoDerivatives 4.0 International Public 6 | License ("Public License"). To the extent this Public License may be 7 | interpreted as a contract, You are granted the Licensed Rights in 8 | consideration of Your acceptance of these terms and conditions, and 9 | the Licensor grants You such rights in consideration of benefits the 10 | Licensor receives from making the Licensed Material available under 11 | these terms and conditions. 12 | 13 | Section 1 – Definitions. 14 | 15 | (a) Adapted Material means material subject to Copyright and Similar 16 | Rights that is derived from or based upon the Licensed Material and 17 | in which the Licensed Material is translated, altered, arranged, 18 | transformed, or otherwise modified in a manner requiring permission 19 | under the Copyright and Similar Rights held by the Licensor. For 20 | purposes of this Public License, where the Licensed Material is a 21 | musical work, performance, or sound recording, Adapted Material is 22 | always produced where the Licensed Material is synched in timed 23 | relation with a moving image. 24 | 25 | (b) Copyright and Similar Rights means copyright and/or similar 26 | rights closely related to copyright including, without limitation, 27 | performance, broadcast, sound recording, and Sui Generis Database 28 | Rights, without regard to how the rights are labeled or 29 | categorized. For purposes of this Public License, the rights 30 | specified in Section 2(b)(1)-(2) are not Copyright and Similar 31 | Rights. 
32 | 33 | (c) Effective Technological Measures means those measures that, in 34 | the absence of proper authority, may not be circumvented under laws 35 | fulfilling obligations under Article 11 of the WIPO Copyright Treaty 36 | adopted on December 20, 1996, and/or similar international 37 | agreements. 38 | 39 | (d) Exceptions and Limitations means fair use, fair dealing, and/or 40 | any other exception or limitation to Copyright and Similar Rights 41 | that applies to Your use of the Licensed Material. 42 | 43 | (e) Licensed Material means the artistic or literary work, database, 44 | or other material to which the Licensor applied this Public License. 45 | 46 | (f) Licensed Rights means the rights granted to You subject to the 47 | terms and conditions of this Public License, which are limited to 48 | all Copyright and Similar Rights that apply to Your use of the 49 | Licensed Material and that the Licensor has authority to license. 50 | 51 | (g) Licensor means the individual(s) or entity(ies) granting rights 52 | under this Public License. 53 | 54 | (h) NonCommercial means not primarily intended for or directed 55 | towards commercial advantage or monetary compensation. For purposes 56 | of this Public License, the exchange of the Licensed Material for 57 | other material subject to Copyright and Similar Rights by digital 58 | file-sharing or similar means is NonCommercial provided there is no 59 | payment of monetary compensation in connection with the exchange. 60 | 61 | (i) Share means to provide material to the public by any means or 62 | process that requires permission under the Licensed Rights, such as 63 | reproduction, public display, public performance, distribution, 64 | dissemination, communication, or importation, and to make material 65 | available to the public including in ways that members of the public 66 | may access the material from a place and at a time individually 67 | chosen by them. 68 | 69 | (j) Sui Generis Database Rights means rights other than copyright 70 | resulting from Directive 96/9/EC of the European Parliament and of 71 | the Council of 11 March 1996 on the legal protection of databases, 72 | as amended and/or succeeded, as well as other essentially equivalent 73 | rights anywhere in the world. 74 | 75 | (k) You means the individual or entity exercising the Licensed 76 | Rights under this Public License. Your has a corresponding meaning. 77 | 78 | Section 2 – Scope. 79 | 80 | (a) License grant. 81 | 82 | (1) Subject to the terms and conditions of this Public License, 83 | the Licensor hereby grants You a worldwide, royalty-free, 84 | non-sublicensable, non-exclusive, irrevocable license to exercise 85 | the Licensed Rights in the Licensed Material to: 86 | 87 | (A) reproduce and Share the Licensed Material, in whole or in 88 | part, for NonCommercial purposes only; and 89 | 90 | (B) produce and reproduce, but not Share, Adapted Material for 91 | NonCommercial purposes only. 92 | 93 | (2) Exceptions and Limitations. For the avoidance of doubt, where 94 | Exceptions and Limitations apply to Your use, this Public License 95 | does not apply, and You do not need to comply with its terms and 96 | conditions. 97 | 98 | (3) Term. The term of this Public License is specified in Section 99 | 6(a). 100 | 101 | (4) Media and formats; technical modifications allowed. 
The 102 | Licensor authorizes You to exercise the Licensed Rights in all 103 | media and formats whether now known or hereafter created, and to 104 | make technical modifications necessary to do so. The Licensor 105 | waives and/or agrees not to assert any right or authority to 106 | forbid You from making technical modifications necessary to 107 | exercise the Licensed Rights, including technical modifications 108 | necessary to circumvent Effective Technological Measures. For 109 | purposes of this Public License, simply making modifications 110 | authorized by this Section 2(a)(4) never produces Adapted 111 | Material. 112 | 113 | (5) Downstream recipients. 114 | 115 | (A) Offer from the Licensor – Licensed Material. Every recipient 116 | of the Licensed Material automatically receives an offer from 117 | the Licensor to exercise the Licensed Rights under the terms and 118 | conditions of this Public License. 119 | 120 | (B) No downstream restrictions. You may not offer or impose any 121 | additional or different terms or conditions on, or apply any 122 | Effective Technological Measures to, the Licensed Material if 123 | doing so restricts exercise of the Licensed Rights by any 124 | recipient of the Licensed Material. 125 | 126 | (6) No endorsement. Nothing in this Public License constitutes or 127 | may be construed as permission to assert or imply that You are, or 128 | that Your use of the Licensed Material is, connected with, or 129 | sponsored, endorsed, or granted official status by, the Licensor 130 | or others designated to receive attribution as provided in Section 131 | 3(a)(1)(A)(i). 132 | 133 | (b) Other rights. 134 | 135 | (1) Moral rights, such as the right of integrity, are not licensed 136 | under this Public License, nor are publicity, privacy, and/or 137 | other similar personality rights; however, to the extent possible, 138 | the Licensor waives and/or agrees not to assert any such rights 139 | held by the Licensor to the limited extent necessary to allow You 140 | to exercise the Licensed Rights, but not otherwise. 141 | 142 | (2) Patent and trademark rights are not licensed under this Public 143 | License. 144 | 145 | (3) To the extent possible, the Licensor waives any right to 146 | collect royalties from You for the exercise of the Licensed 147 | Rights, whether directly or through a collecting society under any 148 | voluntary or waivable statutory or compulsory licensing scheme. In 149 | all other cases the Licensor expressly reserves any right to 150 | collect such royalties, including when the Licensed Material is 151 | used other than for NonCommercial purposes. 152 | 153 | Section 3 – License Conditions. 154 | 155 | Your exercise of the Licensed Rights is expressly made subject to the 156 | following conditions. 157 | 158 | (a) Attribution. 
159 | 160 | (1) If You Share the Licensed Material, You must: 161 | 162 | (A) retain the following if it is supplied by the Licensor with 163 | the Licensed Material: 164 | 165 | (i) identification of the creator(s) of the Licensed Material 166 | and any others designated to receive attribution, in any 167 | reasonable manner requested by the Licensor (including by 168 | pseudonym if designated); 169 | 170 | (ii) a copyright notice; 171 | 172 | (iii) a notice that refers to this Public License; 173 | 174 | (iv) a notice that refers to the disclaimer of warranties; 175 | 176 | (v) a URI or hyperlink to the Licensed Material to the extent 177 | reasonably practicable; 178 | 179 | (B) indicate if You modified the Licensed Material and retain an 180 | indication of any previous modifications; and 181 | 182 | (C) indicate the Licensed Material is licensed under this Public 183 | License, and include the text of, or the URI or hyperlink to, 184 | this Public License. 185 | 186 | For the avoidance of doubt, You do not have permission under this 187 | Public License to Share Adapted Material. 188 | 189 | (2) You may satisfy the conditions in Section 3(a)(1) in any 190 | reasonable manner based on the medium, means, and context in which 191 | You Share the Licensed Material. For example, it may be reasonable 192 | to satisfy the conditions by providing a URI or hyperlink to a 193 | resource that includes the required information. 194 | 195 | (3) If requested by the Licensor, You must remove any of the 196 | information required by Section 3(a)(1)(A) to the extent 197 | reasonably practicable. 198 | 199 | Section 4 – Sui Generis Database Rights. 200 | 201 | Where the Licensed Rights include Sui Generis Database Rights that 202 | apply to Your use of the Licensed Material: 203 | 204 | (a) for the avoidance of doubt, Section 2(a)(1) grants You the right 205 | to extract, reuse, reproduce, and Share all or a substantial portion 206 | of the contents of the database for NonCommercial purposes only and 207 | provided You do not Share Adapted Material; 208 | 209 | (b) if You include all or a substantial portion of the database 210 | contents in a database in which You have Sui Generis Database 211 | Rights, then the database in which You have Sui Generis Database 212 | Rights (but not its individual contents) is Adapted Material; and 213 | 214 | (c) You must comply with the conditions in Section 3(a) if You Share 215 | all or a substantial portion of the contents of the database. 216 | 217 | For the avoidance of doubt, this Section 4 supplements and does not 218 | replace Your obligations under this Public License where the Licensed 219 | Rights include other Copyright and Similar Rights. 220 | 221 | Section 5 – Disclaimer of Warranties and Limitation of Liability. 222 | 223 | (a) Unless otherwise separately undertaken by the Licensor, to the 224 | extent possible, the Licensor offers the Licensed Material as-is and 225 | as-available, and makes no representations or warranties of any kind 226 | concerning the Licensed Material, whether express, implied, 227 | statutory, or other. This includes, without limitation, warranties 228 | of title, merchantability, fitness for a particular purpose, 229 | non-infringement, absence of latent or other defects, accuracy, or 230 | the presence or absence of errors, whether or not known or 231 | discoverable. Where disclaimers of warranties are not allowed in 232 | full or in part, this disclaimer may not apply to You. 
233 | 234 | (b) To the extent possible, in no event will the Licensor be liable 235 | to You on any legal theory (including, without limitation, 236 | negligence) or otherwise for any direct, special, indirect, 237 | incidental, consequential, punitive, exemplary, or other losses, 238 | costs, expenses, or damages arising out of this Public License or 239 | use of the Licensed Material, even if the Licensor has been advised 240 | of the possibility of such losses, costs, expenses, or 241 | damages. Where a limitation of liability is not allowed in full or 242 | in part, this limitation may not apply to You. 243 | 244 | (c) The disclaimer of warranties and limitation of liability 245 | provided above shall be interpreted in a manner that, to the extent 246 | possible, most closely approximates an absolute disclaimer and 247 | waiver of all liability. 248 | 249 | Section 6 – Term and Termination. 250 | 251 | (a) This Public License applies for the term of the Copyright and 252 | Similar Rights licensed here. However, if You fail to comply with 253 | this Public License, then Your rights under this Public License 254 | terminate automatically. 255 | 256 | (b) Where Your right to use the Licensed Material has terminated 257 | under Section 6(a), it reinstates: 258 | 259 | (1) automatically as of the date the violation is cured, provided 260 | it is cured within 30 days of Your discovery of the violation; or 261 | 262 | (2) upon express reinstatement by the Licensor. 263 | 264 | For the avoidance of doubt, this Section 6(b) does not affect 265 | any right the Licensor may have to seek remedies for Your violations 266 | of this Public License. 267 | 268 | (c) For the avoidance of doubt, the Licensor may also offer the 269 | Licensed Material under separate terms or conditions or stop 270 | distributing the Licensed Material at any time; however, doing so 271 | will not terminate this Public License. 272 | 273 | (d) Sections 1, 5, 6, 7, and 8 survive termination of this Public 274 | License. 275 | 276 | Section 7 – Other Terms and Conditions. 277 | 278 | (a) The Licensor shall not be bound by any additional or different 279 | terms or conditions communicated by You unless expressly agreed. 280 | 281 | (b) Any arrangements, understandings, or agreements regarding the 282 | Licensed Material not stated herein are separate from and 283 | independent of the terms and conditions of this Public License. 284 | 285 | Section 8 – Interpretation. 286 | 287 | (a) For the avoidance of doubt, this Public License does not, and 288 | shall not be interpreted to, reduce, limit, restrict, or impose 289 | conditions on any use of the Licensed Material that could lawfully 290 | be made without permission under this Public License. 291 | 292 | (b) To the extent possible, if any provision of this Public License 293 | is deemed unenforceable, it shall be automatically reformed to the 294 | minimum extent necessary to make it enforceable. If the provision 295 | cannot be reformed, it shall be severed from this Public License 296 | without affecting the enforceability of the remaining terms and 297 | conditions. 298 | 299 | (c) No term or condition of this Public License will be waived and 300 | no failure to comply consented to unless expressly agreed to by the 301 | Licensor. 
302 | 303 | (d) Nothing in this Public License constitutes or may be interpreted 304 | as a limitation upon, or waiver of, any privileges and immunities 305 | that apply to the Licensor or You, including from the legal 306 | processes of any jurisdiction or authority. 307 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for ONF documentation created with Sphinx 2 | 3 | # use bash for pushd/popd, and to fail quickly. virtualenv's activate 4 | # has undefined variables, so no -u 5 | SHELL = bash -e -o pipefail 6 | 7 | # You can set these variables from the command line. 8 | SPHINXOPTS ?= 9 | SPHINXBUILD ?= sphinx-build 10 | SOURCEDIR ?= . 11 | BUILDDIR ?= _build 12 | 13 | # Create the virtualenv with all the tools installed 14 | VIRTUALENV = venv-docs 15 | 16 | # Put it first so that "make" without argument is like "make help". 17 | help: $(VIRTUALENV) 18 | source ./$(VIRTUALENV)/bin/activate ;\ 19 | $(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 20 | 21 | .PHONY: help lint reload Makefile test 22 | 23 | # Create the virtualenv with all the tools installed 24 | $(VIRTUALENV): 25 | python3 -m venv $@ ;\ 26 | source ./$@/bin/activate ;\ 27 | pip install -r requirements.txt 28 | 29 | # lint and link verification. linkcheck is built into sphinx 30 | test: lint spelling 31 | 32 | # lint all .rst files 33 | lint: $(VIRTUALENV) 34 | source ./$`__ license. The 15 | community is invited to contribute corrections, improvements, updates, 16 | and new material under the same terms. While this license does not 17 | automatically grant the right to make derivative works, we are keen to 18 | discuss derivative works (such as translations) with interested 19 | parties. Please reach out to discuss@systemsapproach.org. 20 | 21 | If you make use of this work, the attribution should include the 22 | following information: 23 | 24 | | *Title: Edge Cloud Operations: A Systems Approach* 25 | | *Authors: Larry Peterson, Scott Baker, Andy Bavier, Zack Williams, Bruce Davie* 26 | | *Source:* https://github.com/SystemsApproach/ops 27 | | *License:* \ `CC BY-NC-ND 4.0 `__ 28 | 29 | Read the Book 30 | ------------- 31 | 32 | This book is part of the `Systems Approach Series 33 | `__, with an online version published 34 | at `https://ops.systemsapproach.org 35 | `__. 36 | 37 | To track progress and receive notices about new versions, you can follow 38 | the project on 39 | `Mastodon `__. To read a running 40 | commentary on how the Internet is evolving, and for updates on our writing projects, you can sign up for the 41 | `Systems Approach newsletter `__. 42 | 43 | Build the Book 44 | -------------- 45 | 46 | To build a web-viewable version, you first need to download the 47 | source: 48 | 49 | .. literalinclude:: code/build.sh 50 | 51 | The build process is stored in the Makefile and requires Python be 52 | installed. The Makefile will create a virtualenv (``venv-docs``) which 53 | installs the documentation generation toolset. You may also need to 54 | install the ``enchant`` C library using your system’s package manager 55 | for the spelling checker to function properly. 56 | 57 | To generate HTML in ``_build/html``, run ``make html``. 58 | 59 | To check the formatting of the book, run ``make lint``. 60 | 61 | To check spelling, run ``make spelling``. 
If there are additional 62 | words, names, or acronyms that are correctly spelled but not in the dictionary, 63 | please add them to the ``dict.txt`` file. 64 | 65 | To see the other available output formats, run ``make``. 66 | 67 | Contribute to the Book 68 | ---------------------- 69 | 70 | We hope that if you use this material, you are also willing to 71 | contribute back to it. If you are new to open source, you might check 72 | out this `How to Contribute to Open 73 | Source `__ guide. Among 74 | other things, you’ll learn about posting *Issues* that you’d like to see 75 | addressed, and issuing *Pull Requests* to merge your improvements back 76 | into GitHub. 77 | 78 | If you’d like to contribute and are looking for something that needs 79 | attention, see the `wiki `__ 80 | for the current TODO list. 81 | -------------------------------------------------------------------------------- /VERSION: -------------------------------------------------------------------------------- 1 | Version 1.1-dev -------------------------------------------------------------------------------- /_extra/robots.txt: -------------------------------------------------------------------------------- 1 | User-agent: AI2Bot 2 | User-agent: Ai2Bot-Dolma 3 | User-agent: aiHitBot 4 | User-agent: Amazonbot 5 | User-agent: anthropic-ai 6 | User-agent: Applebot 7 | User-agent: Applebot-Extended 8 | User-agent: Brightbot 1.0 9 | User-agent: Bytespider 10 | User-agent: CCBot 11 | User-agent: ChatGPT-User 12 | User-agent: Claude-Web 13 | User-agent: ClaudeBot 14 | User-agent: cohere-ai 15 | User-agent: cohere-training-data-crawler 16 | User-agent: Cotoyogi 17 | User-agent: Crawlspace 18 | User-agent: Diffbot 19 | User-agent: DuckAssistBot 20 | User-agent: FacebookBot 21 | User-agent: Factset_spyderbot 22 | User-agent: FirecrawlAgent 23 | User-agent: FriendlyCrawler 24 | User-agent: Google-Extended 25 | User-agent: GoogleOther 26 | User-agent: GoogleOther-Image 27 | User-agent: GoogleOther-Video 28 | User-agent: GPTBot 29 | User-agent: iaskspider/2.0 30 | User-agent: ICC-Crawler 31 | User-agent: ImagesiftBot 32 | User-agent: img2dataset 33 | User-agent: imgproxy 34 | User-agent: ISSCyberRiskCrawler 35 | User-agent: Kangaroo Bot 36 | User-agent: meta-externalagent 37 | User-agent: Meta-ExternalAgent 38 | User-agent: meta-externalfetcher 39 | User-agent: Meta-ExternalFetcher 40 | User-agent: NovaAct 41 | User-agent: OAI-SearchBot 42 | User-agent: omgili 43 | User-agent: omgilibot 44 | User-agent: Operator 45 | User-agent: PanguBot 46 | User-agent: Perplexity-User 47 | User-agent: PerplexityBot 48 | User-agent: PetalBot 49 | User-agent: Scrapy 50 | User-agent: SemrushBot-OCOB 51 | User-agent: SemrushBot-SWA 52 | User-agent: Sidetrade indexer bot 53 | User-agent: TikTokSpider 54 | User-agent: Timpibot 55 | User-agent: VelenPublicWebCrawler 56 | User-agent: Webzio-Extended 57 | User-agent: YouBot 58 | Disallow: / 59 | -------------------------------------------------------------------------------- /_static/SystemsApproachLogoURL.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/_static/SystemsApproachLogoURL.png -------------------------------------------------------------------------------- /_static/bridge.ico: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/_static/bridge.ico -------------------------------------------------------------------------------- /_static/cover.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/_static/cover.jpg -------------------------------------------------------------------------------- /_static/css/rtd_theme_mods.css: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019-present Open Networking Foundation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. */ 15 | 16 | /* Don't restrict content width on the RTD theme 17 | * from: https://stackoverflow.com/a/32898444 */ 18 | 19 | .wy-nav-content { 20 | max-width: none; 21 | } 22 | 23 | .wy-table-responsive table td, .wy-table-responsive table th { 24 | white-space: normal; 25 | } 26 | 27 | /* Colors for navigation */ 28 | 29 | .wy-side-nav-search, .wy-nav-top { 30 | background: #2F5597; 31 | } 32 | 33 | /* .wy-menu-vertical header,.wy-menu-vertical p.caption{color:#2F5597} */ 34 | 35 | .wy-menu-vertical header,.wy-menu-vertical p.caption{color:#6AB0DE} 36 | 37 | /* Headings */ 38 | h1, h2 { 39 | font-weight: bold; 40 | line-height: 1.25; 41 | color: #3279a8 42 | text-rendering: optimizeLegibility; 43 | } 44 | 45 | h3, h4, h5, h6 { 46 | margin-bottom: .5rem; 47 | font-style: italic; 48 | line-height: 1.25; 49 | color: #313131; 50 | text-rendering: optimizeLegibility; 51 | } 52 | 53 | h1 { 54 | margin-bottom: 2rem; 55 | font-size: 2rem; 56 | } 57 | 58 | h2 { 59 | margin-bottom: .5rem; 60 | margin-top: 1rem; 61 | font-size: 1.5rem; 62 | } 63 | 64 | h3 { 65 | margin-top: 1.5rem; 66 | font-size: 1.25rem; 67 | } 68 | 69 | .pop { 70 | color: #6AB0DE; 71 | font-style: italic; 72 | font-weight: bold; 73 | } 74 | aside.sidebar { 75 | margin: 0 0 0.5em 1em; 76 | border: 1px solid #ddb; 77 | padding: 7px 7px 0 7px; 78 | background-color: #ffe; 79 | width: 40%; 80 | float: right; 81 | } 82 | -------------------------------------------------------------------------------- /_static/fonts/Inconsolata-Bold.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/_static/fonts/Inconsolata-Bold.ttf -------------------------------------------------------------------------------- /_static/fonts/Inconsolata-Regular.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/_static/fonts/Inconsolata-Regular.ttf -------------------------------------------------------------------------------- /arch.rst: -------------------------------------------------------------------------------- 1 | Chapter 2: Architecture 2 | ======================== 
3 | 4 | This chapter identifies all the subsystems that go into building and 5 | operationalizing a cloud capable of running an assortment of 6 | cloud-native services. We use Aether to illustrate specific design 7 | choices, and so we start by describing why an enterprise might install 8 | a system like Aether in the first place. 9 | 10 | .. sidebar:: PaaS for Industry 4.0 11 | 12 | *Edge clouds like Aether are an important component of a trend 13 | called Industry 4.0: A combination of intelligent devices, 14 | robust wireless connectivity, and cloud-based AI/ML 15 | capabilities, all working together to enable software-based 16 | optimization and innovation.* 17 | 18 | *Connecting industry assets to the cloud has the potential to 19 | bring transformative benefits. This starts with collecting deep 20 | operational data on assets and infrastructure, from sensors, 21 | video feeds and telemetry from machinery. It also includes 22 | applying ML to this data to gain insights, identify patterns 23 | and predict outcomes (e.g., when a device is likely to fail), 24 | followed by automating industrial processes so as to minimize 25 | human intervention and enable remote operations (e.g., power 26 | optimization, idling quiescent machinery). In general, the goal 27 | is to create an IT foundation for continually improving 28 | industrial operations through software.* 29 | 30 | *As for why we refer to Aether as a PaaS for such use cases, 31 | the answer is somewhat subjective. Generally, a PaaS offers 32 | more than virtualized compute and storage (that is what IaaS 33 | does), and includes additional layers of "middleware" to enable 34 | application developers to deploy their applications without 35 | dealing with all the intricacies of managing the underlying 36 | infrastructure. In the case of Aether, the platform includes 37 | support for 5G connectivity, including an API that edge apps 38 | can use to customize that connectivity to better meet their 39 | objectives. This does not preclude also loading an ML-platform 40 | or an IoT-platform onto Aether, further enhancing the 41 | application support it provides.* 42 | 43 | Aether is a Kubernetes-based edge cloud, augmented with a 5G-based 44 | connectivity service. Aether is targeted at enterprises that want to 45 | take advantage of 5G connectivity in support of mission-critical edge 46 | applications requiring predictable, low-latency connectivity. In 47 | short, “Kubernetes-based” means Aether is able to host container-based 48 | services, and “5G-based connectivity” means Aether is able to connect 49 | those services to mobile devices throughout the enterprise's physical 50 | plant. This combination of features to support deployment of edge 51 | applications, coupled with Aether being offered as a managed service, 52 | means Aether can fairly be characterized as a Platform-as-a-Service 53 | (PaaS). 54 | 55 | Aether supports this combination by implementing both the RAN and the 56 | user plane of the Mobile Core on-prem, as cloud-native workloads 57 | co-located on the Aether cluster. This is often referred to as *local 58 | breakout* because it enables direct communication between mobile 59 | devices and edge applications without data traffic leaving the 60 | enterprise. This scenario is depicted in :numref:`Figure %s 61 | `, which does not name the edge applications, but 62 | substituting Internet-of-Things (IoT) would be an illustrative 63 | example. 64 | 65 | .. _fig-hybrid: 66 | .. 
figure:: figures/Slide2.png 67 | :width: 700px 68 | :align: center 69 | 70 | Overview of Aether as a hybrid cloud, with edge apps and the 5G 71 | data plane (called *local breakout*) running on-prem and various 72 | management and control-related workloads running in a central 73 | cloud. 74 | 75 | The approach includes both edge (on-prem) and centralized (off-prem) 76 | components. This is true for edge apps, which often have a centralized 77 | counterpart running in a commodity cloud. It is also true for the 5G 78 | Mobile Core, where the on-prem User Plane (UP) is paired with a 79 | centralized Control Plane (CP). The central cloud shown in this figure 80 | might be private (i.e., operated by the enterprise), public (i.e., 81 | operated by a commercial cloud provider), or some combination of the 82 | two (i.e., not all centralized elements need to run in the same 83 | cloud). Also shown in :numref:`Figure %s ` is a 84 | centralized *Control and Management Platform*. This represents all the 85 | functionality needed to offer Aether as a managed service, with system 86 | administrators using a portal exported by this platform to operate the 87 | underlying infrastructure and services within their enterprise. The 88 | rest of this book is about everything that goes into implementing that 89 | *Control and Management Platform*. 90 | 91 | 2.1 Edge Cloud 92 | -------------- 93 | 94 | The edge cloud, which in Aether is called ACE (Aether Connected Edge), 95 | is a Kubernetes-based cluster similar to the one shown in 96 | :numref:`Figure %s ` of Chapter 1. It is a platform that 97 | consists of one or more server racks interconnected by a leaf-spine 98 | switching fabric, with an SDN control plane (denoted SD-Fabric) 99 | managing the fabric. 100 | 101 | .. _fig-ace: 102 | .. figure:: figures/Slide3.png 103 | :width: 400px 104 | :align: center 105 | 106 | Aether Connected Edge (ACE) = The cloud platform (Kubernetes and 107 | SD-Fabric) plus the 5G connectivity service (RAN and User Plane of 108 | Mobile Core). Dotted lines (e.g., between SD-RAN and the individual 109 | base stations, and between the Network OS and the individual 110 | switches) represent control relationships (e.g., SD-RAN controls 111 | the small cells and SD-Fabric controls the switches). 112 | 113 | As shown in :numref:`Figure %s `, ACE hosts two additional 114 | microservice-based subsystems on top of this platform; they 115 | collectively implement *5G-Connectivity-as-a-Service*. The first 116 | subsystem, SD-RAN, is an SDN-based implementation of the 5G Radio 117 | Access Network (RAN). It controls the small cell base stations 118 | deployed throughout the enterprise. The second subsystem, SD-Core, is 119 | an SDN-based implementation of the User Plane half of the Mobile 120 | Core. It is responsible for forwarding traffic between the RAN and the 121 | Internet. The SD-Core Control Plane (CP) runs off-site, and is not 122 | shown in :numref:`Figure %s `. Both subsystems (as well as 123 | the SD-Fabric), are deployed as a set of microservices, but details 124 | about the functionality implemented by these containers is otherwise 125 | not critical to this discussion. For our purposes, they are 126 | representative of any cloud native workload. (The interested reader is 127 | referred to our companion 5G and SDN books for more information about 128 | the internal working of SD-RAN, SD-Core, and SD-Fabric.) 129 | 130 | .. _reading_5g: 131 | .. admonition:: Further Reading 132 | 133 | L. Peterson and O. Sunay. 
`5G Mobile Networks: A Systems Approach 134 | `__. March 2020. 135 | 136 | L. Peterson, *et al.* `Software-Defined Networks: A Systems Approach 137 | `__. November 2021. 138 | 139 | Once ACE is running in this configuration, it is ready to host a 140 | collection of edge applications (not shown in :numref:`Figure %s 141 | `), and as with any Kubernetes-based cluster, a Helm chart 142 | would be the preferred way to deploy such applications. What’s unique 143 | to ACE is the ability to connect such applications to mobile devices 144 | throughout the enterprise using the 5G Connectivity Service 145 | implemented by SD-RAN and SD-Core. This service is offered as a 146 | managed service, with enterprise system administrators able to use a 147 | programmatic API (and associated GUI portal) to control that service; 148 | that is, authorize devices, restrict access, set Quality-of-Service 149 | parameters for different devices and applications, and so on. How to 150 | provide such a runtime control interface is the topic of Chapter 5. 151 | 152 | 2.2 Hybrid Cloud 153 | ----------------- 154 | 155 | While it is possible to instantiate a single ACE cluster in just one 156 | site, Aether is designed to support multiple ACE deployments, all of 157 | which are managed from the central cloud. Such a hybrid cloud scenario 158 | is depicted in :numref:`Figure %s `, which shows two 159 | subsystems running in the central cloud: (1) one or more instances of 160 | the Mobile Core Control Plane (CP), and (2) the Aether Management 161 | Platform (AMP). 162 | 163 | Each SD-Core CP controls one or more SD-Core UPs, as specified by 164 | 3GPP, the standards organization responsible for 5G. Exactly how CP 165 | instances (running centrally) are paired with UP instances (running at 166 | the edges) is a runtime decision, and depends on the degree of 167 | isolation the enterprise sites require. AMP is responsible for 168 | managing all the centralized and edge subsystems (as introduced in the 169 | next section). 170 | 171 | .. _fig-aether: 172 | .. figure:: figures/Slide4.png 173 | :width: 600px 174 | :align: center 175 | 176 | Aether runs in a hybrid cloud configuration, with Control Plane of 177 | Mobile Core and the Aether Management Platform (AMP) running in the 178 | Central Cloud. 179 | 180 | There is an important aspect of this hybrid cloud that is not obvious 181 | from :numref:`Figure %s `, which is that the “hybrid 182 | cloud” we keep referring to is best described as a set of Kubernetes 183 | clusters, rather than a set of physical clusters (similar to the one 184 | we started with in :numref:`Figure %s ` of Chapter 1). 185 | This is because, while each ACE site usually corresponds to a physical 186 | cluster built out of bare-metal components, each of the SD-Core CP 187 | subsystems shown in :numref:`Figure %s ` is actually 188 | deployed in a logical Kubernetes cluster on a commodity cloud. The 189 | same is true for AMP. Aether’s centralized components are able to run 190 | in Google Cloud Platform, Microsoft Azure, and Amazon’s AWS. They can also 191 | run as an emulated cluster implemented by a system like 192 | KIND—Kubernetes in Docker—making it possible for developers to run 193 | these components on their laptops. 194 | 195 | To be clear, Kubernetes adopts generic terminology, such as “cluster” 196 | and “service”, and gives it a very specific meaning. In 197 | Kubernetes-speak, a *Cluster* is a logical domain in which Kubernetes 198 | manages a set of containers. 
This “Kubernetes cluster” may have a 199 | one-to-one relationship with an underlying physical cluster, but it is 200 | also possible that a Kubernetes cluster is instantiated inside a 201 | datacenter, as one of potentially thousands of such logical 202 | clusters. And as we'll see in a later chapter, even an ACE edge site 203 | sometimes hosts more than one Kubernetes cluster, for example, one 204 | running production services and one used for trial deployments of new 205 | services. 206 | 207 | 2.3 Stakeholders 208 | ---------------- 209 | 210 | With the understanding that our target environment is a collection of 211 | Kubernetes clusters—some running on bare-metal hardware at edge sites 212 | and some running in central datacenters—there is an orthogonal issue 213 | of how decision-making responsibility for those clusters is shared 214 | among multiple stakeholders. Identifying the relevant stakeholders is 215 | an important prerequisite for establishing a cloud service, and while 216 | the example we use may not be suitable for all situations, it does 217 | illustrate the design implications. 218 | 219 | For Aether, we care about two primary stakeholders: (1) the *cloud 220 | operators* who manage the hybrid cloud as a whole, and (2) the 221 | *enterprise users* who decide on a per-site basis how to take 222 | advantage of the local cloud resources (e.g., what edge applications 223 | to run and how to slice connectivity resources among those apps). We 224 | sometimes call the latter "enterprise admins" to distinguish them from 225 | "end-users" who might want to manage their own personal devices. 226 | 227 | The architecture is multi-tenant in the sense that it authenticates 228 | and isolates these stakeholders, allowing each to access only those 229 | objects they are responsible for. This makes the approach agnostic as 230 | to whether all the edge sites belong to a single organization (with 231 | that organization also responsible for operating the cloud), or 232 | alternatively, there being a separate organization that offers a 233 | managed service to a set of distinct enterprises (each of which spans 234 | one or more sites). The architecture can also accommodate end-users, 235 | and provide them with a "self-service" portal, but we do not elaborate 236 | on that possibility. 237 | 238 | There is a potential third stakeholder of note—third-party service 239 | providers—which points to the larger issue of how we deploy and manage 240 | additional edge applications. To keep the discussion tangible—but 241 | remaining in the open source arena—we use OpenVINO as an illustrative 242 | example. OpenVINO is a framework for deploying AI inference models. 243 | It is interesting in the context of Aether because one of its use 244 | cases is processing video streams, for example to detect and count 245 | people who enter the field of view of a collection of 5G-connected 246 | cameras. 247 | 248 | .. _reading_openvino: 249 | .. admonition:: Further Reading 250 | 251 | `OpenVINO Toolkit `__. 252 | 253 | On the one hand, OpenVINO is just like the 5G-related components we're 254 | already incorporating into our hybrid cloud: it is deployed as a 255 | Kubernetes-based set of microservices. On the other hand, we have to 256 | ask who is responsible for managing it, which is to say “who 257 | operationalizes OpenVINO?” 258 | 259 | One answer is that the operators who already manage the rest of the 260 | hybrid cloud also manage the collection of edge applications added to 261 | cloud. 
Enterprise admins might activate and control those apps on a 262 | site-by-site basis, but it is the operations team already responsible 263 | for provisioning, deploying, and managing those edge clouds that also 264 | does the same for OpenVINO and any other applications that run on that 265 | cloud. Generalizing from one edge service (5G connectivity) to 266 | arbitrarily many edge services has implications for control and 267 | management (which we’ll discuss throughout the book), but 268 | fundamentally nothing changes in the course we've already set out for 269 | ourselves. 270 | 271 | Having the cloud operator *curate and manage* a set of edge services 272 | is the assumption Aether makes (and we assume throughout this book), 273 | but for completeness, we take note of two other possibilities. One is 274 | that we extend our hybrid architecture to support independent 275 | third-party service providers. Each new edge service acquires its own 276 | isolated Kubernetes cluster from the edge cloud, and then the 277 | 3rd-party provider takes over all responsibility for managing the 278 | service running in that cluster. From the perspective of the cloud 279 | operator, though, the task just became significantly more difficult 280 | because the architecture would need to support Kubernetes as a managed 281 | service, which is sometimes called *Containers-as-a-Service (CaaS)*.\ [#]_ 282 | Creating isolated Kubernetes clusters on-demand is a step further than 283 | we take things in this book, in part because there is a second 284 | possible answer that seems more likely to happen. 285 | 286 | .. [#] This is not strictly an either-or-situation. It is possible to 287 | curate an edge service, provision cluster resources for it, but 288 | then delegate operational responsibility to a 3rd-party service 289 | provider. 290 | 291 | This second approach is that a multi-cloud emerges *within* 292 | enterprises. Today, most people equate multi-cloud with services 293 | running across multiple hyperscalers, but with edge clouds becoming 294 | more common, it seems likely that enterprises will invite multiple edge 295 | clouds onto their local premises, some hyperscaler-provided and some 296 | not, each hosting a different subset of edge services. For example, 297 | one edge cloud might host a 5G connectivity service and another might 298 | host an AI platform like OpenVINO. The question this raises is whether 299 | the cloud management technologies described in this book still apply 300 | in that setting. The answer is yes: the fundamental management 301 | challenges remain the same. The main difference is knowing when to 302 | directly control a Kubernetes cluster (as we do in this book) and when 303 | to do so indirectly through the manager for that cluster. There are 304 | also new problems that are unique to multi-clouds, such as inter-cloud 305 | service discovery, but they are beyond the scope of this book. 306 | 307 | 2.4 Control and Management 308 | -------------------------- 309 | 310 | We are now ready to describe the architecture of the Aether Management 311 | Platform (AMP), which as shown in :numref:`Figure %s `, 312 | manages both the distributed set of ACE clusters and the other control 313 | clusters running in the central cloud. And illustrating the recursive 314 | nature of the management challenge, AMP is also responsible for 315 | managing AMP! 
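
One way to make this concrete is to picture the inventory such a
platform has to keep of every cluster under its control, including
the cluster it runs in itself. The following YAML is a purely
illustrative sketch (the field names, cluster names, and workload
labels are hypothetical, not Aether's actual schema), but it conveys
the idea that edge and central clusters are tracked and managed
uniformly::

   # Hypothetical fleet inventory; all names and fields are illustrative.
   clusters:
     - name: ace-site-1              # bare-metal edge cluster (ACE)
       type: edge
       workloads: [sd-fabric, sd-ran, sd-core-up]
     - name: central-cloud-1         # logical cluster in a commodity cloud
       type: central
       workloads: [sd-core-cp, amp]  # note: AMP manages AMP, too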
316 | 317 | AMP includes one or more portals targeted at different stakeholders, 318 | with :numref:`Figure %s ` showing the two examples we focus 319 | on in this book: a User Portal intended for enterprise admins who 320 | need to manage services delivered to a local site, and an Operations 321 | Portal intended for the ops team responsible for keeping Aether 322 | up to date and running smoothly. Again, other stakeholders (classes of 323 | users) are possible, but this distinction does represent a natural 324 | division between those who *use* cloud services and those who 325 | *operate* cloud services. 326 | 327 | .. _fig-amp: 328 | .. figure:: figures/Slide5.png 329 | :width: 600px 330 | :align: center 331 | 332 | The four subsystems that comprise AMP: Resource Provisioning, 333 | Lifecycle Management, Runtime Control, and Monitoring & Telemetry. 334 | 335 | We do not focus on these portals, which provide a graphical interface 336 | to a subset of AMP functionality, but we instead describe the 337 | aggregate functionality supported by AMP, which is organized around 338 | four subsystems: 339 | 340 | * Resource Provisioning: Responsible for initializing and configuring 341 | resources (e.g., servers, switches) that add, replace, or upgrade 342 | capacity for Aether. 343 | 344 | * Lifecycle Management: Responsible for continuous integration and 345 | deployment of software functionality available on Aether. 346 | 347 | * Runtime Control: Responsible for the ongoing configuration and 348 | control of the services (e.g., connectivity) provided by Aether. 349 | 350 | * Monitoring & Telemetry: Responsible for collecting, archiving, 351 | evaluating, and analyzing telemetry data generated by Aether 352 | components. 353 | 354 | Internally, each of these subsystems is implemented as a highly 355 | available cloud service, running as a collection of microservices. The 356 | design is cloud-agnostic, so AMP can be deployed in a public cloud 357 | (e.g., Google Cloud, AWS, Azure), an operator-owned Telco cloud, (e.g, 358 | AT&T’s AIC), or an enterprise-owned private cloud. For the pilot 359 | deployment of Aether, AMP runs in the Google Cloud. 360 | 361 | The rest of this section introduces these four subsystems, with the 362 | chapters that follow filling in more detail about each. 363 | 364 | 365 | 2.4.1 Resource Provisioning 366 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~ 367 | 368 | Resource Provisioning configures and bootstraps resources (both 369 | physical and virtual), bringing them up to a state so Lifecycle 370 | Management can take over and manage the software running on those 371 | resources. It roughly corresponds to Day 0 operations, and includes 372 | both the hands-on aspect of installing and physically connecting 373 | hardware, and the inventory-tracking required to manage physical 374 | assets. 375 | 376 | .. _fig-provision: 377 | .. figure:: figures/Slide6.png 378 | :width: 500px 379 | :align: center 380 | 381 | High-level overview of Resource Provisioning. 382 | 383 | :numref:`Figure %s ` gives a high-level overview. As a 384 | consequence of the operations team physically connecting resources to 385 | the cloud and recording attributes for those resources in an Inventory 386 | Repo, a Zero-Touch Provisioning system (a) generates a set of 387 | configuration artifacts that are stored in a Config Repo and used 388 | during Lifecycle Management, and (b) initializes the newly deployed 389 | resources so they are in a state that Lifecycle Management is able to 390 | control. 
The idea of storing configuration directives in a Repo, like 391 | any other code module, is a practice known as *Configuration-as-Code*, 392 | and we will see it applied in different ways throughout this book. 393 | 394 | Recall from Chapter 1 that we called out the "Aether platform" as 395 | distinct from the cloud-native workloads that are hosted on the 396 | platform. This is relevant here because Resource Provisioning has to 397 | get this platform up and running before Lifecycle Management can do 398 | its job. But in another example of circular dependencies, Lifecycle 399 | Management also plays a role in keeping the underlying platform 400 | up to date. 401 | 402 | Clearly, the “Install & Inventory” step requires human involvement, 403 | and some amount of hands-on resource-prep is necessary, but the goal 404 | is to minimize the operator configuration steps (and associated 405 | expertise) and maximize the automation carried out by the Zero-Touch 406 | Provisioning system. Also realize that :numref:`Figure %s 407 | ` is biased towards provisioning a physical cluster, 408 | such as the edge sites in Aether. For a hybrid cloud that also 409 | includes one or more virtual clusters running in central datacenters, 410 | it is necessary to provision those virtual resources as well. Chapter 411 | 3 describes provisioning from this broader perspective, considering 412 | both physical and virtual resources. 413 | 414 | 2.4.2 Lifecycle Management 415 | ~~~~~~~~~~~~~~~~~~~~~~~~~~ 416 | 417 | Lifecycle Management is the process of integrating debugged, extended, 418 | and refactored components (often microservices) into a set of 419 | artifacts (e.g., Docker containers and Helm charts), and subsequently 420 | deploying those artifacts to the operational cloud. It includes a 421 | comprehensive testing regime, and typically, a procedure by which 422 | developers inspect and comment on each others’ code. 423 | 424 | .. _fig-lifecycle: 425 | .. figure:: figures/Slide7.png 426 | :width: 600px 427 | :align: center 428 | 429 | High-level overview of Lifecycle Management. 430 | 431 | :numref:`Figure %s ` gives a high-level overview, where 432 | it is common to split the integration and deployment phases, the 433 | latter of which combines the integration artifacts from the first 434 | phase with the configuration artifacts generated by Resource 435 | Provisioning described in the previous subsection. The figure does not 436 | show any human intervention (after development), which implies any 437 | patches checked into the code repo trigger integration, and any new 438 | integration artifacts trigger deployment. This is commonly referred to 439 | as Continuous Integration / Continuous Deployment (CI/CD), although in 440 | practice, operator discretion and other factors are also taken into 441 | account before deployment actually happens. 442 | 443 | One of the key responsibilities of Lifecycle Management is version 444 | control, which includes evaluating dependencies, but also the 445 | possibility that it will sometimes be necessary to both roll out new 446 | versions of software and rollback to old versions, as well as operate 447 | with multiple versions deployed simultaneously. Managing all the 448 | configuration state needed to successfully deploy the right version of 449 | each component in the system is the central challenge, which we 450 | address in Chapter 4. 
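
To give a feel for what that configuration state looks like, the
following sketch shows how deployed versions might be pinned
declaratively in a Config Repo. The component names, chart names, and
version numbers are hypothetical (Aether's actual specifications are
the subject of Chapter 4), but the pattern is representative: rolling
back amounts to reverting a commit and letting the deployment
pipeline re-apply the earlier state::

   # Hypothetical per-site version pins checked into the Config Repo.
   site: ace-site-1
   components:
     sd-core-up:
       chart: sd-core              # Helm chart name (illustrative)
       version: 1.4.2              # version currently deployed at this site
     sd-ran:
       chart: sd-ran
       version: 1.3.0
     monitoring:
       chart: monitoring-stack
       version: 0.9.1              # older version retained during a staged rollout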
451 | 452 | 2.4.3 Runtime Control 453 | ~~~~~~~~~~~~~~~~~~~~~ 454 | 455 | Once deployed and running, Runtime Control provides a programmatic API 456 | that can be used by various stakeholders to manage whatever abstract 457 | service(s) the system offers (e.g., 5G connectivity in the case of 458 | Aether). As shown in :numref:`Figure %s `, Runtime 459 | Control partially addresses the “management silo” issue raised in 460 | Chapter 1, so users do not need to know that connectivity potentially 461 | spans four different components, or how to control/configure each of 462 | them individually. (Or, as in the case of the Mobile Core, that 463 | SD-Core is distributed across two clouds, with the CP sub-part 464 | responsible for controlling the UP sub-part.) In the case of the 465 | connectivity service, for example, users only care about being able to 466 | authorize devices and set QoS parameters on an end-to-end basis. 467 | 468 | .. _fig-control: 469 | .. figure:: figures/Slide8.png 470 | :width: 400px 471 | :align: center 472 | 473 | Example use case that requires ongoing runtime control. 474 | 475 | Note that :numref:`Figure %s ` focuses on 476 | Connectivity-as-a-Service, but the same idea applies to all services 477 | the cloud offers to end users. Thus, we can generalize the figure so 478 | Runtime Control mediates access to any of the underlying microservices 479 | (or collections of microservices) the cloud designer wishes to make 480 | publicly accessible, including the rest of AMP! In effect, Runtime 481 | Control implements an abstraction layer, codified with a programmatic 482 | API. 483 | 484 | Given this mediation role, Runtime Control provides mechanisms to 485 | model (represent) the abstract services to be offered to users; store 486 | any configuration and control state associated with those models; 487 | apply that state to the underlying components, ensuring they remain in 488 | sync with the operator’s intentions; and authorize the set of API 489 | calls that users try to invoke on each service. These details are 490 | spelled out in Chapter 5. 491 | 492 | 493 | 2.4.4 Monitoring and Telemetry 494 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 495 | 496 | In addition to controlling service functionality, a running system has 497 | to be continuously monitored so that operators can diagnose and 498 | respond to failures, tune performance, do root cause analysis, perform 499 | security audits, and understand when it is necessary to provision 500 | additional capacity. This requires mechanisms to observe system 501 | behavior, collect and archive the resulting data, analyze the data and 502 | trigger various actions in response, and visualize the data in human 503 | consumable dashboards (similar to the example shown in :numref:`Figure 504 | %s `). 505 | 506 | .. _fig-monitor: 507 | .. figure:: figures/Slide18.png 508 | :width: 500px 509 | :align: center 510 | 511 | Example Aether dashboard, showing the health of one of the 512 | subsystems (SD-Core). 513 | 514 | In broad terms, it is common to think of this aspect of cloud 515 | management as having three parts: a monitoring component that collects 516 | quantitative metrics (e.g., load averages, transmission rates, 517 | ops per second); a logging component that collects diagnostic 518 | messages (i.e., text strings explaining various event); and a tracing 519 | component that can reconstruct workflows through a set of 520 | microservices. 
All include a timestamp, so it is possible to link 521 | quantitative analysis with qualitative explanations in support of 522 | diagnostics and analytics. 523 | 524 | 2.4.5 Summary 525 | ~~~~~~~~~~~~~ 526 | 527 | This overview of the management architecture could lead one to 528 | conclude that these four subsystems were architected, in a rigorous, 529 | top-down fashion, to be completely independent. But that is not the 530 | case. It is more accurate to say that the system evolved bottom up, 531 | solving the next immediate problem one at a time, all the while 532 | creating a large ecosystem of open source components that can be used 533 | in different combinations. What this book presents is a retrospective 534 | description of the end result, organized into four subsystems to help 535 | make sense of it all. 536 | 537 | There are, in practice, many opportunities for interactions among the 538 | four components, and in some cases, there are overlapping concerns 539 | that lead to considerable debate. This is what makes operationalizing 540 | a cloud such a thorny problem. For example, it's difficult to draw a crisp 541 | line between where resource provisioning ends and lifecycle management 542 | begins. One could view provisioning as "Step 0" of lifecycle 543 | management. As another example, the runtime control and monitoring 544 | subsystems are often combined in a single user interface, giving 545 | operators a way to both read (monitor) and write (control) various 546 | parameters of a running system. Connecting those two subsystems is how 547 | we build closed loop control. 548 | 549 | These two "simplifications" allow us to reduce the architectural 550 | overview of the management platform to the two-dimensional 551 | representation shown in :numref:`Figure %s `. In one 552 | dimension, layered on top of the hybrid cloud being managed, is the 553 | Runtime Control system (including Monitoring and Telemetry to close 554 | the control loop). Users and Operators read and write parameters of 555 | the running system via a well-defined REST API. In the other 556 | dimension, running beside the hybrid cloud, is the Lifecycle 557 | Management system (including Resource Provisioning as Step 0). 558 | Operators and Developers specify changes to the system by checking 559 | code (including configuration specs) into a repo, and then 560 | periodically triggering an upgrade of the running system. 561 | 562 | .. _fig-2D: 563 | .. figure:: figures/Slide25.png 564 | :width: 500px 565 | :align: center 566 | 567 | Simplified representation of the management platform. 568 | 569 | This simplified perspective draws attention to an ambiguity, which is 570 | the distinction between "changes to the parameters of a running 571 | system" versus "upgrading the system that is running." Generally, 572 | Lifecycle Management takes responsibility for *configuring* each 573 | component (including what version of each component is deployed), 574 | while runtime control takes responsibility for *controlling* each 575 | component. But where you draw the line between configuration and 576 | control is somewhat arbitrary. Do configuration changes only happen 577 | when you first boot a component, or can you change the configuration 578 | of a running system, and if you do, how does that differ from changing 579 | a control parameter? And as suggested by the dotted arrow in 580 | :numref:`Figure %s `, is there value in having Runtime Control 581 | instigate changes via Lifecycle Management? 
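One way to see how blurry this line is: a single component often
exposes both kinds of state side by side. The sketch below is
hypothetical (the parameter names are invented for illustration), but
it contrasts a value that Lifecycle Management lays down when a
component is deployed with a value that operators adjust through
Runtime Control while that component is serving traffic.

.. code-block:: yaml

   # Hypothetical settings for a single UPF instance.
   configuration:          # applied by Lifecycle Management at deploy time
     image: "upf:1.4.2"
     n3-interface-mtu: 9000     # changing this implies redeploying the pod
   control:                # written through the Runtime Control API at runtime
     slice: enterprise-a
     uplink-mbr-mbps: 100       # adjusted whenever the operator changes QoS
     downlink-mbr-mbps: 200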
The difference is usually 582 | related to frequency of change (which is in turn related to how 583 | disruptive to existing traffic/workload the change is), but ultimately 584 | it doesn't matter what you call it, as long as the mechanisms you use 585 | meet all of your requirements. 586 | 587 | Of course, an operational system doesn't tolerate such ambiguities 588 | very well. Each aspect of management has to be supported in a 589 | well-defined, efficient and repeatable way. That's why we include a 590 | description of a concrete realization of each of the four subsystems, 591 | reflecting one particular set of design choices. We call out the 592 | opportunities to make different engineering decisions, along with the 593 | design rationale behind our choices, as we add more details in the 594 | chapters that follow. 595 | 596 | 2.5 DevOps 597 | ---------- 598 | 599 | The preceding discussion focuses on the subsystems that make up the 600 | Control and Management Platform, but such a platform is used by 601 | people. This implies the need for a set of operational processes and 602 | procedures, which in a cloud setting, are now commonly organized 603 | around the DevOps model. The following gives a high-level summary, 604 | with a more extensive discussion of ops-related procedures presented 605 | throughout the book. 606 | 607 | DevOps has become an overused term, generally taken to mean that the 608 | line between the engineers who develop cloud functionality and the 609 | operators who deploy and manage cloud functionality is blurred, with 610 | the same team responsible for both. But that definition is too 611 | imprecise to be helpful. There are really three aspects of DevOps that 612 | are important to understand. 613 | 614 | First, when it comes to a set of services (or user-visible features), 615 | it is true that the developers play a role in deploying and operating 616 | those services. Enabling them to do that is exactly the value of the 617 | Management Platform. Consider the team responsible for SD-RAN in 618 | Aether, as an example. That team not only implements new SD-RAN 619 | features, but once their patch sets are checked into the code 620 | repository, those changes are integrated and deployed by the automated 621 | toolchain introduced in the previous section. This means the SD-RAN 622 | team is also responsible for: 623 | 624 | 1. Adding test cases to the CI half of Lifecycle Management, and 625 | writing any configuration specifications needed by the CD half of 626 | Lifecycle Management. 627 | 628 | 2. Instrumenting their code so it reports into the Monitoring and 629 | Telemetry framework, giving them the dashboards and alarms they 630 | need to troubleshoot any problems that arise. 631 | 632 | 3. Augmenting the data model of Runtime Control, so their component’s 633 | internal interfaces are plumbed through to the cloud’s externally 634 | visible Northbound Interface. 635 | 636 | Once deployed and operational, the SD-RAN team is also responsible for 637 | diagnosing any problems that cannot be resolved by a dedicated “on 638 | call” support staff.\ [#]_ The SD-RAN team is motivated to take 639 | advantage of the platform’s automated mechanisms (rather than exploit 640 | short-term workarounds), and to document their component’s behavior 641 | (especially how to resolve known problems), so they do not get support 642 | calls in the middle of the night. 643 | 644 | .. 
[#] Whether traditional or DevOps-based, there is typically a 645 | front-line support team, which is often said to provide Tier-1 646 | support. They interact directly with customers and are the 647 | first to respond to alarms, resolving the issue according to a 648 | well-scripted playbook. If Tier-1 support is not able to 649 | resolve an issue, it is elevated to Tier-2 and eventually 650 | Tier-3, the latter of which is the developers who best 651 | understand implementation details. 652 | 653 | .. sidebar:: Experience at Google 654 | 655 | *Our brief sketch of DevOps is based on how the approach is 656 | practiced at Google, and in this context, it is a great 657 | example of how good things come from efforts to minimize 658 | toil. As Google gained experience building and running its 659 | cloud, the incremental improvements to its cloud management 660 | system were assimilated into a system known as Borg.* 661 | 662 | *Kubernetes, the open source project widely used across the 663 | industry today, was spun out of Borg. The functionality 664 | embodied by Kubernetes evolved over time to deal with the 665 | operational challenges of deploying, upgrading, and monitoring 666 | a set of containers, serving as a great example of how a 667 | "rising tide lifts all boats." Given enough time, it may be 668 | the case that the next layer of cloud management machinery, 669 | roughly corresponding to the topics covered in this book, will 670 | also be taken as a given. The challenge, as we will see, is 671 | the multi-dimensional scope of the problem.* 672 | 673 | Second, all of the activity outlined in the previous paragraph is 674 | possible only because of the rich set of capabilities built into the 675 | Control and Management Platform that is the subject of this book.\ 676 | [#]_ Someone had to build that platform, which includes a testing 677 | framework that individual tests can be plugged into; an automated 678 | deployment framework that is able to roll upgrades out to a scalable 679 | number of servers and sites without manual intervention; a monitoring 680 | and telemetry framework that components can report into; a runtime 681 | control environment that can translate high-level directives into 682 | low-level operations on backend components; and so on. While each of 683 | these frameworks was once created by a team tasked with keeping some 684 | other service running smoothly, they have taken on a life of their 685 | own. The Control and Management Platform now has its own DevOps 686 | team(s), who, in addition to continually improving the platform, also 687 | field operational events, and when necessary, interact with other 688 | teams (e.g., the SD-RAN team in Aether) to resolve issues that come 689 | up. They are sometimes called Site Reliability Engineers (SREs), and 690 | in addition to being responsible for the Control and Management 691 | Platform, they enforce operational discipline—the third aspect of 692 | DevOps discussed next—on everyone else. 693 | 694 | .. [#] This is why we refer to the management system as a "platform", 695 | with AMP as an illustrative example. It serves as a common framework 696 | that developers of all the other cloud components can plug into and 697 | leverage. This is how you ultimately address the "management silo" 698 | problem. 699 | 700 | Finally, when operating with discipline and rigor, all of these teams 701 | strictly adhere to two quantitative rules. The first balances *feature 702 | velocity* with *system reliability*.
Each component is given an *error 703 | budget* (percentage of time it can be down), and new features cannot 704 | be rolled out unless the corresponding component has been operating 705 | within this bound. This test is a “gate” on the CI/CD pipeline. The 706 | second rule balances how much time is spent on *operational toil* 707 | (time spent by a human diagnosing or fixing problems) with time spent 708 | engineering new capabilities into the Control and Management Platform 709 | to reduce future toil. If too much time is spent toiling and too 710 | little time is spent making the Control and Management Platform 711 | better, then it is taken as a sign that additional engineering 712 | resources are needed. 713 | 714 | .. _reading_sre: 715 | .. admonition:: Further Reading 716 | 717 | `Site Reliability Engineering: How Google Runs Production Systems 718 | `__, 719 | 2016. 720 | -------------------------------------------------------------------------------- /authors.rst: -------------------------------------------------------------------------------- 1 | About The Authors 2 | ================== 3 | 4 | **Larry Peterson** is the Robert E. Kahn Professor of Computer 5 | Science, Emeritus at Princeton University, where he served as Chair 6 | from 2003-2009. His research focuses on the design, implementation, 7 | and operation of Internet-scale distributed systems, including the 8 | widely used PlanetLab and MeasurementLab platforms. He is currently 9 | contributing to the Aether access-edge cloud project at the Linux 10 | Foundation. Peterson is a member of the National Academy of 11 | Engineering, a Fellow of the ACM and the IEEE, the 2010 recipient of 12 | the IEEE Kobayashi Computer and Communication Award, and the 2013 13 | recipient of the ACM SIGCOMM Award. He received his Ph.D. degree from 14 | Purdue University. 15 | 16 | **Scott Baker** is a Cloud Software Architect at Intel, where he works 17 | on the Open Edge Platform. Prior to joining Intel, he was on the Open 18 | Networking Foundation (ONF) engineering team that built Aether, 19 | leading the runtime control effort. Baker has also worked on 20 | cloud-related research projects at Princeton and the University of 21 | Arizona, including PlanetLab, GENI, and VICCI. He received his 22 | Ph.D. in Computer Science from the University of Arizona in 2005. 23 | 24 | **Andy Bavier** is a Cloud Software Engineer at Intel, where he works 25 | on the Open Edge Platform. Prior to joining Intel, he was on the Open 26 | Networking Foundation (ONF) engineering team that built Aether, 27 | leading the observability effort. Bavier has also been a Research 28 | Scientist at Princeton University, where he worked on the PlanetLab 29 | project. He received a BA in Philosophy from William & Mary in 1990, 30 | an MS in Computer Science from the University of Arizona in 1995, and 31 | a PhD in Computer Science from Princeton University in 2004. 32 | 33 | **Zack Williams** is a Cloud Software Engineer at Intel, where he 34 | works on the Open Edge Platform. Prior to joining Intel, he was on the 35 | Open Networking Foundation (ONF) engineering team that built 36 | Aether, leading the infrastructure provisioning effort. Williams has also 37 | been a systems programmer at the University of Arizona. He received 38 | his BS in Computer Science from the University of Arizona in 2001. 39 | 40 | **Bruce Davie** is a computer scientist noted for his contributions to 41 | the field of networking. 
He began his networking career at Bellcore 42 | where he worked on the Aurora Gigabit testbed and collaborated with 43 | Larry Peterson on high-speed host-network interfaces. He then went to 44 | Cisco where he led a team of architects responsible for Multiprotocol 45 | Label Switching (MPLS). He worked extensively at the IETF on 46 | standardizing MPLS and various quality of service technologies. He 47 | also spent five years as a visiting lecturer at the Massachusetts 48 | Institute of Technology. In 2012 he joined Software Defined Networking 49 | (SDN) startup Nicira and was then a principal engineer at VMware 50 | following the acquisition of Nicira. In 2017 he took on the role of VP 51 | and CTO for the Asia Pacific region at VMware. He is a Fellow of the 52 | ACM and chaired ACM SIGCOMM from 2009 to 2013. Davie is the author of 53 | multiple books and the holder of more than 40 U.S. patents. 54 | 55 | -------------------------------------------------------------------------------- /code/build.sh: -------------------------------------------------------------------------------- 1 | $ mkdir ~/systemsapproach 2 | $ cd ~/systemsapproach 3 | $ git clone https://github.com/SystemsApproach/ops.git 4 | $ cd ops 5 | -------------------------------------------------------------------------------- /code/cluster-edge_val.tfvars: -------------------------------------------------------------------------------- 1 | cluster_name = "ace-X" 2 | cluster_nodes = { 3 | leaf1 = { 4 | user = "terraform" 5 | private_key = "~/.ssh/id_rsa_terraform" 6 | host = "10.64.10.133" 7 | roles = ["worker"] 8 | labels = ["node-role.aetherproject.org=switch"] 9 | taints = ["node-role.aetherproject.org=switch:NoSchedule"] 10 | }, 11 | leaf2 = { 12 | user = "terraform" 13 | private_key = "~/.ssh/id_rsa_terraform" 14 | host = "10.64.10.137" 15 | roles = ["worker"] 16 | labels = ["node-role.aetherproject.org=switch"] 17 | taints = ["node-role.aetherproject.org=switch:NoSchedule"] 18 | }, 19 | spine1 = { 20 | user = "terraform" 21 | private_key = "~/.ssh/id_rsa_terraform" 22 | host = "10.64.10.131" 23 | roles = ["worker"] 24 | labels = ["node-role.aetherproject.org=switch"] 25 | taints = ["node-role.aetherproject.org=switch:NoSchedule"] 26 | }, 27 | spine2 = { 28 | user = "terraform" 29 | private_key = "~/.ssh/id_rsa_terraform" 30 | host = "10.64.10.135" 31 | roles = ["worker"] 32 | labels = ["node-role.aetherproject.org=switch"] 33 | taints = ["node-role.aetherproject.org=switch:NoSchedule"] 34 | }, 35 | server-1 = { 36 | user = "terraform" 37 | private_key = "~/.ssh/id_rsa_terraform" 38 | host = "10.64.10.138" 39 | roles = ["etcd", "controlplane", "worker"] 40 | labels = [] 41 | taints = [] 42 | }, 43 | server-2 = { 44 | user = "terraform" 45 | private_key = "~/.ssh/id_rsa_terraform" 46 | host = "10.64.10.139" 47 | roles = ["etcd", "controlplane", "worker"] 48 | labels = [] 49 | taints = [] 50 | }, 51 | server-3 = { 52 | user = "terraform" 53 | private_key = "~/.ssh/id_rsa_terraform" 54 | host = "10.64.10.140" 55 | roles = ["etcd", "controlplane", "worker"] 56 | labels = [] 57 | taints = [] 58 | }, 59 | server-4 = { 60 | user = "terraform" 61 | private_key = "~/.ssh/id_rsa_terraform" 62 | host = "10.64.10.141" 63 | roles = ["worker"] 64 | labels = [] 65 | taints = [] 66 | }, 67 | server-5 = { 68 | user = "terraform" 69 | private_key = "~/.ssh/id_rsa_terraform" 70 | host = "10.64.10.142" 71 | roles = ["worker"] 72 | labels = [] 73 | taints = [] 74 | } 75 | } 76 | cluster_labels = { 77 | env = "production" 78 | clusterInfra = 
"bare-metal" 79 | clusterRole = "ace" 80 | k8s = "self-managed" 81 | coreType = "4g" 82 | upfType = "up4" 83 | } 84 | -------------------------------------------------------------------------------- /code/cluster-gcp_val.tfvars: -------------------------------------------------------------------------------- 1 | cluster_name = "amp-gcp" 2 | cluster_nodes = { 3 | amp-us-west2-a = { 4 | host = "10.168.0.18" 5 | roles = ["etcd", "controlplane", "worker"] 6 | labels = [] 7 | taints = [] 8 | }, 9 | amp-us-west2-b = { 10 | host = "10.168.0.17" 11 | roles = ["etcd", "controlplane", "worker"] 12 | labels = [] 13 | taints = [] 14 | }, 15 | amp-us-west2-c = { 16 | host = "10.168.0.250" 17 | roles = ["etcd", "controlplane", "worker"] 18 | labels = [] 19 | taints = [] 20 | } 21 | } 22 | cluster_labels = { 23 | env = "production" 24 | clusterInfra = "gcp" 25 | clusterRole = "amp" 26 | k8s = "self-managed" 27 | backup = "enabled" 28 | } 29 | -------------------------------------------------------------------------------- /code/log.ascii: -------------------------------------------------------------------------------- 1 | 2020-08-18 05:35:54.842Z INFO [DistributedP4RuntimeTableMirror] Synchronized TABLE_ENTRY mirror for device:leaf1: 0 removed, 2 updated, 4 added 2 | -------------------------------------------------------------------------------- /code/log.json: -------------------------------------------------------------------------------- 1 | { 2 | "time": "2020-08-18 05:35:54.842Z", 3 | "logLevel": "INFO", "component": "DistributedP4RuntimeTableMirror", 4 | "log": "Synchronized TABLE_ENTRY mirror for device:leaf1: 0 removed, 2 updated, 4 added" 5 | } 6 | -------------------------------------------------------------------------------- /code/main-rke.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | rancher2 = { 4 | source = "rancher/rancher2" 5 | } 6 | null = { 7 | source = "hashicorp/null" 8 | version = "~> 2.1.2" 9 | } 10 | } 11 | } 12 | 13 | resource "rancher2_cluster" "cluster" { 14 | name = var.cluster_config.cluster_name 15 | 16 | enable_cluster_monitoring = false 17 | enable_cluster_alerting = false 18 | 19 | labels = var.cluster_labels 20 | 21 | rke_config { 22 | kubernetes_version = var.cluster_config.k8s_version 23 | 24 | authentication { 25 | strategy = "x509" 26 | } 27 | 28 | monitoring { 29 | provider = "none" 30 | } 31 | 32 | network { 33 | plugin = "calico" 34 | } 35 | 36 | services { 37 | etcd { 38 | backup_config { 39 | enabled = true 40 | interval_hours = 6 41 | retention = 30 42 | } 43 | retention = "72h" 44 | snapshot = false 45 | } 46 | 47 | kube_api { 48 | service_cluster_ip_range = var.cluster_config.k8s_cluster_ip_range 49 | extra_args = { 50 | feature-gates = "SCTPSupport=True" 51 | } 52 | } 53 | 54 | kubelet { 55 | cluster_domain = var.cluster_config.cluster_domain 56 | cluster_dns_server = var.cluster_config.kube_dns_cluster_ip 57 | fail_swap_on = false 58 | extra_args = { 59 | cpu-manager-policy = "static" 60 | kube-reserved = "cpu=500m,memory=256Mi" 61 | system-reserved = "cpu=500m,memory=256Mi" 62 | feature-gates = "SCTPSupport=True" 63 | } 64 | } 65 | 66 | kube_controller { 67 | cluster_cidr = var.cluster_config.k8s_pod_range 68 | service_cluster_ip_range = var.cluster_config.k8s_cluster_ip_range 69 | extra_args = { 70 | feature-gates = "SCTPSupport=True" 71 | } 72 | } 73 | 74 | scheduler { 75 | extra_args = { 76 | feature-gates = "SCTPSupport=True" 77 | } 78 | } 79 | 80 | kubeproxy { 81 | 
extra_args = { 82 | feature-gates = "SCTPSupport=True" 83 | proxy-mode = "ipvs" 84 | } 85 | } 86 | } 87 | addons_include = ["https://raw.githubusercontent.com/k8snetworkplumbingwg/multus-cni/release-3.7/images/multus-daemonset.yml"] 88 | addons = var.addon_manifests 89 | } 90 | } 91 | 92 | resource "null_resource" "nodes" { 93 | triggers = { 94 | cluster_nodes = length(var.nodes) 95 | } 96 | 97 | for_each = var.nodes 98 | 99 | connection { 100 | type = "ssh" 101 | 102 | bastion_host = var.bastion_host 103 | bastion_private_key = file(var.bastion_private_key) 104 | bastion_user = var.bastion_user 105 | 106 | user = each.value.user 107 | host = each.value.host 108 | private_key = file(each.value.private_key) 109 | } 110 | 111 | provisioner "remote-exec" { 112 | inline = [< 300 6 | for: 1m 7 | labels: 8 | severity: critical 9 | - alert: SingleEdgeConnectTestFailing 10 | annotations: 11 | message: | 12 | Cluster {{`{{ .Labels.name }}`}} reporting UE connect failure for at least 10 minutes. 13 | expr: aetheredge_connect_test_ok{endpoint="metrics80"} < 1 14 | for: 10m 15 | labels: 16 | severity: critical -------------------------------------------------------------------------------- /code/provider.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = ">= 0.13" 3 | required_providers { 4 | rancher2 = { 5 | source = "rancher/rancher2" 6 | version = "= 1.15.1" 7 | } 8 | google = { 9 | source = "hashicorp/google" 10 | version = "~> 3.65.0" 11 | } 12 | null = { 13 | source = "hashicorp/null" 14 | version = "~> 2.1.2" 15 | } 16 | } 17 | } 18 | 19 | variable "rancher" { 20 | description = "Rancher credential" 21 | type = object({ 22 | url = string 23 | access_key = string 24 | secret_key = string 25 | }) 26 | } 27 | 28 | variable "gcp_config" { 29 | description = "GCP project and network configuration" 30 | type = object({ 31 | region = string 32 | compute_project = string 33 | network_project = string 34 | network_name = string 35 | subnet_name = string 36 | }) 37 | } 38 | 39 | provider "rancher2" { 40 | api_url = var.rancher.url 41 | access_key = var.rancher.access_key 42 | secret_key = var.rancher.secret_key 43 | } 44 | 45 | provider "google" { 46 | # Provide GCP credential using GOOGLE_CREDENTIALS environment variable 47 | project = var.gcp_config.compute_project 48 | region = var.gcp_config.region 49 | } 50 | -------------------------------------------------------------------------------- /code/roc-api-tests.groovy: -------------------------------------------------------------------------------- 1 | pipeline { 2 | ... 3 | stages { 4 | stage("Cleanup"){ 5 | ... 6 | } 7 | stage("Install Kind"){ 8 | ... 9 | } 10 | stage("Clone Test Repo"){ 11 | ... 12 | } 13 | stage("Setup Virtual Environment"){ 14 | ... 15 | } 16 | stage("Generate API Test Framework and API Tests"){ 17 | ... 
18 | } 19 | stage("Run API Tests"){ 20 | steps { 21 | sh """ 22 | mkdir -p /tmp/robotlogs 23 | cd ${WORKSPACE}/api-tests 24 | source ast-venv/bin/activate; set -u; 25 | robot ${WORKSPACE}/api-tests/ap_list.robot || true 26 | robot ${WORKSPACE}/api-tests/application.robot || true 27 | robot ${WORKSPACE}/api-tests/connectivity_service.robot || true 28 | robot ${WORKSPACE}/api-tests/device_group.robot || true 29 | robot ${WORKSPACE}/api-tests/enterprise.robot || true 30 | robot ${WORKSPACE}/api-tests/ip_domain.robot || true 31 | robot ${WORKSPACE}/api-tests/site.robot || true 32 | robot ${WORKSPACE}/api-tests/template.robot || true 33 | robot ${WORKSPACE}/api-tests/traffic_class.robot || true 34 | robot ${WORKSPACE}/api-tests/upf.robot || true 35 | robot ${WORKSPACE}/api-tests/vcs.robot || true 36 | """ 37 | } 38 | } 39 | } 40 | ... 41 | } -------------------------------------------------------------------------------- /code/template.yang: -------------------------------------------------------------------------------- 1 | module onf-template { 2 | ... 3 | description 4 | "The aether vcs-template holds common parameters used 5 | by a virtual connectivity service. Templates are used to 6 | populate a VCS."; 7 | typedef template-id { 8 | type yg:yang-identifier { 9 | length 1..32; 10 | } 11 | } 12 | container template { 13 | description "The top level container"; 14 | list template { 15 | key "id"; 16 | description 17 | "List of vcs templates"; 18 | leaf id { 19 | type template-id; 20 | description "ID for this vcs template."; 21 | } 22 | leaf display-name { 23 | type string { 24 | length 1..80; 25 | } 26 | description "display name to use in GUI or CLI"; 27 | } 28 | leaf sst { 29 | type at:sst; 30 | description "Slice/Service type"; 31 | } 32 | leaf sd { 33 | type at:sd; 34 | description "Slice differentiator"; 35 | } 36 | container device { 37 | description "Per-device QOS Settings"; 38 | container mbr { 39 | description "Maximum bitrate"; 40 | leaf uplink { 41 | type at:bitrate; 42 | units bps; 43 | description "Per-device mbr uplink data rate in mbps"; 44 | } 45 | leaf downlink { 46 | type at:bitrate; 47 | units bps; 48 | description "Per-device mbr downlink data rate in mbps"; 49 | } 50 | } 51 | } 52 | container slice { 53 | description "Per-Slice QOS Settings"; 54 | container mbr { 55 | description "Maximum bitrate"; 56 | leaf uplink { 57 | type at:bitrate; 58 | units bps; 59 | description "Per-Slice mbr uplink data rate in mbps"; 60 | } 61 | leaf downlink { 62 | type at:bitrate; 63 | units bps; 64 | description "Per-Slice mbr downlink data rate in mbps"; 65 | } 66 | } 67 | } 68 | leaf traffic-class { 69 | type leafref { 70 | path "/tc:traffic-class/tc:traffic-class/tc:id"; 71 | } 72 | description 73 | "Link to traffic class"; 74 | } 75 | leaf description { 76 | type at:description; 77 | description "description of this vcs template"; 78 | } 79 | } 80 | } 81 | } -------------------------------------------------------------------------------- /code/trigger-event.yaml: -------------------------------------------------------------------------------- 1 | - job-template: 2 | id: 'aether-patchset' 3 | name: 'aether-verify-{project}{suffix}' 4 | project-type: pipeline 5 | pipeline-script: 'aether-test.groovy' 6 | ... 
7 | triggers: 8 | - gerrit: 9 | server-name: '{gerrit-server-name}' 10 | dependency-jobs: '{dependency-jobs}' 11 | trigger-on: 12 | - patchset-created-event: 13 | exclude-drafts: true 14 | exclude-trivial-rebase: false 15 | exclude-no-code-change: true 16 | - draft-published-event 17 | - comment-added-contains-event: 18 | comment-contains-value: '(?i)^.*recheck$' 19 | ... -------------------------------------------------------------------------------- /code/trigger-time.yaml: -------------------------------------------------------------------------------- 1 | - job-template: 2 | id: aether-api-tests 3 | name: 'aether-api-{api-version}-tests-{release-version}' 4 | project-type: pipeline 5 | pipeline-file: 'aether-api-tests.groovy' 6 | ... 7 | triggers: 8 | - timed: | 9 | TZ=America/Los_Angeles 10 | H {time} * * * 11 | ... -------------------------------------------------------------------------------- /code/uptime.yaml: -------------------------------------------------------------------------------- 1 | "expr": "avg(avg_over_time(ace_e2e_ok{endpoint=\"metrics80\",name=\"$edge\"}[$__interval]) * 100)", 2 | -------------------------------------------------------------------------------- /conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Configuration file for the Sphinx documentation builder. 4 | # 5 | # This file does only contain a selection of the most common options. For a 6 | # full list see the documentation: 7 | # http://www.sphinx-doc.org/en/master/config 8 | 9 | # -- Path setup -------------------------------------------------------------- 10 | 11 | # If extensions (or modules to document with autodoc) are in another directory, 12 | # add these directories to sys.path here. If the directory is relative to the 13 | # documentation root, use os.path.abspath to make it absolute, like shown here. 14 | # 15 | # import os 16 | # import sys 17 | # sys.path.insert(0, os.path.abspath('.')) 18 | 19 | import os 20 | 21 | from subprocess import check_output, CalledProcessError 22 | 23 | def get_version(): 24 | 25 | try: 26 | version = check_output(['cat', 'VERSION'], 27 | universal_newlines=True) 28 | except CalledProcessError: 29 | return 'unknown version' 30 | 31 | return version.rstrip() 32 | 33 | # "version" is used for html build 34 | version = get_version() 35 | # "release" is used for LaTeX build 36 | release = version 37 | 38 | 39 | # -- Project information ----------------------------------------------------- 40 | 41 | project = u'Edge Cloud Operations: A Systems Approach' 42 | copyright = u'2022, Systems Approach LLC (Publisher)' 43 | author = u'Peterson, Baker, Bavier, Williams, Davie' 44 | 45 | # -- General configuration --------------------------------------------------- 46 | 47 | # If your documentation needs a minimal Sphinx version, state it here. 48 | # 49 | # needs_sphinx = '1.0' 50 | 51 | # make all warnings errors 52 | warning_is_error = False 53 | 54 | # Add any Sphinx extension module names here, as strings. They can be 55 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 56 | # ones. 
***Replace "mathjax" with "imgmath" for epub output.*** 57 | extensions = [ 58 | 'sphinx.ext.autosectionlabel', 59 | 'sphinx.ext.coverage', 60 | 'sphinx.ext.ifconfig', 61 | 'sphinx.ext.mathjax', 62 | 'sphinx.ext.todo', 63 | 'sphinxcontrib.spelling', 64 | "sphinx_multiversion", 65 | ] 66 | 67 | # Text files with lists of words that shouldn't fail the spellchecker: 68 | spelling_word_list_filename=['dict.txt', ] 69 | 70 | # Add any paths that contain templates here, relative to this directory. 71 | templates_path = ['_templates'] 72 | 73 | # The suffix(es) of source filenames. 74 | # You can specify multiple suffix as a list of string: 75 | # 76 | # source_suffix = ['.rst', '.md'] 77 | source_suffix = '.rst' 78 | 79 | # The master toctree document. 80 | master_doc = 'index' 81 | 82 | # The language for content autogenerated by Sphinx. Refer to documentation 83 | # for a list of supported languages. 84 | # 85 | # This is also used if you do content translation via gettext catalogs. 86 | # Usually you set "language" from the command line for these cases. 87 | language = 'en' 88 | 89 | # List of patterns, relative to source directory, that match files and 90 | # directories to ignore when looking for source files. 91 | # This pattern also affects html_static_path and html_extra_path. 92 | exclude_patterns = [u'_build', 'venv-docs', 'requirements.txt', 'Thumbs.db', 'private', '.DS_Store', '*/README.rst'] 93 | 94 | # The name of the Pygments (syntax highlighting) style to use. 95 | pygments_style = None 96 | 97 | # Enable numbered figures 98 | numfig = True 99 | numfig_format = { 100 | 'figure': 'Figure %s.', 101 | 'table': 'Table %s.' 102 | } 103 | 104 | # Ignore link check for the following websites 105 | linkcheck_ignore = [ 106 | 'https://www.amazon.com/','https://amzn.to/' 107 | ] 108 | 109 | # -- Options for HTML output ------------------------------------------------- 110 | 111 | # The theme to use for HTML and HTML Help pages. See the documentation for 112 | # a list of builtin themes. 113 | # 114 | html_theme = 'sphinx_rtd_theme' 115 | 116 | # Theme options are theme-specific and customize the look and feel of a theme 117 | # further. For a list of options available for each theme, see the 118 | # documentation. 119 | # 120 | html_theme_options = { 121 | 'prev_next_buttons_location': 'both' 122 | } 123 | 124 | # Add any paths that contain custom static files (such as style sheets) here, 125 | # relative to this directory. They are copied after the builtin static files, 126 | # so a file named "default.css" will overwrite the builtin "default.css". 127 | html_static_path = ['_static'] 128 | 129 | html_css_files = [ 130 | 'css/rtd_theme_mods.css', 131 | ] 132 | 133 | 134 | # HTML Favicon 135 | html_favicon = '_static/bridge.ico' 136 | 137 | # HTML Index 138 | html_use_index = False 139 | 140 | # Custom sidebar templates, must be a dictionary that maps document names 141 | # to template names. 142 | # 143 | # The default sidebars (for documents that don't match any pattern) are 144 | # defined by theme itself. Builtin themes are using these templates by 145 | # default: ``['localtoc.html', 'relations.html', 'sourcelink.html', 146 | # 'searchbox.html']``. 147 | # 148 | # html_sidebars = {} 149 | 150 | #extra HTML files 151 | html_extra_path = ['_extra'] 152 | 153 | # -- Options for HTMLHelp output --------------------------------------------- 154 | 155 | # Output file base name for HTML help builder. 
156 | htmlhelp_basename = 'SystemsApproach' 157 | 158 | 159 | # -- Options for LaTeX output ------------------------------------------------ 160 | #latex_engine = 'xelatex' 161 | 162 | latex_elements = { 163 | # The paper size ('letterpaper' or 'a4paper'). 164 | # 165 | 'papersize': 'letterpaper', 166 | 167 | # The font size ('10pt', '11pt' or '12pt'). 168 | # 169 | 'pointsize': '11pt', 170 | 171 | # Get unicode to work 172 | # 173 | 'fontenc': '\\usepackage[LGR,T1]{fontenc}', 174 | 175 | # Latex figure (float) alignment 176 | # 177 | 'figure_align': 'ht', 178 | } 179 | 180 | # Grouping the document tree into LaTeX files. List of tuples 181 | # (source start file, target name, title, 182 | # author, documentclass [howto, manual, or own class]). 183 | latex_documents = [ 184 | (master_doc, 'book.tex', u'Edge Cloud Operations: A Systems Approach', 185 | u'Peterson, Baker, Bavier, Williams and Davie ', 'manual', True), 186 | ] 187 | 188 | latex_toplevel_sectioning = 'chapter' 189 | 190 | 191 | # -- Options for manual page output ------------------------------------------ 192 | 193 | # One entry per manual page. List of tuples 194 | # (source start file, name, description, authors, manual section). 195 | man_pages = [ 196 | (master_doc, 'Systems Approach', u'Systems Approach', 197 | [author], 1) 198 | ] 199 | 200 | 201 | # -- Options for Texinfo output ---------------------------------------------- 202 | 203 | # Grouping the document tree into Texinfo files. List of tuples 204 | # (source start file, target name, title, author, 205 | # dir menu entry, description, category) 206 | texinfo_documents = [ 207 | (master_doc, 'Edge Cloud Opetaions', u'Edge Cloud Operations', 208 | author, 'Peterson, Baker, Bavier, Williams, and Davie', 'A Systems Approach', 209 | 'Miscellaneous'), 210 | ] 211 | 212 | 213 | # -- Options for Epub output ------------------------------------------------- 214 | epub_title = project 215 | epub_description = 'Building a Cloud Management Platform' 216 | epub_cover = ('_static/cover.jpg', '') 217 | epub_show_urls = 'False' 218 | epub_use_index = False 219 | 220 | # The unique identifier of the text. This can be a ISBN number 221 | # or the project homepage. 222 | # 223 | # epub_identifier = '' 224 | 225 | # A unique identification for the text. 226 | # 227 | # epub_uid = '' 228 | 229 | # A list of files that should not be packed into the epub file. 230 | epub_exclude_files = ['search.html','robots.txt'] 231 | 232 | 233 | # -- Extension configuration ------------------------------------------------- 234 | 235 | # -- options for Intersphinx extension --------------------------------------- 236 | 237 | intersphinx_mapping = { 238 | 'sphinx': ('https://www.sphinx-doc.org/en/master', None), 239 | 'aether': ('https://docs.aetherproject.org/master', None), 240 | 'sdcore': ('https://docs.sd-core.opennetworking.org/master', None), 241 | 'sdran': ('https://docs.sd-ran.org/master', None), 242 | 'sdran': ('https://docs.sd-fabric.org/master', None), 243 | 'sysapproach5g': ('https://5g.systemsapproach.org/', None), 244 | 'sysapproachnet': ('https://book.systemsapproach.org/', None), 245 | 'sysapproachsdn': ('https://sdn.systemsapproach.org/', None), 246 | } 247 | 248 | # -- Options for todo extension ---------------------------------------------- 249 | # If true, `todo` and `todoList` produce output, else they produce nothing. 
250 | todo_include_todos = True 251 | 252 | 253 | # -- Set up Google Analytics 254 | # -- using approach at https://stackoverflow.com/questions/9444342/adding-a-javascript-script-tag-some-place-so-that-it-works-for-every-file-in-sph/41885884#41885884 255 | 256 | 257 | GA_INVOKE_JS = """ 258 | window.dataLayer = window.dataLayer || []; 259 | function gtag(){dataLayer.push(arguments);} 260 | gtag('js', new Date()); 261 | 262 | gtag('config', 'G-K101Q1MWLM'); 263 | """ 264 | 265 | def setup(app): 266 | 267 | app.add_js_file('https://www.googletagmanager.com/gtag/js?id=G-K101Q1MWLM', loading_method="async") 268 | app.add_js_file(None, body=GA_INVOKE_JS) 269 | -------------------------------------------------------------------------------- /dict.txt: -------------------------------------------------------------------------------- 1 | Acknowledgements 2 | Adaptor 3 | Adaptors 4 | Aether 5 | Alertmanager 6 | Ansible 7 | Anthos 8 | Atomix 9 | BMC 10 | Bavier 11 | Bazel 12 | Calcote 13 | Chiu 14 | Condon 15 | Config 16 | Davie 17 | DevOps 18 | Docker 19 | Dockerfile 20 | ECS 21 | ElasticStack 22 | Elkstack 23 | Fluentbit 24 | Fluentd 25 | GCP 26 | GPP 27 | Gerrit 28 | Gradle 29 | Grafana 30 | Hostname 31 | Hyunsun 32 | IPMI 33 | IaaS 34 | IoT 35 | Istio 36 | Jaeger 37 | Kahn 38 | Keycloak 39 | Kibana 40 | Kobayashi 41 | Kubernetes 42 | Lifecycle 43 | Linkerd 44 | Logstash 45 | Makefile 46 | Multiprotocol 47 | NetBox 48 | Netplan 49 | Nginx 50 | Nicira 51 | ONOS 52 | Oauth 53 | Observability 54 | Oguz 55 | Onos 56 | POD 57 | PODs 58 | PaaS 59 | Ph 60 | Plugable 61 | Pluggable 62 | Proxmox 63 | QoS 64 | RKE 65 | Redfish 66 | Renderer 67 | Repo 68 | Repos 69 | Runtime 70 | SDN 71 | Sigelman 72 | Suchitra 73 | Sunay 74 | Sys 75 | Syslog 76 | Sámi 77 | Tanzu 78 | Telco 79 | Telcos 80 | Terraform 81 | Todo 82 | Tofino 83 | Uber 84 | VM 85 | VMs 86 | VMware 87 | Vemuri 88 | Weaveworks 89 | absorber 90 | adaptor 91 | adaptors 92 | analytics 93 | architected 94 | auth 95 | backend 96 | bitrate 97 | centric 98 | cloudified 99 | config 100 | customizable 101 | datacenter 102 | datacenters 103 | de 104 | decrypt 105 | decrypting 106 | deployable 107 | disaggregate 108 | disaggregated 109 | disaggregation 110 | downlink 111 | eBPF 112 | eNB 113 | eNBs 114 | evolvable 115 | exemplifed 116 | facto 117 | filesystem 118 | frontend 119 | gNMI 120 | gNOI 121 | gNodeB 122 | gNodeBs 123 | gRPC 124 | heatmap 125 | hoc 126 | hyperscale 127 | hyperscaler 128 | hyperscalers 129 | iPXE 130 | instantiation 131 | integrators 132 | invariants 133 | jitter 134 | lifecycle 135 | linter 136 | liveness 137 | llp 138 | mbr 139 | microservice 140 | microservices 141 | mindshare 142 | namespaces 143 | natively 144 | observability 145 | onos 146 | operationalization 147 | operationalize 148 | operationalized 149 | operationalizes 150 | operationalizing 151 | orchestrator 152 | pre 153 | precompiled 154 | prem 155 | programmatically 156 | qsfp 157 | reactively 158 | rearchitecting 159 | recode 160 | repo 161 | repos 162 | repurpose 163 | roadmap 164 | rollout 165 | rst 166 | runtime 167 | runtimes 168 | scalability 169 | scalable 170 | signalling 171 | stderr 172 | stdin 173 | stdout 174 | storylines 175 | subcomponents 176 | subnet 177 | systemsapproach 178 | textboxes 179 | todo 180 | todolist 181 | toolchain 182 | toolset 183 | untrusted 184 | unwinnable 185 | uplink 186 | uptime 187 | virtualenv 188 | -------------------------------------------------------------------------------- /figures.pptx: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/figures.pptx -------------------------------------------------------------------------------- /figures/Slide1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/figures/Slide1.png -------------------------------------------------------------------------------- /figures/Slide10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/figures/Slide10.png -------------------------------------------------------------------------------- /figures/Slide11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/figures/Slide11.png -------------------------------------------------------------------------------- /figures/Slide12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/figures/Slide12.png -------------------------------------------------------------------------------- /figures/Slide13.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/figures/Slide13.png -------------------------------------------------------------------------------- /figures/Slide14.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/figures/Slide14.png -------------------------------------------------------------------------------- /figures/Slide15.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/figures/Slide15.png -------------------------------------------------------------------------------- /figures/Slide16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/figures/Slide16.png -------------------------------------------------------------------------------- /figures/Slide17.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/figures/Slide17.png -------------------------------------------------------------------------------- /figures/Slide18.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/figures/Slide18.png -------------------------------------------------------------------------------- /figures/Slide19.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/figures/Slide19.png 
-------------------------------------------------------------------------------- /figures/Slide2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/figures/Slide2.png -------------------------------------------------------------------------------- /figures/Slide20.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/figures/Slide20.png -------------------------------------------------------------------------------- /figures/Slide21.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/figures/Slide21.png -------------------------------------------------------------------------------- /figures/Slide22.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/figures/Slide22.png -------------------------------------------------------------------------------- /figures/Slide23.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/figures/Slide23.png -------------------------------------------------------------------------------- /figures/Slide24.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/figures/Slide24.png -------------------------------------------------------------------------------- /figures/Slide25.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/figures/Slide25.png -------------------------------------------------------------------------------- /figures/Slide26.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/figures/Slide26.png -------------------------------------------------------------------------------- /figures/Slide27.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/figures/Slide27.png -------------------------------------------------------------------------------- /figures/Slide3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/figures/Slide3.png -------------------------------------------------------------------------------- /figures/Slide4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/figures/Slide4.png -------------------------------------------------------------------------------- /figures/Slide5.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/figures/Slide5.png -------------------------------------------------------------------------------- /figures/Slide6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/figures/Slide6.png -------------------------------------------------------------------------------- /figures/Slide7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/figures/Slide7.png -------------------------------------------------------------------------------- /figures/Slide8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/figures/Slide8.png -------------------------------------------------------------------------------- /figures/Slide9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/figures/Slide9.png -------------------------------------------------------------------------------- /figures/ace_dash.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/figures/ace_dash.png -------------------------------------------------------------------------------- /figures/cable_list.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/figures/cable_list.png -------------------------------------------------------------------------------- /figures/es_dash.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/figures/es_dash.png -------------------------------------------------------------------------------- /figures/gui1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/figures/gui1.png -------------------------------------------------------------------------------- /figures/gui2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/figures/gui2.png -------------------------------------------------------------------------------- /figures/pronto_logical_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/figures/pronto_logical_diagram.png -------------------------------------------------------------------------------- /figures/rack_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/figures/rack_diagram.png -------------------------------------------------------------------------------- /figures/upf_dash.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/figures/upf_dash.png -------------------------------------------------------------------------------- /foreword.rst: -------------------------------------------------------------------------------- 1 | Foreword 2 | ========== 3 | 4 | 5 | First the applications all moved to the cloud. And now they're being 6 | torn apart. Let me explain what I mean by that. 7 | 8 | As markets grow, the unit of function around which one can build a 9 | business shrinks. A classic example of this can be seen in the history 10 | of the automotive industry. The Ford River Rouge Complex was built in 11 | the late 1920s. At the time, mass-produced cars were relatively new, 12 | and the market was relatively small. And so factories like the River Rouge 13 | Complex had to build all the subcomponents too. Roughly, in one side 14 | of the factory went water, rubber, and iron ore, and out the other 15 | side came full automobiles. Of course, as the market for cars grew, so 16 | did a massive ecosystem of suppliers of car components: wheels, 17 | seats, floor mats, and the like. Today the large car companies are 18 | more akin to integrators than auto parts makers. 19 | 20 | The same dynamic is happening with the application. In the 1970s the 21 | same manufacturer would build the chips, the circuit boards, the 22 | system form factor, the operating system, and each of the 23 | applications. Over time as the market has grown, the system has 24 | disaggregated. The hardware and software separated and spawned multiple 25 | independent companies. And then companies started to be built around 26 | independent applications. 27 | 28 | The market hasn't stopped growing and over the last few years we've 29 | seen the application itself disaggregate. Commonly used subcomponents 30 | of applications are being pulled out, and entire companies and 31 | projects are being built around them. Today, if you're building an 32 | application, there are third-party APIs available for authenticating 33 | users, sending texts or email, streaming videos, authorizing access to 34 | resources, and many other useful functions. 35 | 36 | So what does this have to do with the book you're about to read? While 37 | the last decade was a consolidation of applications into the 38 | cloud, the next decade is largely going to be about the explosion of 39 | applications and application components away from it. Now that 40 | subcomponents of workloads have been largely decoupled from having to 41 | sit with the application, they can be run anywhere. And in particular 42 | they can be run on infrastructure that's purposely built and optimized 43 | for them! In fact, we are starting to see what can only be described 44 | as an anti-cloud trend where large companies are choosing to pull some 45 | workloads back from large clouds to their own optimized 46 | infrastructure. And we're even seeing startups choosing to build their 47 | own infrastructure from the get-go because they understand the cost 48 | and performance advantages of doing so. 49 | 50 | In "Edge Cloud Operations: A Systems Approach" the authors provide a 51 | detailed overview of not just cloud operations (which are so last 52 | decade) but operations in this new era of distributed clouds. 
In many 53 | ways, the cloud era was a low point of systems, because so much below 54 | the application layer was buried deep within the engineering organizations of 55 | the three large cloud providers. But that's changing, and to change 56 | with it, you need to understand how it all works. And that's exactly 57 | why you need to read this book. 58 | 59 | | Martin Casado 60 | | General Partner, a16z 61 | -------------------------------------------------------------------------------- /index.rst: -------------------------------------------------------------------------------- 1 | .. image:: _static/SystemsApproachLogoURL.png 2 | :width: 300px 3 | :align: center 4 | :target: https://systemsapproach.org 5 | 6 | | 7 | 8 | Edge Cloud Operations: A Systems Approach 9 | ========================================= 10 | 11 | Peterson, Baker, Bavier, Williams and Davie 12 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 13 | 14 | | 15 | 16 | .. toctree:: 17 | :maxdepth: 2 18 | :caption: Table of Contents 19 | 20 | foreword.rst 21 | preface.rst 22 | intro.rst 23 | arch.rst 24 | provision.rst 25 | lifecycle.rst 26 | control.rst 27 | monitor.rst 28 | README.rst 29 | authors.rst 30 | latest.rst 31 | print.rst 32 | 33 | -------------------------------------------------------------------------------- /intro.rst: -------------------------------------------------------------------------------- 1 | Chapter 1: Introduction 2 | ======================== 3 | 4 | Clouds provide a set of tools for bringing up and operating scalable 5 | services, but how do you operationalize a cloud in the first place? 6 | The two problems are not mutually exclusive—after all, a cloud is 7 | realized as a set of services—but asking the question this way 8 | eliminates the temptation to give the answer “the cloud takes care of 9 | that for you.” This book describes how to operationalize a cloud, 10 | starting with bare-metal hardware, all the way to offering one or more 11 | managed services to users. 12 | 13 | Few of us are likely to have reason to instantiate a hyperscale 14 | datacenter, but deploying private edge clouds in an enterprise—and 15 | optionally connecting that edge to a datacenter to form a hybrid 16 | cloud—is becoming increasingly common. We use the term "edge cloud" to 17 | distinguish our focus from the "core", which is the traditional domain 18 | of the hyperscale operators. The edge is more likely to be in a 19 | enterprise or an "Internet of Things" setting such as a factory. The 20 | edge is the place where the cloud services connect to the real world, 21 | e.g., via sensors and actuators, and where latency-sensitive services 22 | are deployed to be close to the consumers of those services.\ [#]_ 23 | 24 | .. [#] Server clusters hosted in co-location facilities can also be 25 | considered edge clouds, and benefit from the technologies and 26 | practices described in this book, but we use enterprises as our 27 | exemplar deployment because they expose a broader set of 28 | requirements. 29 | 30 | The hyperscalers are indeed willing to manage your edge cloud for you, 31 | as an extension of their core datacenters. And correspondingly, there 32 | is significant activity to provide such products, with Google’s 33 | Anthos, Microsoft’s Azure Arc, and Amazon’s ECS-Anywhere as prime 34 | examples. But the barrier to operationalizing a cloud is not so high 35 | that only a hyperscaler has the wherewithal to do it. 
It is possible 36 | to build a cloud—and all the associated lifecycle management and 37 | runtime controls that are required to operate it—using readily 38 | available open source software packages. 39 | 40 | .. sidebar:: Developers Have an Equal Role to Play 41 | 42 | *This book takes an operator-centric view of cloud operations, but 43 | developers have an equal role to play. This role is reflected in 44 | practices like DevOps (which we discuss in Section 2.5), but can 45 | also be seen in the underlying system design. The cloud 46 | architecture includes a management platform, which specifies a 47 | runtime interface through which service developers (who provide 48 | functionality) interact with cloud operators (who manage that 49 | functionality). Because there is a shared management platform to 50 | leverage, developers do not need to (and should not) reinvent the 51 | wheel when it comes to provisioning, configuring, controlling, and 52 | monitoring the services they implement.* 53 | 54 | *Looking at the broader picture, this management platform is an 55 | essential part of how app builders and service developers deliver 56 | functionality to end users. Today, functionality is most often 57 | delivered as a Managed Service (as opposed to an inert pile of 58 | software). This means developers not only have to worry about the 59 | algorithms and data structures needed to implement their app or 60 | service, they also need to interface with the platform that 61 | operationalizes (activates) their code. It is common to focus on the 62 | former and view the latter as a burden (especially if someone else 63 | will be responsible for deploying and operating their code), but 64 | coding to the management platform interface is a central part of the 65 | contract for delivering a managed service. Understanding and 66 | appreciating the "hows" and "whys" of this platform is critical to 67 | developers doing their job.* 68 | 69 | This book describes what such a cloud management platform looks 70 | like. Our approach is to focus on the fundamental problems that must 71 | be addressed—design issues that are common to all clouds—but then 72 | couple this conceptual discussion with specific engineering choices 73 | made while operationalizing a particular enterprise cloud. Our example 74 | is Aether, an open source edge cloud that supports 5G connectivity as 75 | a managed service. Aether has the following properties that make it an 76 | interesting use case to study: 77 | 78 | * Aether starts with bare-metal hardware (servers and switches) 79 | deployed in edge sites (e.g., enterprises). This on-prem cloud can 80 | range in size from a partial rack to a multi-rack cluster, assembled 81 | according to the best practices used in datacenters. 82 | 83 | * Aether supports both “edge services” running on these on-prem 84 | clusters and “centralized services” running in commodity cloud 85 | datacenters. In this sense it is a hybrid cloud.\ [#]_ 86 | 87 | * Aether augments this edge cloud with 5G-Connectivity-as-a-Service, 88 | giving us a service that must be operationalized (in addition to the 89 | underlying cloud). The end result is that Aether provides a managed 90 | Platform-as-a-Service (PaaS). 91 | 92 | * Aether is built entirely from open source components. The only thing 93 | it adds is the “glue code” and “specification directives” required 94 | to make it operational. This means the recipe is fully reproducible 95 | by anyone. 96 | 97 | ..
[#] Technically, Aether is also a multi-cloud because it is 98 | designed to take advantage of services provided by multiple 99 | public clouds, but the private/public (edge/central) aspect is 100 | the most relevant, so we use hybrid terminology throughout this book. 101 | 102 | There is another important reason Aether makes for an interesting 103 | example. It is a system being deployed at the confluence of three 104 | traditionally distinct management domains: enterprises (where system 105 | admins have long been responsible for installing and maintaining 106 | purpose-built appliances), network operators (where access 107 | technologies have historically been delivered as Telco-based 108 | solutions), and cloud providers (where commodity hardware and cloud 109 | native software is now readily available). This complicates our job, 110 | because each of these three domains brings its own conventions and 111 | terminology to the table. But understanding how these three 112 | stakeholders approach operationalization gives us a broader 113 | perspective on the problem. We return to the confluence of enterprise, 114 | cloud, and access technologies later in this chapter, but we start by 115 | addressing the terminology challenge. 116 | 117 | .. _reading_aether: 118 | .. admonition:: Further Reading 119 | 120 | `Aether: 5G-Connected Edge Cloud 121 | `__. 122 | 123 | `Aether Documentation 124 | `__. 125 | 126 | 1.1 Terminology 127 | --------------- 128 | 129 | The terminology used to talk about operating cloud services represents 130 | a mix of “modern” concepts that are native to the cloud, and 131 | “traditional” concepts that are artifacts from earlier systems (many 132 | of which are now being subsumed by the cloud, but retain some of their 133 | original operational language). This is especially true at the 134 | intersection of the cloud and Telcos, who—like the Sámi of Scandinavia 135 | having over 180 words for snow—have an exceedingly rich vocabulary 136 | for *operating* a network. 137 | 138 | A major source of confusion is that we are in the midst of a 139 | transition from network systems being built from purpose-built 140 | *devices* to software-based *services* running on commodity 141 | hardware. This often results in multiple terms being used for the same 142 | concept, or more problematically, having one domain subtly repurpose a 143 | term from another domain. To avoid talking past each other, it is 144 | important to first define a few concepts and introduce the related 145 | terminology. 146 | 147 | * **Operations & Maintenance (O&M):** A traditional term used to 148 | characterize the overall challenge of operationalizing a network, 149 | where generally speaking, operators use an O&M Interface to manage 150 | the system. 151 | 152 | * **FCAPS:** An acronym (Fault, Configuration, Accounting, Performance, 153 | Security) historically used in the Telco industry to enumerate the 154 | requirements for an operational system. The O&M interface must 155 | provide a means to detect and manage faults, configure the system, 156 | account for usage, and so on. 157 | 158 | * **OSS/BSS:** Another Telco acronym (Operations Support System, 159 | Business Support System), referring to the subsystem that 160 | implements both operational logic (OSS) and business logic 161 | (BSS). It is usually the top-most component in the overall O&M 162 | hierarchy. 
163 | 164 | * **EMS:** Yet another Telco acronym (Element Management System), 165 | corresponding to an intermediate layer in the overall O&M 166 | hierarchy. An EMS is to a particular type of device what an 167 | OSS/BSS is to the network as a whole. 168 | 169 | * **Orchestration:** A general term similar to O&M, but originating in 170 | the cloud context. Involves assembling (e.g., allocating, 171 | configuring, connecting) a collection of physical or logical 172 | resources on behalf of some workload. If only a single resource or 173 | device is involved, we would probably use a term like 174 | “configuration” instead, so orchestration typically implies 175 | “orchestrating” across multiple components. 176 | 177 | Narrowly defined, an orchestrator is responsible for spinning up 178 | virtual machines (or containers) and logically interconnecting them 179 | (with virtual networks). More broadly, orchestration encompasses 180 | aspects of all the management-related functions described in this 181 | book. 182 | 183 | If you are trying to map cloud terminology onto Telco terminology, 184 | an orchestrator is often equated with a cloudified version of the 185 | OSS/BSS mechanism. This top-most layer is sometimes called a 186 | *Service Orchestrator* since it is responsible for assembling a 187 | collection of *Virtual Network Functions (VNFs)* into an 188 | end-to-end-service chain. 189 | 190 | * **Playbook/Workflow:** A program or script that implements a 191 | multi-step orchestration process. (The term workflow is also used 192 | in a UX context to describe a multi-step operation that a user 193 | performs on a system using a GUI.) 194 | 195 | * **Provisioning:** Adding capacity (either physical or virtual 196 | resources) to a system, usually in response to changes in workload, 197 | including the initial deployment. 198 | 199 | * **Zero-Touch Provisioning:** Usually implies adding new hardware 200 | without requiring a human to configure it (beyond physically 201 | connecting the device). This implies the new component 202 | auto-configures itself, which means the term can also be applied 203 | to virtual resources (e.g., virtual machines, services) to 204 | indicate that no manual configuration step is needed to 205 | instantiate the resource. 206 | 207 | * **Remote Device Management:** A standard (e.g., IPMI, Redfish) that 208 | defines a way to remotely manage hardware devices in support of 209 | zero-touch provisioning. The idea is to send and receive 210 | out-of-band messages over the LAN in place of having video or serial 211 | console access to the device. Additionally, these may integrate with 212 | monitoring and other device health telemetry systems. 213 | 214 | * **Inventory Management:** Planning and tracking both the physical 215 | (racks, servers, switches, cabling) and virtual (IP ranges and 216 | addresses, VLANs) resources is a sub-step of the provisioning 217 | process. This process frequently starts using simple spreadsheets 218 | and text files, but as complexity grows, a dedicated database for 219 | inventory facilitates greater automation. 220 | 221 | * **Lifecycle Management:** Upgrading and replacing functionality (e.g., 222 | new services, new features to existing services) over time. 223 | 224 | * **Continuous Integration / Continuous Deployment (CI/CD):** An 225 | approach to Lifecycle Management in which the path from 226 | development (producing new functionality) to testing, integration, 227 | and ultimately deployment is an automated pipeline. 
CI/CD 228 | typically implies continuously making small incremental changes 229 | rather than performing large disruptive upgrades. 230 | 231 | * **DevOps:** An engineering discipline that fuses the Development 232 | process and Operational requirements silos, balancing feature 233 | velocity against system reliability. As a practice, it leverages 234 | CI/CD methods and is typically associated with container-based 235 | (also known as *cloud native*) systems. There is some overlap 236 | between DevOps and *Site 237 | Reliability Engineering (SRE)* as practiced by cloud providers such as 238 | Google. 239 | 240 | * **In-Service Software Upgrade (ISSU):** A requirement that a 241 | component continue running during the deployment of an upgrade, 242 | with minimal disruption to the service delivered to 243 | end-users. ISSU generally implies the ability to incrementally 244 | roll-out (and roll-back) an upgrade, but is specifically a 245 | requirement on individual components (as opposed to the 246 | platform used to manage a set of components). 247 | 248 | * **Monitoring & Telemetry:** Collecting data from system components 249 | to aid in management decisions. This includes diagnosing faults, 250 | tuning performance, doing root cause analysis, performing security 251 | audits, and provisioning additional capacity. 252 | 253 | * **Analytics:** A program (often using statistical models) that 254 | produces additional insights (value) from raw data. It can be used 255 | to close a control loop (i.e., auto-reconfigure a system based on 256 | these insights), but could also be targeted at a human operator 257 | who subsequently takes some action. 258 | 259 | Another way to talk about operations is in terms of stages, leading to 260 | a characterization that is common for traditional network devices: 261 | 262 | * **Day (-1):** Hardware configuration that is applied to a device (e.g., 263 | via a console) when it is first powered on. These configurations correspond 264 | to firmware (BIOS or similar) settings, and often need knowledge of how the 265 | device is physically connected to the network (e.g., the port being used). 266 | 267 | * **Day 0:** Connectivity configuration required to establish 268 | communication between the device and the available network services 269 | (e.g., setting a device’s IP address and default router). While such 270 | information may be provided manually, this is an opportunity to 271 | auto-configure the device, in support of Zero-Touch Provisioning. 272 | 273 | * **Day 1:** Service-level configuration needed by the device, including 274 | parameters that allow the device to take advantage of other services 275 | (e.g., NTP, Syslog, SMTP, NFS), as well as setting the parameters 276 | this device needs to perform whatever service it provides. At the 277 | end of Day-1 operationalization, the device is considered 278 | up-and-running, and able to support user traffic. This is also an 279 | opportunity for zero-touch provisioning, in the sense that 280 | pre-programmed playbooks (workflows) should be able to 281 | auto-configure the device rather than depending on human 282 | intervention. 283 | 284 | * **Day 2..N:** On-going management in support of day-to-day operations, 285 | coupled with monitoring the network to detect failures and service 286 | degradation, with the goal of sustaining the service. 
This may 287 | involve some closed-loop control, but is often human-intense, which 288 | involves monitoring a dashboard and fielding alerts, and then 289 | re-configuring the system as necessary. This is often referred to 290 | simply as "Day 2 Operations". 291 | 292 | Again, “Day x” is how traditional network vendors characterize the 293 | process of operationalizing the devices they sell, which in turn 294 | dictates how network operators and enterprise system admins bring 295 | those devices online. While the general framework has been extended to 296 | Virtual Network Functions (VNFs), it is still a device-centric view of 297 | operations. But once a system becomes cloud native, two things shift 298 | the balance of concerns. First, all hardware is commodity, and so Days 299 | 0 and 1 configurations become fully automated (and Day -1 is minimized 300 | since all devices are identical).\ [#]_ Second, Day 2 operations 301 | become a much more sophisticated process. This is because 302 | software-based systems are more agile, making functional upgrades more 303 | commonplace. This focus on *feature velocity* is one of the inherent 304 | values of cloud-based systems, but not surprisingly, it brings its own 305 | set of challenges to management. 306 | 307 | .. [#] Colloquially, this is sometimes referred to as a shift from 308 | taking care of pets to one of herding cattle. 309 | 310 | This book addresses those management challenges, which brings us to a 311 | final note about two words we use frequently: *Operating* and 312 | *Operationalizing*. Being able to operate a cloud is the end goal and 313 | implies an ongoing process, whereas to operationalize a cloud implies 314 | the process of bringing a set of hardware and software components into 315 | a state that makes it easy to sustain their ongoing operation. This 316 | distinction is relevant because operationalizing a cloud is not a 317 | one-time proposition, but rather, an essential aspect of day-to-day 318 | operations. Being rapidly evolvable is one of the cloud's most 319 | important features, making continual operationalization a key 320 | requirement for operating an edge cloud. 321 | 322 | 1.2 Disaggregation 323 | ------------------ 324 | 325 | To fully understand the challenge of operating a cloud, we have 326 | to start with the underlying building blocks: a collection of 327 | software-based microservices running on commodity hardware. These 328 | building blocks are the consequence of having *disaggregated* the 329 | bundled and purpose-built network appliances that came before. 330 | From the management perspective, it is helpful to identify what 331 | becomes easier and what becomes harder when you make this 332 | transition. This is both the challenge and the opportunity of 333 | disaggregation. 334 | 335 | Broadly speaking, disaggregation is the process of breaking large 336 | bundled components into a set of smaller constituent parts. SDN is one 337 | example of disaggregation—it decouples the network’s control and data 338 | planes, with the former running as a cloud service and the latter 339 | running in commodity switches. The microservice architecture is 340 | another example of disaggregation—it breaks monolithic cloud 341 | applications into a mesh of single-function components. Disaggregation 342 | is widely viewed as an essential step in accelerating feature velocity. 343 | This is the opportunity side of the story, which is one of the 344 | widely-claimed benefits of cloud native application architectures. 
A 345 | useful, if opinionated, view on such architectures is the Twelve-Factor 346 | App. 347 | 348 | .. _reading_disaggregate: 349 | .. admonition:: Further Reading 350 | 351 | Adam Wiggins. `The Twelve-Factor App. 352 | `__. 353 | 354 | The challenge side of the story is that there are many more moving 355 | parts that have to be integrated, coordinated, and managed. Circling 356 | back to terminology, Orchestration and Lifecycle Management become the 357 | dominant issues because (a) many smaller parts have to be assembled, 358 | and (b) these individual parts are expected to change more 359 | frequently. Much of this book focuses on these two issues. 360 | 361 | The good news is that the industry seems to have converged on 362 | *containers* as the common representation for “component packaging” 363 | and Kubernetes as the first-level *container orchestrator*. (We say 364 | “first-level” because Kubernetes is not sufficient by itself.) This 365 | foundation, in turn, makes many of the other challenges more 366 | manageable: 367 | 368 | * Monitoring and other telemetry-related mechanisms are themselves 369 | realized as a set of container-based microservices, deployed within 370 | the cloud they observe. 371 | 372 | * ISSU becomes more tractable because the microservice architecture 373 | encourages stateless components, with persistent state isolated in a 374 | single function-agnostic storage service, such as a key-value store. 375 | 376 | * Zero-Touch Provisioning is more tractable because the hardware is 377 | commodity, and hence, (nearly) identical. This also means the vast 378 | majority of configuration involves initializing software parameters, 379 | which is more readily automated. 380 | 381 | * Cloud native implies a set of best practices for addressing many of 382 | the FCAPS requirements, especially as they relate to availability 383 | and performance, both of which are achieved through horizontal 384 | scaling. Secure communication is also typically built into cloud RPC 385 | mechanisms. 386 | 387 | Another way to say this is that by rearchitecting bundled appliances 388 | and devices as horizontally scalable microservices running on 389 | commodity hardware, what used to be a set of one-off O&M problems are 390 | now solved by widely applied best practices from distributed systems, 391 | which have in turn been codified in state-of-the-art cloud management 392 | frameworks (like Kubernetes). This leaves us with the problem of (a) 393 | provisioning commodity hardware, (b) orchestrating the container 394 | building blocks, (c) deploying microservices to collect and archive 395 | monitoring data in a uniform way, and (d) continually integrating and 396 | deploying individual microservices as they evolve over time. 397 | 398 | Finally, because a cloud is infinitely programmable, the system being 399 | managed has the potential to change substantially over time.\ [#]_ 400 | This means that the cloud management system must itself be easily 401 | extended to support new features (as well as the refactoring of 402 | existing features). This is accomplished in part by implementing the 403 | cloud management system as a cloud service, which means we will see a 404 | fair amount of recursive dependencies throughout this book. It also 405 | points to taking advantage of declarative specifications of how all 406 | the disaggregated pieces fit together. 
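To give a flavor of what such a declarative specification might look like, consider the following hypothetical fragment (the component names, fields, and chart references are purely illustrative, not an actual Aether artifact), which declares the subsystems that make up a deployment and where each one runs::

   # Hypothetical declarative spec tying the disaggregated pieces together
   components:
     - name: sd-core                 # 5G control and user plane
       chart: aether/sd-core         # illustrative Helm chart reference
       clusters: [edge]
     - name: monitoring
       chart: prometheus-community/kube-prometheus-stack
       clusters: [edge, central]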
These specifications can then 407 | be used to generate elements of the management system, rather than 408 | having to manually recode them. This is a subtle issue we will return 409 | to in later chapters, but ultimately, we want to be able to 410 | auto-configure the subsystem responsible for auto-configuring the rest 411 | of the system. 412 | 413 | .. [#] For example, compare the two services Amazon offered ten years 414 | ago (EC2 and S3) with the well over 100 services available on 415 | the AWS console today (not counting the marketplace of 416 | partner-provided services). 417 | 418 | 419 | 1.3 Cloud Technology 420 | -------------------- 421 | 422 | Being able to operationalize a cloud starts with the building blocks 423 | used to construct the cloud in the first place. This section 424 | summarizes the available technology, with the goal of identifying the 425 | baseline capabilities of the underlying system. This baseline is then 426 | assumed by the collection of management-related subsystems described 427 | throughout this book. 428 | 429 | Before identifying these building blocks, we need to acknowledge that 430 | we are venturing into a gray area, having to do with what you consider 431 | to be “part of the platform being managed” versus “part of the 432 | subsystem that manages the platform.” To further complicate matters, 433 | where you draw the line shifts over time as technology matures and 434 | becomes ubiquitous. 435 | 436 | For example, if you start with the premise that a cloud hosts a set of 437 | containers, then your management layer would be responsible for 438 | detecting and restarting failed containers. On the other hand, if you 439 | assume containers are resilient (i.e., able to auto-recover), then the 440 | management layer would not need to include that functionality 441 | (although it probably still needs to detect when the auto-recovery 442 | mechanism fails and correct for that). This is not a unique 443 | situation—complex systems often include mechanisms that address 444 | problems at multiple levels. For the purpose of this book, we just 445 | need to decide on a line that separates “technology that is assumed” 446 | from “problems that remain and how we address them.” The following 447 | identifies the technology we assume. 448 | 449 | 1.3.1 Hardware Platform 450 | ~~~~~~~~~~~~~~~~~~~~~~~ 451 | 452 | The assumed hardware building blocks are straightforward. We start 453 | with bare-metal servers and switches, built using merchant silicon 454 | chips. These might, for example, be ARM or x86 processor chips and 455 | Tomahawk or Tofino switching chips, respectively. The bare-metal boxes 456 | also include a bootstrap mechanism (e.g., BIOS for servers and ONIE 457 | for switches), and a remote device management interface (e.g., IPMI or 458 | Redfish). 459 | 460 | .. _reading_redfish: 461 | .. admonition:: Further Reading 462 | 463 | DMTF. `Redfish 464 | `__. 465 | 466 | A physical cloud cluster is then constructed with the hardware 467 | building blocks arranged as shown in :numref:`Figure %s `: one 468 | or more racks of servers connected by a leaf-spine switching 469 | fabric. The servers are shown above the switching fabric to emphasize 470 | that software running on the servers controls the switches. 471 | 472 | .. _fig-hw: 473 | .. 
figure:: figures/Slide1.png 474 | :width: 400px 475 | :align: center 476 | 477 | Example building block components used to construct a cloud, 478 | including commodity servers and switches, interconnected by a 479 | leaf-spine switching fabric. 480 | 481 | :numref:`Figure %s ` also includes the assumed low-level 482 | software components, which we describe next. Collectively, all the 483 | hardware and software components shown in the figure form the 484 | *platform*. Where we draw the line between what's *in the platform* 485 | and what runs *on top of the platform*, and why it is important, will 486 | become clear in later chapters. The summary is that one mechanism is 487 | responsible for bringing up the platform and preparing it to host 488 | workloads, and a different mechanism is responsible for managing the 489 | various workloads that are deployed on that platform. 490 | 491 | 492 | 1.3.2 Software Building Blocks 493 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 494 | 495 | We assume four foundational software technologies, all running on the 496 | commodity processors in the cluster: 497 | 498 | 1. Linux provides isolation for running container workloads. 499 | 500 | 2. Docker containers package software functionality. 501 | 502 | 3. Kubernetes instantiates and interconnects containers. 503 | 504 | 4. Helm charts specify how collections of related containers are 505 | interconnected to build applications. 506 | 507 | These are all well known and ubiquitous, and so we only summarize them 508 | here. Links to related information for anyone who is not familiar 509 | with them (including excellent hands-on tutorials for the three 510 | container-related building blocks) are given below. 511 | 512 | Linux is the OS that runs on the bare metal systems. It provides 513 | low-level APIs that container runtime systems use to implement 514 | isolation, including *namespaces* to isolate filesystem and network 515 | access, and *cgroups* to limit memory and processor usage. 516 | 517 | Docker is a container runtime that leverages OS isolation APIs to 518 | instantiate and run multiple containers, each of which is an instance 519 | defined by a Docker image. Docker images are most frequently built 520 | using a Dockerfile, which uses a layering approach that allows sharing 521 | and building customized images on top of base images. A final image 522 | for a particular task incorporates all dependencies required by the 523 | software that is to run in the container, resulting in a container 524 | image that is portable across servers, depending only on the kernel 525 | and Docker runtime. We also assume one or more image artifact 526 | repositories of Docker containers that we will want to deploy in our 527 | cloud, of which ``__ is the best known 528 | example. 529 | 530 | .. _reading_docker: 531 | .. admonition:: Further Reading 532 | 533 | `Docker Tutorial 534 | `__. 535 | 536 | Kubernetes is a container management system. It provides a 537 | programmatic interface for scaling container instances up and down, 538 | allocating server resources to them, setting up virtual networks to 539 | interconnect those instances, and opening service ports that external 540 | clients can use to access those instances. Behind the scenes, 541 | Kubernetes monitors the liveness of those containers, and 542 | automatically restarts any that have failed. 
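As a minimal sketch of what such an instruction looks like (the microservice name and container image are hypothetical), a Kubernetes *Deployment* that asks for three replicas can be written as::

   apiVersion: apps/v1
   kind: Deployment
   metadata:
     name: microservice-x            # hypothetical microservice
   spec:
     replicas: 3                     # keep three instances running
     selector:
       matchLabels:
         app: microservice-x
     template:
       metadata:
         labels:
           app: microservice-x
       spec:
         containers:
           - name: microservice-x
             image: registry.example.com/microservice-x:1.0   # hypothetical image
             ports:
               - containerPort: 8080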
In other words, if you 543 | instruct Kubernetes to spin up three instances of microservice X, 544 | Kubernetes will do its best to keep three instances of the container 545 | that implements X running at all times. 546 | 547 | Kubernetes also provides mechanisms that can be used to configure 548 | microservices when they start up, including *ConfigMaps*, *Secrets*, 549 | and *Operators*. Because of the role they play in cloud management, we 550 | discuss these mechanisms in more detail as they are introduced in 551 | later chapters. 552 | 553 | .. _reading_k8s: 554 | .. admonition:: Further Reading 555 | 556 | `Kubernetes Tutorial 557 | `__. 558 | 559 | Helm is a configuration set manager that runs on top of Kubernetes. It issues 560 | calls against the Kubernetes API according to an operator-provided 561 | specification, known as a *Helm Chart*. It is now common practice for 562 | cloud applications built from a set of microservices to publish a Helm 563 | chart that defines how the application is to be deployed on a 564 | Kubernetes cluster. See ``__ for a collection of 565 | publicly available Helm Charts. 566 | 567 | .. _reading_helm: 568 | .. admonition:: Further Reading 569 | 570 | `Helm Tutorial 571 | `__. 572 | 573 | The cloud management software described in this book is available in 574 | the form of a set of Docker containers, plus the associated Helm 575 | Charts that specify how they are to be deployed in a Kubernetes 576 | cluster. Overall, we make use of over 20 such open source software 577 | packages in the chapters that follow. Our goal is to show how all 578 | these open building blocks can be assembled into a comprehensive cloud 579 | management platform. We describe each tool in enough detail to 580 | appreciate how all the parts fit together—providing end-to-end 581 | coverage by connecting all the dots—plus links to full documentation 582 | for those who want to dig deeper into the details. 583 | 584 | .. 585 | List: NexBox, Ansible, Netplan, Terraform, Rancher, Fleet, 586 | Prometheus, Grafana, AlertManager, Jenkins, Robot, Selenium, 587 | onos-config, Atomix, OPF, Kibana, Logstash, Elasticsearch, 588 | Kubernetes, Helm, Docker (21) 589 | 590 | 1.3.3 Switching Fabric 591 | ~~~~~~~~~~~~~~~~~~~~~~ 592 | 593 | We assume the cloud is constructed using an SDN-based switching 594 | fabric, with a disaggregated control plane running in the same cloud 595 | as the fabric interconnects. For the purpose of this book, we assume 596 | the following SDN software stack: 597 | 598 | * A Network OS hosts a set of control applications, including a 599 | control application that manages the leaf-spine switching fabric. We 600 | use ONOS as an open source exemplar Network OS. ONOS, in turn, hosts 601 | the SD-Fabric control app. 602 | 603 | * A Switch OS runs on each switch, providing a northbound gNMI and 604 | gNOI interface through which the Network OS controls and configures 605 | each switch. We use Stratum as an open source exemplar Switch OS. 606 | 607 | Building a cloud using an SDN-based switching fabric is a best 608 | practice adopted by hyperscaler cloud providers. Their solutions 609 | remain proprietary, so we use ONOS and Stratum as open source 610 | examples. It is noteworthy that ONOS and Stratum are both packaged as 611 | Docker containers, and so can be orchestrated (on *both* servers and 612 | switches) by Kubernetes and Helm.\ [#]_ 613 | 614 | .. 
[#] Switches often include a commodity processor, typically running 615 | Linux and hosting control software, in addition to any 616 | switching chip that implements the data plane. Stratum runs on 617 | this processor, and exports a northbound API that ONOS uses to 618 | configure and control the switch. 619 | 620 | 1.3.4 Repositories 621 | ~~~~~~~~~~~~~~~~~~ 622 | 623 | For completeness, we need to mention that nearly every mechanism 624 | described in this book takes advantage of cloud-hosted repositories, 625 | such as GitHub (for code), DockerHub (for Docker images), and 626 | ArtifactHub (for Helm charts). We also assume complementary systems 627 | like Gerrit, which layer a code-review mechanism on top of a Git 628 | repository, but having direct experience with Gerrit is not critical 629 | to understanding the material. 630 | 631 | .. _reading_github: 632 | .. admonition:: Further Reading 633 | 634 | `GitHub Tutorial 635 | `__. 636 | 637 | `Gerrit Code Review 638 | `__. 639 | 640 | 641 | 642 | 1.3.5 Other Options 643 | ~~~~~~~~~~~~~~~~~~~ 644 | 645 | Just as important as what building blocks we take for granted are the 646 | technologies we do not include. We discuss three here. 647 | 648 | First, you might have expected Service Mesh frameworks like Istio or 649 | Linkerd to be included. While it is true that anyone running 650 | applications on top of Kubernetes might decide to use Istio or Linkerd 651 | to help do that job—and this includes us, since much of the management 652 | system described in this book is implemented as a set of 653 | microservices—we happen to not take that approach. This is primarily 654 | an engineering choice: Service Meshes provide more features than we 655 | need, and correspondingly, we are able to realize the necessary 656 | functionality using more narrowly focused mechanisms. There is also a 657 | pedagogical reason: The fine-grained components we use are more 658 | consistent with our goal of identifying the elemental pieces of 659 | operations and management, rather than having those components bundled 660 | in a comprehensive package. We do, however, return to the role of 661 | service meshes in our discussion of observability in Chapter 6. 662 | 663 | .. sidebar:: What's the Master Plan? 664 | 665 | *There is a general issue of how one makes engineering choices about 666 | the combination of software packages to use in a cloud-based system 667 | like the one this book describes. Ignoring the plethora of commercial 668 | offerings, just the number of open source projects at the Linux 669 | Foundation and the Apache Foundation available to help you build and 670 | operate a cloud is (by our count) approaching 100. These projects 671 | are largely independent, and in many cases, competing for mindshare. 672 | This results in significant overlap in functionality, with any Venn 673 | diagram you try to draw constantly shifting over time as projects 674 | add and deprecate features.* 675 | 676 | *This is all to say, there is no master plan for what a cloud 677 | management stack should look like. If you start with component X as 678 | the centerpiece of your approach—perhaps because it solves your most 679 | immediate problem—you will end up adding dozens of other components 680 | over time to fully complete the system. Moreover, the end result 681 | will likely look different from the system someone else constructs 682 | starting with component Y. 
There simply is no consensus framework 683 | for which you get to select a component from column A, a second 684 | complementary component from column B, and so on. This is also true 685 | for the Aether managed service we use as an exemplar.* 686 | 687 | *This makes it all the more important that we take a first 688 | principles approach, which starts by identifying the set of 689 | requirements and exploring the design space. Only as a final step do 690 | we select an existing software component. This approach naturally 691 | results in an end-to-end solution that assembles many smaller 692 | components, and tends to avoid bundled/multi-faceted solutions. This 693 | does not inoculate us against having to evolve the system over time, 694 | but it does help to approach the topic with visibility into the full 695 | scope and complexity of the design space. And even if one ends up 696 | adopting a bundled solution, understanding all the trade-offs being 697 | made under the covers will help to make a more informed decision.* 698 | 699 | Second, we assume a container-based cloud platform. An alternative 700 | would have been VM-based. The main reason for this choice is that 701 | containers are rapidly becoming the de facto way to deploy scalable 702 | and highly available functionality, and operationalizing such 703 | functionality in enterprises is our primary use case. Containers are 704 | sometimes deployed inside of VMs (rather than directly on physical 705 | machines), but in that case, the VMs can be viewed as part of the 706 | underlying infrastructure (rather than a service that is offered to 707 | users). Another way of saying this is that this book focuses on how to 708 | operationalize a Platform-as-a-Service (PaaS) rather than an 709 | Infrastructure-as-a-Service (IaaS), although later chapters will 710 | describe how to introduce VMs as an optional way to provision the 711 | underlying infrastructure for that PaaS. 712 | 713 | Finally, the Aether edge cloud we use as an example is similar to many 714 | other cloud platforms being built to support on-prem deployments. 715 | The dominant use case shifts over time—with Artificial Intelligence 716 | (AI) recently overtaking Internet-of-Things (IoT) as the most 717 | compelling justification for edge clouds—but the operational 718 | challenge remains the same. For example, the *Open Edge Platform*, recently 719 | open sourced by Intel, includes example AI applications and a 720 | collection of AI libraries, but also an *Edge Management Framework* 721 | that mirrors the one described in this book. It starts with a Kubernetes 722 | foundation, and includes tools for provisioning edge servers, 723 | orchestrating edge clusters using those servers, lifecycle managing 724 | edge applications, and enabling observability. Many of the engineering 725 | choices are the same as in Aether (some are different), but the 726 | important takeaway is that Kubernetes-based edge clouds are quickly 727 | becoming commonplace. That's the reason they are such a good case 728 | study. 729 | 730 | .. admonition:: Further Reading 731 | 732 | `Open Edge Platform `__. 733 | 734 | `Edge Management Framework `__. 735 | 736 | 1.4 Future of the Sysadmin 737 | -------------------------- 738 | 739 | System administrators have been responsible for operating enterprise 740 | networks since the first file servers, client workstations, and LANs 741 | were deployed over 30 years ago.
Throughout that history, a robust 742 | vendor ecosystem has introduced an increasingly diverse set of network 743 | appliances, compounding the challenge of the sysadmin’s job. The 744 | introduction of virtualization technology led to server consolidation, 745 | but did not greatly reduce the management overhead. This is because each 746 | virtual appliance remains in a management silo. 747 | 748 | Cloud providers, because of the scale of the systems they build, 749 | cannot survive with operational silos, and so they introduced 750 | increasingly sophisticated cloud orchestration 751 | technologies. Kubernetes and Helm are two high-impact examples. These 752 | cloud best practices are now available to enterprises as well, but 753 | they are often bundled as a managed service, with the cloud provider 754 | playing an ever-greater role in operating the enterprise’s services. 755 | Outsourcing portions of the IT responsibility to a cloud provider is an 756 | attractive value proposition for many enterprises, but comes with the 757 | risk of increased dependence on a single provider. This equation is 758 | complicated by the increased likelihood that Mobile Network Operators 759 | (MNOs) also participate in the rollout of private 5G connectivity 760 | within the enterprise, deployed as yet another cloud service. 761 | 762 | The approach this book takes is to explore a best-of-both-worlds 763 | opportunity. It does this by walking you through the collection of 764 | subsystems, and associated management processes, required to 765 | operationalize an on-premises cloud, and then provide on-going support for 766 | that cloud and the services it hosts (including 5G connectivity). Our 767 | hope is that understanding what’s under the covers of cloud-managed 768 | services will help enterprises better share responsibility for 769 | managing their IT infrastructure with cloud providers, and potentially with 770 | MNOs. 771 | -------------------------------------------------------------------------------- /latest.rst: -------------------------------------------------------------------------------- 1 | .. role:: pop 2 | 3 | :pop:`Read The Latest!` 4 | ======================== 5 | 6 | `Systems Approach Newsletter: `__ Stay 7 | up to date with the latest developments by subscribing to the 8 | `Systems Approach Newsletter 9 | `__, where the authors 10 | connect the concepts and lessons in this book to what's happening in 11 | the Internet today. 12 | 13 | `Book Series: `__ Also check out 14 | our companion books that cover emerging topics in more depth. 15 | 16 | * `Private 5G: A Systems Approach `__ 17 | 18 | * `Software-Defined Networks: A Systems Approach `__ 19 | 20 | * `TCP Congestion Control: A Systems Approach `__ 21 | 22 | .. * `Edge Cloud Operations: A Systems Approach `__ 23 | -------------------------------------------------------------------------------- /monitor.rst: -------------------------------------------------------------------------------- 1 | Chapter 6: Monitoring and Telemetry 2 | ==================================== 3 | 4 | Collecting telemetry data for a running system is an essential 5 | function of the management platform. It enables operators to monitor 6 | system behavior, evaluate performance, make informed provisioning 7 | decisions, respond to failures, identify attacks, and diagnose 8 | problems. 
This chapter focuses on three types of telemetry 9 | data—*metrics*, *logs*, and *traces*\—along with exemplar open source 10 | software stacks available to help collect, store, and act upon each of 11 | them. 12 | 13 | Metrics are quantitative data about a system. These include common 14 | performance metrics such as link bandwidth, CPU utilization, and memory 15 | usage, but also binary results corresponding to "up" and "down", as 16 | well as other state variables that can be encoded numerically. These 17 | values are produced and collected periodically (e.g., every few 18 | seconds), either by reading a counter, or by executing a runtime test 19 | that returns a value. These metrics can be associated with physical 20 | resources such as servers and switches, virtual resources such as VMs and 21 | containers, or high-level abstractions such as the *Connectivity Service* 22 | described in Section 5.3. Given these many possible sources of data, 23 | the job of the metrics monitoring stack is to collect, archive, 24 | visualize, and optionally analyze this data. 25 | 26 | Logs are the qualitative data that is generated whenever a noteworthy 27 | event occurs. This information can be used to identify problematic 28 | operating conditions (i.e., it may trigger an alert), but more 29 | commonly, it is used to troubleshoot problems after they have been 30 | detected. Various system components—all the way from the low-level OS 31 | kernel to high-level cloud services—write messages that adhere to a 32 | well-defined format to the log. These messages include a timestamp, 33 | which makes it possible for the logging stack to parse and correlate 34 | messages from different components. 35 | 36 | Traces are a record of causal relationships (e.g., Service A calls 37 | Service B) resulting from user-initiated transactions or jobs. They 38 | are related to logs, but provide more specialized information about 39 | the context in which different events happen. Tracing is 40 | well-understood in a single program, where an execution trace is 41 | commonly recorded as an in-memory call stack, but traces are 42 | inherently distributed across a graph of network-connected 43 | microservices in a cloud setting. This makes the problem challenging, 44 | but also critically important because it is often the case that the 45 | only way to understand time-dependent phenomena—such as why a 46 | particular resource is overloaded—is to understand how multiple 47 | independent workflows interact with each other. 48 | 49 | Taking a step back from the three types of telemetry data, it is 50 | helpful to have a broad understanding of the design space, and to that 51 | end, we make four observations. 52 | 53 | First, there are two general use cases for telemetry data, which we 54 | broadly characterize as "monitoring" and "troubleshooting". We use 55 | these terms in the most general way to represent (a) proactively 56 | watching for warning signs of trouble (attacks, bugs, failures, 57 | overload conditions) in a steady-state system; versus (b) reactively 58 | taking a closer look to determine the root cause and resolve an issue 59 | (fix a bug, optimize performance, provision more resources, defend 60 | against an attack), once alerted to a potential problem. 
This 61 | distinction is important because the former (monitoring) needs to 62 | incur minimal overhead and require minimal human involvement, while 63 | the latter (troubleshooting) can be more invasive/expensive and 64 | typically involves some level of human expertise. This is not a 65 | perfect distinction, with plenty of operator activity happening in a 66 | gray area, but being aware of the cost/benefit trade-offs of the 67 | available tools is an important starting point. 68 | 69 | Second, the more aspects of monitoring and troubleshooting that can be 70 | automated, the better. This starts with alerts that automatically 71 | detect potential problems; typically includes dashboards that make it 72 | easy for humans to see patterns and drill down for relevant details 73 | across all three types of data; increasingly leverages Machine 74 | Learning and statistical analysis to identify deeper connections 75 | that are not obvious to human operators; and ultimately supports 76 | closed-loop control where the automated tool not only detects problems 77 | but is also able to issue corrective control directives. For the 78 | purpose of this chapter, we give examples of the first two (alerts and 79 | dashboards), and declare the latter two (analytics and close-loop 80 | control) as out of scope (but likely running as applications that 81 | consume the telemetry data outlined in the sections that follow). 82 | 83 | Third, when viewed from the perspective of lifecycle management, 84 | monitoring and troubleshooting are just a continuation of testing, 85 | except under production workloads rather than test workloads. In fact, 86 | the same set of tools can be used on either side of the 87 | development-vs-production boundary. For example, as anyone who has 88 | profiled a program will recognize and appreciate, tracing is an 89 | extremely valuable tool during development—both to track down bugs and 90 | to tune performance. Similarly, artificial end-to-end tests can 91 | provide value in production systems by triggering early warning 92 | alerts. This can be especially helpful when dealing with problematic 93 | failure modes. 94 | 95 | Finally, because the metrics, logs, and traces collected by the 96 | various subsystems are timestamped, it is possible to establish 97 | correlations among them, which is helpful when debugging a problem or 98 | deciding whether or not an alert is warranted. We give examples of how 99 | such telemetry-wide functions are implemented in practice today, and 100 | discuss the future of generating and using telemetry data, in the 101 | final two sections of this chapter. 102 | 103 | 6.1 Metrics and Alerts 104 | ------------------------------- 105 | 106 | Starting with metrics, a popular open source monitoring stack uses 107 | Prometheus to collect and store platform and service metrics, Grafana 108 | to visualize metrics over time, and Alertmanager to notify the 109 | operations team of events that require attention. In Aether, 110 | Prometheus and Alertmanager are instantiated on each edge cluster, 111 | with a single instantiation of Grafana running centrally in the 112 | cloud. More information about each tool is available online, so we 113 | focus more narrowly on (1) how individual Aether components "opt into" 114 | this stack, and (2) how the stack can be customized in 115 | service-specific ways. 116 | 117 | .. _reading_monitor: 118 | .. admonition:: Further Reading 119 | 120 | `Prometheus `__. 121 | 122 | `Grafana 123 | `__. 124 | 125 | `Alertmanager `__. 
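To give a feel for how a component plugs into this stack (the mechanics are detailed in the next subsection), the following is a minimal sketch of a Prometheus scrape configuration; the job name, polling interval, and Exporter endpoint are hypothetical::

   scrape_configs:
     - job_name: upf                                 # hypothetical job name
       scrape_interval: 15s                          # how often to pull metrics
       static_configs:
         - targets: ["upf-exporter.omec.svc:9089"]   # hypothetical Exporter endpoint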
126 | 127 | 128 | 6.1.1 Exporting Metrics 129 | ~~~~~~~~~~~~~~~~~~~~~~~ 130 | 131 | Individual components implement a *Prometheus Exporter* to provide the 132 | current value of the component's metrics. A component's Exporter is 133 | queried via HTTP, with the corresponding metrics returned using a 134 | simple text format. Prometheus periodically scrapes the Exporter's 135 | HTTP endpoint and stores the metrics in its Time Series Database 136 | (TSDB) for querying and analysis. Many client libraries are available 137 | for instrumenting code to produce metrics in Prometheus format. If a 138 | component's metrics are available in some other format, tools are 139 | often available to convert the metrics into Prometheus format and 140 | export them. 141 | 142 | A YAML configuration file specifies the set of Exporter endpoints that 143 | Prometheus is to pull metrics from, along with the polling frequency 144 | for each endpoint. Alternatively, Kubernetes-based microservices can 145 | be extended with a *Service Monitor* Custom Resource Definition (CRD) 146 | that Prometheus then queries to learn about any Exporter endpoints the 147 | microservice has made available. 148 | 149 | In addition to component-based Exporters, every edge cluster 150 | periodically tests end-to-end connectivity (for various definitions of 151 | end-to-end). One test determines whether the 5G control plane is 152 | working (i.e., the edge site can reach the SD-Core running in the 153 | central cloud) and a second test determines whether the 5G user plane 154 | is working (i.e., UEs can reach the Internet). This is a common 155 | pattern: individual components can export accumulators and other local 156 | variables, but only a "third-party observer" can actively test 157 | external behavior, and report the results. These examples correspond 158 | to the rightmost "End-to-End Tests" shown in :numref:`Figure %s 159 | ` of Chapter 4. 160 | 161 | Finally, when a system is running across multiple edge sites, as is 162 | the case with Aether, there is a design question of whether 163 | monitoring data is stored on the edge sites and lazily pulled to the 164 | central location only when needed, or is proactively pushed to the 165 | central location as soon as it's generated. Aether employs both 166 | approaches, depending on the volume and urgency of the data being 167 | collected. By default, metrics collected by the local instantiation of 168 | Prometheus stay on the edge sites, and only query results are returned 169 | to the central location (e.g., to be displayed by Grafana as described 170 | in the next subsection). This is appropriate for metrics that are both 171 | high-volume and seldom viewed. One exception is the end-to-end tests 172 | described in the previous paragraph. These results are immediately 173 | pushed to the central site (bypassing the local Prometheus instance), because 174 | they are low-volume and may require immediate attention. 175 | 176 | 6.1.2 Creating Dashboards 177 | ~~~~~~~~~~~~~~~~~~~~~~~~~ 178 | 179 | The metrics collected by Prometheus are visualized using Grafana 180 | dashboards. In Aether, this means the Grafana instance running as 181 | part of AMP in the central cloud sends queries to some combination of 182 | the central Prometheus instance and a subset of the Prometheus instances 183 | running on edge clusters. For example, :numref:`Figure %s 184 | ` shows the summary dashboard for a collection of Aether 185 | edge sites. 186 | 187 | .. _fig-ace_dash: 188 | ..
figure:: figures/ace_dash.png 189 | :width: 600px 190 | :align: center 191 | 192 | Central dashboard showing status of Aether edge deployments. 193 | 194 | Grafana comes with a set of predefined dashboards for the most common 195 | set of metrics—in particular, those associated with physical servers 196 | and virtual resources such as containers—but it can also be customized to 197 | include dashboards for service-level metrics and other 198 | deployment-specific information (e.g., per-enterprise in Aether). For 199 | example, :numref:`Figure %s ` shows a custom dashboard 200 | for UPF (User Plane Function), the data plane packet forwarder of the 201 | SD-Core. The example shows latency and jitter metrics over the last 202 | hour at one site, with three additional collapsed panels (PFCP Sessions 203 | and Messages) at the bottom. 204 | 205 | .. _fig-upf_dash: 206 | .. figure:: figures/upf_dash.png 207 | :width: 600px 208 | :align: center 209 | 210 | Custom dashboard showing latency and jitter metrics for UPF, the 211 | packet forwarding data plane of the SD-Core component. 212 | 213 | Briefly, a dashboard is constructed from a set of *panels*, where each 214 | panel has a well-defined *type* (e.g., graph, table, gauge, heatmap) 215 | bound to a particular Prometheus *query*. New dashboards are created 216 | using the Grafana GUI, and the resulting configuration is then saved in a 217 | JSON file. This configuration file is then committed to the Config 218 | Repo, and later loaded into Grafana whenever it is restarted as part 219 | of Lifecycle Management. For example, the following code snippet 220 | shows the Prometheus query corresponding to the ``Uptime`` panel 221 | in :numref:`Figure %s `. 222 | 223 | .. literalinclude:: code/uptime.yaml 224 | 225 | Note that this expression includes variables for the site (``$edge``) 226 | and the interval over which the uptime is computed (``$__interval``). 227 | 228 | 6.1.3 Defining Alerts 229 | ~~~~~~~~~~~~~~~~~~~~~ 230 | 231 | Alerts can be triggered in Prometheus when a component metric crosses 232 | some threshold. Alertmanager is a tool that then routes the alert to 233 | one or more receivers, such as an email address or Slack channel. 234 | 235 | An alert for a particular component is defined by an *alerting rule*, 236 | an expression involving a Prometheus query, such that whenever it 237 | evaluates to true for the indicated time period, it triggers a 238 | corresponding message to be routed to a set of receivers. These rules 239 | are recorded in a YAML file that is checked into the Config Repo and 240 | loaded into Prometheus. (Alternatively, Helm Charts for individual 241 | components can define rules via *Prometheus Rule* custom resources.) 242 | For example, the following code snippet shows the Prometheus Rule for 243 | two alerts, where the ``expr`` lines correspond to the respective 244 | queries submitted to Prometheus. 245 | 246 | .. literalinclude:: code/prometheus-rule.yaml 247 | 248 | In Aether, the Alertmanager is configured to send alerts with 249 | *critical* or *warning* severity to a general set of receivers. If it 250 | is desirable to route a specific alert to a different receiver (e.g., 251 | a Slack channel used by the developers for that particular component), 252 | the Alertmanager configuration is changed accordingly. 253 | 254 | 6.2 Logging 255 | ------------------ 256 | 257 | OS programmers have been writing diagnostic messages to a *syslog* 258 | since the earliest days of Unix.
Originally collected in a local file, 259 | the syslog abstraction has been adapted to cloud environments by 260 | adding a suite of scalable services. Today, one typical open source 261 | logging stack uses Fluentd to collect (aggregate, buffer, and route) 262 | log messages written by a set of components, with Fluentbit serving as a 263 | client-side agent running in each component to help developers 264 | normalize their log messages. ElasticSearch is then used to store, 265 | search, and analyze those messages, with Kibana used to display and 266 | visualize the results. The general flow of data is shown in 267 | :numref:`Figure %s `, using the main Aether subsystems as 268 | illustrative sources of log messages. 269 | 270 | .. _fig-log: 271 | .. figure:: figures/Slide23.png 272 | :width: 450px 273 | :align: center 274 | 275 | Flow of log messages through the Logging subsystem. 276 | 277 | .. _reading_logging: 278 | .. admonition:: Further Reading 279 | 280 | `Fluentd `__. 281 | 282 | `ElasticSearch 283 | `__. 284 | 285 | `Kibana `__. 286 | 287 | 6.2.1 Common Schema 288 | ~~~~~~~~~~~~~~~~~~~ 289 | 290 | The key challenge in logging is to adopt a uniform message format 291 | across all components, a requirement that is complicated by the fact 292 | that the various components integrated in a complex system are often 293 | developed independently of each other. Fluentbit plays a role in 294 | normalizing these messages by supporting a set of filters. These 295 | filters parse "raw" log messages written by the component (an ASCII 296 | string), and output "canonical" log messages as structured JSON. There 297 | are other options, but JSON is reasonably readable as text, which 298 | still matters for debugging by humans. It is also well-supported by 299 | tooling. 300 | 301 | For example, developers for the SD-Fabric component might 302 | write a log message that looks like this: 303 | 304 | .. literalinclude:: code/log.ascii 305 | 306 | which a Fluentbit filter transforms into a structure that looks like 307 | this: 308 | 309 | .. literalinclude:: code/log.json 310 | 311 | This example is simplified, but it does serve to illustrate the basic 312 | idea. It also highlights the challenge the DevOps team faces in 313 | building the management platform, which is to decide on a meaningful 314 | set of name/value pairs for the system as a whole. In other words, 315 | they must define a common schema for these structured log messages. 316 | The *Elastic Common Schema* is a good place to start that definition, 317 | but among other things, it will be necessary to establish the accepted 318 | set of log levels, and conventions for using each level. In Aether, 319 | for example, the log levels are: FATAL, ERROR, WARNING, INFO, and 320 | DEBUG. 321 | 322 | .. _reading_ecs: 323 | .. admonition:: Further Reading 324 | 325 | `Elastic Common Schema 326 | `__. 327 | 328 | 329 | 6.2.2 Best Practices 330 | ~~~~~~~~~~~~~~~~~~~~ 331 | 332 | Establishing a shared logging platform is, of course, of little value 333 | unless all the individual components are properly instrumented to 334 | write log messages. Programming languages typically come with library 335 | support for writing log messages (e.g., Java's log4j), but that's just 336 | a start. Logging is most effective if the components adhere to the 337 | following set of best practices.
338 | 339 | * **Log shipping is handled by the platform.** Components should 340 | assume that stdout/stderr is ingested into the logging system by 341 | Fluentbit (or similar tooling), and avoid making the job more 342 | complicated by trying to route their own logs. The exception is for 343 | external services and hardware devices that are outside the 344 | management platform's control. How these systems send their logs to 345 | a log aggregator must be established as a part of the deployment 346 | process. 347 | 348 | * **File logging should be disabled.** Writing log files directly to a 349 | container's layered file system has proven to be I/O inefficient and 350 | can become a performance bottleneck. It is also generally 351 | unnecessary if the logs are being sent to stdout/stderr. 352 | Generally, logging to a file is discouraged when a component runs in 353 | a container environment. Instead, components should stream all logs 354 | to the collecting system. 355 | 356 | * **Asynchronous logging is encouraged.** Synchronous logging can 357 | become a performance bottleneck in a scaled environment. Components 358 | should write logs asynchronously. 359 | 360 | * **Timestamps should be created by the program's logger.** Components 361 | should use the selected logging library to create timestamps, with 362 | as precise a timestamp as the logging framework allows. Relying on the 363 | shipper or logging handlers can be slower, or can record the time of 364 | receipt rather than the time of the event, which may be delayed. This makes 365 | it problematic to align events across multiple services after log aggregation. 366 | 367 | * **Log levels must be changeable without interrupting service.** 368 | Components should provide a mechanism to set the log level at 369 | startup, and an API that allows the log level to be changed at 370 | runtime. Scoping the log level based on specific subsystems is a 371 | useful feature, but not required. When a component is implemented by 372 | a suite of microservices, the logging configuration need only be 373 | applied to one instance for it to apply to all instances. 374 | 375 | 6.3 Distributed Tracing 376 | ------------------------- 377 | 378 | Execution traces are the third source of telemetry data. Tracing is 379 | challenging in a cloud setting because it involves following the flow 380 | of control for each user-initiated request across multiple 381 | microservices. The good news is that instrumenting a set of 382 | microservices involves activating tracing support in the underlying 383 | language runtime system—typically in the RPC stubs—rather than asking 384 | app developers to explicitly instrument their programs. 385 | 386 | The general pattern is similar to what we've already seen with metrics 387 | and logs: the running code is instrumented to produce data that is 388 | then collected, aggregated, stored, and made available for display and 389 | analysis. The main difference is the type of data we're interested in 390 | collecting, which, for tracing, is typically the sequence of API 391 | boundary crossings from one module to another. This data gives us 392 | the information we need to reconstruct the call chain. In principle, 393 | we could leverage the logging system to support tracing—and just be 394 | diligent in logging the necessary interface-crossing 395 | information—but it is a specialized enough use case to warrant its own 396 | vocabulary, abstractions, and mechanisms.
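To see what that logging-only approach implies, the following sketch (our own illustration, not code from Aether) logs the interface crossings for a single request by hand, propagating a correlation id between two services. Reconstructing a call chain from such messages is possible, but the id propagation, timing, and eventual correlation are all left to the application, which is exactly the burden that purpose-built tracing machinery takes on.

.. code-block:: python

   import logging
   import time
   import uuid

   logging.basicConfig(format="%(asctime)s %(message)s", level=logging.INFO)
   log = logging.getLogger("trace-by-logging")

   def service_b_handle(request_id, payload):
       # Service B logs its own "span" using the id it was handed.
       log.info("request_id=%s span=ServiceB event=start", request_id)
       result = payload.upper()          # stand-in for real work
       log.info("request_id=%s span=ServiceB event=end", request_id)
       return result

   def service_a_call(payload):
       # Service A creates the correlation id and must remember to pass it on.
       request_id = str(uuid.uuid4())
       log.info("request_id=%s span=ServiceA->ServiceB event=send", request_id)
       start = time.time()
       result = service_b_handle(request_id, payload)   # stand-in for an RPC
       log.info("request_id=%s span=ServiceA->ServiceB event=recv latency=%.6f",
                request_id, time.time() - start)
       return result

   service_a_call("hello")

Tracing frameworks fold this bookkeeping into the RPC layer, along with sampling and a shared data model, which is what the rest of this section describes.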
397 | 398 | At a high level, a *trace* is a description of a transaction as it 399 | moves through the system. It consists of a sequence of *spans* (each 400 | of which represents work done within a service) interleaved with a set 401 | of *span contexts* (each of which represents the state carried across 402 | the network from one service to another). An illustrative example of a 403 | trace is shown in :numref:`Figure %s `, but abstractly, a 404 | trace is a directed graph with nodes that correspond to spans and 405 | edges that correspond to span contexts. The nodes and edges are then 406 | timestamped and annotated with relevant facts (key/value tags) about 407 | the end-to-end execution path, including when and for how long it 408 | ran. Each span also includes timestamped log messages generated while 409 | the span was executing, simplifying the process of correlating log 410 | messages with traces. 411 | 412 | .. _fig-trace: 413 | .. figure:: figures/Slide26.png 414 | :width: 500px 415 | :align: center 416 | 417 | Example trace spanning two network services. 418 | 419 | Again, as with metrics and log messages, the details are important and 420 | those details are specified by an agreed-upon data model. The 421 | OpenTelemetry project is now defining one such model, building on the 422 | earlier OpenTracing project (which was in turn influenced by the 423 | Dapper distributed tracing mechanism developed by Google). Beyond the 424 | challenge of defining a model that captures the most relevant semantic 425 | information, there is the pragmatic issue of (1) minimizing the 426 | overhead of tracing so as not to negatively impact application 427 | performance, yet (2) extracting enough information from traces so as 428 | to make collecting it worthwhile. Sampling is a widely adopted 429 | technique introduced into the data collection pipeline to manage this 430 | trade-off. One consequence of these challenges is that distributed 431 | tracing is the subject of ongoing research, and we can expect the 432 | model definitions and sampling techniques to evolve and mature in the 433 | foreseeable future. 434 | 435 | .. _reading_tracing: 436 | .. admonition:: Further Reading 437 | 438 | B. Sigelman, *et al.* `Dapper, a Large-Scale Distributed Systems 439 | Tracing Infrastructure 440 | `__. 441 | Google Technical Report. April 2010. 442 | 443 | `OpenTelemetry: High-quality, ubiquitous, and portable telemetry to 444 | enable effective observability `__. 445 | 446 | `Jaeger: End-to-End Distributed Tracing 447 | `__. 448 | 449 | With respect to mechanisms, Jaeger is a widely used open source 450 | tracing tool originally developed by Uber. (Jaeger is not included in 451 | Aether, but was utilized in a predecessor edge cloud.) Jaeger 452 | includes instrumentation of the runtime system for the language(s) 453 | used to implement an application, a collector, storage, and a query 454 | language that can be used to diagnose performance problems and do root 455 | cause analysis. 456 | 457 | 6.4 Integrated Dashboards 458 | ------------------------- 459 | 460 | The metrics, logs and traces being generated by instrumented 461 | application software make it possible to collect a wealth of data 462 | about the health of a system. But this instrumentation is only useful 463 | if the right data is displayed to the right people (those with the 464 | ability to take action) at the right time (when action needs to be 465 | taken). 
Creating useful panels and organizing them into intuitive 466 | dashboards is part of the solution, but integrating information across 467 | the subsystems of the management platform is also a requirement. 468 | 469 | Unifying all this data is the ultimate objective of ongoing efforts 470 | like the OpenTelemetry project mentioned in the previous section, but 471 | there are also opportunities to use the tools described in this 472 | chapter to better integrate data. This section highlights two 473 | general strategies. 474 | 475 | First, both Kibana and Grafana can be configured to display telemetry 476 | data from multiple sources. For example, it is straightforward to 477 | integrate both logs and traces in Kibana. This is typically done by 478 | first feeding the tracing data into ElasticSearch, which Kibana then 479 | queries. Similarly, it is useful to have a convenient way to see the 480 | log messages associated with a particular component in the context of 481 | metrics that have been collected. This is easy to accomplish because 482 | Grafana can be configured to display data from ElasticSearch just as 483 | easily as from Prometheus. Both are data sources that can be 484 | queried. This makes it possible to create a Grafana dashboard that 485 | includes a selected set of log messages, similar to the one from 486 | Aether shown in :numref:`Figure %s `. In this example, 487 | we see INFO-level messages associated with the UPF sub-component of 488 | SD-Core, which augments the UPF performance data shown in 489 | :numref:`Figure %s `. 490 | 491 | .. _fig-es_dash: 492 | .. figure:: figures/es_dash.png 493 | :width: 600px 494 | :align: center 495 | 496 | Log messages associated with the UPF element of SD-Core, displayed 497 | in a Grafana dashboard. 498 | 499 | Second, the runtime control interface described in Chapter 5 provides 500 | a means to change various parameters of a running system, but to make 501 | informed decisions about what changes (if any) need to be 502 | made, it is necessary to have access to the right data. To this end, it 503 | is ideal to have access to both the "knobs" and the "dials" on an 504 | integrated dashboard. This can be accomplished by incorporating 505 | Grafana frames in the Runtime Control GUI, which, in its simplest form, 506 | displays a set of web forms corresponding to the fields in the 507 | underlying data models. (More sophisticated control panels are 508 | certainly possible.) 509 | 510 | .. _fig-dev_group: 511 | .. figure:: figures/gui1.png 512 | :width: 600px 513 | :align: center 514 | 515 | Example control dashboard showing the set of Device Groups defined 516 | for a fictional set of Aether sites. 517 | 518 | For example, :numref:`Figure %s ` shows the set 519 | of device groups for a fictional set of Aether sites, where clicking 520 | on the "Edit" button pops up a web form that lets the enterprise admin 521 | modify the corresponding fields of the `Device-Group` model (not 522 | shown), and clicking on the "Monitor" button pops up a 523 | Grafana-generated frame similar to the one shown in :numref:`Figure %s 524 | `. In principle, this frame is tailored to show only 525 | the most relevant information associated with the selected object. 526 | 527 | .. _fig-dev_monitor: 528 | .. figure:: figures/gui2.png 529 | :width: 600px 530 | :align: center 531 | 532 | Example monitoring frame associated with a selected Device Group.
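To make the "both are data sources that can be queried" point concrete, the following sketch pulls a UPF latency metric from Prometheus and the INFO-level UPF log messages from ElasticSearch for the same one-hour window, using their standard HTTP APIs. The endpoints, metric name, index pattern, and field names are placeholders chosen for illustration; a Grafana panel issues essentially the same two kinds of queries on the operator's behalf.

.. code-block:: python

   import requests
   from datetime import datetime, timedelta, timezone

   PROM = "http://prometheus.example.com"         # placeholder endpoints
   ES = "http://elasticsearch.example.com:9200"

   end = datetime.now(timezone.utc)
   start = end - timedelta(hours=1)

   # Metric side: a range query against Prometheus (hypothetical metric name).
   metrics = requests.get(f"{PROM}/api/v1/query_range", params={
       "query": "avg(upf_packet_latency_microseconds)",
       "start": start.timestamp(),
       "end": end.timestamp(),
       "step": "60s",
   }).json()["data"]["result"]

   # Log side: a search against ElasticSearch (hypothetical index and fields).
   logs = requests.post(f"{ES}/logstash-*/_search", json={
       "query": {"bool": {"must": [
           {"match": {"component": "upf"}},
           {"match": {"level": "INFO"}},
           {"range": {"@timestamp": {"gte": start.isoformat(), "lte": end.isoformat()}}},
       ]}},
       "sort": [{"@timestamp": {"order": "asc"}}],
       "size": 100,
   }).json()["hits"]["hits"]

   print(f"{len(metrics)} metric series and {len(logs)} log entries for the last hour")

Binding one panel to the first query and an adjacent panel to the second is all it takes to put the UPF's metrics and its log messages side by side on a single dashboard.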
533 | 534 | 6.5 Observability 535 | ----------------- 536 | 537 | Knowing what telemetry data to collect, so you have exactly the right 538 | information when you need it, but doing so without negatively 539 | impacting system performance, is a difficult problem. *Observability* 540 | is a relatively new term being used to describe this general problem 541 | space, and while the term can be dismissed as the latest marketing 542 | buzzword (which it is), it can also be interpreted as another of the 543 | set of *"-ities"* that all good systems aspire to, alongside 544 | scalability, reliability, availability, security, usability, and so 545 | on. Observability is the quality of a system that makes visible the 546 | facts about its internal operation needed to make informed management 547 | and control decisions. It has become a fertile space for innovation, 548 | and so we conclude this chapter with two examples that may become 549 | commonplace in the near future. 550 | 551 | The first is *Inband Network Telemetry (INT)*, which takes advantage 552 | of programmable switching hardware to allow operators to ask new 553 | questions about how packets are being processed "in-band", as they 554 | flow through the network. This is in contrast to either depending on 555 | the predefined set of counters hardwired into fixed-function network 556 | devices, or being able to inspect just a sampled subset of packets. 557 | Because Aether uses programmable switches as the foundation for its 558 | SDN-based switching fabric, it is able to use INT as a fourth type of 559 | telemetry data, and in doing so provide qualitatively deeper insights 560 | into traffic patterns and the root causes of network failures. 561 | 562 | For example, INT has been used to measure and record queuing delay 563 | individual packets experience while traversing a sequence of switches 564 | along an end-to-end path, making it possible to detect *microbursts* 565 | (queuing delays measured over millisecond or even sub-millisecond time 566 | scales). It is even possible to correlate this information across 567 | packet flows that followed different routes, so as to determine which 568 | flows shared buffer capacity at each switch. As another example, INT 569 | has been used to record the decision making process that directed how 570 | packets are delivered, that is, which forwarding rules were applied at 571 | each switch along the end-to-end path. This opens the door to using 572 | INT to verify that the data plane is faithfully executing the 573 | forwarding behavior the network operator intends. For more information 574 | about INT, we refer the reader to our companion SDN book. 575 | 576 | .. _reading_int: 577 | .. admonition:: Further Reading 578 | 579 | L. Peterson, *et al.* `Software-Defined Networking: A Systems Approach 580 | `__. November 2021. 581 | 582 | The second is the emergence of *Service Meshes* mentioned in 583 | Chapter 1. A Service Mesh framework such as Istio provides a means to 584 | enforce fine-grained security policies and collect telemetry data in 585 | cloud native applications by injecting "observation/enforcement 586 | points" between microservices. These injection points, called 587 | *sidecars*, are typically implemented by a container that "runs 588 | alongside" the containers that implement each microservice, with all 589 | RPC calls from Service A to Service B passing through their associated 590 | sidecars. 
As shown in :numref:`Figure %s `, these sidecars 591 | then implement whatever policies the operator wants to impose on the 592 | application, sending telemetry data to a global collector and 593 | receiving security directives from a global policy engine. 594 | 595 | .. _fig-mesh: 596 | .. figure:: figures/Slide27.png 597 | :width: 300px 598 | :align: center 599 | 600 | Overview of a Service Mesh framework, with sidecars intercepting 601 | messages flowing between Services A and B. Each sidecar enforces 602 | security policy received from the central controller and sends 603 | telemetry data to the central controller. 604 | 605 | From the perspective of observability, sidecars can be programmed to 606 | record whatever information operators might want to collect, and in 607 | principle, they can even be dynamically updated as conditions warrant. 608 | This provides a general way for operators to define how the system is 609 | observed without having to rely on instrumentation developers might 610 | include in their services. The downside is that sidecars impose a 611 | nontrivial amount of overhead on inter-service communication. For that 612 | reason, alternative approaches to sidecars are gaining traction, 613 | notably Cilium, which uses eBPF (extended Berkeley Packet Filters) to 614 | implement observability, security and networking data plane features 615 | inside the kernel rather than in a sidecar. 616 | 617 | For more information about the Istio Service Mesh, we recommend 618 | Calcote and Butcher's book. The Cilium project has extensive 619 | documentation and tutorials at its web site. 620 | 621 | .. _reading_mesh: 622 | .. admonition:: Further Reading 623 | 624 | L. Calcote and Z. Butcher `Istio: Up and Running 625 | `__. October 2019. 626 | 627 | `Cilium: eBPF-based Networking, Observability, Security `__. 628 | 629 | 630 | -------------------------------------------------------------------------------- /preface.rst: -------------------------------------------------------------------------------- 1 | Preface 2 | ======= 3 | 4 | The cloud is ubiquitous. Everyone uses the cloud to either access or 5 | deliver services, but not everyone will build and operate a cloud. So 6 | why should anyone care about how to turn a pile of servers and 7 | switches into a 24/7 service delivery platform? That's what Google, 8 | Microsoft, Amazon and the other cloud providers do for us, and they do 9 | a perfectly good job of it. 10 | 11 | The answer, we believe, is that the cloud is becoming ubiquitous in 12 | another way, as distributed applications increasingly run not just in 13 | large, central datacenters but at the edge. As applications are 14 | disaggregated, the cloud is expanding from hundreds of datacenters to 15 | tens of thousands of enterprises. And while it is clear that the 16 | commodity cloud providers are eager to manage those edge clouds as a 17 | logical extension of their datacenters, they do not have a monopoly on 18 | the know-how for making that happen. 19 | 20 | At the same time edge applications are moving to the forefront, 21 | increasing importance is also being placed on *digital sovereignty*, 22 | the ability of countries and organizations to control their destiny and 23 | their data. Cloud technology is important for running today's 24 | workloads, but access to that technology does not necessarily have to 25 | be bundled with outsourcing operational control. 
26 | 27 | This book lays out a roadmap that a small team of engineers followed 28 | over the course of a year to stand up and operationalize an edge cloud 29 | and then operate it 24/7. This edge cloud spans a dozen 30 | enterprises, and hosts a non-trivial cloud native service—5G 31 | connectivity in our case, but that’s just an example. The team was 32 | able to do this by leveraging 20+ open source components, but 33 | selecting those components is just a start. There were dozens of 34 | technical decisions to make along the way, and a few thousand lines of 35 | configuration code to write. We believe this is a repeatable exercise, 36 | which we report in this book. The code for those configuration 37 | files is open source, for those who want to pursue the topic in more 38 | detail. 39 | 40 | What do we mean by an edge cloud? We're drawing a distinction between 41 | clouds run by the hyperscale cloud providers in their massive data 42 | centers, which we think of as the core, and those run by enterprises 43 | (or managed for them) at the edge. The edge is where the real, physical 44 | world meets the cloud. For example, it is the place where data from 45 | sensors is likely to be gathered and processed, and where services 46 | that need to be close to the end user for reasons of latency or 47 | bandwidth are delivered. 48 | 49 | Our roadmap may not be the right one for all circumstances, but it 50 | does shine a light on the fundamental challenges and trade-offs 51 | involved in operationalizing a cloud. As we can attest based on our 52 | experience, it’s a complicated design space with an overabundance of 53 | terminology and storylines to untangle. 54 | 55 | Intended Audience 56 | ------------------ 57 | 58 | We hope this book makes valuable reading for anyone who is trying to 59 | stand up and operationalize their own edge cloud infrastructure, but 60 | we also aim to provide useful information for at least two other broad 61 | groups. 62 | 63 | First, there will be a set of readers who need to evaluate the 64 | options available, particularly to decide between using the cloud 65 | services offered by one of the hyperscalers or building their own edge 66 | cloud (or some combination of these). We hope to demystify the 67 | landscape of edge clouds for this audience to help inform those 68 | decisions. 69 | 70 | Secondly, there will be a group of application and service 71 | developers who need to build on top of whatever cloud infrastructure 72 | their organization has chosen to use. We believe it is important for 73 | these developers to understand what goes on "under the hood" of the 74 | cloud at least at a high level, so that they can make their 75 | applications manageable and reliable. There is increasingly close 76 | interaction between developers and operators (as evidenced by the 77 | DevOps movement) and we aim to facilitate that collaboration. Topics 78 | such as monitoring and observability are particularly important for 79 | this audience. 80 | 81 | Guided Tour of Open Source 82 | -------------------------- 83 | 84 | The good news is that there is a wealth of open source components that 85 | can be assembled to help manage cloud platforms and scalable 86 | applications built on those platforms. That's also the bad news. 
With 87 | several dozen cloud-related projects available at open source 88 | consortia such as the Linux Foundation, Cloud Native Computing 89 | Foundation, and Apache Foundation, navigating the project space is one 90 | of the biggest challenges we faced in putting together a cloud 91 | management platform. This is in large part because these projects are 92 | competing for mindshare, with both significant overlap in the 93 | functionality they offer and dependencies on each other. 94 | 95 | One way to read this book is as a guided tour of the open source 96 | landscape for cloud control and management. And in that spirit, we do 97 | not replicate the excellent documentation those projects already 98 | provide, but instead include links to project-specific documentation 99 | (which often includes tutorials that we encourage you to try). We also 100 | include snippets of code from those projects, but these examples are 101 | chosen to help solidify the main points we're trying to make about the 102 | management platform as a whole; they should not be interpreted as an 103 | attempt to document the inner working of the individual projects. Our 104 | goal is to explain how the various puzzle pieces fit together to build 105 | an end-to-end management system, and in doing so, identify both 106 | various tools that help and the hard problems that no amount of 107 | tooling can eliminate. 108 | 109 | It should come as no surprise that there are challenging technical 110 | issues to address (despite marketing claims to the contrary). After 111 | all, how to operationalize a computing system is a question that’s as 112 | old as the field of *Operating Systems*. Operationalizing a cloud is 113 | just today’s version of that fundamental problem, which has become all 114 | the more interesting as we move up the stack, from managing *devices* 115 | to managing *services*. This topic is both timely and 116 | foundational. 117 | 118 | Acknowledgements 119 | ------------------ 120 | 121 | *Aether*, the example edge cloud this book uses to illustrate how to 122 | operationalize a cloud, was built by the Open Networking Foundation 123 | (ONF) engineering team and the open source community that worked with 124 | them. We acknowledge their contributions, with a special thank-you to 125 | Hyunsun Moon, Sean Condon, and HungWei Chiu for their significant 126 | contributions to Aether's control and management platform, and to Oguz 127 | Sunay for his influence on Aether's overall design. Suchitra Vemuri's 128 | insights into testing and quality assurance were also invaluable. 129 | 130 | The ONF is no longer active, but Aether continues as an open source 131 | project of the Linux Foundation. Visit https://aetherproject.org to 132 | learn about the ongoing project. We will also happily accept feedback 133 | to this book. Please send us your comments using the `Issues Link 134 | `__, or submit a Pull 135 | Request with suggested changes. 136 | 137 | | Larry Peterson, Scott Baker, Andy Bavier, Zack Williams, and Bruce Davie 138 | | June 2025 139 | 140 | -------------------------------------------------------------------------------- /print.rst: -------------------------------------------------------------------------------- 1 | .. role:: pop 2 | 3 | :pop:`Print Copies` 4 | =========================== 5 | 6 | We make all books in the *Systems Approach* series available as both 7 | print and e-books. 
This book is available via Amazon: `Edge Cloud Operations: A Systems Approach `__ 8 | 9 | `Book Series: `__ Also check out 10 | our companion books that cover networking and emerging topics in more depth. 11 | 12 | * `Computer Networks: A Systems Approach `__ 13 | 14 | * `Private 5G: A Systems Approach `__ 15 | 16 | * `Software-Defined Networks: A Systems Approach `__ 17 | 18 | * `TCP Congestion Control: A Systems Approach `__ 19 | 20 | .. * `Edge Cloud Operations: A Systems Approach `__ 21 | 22 | As participants in the Amazon Associate program we may earn income from qualifying purchases using the links above. 23 | -------------------------------------------------------------------------------- /provision.rst: -------------------------------------------------------------------------------- 1 | Chapter 3: Resource Provisioning 2 | ================================= 3 | 4 | Resource Provisioning is the process of bringing virtual and physical 5 | resources online. It has both a hands-on component (racking and 6 | connecting devices) and a bootstrap component (configuring how the 7 | resources boot into a "ready" state). Resource Provisioning happens 8 | when a cloud deployment is first installed—i.e., an initial set of 9 | resources are provisioned—but also incrementally over time as new 10 | resources are added, obsolete resources are removed, and out-of-date 11 | resources are upgraded. 12 | 13 | The goal of Resource Provisioning is to be zero-touch, which is 14 | impossible for hardware resources because it includes an intrinsically 15 | manual step. (We take up the issue of provisioning virtual resources 16 | in a moment.) Realistically, the goal is to minimize the number and 17 | complexity of configuration steps required beyond physically 18 | connecting the device, keeping in mind that we are starting with 19 | commodity hardware received directly from a vendor, and not a 20 | plug-and-play appliance that has already been prepped. 21 | 22 | When a cloud is built from virtual resources (e.g., VMs instantiated 23 | on a commercial cloud) the "rack and connect" step is carried out by a 24 | sequence of API calls rather than a hands-on technician. Of course, 25 | we want to automate the sequence of calls needed to activate virtual 26 | infrastructure, which has inspired an approach known as 27 | *Infrastructure-as-Code*, a special case of the 28 | *Configuration-as-Code* concept introduced in Chapter 2. The general 29 | idea is to document, in a declarative format that can be "executed", 30 | exactly what our infrastructure is to look like; how it is to be 31 | configured. Aether uses Terraform as its approach to 32 | Infrastructure-as-Code. 33 | 34 | When a cloud is built from a combination of virtual and physical 35 | resources, as is the case for a hybrid cloud like Aether, we need a 36 | seamless way to accommodate both. To this end, our approach is to 37 | first overlay a *logical structure* on top of hardware resources, 38 | making them roughly equivalent to the virtual resources we get from a 39 | commercial cloud provider. This results in a hybrid scenario similar 40 | to the one shown in :numref:`Figure %s `. NetBox is 41 | our open source solution for layering this logical structure on top of 42 | physical hardware. NetBox also helps us address the requirement of 43 | tracking physical inventory. 44 | 45 | .. _fig-infra: 46 | .. 
figure:: figures/Slide19.png 47 | :width: 450px 48 | :align: center 49 | 50 | Resource Provisioning in a hybrid cloud that includes both 51 | physical and virtual resources. 52 | 53 | Note that the Provisioning API shown on the right in :numref:`Figure 54 | %s ` is *not* the NetBox API. Terraform does not interact 55 | directly with NetBox, but instead with artifacts left behind by the 56 | hardware provisioning process described in Section 3.1. One way to 57 | think about this is that the task of booting hardware into the "ready" 58 | state involves installing and configuring several subsystems that 59 | collectively form the cloud platform. It is this platform that 60 | Terraform interacts with, using an API we describe at the end of 61 | Section 3.1. 62 | 63 | This chapter describes both sides of :numref:`Figure %s ` 64 | starting with provisioning physical infrastructure. Our approach is to 65 | focus on the challenge of provisioning an entire site the first time. 66 | We comment on the simpler problem of incrementally provisioning 67 | individual resources as relevant details emerge. 68 | 69 | 3.1 Physical Infrastructure 70 | --------------------------- 71 | 72 | The process of stacking and racking hardware is inherently 73 | human-intensive, and includes considerations such as airflow and cable 74 | management. These issues are beyond the scope of this book. We focus 75 | instead on the "physical/virtual" boundary, which starts with the 76 | cabling plan that a hands-on technician uses as a blueprint. The 77 | details of such a plan are highly deployment-specific, but we use the 78 | example shown in :numref:`Figure %s ` to help 79 | illustrate all the steps involved. The example is based on Aether 80 | clusters deployed in enterprises, which serves to highlight the 81 | required level of specificity. Considerable planning is required to 82 | specify an appropriate *Bill of Materials (BOM)*, including details 83 | about individual device models, but this aspect of the problem is also 84 | outside our scope. 85 | 86 | .. _fig-cable_plan: 87 | .. figure:: figures/pronto_logical_diagram.png 88 | :width: 700px 89 | :align: center 90 | 91 | Example network cable plan for an edge cluster. 92 | 93 | The blueprint shown in :numref:`Figure %s ` actually 94 | includes two logical clusters sharing a Management Switch and a 95 | Management Server. The upper cluster corresponds to a production 96 | deployment, and includes five servers and a 2x2 leaf-spine switching 97 | fabric. The lower cluster is for development, and includes two servers 98 | and a single switch. Defining such logical groupings of hardware 99 | resources is not unique to Aether; we can ask a commercial cloud 100 | provider to provision multiple logical clusters, so being able to do 101 | the same on physical resources is a natural requirement. 102 | 103 | In addition to following this blueprint, the technician also enters 104 | various facts about the physical infrastructure into a database. This 105 | information, which is used in later provisioning steps, is where we 106 | pick up the story. 107 | 108 | 3.1.1 Document Infrastructure 109 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 110 | 111 | Documenting the physical infrastructure's logical structure in a 112 | database is how we cross the physical-to-virtual divide. 
It involves 113 | both defining a set of models for the information being collected 114 | (this schema effectively represents the logical structure shown in 115 | :numref:`Figure %s `), and entering the corresponding facts 116 | about the physical devices. This process is familiar to anyone who is 117 | responsible for managing a network of devices, whether it is the first 118 | stage in a larger automated framework (such as the one described in 119 | this book) or simply a place to record what IP address has been 120 | assigned to each network appliance. 121 | 122 | There are several open source tools available for this task. Our choice 123 | is NetBox. It supports IP address management (IPAM); inventory-related 124 | information about types of devices and where they are installed; how 125 | infrastructure is organized (racked) by group and site; and how 126 | devices are connected to consoles, networks, and power sources. More 127 | information is readily available on the NetBox web site. 128 | 129 | .. _reading_netbox: 130 | .. admonition:: Further Reading 131 | 132 | `NetBox: `_ Information Resource Modeling 133 | Application. 134 | 135 | One of the key features of NetBox is the ability to customize the set 136 | of models used to organize all the information that is collected. For 137 | example, an operator can define physical groupings like *Rack* and 138 | *Site*, but also logical groupings like *Organization* and 139 | *Deployment*.\ [#]_ In the following we use the Aether cable plan shown in 140 | :numref:`Figure %s ` as an illustrative example, 141 | focusing on what happens when provisioning a single Aether site (but 142 | keeping in mind that Aether spans multiple sites, as outlined in 143 | Chapter 2). 144 | 145 | .. [#] In this section, we denote models and model fields in italics 146 | (e.g., *Site*, *Address*) and specific values assigned to an 147 | instance of a model as a constant (e.g., ``10.0.0.0/22``). 148 | 149 | The first step is to create a record for the site being provisioned, 150 | and document all the relevant metadata for that site. This includes 151 | the *Name* and *Location* of the *Site*, along with the *Organization* 152 | the site belongs to. An *Organization* can have more than one *Site*, 153 | while a *Site* can (a) span one or more *Racks*, and (b) host one or 154 | more *Deployments*. A *Deployment* is a logical cluster, 155 | corresponding, for example, to ``Production``, ``Staging``, and 156 | ``Development``. The cabling plan shown in :numref:`Figure %s 157 | ` includes two such deployments. 158 | 159 | This is also the time to specify the VLANs and IP Prefixes assigned to 160 | this particular edge deployment. Because it is important to maintain a 161 | clear relationship between VLANs, IP Prefixes, and DNS names (the last 162 | of which are auto-generated), it is helpful to walk through the 163 | following concrete example. We start with the minimal set of VLANs 164 | needed per Site: 165 | 166 | * ADMIN 1 167 | * UPLINK 10 168 | * MGMT 800 169 | * FABRIC 801 170 | 171 | These are Aether-specific, but they illustrate the set of VLANs a 172 | cluster might need. Minimally, one would expect to see a "management" 173 | network (MGMT in this example) and a "data" network (FABRIC in this 174 | example) in any cluster. Also specific to Aether (but generally 175 | applicable), if there are multiple Deployments at a Site sharing a 176 | single management server, additional VLANs (incremented by 10 for 177 | MGMT/FABRIC) are added. 
For example, a second ``Development`` 178 | deployment might define: 179 | 180 | * DEVMGMT 810 181 | * DEVFABRIC 811 182 | 183 | IP Prefixes are then associated with VLANs, with all edge IP prefixes 184 | fitting into a ``/22`` sized block. This block is then partitioned in 185 | a way that works in concert with how DNS names are managed; i.e., 186 | names are generated by combining the first ```` component of 187 | the *Device* names (see below) with this suffix. Using ``10.0.0.0/22`` 188 | as an example, there are four edge prefixes, with the following 189 | purposes: 190 | 191 | * ADMIN Prefix ``10.0.0.0/25`` (for IPMI) 192 | 193 | * Has the Management Server and Management Switch 194 | * Assign the ADMIN 1 VLAN 195 | * Set domain to ``admin...aetherproject.net`` 196 | 197 | * MGMT Prefix ``10.0.0.128/25`` (for infrastructure control plane) 198 | 199 | * Has the Server Management plane, Fabric Switch Management 200 | * Assign MGMT 800 VLAN 201 | * Set domain to ``mgmt...aetherproject.net`` 202 | 203 | * FABRIC Prefix ``10.0.1.0/25`` (for infrastructure data plane) 204 | 205 | * IP addresses of the ``qsfp0`` port of the Compute Nodes to Fabric 206 | switches, plus other Fabric-connected devices (e.g., eNB) 207 | * Assign FABRIC 801 VLAN 208 | * Set domain to ``fab1...aetherproject.net`` 209 | 210 | * FABRIC Prefix ``10.0.1.128/25`` (for infrastructure data plane) 211 | 212 | * IP addresses of the ``qsfp1`` port of the Compute Nodes to fabric switches 213 | * Assign FABRIC 801 VLAN 214 | * Set domain to ``fab2...aetherproject.net`` 215 | 216 | There are other edge prefixes used by Kubernetes, but they do not need 217 | to be created in NetBox. Note that ``qsfp0`` and ``qsfp1`` in this 218 | example denote transceiver ports connecting the switching fabric, 219 | where *QSFP* stands for Quad (4-channel) Small Form-factor Pluggable. 220 | 221 | With this site-wide information recorded, the next step is to install 222 | and document each *Device*. This includes entering a ````, 223 | which is subsequently used to generate a fully qualified domain name 224 | for the device: ``...aetherproject.net``. 225 | The following fields are also filled in when creating a Device: 226 | 227 | * Site 228 | * Rack & Rack Position 229 | * Manufacturer 230 | * Model 231 | * Serial number 232 | * Device Type 233 | * MAC Addresses 234 | 235 | Note there is typically both a primary and a management (e.g., BMC/IPMI) 236 | interface. One convenience feature of NetBox is to use the *Device Type* as a 237 | template that sets the default naming of interfaces, power connections, and 238 | other equipment model specific attributes. 239 | 240 | Finally, the virtual interfaces for the Device must be specified, with its 241 | *Label* field set to the physical network interface that it is assigned. IP 242 | addresses are then assigned to the physical and virtual interfaces we have 243 | defined. 
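As a sketch of how one of these records might be entered programmatically rather than through the NetBox GUI (assuming the ``pynetbox`` client; the token, slugs, and interface type below are illustrative, and field names vary somewhat across NetBox releases), a compute server, its management interface, and the interface's IP address could be created as follows:

.. code-block:: python

   import pynetbox

   nb = pynetbox.api("https://netbox.example.com", token="PLACEHOLDER_TOKEN")

   # Create the Device record (slugs below are illustrative; in older
   # NetBox releases the "role" field is named "device_role").
   device = nb.dcim.devices.create(
       name="node1",
       site={"slug": "site-x"},
       role={"slug": "server"},
       device_type={"slug": "compute-server"},
       serial="ABC1234",
   )

   # Add its management interface, then assign an address to it.
   iface = nb.dcim.interfaces.create(device=device.id, name="eth0", type="1000base-t")
   nb.ipam.ip_addresses.create(
       address="10.0.0.132/25",
       assigned_object_type="dcim.interface",
       assigned_object_id=iface.id,
   )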
The Management Server should always have the first IP address within 244 | each prefix, and by convention they are assigned incrementally as follows: 245 | 246 | * Management Server 247 | 248 | * ``eno1`` - site provided public IP address, or blank if DHCP provided 249 | * ``eno2`` - 10.0.0.1/25 (first of ADMIN) - set as primary IP 250 | * ``bmc`` - 10.0.0.2/25 (next of ADMIN) 251 | * ``mgmt800`` - 10.0.0.129/25 (first of MGMT, on VLAN 800) 252 | * ``fab801`` - 10.0.1.1/25 (first of FABRIC, on VLAN 801) 253 | 254 | * Management Switch 255 | 256 | * ``gbe1`` - 10.0.0.3/25 (next of ADMIN) - set as primary IP 257 | 258 | * Fabric Switch 259 | 260 | * ``eth0`` - 10.0.0.130/25 (next of MGMT), set as primary IP 261 | * ``bmc`` - 10.0.0.131/25 262 | 263 | * Compute Server 264 | 265 | * ``eth0`` - 10.0.0.132/25 (next of MGMT), set as primary IP 266 | * ``bmc`` - 10.0.0.4/25 (next of ADMIN) 267 | * ``qsfp0`` - 10.0.1.2/25 (next of FABRIC) 268 | * ``qsfp1`` - 10.0.1.3/25 269 | 270 | * Other Fabric devices (eNB, etc.) 271 | 272 | * ``eth0`` or other primary interface - 10.0.1.4/25 (next of FABRIC) 273 | 274 | Once this data is entered into NetBox, it can be used to generate a 275 | rack diagram, similar to the one shown in :numref:`Figure %s 276 | `, corresponding to the cabling diagram shown in 277 | :numref:`Figure %s `. Note that the diagram shows two 278 | logical *Deployments* (``Production`` and ``Development``), co-located 279 | in one physical rack. 280 | 281 | .. _fig-rack_diagram: 282 | .. figure:: figures/rack_diagram.png 283 | :width: 500px 284 | :align: center 285 | 286 | NetBox rendering of rack configuration. 287 | 288 | It is also possible to generate other useful specifications for the 289 | deployment, helping the technician confirm the recorded logical specification 290 | matches the actual physical representation. For example, 291 | :numref:`Figure %s ` shows the set of cables and how 292 | they connect the set of hardware in our example deployment. 293 | 294 | .. _fig-cable_list: 295 | .. figure:: figures/cable_list.png 296 | :width: 700px 297 | :align: center 298 | 299 | NetBox report of cabling. 300 | 301 | If all of this seems like a tedious amount of detail, then you get the 302 | main point of this section. Everything about automating the control 303 | and management of a cloud hinges on having complete and accurate data 304 | about its resources. Keeping this information in sync with the reality 305 | of the physical infrastructure is often the weakest link in this 306 | process. The only saving grace is that the information is highly 307 | structured, and tools like NetBox help us codify this structure. 308 | 309 | 3.1.2 Configure and Boot 310 | ~~~~~~~~~~~~~~~~~~~~~~~~ 311 | 312 | After installing the hardware and recording the relevant facts about 313 | the installation, the next step is to configure and boot the hardware 314 | so that it is "ready" for the automated procedures that follow. The 315 | goal is to minimize manual configuration required to onboard physical 316 | infrastructure like that shown in :numref:`Figure %s 317 | `, but *zero-touch* is a high bar. To illustrate, the 318 | bootstrapping steps needed to complete provisioning for our example 319 | deployment include: 320 | 321 | * Configure the Management Switch to know the set of VLANs being 322 | used. 323 | 324 | * Configure the Management Server so it boots from a provided USB key. 325 | 326 | * Run Ansible playbooks needed to complete configuration 327 | onto the Management Server. 
328 | 329 | * Configure the Compute Servers so they boot from the Management 330 | Server (via iPXE). 331 | 332 | * Configure the Fabric Switches so they boot from the Management 333 | Server (via Nginx). 334 | 335 | * Configure the eNBs (mobile base stations) so they know their IP 336 | addresses. 337 | 338 | These are all manual configuration steps, requiring either console 339 | access or entering information into a device web interface, such that 340 | any subsequent configuration steps can be both fully automated and 341 | resilient. Note that while these steps cannot be automated away, they 342 | do not necessarily have to be performed in the field; hardware shipped 343 | to a remote site can first be prepped accordingly. Also note that care 344 | should be taken to *not* overload this step with configuration that 345 | can be done later. For example, various radio parameters can be set on 346 | the eNBs when they are physically installed, but those parameters will 347 | become settable through the Management Platform once the cluster is 348 | brought online. 349 | 350 | Manual configuration work done at this stage should be minimized, and 351 | most systems should use automated means of configuration. For example, 352 | using DHCP pervasively with MAC reservations for IP address assignment 353 | instead of manual configuration of each interface allows for 354 | management to be zero-touch and simplifies future reconfiguration. 355 | 356 | The automated aspects of configuration are implemented as a set of 357 | Ansible *roles* and *playbooks*, which in terms of the high-level 358 | overview shown in :numref:`Figure %s ` of Chapter 2, 359 | corresponds to the box representing the *"Zero-Touch Provision 360 | (System)"*. Said another way, there is no off-the-shelf ZTP solution 361 | we can use (i.e., someone has to write the playbooks), but the problem 362 | is greatly simplified by having access to all the configuration 363 | parameters that NetBox maintains. 364 | 365 | The general idea is as follows. For every network service (e.g., DNS, 366 | DHCP, iPXE, Nginx) and every per-device subsystem (e.g., network 367 | interfaces, Docker) that needs to be configured, there is a 368 | corresponding Ansible role (set of related playbooks). These 369 | configurations are applied to the Management Server during the manual 370 | configuration stage summarized above, once the management network is 371 | online. 372 | 373 | The Ansible playbooks install and configure the network services on the 374 | Management Server. The roles of DNS and DHCP are obvious. As for iPXE and Nginx, 375 | they are used to bootstrap the rest of the infrastructure. The compute servers 376 | are configured by iPXE delivered over DHCP/TFTP, and then load the scripted OS 377 | installation from an Nginx web server. The fabric switches load their 378 | Stratum OS package from Nginx. 379 | 380 | In many cases, the playbooks use parameters—such as VLANs, IP 381 | addresses, DNS names, and so on—extracted from NetBox. :numref:`Figure 382 | %s ` illustrates the approach, and fills in a few 383 | details. For example, a home-grown Python program (``edgeconfig.py``) 384 | extracts data from NetBox using the REST API and outputs a corresponding 385 | set of YAML files, crafted to serve as input to Ansible, which creates yet 386 | more configuration on the management and compute systems. One example of this 387 | is the *Netplan* file, which is used in Ubuntu to manage network interfaces.
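We do not reproduce ``edgeconfig.py`` here, but the following sketch conveys the general pattern under the same assumptions as before (the ``pynetbox`` client, a placeholder endpoint and site slug, and an output structure of our own choosing): query NetBox for the addresses recorded for a site, and emit a YAML variables file for Ansible to consume.

.. code-block:: python

   import pynetbox
   import yaml

   nb = pynetbox.api("https://netbox.example.com", token="PLACEHOLDER_TOKEN")

   site = "site-x"                      # illustrative site slug
   addresses = {}
   for device in nb.dcim.devices.filter(site=site):
       addresses[str(device.name)] = [
           str(ip.address) for ip in nb.ipam.ip_addresses.filter(device_id=device.id)
       ]

   # Write an Ansible variables file; the structure here is ours, not Aether's.
   with open(f"{site}-vars.yaml", "w") as f:
       yaml.safe_dump({"site": site, "addresses": addresses}, f, default_flow_style=False)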
388 | More information about Ansible and Netplan can be found on their respective web 389 | sites. 390 | 391 | .. _reading_ansible: 392 | .. admonition:: Further Reading 393 | 394 | `Ansible: `_ Automation Platform. 395 | 396 | `Netplan: `_ Network Configuration Abstraction Renderer. 397 | 398 | .. _fig-ansible: 399 | .. figure:: figures/Slide20.png 400 | :width: 550px 401 | :align: center 402 | 403 | Configuring network services and OS-level subsystems using NetBox data. 404 | 405 | While :numref:`Figure %s ` highlights how Ansible is 406 | paired with Netplan to configure kernel-level details, there is also 407 | an Ansible playbook that installs Docker on each compute server and 408 | fabric switch, and then launches a Docker container running a 409 | "finalize" image. This image makes calls into the next layer of the 410 | provisioning stack, effectively signaling that the cluster is running and 411 | ready for further instructions. We are now ready to describe that next 412 | layer of the stack. 413 | 414 | 415 | 3.1.3 Provisioning API 416 | ~~~~~~~~~~~~~~~~~~~~~~~~ 417 | 418 | As a result of the steps described so far, we can assume each server 419 | and switch is up and running, but we still have a little work to do to 420 | prepare our bare-metal clusters for the next layer in the provisioning 421 | stack, essentially establishing parity between the left- and 422 | right-hand sides of the hybrid cloud shown in :numref:`Figure %s 423 | `. If you ask yourself *"What would Google do?"* this 424 | reduces to the task of setting up a GCP-like API for the bare-metal 425 | edge clouds. This API primarily subsumes the Kubernetes API, but it 426 | goes beyond providing a way to *use* Kubernetes to also include calls 427 | to *manage* Kubernetes. 428 | 429 | In short, this "manage Kubernetes" task is to turn a set of 430 | interconnected servers and switches into a fully-instantiated 431 | Kubernetes cluster. For starters, the API needs to provide a means to 432 | install and configure Kubernetes on each physical cluster. This 433 | includes specifying which version of Kubernetes to run, selecting the 434 | right combination of Container Network Interface (CNI) plugins 435 | (virtual network adaptors), and connecting Kubernetes to the local 436 | network (and any VPNs it might need). This layer also needs to provide 437 | a means to set up accounts (and associated credentials) for accessing 438 | and using each Kubernetes cluster, and a way to manage 439 | independent projects that are to be deployed on a given cluster (i.e., 440 | manage namespaces for multiple applications). 441 | 442 | As an example, Aether uses Rancher to manage Kubernetes on 443 | the bare-metal clusters, with one centralized instance of Rancher 444 | being responsible for managing all the edge sites. This results in the 445 | configuration shown in :numref:`Figure %s `, which to 446 | emphasize Rancher's scope, shows multiple edge clusters. Although not 447 | shown in the Figure, the GCP-provided API, just like Rancher, also 448 | spans multiple physical sites (e.g., ``us-west1-a``, 449 | ``europe-north1-b``, ``asia-south2-c``, and so on). 450 | 451 | .. _fig-rancher: 452 | .. figure:: figures/Slide21.png 453 | :width: 450px 454 | :align: center 455 | 456 | Provisioning in a hybrid cloud that includes an API layer 457 | for managing Kubernetes running on multiple bare-metal clusters. 
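As a small illustration of what being "ready for further instructions" means at this layer, once Rancher (or GKE) has instantiated a cluster and we have a kubeconfig for it, a few calls against the standard Kubernetes API are enough to confirm that the nodes came up with the expected version. The sketch below uses the official Python client; the context name is illustrative.

.. code-block:: python

   from kubernetes import client, config

   # Load credentials for one provisioned edge cluster (context name is illustrative).
   config.load_kube_config(context="ace-site-x")

   v1 = client.CoreV1Api()
   for node in v1.list_node().items:
       ready = any(c.type == "Ready" and c.status == "True"
                   for c in node.status.conditions)
       print(node.metadata.name,
             node.status.node_info.kubelet_version,
             "Ready" if ready else "NotReady")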
457 | 458 | We conclude this discussion by noting that while we often treat 459 | Kubernetes as though it is an industry-wide standard, that is not 460 | quite the reality of the situation. Each cloud provider offers its own 461 | customized version: 462 | 463 | * Microsoft Azure offers the Azure Kubernetes Service (AKS) 464 | * AWS offers the Amazon Elastic Kubernetes Service (EKS) 465 | * Google Cloud offers the Google Kubernetes Engine (GKE) 466 | * Aether edges run the Rancher-certified version of Kubernetes (RKE) 467 | 468 | Although the *CNCF (Cloud Native Computing Foundation)*—the open 469 | source organization responsible for shepherding the Kubernetes 470 | project—certifies these and other versions of Kubernetes, this only 471 | establishes baseline compliance. Each version is free to enhance its 472 | offering beyond this baseline, and these enhancements often take the 473 | form of additional features for provisioning and controlling a 474 | Kubernetes cluster. Our job at the cloud management layer is to 475 | provide operators with a means to manage this heterogeneity. And as 476 | we'll see in Section 3.2, this is the primary challenge addressed by 477 | the Infrastructure-as-Code layer. 478 | 479 | 3.1.4 Provisioning VMs 480 | ~~~~~~~~~~~~~~~~~~~~~~ 481 | 482 | We conclude our discussion of the steps required to provision physical 483 | machines by considering the implications of provisioning virtual 484 | machines, or VMs. That's something that happens "behind the scenes" 485 | when you request a Kubernetes cluster from AKS, EKS, or GKE, but 486 | that's because the hyperscalers have the option of layering their 487 | Kubernetes service on top of their Infrastructure-as-a-Service 488 | (IaaS). Do we need something similar for the edge cloud we're 489 | building? 490 | 491 | Not necessarily. Because our goal is to support a curated set of edge 492 | services that provide value to our enterprise users, and not to 493 | support Container-as-a-Service so untrusted third-parties can spin up 494 | whatever applications they want, we do not need to manage VMs "as a 495 | service." But we still may want to use VMs as a way to isolate 496 | Kubernetes workloads on a limited number of physical servers. This can 497 | be done as a provisioning step, akin to connecting and booting a 498 | physical machine, but using virtualization mechanisms like KVM and 499 | Proxmox. There is no need for a full-fledged IaaS mechanism, such as 500 | OpenStack. These VMs would then be recorded as first-class cloud 501 | resources in NetBox and the other tools described in this section, no 502 | different from a physical machine. 503 | 504 | The unanswered question is why one might decide to do that, 505 | considering that Kubernetes already allows us to deploy multiple 506 | applications on a single cluster. One reason is to support fine-grained 507 | resource isolation, making it possible to (a) ensure that each 508 | Kubernetes application receives the processor, memory, and storage 509 | resources it needs to do its job, and (b) reduce the risk of 510 | information leaking between the applications. Suppose, for example, 511 | that in addition to the SD-Fabric, SD-RAN, and SD-Core workloads that run 512 | (by default) on each edge site, we also want to run one or more other 513 | edge apps, such as the OpenVINO platform introduced in Section 2.3. To 514 | ensure that there is no interference between these applications, we 515 | could dedicate a subset of physical servers to each of them.
Physical 516 | partitioning is a coarse-grained way to share the physical cluster. 517 | Being able to "split" one or more servers between multiple uses—by 518 | instantiating VMs—gives the operator more flexibility in allocating 519 | resources, which usually translates into requiring fewer overall 520 | resources. Note that there are other ways to specify how cluster 521 | resources are shared between applications (which we will see in 522 | Section 4.4), but the provisioning layer is one place where the issue 523 | can be addressed. 524 | 525 | 3.2 Infrastructure-as-Code 526 | -------------------------- 527 | 528 | The provisioning interface for each of the Kubernetes variants just 529 | described includes a programmatic API, a Command Line Interface (CLI), 530 | and a Graphical User Interface (GUI). If you try any of the 531 | tutorials we recommended throughout this book, you'll likely use one 532 | of the latter two. For operational deployments, however, having a 533 | human operator interact with a CLI or GUI is problematic. This is not 534 | only because humans are error-prone, but also because it's nearly 535 | impossible to consistently repeat a sequence of configuration steps. 536 | Being able to continuously repeat the process is at the heart of 537 | Lifecycle Management described in the next chapter. 538 | 539 | The solution is to find a declarative way of saying what your 540 | infrastructure is to look like—what set of Kubernetes clusters (e.g., 541 | some running at the edges on bare-metal and some instantiated in GCP) 542 | are to be instantiated, and how each is to be configured—and then 543 | automate the task of making calls against the programmatic API to make 544 | it so. This is the essence of Infrastructure-as-Code, and as we've 545 | already said, Terraform is our open source example. 546 | 547 | Since Terraform specifications are declarative, the best way to 548 | understand them is to walk through a specific example. In doing so, 549 | our goal isn't to document Terraform (online documentation and 550 | step-by-step tutorials are available for those interested in 551 | more detail), but rather, to build some intuition about the role this 552 | layer plays in managing a cloud. 553 | 554 | .. _reading_terraform: 555 | .. admonition:: Further Reading 556 | 557 | `Terraform Documentation `_. 558 | 559 | `Terraform Getting Started Tutorials 560 | `__. 561 | 562 | To make sense of the example, the main thing you need to know about 563 | the Terraform configuration language is that it provides a means to 564 | both (1) specify *templates* for different kinds of resources (these 565 | are ``.tf`` files), and (2) fill in the *variables* for specific 566 | instances of those resource templates (these are ``.tfvars`` files). 567 | Then given a set of ``.tf`` and ``.tfvars`` files, Terraform implements 568 | a two-stage process. In the first stage it constructs an execution 569 | plan, based on what has changed since the previous plan it 570 | executed. In the second stage, Terraform carries out the sequence of 571 | tasks required to bring the underlying infrastructure "up to spec" 572 | with the latest definition. Note that our job, for now, is to write 573 | these specification files and check them into the Config Repo. 574 | Terraform gets invoked as part of the CI/CD pipeline described in 575 | Chapter 4. 576 | 577 | Now to the specific files.
At the top-most level, the operator defines 579 | the set of *providers* they plan to incorporate into their 580 | infrastructure. We can think of each provider as corresponding to a 581 | cloud backend, including the corresponding provisioning API depicted 582 | in :numref:`Figure %s `. In our example, we show only two 583 | providers: the Rancher-managed edge clusters and the GCP-managed 584 | centralized clusters. Note that the example file declares a set of 585 | relevant variables for each provider (e.g., ``url``, ``access-key``), 586 | which are "filled in" by instance-specific variable files described 587 | next. 588 | 589 | .. literalinclude:: code/provider.tf 590 | 591 | The next step is to fill in the details (define values) for the actual 592 | set of clusters we want to provision. Let's look at two examples, 593 | corresponding to the two providers we just specified. The first shows 594 | a GCP-provided cluster (named ``amp-gcp``) that is to host the AMP 595 | workload. (There's a similar ``sdcore-gcp`` that hosts an instance of 596 | the SD-Core.) The labels associated with this particular cluster 597 | (e.g., ``env = "production"``) establish linkage between Terraform 598 | (which assigns the label to each cluster it instantiates) and other 599 | layers of the management stack (which selectively take different 600 | actions based on the associated labels). We'll see an example of these 601 | labels being used in Section 4.4. 602 | 603 | .. literalinclude:: code/cluster-gcp_val.tfvars 604 | 605 | The second example shows an edge cluster (named ``ace-X``) to be 606 | instantiated at *Site X*. As shown in the example code, this is a 607 | bare-metal cluster consisting of five servers and four switches (two 608 | leaf switches and two spine switches). The address for each device 609 | must match the one assigned during the hardware-provisioning stage 610 | outlined in Section 3.1. Ideally, the NetBox (and related) tool chain 611 | described in that section would auto-generate these Terraform 612 | variable files, but in practice, manually entering the data is often 613 | still necessary. 614 | 615 | .. literalinclude:: code/cluster-edge_val.tfvars 616 | 617 | The final piece of the puzzle is to fill in the remaining details 618 | about exactly how each Kubernetes cluster is to be instantiated. In 619 | this case, we show just the RKE-specific module used to configure the 620 | edge clusters, where most of the details are straightforward if you 621 | understand Kubernetes. For example, the module specifies that each 622 | edge cluster should load the ``calico`` and ``multus`` CNI plugins. It 623 | also defines how to invoke ``kubectl`` to configure Kubernetes 624 | according to these specifications. Less familiar are the references to 625 | ``SCTPSupport``, which indicate whether or not that particular Kubernetes 626 | cluster needs to support SCTP, a Telco-oriented network protocol that 627 | is not included in a vanilla Kubernetes deployment, but is needed by 628 | the SD-Core. 629 | 630 | .. literalinclude:: code/main-rke.tf 631 | 632 | There are other loose ends that need to be tied up, such as defining 633 | the VPN to be used to connect edge clusters to their counterparts in 634 | GCP, but the above examples are sufficient to illustrate the role 635 | Infrastructure-as-Code plays in the cloud management stack.
The key 636 | takeaway is that everything Terraform handles could have been done by 637 | a human operator making a sequence of CLI calls (or GUI clicks) on the 638 | backend Provisioning APIs, but experience has shown that approach to 639 | be error-prone and difficult to make consistently repeatable. 640 | Starting with a declarative language and auto-generating the right 641 | sequence of API calls is a proven way to overcome that problem. 642 | 643 | 644 | We conclude by noting that while we now have a 645 | declarative specification for our cloud infrastructure, which we refer 646 | to as the *Aether Platform*, these specification files are yet another 647 | software artifact that we check into the Config Repo. This is what we 648 | mean by Infrastructure-as-Code: infrastructure specifications are 649 | checked into a repo and version-controlled like any other code. This 650 | repo, in turn, feeds the lifecycle management pipeline described in 651 | the next chapter. The physical provisioning steps described in Section 652 | 3.1 happen "outside" the pipeline (which is why we don't just fold 653 | resource provisioning into Lifecycle Management), but it is fair to 654 | think of resource provisioning as "Stage 0" of lifecycle management. 655 | 656 | 3.3 Platform Definition 657 | ------------------------ 658 | 659 | The art of defining a system architecture, in our case a management 660 | framework for a hybrid cloud, is deciding where to draw the line 661 | between what's included inside the platform and what is considered an 662 | application running on top of the platform. For Aether, we have 663 | decided to include SD-Fabric inside the platform (along with 664 | Kubernetes), with SD-Core and SD-RAN treated as applications, even 665 | though all three are implemented as Kubernetes-based microservices. 666 | One consequence of this decision is that SD-Fabric is initialized as 667 | part of the provisioning system described in this chapter (with 668 | NetBox, Ansible, Rancher, and Terraform playing a role), whereas 669 | SD-Core and SD-RAN are deployed using the application-level mechanisms 670 | described in Chapter 4. 671 | 672 | There may also be other edge applications running as Kubernetes 673 | workloads, which complicates the story because, from their perspective, 674 | all of Aether (including the 5G connectivity that SD-Core and SD-RAN 675 | implement) is assumed to be part of the platform. In other words, 676 | Aether draws two lines, one demarcating Aether's base platform 677 | (Kubernetes plus SD-Fabric) and a second demarcating the Aether PaaS 678 | (which includes SD-Core and SD-RAN running on top of the platform, 679 | plus AMP managing the whole system). The distinction between "base 680 | platform" and "PaaS" is subtle, but essentially corresponds to the 681 | difference between a software stack and a managed service, 682 | respectively. 683 | 684 | In some respects this is just a matter of terminology, which is 685 | certainly important, but the relevance to our discussion is that 686 | because we have multiple overlapping mechanisms at our disposal, 687 | giving us more than one way to solve each engineering problem we 688 | encounter, it is easy to end up with an implementation that 689 | unnecessarily conflates separable concerns. Being explicit and 690 | consistent about what is platform and what is application is a 691 | prerequisite for a sound overall design.
It is also important to 692 | recognize the difference between an internal engineering decision 693 | (e.g., what mechanism is used to deploy a given component), and an 694 | externally-visible architectural decision (e.g., what functionality to 695 | expose through a public API). 696 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Sphinx~=5.3.0 2 | doc8~=0.10.1 3 | docutils~=0.17.1 4 | reuse~=0.14.0 5 | sphinx-rtd-theme~=1.0.0 6 | sphinxcontrib-spelling~=7.3.2 7 | sphinx-multiversion~=0.2.4 8 | pytz~=2023.3 9 | --------------------------------------------------------------------------------