├── .github ├── CODEOWNERS └── workflows │ ├── publish-docs.yaml │ └── validate-docs.yaml ├── .gitignore ├── LICENSE ├── Makefile ├── README.rst ├── VERSION ├── _extra └── robots.txt ├── _static ├── SystemsApproachLogoURL.png ├── bridge.ico ├── cover.jpg ├── css │ └── rtd_theme_mods.css └── fonts │ ├── Inconsolata-Bold.ttf │ └── Inconsolata-Regular.ttf ├── arch.rst ├── authors.rst ├── code ├── build.sh ├── cluster-edge_val.tfvars ├── cluster-gcp_val.tfvars ├── log.ascii ├── log.json ├── main-rke.tf ├── prometheus-rule.yaml ├── provider.tf ├── roc-api-tests.groovy ├── template.yang ├── trigger-event.yaml ├── trigger-time.yaml └── uptime.yaml ├── conf.py ├── control.rst ├── dict.txt ├── figures.pptx ├── figures ├── Slide1.png ├── Slide10.png ├── Slide11.png ├── Slide12.png ├── Slide13.png ├── Slide14.png ├── Slide15.png ├── Slide16.png ├── Slide17.png ├── Slide18.png ├── Slide19.png ├── Slide2.png ├── Slide20.png ├── Slide21.png ├── Slide22.png ├── Slide23.png ├── Slide24.png ├── Slide25.png ├── Slide26.png ├── Slide27.png ├── Slide3.png ├── Slide4.png ├── Slide5.png ├── Slide6.png ├── Slide7.png ├── Slide8.png ├── Slide9.png ├── ace_dash.png ├── cable_list.png ├── es_dash.png ├── gui1.png ├── gui2.png ├── pronto_logical_diagram.png ├── pronto_logical_diagram.svg ├── rack_diagram.png └── upf_dash.png ├── foreword.rst ├── index.rst ├── intro.rst ├── latest.rst ├── lifecycle.rst ├── monitor.rst ├── preface.rst ├── print.rst ├── provision.rst └── requirements.txt /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | #require review 2 | * @llpeterson @drbruced12 3 | -------------------------------------------------------------------------------- /.github/workflows/publish-docs.yaml: -------------------------------------------------------------------------------- 1 | name: Publish Docs Workflow 2 | run-name: ${{ github.actor }} is publishing document artifacts 🚀 3 | on: 4 | push: 5 | branches: 6 | - master 7 | 8 | # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages 9 | permissions: 10 | contents: read 11 | pages: write 12 | id-token: write 13 | 14 | # Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued. 15 | # However, do NOT cancel in-progress runs as we want to allow these production deployments to complete. 16 | concurrency: 17 | group: "pages" 18 | cancel-in-progress: false 19 | 20 | jobs: 21 | # Single deploy job since we're just deploying 22 | deploy: 23 | environment: 24 | name: github-pages 25 | url: ${{ steps.deployment.outputs.page_url }} 26 | runs-on: ubuntu-latest 27 | steps: 28 | - name: Checkout 29 | uses: actions/checkout@v4 30 | - name: Setup Pages 31 | uses: actions/configure-pages@v4 32 | - name: Build html 33 | run: make html 34 | - name: Upload artifact 35 | uses: actions/upload-pages-artifact@v3 36 | with: 37 | # Upload build repository 38 | path: './_build/html' 39 | - name: Deploy to GitHub Pages 40 | id: deployment 41 | uses: actions/deploy-pages@v4 42 | 43 | 44 | - run: echo "🍏 This job's status is ${{ job.status }}." 
45 | -------------------------------------------------------------------------------- /.github/workflows/validate-docs.yaml: -------------------------------------------------------------------------------- 1 | name: Validate Docs Workflow 2 | run-name: ${{ github.actor }} is validating document source 3 | on: [pull_request, workflow_dispatch] 4 | jobs: 5 | Validate_Docs: 6 | runs-on: ubuntu-latest 7 | steps: 8 | - run: echo "🎉 The job was automatically triggered by a ${{ github.event_name }} event." 9 | - run: echo "🐧 This job is now running on a ${{ runner.os }} server hosted by GitHub!" 10 | - run: echo "🔎 The name of your branch is ${{ github.ref }} and your repository is ${{ github.repository }}." 11 | - name: Check out repo 12 | uses: actions/checkout@v4 13 | - name: Validate source 14 | run: make test 15 | - name: Build html 16 | run: make html 17 | - name: List built files 18 | run: | 19 | ls ${{ github.workspace }}/_build/html 20 | 21 | - run: echo "🍏 This job's status is ${{ job.status }}." 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pdf 2 | *.tex 3 | venv-docs 4 | .DS_Store 5 | */.DS_Store 6 | figures-pdf/ 7 | figures-hi_res/ 8 | figures-low_res/ 9 | private/ 10 | local/ 11 | scripts/ 12 | _build/ 13 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Creative Commons Attribution 4.0 International Public License 2 | 3 | By exercising the Licensed Rights (defined below), You accept and 4 | agree to be bound by the terms and conditions of this Creative Commons 5 | Attribution-NonCommercial-NoDerivatives 4.0 International Public 6 | License ("Public License"). To the extent this Public License may be 7 | interpreted as a contract, You are granted the Licensed Rights in 8 | consideration of Your acceptance of these terms and conditions, and 9 | the Licensor grants You such rights in consideration of benefits the 10 | Licensor receives from making the Licensed Material available under 11 | these terms and conditions. 12 | 13 | Section 1 – Definitions. 14 | 15 | (a) Adapted Material means material subject to Copyright and Similar 16 | Rights that is derived from or based upon the Licensed Material and 17 | in which the Licensed Material is translated, altered, arranged, 18 | transformed, or otherwise modified in a manner requiring permission 19 | under the Copyright and Similar Rights held by the Licensor. For 20 | purposes of this Public License, where the Licensed Material is a 21 | musical work, performance, or sound recording, Adapted Material is 22 | always produced where the Licensed Material is synched in timed 23 | relation with a moving image. 24 | 25 | (b) Copyright and Similar Rights means copyright and/or similar 26 | rights closely related to copyright including, without limitation, 27 | performance, broadcast, sound recording, and Sui Generis Database 28 | Rights, without regard to how the rights are labeled or 29 | categorized. For purposes of this Public License, the rights 30 | specified in Section 2(b)(1)-(2) are not Copyright and Similar 31 | Rights. 
32 | 33 | (c) Effective Technological Measures means those measures that, in 34 | the absence of proper authority, may not be circumvented under laws 35 | fulfilling obligations under Article 11 of the WIPO Copyright Treaty 36 | adopted on December 20, 1996, and/or similar international 37 | agreements. 38 | 39 | (d) Exceptions and Limitations means fair use, fair dealing, and/or 40 | any other exception or limitation to Copyright and Similar Rights 41 | that applies to Your use of the Licensed Material. 42 | 43 | (e) Licensed Material means the artistic or literary work, database, 44 | or other material to which the Licensor applied this Public License. 45 | 46 | (f) Licensed Rights means the rights granted to You subject to the 47 | terms and conditions of this Public License, which are limited to 48 | all Copyright and Similar Rights that apply to Your use of the 49 | Licensed Material and that the Licensor has authority to license. 50 | 51 | (g) Licensor means the individual(s) or entity(ies) granting rights 52 | under this Public License. 53 | 54 | (h) NonCommercial means not primarily intended for or directed 55 | towards commercial advantage or monetary compensation. For purposes 56 | of this Public License, the exchange of the Licensed Material for 57 | other material subject to Copyright and Similar Rights by digital 58 | file-sharing or similar means is NonCommercial provided there is no 59 | payment of monetary compensation in connection with the exchange. 60 | 61 | (i) Share means to provide material to the public by any means or 62 | process that requires permission under the Licensed Rights, such as 63 | reproduction, public display, public performance, distribution, 64 | dissemination, communication, or importation, and to make material 65 | available to the public including in ways that members of the public 66 | may access the material from a place and at a time individually 67 | chosen by them. 68 | 69 | (j) Sui Generis Database Rights means rights other than copyright 70 | resulting from Directive 96/9/EC of the European Parliament and of 71 | the Council of 11 March 1996 on the legal protection of databases, 72 | as amended and/or succeeded, as well as other essentially equivalent 73 | rights anywhere in the world. 74 | 75 | (k) You means the individual or entity exercising the Licensed 76 | Rights under this Public License. Your has a corresponding meaning. 77 | 78 | Section 2 – Scope. 79 | 80 | (a) License grant. 81 | 82 | (1) Subject to the terms and conditions of this Public License, 83 | the Licensor hereby grants You a worldwide, royalty-free, 84 | non-sublicensable, non-exclusive, irrevocable license to exercise 85 | the Licensed Rights in the Licensed Material to: 86 | 87 | (A) reproduce and Share the Licensed Material, in whole or in 88 | part, for NonCommercial purposes only; and 89 | 90 | (B) produce and reproduce, but not Share, Adapted Material for 91 | NonCommercial purposes only. 92 | 93 | (2) Exceptions and Limitations. For the avoidance of doubt, where 94 | Exceptions and Limitations apply to Your use, this Public License 95 | does not apply, and You do not need to comply with its terms and 96 | conditions. 97 | 98 | (3) Term. The term of this Public License is specified in Section 99 | 6(a). 100 | 101 | (4) Media and formats; technical modifications allowed. 
The 102 | Licensor authorizes You to exercise the Licensed Rights in all 103 | media and formats whether now known or hereafter created, and to 104 | make technical modifications necessary to do so. The Licensor 105 | waives and/or agrees not to assert any right or authority to 106 | forbid You from making technical modifications necessary to 107 | exercise the Licensed Rights, including technical modifications 108 | necessary to circumvent Effective Technological Measures. For 109 | purposes of this Public License, simply making modifications 110 | authorized by this Section 2(a)(4) never produces Adapted 111 | Material. 112 | 113 | (5) Downstream recipients. 114 | 115 | (A) Offer from the Licensor – Licensed Material. Every recipient 116 | of the Licensed Material automatically receives an offer from 117 | the Licensor to exercise the Licensed Rights under the terms and 118 | conditions of this Public License. 119 | 120 | (B) No downstream restrictions. You may not offer or impose any 121 | additional or different terms or conditions on, or apply any 122 | Effective Technological Measures to, the Licensed Material if 123 | doing so restricts exercise of the Licensed Rights by any 124 | recipient of the Licensed Material. 125 | 126 | (6) No endorsement. Nothing in this Public License constitutes or 127 | may be construed as permission to assert or imply that You are, or 128 | that Your use of the Licensed Material is, connected with, or 129 | sponsored, endorsed, or granted official status by, the Licensor 130 | or others designated to receive attribution as provided in Section 131 | 3(a)(1)(A)(i). 132 | 133 | (b) Other rights. 134 | 135 | (1) Moral rights, such as the right of integrity, are not licensed 136 | under this Public License, nor are publicity, privacy, and/or 137 | other similar personality rights; however, to the extent possible, 138 | the Licensor waives and/or agrees not to assert any such rights 139 | held by the Licensor to the limited extent necessary to allow You 140 | to exercise the Licensed Rights, but not otherwise. 141 | 142 | (2) Patent and trademark rights are not licensed under this Public 143 | License. 144 | 145 | (3) To the extent possible, the Licensor waives any right to 146 | collect royalties from You for the exercise of the Licensed 147 | Rights, whether directly or through a collecting society under any 148 | voluntary or waivable statutory or compulsory licensing scheme. In 149 | all other cases the Licensor expressly reserves any right to 150 | collect such royalties, including when the Licensed Material is 151 | used other than for NonCommercial purposes. 152 | 153 | Section 3 – License Conditions. 154 | 155 | Your exercise of the Licensed Rights is expressly made subject to the 156 | following conditions. 157 | 158 | (a) Attribution. 
159 | 160 | (1) If You Share the Licensed Material, You must: 161 | 162 | (A) retain the following if it is supplied by the Licensor with 163 | the Licensed Material: 164 | 165 | (i) identification of the creator(s) of the Licensed Material 166 | and any others designated to receive attribution, in any 167 | reasonable manner requested by the Licensor (including by 168 | pseudonym if designated); 169 | 170 | (ii) a copyright notice; 171 | 172 | (iii) a notice that refers to this Public License; 173 | 174 | (iv) a notice that refers to the disclaimer of warranties; 175 | 176 | (v) a URI or hyperlink to the Licensed Material to the extent 177 | reasonably practicable; 178 | 179 | (B) indicate if You modified the Licensed Material and retain an 180 | indication of any previous modifications; and 181 | 182 | (C) indicate the Licensed Material is licensed under this Public 183 | License, and include the text of, or the URI or hyperlink to, 184 | this Public License. 185 | 186 | For the avoidance of doubt, You do not have permission under this 187 | Public License to Share Adapted Material. 188 | 189 | (2) You may satisfy the conditions in Section 3(a)(1) in any 190 | reasonable manner based on the medium, means, and context in which 191 | You Share the Licensed Material. For example, it may be reasonable 192 | to satisfy the conditions by providing a URI or hyperlink to a 193 | resource that includes the required information. 194 | 195 | (3) If requested by the Licensor, You must remove any of the 196 | information required by Section 3(a)(1)(A) to the extent 197 | reasonably practicable. 198 | 199 | Section 4 – Sui Generis Database Rights. 200 | 201 | Where the Licensed Rights include Sui Generis Database Rights that 202 | apply to Your use of the Licensed Material: 203 | 204 | (a) for the avoidance of doubt, Section 2(a)(1) grants You the right 205 | to extract, reuse, reproduce, and Share all or a substantial portion 206 | of the contents of the database for NonCommercial purposes only and 207 | provided You do not Share Adapted Material; 208 | 209 | (b) if You include all or a substantial portion of the database 210 | contents in a database in which You have Sui Generis Database 211 | Rights, then the database in which You have Sui Generis Database 212 | Rights (but not its individual contents) is Adapted Material; and 213 | 214 | (c) You must comply with the conditions in Section 3(a) if You Share 215 | all or a substantial portion of the contents of the database. 216 | 217 | For the avoidance of doubt, this Section 4 supplements and does not 218 | replace Your obligations under this Public License where the Licensed 219 | Rights include other Copyright and Similar Rights. 220 | 221 | Section 5 – Disclaimer of Warranties and Limitation of Liability. 222 | 223 | (a) Unless otherwise separately undertaken by the Licensor, to the 224 | extent possible, the Licensor offers the Licensed Material as-is and 225 | as-available, and makes no representations or warranties of any kind 226 | concerning the Licensed Material, whether express, implied, 227 | statutory, or other. This includes, without limitation, warranties 228 | of title, merchantability, fitness for a particular purpose, 229 | non-infringement, absence of latent or other defects, accuracy, or 230 | the presence or absence of errors, whether or not known or 231 | discoverable. Where disclaimers of warranties are not allowed in 232 | full or in part, this disclaimer may not apply to You. 
233 | 234 | (b) To the extent possible, in no event will the Licensor be liable 235 | to You on any legal theory (including, without limitation, 236 | negligence) or otherwise for any direct, special, indirect, 237 | incidental, consequential, punitive, exemplary, or other losses, 238 | costs, expenses, or damages arising out of this Public License or 239 | use of the Licensed Material, even if the Licensor has been advised 240 | of the possibility of such losses, costs, expenses, or 241 | damages. Where a limitation of liability is not allowed in full or 242 | in part, this limitation may not apply to You. 243 | 244 | (c) The disclaimer of warranties and limitation of liability 245 | provided above shall be interpreted in a manner that, to the extent 246 | possible, most closely approximates an absolute disclaimer and 247 | waiver of all liability. 248 | 249 | Section 6 – Term and Termination. 250 | 251 | (a) This Public License applies for the term of the Copyright and 252 | Similar Rights licensed here. However, if You fail to comply with 253 | this Public License, then Your rights under this Public License 254 | terminate automatically. 255 | 256 | (b) Where Your right to use the Licensed Material has terminated 257 | under Section 6(a), it reinstates: 258 | 259 | (1) automatically as of the date the violation is cured, provided 260 | it is cured within 30 days of Your discovery of the violation; or 261 | 262 | (2) upon express reinstatement by the Licensor. 263 | 264 | For the avoidance of doubt, this Section 6(b) does not affect 265 | any right the Licensor may have to seek remedies for Your violations 266 | of this Public License. 267 | 268 | (c) For the avoidance of doubt, the Licensor may also offer the 269 | Licensed Material under separate terms or conditions or stop 270 | distributing the Licensed Material at any time; however, doing so 271 | will not terminate this Public License. 272 | 273 | (d) Sections 1, 5, 6, 7, and 8 survive termination of this Public 274 | License. 275 | 276 | Section 7 – Other Terms and Conditions. 277 | 278 | (a) The Licensor shall not be bound by any additional or different 279 | terms or conditions communicated by You unless expressly agreed. 280 | 281 | (b) Any arrangements, understandings, or agreements regarding the 282 | Licensed Material not stated herein are separate from and 283 | independent of the terms and conditions of this Public License. 284 | 285 | Section 8 – Interpretation. 286 | 287 | (a) For the avoidance of doubt, this Public License does not, and 288 | shall not be interpreted to, reduce, limit, restrict, or impose 289 | conditions on any use of the Licensed Material that could lawfully 290 | be made without permission under this Public License. 291 | 292 | (b) To the extent possible, if any provision of this Public License 293 | is deemed unenforceable, it shall be automatically reformed to the 294 | minimum extent necessary to make it enforceable. If the provision 295 | cannot be reformed, it shall be severed from this Public License 296 | without affecting the enforceability of the remaining terms and 297 | conditions. 298 | 299 | (c) No term or condition of this Public License will be waived and 300 | no failure to comply consented to unless expressly agreed to by the 301 | Licensor. 
302 | 303 | (d) Nothing in this Public License constitutes or may be interpreted 304 | as a limitation upon, or waiver of, any privileges and immunities 305 | that apply to the Licensor or You, including from the legal 306 | processes of any jurisdiction or authority. 307 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for ONF documentation created with Sphinx 2 | 3 | # use bash for pushd/popd, and to fail quickly. virtualenv's activate 4 | # has undefined variables, so no -u 5 | SHELL = bash -e -o pipefail 6 | 7 | # You can set these variables from the command line. 8 | SPHINXOPTS ?= 9 | SPHINXBUILD ?= sphinx-build 10 | SOURCEDIR ?= . 11 | BUILDDIR ?= _build 12 | 13 | # Create the virtualenv with all the tools installed 14 | VIRTUALENV = venv-docs 15 | 16 | # Put it first so that "make" without argument is like "make help". 17 | help: $(VIRTUALENV) 18 | source ./$(VIRTUALENV)/bin/activate ;\ 19 | $(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 20 | 21 | .PHONY: help lint reload Makefile test 22 | 23 | # Create the virtualenv with all the tools installed 24 | $(VIRTUALENV): 25 | python3 -m venv $@ ;\ 26 | source ./$@/bin/activate ;\ 27 | pip install -r requirements.txt 28 | 29 | # lint and link verification. linkcheck is built into sphinx 30 | test: lint spelling 31 | 32 | # lint all .rst files 33 | lint: $(VIRTUALENV) 34 | source ./$`__ license. The 15 | community is invited to contribute corrections, improvements, updates, 16 | and new material under the same terms. While this license does not 17 | automatically grant the right to make derivative works, we are keen to 18 | discuss derivative works (such as translations) with interested 19 | parties. Please reach out to discuss@systemsapproach.org. 20 | 21 | If you make use of this work, the attribution should include the 22 | following information: 23 | 24 | | *Title: Edge Cloud Operations: A Systems Approach* 25 | | *Authors: Larry Peterson, Scott Baker, Andy Bavier, Zack Williams, Bruce Davie* 26 | | *Source:* https://github.com/SystemsApproach/ops 27 | | *License:* \ `CC BY-NC-ND 4.0 `__ 28 | 29 | Read the Book 30 | ------------- 31 | 32 | This book is part of the `Systems Approach Series 33 | `__, with an online version published 34 | at `https://ops.systemsapproach.org 35 | `__. 36 | 37 | To track progress and receive notices about new versions, you can follow 38 | the project on 39 | `Mastodon `__. To read a running 40 | commentary on how the Internet is evolving, and for updates on our writing projects, you can sign up for the 41 | `Systems Approach newsletter `__. 42 | 43 | Build the Book 44 | -------------- 45 | 46 | To build a web-viewable version, you first need to download the 47 | source: 48 | 49 | .. literalinclude:: code/build.sh 50 | 51 | The build process is stored in the Makefile and requires Python be 52 | installed. The Makefile will create a virtualenv (``venv-docs``) which 53 | installs the documentation generation toolset. You may also need to 54 | install the ``enchant`` C library using your system’s package manager 55 | for the spelling checker to function properly. 56 | 57 | To generate HTML in ``_build/html``, run ``make html``. 58 | 59 | To check the formatting of the book, run ``make lint``. 60 | 61 | To check spelling, run ``make spelling``. 
If there are additional 62 | words, names, or acronyms that are correctly spelled but not in the dictionary, 63 | please add them to the ``dict.txt`` file. 64 | 65 | To see the other available output formats, run ``make``. 66 | 67 | Contribute to the Book 68 | ---------------------- 69 | 70 | We hope that if you use this material, you are also willing to 71 | contribute back to it. If you are new to open source, you might check 72 | out this `How to Contribute to Open 73 | Source `__ guide. Among 74 | other things, you’ll learn about posting *Issues* that you’d like to see 75 | addressed, and issuing *Pull Requests* to merge your improvements back 76 | into GitHub. 77 | 78 | If you’d like to contribute and are looking for something that needs 79 | attention, see the `wiki `__ 80 | for the current TODO list. 81 | -------------------------------------------------------------------------------- /VERSION: -------------------------------------------------------------------------------- 1 | Version 1.1-dev -------------------------------------------------------------------------------- /_extra/robots.txt: -------------------------------------------------------------------------------- 1 | User-agent: AI2Bot 2 | User-agent: Ai2Bot-Dolma 3 | User-agent: aiHitBot 4 | User-agent: Amazonbot 5 | User-agent: anthropic-ai 6 | User-agent: Applebot 7 | User-agent: Applebot-Extended 8 | User-agent: Brightbot 1.0 9 | User-agent: Bytespider 10 | User-agent: CCBot 11 | User-agent: ChatGPT-User 12 | User-agent: Claude-Web 13 | User-agent: ClaudeBot 14 | User-agent: cohere-ai 15 | User-agent: cohere-training-data-crawler 16 | User-agent: Cotoyogi 17 | User-agent: Crawlspace 18 | User-agent: Diffbot 19 | User-agent: DuckAssistBot 20 | User-agent: FacebookBot 21 | User-agent: Factset_spyderbot 22 | User-agent: FirecrawlAgent 23 | User-agent: FriendlyCrawler 24 | User-agent: Google-Extended 25 | User-agent: GoogleOther 26 | User-agent: GoogleOther-Image 27 | User-agent: GoogleOther-Video 28 | User-agent: GPTBot 29 | User-agent: iaskspider/2.0 30 | User-agent: ICC-Crawler 31 | User-agent: ImagesiftBot 32 | User-agent: img2dataset 33 | User-agent: imgproxy 34 | User-agent: ISSCyberRiskCrawler 35 | User-agent: Kangaroo Bot 36 | User-agent: meta-externalagent 37 | User-agent: Meta-ExternalAgent 38 | User-agent: meta-externalfetcher 39 | User-agent: Meta-ExternalFetcher 40 | User-agent: NovaAct 41 | User-agent: OAI-SearchBot 42 | User-agent: omgili 43 | User-agent: omgilibot 44 | User-agent: Operator 45 | User-agent: PanguBot 46 | User-agent: Perplexity-User 47 | User-agent: PerplexityBot 48 | User-agent: PetalBot 49 | User-agent: Scrapy 50 | User-agent: SemrushBot-OCOB 51 | User-agent: SemrushBot-SWA 52 | User-agent: Sidetrade indexer bot 53 | User-agent: TikTokSpider 54 | User-agent: Timpibot 55 | User-agent: VelenPublicWebCrawler 56 | User-agent: Webzio-Extended 57 | User-agent: YouBot 58 | Disallow: / 59 | -------------------------------------------------------------------------------- /_static/SystemsApproachLogoURL.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/_static/SystemsApproachLogoURL.png -------------------------------------------------------------------------------- /_static/bridge.ico: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/_static/bridge.ico -------------------------------------------------------------------------------- /_static/cover.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/_static/cover.jpg -------------------------------------------------------------------------------- /_static/css/rtd_theme_mods.css: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019-present Open Networking Foundation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. */ 15 | 16 | /* Don't restrict content width on the RTD theme 17 | * from: https://stackoverflow.com/a/32898444 */ 18 | 19 | .wy-nav-content { 20 | max-width: none; 21 | } 22 | 23 | .wy-table-responsive table td, .wy-table-responsive table th { 24 | white-space: normal; 25 | } 26 | 27 | /* Colors for navigation */ 28 | 29 | .wy-side-nav-search, .wy-nav-top { 30 | background: #2F5597; 31 | } 32 | 33 | /* .wy-menu-vertical header,.wy-menu-vertical p.caption{color:#2F5597} */ 34 | 35 | .wy-menu-vertical header,.wy-menu-vertical p.caption{color:#6AB0DE} 36 | 37 | /* Headings */ 38 | h1, h2 { 39 | font-weight: bold; 40 | line-height: 1.25; 41 | color: #3279a8 42 | text-rendering: optimizeLegibility; 43 | } 44 | 45 | h3, h4, h5, h6 { 46 | margin-bottom: .5rem; 47 | font-style: italic; 48 | line-height: 1.25; 49 | color: #313131; 50 | text-rendering: optimizeLegibility; 51 | } 52 | 53 | h1 { 54 | margin-bottom: 2rem; 55 | font-size: 2rem; 56 | } 57 | 58 | h2 { 59 | margin-bottom: .5rem; 60 | margin-top: 1rem; 61 | font-size: 1.5rem; 62 | } 63 | 64 | h3 { 65 | margin-top: 1.5rem; 66 | font-size: 1.25rem; 67 | } 68 | 69 | .pop { 70 | color: #6AB0DE; 71 | font-style: italic; 72 | font-weight: bold; 73 | } 74 | aside.sidebar { 75 | margin: 0 0 0.5em 1em; 76 | border: 1px solid #ddb; 77 | padding: 7px 7px 0 7px; 78 | background-color: #ffe; 79 | width: 40%; 80 | float: right; 81 | } 82 | -------------------------------------------------------------------------------- /_static/fonts/Inconsolata-Bold.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/_static/fonts/Inconsolata-Bold.ttf -------------------------------------------------------------------------------- /_static/fonts/Inconsolata-Regular.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/_static/fonts/Inconsolata-Regular.ttf -------------------------------------------------------------------------------- /arch.rst: -------------------------------------------------------------------------------- 1 | Chapter 2: Architecture 2 | ======================== 
3 | 4 | This chapter identifies all the subsystems that go into building and 5 | operationalizing a cloud capable of running an assortment of 6 | cloud-native services. We use Aether to illustrate specific design 7 | choices, and so we start by describing why an enterprise might install 8 | a system like Aether in the first place. 9 | 10 | .. sidebar:: PaaS for Industry 4.0 11 | 12 | *Edge clouds like Aether are an important component of a trend 13 | called Industry 4.0: A combination of intelligent devices, 14 | robust wireless connectivity, and cloud-based AI/ML 15 | capabilities, all working together to enable software-based 16 | optimization and innovation.* 17 | 18 | *Connecting industry assets to the cloud has the potential to 19 | bring transformative benefits. This starts with collecting deep 20 | operational data on assets and infrastructure, from sensors, 21 | video feeds and telemetry from machinery. It also includes 22 | applying ML to this data to gain insights, identify patterns 23 | and predict outcomes (e.g., when a device is likely to fail), 24 | followed by automating industrial processes so as to minimize 25 | human intervention and enable remote operations (e.g., power 26 | optimization, idling quiescent machinery). In general, the goal 27 | is to create an IT foundation for continually improving 28 | industrial operations through software.* 29 | 30 | *As for why we refer to Aether as a PaaS for such use cases, 31 | the answer is somewhat subjective. Generally, a PaaS offers 32 | more than virtualized compute and storage (that is what IaaS 33 | does), and includes additional layers of "middleware" to enable 34 | application developers to deploy their applications without 35 | dealing with all the intricacies of managing the underlying 36 | infrastructure. In the case of Aether, the platform includes 37 | support for 5G connectivity, including an API that edge apps 38 | can use to customize that connectivity to better meet their 39 | objectives. This does not preclude also loading an ML-platform 40 | or an IoT-platform onto Aether, further enhancing the 41 | application support it provides.* 42 | 43 | Aether is a Kubernetes-based edge cloud, augmented with a 5G-based 44 | connectivity service. Aether is targeted at enterprises that want to 45 | take advantage of 5G connectivity in support of mission-critical edge 46 | applications requiring predictable, low-latency connectivity. In 47 | short, “Kubernetes-based” means Aether is able to host container-based 48 | services, and “5G-based connectivity” means Aether is able to connect 49 | those services to mobile devices throughout the enterprise's physical 50 | plant. This combination of features to support deployment of edge 51 | applications, coupled with Aether being offered as a managed service, 52 | means Aether can fairly be characterized as a Platform-as-a-Service 53 | (PaaS). 54 | 55 | Aether supports this combination by implementing both the RAN and the 56 | user plane of the Mobile Core on-prem, as cloud-native workloads 57 | co-located on the Aether cluster. This is often referred to as *local 58 | breakout* because it enables direct communication between mobile 59 | devices and edge applications without data traffic leaving the 60 | enterprise. This scenario is depicted in :numref:`Figure %s 61 | `, which does not name the edge applications, but 62 | substituting Internet-of-Things (IoT) would be an illustrative 63 | example. 64 | 65 | .. _fig-hybrid: 66 | .. 
figure:: figures/Slide2.png 67 | :width: 700px 68 | :align: center 69 | 70 | Overview of Aether as a hybrid cloud, with edge apps and the 5G 71 | data plane (called *local breakout*) running on-prem and various 72 | management and control-related workloads running in a central 73 | cloud. 74 | 75 | The approach includes both edge (on-prem) and centralized (off-prem) 76 | components. This is true for edge apps, which often have a centralized 77 | counterpart running in a commodity cloud. It is also true for the 5G 78 | Mobile Core, where the on-prem User Plane (UP) is paired with a 79 | centralized Control Plane (CP). The central cloud shown in this figure 80 | might be private (i.e., operated by the enterprise), public (i.e., 81 | operated by a commercial cloud provider), or some combination of the 82 | two (i.e., not all centralized elements need to run in the same 83 | cloud). Also shown in :numref:`Figure %s ` is a 84 | centralized *Control and Management Platform*. This represents all the 85 | functionality needed to offer Aether as a managed service, with system 86 | administrators using a portal exported by this platform to operate the 87 | underlying infrastructure and services within their enterprise. The 88 | rest of this book is about everything that goes into implementing that 89 | *Control and Management Platform*. 90 | 91 | 2.1 Edge Cloud 92 | -------------- 93 | 94 | The edge cloud, which in Aether is called ACE (Aether Connected Edge), 95 | is a Kubernetes-based cluster similar to the one shown in 96 | :numref:`Figure %s ` of Chapter 1. It is a platform that 97 | consists of one or more server racks interconnected by a leaf-spine 98 | switching fabric, with an SDN control plane (denoted SD-Fabric) 99 | managing the fabric. 100 | 101 | .. _fig-ace: 102 | .. figure:: figures/Slide3.png 103 | :width: 400px 104 | :align: center 105 | 106 | Aether Connected Edge (ACE) = The cloud platform (Kubernetes and 107 | SD-Fabric) plus the 5G connectivity service (RAN and User Plane of 108 | Mobile Core). Dotted lines (e.g., between SD-RAN and the individual 109 | base stations, and between the Network OS and the individual 110 | switches) represent control relationships (e.g., SD-RAN controls 111 | the small cells and SD-Fabric controls the switches). 112 | 113 | As shown in :numref:`Figure %s `, ACE hosts two additional 114 | microservice-based subsystems on top of this platform; they 115 | collectively implement *5G-Connectivity-as-a-Service*. The first 116 | subsystem, SD-RAN, is an SDN-based implementation of the 5G Radio 117 | Access Network (RAN). It controls the small cell base stations 118 | deployed throughout the enterprise. The second subsystem, SD-Core, is 119 | an SDN-based implementation of the User Plane half of the Mobile 120 | Core. It is responsible for forwarding traffic between the RAN and the 121 | Internet. The SD-Core Control Plane (CP) runs off-site, and is not 122 | shown in :numref:`Figure %s `. Both subsystems (as well as 123 | the SD-Fabric), are deployed as a set of microservices, but details 124 | about the functionality implemented by these containers is otherwise 125 | not critical to this discussion. For our purposes, they are 126 | representative of any cloud native workload. (The interested reader is 127 | referred to our companion 5G and SDN books for more information about 128 | the internal working of SD-RAN, SD-Core, and SD-Fabric.) 129 | 130 | .. _reading_5g: 131 | .. admonition:: Further Reading 132 | 133 | L. Peterson and O. Sunay. 
`5G Mobile Networks: A Systems Approach 134 | `__. March 2020. 135 | 136 | L. Peterson, *et al.* `Software-Defined Networks: A Systems Approach 137 | `__. November 2021. 138 | 139 | Once ACE is running in this configuration, it is ready to host a 140 | collection of edge applications (not shown in :numref:`Figure %s 141 | `), and as with any Kubernetes-based cluster, a Helm chart 142 | would be the preferred way to deploy such applications. What’s unique 143 | to ACE is the ability to connect such applications to mobile devices 144 | throughout the enterprise using the 5G Connectivity Service 145 | implemented by SD-RAN and SD-Core. This service is offered as a 146 | managed service, with enterprise system administrators able to use a 147 | programmatic API (and associated GUI portal) to control that service; 148 | that is, authorize devices, restrict access, set Quality-of-Service 149 | parameters for different devices and applications, and so on. How to 150 | provide such a runtime control interface is the topic of Chapter 5. 151 | 152 | 2.2 Hybrid Cloud 153 | ----------------- 154 | 155 | While it is possible to instantiate a single ACE cluster in just one 156 | site, Aether is designed to support multiple ACE deployments, all of 157 | which are managed from the central cloud. Such a hybrid cloud scenario 158 | is depicted in :numref:`Figure %s `, which shows two 159 | subsystems running in the central cloud: (1) one or more instances of 160 | the Mobile Core Control Plane (CP), and (2) the Aether Management 161 | Platform (AMP). 162 | 163 | Each SD-Core CP controls one or more SD-Core UPs, as specified by 164 | 3GPP, the standards organization responsible for 5G. Exactly how CP 165 | instances (running centrally) are paired with UP instances (running at 166 | the edges) is a runtime decision, and depends on the degree of 167 | isolation the enterprise sites require. AMP is responsible for 168 | managing all the centralized and edge subsystems (as introduced in the 169 | next section). 170 | 171 | .. _fig-aether: 172 | .. figure:: figures/Slide4.png 173 | :width: 600px 174 | :align: center 175 | 176 | Aether runs in a hybrid cloud configuration, with Control Plane of 177 | Mobile Core and the Aether Management Platform (AMP) running in the 178 | Central Cloud. 179 | 180 | There is an important aspect of this hybrid cloud that is not obvious 181 | from :numref:`Figure %s `, which is that the “hybrid 182 | cloud” we keep referring to is best described as a set of Kubernetes 183 | clusters, rather than a set of physical clusters (similar to the one 184 | we started with in :numref:`Figure %s ` of Chapter 1). 185 | This is because, while each ACE site usually corresponds to a physical 186 | cluster built out of bare-metal components, each of the SD-Core CP 187 | subsystems shown in :numref:`Figure %s ` is actually 188 | deployed in a logical Kubernetes cluster on a commodity cloud. The 189 | same is true for AMP. Aether’s centralized components are able to run 190 | in Google Cloud Platform, Microsoft Azure, and Amazon’s AWS. They can also 191 | run as an emulated cluster implemented by a system like 192 | KIND—Kubernetes in Docker—making it possible for developers to run 193 | these components on their laptops. 194 | 195 | To be clear, Kubernetes adopts generic terminology, such as “cluster” 196 | and “service”, and gives it a very specific meaning. In 197 | Kubernetes-speak, a *Cluster* is a logical domain in which Kubernetes 198 | manages a set of containers. 
This “Kubernetes cluster” may have a 199 | one-to-one relationship with an underlying physical cluster, but it is 200 | also possible that a Kubernetes cluster is instantiated inside a 201 | datacenter, as one of potentially thousands of such logical 202 | clusters. And as we'll see in a later chapter, even an ACE edge site 203 | sometimes hosts more than one Kubernetes cluster, for example, one 204 | running production services and one used for trial deployments of new 205 | services. 206 | 207 | 2.3 Stakeholders 208 | ---------------- 209 | 210 | With the understanding that our target environment is a collection of 211 | Kubernetes clusters—some running on bare-metal hardware at edge sites 212 | and some running in central datacenters—there is an orthogonal issue 213 | of how decision-making responsibility for those clusters is shared 214 | among multiple stakeholders. Identifying the relevant stakeholders is 215 | an important prerequisite for establishing a cloud service, and while 216 | the example we use may not be suitable for all situations, it does 217 | illustrate the design implications. 218 | 219 | For Aether, we care about two primary stakeholders: (1) the *cloud 220 | operators* who manage the hybrid cloud as a whole, and (2) the 221 | *enterprise users* who decide on a per-site basis how to take 222 | advantage of the local cloud resources (e.g., what edge applications 223 | to run and how to slice connectivity resources among those apps). We 224 | sometimes call the latter "enterprise admins" to distinguish them from 225 | "end-users" who might want to manage their own personal devices. 226 | 227 | The architecture is multi-tenant in the sense that it authenticates 228 | and isolates these stakeholders, allowing each to access only those 229 | objects they are responsible for. This makes the approach agnostic as 230 | to whether all the edge sites belong to a single organization (with 231 | that organization also responsible for operating the cloud), or 232 | alternatively, there being a separate organization that offers a 233 | managed service to a set of distinct enterprises (each of which spans 234 | one or more sites). The architecture can also accommodate end-users, 235 | and provide them with a "self-service" portal, but we do not elaborate 236 | on that possibility. 237 | 238 | There is a potential third stakeholder of note—third-party service 239 | providers—which points to the larger issue of how we deploy and manage 240 | additional edge applications. To keep the discussion tangible—but 241 | remaining in the open source arena—we use OpenVINO as an illustrative 242 | example. OpenVINO is a framework for deploying AI inference models. 243 | It is interesting in the context of Aether because one of its use 244 | cases is processing video streams, for example to detect and count 245 | people who enter the field of view of a collection of 5G-connected 246 | cameras. 247 | 248 | .. _reading_openvino: 249 | .. admonition:: Further Reading 250 | 251 | `OpenVINO Toolkit `__. 252 | 253 | On the one hand, OpenVINO is just like the 5G-related components we're 254 | already incorporating into our hybrid cloud: it is deployed as a 255 | Kubernetes-based set of microservices. On the other hand, we have to 256 | ask who is responsible for managing it, which is to say “who 257 | operationalizes OpenVINO?” 258 | 259 | One answer is that the operators who already manage the rest of the 260 | hybrid cloud also manage the collection of edge applications added to 261 | cloud. 
Enterprise admins might activate and control those apps on a 262 | site-by-site basis, but it is the operations team already responsible 263 | for provisioning, deploying, and managing those edge clouds that also 264 | does the same for OpenVINO and any other applications that run on that 265 | cloud. Generalizing from one edge service (5G connectivity) to 266 | arbitrarily many edge services has implications for control and 267 | management (which we’ll discuss throughout the book), but 268 | fundamentally nothing changes in the course we've already set out for 269 | ourselves. 270 | 271 | Having the cloud operator *curate and manage* a set of edge services 272 | is the assumption Aether makes (and we assume throughout this book), 273 | but for completeness, we take note of two other possibilities. One is 274 | that we extend our hybrid architecture to support independent 275 | third-party service providers. Each new edge service acquires its own 276 | isolated Kubernetes cluster from the edge cloud, and then the 277 | 3rd-party provider takes over all responsibility for managing the 278 | service running in that cluster. From the perspective of the cloud 279 | operator, though, the task just became significantly more difficult 280 | because the architecture would need to support Kubernetes as a managed 281 | service, which is sometimes called *Containers-as-a-Service (CaaS)*.\ [#]_ 282 | Creating isolated Kubernetes clusters on-demand is a step further than 283 | we take things in this book, in part because there is a second 284 | possible answer that seems more likely to happen. 285 | 286 | .. [#] This is not strictly an either-or-situation. It is possible to 287 | curate an edge service, provision cluster resources for it, but 288 | then delegate operational responsibility to a 3rd-party service 289 | provider. 290 | 291 | This second approach is that a multi-cloud emerges *within* 292 | enterprises. Today, most people equate multi-cloud with services 293 | running across multiple hyperscalers, but with edge clouds becoming 294 | more common, it seems likely that enterprises will invite multiple edge 295 | clouds onto their local premises, some hyperscaler-provided and some 296 | not, each hosting a different subset of edge services. For example, 297 | one edge cloud might host a 5G connectivity service and another might 298 | host an AI platform like OpenVINO. The question this raises is whether 299 | the cloud management technologies described in this book still apply 300 | in that setting. The answer is yes: the fundamental management 301 | challenges remain the same. The main difference is knowing when to 302 | directly control a Kubernetes cluster (as we do in this book) and when 303 | to do so indirectly through the manager for that cluster. There are 304 | also new problems that are unique to multi-clouds, such as inter-cloud 305 | service discovery, but they are beyond the scope of this book. 306 | 307 | 2.4 Control and Management 308 | -------------------------- 309 | 310 | We are now ready to describe the architecture of the Aether Management 311 | Platform (AMP), which as shown in :numref:`Figure %s `, 312 | manages both the distributed set of ACE clusters and the other control 313 | clusters running in the central cloud. And illustrating the recursive 314 | nature of the management challenge, AMP is also responsible for 315 | managing AMP! 
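
One way to make this concrete is to picture the inventory such a
platform has to keep of every cluster under its control, including
the cluster it runs in itself. The following YAML is a purely
illustrative sketch (the field names, cluster names, and workload
labels are hypothetical, not Aether's actual schema), but it conveys
the idea that edge and central clusters are tracked and managed
uniformly::

   # Hypothetical fleet inventory; all names and fields are illustrative.
   clusters:
     - name: ace-site-1              # bare-metal edge cluster (ACE)
       type: edge
       workloads: [sd-fabric, sd-ran, sd-core-up]
     - name: central-cloud-1         # logical cluster in a commodity cloud
       type: central
       workloads: [sd-core-cp, amp]  # note: AMP manages AMP, too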
316 | 317 | AMP includes one or more portals targeted at different stakeholders, 318 | with :numref:`Figure %s ` showing the two examples we focus 319 | on in this book: a User Portal intended for enterprise admins who 320 | need to manage services delivered to a local site, and an Operations 321 | Portal intended for the ops team responsible for keeping Aether 322 | up to date and running smoothly. Again, other stakeholders (classes of 323 | users) are possible, but this distinction does represent a natural 324 | division between those who *use* cloud services and those who 325 | *operate* cloud services. 326 | 327 | .. _fig-amp: 328 | .. figure:: figures/Slide5.png 329 | :width: 600px 330 | :align: center 331 | 332 | The four subsystems that comprise AMP: Resource Provisioning, 333 | Lifecycle Management, Runtime Control, and Monitoring & Telemetry. 334 | 335 | We do not focus on these portals, which provide a graphical interface 336 | to a subset of AMP functionality, but we instead describe the 337 | aggregate functionality supported by AMP, which is organized around 338 | four subsystems: 339 | 340 | * Resource Provisioning: Responsible for initializing and configuring 341 | resources (e.g., servers, switches) that add, replace, or upgrade 342 | capacity for Aether. 343 | 344 | * Lifecycle Management: Responsible for continuous integration and 345 | deployment of software functionality available on Aether. 346 | 347 | * Runtime Control: Responsible for the ongoing configuration and 348 | control of the services (e.g., connectivity) provided by Aether. 349 | 350 | * Monitoring & Telemetry: Responsible for collecting, archiving, 351 | evaluating, and analyzing telemetry data generated by Aether 352 | components. 353 | 354 | Internally, each of these subsystems is implemented as a highly 355 | available cloud service, running as a collection of microservices. The 356 | design is cloud-agnostic, so AMP can be deployed in a public cloud 357 | (e.g., Google Cloud, AWS, Azure), an operator-owned Telco cloud, (e.g, 358 | AT&T’s AIC), or an enterprise-owned private cloud. For the pilot 359 | deployment of Aether, AMP runs in the Google Cloud. 360 | 361 | The rest of this section introduces these four subsystems, with the 362 | chapters that follow filling in more detail about each. 363 | 364 | 365 | 2.4.1 Resource Provisioning 366 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~ 367 | 368 | Resource Provisioning configures and bootstraps resources (both 369 | physical and virtual), bringing them up to a state so Lifecycle 370 | Management can take over and manage the software running on those 371 | resources. It roughly corresponds to Day 0 operations, and includes 372 | both the hands-on aspect of installing and physically connecting 373 | hardware, and the inventory-tracking required to manage physical 374 | assets. 375 | 376 | .. _fig-provision: 377 | .. figure:: figures/Slide6.png 378 | :width: 500px 379 | :align: center 380 | 381 | High-level overview of Resource Provisioning. 382 | 383 | :numref:`Figure %s ` gives a high-level overview. As a 384 | consequence of the operations team physically connecting resources to 385 | the cloud and recording attributes for those resources in an Inventory 386 | Repo, a Zero-Touch Provisioning system (a) generates a set of 387 | configuration artifacts that are stored in a Config Repo and used 388 | during Lifecycle Management, and (b) initializes the newly deployed 389 | resources so they are in a state that Lifecycle Management is able to 390 | control. 
The idea of storing configuration directives in a Repo, like 391 | any other code module, is a practice known as *Configuration-as-Code*, 392 | and we will see it applied in different ways throughout this book. 393 | 394 | Recall from Chapter 1 that we called out the "Aether platform" as 395 | distinct from the cloud-native workloads that are hosted on the 396 | platform. This is relevant here because Resource Provisioning has to 397 | get this platform up and running before Lifecycle Management can do 398 | its job. But in another example of circular dependencies, Lifecycle 399 | Management also plays a role in keeping the underlying platform 400 | up to date. 401 | 402 | Clearly, the “Install & Inventory” step requires human involvement, 403 | and some amount of hands-on resource-prep is necessary, but the goal 404 | is to minimize the operator configuration steps (and associated 405 | expertise) and maximize the automation carried out by the Zero-Touch 406 | Provisioning system. Also realize that :numref:`Figure %s 407 | ` is biased towards provisioning a physical cluster, 408 | such as the edge sites in Aether. For a hybrid cloud that also 409 | includes one or more virtual clusters running in central datacenters, 410 | it is necessary to provision those virtual resources as well. Chapter 411 | 3 describes provisioning from this broader perspective, considering 412 | both physical and virtual resources. 413 | 414 | 2.4.2 Lifecycle Management 415 | ~~~~~~~~~~~~~~~~~~~~~~~~~~ 416 | 417 | Lifecycle Management is the process of integrating debugged, extended, 418 | and refactored components (often microservices) into a set of 419 | artifacts (e.g., Docker containers and Helm charts), and subsequently 420 | deploying those artifacts to the operational cloud. It includes a 421 | comprehensive testing regime, and typically, a procedure by which 422 | developers inspect and comment on each others’ code. 423 | 424 | .. _fig-lifecycle: 425 | .. figure:: figures/Slide7.png 426 | :width: 600px 427 | :align: center 428 | 429 | High-level overview of Lifecycle Management. 430 | 431 | :numref:`Figure %s ` gives a high-level overview, where 432 | it is common to split the integration and deployment phases, the 433 | latter of which combines the integration artifacts from the first 434 | phase with the configuration artifacts generated by Resource 435 | Provisioning described in the previous subsection. The figure does not 436 | show any human intervention (after development), which implies any 437 | patches checked into the code repo trigger integration, and any new 438 | integration artifacts trigger deployment. This is commonly referred to 439 | as Continuous Integration / Continuous Deployment (CI/CD), although in 440 | practice, operator discretion and other factors are also taken into 441 | account before deployment actually happens. 442 | 443 | One of the key responsibilities of Lifecycle Management is version 444 | control, which includes evaluating dependencies, but also the 445 | possibility that it will sometimes be necessary to both roll out new 446 | versions of software and rollback to old versions, as well as operate 447 | with multiple versions deployed simultaneously. Managing all the 448 | configuration state needed to successfully deploy the right version of 449 | each component in the system is the central challenge, which we 450 | address in Chapter 4. 
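
To give a feel for what that configuration state looks like, the
following sketch shows how deployed versions might be pinned
declaratively in a Config Repo. The component names, chart names, and
version numbers are hypothetical (Aether's actual specifications are
the subject of Chapter 4), but the pattern is representative: rolling
back amounts to reverting a commit and letting the deployment
pipeline re-apply the earlier state::

   # Hypothetical per-site version pins checked into the Config Repo.
   site: ace-site-1
   components:
     sd-core-up:
       chart: sd-core              # Helm chart name (illustrative)
       version: 1.4.2              # version currently deployed at this site
     sd-ran:
       chart: sd-ran
       version: 1.3.0
     monitoring:
       chart: monitoring-stack
       version: 0.9.1              # older version retained during a staged rollout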
451 | 452 | 2.4.3 Runtime Control 453 | ~~~~~~~~~~~~~~~~~~~~~ 454 | 455 | Once deployed and running, Runtime Control provides a programmatic API 456 | that can be used by various stakeholders to manage whatever abstract 457 | service(s) the system offers (e.g., 5G connectivity in the case of 458 | Aether). As shown in :numref:`Figure %s `, Runtime 459 | Control partially addresses the “management silo” issue raised in 460 | Chapter 1, so users do not need to know that connectivity potentially 461 | spans four different components, or how to control/configure each of 462 | them individually. (Or, as in the case of the Mobile Core, that 463 | SD-Core is distributed across two clouds, with the CP sub-part 464 | responsible for controlling the UP sub-part.) In the case of the 465 | connectivity service, for example, users only care about being able to 466 | authorize devices and set QoS parameters on an end-to-end basis. 467 | 468 | .. _fig-control: 469 | .. figure:: figures/Slide8.png 470 | :width: 400px 471 | :align: center 472 | 473 | Example use case that requires ongoing runtime control. 474 | 475 | Note that :numref:`Figure %s ` focuses on 476 | Connectivity-as-a-Service, but the same idea applies to all services 477 | the cloud offers to end users. Thus, we can generalize the figure so 478 | Runtime Control mediates access to any of the underlying microservices 479 | (or collections of microservices) the cloud designer wishes to make 480 | publicly accessible, including the rest of AMP! In effect, Runtime 481 | Control implements an abstraction layer, codified with a programmatic 482 | API. 483 | 484 | Given this mediation role, Runtime Control provides mechanisms to 485 | model (represent) the abstract services to be offered to users; store 486 | any configuration and control state associated with those models; 487 | apply that state to the underlying components, ensuring they remain in 488 | sync with the operator’s intentions; and authorize the set of API 489 | calls that users try to invoke on each service. These details are 490 | spelled out in Chapter 5. 491 | 492 | 493 | 2.4.4 Monitoring and Telemetry 494 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 495 | 496 | In addition to controlling service functionality, a running system has 497 | to be continuously monitored so that operators can diagnose and 498 | respond to failures, tune performance, do root cause analysis, perform 499 | security audits, and understand when it is necessary to provision 500 | additional capacity. This requires mechanisms to observe system 501 | behavior, collect and archive the resulting data, analyze the data and 502 | trigger various actions in response, and visualize the data in human 503 | consumable dashboards (similar to the example shown in :numref:`Figure 504 | %s `). 505 | 506 | .. _fig-monitor: 507 | .. figure:: figures/Slide18.png 508 | :width: 500px 509 | :align: center 510 | 511 | Example Aether dashboard, showing the health of one of the 512 | subsystems (SD-Core). 513 | 514 | In broad terms, it is common to think of this aspect of cloud 515 | management as having three parts: a monitoring component that collects 516 | quantitative metrics (e.g., load averages, transmission rates, 517 | ops per second); a logging component that collects diagnostic 518 | messages (i.e., text strings explaining various event); and a tracing 519 | component that can reconstruct workflows through a set of 520 | microservices. 
All include a timestamp, so it is possible to link 521 | quantitative analysis with qualitative explanations in support of 522 | diagnostics and analytics. 523 | 524 | 2.4.5 Summary 525 | ~~~~~~~~~~~~~ 526 | 527 | This overview of the management architecture could lead one to 528 | conclude that these four subsystems were architected, in a rigorous, 529 | top-down fashion, to be completely independent. But that is not the 530 | case. It is more accurate to say that the system evolved bottom up, 531 | solving the next immediate problem one at a time, all the while 532 | creating a large ecosystem of open source components that can be used 533 | in different combinations. What this book presents is a retrospective 534 | description of the end result, organized into four subsystems to help 535 | make sense of it all. 536 | 537 | There are, in practice, many opportunities for interactions among the 538 | four components, and in some cases, there are overlapping concerns 539 | that lead to considerable debate. This is what makes operationalizing 540 | a cloud such a thorny problem. For example, it's difficult to draw a crisp 541 | line between where resource provisioning ends and lifecycle management 542 | begins. One could view provisioning as "Step 0" of lifecycle 543 | management. As another example, the runtime control and monitoring 544 | subsystems are often combined in a single user interface, giving 545 | operators a way to both read (monitor) and write (control) various 546 | parameters of a running system. Connecting those two subsystems is how 547 | we build closed loop control. 548 | 549 | These two "simplifications" allow us to reduce the architectural 550 | overview of the management platform to the two-dimensional 551 | representation shown in :numref:`Figure %s `. In one 552 | dimension, layered on top of the hybrid cloud being managed, is the 553 | Runtime Control system (including Monitoring and Telemetry to close 554 | the control loop). Users and Operators read and write parameters of 555 | the running system via a well-defined REST API. In the other 556 | dimension, running beside the hybrid cloud, is the Lifecycle 557 | Management system (including Resource Provisioning as Step 0). 558 | Operators and Developers specify changes to the system by checking 559 | code (including configuration specs) into a repo, and then 560 | periodically triggering an upgrade of the running system. 561 | 562 | .. _fig-2D: 563 | .. figure:: figures/Slide25.png 564 | :width: 500px 565 | :align: center 566 | 567 | Simplified representation of the management platform. 568 | 569 | This simplified perspective draws attention to an ambiguity, which is 570 | the distinction between "changes to the parameters of a running 571 | system" versus "upgrading the system that is running." Generally, 572 | Lifecycle Management takes responsibility for *configuring* each 573 | component (including what version of each component is deployed), 574 | while runtime control takes responsibility for *controlling* each 575 | component. But where you draw the line between configuration and 576 | control is somewhat arbitrary. Do configuration changes only happen 577 | when you first boot a component, or can you change the configuration 578 | of a running system, and if you do, how does that differ from changing 579 | a control parameter? And as suggested by the dotted arrow in 580 | :numref:`Figure %s `, is there value in having Runtime Control 581 | instigate changes via Lifecycle Management? 
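One way to see how blurry this line is: a single component often
exposes both kinds of state side by side. The sketch below is
hypothetical (the parameter names are invented for illustration), but
it contrasts a value that Lifecycle Management lays down when a
component is deployed with a value that operators adjust through
Runtime Control while that component is serving traffic.

.. code-block:: yaml

   # Hypothetical settings for a single UPF instance.
   configuration:          # applied by Lifecycle Management at deploy time
     image: "upf:1.4.2"
     n3-interface-mtu: 9000     # changing this implies redeploying the pod
   control:                # written through the Runtime Control API at runtime
     slice: enterprise-a
     uplink-mbr-mbps: 100       # adjusted whenever the operator changes QoS
     downlink-mbr-mbps: 200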
The difference is usually 582 | related to frequency of change (which is in turn related to how 583 | disruptive to existing traffic/workload the change is), but ultimately 584 | it doesn't matter what you call it, as long as the mechanisms you use 585 | meet all of your requirements. 586 | 587 | Of course, an operational system doesn't tolerate such ambiguities 588 | very well. Each aspect of management has to be supported in a 589 | well-defined, efficient and repeatable way. That's why we include a 590 | description of a concrete realization of each of the four subsystems, 591 | reflecting one particular set of design choices. We call out the 592 | opportunities to make different engineering decisions, along with the 593 | design rationale behind our choices, as we add more details in the 594 | chapters that follow. 595 | 596 | 2.5 DevOps 597 | ---------- 598 | 599 | The preceding discussion focuses on the subsystems that make up the 600 | Control and Management Platform, but such a platform is used by 601 | people. This implies the need for a set of operational processes and 602 | procedures, which in a cloud setting, are now commonly organized 603 | around the DevOps model. The following gives a high-level summary, 604 | with a more extensive discussion of ops-related procedures presented 605 | throughout the book. 606 | 607 | DevOps has become an overused term, generally taken to mean that the 608 | line between the engineers who develop cloud functionality and the 609 | operators who deploy and manage cloud functionality is blurred, with 610 | the same team responsible for both. But that definition is too 611 | imprecise to be helpful. There are really three aspects of DevOps that 612 | are important to understand. 613 | 614 | First, when it comes to a set of services (or user-visible features), 615 | it is true that the developers play a role in deploying and operating 616 | those services. Enabling them to do that is exactly the value of the 617 | Management Platform. Consider the team responsible for SD-RAN in 618 | Aether, as an example. That team not only implements new SD-RAN 619 | features, but once their patch sets are checked into the code 620 | repository, those changes are integrated and deployed by the automated 621 | toolchain introduced in the previous section. This means the SD-RAN 622 | team is also responsible for: 623 | 624 | 1. Adding test cases to the CI half of Lifecycle Management, and 625 | writing any configuration specifications needed by the CD half of 626 | Lifecycle Management. 627 | 628 | 2. Instrumenting their code so it reports into the Monitoring and 629 | Telemetry framework, giving them the dashboards and alarms they 630 | need to troubleshoot any problems that arise. 631 | 632 | 3. Augmenting the data model of Runtime Control, so their component’s 633 | internal interfaces are plumbed through to the cloud’s externally 634 | visible Northbound Interface. 635 | 636 | Once deployed and operational, the SD-RAN team is also responsible for 637 | diagnosing any problems that cannot be resolved by a dedicated “on 638 | call” support staff.\ [#]_ The SD-RAN team is motivated to take 639 | advantage of the platform’s automated mechanisms (rather than exploit 640 | short-term workarounds), and to document their component’s behavior 641 | (especially how to resolve known problems), so they do not get support 642 | calls in the middle of the night. 643 | 644 | .. 
[#] Whether traditional or DevOps-based, there is typically a 645 | front-line support team, which is often said to provide Tier-1 646 | support. They interact directly with customers and are the 647 | first to respond to alarms, resolving the issue according to a 648 | well-scripted playbook. If Tier-1 support is not able to 649 | resolve an issue, it is elevated to Tier-2 and eventually 650 | Tier-3, the latter of which is the developers who best 651 | understand implementation details. 652 | 653 | .. sidebar:: Experience at Google 654 | 655 | *Our brief sketch of DevOps is based on how the approach is 656 | practiced at Google, and in this context, it is a great 657 | example of how good things come from efforts to minimize 658 | toil. As Google gained experience building and running its 659 | cloud, the incremental improvements to its cloud management 660 | system were assimilated into a system known as Borg.* 661 | 662 | *Kubernetes, the open source project widely used across the 663 | industry today, was spun out of Borg. The functionality 664 | embodied by Kubernetes evolved over time to deal with the 665 | operational challenges of deploying, upgrading, and monitoring 666 | a set of containers, serving as a great example of how a 667 | "rising tide lifts all boats." Given enough time, it may be 668 | the case that the next layer of cloud management machinery, 669 | roughly corresponding to the topics covered in this book, will 670 | also be taken as a given. The challenge, as we will see, is 671 | the multi-dimensional scope of the problem.* 672 | 673 | Second, all of the activity outlined in the previous paragraph is 674 | possible only because of the rich set of capabilities built into the 675 | Control and Management Platform that is the subject of this book.\ 676 | [#]_ Someone had to build that platform, which includes a testing 677 | framework that individual tests can be plugged into; an automated 678 | deployment framework that is able to roll upgrades out to a scalable 679 | number of servers and sites without manual intervention; a monitoring 680 | and telemetry framework that components can report into; a runtime 681 | control environment that can translate high-level directives into 682 | low-level operations on backend components; and so on. While each of 683 | these frameworks was once created by a team tasked with keeping some 684 | other service running smoothly, they have taken on a life of their 685 | own. The Control and Management Platform now has its own DevOps 686 | team(s), who, in addition to continually improving the platform, also 687 | field operational events, and when necessary, interact with other 688 | teams (e.g., the SD-RAN team in Aether) to resolve issues that come 689 | up. They are sometimes called Site Reliability Engineers (SREs), and 690 | in addition to being responsible for the Control and Management 691 | Platform, they enforce operational discipline—the third aspect of 692 | DevOps discussed next—on everyone else. 693 | 694 | .. [#] This is why we refer to the management system as a "platform", 695 | with AMP as an illustrative example. It serves as a common framework 696 | that developers of all the other cloud components can plug into and 697 | leverage. This is how you ultimately address the "management silo" 698 | problem. 699 | 700 | Finally, when operating with discipline and rigor, all of these teams 701 | strictly adhere to two quantitative rules. The first balances *feature 702 | velocity* with *system reliability*.
Each component is given an *error 703 | budget* (percentage of time it can be down), and new features cannot 704 | be rolled out unless the corresponding component has been operating 705 | within this bound. This test is a “gate” on the CI/CD pipeline. The 706 | second rule balances how much time is spent on *operational toil* 707 | (time spent by a human diagnosing or fixing problems) with time spent 708 | engineering new capabilities into the Control and Management Platform 709 | to reduce future toil. If too much time is spent toiling and too 710 | little time is spent making the Control and Management Platform 711 | better, then it is taken as a sign that additional engineering 712 | resources are needed. 713 | 714 | .. _reading_sre: 715 | .. admonition:: Further Reading 716 | 717 | `Site Reliability Engineering: How Google Runs Production Systems 718 | `__, 719 | 2016. 720 | -------------------------------------------------------------------------------- /authors.rst: -------------------------------------------------------------------------------- 1 | About The Authors 2 | ================== 3 | 4 | **Larry Peterson** is the Robert E. Kahn Professor of Computer 5 | Science, Emeritus at Princeton University, where he served as Chair 6 | from 2003-2009. His research focuses on the design, implementation, 7 | and operation of Internet-scale distributed systems, including the 8 | widely used PlanetLab and MeasurementLab platforms. He is currently 9 | contributing to the Aether access-edge cloud project at the Linux 10 | Foundation. Peterson is a member of the National Academy of 11 | Engineering, a Fellow of the ACM and the IEEE, the 2010 recipient of 12 | the IEEE Kobayashi Computer and Communication Award, and the 2013 13 | recipient of the ACM SIGCOMM Award. He received his Ph.D. degree from 14 | Purdue University. 15 | 16 | **Scott Baker** is a Cloud Software Architect at Intel, where he works 17 | on the Open Edge Platform. Prior to joining Intel, he was on the Open 18 | Networking Foundation (ONF) engineering team that built Aether, 19 | leading the runtime control effort. Baker has also worked on 20 | cloud-related research projects at Princeton and the University of 21 | Arizona, including PlanetLab, GENI, and VICCI. He received his 22 | Ph.D. in Computer Science from the University of Arizona in 2005. 23 | 24 | **Andy Bavier** is a Cloud Software Engineer at Intel, where he works 25 | on the Open Edge Platform. Prior to joining Intel, he was on the Open 26 | Networking Foundation (ONF) engineering team that built Aether, 27 | leading the observability effort. Bavier has also been a Research 28 | Scientist at Princeton University, where he worked on the PlanetLab 29 | project. He received a BA in Philosophy from William & Mary in 1990, 30 | an MS in Computer Science from the University of Arizona in 1995, and 31 | a PhD in Computer Science from Princeton University in 2004. 32 | 33 | **Zack Williams** is a Cloud Software Engineer at Intel, where he 34 | works on the Open Edge Platform. Prior to joining Intel, he was on the 35 | Open Networking Foundation (ONF) engineering team that built 36 | Aether, leading the infrastructure provisioning effort. Williams has also 37 | been a systems programmer at the University of Arizona. He received 38 | his BS in Computer Science from the University of Arizona in 2001. 39 | 40 | **Bruce Davie** is a computer scientist noted for his contributions to 41 | the field of networking. 
He began his networking career at Bellcore 42 | where he worked on the Aurora Gigabit testbed and collaborated with 43 | Larry Peterson on high-speed host-network interfaces. He then went to 44 | Cisco where he led a team of architects responsible for Multiprotocol 45 | Label Switching (MPLS). He worked extensively at the IETF on 46 | standardizing MPLS and various quality of service technologies. He 47 | also spent five years as a visiting lecturer at the Massachusetts 48 | Institute of Technology. In 2012 he joined Software Defined Networking 49 | (SDN) startup Nicira and was then a principal engineer at VMware 50 | following the acquisition of Nicira. In 2017 he took on the role of VP 51 | and CTO for the Asia Pacific region at VMware. He is a Fellow of the 52 | ACM and chaired ACM SIGCOMM from 2009 to 2013. Davie is the author of 53 | multiple books and the holder of more than 40 U.S. patents. 54 | 55 | -------------------------------------------------------------------------------- /code/build.sh: -------------------------------------------------------------------------------- 1 | $ mkdir ~/systemsapproach 2 | $ cd ~/systemsapproach 3 | $ git clone https://github.com/SystemsApproach/ops.git 4 | $ cd ops 5 | -------------------------------------------------------------------------------- /code/cluster-edge_val.tfvars: -------------------------------------------------------------------------------- 1 | cluster_name = "ace-X" 2 | cluster_nodes = { 3 | leaf1 = { 4 | user = "terraform" 5 | private_key = "~/.ssh/id_rsa_terraform" 6 | host = "10.64.10.133" 7 | roles = ["worker"] 8 | labels = ["node-role.aetherproject.org=switch"] 9 | taints = ["node-role.aetherproject.org=switch:NoSchedule"] 10 | }, 11 | leaf2 = { 12 | user = "terraform" 13 | private_key = "~/.ssh/id_rsa_terraform" 14 | host = "10.64.10.137" 15 | roles = ["worker"] 16 | labels = ["node-role.aetherproject.org=switch"] 17 | taints = ["node-role.aetherproject.org=switch:NoSchedule"] 18 | }, 19 | spine1 = { 20 | user = "terraform" 21 | private_key = "~/.ssh/id_rsa_terraform" 22 | host = "10.64.10.131" 23 | roles = ["worker"] 24 | labels = ["node-role.aetherproject.org=switch"] 25 | taints = ["node-role.aetherproject.org=switch:NoSchedule"] 26 | }, 27 | spine2 = { 28 | user = "terraform" 29 | private_key = "~/.ssh/id_rsa_terraform" 30 | host = "10.64.10.135" 31 | roles = ["worker"] 32 | labels = ["node-role.aetherproject.org=switch"] 33 | taints = ["node-role.aetherproject.org=switch:NoSchedule"] 34 | }, 35 | server-1 = { 36 | user = "terraform" 37 | private_key = "~/.ssh/id_rsa_terraform" 38 | host = "10.64.10.138" 39 | roles = ["etcd", "controlplane", "worker"] 40 | labels = [] 41 | taints = [] 42 | }, 43 | server-2 = { 44 | user = "terraform" 45 | private_key = "~/.ssh/id_rsa_terraform" 46 | host = "10.64.10.139" 47 | roles = ["etcd", "controlplane", "worker"] 48 | labels = [] 49 | taints = [] 50 | }, 51 | server-3 = { 52 | user = "terraform" 53 | private_key = "~/.ssh/id_rsa_terraform" 54 | host = "10.64.10.140" 55 | roles = ["etcd", "controlplane", "worker"] 56 | labels = [] 57 | taints = [] 58 | }, 59 | server-4 = { 60 | user = "terraform" 61 | private_key = "~/.ssh/id_rsa_terraform" 62 | host = "10.64.10.141" 63 | roles = ["worker"] 64 | labels = [] 65 | taints = [] 66 | }, 67 | server-5 = { 68 | user = "terraform" 69 | private_key = "~/.ssh/id_rsa_terraform" 70 | host = "10.64.10.142" 71 | roles = ["worker"] 72 | labels = [] 73 | taints = [] 74 | } 75 | } 76 | cluster_labels = { 77 | env = "production" 78 | clusterInfra = 
"bare-metal" 79 | clusterRole = "ace" 80 | k8s = "self-managed" 81 | coreType = "4g" 82 | upfType = "up4" 83 | } 84 | -------------------------------------------------------------------------------- /code/cluster-gcp_val.tfvars: -------------------------------------------------------------------------------- 1 | cluster_name = "amp-gcp" 2 | cluster_nodes = { 3 | amp-us-west2-a = { 4 | host = "10.168.0.18" 5 | roles = ["etcd", "controlplane", "worker"] 6 | labels = [] 7 | taints = [] 8 | }, 9 | amp-us-west2-b = { 10 | host = "10.168.0.17" 11 | roles = ["etcd", "controlplane", "worker"] 12 | labels = [] 13 | taints = [] 14 | }, 15 | amp-us-west2-c = { 16 | host = "10.168.0.250" 17 | roles = ["etcd", "controlplane", "worker"] 18 | labels = [] 19 | taints = [] 20 | } 21 | } 22 | cluster_labels = { 23 | env = "production" 24 | clusterInfra = "gcp" 25 | clusterRole = "amp" 26 | k8s = "self-managed" 27 | backup = "enabled" 28 | } 29 | -------------------------------------------------------------------------------- /code/log.ascii: -------------------------------------------------------------------------------- 1 | 2020-08-18 05:35:54.842Z INFO [DistributedP4RuntimeTableMirror] Synchronized TABLE_ENTRY mirror for device:leaf1: 0 removed, 2 updated, 4 added 2 | -------------------------------------------------------------------------------- /code/log.json: -------------------------------------------------------------------------------- 1 | { 2 | "time": "2020-08-18 05:35:54.842Z", 3 | "logLevel": "INFO", "component": "DistributedP4RuntimeTableMirror", 4 | "log": "Synchronized TABLE_ENTRY mirror for device:leaf1: 0 removed, 2 updated, 4 added" 5 | } 6 | -------------------------------------------------------------------------------- /code/main-rke.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | rancher2 = { 4 | source = "rancher/rancher2" 5 | } 6 | null = { 7 | source = "hashicorp/null" 8 | version = "~> 2.1.2" 9 | } 10 | } 11 | } 12 | 13 | resource "rancher2_cluster" "cluster" { 14 | name = var.cluster_config.cluster_name 15 | 16 | enable_cluster_monitoring = false 17 | enable_cluster_alerting = false 18 | 19 | labels = var.cluster_labels 20 | 21 | rke_config { 22 | kubernetes_version = var.cluster_config.k8s_version 23 | 24 | authentication { 25 | strategy = "x509" 26 | } 27 | 28 | monitoring { 29 | provider = "none" 30 | } 31 | 32 | network { 33 | plugin = "calico" 34 | } 35 | 36 | services { 37 | etcd { 38 | backup_config { 39 | enabled = true 40 | interval_hours = 6 41 | retention = 30 42 | } 43 | retention = "72h" 44 | snapshot = false 45 | } 46 | 47 | kube_api { 48 | service_cluster_ip_range = var.cluster_config.k8s_cluster_ip_range 49 | extra_args = { 50 | feature-gates = "SCTPSupport=True" 51 | } 52 | } 53 | 54 | kubelet { 55 | cluster_domain = var.cluster_config.cluster_domain 56 | cluster_dns_server = var.cluster_config.kube_dns_cluster_ip 57 | fail_swap_on = false 58 | extra_args = { 59 | cpu-manager-policy = "static" 60 | kube-reserved = "cpu=500m,memory=256Mi" 61 | system-reserved = "cpu=500m,memory=256Mi" 62 | feature-gates = "SCTPSupport=True" 63 | } 64 | } 65 | 66 | kube_controller { 67 | cluster_cidr = var.cluster_config.k8s_pod_range 68 | service_cluster_ip_range = var.cluster_config.k8s_cluster_ip_range 69 | extra_args = { 70 | feature-gates = "SCTPSupport=True" 71 | } 72 | } 73 | 74 | scheduler { 75 | extra_args = { 76 | feature-gates = "SCTPSupport=True" 77 | } 78 | } 79 | 80 | kubeproxy { 81 | 
extra_args = { 82 | feature-gates = "SCTPSupport=True" 83 | proxy-mode = "ipvs" 84 | } 85 | } 86 | } 87 | addons_include = ["https://raw.githubusercontent.com/k8snetworkplumbingwg/multus-cni/release-3.7/images/multus-daemonset.yml"] 88 | addons = var.addon_manifests 89 | } 90 | } 91 | 92 | resource "null_resource" "nodes" { 93 | triggers = { 94 | cluster_nodes = length(var.nodes) 95 | } 96 | 97 | for_each = var.nodes 98 | 99 | connection { 100 | type = "ssh" 101 | 102 | bastion_host = var.bastion_host 103 | bastion_private_key = file(var.bastion_private_key) 104 | bastion_user = var.bastion_user 105 | 106 | user = each.value.user 107 | host = each.value.host 108 | private_key = file(each.value.private_key) 109 | } 110 | 111 | provisioner "remote-exec" { 112 | inline = [< 300 6 | for: 1m 7 | labels: 8 | severity: critical 9 | - alert: SingleEdgeConnectTestFailing 10 | annotations: 11 | message: | 12 | Cluster {{`{{ .Labels.name }}`}} reporting UE connect failure for at least 10 minutes. 13 | expr: aetheredge_connect_test_ok{endpoint="metrics80"} < 1 14 | for: 10m 15 | labels: 16 | severity: critical -------------------------------------------------------------------------------- /code/provider.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = ">= 0.13" 3 | required_providers { 4 | rancher2 = { 5 | source = "rancher/rancher2" 6 | version = "= 1.15.1" 7 | } 8 | google = { 9 | source = "hashicorp/google" 10 | version = "~> 3.65.0" 11 | } 12 | null = { 13 | source = "hashicorp/null" 14 | version = "~> 2.1.2" 15 | } 16 | } 17 | } 18 | 19 | variable "rancher" { 20 | description = "Rancher credential" 21 | type = object({ 22 | url = string 23 | access_key = string 24 | secret_key = string 25 | }) 26 | } 27 | 28 | variable "gcp_config" { 29 | description = "GCP project and network configuration" 30 | type = object({ 31 | region = string 32 | compute_project = string 33 | network_project = string 34 | network_name = string 35 | subnet_name = string 36 | }) 37 | } 38 | 39 | provider "rancher2" { 40 | api_url = var.rancher.url 41 | access_key = var.rancher.access_key 42 | secret_key = var.rancher.secret_key 43 | } 44 | 45 | provider "google" { 46 | # Provide GCP credential using GOOGLE_CREDENTIALS environment variable 47 | project = var.gcp_config.compute_project 48 | region = var.gcp_config.region 49 | } 50 | -------------------------------------------------------------------------------- /code/roc-api-tests.groovy: -------------------------------------------------------------------------------- 1 | pipeline { 2 | ... 3 | stages { 4 | stage("Cleanup"){ 5 | ... 6 | } 7 | stage("Install Kind"){ 8 | ... 9 | } 10 | stage("Clone Test Repo"){ 11 | ... 12 | } 13 | stage("Setup Virtual Environment"){ 14 | ... 15 | } 16 | stage("Generate API Test Framework and API Tests"){ 17 | ... 
18 | } 19 | stage("Run API Tests"){ 20 | steps { 21 | sh """ 22 | mkdir -p /tmp/robotlogs 23 | cd ${WORKSPACE}/api-tests 24 | source ast-venv/bin/activate; set -u; 25 | robot ${WORKSPACE}/api-tests/ap_list.robot || true 26 | robot ${WORKSPACE}/api-tests/application.robot || true 27 | robot ${WORKSPACE}/api-tests/connectivity_service.robot || true 28 | robot ${WORKSPACE}/api-tests/device_group.robot || true 29 | robot ${WORKSPACE}/api-tests/enterprise.robot || true 30 | robot ${WORKSPACE}/api-tests/ip_domain.robot || true 31 | robot ${WORKSPACE}/api-tests/site.robot || true 32 | robot ${WORKSPACE}/api-tests/template.robot || true 33 | robot ${WORKSPACE}/api-tests/traffic_class.robot || true 34 | robot ${WORKSPACE}/api-tests/upf.robot || true 35 | robot ${WORKSPACE}/api-tests/vcs.robot || true 36 | """ 37 | } 38 | } 39 | } 40 | ... 41 | } -------------------------------------------------------------------------------- /code/template.yang: -------------------------------------------------------------------------------- 1 | module onf-template { 2 | ... 3 | description 4 | "The aether vcs-template holds common parameters used 5 | by a virtual connectivity service. Templates are used to 6 | populate a VCS."; 7 | typedef template-id { 8 | type yg:yang-identifier { 9 | length 1..32; 10 | } 11 | } 12 | container template { 13 | description "The top level container"; 14 | list template { 15 | key "id"; 16 | description 17 | "List of vcs templates"; 18 | leaf id { 19 | type template-id; 20 | description "ID for this vcs template."; 21 | } 22 | leaf display-name { 23 | type string { 24 | length 1..80; 25 | } 26 | description "display name to use in GUI or CLI"; 27 | } 28 | leaf sst { 29 | type at:sst; 30 | description "Slice/Service type"; 31 | } 32 | leaf sd { 33 | type at:sd; 34 | description "Slice differentiator"; 35 | } 36 | container device { 37 | description "Per-device QOS Settings"; 38 | container mbr { 39 | description "Maximum bitrate"; 40 | leaf uplink { 41 | type at:bitrate; 42 | units bps; 43 | description "Per-device mbr uplink data rate in mbps"; 44 | } 45 | leaf downlink { 46 | type at:bitrate; 47 | units bps; 48 | description "Per-device mbr downlink data rate in mbps"; 49 | } 50 | } 51 | } 52 | container slice { 53 | description "Per-Slice QOS Settings"; 54 | container mbr { 55 | description "Maximum bitrate"; 56 | leaf uplink { 57 | type at:bitrate; 58 | units bps; 59 | description "Per-Slice mbr uplink data rate in mbps"; 60 | } 61 | leaf downlink { 62 | type at:bitrate; 63 | units bps; 64 | description "Per-Slice mbr downlink data rate in mbps"; 65 | } 66 | } 67 | } 68 | leaf traffic-class { 69 | type leafref { 70 | path "/tc:traffic-class/tc:traffic-class/tc:id"; 71 | } 72 | description 73 | "Link to traffic class"; 74 | } 75 | leaf description { 76 | type at:description; 77 | description "description of this vcs template"; 78 | } 79 | } 80 | } 81 | } -------------------------------------------------------------------------------- /code/trigger-event.yaml: -------------------------------------------------------------------------------- 1 | - job-template: 2 | id: 'aether-patchset' 3 | name: 'aether-verify-{project}{suffix}' 4 | project-type: pipeline 5 | pipeline-script: 'aether-test.groovy' 6 | ... 
7 | triggers: 8 | - gerrit: 9 | server-name: '{gerrit-server-name}' 10 | dependency-jobs: '{dependency-jobs}' 11 | trigger-on: 12 | - patchset-created-event: 13 | exclude-drafts: true 14 | exclude-trivial-rebase: false 15 | exclude-no-code-change: true 16 | - draft-published-event 17 | - comment-added-contains-event: 18 | comment-contains-value: '(?i)^.*recheck$' 19 | ... -------------------------------------------------------------------------------- /code/trigger-time.yaml: -------------------------------------------------------------------------------- 1 | - job-template: 2 | id: aether-api-tests 3 | name: 'aether-api-{api-version}-tests-{release-version}' 4 | project-type: pipeline 5 | pipeline-file: 'aether-api-tests.groovy' 6 | ... 7 | triggers: 8 | - timed: | 9 | TZ=America/Los_Angeles 10 | H {time} * * * 11 | ... -------------------------------------------------------------------------------- /code/uptime.yaml: -------------------------------------------------------------------------------- 1 | "expr": "avg(avg_over_time(ace_e2e_ok{endpoint=\"metrics80\",name=\"$edge\"}[$__interval]) * 100)", 2 | -------------------------------------------------------------------------------- /conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Configuration file for the Sphinx documentation builder. 4 | # 5 | # This file does only contain a selection of the most common options. For a 6 | # full list see the documentation: 7 | # http://www.sphinx-doc.org/en/master/config 8 | 9 | # -- Path setup -------------------------------------------------------------- 10 | 11 | # If extensions (or modules to document with autodoc) are in another directory, 12 | # add these directories to sys.path here. If the directory is relative to the 13 | # documentation root, use os.path.abspath to make it absolute, like shown here. 14 | # 15 | # import os 16 | # import sys 17 | # sys.path.insert(0, os.path.abspath('.')) 18 | 19 | import os 20 | 21 | from subprocess import check_output, CalledProcessError 22 | 23 | def get_version(): 24 | 25 | try: 26 | version = check_output(['cat', 'VERSION'], 27 | universal_newlines=True) 28 | except CalledProcessError: 29 | return 'unknown version' 30 | 31 | return version.rstrip() 32 | 33 | # "version" is used for html build 34 | version = get_version() 35 | # "release" is used for LaTeX build 36 | release = version 37 | 38 | 39 | # -- Project information ----------------------------------------------------- 40 | 41 | project = u'Edge Cloud Operations: A Systems Approach' 42 | copyright = u'2022, Systems Approach LLC (Publisher)' 43 | author = u'Peterson, Baker, Bavier, Williams, Davie' 44 | 45 | # -- General configuration --------------------------------------------------- 46 | 47 | # If your documentation needs a minimal Sphinx version, state it here. 48 | # 49 | # needs_sphinx = '1.0' 50 | 51 | # make all warnings errors 52 | warning_is_error = False 53 | 54 | # Add any Sphinx extension module names here, as strings. They can be 55 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 56 | # ones. 
***Replace "mathjax" with "imgmath" for epub output.*** 57 | extensions = [ 58 | 'sphinx.ext.autosectionlabel', 59 | 'sphinx.ext.coverage', 60 | 'sphinx.ext.ifconfig', 61 | 'sphinx.ext.mathjax', 62 | 'sphinx.ext.todo', 63 | 'sphinxcontrib.spelling', 64 | "sphinx_multiversion", 65 | ] 66 | 67 | # Text files with lists of words that shouldn't fail the spellchecker: 68 | spelling_word_list_filename=['dict.txt', ] 69 | 70 | # Add any paths that contain templates here, relative to this directory. 71 | templates_path = ['_templates'] 72 | 73 | # The suffix(es) of source filenames. 74 | # You can specify multiple suffix as a list of string: 75 | # 76 | # source_suffix = ['.rst', '.md'] 77 | source_suffix = '.rst' 78 | 79 | # The master toctree document. 80 | master_doc = 'index' 81 | 82 | # The language for content autogenerated by Sphinx. Refer to documentation 83 | # for a list of supported languages. 84 | # 85 | # This is also used if you do content translation via gettext catalogs. 86 | # Usually you set "language" from the command line for these cases. 87 | language = 'en' 88 | 89 | # List of patterns, relative to source directory, that match files and 90 | # directories to ignore when looking for source files. 91 | # This pattern also affects html_static_path and html_extra_path. 92 | exclude_patterns = [u'_build', 'venv-docs', 'requirements.txt', 'Thumbs.db', 'private', '.DS_Store', '*/README.rst'] 93 | 94 | # The name of the Pygments (syntax highlighting) style to use. 95 | pygments_style = None 96 | 97 | # Enable numbered figures 98 | numfig = True 99 | numfig_format = { 100 | 'figure': 'Figure %s.', 101 | 'table': 'Table %s.' 102 | } 103 | 104 | # Ignore link check for the following websites 105 | linkcheck_ignore = [ 106 | 'https://www.amazon.com/','https://amzn.to/' 107 | ] 108 | 109 | # -- Options for HTML output ------------------------------------------------- 110 | 111 | # The theme to use for HTML and HTML Help pages. See the documentation for 112 | # a list of builtin themes. 113 | # 114 | html_theme = 'sphinx_rtd_theme' 115 | 116 | # Theme options are theme-specific and customize the look and feel of a theme 117 | # further. For a list of options available for each theme, see the 118 | # documentation. 119 | # 120 | html_theme_options = { 121 | 'prev_next_buttons_location': 'both' 122 | } 123 | 124 | # Add any paths that contain custom static files (such as style sheets) here, 125 | # relative to this directory. They are copied after the builtin static files, 126 | # so a file named "default.css" will overwrite the builtin "default.css". 127 | html_static_path = ['_static'] 128 | 129 | html_css_files = [ 130 | 'css/rtd_theme_mods.css', 131 | ] 132 | 133 | 134 | # HTML Favicon 135 | html_favicon = '_static/bridge.ico' 136 | 137 | # HTML Index 138 | html_use_index = False 139 | 140 | # Custom sidebar templates, must be a dictionary that maps document names 141 | # to template names. 142 | # 143 | # The default sidebars (for documents that don't match any pattern) are 144 | # defined by theme itself. Builtin themes are using these templates by 145 | # default: ``['localtoc.html', 'relations.html', 'sourcelink.html', 146 | # 'searchbox.html']``. 147 | # 148 | # html_sidebars = {} 149 | 150 | #extra HTML files 151 | html_extra_path = ['_extra'] 152 | 153 | # -- Options for HTMLHelp output --------------------------------------------- 154 | 155 | # Output file base name for HTML help builder. 
156 | htmlhelp_basename = 'SystemsApproach' 157 | 158 | 159 | # -- Options for LaTeX output ------------------------------------------------ 160 | #latex_engine = 'xelatex' 161 | 162 | latex_elements = { 163 | # The paper size ('letterpaper' or 'a4paper'). 164 | # 165 | 'papersize': 'letterpaper', 166 | 167 | # The font size ('10pt', '11pt' or '12pt'). 168 | # 169 | 'pointsize': '11pt', 170 | 171 | # Get unicode to work 172 | # 173 | 'fontenc': '\\usepackage[LGR,T1]{fontenc}', 174 | 175 | # Latex figure (float) alignment 176 | # 177 | 'figure_align': 'ht', 178 | } 179 | 180 | # Grouping the document tree into LaTeX files. List of tuples 181 | # (source start file, target name, title, 182 | # author, documentclass [howto, manual, or own class]). 183 | latex_documents = [ 184 | (master_doc, 'book.tex', u'Edge Cloud Operations: A Systems Approach', 185 | u'Peterson, Baker, Bavier, Williams and Davie ', 'manual', True), 186 | ] 187 | 188 | latex_toplevel_sectioning = 'chapter' 189 | 190 | 191 | # -- Options for manual page output ------------------------------------------ 192 | 193 | # One entry per manual page. List of tuples 194 | # (source start file, name, description, authors, manual section). 195 | man_pages = [ 196 | (master_doc, 'Systems Approach', u'Systems Approach', 197 | [author], 1) 198 | ] 199 | 200 | 201 | # -- Options for Texinfo output ---------------------------------------------- 202 | 203 | # Grouping the document tree into Texinfo files. List of tuples 204 | # (source start file, target name, title, author, 205 | # dir menu entry, description, category) 206 | texinfo_documents = [ 207 | (master_doc, 'Edge Cloud Opetaions', u'Edge Cloud Operations', 208 | author, 'Peterson, Baker, Bavier, Williams, and Davie', 'A Systems Approach', 209 | 'Miscellaneous'), 210 | ] 211 | 212 | 213 | # -- Options for Epub output ------------------------------------------------- 214 | epub_title = project 215 | epub_description = 'Building a Cloud Management Platform' 216 | epub_cover = ('_static/cover.jpg', '') 217 | epub_show_urls = 'False' 218 | epub_use_index = False 219 | 220 | # The unique identifier of the text. This can be a ISBN number 221 | # or the project homepage. 222 | # 223 | # epub_identifier = '' 224 | 225 | # A unique identification for the text. 226 | # 227 | # epub_uid = '' 228 | 229 | # A list of files that should not be packed into the epub file. 230 | epub_exclude_files = ['search.html','robots.txt'] 231 | 232 | 233 | # -- Extension configuration ------------------------------------------------- 234 | 235 | # -- options for Intersphinx extension --------------------------------------- 236 | 237 | intersphinx_mapping = { 238 | 'sphinx': ('https://www.sphinx-doc.org/en/master', None), 239 | 'aether': ('https://docs.aetherproject.org/master', None), 240 | 'sdcore': ('https://docs.sd-core.opennetworking.org/master', None), 241 | 'sdran': ('https://docs.sd-ran.org/master', None), 242 | 'sdran': ('https://docs.sd-fabric.org/master', None), 243 | 'sysapproach5g': ('https://5g.systemsapproach.org/', None), 244 | 'sysapproachnet': ('https://book.systemsapproach.org/', None), 245 | 'sysapproachsdn': ('https://sdn.systemsapproach.org/', None), 246 | } 247 | 248 | # -- Options for todo extension ---------------------------------------------- 249 | # If true, `todo` and `todoList` produce output, else they produce nothing. 
250 | todo_include_todos = True 251 | 252 | 253 | # -- Set up Google Analytics 254 | # -- using approach at https://stackoverflow.com/questions/9444342/adding-a-javascript-script-tag-some-place-so-that-it-works-for-every-file-in-sph/41885884#41885884 255 | 256 | 257 | GA_INVOKE_JS = """ 258 | window.dataLayer = window.dataLayer || []; 259 | function gtag(){dataLayer.push(arguments);} 260 | gtag('js', new Date()); 261 | 262 | gtag('config', 'G-K101Q1MWLM'); 263 | """ 264 | 265 | def setup(app): 266 | 267 | app.add_js_file('https://www.googletagmanager.com/gtag/js?id=G-K101Q1MWLM', loading_method="async") 268 | app.add_js_file(None, body=GA_INVOKE_JS) 269 | -------------------------------------------------------------------------------- /dict.txt: -------------------------------------------------------------------------------- 1 | Acknowledgements 2 | Adaptor 3 | Adaptors 4 | Aether 5 | Alertmanager 6 | Ansible 7 | Anthos 8 | Atomix 9 | BMC 10 | Bavier 11 | Bazel 12 | Calcote 13 | Chiu 14 | Condon 15 | Config 16 | Davie 17 | DevOps 18 | Docker 19 | Dockerfile 20 | ECS 21 | ElasticStack 22 | Elkstack 23 | Fluentbit 24 | Fluentd 25 | GCP 26 | GPP 27 | Gerrit 28 | Gradle 29 | Grafana 30 | Hostname 31 | Hyunsun 32 | IPMI 33 | IaaS 34 | IoT 35 | Istio 36 | Jaeger 37 | Kahn 38 | Keycloak 39 | Kibana 40 | Kobayashi 41 | Kubernetes 42 | Lifecycle 43 | Linkerd 44 | Logstash 45 | Makefile 46 | Multiprotocol 47 | NetBox 48 | Netplan 49 | Nginx 50 | Nicira 51 | ONOS 52 | Oauth 53 | Observability 54 | Oguz 55 | Onos 56 | POD 57 | PODs 58 | PaaS 59 | Ph 60 | Plugable 61 | Pluggable 62 | Proxmox 63 | QoS 64 | RKE 65 | Redfish 66 | Renderer 67 | Repo 68 | Repos 69 | Runtime 70 | SDN 71 | Sigelman 72 | Suchitra 73 | Sunay 74 | Sys 75 | Syslog 76 | Sámi 77 | Tanzu 78 | Telco 79 | Telcos 80 | Terraform 81 | Todo 82 | Tofino 83 | Uber 84 | VM 85 | VMs 86 | VMware 87 | Vemuri 88 | Weaveworks 89 | absorber 90 | adaptor 91 | adaptors 92 | analytics 93 | architected 94 | auth 95 | backend 96 | bitrate 97 | centric 98 | cloudified 99 | config 100 | customizable 101 | datacenter 102 | datacenters 103 | de 104 | decrypt 105 | decrypting 106 | deployable 107 | disaggregate 108 | disaggregated 109 | disaggregation 110 | downlink 111 | eBPF 112 | eNB 113 | eNBs 114 | evolvable 115 | exemplifed 116 | facto 117 | filesystem 118 | frontend 119 | gNMI 120 | gNOI 121 | gNodeB 122 | gNodeBs 123 | gRPC 124 | heatmap 125 | hoc 126 | hyperscale 127 | hyperscaler 128 | hyperscalers 129 | iPXE 130 | instantiation 131 | integrators 132 | invariants 133 | jitter 134 | lifecycle 135 | linter 136 | liveness 137 | llp 138 | mbr 139 | microservice 140 | microservices 141 | mindshare 142 | namespaces 143 | natively 144 | observability 145 | onos 146 | operationalization 147 | operationalize 148 | operationalized 149 | operationalizes 150 | operationalizing 151 | orchestrator 152 | pre 153 | precompiled 154 | prem 155 | programmatically 156 | qsfp 157 | reactively 158 | rearchitecting 159 | recode 160 | repo 161 | repos 162 | repurpose 163 | roadmap 164 | rollout 165 | rst 166 | runtime 167 | runtimes 168 | scalability 169 | scalable 170 | signalling 171 | stderr 172 | stdin 173 | stdout 174 | storylines 175 | subcomponents 176 | subnet 177 | systemsapproach 178 | textboxes 179 | todo 180 | todolist 181 | toolchain 182 | toolset 183 | untrusted 184 | unwinnable 185 | uplink 186 | uptime 187 | virtualenv 188 | -------------------------------------------------------------------------------- /figures.pptx: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/figures.pptx -------------------------------------------------------------------------------- /figures/Slide1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/figures/Slide1.png -------------------------------------------------------------------------------- /figures/Slide10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/figures/Slide10.png -------------------------------------------------------------------------------- /figures/Slide11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/figures/Slide11.png -------------------------------------------------------------------------------- /figures/Slide12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/figures/Slide12.png -------------------------------------------------------------------------------- /figures/Slide13.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/figures/Slide13.png -------------------------------------------------------------------------------- /figures/Slide14.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/figures/Slide14.png -------------------------------------------------------------------------------- /figures/Slide15.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/figures/Slide15.png -------------------------------------------------------------------------------- /figures/Slide16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/figures/Slide16.png -------------------------------------------------------------------------------- /figures/Slide17.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/figures/Slide17.png -------------------------------------------------------------------------------- /figures/Slide18.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/figures/Slide18.png -------------------------------------------------------------------------------- /figures/Slide19.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/figures/Slide19.png 
-------------------------------------------------------------------------------- /figures/Slide2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/figures/Slide2.png -------------------------------------------------------------------------------- /figures/Slide20.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/figures/Slide20.png -------------------------------------------------------------------------------- /figures/Slide21.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/figures/Slide21.png -------------------------------------------------------------------------------- /figures/Slide22.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/figures/Slide22.png -------------------------------------------------------------------------------- /figures/Slide23.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/figures/Slide23.png -------------------------------------------------------------------------------- /figures/Slide24.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/figures/Slide24.png -------------------------------------------------------------------------------- /figures/Slide25.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/figures/Slide25.png -------------------------------------------------------------------------------- /figures/Slide26.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/figures/Slide26.png -------------------------------------------------------------------------------- /figures/Slide27.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/figures/Slide27.png -------------------------------------------------------------------------------- /figures/Slide3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/figures/Slide3.png -------------------------------------------------------------------------------- /figures/Slide4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/figures/Slide4.png -------------------------------------------------------------------------------- /figures/Slide5.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/figures/Slide5.png -------------------------------------------------------------------------------- /figures/Slide6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/figures/Slide6.png -------------------------------------------------------------------------------- /figures/Slide7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/figures/Slide7.png -------------------------------------------------------------------------------- /figures/Slide8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/figures/Slide8.png -------------------------------------------------------------------------------- /figures/Slide9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/figures/Slide9.png -------------------------------------------------------------------------------- /figures/ace_dash.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/figures/ace_dash.png -------------------------------------------------------------------------------- /figures/cable_list.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/figures/cable_list.png -------------------------------------------------------------------------------- /figures/es_dash.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/figures/es_dash.png -------------------------------------------------------------------------------- /figures/gui1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/figures/gui1.png -------------------------------------------------------------------------------- /figures/gui2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/figures/gui2.png -------------------------------------------------------------------------------- /figures/pronto_logical_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/figures/pronto_logical_diagram.png -------------------------------------------------------------------------------- /figures/rack_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/figures/rack_diagram.png -------------------------------------------------------------------------------- /figures/upf_dash.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/SystemsApproach/ops/8cb459bc95fc74c1b206174a72b0b81cf9f5321f/figures/upf_dash.png -------------------------------------------------------------------------------- /foreword.rst: -------------------------------------------------------------------------------- 1 | Foreword 2 | ========== 3 | 4 | 5 | First the applications all moved to the cloud. And now they're being 6 | torn apart. Let me explain what I mean by that. 7 | 8 | As markets grow, the unit of function around which one can build a 9 | business shrinks. A classic example of this can be seen in the history 10 | of the automotive industry. The Ford River Rouge Complex was built in 11 | the late 1920s. At the time, mass-produced cars were relatively new, 12 | and the market was relatively small. And so factories like the River Rouge 13 | Complex had to build all the subcomponents too. Roughly, in one side 14 | of the factory went water, rubber, and iron ore, and out the other 15 | side came full automobiles. Of course, as the market for cars grew, so 16 | did a massive ecosystem of suppliers of car components: wheels, 17 | seats, floor mats, and the like. Today the large car companies are 18 | more akin to integrators than auto parts makers. 19 | 20 | The same dynamic is happening with the application. In the 1970s the 21 | same manufacturer would build the chips, the circuit boards, the 22 | system form factor, the operating system, and each of the 23 | applications. Over time as the market has grown, the system has 24 | disaggregated. The hardware and software separated and spawned multiple 25 | independent companies. And then companies started to be built around 26 | independent applications. 27 | 28 | The market hasn't stopped growing and over the last few years we've 29 | seen the application itself disaggregate. Commonly used subcomponents 30 | of applications are being pulled out, and entire companies and 31 | projects are being built around them. Today, if you're building an 32 | application, there are third-party APIs available for authenticating 33 | users, sending texts or email, streaming videos, authorizing access to 34 | resources, and many other useful functions. 35 | 36 | So what does this have to do with the book you're about to read? While 37 | the last decade was a consolidation of applications into the 38 | cloud, the next decade is largely going to be about the explosion of 39 | applications and application components away from it. Now that 40 | subcomponents of workloads have been largely decoupled from having to 41 | sit with the application, they can be run anywhere. And in particular 42 | they can be run on infrastructure that's purposely built and optimized 43 | for them! In fact, we are starting to see what can only be described 44 | as an anti-cloud trend where large companies are choosing to pull some 45 | workloads back from large clouds to their own optimized 46 | infrastructure. And we're even seeing startups choosing to build their 47 | own infrastructure from the get-go because they understand the cost 48 | and performance advantages of doing so. 49 | 50 | In "Edge Cloud Operations: A Systems Approach" the authors provide a 51 | detailed overview of not just cloud operations (which are so last 52 | decade) but operations in this new era of distributed clouds. 
In many 53 | ways, the cloud era was a low point of systems, because so much below 54 | the application layer was buried deep within the engineering organizations of 55 | the three large cloud providers. But that's changing, and to change 56 | with it, you need to understand how it all works. And that's exactly 57 | why you need to read this book. 58 | 59 | | Martin Casado 60 | | General Partner, a16z 61 | -------------------------------------------------------------------------------- /index.rst: -------------------------------------------------------------------------------- 1 | .. image:: _static/SystemsApproachLogoURL.png 2 | :width: 300px 3 | :align: center 4 | :target: https://systemsapproach.org 5 | 6 | | 7 | 8 | Edge Cloud Operations: A Systems Approach 9 | ========================================= 10 | 11 | Peterson, Baker, Bavier, Williams and Davie 12 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 13 | 14 | | 15 | 16 | .. toctree:: 17 | :maxdepth: 2 18 | :caption: Table of Contents 19 | 20 | foreword.rst 21 | preface.rst 22 | intro.rst 23 | arch.rst 24 | provision.rst 25 | lifecycle.rst 26 | control.rst 27 | monitor.rst 28 | README.rst 29 | authors.rst 30 | latest.rst 31 | print.rst 32 | 33 | -------------------------------------------------------------------------------- /intro.rst: -------------------------------------------------------------------------------- 1 | Chapter 1: Introduction 2 | ======================== 3 | 4 | Clouds provide a set of tools for bringing up and operating scalable 5 | services, but how do you operationalize a cloud in the first place? 6 | The two problems are not mutually exclusive—after all, a cloud is 7 | realized as a set of services—but asking the question this way 8 | eliminates the temptation to give the answer “the cloud takes care of 9 | that for you.” This book describes how to operationalize a cloud, 10 | starting with bare-metal hardware, all the way to offering one or more 11 | managed services to users. 12 | 13 | Few of us are likely to have reason to instantiate a hyperscale 14 | datacenter, but deploying private edge clouds in an enterprise—and 15 | optionally connecting that edge to a datacenter to form a hybrid 16 | cloud—is becoming increasingly common. We use the term "edge cloud" to 17 | distinguish our focus from the "core", which is the traditional domain 18 | of the hyperscale operators. The edge is more likely to be in a 19 | enterprise or an "Internet of Things" setting such as a factory. The 20 | edge is the place where the cloud services connect to the real world, 21 | e.g., via sensors and actuators, and where latency-sensitive services 22 | are deployed to be close to the consumers of those services.\ [#]_ 23 | 24 | .. [#] Server clusters hosted in co-location facilities can also be 25 | considered edge clouds, and benefit from the technologies and 26 | practices described in this book, but we use enterprises as our 27 | exemplar deployment because they expose a broader set of 28 | requirements. 29 | 30 | The hyperscalers are indeed willing to manage your edge cloud for you, 31 | as an extension of their core datacenters. And correspondingly, there 32 | is significant activity to provide such products, with Google’s 33 | Anthos, Microsoft’s Azure Arc, and Amazon’s ECS-Anywhere as prime 34 | examples. But the barrier to operationalizing a cloud is not so high 35 | that only a hyperscaler has the wherewithal to do it. 
It is possible 36 | to build a cloud—and all the associated lifecycle management and 37 | runtime controls that are required to operate it—using readily 38 | available open source software packages. 39 | 40 | .. sidebar:: Developers Have an Equal Role to Play 41 | 42 | *This book takes an operator-centric view of cloud operations, but 43 | developers have an equal role to play. This role is reflected in 44 | practices like DevOps (which we discuss in Section 2.5), but can 45 | also be seen in the underlying system design. The cloud 46 | architecture includes a management platform, which specifies a 47 | runtime interface through which service developers (who provide 48 | functionality) interact with cloud operators (who manage that 49 | functionality). Because there is a shared management platform to 50 | leverage, developers do not need to (and should not) reinvent the 51 | wheel when it comes to provisioning, configuring, controlling, and 52 | monitoring the services they implement.* 53 | 54 | *Looking at the broader picture, this management platform is an 55 | essential part of how app builders and service developers deliver 56 | functionality to end users. Today, functionality is most often 57 | delivered as a Managed Service (as opposed to an inert pile of 58 | software). This means developers not only have to worry about the 59 | algorithms and data structures needed to implement their app or 60 | service, they also need to interface with the platform that 61 | operationalizes (activates) their code. It is common to focus on the 62 | former and view the latter as a burden (especially if someone else 63 | will be responsible for deploying and operating their code), but 64 | coding to the management platform interface is a central part of the 65 | contract for delivering a managed service. Understanding and 66 | appreciating the "hows" and "whys" of this platform is critical to 67 | developers doing their job.* 68 | 69 | This book describes what such a cloud management platform looks 70 | like. Our approach is to focus on the fundamental problems that must 71 | be addressed—design issues that are common to all clouds—but then 72 | couple this conceptual discussion with specific engineering choices 73 | made while operationalizing a particular enterprise cloud. Our example 74 | is Aether, an open source edge cloud that supports 5G connectivity as 75 | a managed service. Aether has the following properties that make it an 76 | interesting use case to study: 77 | 78 | * Aether starts with bare-metal hardware (servers and switches) 79 | deployed in edge sites (e.g., enterprises). This on-prem cloud can 80 | range in size from a partial rack to a multi-rack cluster, assembled 81 | according to the best practices used in datacenters. 82 | 83 | * Aether supports both “edge services” running on these on-prem 84 | clusters and “centralized services” running in commodity cloud 85 | datacenters. In this sense it is a hybrid cloud.\ [#]_ 86 | 87 | * Aether augments this edge cloud with 5G-Connectivity-as-a-Service, 88 | giving us a service that must be operationalized (in addition to the 89 | underlying cloud). The end result is that Aether provides a managed 90 | Platform-as-a-Service (PaaS). 91 | 92 | * Aether is built entirely from open source components. The only thing 93 | it adds is the “glue code” and “specification directives” required 94 | to make it operational. This means the recipe is fully reproducible 95 | by anyone. 96 | 97 | ..
[#] Technically, Aether is also a multi-cloud because it is 98 | designed to take advantage of services provided by multiple 99 | public clouds, but the private/public (edge/central) aspect is 100 | the most relevant, so we use hybrid terminology throughout this book. 101 | 102 | There is another important reason Aether makes for an interesting 103 | example. It is a system being deployed at the confluence of three 104 | traditionally distinct management domains: enterprises (where system 105 | admins have long been responsible for installing and maintaining 106 | purpose-built appliances), network operators (where access 107 | technologies have historically been delivered as Telco-based 108 | solutions), and cloud providers (where commodity hardware and cloud 109 | native software is now readily available). This complicates our job, 110 | because each of these three domains brings its own conventions and 111 | terminology to the table. But understanding how these three 112 | stakeholders approach operationalization gives us a broader 113 | perspective on the problem. We return to the confluence of enterprise, 114 | cloud, and access technologies later in this chapter, but we start by 115 | addressing the terminology challenge. 116 | 117 | .. _reading_aether: 118 | .. admonition:: Further Reading 119 | 120 | `Aether: 5G-Connected Edge Cloud 121 | `__. 122 | 123 | `Aether Documentation 124 | `__. 125 | 126 | 1.1 Terminology 127 | --------------- 128 | 129 | The terminology used to talk about operating cloud services represents 130 | a mix of “modern” concepts that are native to the cloud, and 131 | “traditional” concepts that are artifacts from earlier systems (many 132 | of which are now being subsumed by the cloud, but retain some of their 133 | original operational language). This is especially true at the 134 | intersection of the cloud and Telcos, who—like the Sámi of Scandinavia 135 | having over 180 words for snow—have an exceedingly rich vocabulary 136 | for *operating* a network. 137 | 138 | A major source of confusion is that we are in the midst of a 139 | transition from network systems being built from purpose-built 140 | *devices* to software-based *services* running on commodity 141 | hardware. This often results in multiple terms being used for the same 142 | concept, or more problematically, having one domain subtly repurpose a 143 | term from another domain. To avoid talking past each other, it is 144 | important to first define a few concepts and introduce the related 145 | terminology. 146 | 147 | * **Operations & Maintenance (O&M):** A traditional term used to 148 | characterize the overall challenge of operationalizing a network, 149 | where generally speaking, operators use an O&M Interface to manage 150 | the system. 151 | 152 | * **FCAPS:** An acronym (Fault, Configuration, Accounting, Performance, 153 | Security) historically used in the Telco industry to enumerate the 154 | requirements for an operational system. The O&M interface must 155 | provide a means to detect and manage faults, configure the system, 156 | account for usage, and so on. 157 | 158 | * **OSS/BSS:** Another Telco acronym (Operations Support System, 159 | Business Support System), referring to the subsystem that 160 | implements both operational logic (OSS) and business logic 161 | (BSS). It is usually the top-most component in the overall O&M 162 | hierarchy. 
163 | 164 | * **EMS:** Yet another Telco acronym (Element Management System), 165 | corresponding to an intermediate layer in the overall O&M 166 | hierarchy. An EMS is to a particular type of device what an 167 | OSS/BSS is to the network as a whole. 168 | 169 | * **Orchestration:** A general term similar to O&M, but originating in 170 | the cloud context. Involves assembling (e.g., allocating, 171 | configuring, connecting) a collection of physical or logical 172 | resources on behalf of some workload. If only a single resource or 173 | device is involved, we would probably use a term like 174 | “configuration” instead, so orchestration typically implies 175 | “orchestrating” across multiple components. 176 | 177 | Narrowly defined, an orchestrator is responsible for spinning up 178 | virtual machines (or containers) and logically interconnecting them 179 | (with virtual networks). More broadly, orchestration encompasses 180 | aspects of all the management-related functions described in this 181 | book. 182 | 183 | If you are trying to map cloud terminology onto Telco terminology, 184 | an orchestrator is often equated with a cloudified version of the 185 | OSS/BSS mechanism. This top-most layer is sometimes called a 186 | *Service Orchestrator* since it is responsible for assembling a 187 | collection of *Virtual Network Functions (VNFs)* into an 188 | end-to-end-service chain. 189 | 190 | * **Playbook/Workflow:** A program or script that implements a 191 | multi-step orchestration process. (The term workflow is also used 192 | in a UX context to describe a multi-step operation that a user 193 | performs on a system using a GUI.) 194 | 195 | * **Provisioning:** Adding capacity (either physical or virtual 196 | resources) to a system, usually in response to changes in workload, 197 | including the initial deployment. 198 | 199 | * **Zero-Touch Provisioning:** Usually implies adding new hardware 200 | without requiring a human to configure it (beyond physically 201 | connecting the device). This implies the new component 202 | auto-configures itself, which means the term can also be applied 203 | to virtual resources (e.g., virtual machines, services) to 204 | indicate that no manual configuration step is needed to 205 | instantiate the resource. 206 | 207 | * **Remote Device Management:** A standard (e.g., IPMI, Redfish) that 208 | defines a way to remotely manage hardware devices in support of 209 | zero-touch provisioning. The idea is to send and receive 210 | out-of-band messages over the LAN in place of having video or serial 211 | console access to the device. Additionally, these may integrate with 212 | monitoring and other device health telemetry systems. 213 | 214 | * **Inventory Management:** Planning and tracking both the physical 215 | (racks, servers, switches, cabling) and virtual (IP ranges and 216 | addresses, VLANs) resources is a sub-step of the provisioning 217 | process. This process frequently starts using simple spreadsheets 218 | and text files, but as complexity grows, a dedicated database for 219 | inventory facilitates greater automation. 220 | 221 | * **Lifecycle Management:** Upgrading and replacing functionality (e.g., 222 | new services, new features to existing services) over time. 223 | 224 | * **Continuous Integration / Continuous Deployment (CI/CD):** An 225 | approach to Lifecycle Management in which the path from 226 | development (producing new functionality) to testing, integration, 227 | and ultimately deployment is an automated pipeline. 
CI/CD 228 | typically implies continuously making small incremental changes 229 | rather than performing large disruptive upgrades. 230 | 231 | * **DevOps:** An engineering discipline that fuses the Development 232 | process and Operational requirements silos, balancing feature 233 | velocity against system reliability. As a practice, it leverages 234 | CI/CD methods and is typically associated with container-based 235 | (also known as *cloud native*) systems. There is some overlap 236 | between DevOps and *Site 237 | Reliability Engineering (SRE)* as practiced by cloud providers such as 238 | Google. 239 | 240 | * **In-Service Software Upgrade (ISSU):** A requirement that a 241 | component continue running during the deployment of an upgrade, 242 | with minimal disruption to the service delivered to 243 | end-users. ISSU generally implies the ability to incrementally 244 | roll-out (and roll-back) an upgrade, but is specifically a 245 | requirement on individual components (as opposed to the 246 | platform used to manage a set of components). 247 | 248 | * **Monitoring & Telemetry:** Collecting data from system components 249 | to aid in management decisions. This includes diagnosing faults, 250 | tuning performance, doing root cause analysis, performing security 251 | audits, and provisioning additional capacity. 252 | 253 | * **Analytics:** A program (often using statistical models) that 254 | produces additional insights (value) from raw data. It can be used 255 | to close a control loop (i.e., auto-reconfigure a system based on 256 | these insights), but could also be targeted at a human operator 257 | who subsequently takes some action. 258 | 259 | Another way to talk about operations is in terms of stages, leading to 260 | a characterization that is common for traditional network devices: 261 | 262 | * **Day (-1):** Hardware configuration that is applied to a device (e.g., 263 | via a console) when it is first powered on. These configurations correspond 264 | to firmware (BIOS or similar) settings, and often need knowledge of how the 265 | device is physically connected to the network (e.g., the port being used). 266 | 267 | * **Day 0:** Connectivity configuration required to establish 268 | communication between the device and the available network services 269 | (e.g., setting a device’s IP address and default router). While such 270 | information may be provided manually, this is an opportunity to 271 | auto-configure the device, in support of Zero-Touch Provisioning. 272 | 273 | * **Day 1:** Service-level configuration needed by the device, including 274 | parameters that allow the device to take advantage of other services 275 | (e.g., NTP, Syslog, SMTP, NFS), as well as setting the parameters 276 | this device needs to perform whatever service it provides. At the 277 | end of Day-1 operationalization, the device is considered 278 | up-and-running, and able to support user traffic. This is also an 279 | opportunity for zero-touch provisioning, in the sense that 280 | pre-programmed playbooks (workflows) should be able to 281 | auto-configure the device rather than depending on human 282 | intervention. 283 | 284 | * **Day 2..N:** On-going management in support of day-to-day operations, 285 | coupled with monitoring the network to detect failures and service 286 | degradation, with the goal of sustaining the service. 
This may 287 | involve some closed-loop control, but is often human-intense, which 288 | involves monitoring a dashboard and fielding alerts, and then 289 | re-configuring the system as necessary. This is often referred to 290 | simply as "Day 2 Operations". 291 | 292 | Again, “Day x” is how traditional network vendors characterize the 293 | process of operationalizing the devices they sell, which in turn 294 | dictates how network operators and enterprise system admins bring 295 | those devices online. While the general framework has been extended to 296 | Virtual Network Functions (VNFs), it is still a device-centric view of 297 | operations. But once a system becomes cloud native, two things shift 298 | the balance of concerns. First, all hardware is commodity, and so Days 299 | 0 and 1 configurations become fully automated (and Day -1 is minimized 300 | since all devices are identical).\ [#]_ Second, Day 2 operations 301 | become a much more sophisticated process. This is because 302 | software-based systems are more agile, making functional upgrades more 303 | commonplace. This focus on *feature velocity* is one of the inherent 304 | values of cloud-based systems, but not surprisingly, it brings its own 305 | set of challenges to management. 306 | 307 | .. [#] Colloquially, this is sometimes referred to as a shift from 308 | taking care of pets to one of herding cattle. 309 | 310 | This book addresses those management challenges, which brings us to a 311 | final note about two words we use frequently: *Operating* and 312 | *Operationalizing*. Being able to operate a cloud is the end goal and 313 | implies an ongoing process, whereas to operationalize a cloud implies 314 | the process of bringing a set of hardware and software components into 315 | a state that makes it easy to sustain their ongoing operation. This 316 | distinction is relevant because operationalizing a cloud is not a 317 | one-time proposition, but rather, an essential aspect of day-to-day 318 | operations. Being rapidly evolvable is one of the cloud's most 319 | important features, making continual operationalization a key 320 | requirement for operating an edge cloud. 321 | 322 | 1.2 Disaggregation 323 | ------------------ 324 | 325 | To fully understand the challenge of operating a cloud, we have 326 | to start with the underlying building blocks: a collection of 327 | software-based microservices running on commodity hardware. These 328 | building blocks are the consequence of having *disaggregated* the 329 | bundled and purpose-built network appliances that came before. 330 | From the management perspective, it is helpful to identify what 331 | becomes easier and what becomes harder when you make this 332 | transition. This is both the challenge and the opportunity of 333 | disaggregation. 334 | 335 | Broadly speaking, disaggregation is the process of breaking large 336 | bundled components into a set of smaller constituent parts. SDN is one 337 | example of disaggregation—it decouples the network’s control and data 338 | planes, with the former running as a cloud service and the latter 339 | running in commodity switches. The microservice architecture is 340 | another example of disaggregation—it breaks monolithic cloud 341 | applications into a mesh of single-function components. Disaggregation 342 | is widely viewed as an essential step in accelerating feature velocity. 343 | This is the opportunity side of the story, which is one of the 344 | widely-claimed benefits of cloud native application architectures. 
A 345 | useful, if opinionated, view on such architectures is the Twelve-Factor 346 | App. 347 | 348 | .. _reading_disaggregate: 349 | .. admonition:: Further Reading 350 | 351 | Adam Wiggins. `The Twelve-Factor App. 352 | `__. 353 | 354 | The challenge side of the story is that there are many more moving 355 | parts that have to be integrated, coordinated, and managed. Circling 356 | back to terminology, Orchestration and Lifecycle Management become the 357 | dominant issues because (a) many smaller parts have to be assembled, 358 | and (b) these individual parts are expected to change more 359 | frequently. Much of this book focuses on these two issues. 360 | 361 | The good news is that the industry seems to have converged on 362 | *containers* as the common representation for “component packaging” 363 | and Kubernetes as the first-level *container orchestrator*. (We say 364 | “first-level” because Kubernetes is not sufficient by itself.) This 365 | foundation, in turn, makes many of the other challenges more 366 | manageable: 367 | 368 | * Monitoring and other telemetry-related mechanisms are themselves 369 | realized as a set of container-based microservices, deployed within 370 | the cloud they observe. 371 | 372 | * ISSU becomes more tractable because the microservice architecture 373 | encourages stateless components, with persistent state isolated in a 374 | single function-agnostic storage service, such as a key-value store. 375 | 376 | * Zero-Touch Provisioning is more tractable because the hardware is 377 | commodity, and hence, (nearly) identical. This also means the vast 378 | majority of configuration involves initializing software parameters, 379 | which is more readily automated. 380 | 381 | * Cloud native implies a set of best practices for addressing many of 382 | the FCAPS requirements, especially as they relate to availability 383 | and performance, both of which are achieved through horizontal 384 | scaling. Secure communication is also typically built into cloud RPC 385 | mechanisms. 386 | 387 | Another way to say this is that by rearchitecting bundled appliances 388 | and devices as horizontally scalable microservices running on 389 | commodity hardware, what used to be a set of one-off O&M problems are 390 | now solved by widely applied best practices from distributed systems, 391 | which have in turn been codified in state-of-the-art cloud management 392 | frameworks (like Kubernetes). This leaves us with the problem of (a) 393 | provisioning commodity hardware, (b) orchestrating the container 394 | building blocks, (c) deploying microservices to collect and archive 395 | monitoring data in a uniform way, and (d) continually integrating and 396 | deploying individual microservices as they evolve over time. 397 | 398 | Finally, because a cloud is infinitely programmable, the system being 399 | managed has the potential to change substantially over time.\ [#]_ 400 | This means that the cloud management system must itself be easily 401 | extended to support new features (as well as the refactoring of 402 | existing features). This is accomplished in part by implementing the 403 | cloud management system as a cloud service, which means we will see a 404 | fair amount of recursive dependencies throughout this book. It also 405 | points to taking advantage of declarative specifications of how all 406 | the disaggregated pieces fit together. 
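To give a flavor of what such a declarative specification might look like, consider the following hypothetical fragment (the component names, fields, and chart references are purely illustrative, not an actual Aether artifact), which declares the subsystems that make up a deployment and where each one runs::

   # Hypothetical declarative spec tying the disaggregated pieces together
   components:
     - name: sd-core                 # 5G control and user plane
       chart: aether/sd-core         # illustrative Helm chart reference
       clusters: [edge]
     - name: monitoring
       chart: prometheus-community/kube-prometheus-stack
       clusters: [edge, central]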
These specifications can then 407 | be used to generate elements of the management system, rather than 408 | having to manually recode them. This is a subtle issue we will return 409 | to in later chapters, but ultimately, we want to be able to 410 | auto-configure the subsystem responsible for auto-configuring the rest 411 | of the system. 412 | 413 | .. [#] For example, compare the two services Amazon offered ten years 414 | ago (EC2 and S3) with the well over 100 services available on 415 | the AWS console today (not counting the marketplace of 416 | partner-provided services). 417 | 418 | 419 | 1.3 Cloud Technology 420 | -------------------- 421 | 422 | Being able to operationalize a cloud starts with the building blocks 423 | used to construct the cloud in the first place. This section 424 | summarizes the available technology, with the goal of identifying the 425 | baseline capabilities of the underlying system. This baseline is then 426 | assumed by the collection of management-related subsystems described 427 | throughout this book. 428 | 429 | Before identifying these building blocks, we need to acknowledge that 430 | we are venturing into a gray area, having to do with what you consider 431 | to be “part of the platform being managed” versus “part of the 432 | subsystem that manages the platform.” To further complicate matters, 433 | where you draw the line shifts over time as technology matures and 434 | becomes ubiquitous. 435 | 436 | For example, if you start with the premise that a cloud hosts a set of 437 | containers, then your management layer would be responsible for 438 | detecting and restarting failed containers. On the other hand, if you 439 | assume containers are resilient (i.e., able to auto-recover), then the 440 | management layer would not need to include that functionality 441 | (although it probably still needs to detect when the auto-recovery 442 | mechanism fails and correct for that). This is not a unique 443 | situation—complex systems often include mechanisms that address 444 | problems at multiple levels. For the purpose of this book, we just 445 | need to decide on a line that separates “technology that is assumed” 446 | from “problems that remain and how we address them.” The following 447 | identifies the technology we assume. 448 | 449 | 1.3.1 Hardware Platform 450 | ~~~~~~~~~~~~~~~~~~~~~~~ 451 | 452 | The assumed hardware building blocks are straightforward. We start 453 | with bare-metal servers and switches, built using merchant silicon 454 | chips. These might, for example, be ARM or x86 processor chips and 455 | Tomahawk or Tofino switching chips, respectively. The bare-metal boxes 456 | also include a bootstrap mechanism (e.g., BIOS for servers and ONIE 457 | for switches), and a remote device management interface (e.g., IPMI or 458 | Redfish). 459 | 460 | .. _reading_redfish: 461 | .. admonition:: Further Reading 462 | 463 | DMTF. `Redfish 464 | `__. 465 | 466 | A physical cloud cluster is then constructed with the hardware 467 | building blocks arranged as shown in :numref:`Figure %s `: one 468 | or more racks of servers connected by a leaf-spine switching 469 | fabric. The servers are shown above the switching fabric to emphasize 470 | that software running on the servers controls the switches. 471 | 472 | .. _fig-hw: 473 | .. 
figure:: figures/Slide1.png 474 | :width: 400px 475 | :align: center 476 | 477 | Example building block components used to construct a cloud, 478 | including commodity servers and switches, interconnected by a 479 | leaf-spine switching fabric. 480 | 481 | :numref:`Figure %s ` also includes the assumed low-level 482 | software components, which we describe next. Collectively, all the 483 | hardware and software components shown in the figure form the 484 | *platform*. Where we draw the line between what's *in the platform* 485 | and what runs *on top of the platform*, and why it is important, will 486 | become clear in later chapters. The summary is that one mechanism is 487 | responsible for bringing up the platform and preparing it to host 488 | workloads, and a different mechanism is responsible for managing the 489 | various workloads that are deployed on that platform. 490 | 491 | 492 | 1.3.2 Software Building Blocks 493 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 494 | 495 | We assume four foundational software technologies, all running on the 496 | commodity processors in the cluster: 497 | 498 | 1. Linux provides isolation for running container workloads. 499 | 500 | 2. Docker containers package software functionality. 501 | 502 | 3. Kubernetes instantiates and interconnects containers. 503 | 504 | 4. Helm charts specify how collections of related containers are 505 | interconnected to build applications. 506 | 507 | These are all well known and ubiquitous, and so we only summarize them 508 | here. Links to related information for anyone who is not familiar 509 | with them (including excellent hands-on tutorials for the three 510 | container-related building blocks) are given below. 511 | 512 | Linux is the OS that runs on the bare metal systems. It provides 513 | low-level APIs that container runtime systems use to implement 514 | isolation, including *namespaces* to isolate filesystem and network 515 | access, and *cgroups* to limit memory and processor usage. 516 | 517 | Docker is a container runtime that leverages OS isolation APIs to 518 | instantiate and run multiple containers, each of which is an instance 519 | defined by a Docker image. Docker images are most frequently built 520 | using a Dockerfile, which uses a layering approach that allows sharing 521 | and building customized images on top of base images. A final image 522 | for a particular task incorporates all dependencies required by the 523 | software that is to run in the container, resulting in a container 524 | image that is portable across servers, depending only on the kernel 525 | and Docker runtime. We also assume one or more image artifact 526 | repositories of Docker containers that we will want to deploy in our 527 | cloud, of which ``__ is the best known 528 | example. 529 | 530 | .. _reading_docker: 531 | .. admonition:: Further Reading 532 | 533 | `Docker Tutorial 534 | `__. 535 | 536 | Kubernetes is a container management system. It provides a 537 | programmatic interface for scaling container instances up and down, 538 | allocating server resources to them, setting up virtual networks to 539 | interconnect those instances, and opening service ports that external 540 | clients can use to access those instances. Behind the scenes, 541 | Kubernetes monitors the liveness of those containers, and 542 | automatically restarts any that have failed. 
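As a minimal sketch of what such an instruction looks like (the microservice name and container image are hypothetical), a Kubernetes *Deployment* that asks for three replicas can be written as::

   apiVersion: apps/v1
   kind: Deployment
   metadata:
     name: microservice-x            # hypothetical microservice
   spec:
     replicas: 3                     # keep three instances running
     selector:
       matchLabels:
         app: microservice-x
     template:
       metadata:
         labels:
           app: microservice-x
       spec:
         containers:
           - name: microservice-x
             image: registry.example.com/microservice-x:1.0   # hypothetical image
             ports:
               - containerPort: 8080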
In other words, if you 543 | instruct Kubernetes to spin up three instances of microservice X, 544 | Kubernetes will do its best to keep three instances of the container 545 | that implements X running at all times. 546 | 547 | Kubernetes also provides mechanisms that can be used to configure 548 | microservices when they start up, including *ConfigMaps*, *Secrets*, 549 | and *Operators*. Because of the role they play in cloud management, we 550 | discuss these mechanisms in more detail as they are introduced in 551 | later chapters. 552 | 553 | .. _reading_k8s: 554 | .. admonition:: Further Reading 555 | 556 | `Kubernetes Tutorial 557 | `__. 558 | 559 | Helm is a configuration set manager that runs on top of Kubernetes. It issues 560 | calls against the Kubernetes API according to an operator-provided 561 | specification, known as a *Helm Chart*. It is now common practice for 562 | cloud applications built from a set of microservices to publish a Helm 563 | chart that defines how the application is to be deployed on a 564 | Kubernetes cluster. See ``__ for a collection of 565 | publicly available Helm Charts. 566 | 567 | .. _reading_helm: 568 | .. admonition:: Further Reading 569 | 570 | `Helm Tutorial 571 | `__. 572 | 573 | The cloud management software described in this book is available in 574 | the form of a set of Docker containers, plus the associated Helm 575 | Charts that specify how they are to be deployed in a Kubernetes 576 | cluster. Overall, we make use of over 20 such open source software 577 | packages in the chapters that follow. Our goal is to show how all 578 | these open building blocks can be assembled into a comprehensive cloud 579 | management platform. We describe each tool in enough detail to 580 | appreciate how all the parts fit together—providing end-to-end 581 | coverage by connecting all the dots—plus links to full documentation 582 | for those who want to dig deeper into the details. 583 | 584 | .. 585 | List: NexBox, Ansible, Netplan, Terraform, Rancher, Fleet, 586 | Prometheus, Grafana, AlertManager, Jenkins, Robot, Selenium, 587 | onos-config, Atomix, OPF, Kibana, Logstash, Elasticsearch, 588 | Kubernetes, Helm, Docker (21) 589 | 590 | 1.3.3 Switching Fabric 591 | ~~~~~~~~~~~~~~~~~~~~~~ 592 | 593 | We assume the cloud is constructed using an SDN-based switching 594 | fabric, with a disaggregated control plane running in the same cloud 595 | as the fabric interconnects. For the purpose of this book, we assume 596 | the following SDN software stack: 597 | 598 | * A Network OS hosts a set of control applications, including a 599 | control application that manages the leaf-spine switching fabric. We 600 | use ONOS as an open source exemplar Network OS. ONOS, in turn, hosts 601 | the SD-Fabric control app. 602 | 603 | * A Switch OS runs on each switch, providing a northbound gNMI and 604 | gNOI interface through which the Network OS controls and configures 605 | each switch. We use Stratum as an open source exemplar Switch OS. 606 | 607 | Building a cloud using an SDN-based switching fabric is a best 608 | practice adopted by hyperscaler cloud providers. Their solutions 609 | remain proprietary, so we use ONOS and Stratum as open source 610 | examples. It is noteworthy that ONOS and Stratum are both packaged as 611 | Docker containers, and so can be orchestrated (on *both* servers and 612 | switches) by Kubernetes and Helm.\ [#]_ 613 | 614 | .. 
[#] Switches often include a commodity processor, typically running 615 | Linux and hosting control software, in addition to any 616 | switching chip that implements the data plane. Stratum runs on 617 | this processor, and exports a northbound API that ONOS uses to 618 | configure and control the switch. 619 | 620 | 1.3.4 Repositories 621 | ~~~~~~~~~~~~~~~~~~ 622 | 623 | For completeness, we need to mention that nearly every mechanism 624 | described in this book takes advantage of cloud-hosted repositories, 625 | such as GitHub (for code), DockerHub (for Docker images), and 626 | ArtifactHub (for Helm charts). We also assume complementary systems 627 | like Gerrit, which layer a code-review mechanism on top of a Git 628 | repository, but having direct experience with Gerrit is not critical 629 | to understanding the material. 630 | 631 | .. _reading_github: 632 | .. admonition:: Further Reading 633 | 634 | `GitHub Tutorial 635 | `__. 636 | 637 | `Gerrit Code Review 638 | `__. 639 | 640 | 641 | 642 | 1.3.5 Other Options 643 | ~~~~~~~~~~~~~~~~~~~ 644 | 645 | Just as important as what building blocks we take for granted are the 646 | technologies we do not include. We discuss three here. 647 | 648 | First, you might have expected Service Mesh frameworks like Istio or 649 | Linkerd to be included. While it is true that anyone running 650 | applications on top of Kubernetes might decide to use Istio or Linkerd 651 | to help do that job—and this includes us, since much of the management 652 | system described in this book is implemented as a set of 653 | microservices—we happen to not take that approach. This is primarily 654 | an engineering choice: Service Meshes provide more features than we 655 | need, and correspondingly, we are able to realize the necessary 656 | functionality using more narrowly focused mechanisms. There is also a 657 | pedagogical reason: The fine-grained components we use are more 658 | consistent with our goal of identifying the elemental pieces of 659 | operations and management, rather than having those components bundled 660 | in a comprehensive package. We do, however, return to the role of 661 | service meshes in our discussion of observability in Chapter 6. 662 | 663 | .. sidebar:: What's the Master Plan? 664 | 665 | *There is a general issue of how one makes engineering choices about 666 | the combination of software packages to use in a cloud-based system 667 | like the one this book describes. Ignoring the plethora of commercial 668 | offerings, just the number of open source projects at the Linux 669 | Foundation and the Apache Foundation available to help you build and 670 | operate a cloud is (by our count) approaching 100. These projects 671 | are largely independent, and in many cases, competing for mindshare. 672 | This results in significant overlap in functionality, with any Venn 673 | diagram you try to draw constantly shifting over time as projects 674 | add and deprecate features.* 675 | 676 | *This is all to say, there is no master plan for what a cloud 677 | management stack should look like. If you start with component X as 678 | the centerpiece of your approach—perhaps because it solves your most 679 | immediate problem—you will end up adding dozens of other components 680 | over time to fully complete the system. Moreover, the end result 681 | will likely look different from the system someone else constructs 682 | starting with component Y. 
There simply is no consensus framework 683 | for which you get to select a component from column A, a second 684 | complementary component from column B, and so on. This is also true 685 | for the Aether managed service we use as an exemplar.* 686 | 687 | *This makes it all the more important that we take a first 688 | principles approach, which starts by identifying the set of 689 | requirements and exploring the design space. Only as a final step do 690 | we select an existing software component. This approach naturally 691 | results in an end-to-end solution that assembles many smaller 692 | components, and tends to avoid bundled/multi-faceted solutions. This 693 | does not inoculate us against having to evolve the system over time, 694 | but it does help to approach the topic with visibility into the full 695 | scope and complexity of the design space. And even if one ends up 696 | adopting a bundled solution, understanding all the trade-offs being 697 | made under the covers will help to make a more informed decision.* 698 | 699 | Second, we assume a container-based cloud platform. An alternative 700 | would have been VM-based. The main reason for this choice is that 701 | containers are rapidly becoming the de facto way to deploy scalable 702 | and highly available functionality, and operationalizing such 703 | functionality in enterprises is our primary use case. Containers are 704 | sometimes deployed inside of VMs (rather than directly on physical 705 | machines), but in that case, the VMs can be viewed as part of the 706 | underlying infrastructure (rather than a service that is offered to 707 | users). Another way of saying this is that this book focuses on how to 708 | operationalize a Platform-as-a-Service (PaaS) rather than an 709 | Infrastructure-as-a-Service (IaaS), although later chapters will 710 | describe how to introduce VMs as an optional way to provision the 711 | underlying infrastructure for that PaaS. 712 | 713 | Finally, the Aether edge cloud we use as an example is similar to many 714 | other cloud platforms being built to support on-prem deployments. 715 | The dominant use case shifts over time—with Artificial Intelligence 716 | (AI) recently overtaking Internet-of-Things (IoT) as the most 717 | compelling justification for edge clouds—but the operational 718 | challenge remains the same. For example, the *Open Edge Platform*, recently 719 | open sourced by Intel, includes example AI applications and a 720 | collection of AI libraries, but also an *Edge Management Framework* 721 | that mirrors the one described in this book. It starts with a Kubernetes 722 | foundation, and includes tools for provisioning edge servers, 723 | orchestrating edge clusters using those servers, lifecycle managing 724 | edge applications, and enabling observability. Many of the engineering 725 | choices are the same as in Aether (some are different), but the 726 | important takeaway is that Kubernetes-based edge clouds are quickly 727 | becoming commonplace. That's the reason they are such a good case 728 | study. 729 | 730 | .. admonition:: Further Reading 731 | 732 | `Open Edge Platform `__. 733 | 734 | `Edge Management Framework `__. 735 | 736 | 1.4 Future of the Sysadmin 737 | -------------------------- 738 | 739 | System administrators have been responsible for operating enterprise 740 | networks since the first file servers, client workstations, and LANs 741 | were deployed over 30 years ago.
Throughout that history, a robust 742 | vendor ecosystem has introduced an increasingly diverse set of network 743 | appliances, compounding the challenge of the sysadmin’s job. The 744 | introduction of virtualization technology led to server consolidation, 745 | but did not greatly reduce the management overhead. This is because each 746 | virtual appliance remains in a management silo. 747 | 748 | Cloud providers, because of the scale of the systems they build, 749 | cannot survive with operational silos, and so they introduced 750 | increasingly sophisticated cloud orchestration 751 | technologies. Kubernetes and Helm are two high-impact examples. These 752 | cloud best practices are now available to enterprises as well, but 753 | they are often bundled as a managed service, with the cloud provider 754 | playing an ever-greater role in operating the enterprise’s services. 755 | Outsourcing portions of the IT responsibility to a cloud provider is an 756 | attractive value proposition for many enterprises, but comes with the 757 | risk of increased dependence on a single provider. This equation is 758 | complicated by the increased likelihood that Mobile Network Operators 759 | (MNOs) also participate in the rollout of private 5G connectivity 760 | within the enterprise, deployed as yet another cloud service. 761 | 762 | The approach this book takes is to explore a best-of-both-worlds 763 | opportunity. It does this by walking you through the collection of 764 | subsystems, and associated management processes, required to 765 | operationalize an on-premises cloud, and then provide on-going support for 766 | that cloud and the services it hosts (including 5G connectivity). Our 767 | hope is that understanding what’s under the covers of cloud-managed 768 | services will help enterprises better share responsibility for 769 | managing their IT infrastructure with cloud providers, and potentially with 770 | MNOs. 771 | -------------------------------------------------------------------------------- /latest.rst: -------------------------------------------------------------------------------- 1 | .. role:: pop 2 | 3 | :pop:`Read The Latest!` 4 | ======================== 5 | 6 | `Systems Approach Newsletter: `__ Stay 7 | up to date with the latest developments by subscribing to the 8 | `Systems Approach Newsletter 9 | `__, where the authors 10 | connect the concepts and lessons in this book to what's happening in 11 | the Internet today. 12 | 13 | `Book Series: `__ Also check out 14 | our companion books that cover emerging topics in more depth. 15 | 16 | * `Private 5G: A Systems Approach `__ 17 | 18 | * `Software-Defined Networks: A Systems Approach `__ 19 | 20 | * `TCP Congestion Control: A Systems Approach `__ 21 | 22 | .. * `Edge Cloud Operations: A Systems Approach `__ 23 | -------------------------------------------------------------------------------- /monitor.rst: -------------------------------------------------------------------------------- 1 | Chapter 6: Monitoring and Telemetry 2 | ==================================== 3 | 4 | Collecting telemetry data for a running system is an essential 5 | function of the management platform. It enables operators to monitor 6 | system behavior, evaluate performance, make informed provisioning 7 | decisions, respond to failures, identify attacks, and diagnose 8 | problems. 
This chapter focuses on three types of telemetry 9 | data—*metrics*, *logs*, and *traces*\—along with exemplar open source 10 | software stacks available to help collect, store, and act upon each of 11 | them. 12 | 13 | Metrics are quantitative data about a system. These include common 14 | performance metrics such as link bandwidth, CPU utilization, and memory 15 | usage, but also binary results corresponding to "up" and "down", as 16 | well as other state variables that can be encoded numerically. These 17 | values are produced and collected periodically (e.g., every few 18 | seconds), either by reading a counter, or by executing a runtime test 19 | that returns a value. These metrics can be associated with physical 20 | resources such as servers and switches, virtual resources such as VMs and 21 | containers, or high-level abstractions such as the *Connectivity Service* 22 | described in Section 5.3. Given these many possible sources of data, 23 | the job of the metrics monitoring stack is to collect, archive, 24 | visualize, and optionally analyze this data. 25 | 26 | Logs are the qualitative data that is generated whenever a noteworthy 27 | event occurs. This information can be used to identify problematic 28 | operating conditions (i.e., it may trigger an alert), but more 29 | commonly, it is used to troubleshoot problems after they have been 30 | detected. Various system components—all the way from the low-level OS 31 | kernel to high-level cloud services—write messages that adhere to a 32 | well-defined format to the log. These messages include a timestamp, 33 | which makes it possible for the logging stack to parse and correlate 34 | messages from different components. 35 | 36 | Traces are a record of causal relationships (e.g., Service A calls 37 | Service B) resulting from user-initiated transactions or jobs. They 38 | are related to logs, but provide more specialized information about 39 | the context in which different events happen. Tracing is 40 | well-understood in a single program, where an execution trace is 41 | commonly recorded as an in-memory call stack, but traces are 42 | inherently distributed across a graph of network-connected 43 | microservices in a cloud setting. This makes the problem challenging, 44 | but also critically important because it is often the case that the 45 | only way to understand time-dependent phenomena—such as why a 46 | particular resource is overloaded—is to understand how multiple 47 | independent workflows interact with each other. 48 | 49 | Taking a step back from the three types of telemetry data, it is 50 | helpful to have a broad understanding of the design space, and to that 51 | end, we make four observations. 52 | 53 | First, there are two general use cases for telemetry data, which we 54 | broadly characterize as "monitoring" and "troubleshooting". We use 55 | these terms in the most general way to represent (a) proactively 56 | watching for warning signs of trouble (attacks, bugs, failures, 57 | overload conditions) in a steady-state system; versus (b) reactively 58 | taking a closer look to determine the root cause and resolve an issue 59 | (fix a bug, optimize performance, provision more resources, defend 60 | against an attack), once alerted to a potential problem. 
This 61 | distinction is important because the former (monitoring) needs to 62 | incur minimal overhead and require minimal human involvement, while 63 | the latter (troubleshooting) can be more invasive/expensive and 64 | typically involves some level of human expertise. This is not a 65 | perfect distinction, with plenty of operator activity happening in a 66 | gray area, but being aware of the cost/benefit trade-offs of the 67 | available tools is an important starting point. 68 | 69 | Second, the more aspects of monitoring and troubleshooting that can be 70 | automated, the better. This starts with alerts that automatically 71 | detect potential problems; typically includes dashboards that make it 72 | easy for humans to see patterns and drill down for relevant details 73 | across all three types of data; increasingly leverages Machine 74 | Learning and statistical analysis to identify deeper connections 75 | that are not obvious to human operators; and ultimately supports 76 | closed-loop control where the automated tool not only detects problems 77 | but is also able to issue corrective control directives. For the 78 | purpose of this chapter, we give examples of the first two (alerts and 79 | dashboards), and declare the latter two (analytics and close-loop 80 | control) as out of scope (but likely running as applications that 81 | consume the telemetry data outlined in the sections that follow). 82 | 83 | Third, when viewed from the perspective of lifecycle management, 84 | monitoring and troubleshooting are just a continuation of testing, 85 | except under production workloads rather than test workloads. In fact, 86 | the same set of tools can be used on either side of the 87 | development-vs-production boundary. For example, as anyone who has 88 | profiled a program will recognize and appreciate, tracing is an 89 | extremely valuable tool during development—both to track down bugs and 90 | to tune performance. Similarly, artificial end-to-end tests can 91 | provide value in production systems by triggering early warning 92 | alerts. This can be especially helpful when dealing with problematic 93 | failure modes. 94 | 95 | Finally, because the metrics, logs, and traces collected by the 96 | various subsystems are timestamped, it is possible to establish 97 | correlations among them, which is helpful when debugging a problem or 98 | deciding whether or not an alert is warranted. We give examples of how 99 | such telemetry-wide functions are implemented in practice today, and 100 | discuss the future of generating and using telemetry data, in the 101 | final two sections of this chapter. 102 | 103 | 6.1 Metrics and Alerts 104 | ------------------------------- 105 | 106 | Starting with metrics, a popular open source monitoring stack uses 107 | Prometheus to collect and store platform and service metrics, Grafana 108 | to visualize metrics over time, and Alertmanager to notify the 109 | operations team of events that require attention. In Aether, 110 | Prometheus and Alertmanager are instantiated on each edge cluster, 111 | with a single instantiation of Grafana running centrally in the 112 | cloud. More information about each tool is available online, so we 113 | focus more narrowly on (1) how individual Aether components "opt into" 114 | this stack, and (2) how the stack can be customized in 115 | service-specific ways. 116 | 117 | .. _reading_monitor: 118 | .. admonition:: Further Reading 119 | 120 | `Prometheus `__. 121 | 122 | `Grafana 123 | `__. 124 | 125 | `Alertmanager `__. 
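To give a feel for how a component plugs into this stack (the mechanics are detailed in the next subsection), the following is a minimal sketch of a Prometheus scrape configuration; the job name, polling interval, and Exporter endpoint are hypothetical::

   scrape_configs:
     - job_name: upf                                 # hypothetical job name
       scrape_interval: 15s                          # how often to pull metrics
       static_configs:
         - targets: ["upf-exporter.omec.svc:9089"]   # hypothetical Exporter endpoint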
126 | 127 | 128 | 6.1.1 Exporting Metrics 129 | ~~~~~~~~~~~~~~~~~~~~~~~ 130 | 131 | Individual components implement a *Prometheus Exporter* to provide the 132 | current value of the component's metrics. A component's Exporter is 133 | queried via HTTP, with the corresponding metrics returned using a 134 | simple text format. Prometheus periodically scrapes the Exporter's 135 | HTTP endpoint and stores the metrics in its Time Series Database 136 | (TSDB) for querying and analysis. Many client libraries are available 137 | for instrumenting code to produce metrics in Prometheus format. If a 138 | component's metrics are available in some other format, tools are 139 | often available to convert the metrics into Prometheus format and 140 | export them. 141 | 142 | A YAML configuration file specifies the set of Exporter endpoints that 143 | Prometheus is to pull metrics from, along with the polling frequency 144 | for each endpoint. Alternatively, Kubernetes-based microservices can 145 | be extended with a *Service Monitor* Custom Resource Definition (CRD) 146 | that Prometheus then queries to learn about any Exporter endpoints the 147 | microservice has made available. 148 | 149 | In addition to component-based Exporters, every edge cluster 150 | periodically tests end-to-end connectivity (for various definitions of 151 | end-to-end). One test determines whether the 5G control plane is 152 | working (i.e., the edge site can reach the SD-Core running in the 153 | central cloud) and a second test determines whether the 5G user plane 154 | is working (i.e., UEs can reach the Internet). This is a common 155 | pattern: individual components can export accumulators and other local 156 | variables, but only a "third-party observer" can actively test 157 | external behavior, and report the results. These examples correspond 158 | to the rightmost "End-to-End Tests" shown in :numref:`Figure %s 159 | ` of Chapter 4. 160 | 161 | Finally, when a system is running across multiple edge sites, as is 162 | the case with Aether, there is a design question of whether 163 | monitoring data is stored on the edge sites and lazily pulled to the 164 | central location only when needed, or is proactively pushed to the 165 | central location as soon as it's generated. Aether employs both 166 | approaches, depending on the volume and urgency of the data being 167 | collected. By default, metrics collected by the local instantiation of 168 | Prometheus stay on the edge sites, and only query results are returned 169 | to the central location (e.g., to be displayed by Grafana as described 170 | in the next subsection). This is appropriate for metrics that are both 171 | high-volume and seldom viewed. One exception is the end-to-end tests 172 | described in the previous paragraph. These results are immediately 173 | pushed to the central site (bypassing the local Prometheus instance), because 174 | they are low-volume and may require immediate attention. 175 | 176 | 6.1.2 Creating Dashboards 177 | ~~~~~~~~~~~~~~~~~~~~~~~~~ 178 | 179 | The metrics collected by Prometheus are visualized using Grafana 180 | dashboards. In Aether, this means the Grafana instance running as 181 | part of AMP in the central cloud sends queries to some combination of 182 | the central Prometheus instance and a subset of the Prometheus instances 183 | running on edge clusters. For example, :numref:`Figure %s 184 | ` shows the summary dashboard for a collection of Aether 185 | edge sites. 186 | 187 | .. _fig-ace_dash: 188 | ..
figure:: figures/ace_dash.png 189 | :width: 600px 190 | :align: center 191 | 192 | Central dashboard showing status of Aether edge deployments. 193 | 194 | Grafana comes with a set of predefined dashboards for the most common 195 | set of metrics—in particular, those associated with physical servers 196 | and virtual resources such as containers—but it can also be customized to 197 | include dashboards for service-level metrics and other 198 | deployment-specific information (e.g., per-enterprise in Aether). For 199 | example, :numref:`Figure %s ` shows a custom dashboard 200 | for UPF (User Plane Function), the data plane packet forwarder of the 201 | SD-Core. The example shows latency and jitter metrics over the last 202 | hour at one site, with three additional collapsed panels (PFCP Sessions 203 | and Messages) at the bottom. 204 | 205 | .. _fig-upf_dash: 206 | .. figure:: figures/upf_dash.png 207 | :width: 600px 208 | :align: center 209 | 210 | Custom dashboard showing latency and jitter metrics for UPF, the 211 | packet forwarding data plane of the SD-Core component. 212 | 213 | Briefly, a dashboard is constructed from a set of *panels*, where each 214 | panel has a well-defined *type* (e.g., graph, table, gauge, heatmap) 215 | bound to a particular Prometheus *query*. New dashboards are created 216 | using the Grafana GUI, and the resulting configuration is then saved in a 217 | JSON file. This configuration file is then committed to the Config 218 | Repo, and later loaded into Grafana whenever it is restarted as part 219 | of Lifecycle Management. For example, the following code snippet 220 | shows the Prometheus query corresponding to the ``Uptime`` panel 221 | in :numref:`Figure %s `. 222 | 223 | .. literalinclude:: code/uptime.yaml 224 | 225 | Note that this expression includes variables for the site (``$edge``) 226 | and the interval over which the uptime is computed (``$__interval``). 227 | 228 | 6.1.3 Defining Alerts 229 | ~~~~~~~~~~~~~~~~~~~~~ 230 | 231 | Alerts can be triggered in Prometheus when a component metric crosses 232 | some threshold. Alertmanager is a tool that then routes the alert to 233 | one or more receivers, such as an email address or Slack channel. 234 | 235 | An alert for a particular component is defined by an *alerting rule*, 236 | an expression involving a Prometheus query, such that whenever it 237 | evaluates to true for the indicated time period, it triggers a 238 | corresponding message to be routed to a set of receivers. These rules 239 | are recorded in a YAML file that is checked into the Config Repo and 240 | loaded into Prometheus. (Alternatively, Helm Charts for individual 241 | components can define rules via *Prometheus Rule* custom resources.) 242 | For example, the following code snippet shows the Prometheus Rule for 243 | two alerts, where the ``expr`` lines correspond to the respective 244 | queries submitted to Prometheus. 245 | 246 | .. literalinclude:: code/prometheus-rule.yaml 247 | 248 | In Aether, the Alertmanager is configured to send alerts with 249 | *critical* or *warning* severity to a general set of receivers. If it 250 | is desirable to route a specific alert to a different receiver (e.g., 251 | a Slack channel used by the developers for that particular component), 252 | the Alertmanager configuration is changed accordingly. 253 | 254 | 6.2 Logging 255 | ------------------ 256 | 257 | OS programmers have been writing diagnostic messages to a *syslog* 258 | since the earliest days of Unix.
Originally collected in a local file, 259 | the syslog abstraction has been adapted to cloud environments by 260 | adding a suite of scalable services. Today, one typical open source 261 | logging stack uses Fluentd to collect (aggregate, buffer, and route) 262 | log messages written by a set of components, with Fluentbit serving as a 263 | client-side agent running in each component to help developers 264 | normalize their log messages. ElasticSearch is then used to store, 265 | search, and analyze those messages, with Kibana used to display and 266 | visualize the results. The general flow of data is shown in 267 | :numref:`Figure %s `, using the main Aether subsystems as 268 | illustrative sources of log messages. 269 | 270 | .. _fig-log: 271 | .. figure:: figures/Slide23.png 272 | :width: 450px 273 | :align: center 274 | 275 | Flow of log messages through the Logging subsystem. 276 | 277 | .. _reading_logging: 278 | .. admonition:: Further Reading 279 | 280 | `Fluentd `__. 281 | 282 | `ElasticSearch 283 | `__. 284 | 285 | `Kibana `__. 286 | 287 | 6.2.1 Common Schema 288 | ~~~~~~~~~~~~~~~~~~~ 289 | 290 | The key challenge in logging is to adopt a uniform message format 291 | across all components, a requirement that is complicated by the fact 292 | that the various components integrated in a complex system are often 293 | developed independently of each other. Fluentbit plays a role in 294 | normalizing these messages by supporting a set of filters. These 295 | filters parse "raw" log messages written by the component (an ASCII 296 | string), and output "canonical" log messages as structured JSON. There 297 | are other options, but JSON is reasonably readable as text, which 298 | still matters for debugging by humans. It is also well-supported by 299 | tooling. 300 | 301 | For example, developers for the SD-Fabric component might 302 | write a log message that looks like this: 303 | 304 | .. literalinclude:: code/log.ascii 305 | 306 | which a Fluentbit filter transforms into a structure that looks like 307 | this: 308 | 309 | .. literalinclude:: code/log.json 310 | 311 | This example is simplified, but it does serve to illustrate the basic 312 | idea. It also highlights the challenge the DevOps team faces in 313 | building the management platform, which is to decide on a meaningful 314 | set of name/value pairs for the system as a whole. In other words, 315 | they must define a common schema for these structured log messages. 316 | The *Elastic Common Schema* is a good place to start that definition, 317 | but among other things, it will be necessary to establish the accepted 318 | set of log levels, and conventions for using each level. In Aether, 319 | for example, the log levels are: FATAL, ERROR, WARNING, INFO, and 320 | DEBUG. 321 | 322 | .. _reading_ecs: 323 | .. admonition:: Further Reading 324 | 325 | `Elastic Common Schema 326 | `__. 327 | 328 | 329 | 6.2.2 Best Practices 330 | ~~~~~~~~~~~~~~~~~~~~ 331 | 332 | Establishing a shared logging platform is, of course, of little value 333 | unless all the individual components are properly instrumented to 334 | write log messages. Programming languages typically come with library 335 | support for writing log messages (e.g., Java's log4j), but that's just 336 | a start. Logging is most effective if the components adhere to the 337 | following set of best practices.
338 | 339 | * **Log shipping is handled by the platform.** Components should 340 | assume that stdout/stderr is ingested into the logging system by 341 | Fluentbit (or similar tooling), and avoid making the job more 342 | complicated by trying to route their own logs. The exception is for 343 | external services and hardware devices that are outside the 344 | management platform's control. How these systems send their logs to 345 | a log aggregator must be established as a part of the deployment 346 | process. 347 | 348 | * **File logging should be disabled.** Writing log files directly to a 349 | container's layered file system has proven to be I/O inefficient and 350 | can become a performance bottleneck. It is also generally 351 | unnecessary if the logs are being sent to stdout/stderr. 352 | Generally, logging to a file is discouraged when a component runs in 353 | a container environment. Instead, components should stream all logs 354 | to the collecting system. 355 | 356 | * **Asynchronous logging is encouraged.** Synchronous logging can 357 | become a performance bottleneck in a scaled environment. Components 358 | should write logs asynchronously. 359 | 360 | * **Timestamps should be created by the program's logger.** Components 361 | should use the selected logging library to create timestamps, with 362 | as precise a timestamp as the logging framework allows. Relying on the 363 | shipper or logging handlers can be slower, or can record the time of 364 | receipt rather than the time of the event, which may be delayed. This makes 365 | it problematic to align events across multiple services after log aggregation. 366 | 367 | * **Log levels must be changeable without interrupting service.** 368 | Components should provide a mechanism to set the log level at 369 | startup, and an API that allows the log level to be changed at 370 | runtime. Scoping the log level based on specific subsystems is a 371 | useful feature, but not required. When a component is implemented by 372 | a suite of microservices, the logging configuration need only be 373 | applied to one instance for it to apply to all instances. 374 | 375 | 6.3 Distributed Tracing 376 | ------------------------- 377 | 378 | Execution traces are the third source of telemetry data. Tracing is 379 | challenging in a cloud setting because it involves following the flow 380 | of control for each user-initiated request across multiple 381 | microservices. The good news is that instrumenting a set of 382 | microservices involves activating tracing support in the underlying 383 | language runtime system—typically in the RPC stubs—rather than asking 384 | app developers to explicitly instrument their programs. 385 | 386 | The general pattern is similar to what we've already seen with metrics 387 | and logs: the running code is instrumented to produce data that is 388 | then collected, aggregated, stored, and made available for display and 389 | analysis. The main difference is the type of data we're interested in 390 | collecting, which, for tracing, is typically the sequence of API 391 | boundary crossings from one module to another. This data gives us 392 | the information we need to reconstruct the call chain. In principle, 393 | we could leverage the logging system to support tracing—and just be 394 | diligent in logging the necessary interface-crossing 395 | information—but it is a specialized enough use case to warrant its own 396 | vocabulary, abstractions, and mechanisms.
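To see what that logging-only approach implies, the following sketch (our own illustration, not code from Aether) logs the interface crossings for a single request by hand, propagating a correlation id between two services. Reconstructing a call chain from such messages is possible, but the id propagation, timing, and eventual correlation are all left to the application, which is exactly the burden that purpose-built tracing machinery takes on.

.. code-block:: python

   import logging
   import time
   import uuid

   logging.basicConfig(format="%(asctime)s %(message)s", level=logging.INFO)
   log = logging.getLogger("trace-by-logging")

   def service_b_handle(request_id, payload):
       # Service B logs its own "span" using the id it was handed.
       log.info("request_id=%s span=ServiceB event=start", request_id)
       result = payload.upper()          # stand-in for real work
       log.info("request_id=%s span=ServiceB event=end", request_id)
       return result

   def service_a_call(payload):
       # Service A creates the correlation id and must remember to pass it on.
       request_id = str(uuid.uuid4())
       log.info("request_id=%s span=ServiceA->ServiceB event=send", request_id)
       start = time.time()
       result = service_b_handle(request_id, payload)   # stand-in for an RPC
       log.info("request_id=%s span=ServiceA->ServiceB event=recv latency=%.6f",
                request_id, time.time() - start)
       return result

   service_a_call("hello")

Tracing frameworks fold this bookkeeping into the RPC layer, along with sampling and a shared data model, which is what the rest of this section describes.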
397 | 398 | At a high level, a *trace* is a description of a transaction as it 399 | moves through the system. It consists of a sequence of *spans* (each 400 | of which represents work done within a service) interleaved with a set 401 | of *span contexts* (each of which represents the state carried across 402 | the network from one service to another). An illustrative example of a 403 | trace is shown in :numref:`Figure %s `, but abstractly, a 404 | trace is a directed graph with nodes that correspond to spans and 405 | edges that correspond to span contexts. The nodes and edges are then 406 | timestamped and annotated with relevant facts (key/value tags) about 407 | the end-to-end execution path, including when and for how long it 408 | ran. Each span also includes timestamped log messages generated while 409 | the span was executing, simplifying the process of correlating log 410 | messages with traces. 411 | 412 | .. _fig-trace: 413 | .. figure:: figures/Slide26.png 414 | :width: 500px 415 | :align: center 416 | 417 | Example trace spanning two network services. 418 | 419 | Again, as with metrics and log messages, the details are important and 420 | those details are specified by an agreed-upon data model. The 421 | OpenTelemetry project is now defining one such model, building on the 422 | earlier OpenTracing project (which was in turn influenced by the 423 | Dapper distributed tracing mechanism developed by Google). Beyond the 424 | challenge of defining a model that captures the most relevant semantic 425 | information, there is the pragmatic issue of (1) minimizing the 426 | overhead of tracing so as not to negatively impact application 427 | performance, yet (2) extracting enough information from traces so as 428 | to make collecting it worthwhile. Sampling is a widely adopted 429 | technique introduced into the data collection pipeline to manage this 430 | trade-off. One consequence of these challenges is that distributed 431 | tracing is the subject of ongoing research, and we can expect the 432 | model definitions and sampling techniques to evolve and mature in the 433 | foreseeable future. 434 | 435 | .. _reading_tracing: 436 | .. admonition:: Further Reading 437 | 438 | B. Sigelman, *et al.* `Dapper, a Large-Scale Distributed Systems 439 | Tracing Infrastructure 440 | `__. 441 | Google Technical Report. April 2010. 442 | 443 | `OpenTelemetry: High-quality, ubiquitous, and portable telemetry to 444 | enable effective observability `__. 445 | 446 | `Jaeger: End-to-End Distributed Tracing 447 | `__. 448 | 449 | With respect to mechanisms, Jaeger is a widely used open source 450 | tracing tool originally developed by Uber. (Jaeger is not included in 451 | Aether, but was utilized in a predecessor edge cloud.) Jaeger 452 | includes instrumentation of the runtime system for the language(s) 453 | used to implement an application, a collector, storage, and a query 454 | language that can be used to diagnose performance problems and do root 455 | cause analysis. 456 | 457 | 6.4 Integrated Dashboards 458 | ------------------------- 459 | 460 | The metrics, logs and traces being generated by instrumented 461 | application software make it possible to collect a wealth of data 462 | about the health of a system. But this instrumentation is only useful 463 | if the right data is displayed to the right people (those with the 464 | ability to take action) at the right time (when action needs to be 465 | taken). 
Creating useful panels and organizing them into intuitive 466 | dashboards is part of the solution, but integrating information across 467 | the subsystems of the management platform is also a requirement. 468 | 469 | Unifying all this data is the ultimate objective of ongoing efforts 470 | like the OpenTelemetry project mentioned in the previous section, but 471 | there are also opportunities to use the tools described in this 472 | chapter to better integrate data. This section highlights two 473 | general strategies. 474 | 475 | First, both Kibana and Grafana can be configured to display telemetry 476 | data from multiple sources. For example, it is straightforward to 477 | integrate both logs and traces in Kibana. This is typically done by 478 | first feeding the tracing data into ElasticSearch, which Kibana then 479 | queries. Similarly, it is useful to have a convenient way to see the 480 | log messages associated with a particular component in the context of 481 | metrics that have been collected. This is easy to accomplish because 482 | Grafana can be configured to display data from ElasticSearch just as 483 | easily as from Prometheus. Both are data sources that can be 484 | queried. This makes it possible to create a Grafana dashboard that 485 | includes a selected set of log messages, similar to the one from 486 | Aether shown in :numref:`Figure %s `. In this example, 487 | we see INFO-level messages associated with the UPF sub-component of 488 | SD-Core, which augments the UPF performance data shown in 489 | :numref:`Figure %s `. 490 | 491 | .. _fig-es_dash: 492 | .. figure:: figures/es_dash.png 493 | :width: 600px 494 | :align: center 495 | 496 | Log messages associated with the UPF element of SD-Core, displayed 497 | in a Grafana dashboard. 498 | 499 | Second, the runtime control interface described in Chapter 5 provides 500 | a means to change various parameters of a running system, but to make 501 | informed decisions about what changes (if any) need to be 502 | made, it is necessary to have access to the right data. To this end, it 503 | is ideal to have access to both the "knobs" and the "dials" on an 504 | integrated dashboard. This can be accomplished by incorporating 505 | Grafana frames in the Runtime Control GUI, which, in its simplest form, 506 | displays a set of web forms corresponding to the fields in the 507 | underlying data models. (More sophisticated control panels are 508 | certainly possible.) 509 | 510 | .. _fig-dev_group: 511 | .. figure:: figures/gui1.png 512 | :width: 600px 513 | :align: center 514 | 515 | Example control dashboard showing the set of Device Groups defined 516 | for a fictional set of Aether sites. 517 | 518 | For example, :numref:`Figure %s ` shows the set 519 | of device groups for a fictional set of Aether sites, where clicking 520 | on the "Edit" button pops up a web form that lets the enterprise admin 521 | modify the corresponding fields of the `Device-Group` model (not 522 | shown), and clicking on the "Monitor" button pops up a 523 | Grafana-generated frame similar to the one shown in :numref:`Figure %s 524 | `. In principle, this frame is tailored to show only 525 | the most relevant information associated with the selected object. 526 | 527 | .. _fig-dev_monitor: 528 | .. figure:: figures/gui2.png 529 | :width: 600px 530 | :align: center 531 | 532 | Example monitoring frame associated with a selected Device Group.
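To make the "both are data sources that can be queried" point concrete, the following sketch pulls a UPF latency metric from Prometheus and the INFO-level UPF log messages from ElasticSearch for the same one-hour window, using their standard HTTP APIs. The endpoints, metric name, index pattern, and field names are placeholders chosen for illustration; a Grafana panel issues essentially the same two kinds of queries on the operator's behalf.

.. code-block:: python

   import requests
   from datetime import datetime, timedelta, timezone

   PROM = "http://prometheus.example.com"         # placeholder endpoints
   ES = "http://elasticsearch.example.com:9200"

   end = datetime.now(timezone.utc)
   start = end - timedelta(hours=1)

   # Metric side: a range query against Prometheus (hypothetical metric name).
   metrics = requests.get(f"{PROM}/api/v1/query_range", params={
       "query": "avg(upf_packet_latency_microseconds)",
       "start": start.timestamp(),
       "end": end.timestamp(),
       "step": "60s",
   }).json()["data"]["result"]

   # Log side: a search against ElasticSearch (hypothetical index and fields).
   logs = requests.post(f"{ES}/logstash-*/_search", json={
       "query": {"bool": {"must": [
           {"match": {"component": "upf"}},
           {"match": {"level": "INFO"}},
           {"range": {"@timestamp": {"gte": start.isoformat(), "lte": end.isoformat()}}},
       ]}},
       "sort": [{"@timestamp": {"order": "asc"}}],
       "size": 100,
   }).json()["hits"]["hits"]

   print(f"{len(metrics)} metric series and {len(logs)} log entries for the last hour")

Binding one panel to the first query and an adjacent panel to the second is all it takes to put the UPF's metrics and its log messages side by side on a single dashboard.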
533 | 534 | 6.5 Observability 535 | ----------------- 536 | 537 | Knowing what telemetry data to collect, so you have exactly the right 538 | information when you need it, but doing so without negatively 539 | impacting system performance, is a difficult problem. *Observability* 540 | is a relatively new term being used to describe this general problem 541 | space, and while the term can be dismissed as the latest marketing 542 | buzzword (which it is), it can also be interpreted as another of the 543 | set of *"-ities"* that all good systems aspire to, alongside 544 | scalability, reliability, availability, security, usability, and so 545 | on. Observability is the quality of a system that makes visible the 546 | facts about its internal operation needed to make informed management 547 | and control decisions. It has become a fertile space for innovation, 548 | and so we conclude this chapter with two examples that may become 549 | commonplace in the near future. 550 | 551 | The first is *Inband Network Telemetry (INT)*, which takes advantage 552 | of programmable switching hardware to allow operators to ask new 553 | questions about how packets are being processed "in-band", as they 554 | flow through the network. This is in contrast to either depending on 555 | the predefined set of counters hardwired into fixed-function network 556 | devices, or being able to inspect just a sampled subset of packets. 557 | Because Aether uses programmable switches as the foundation for its 558 | SDN-based switching fabric, it is able to use INT as a fourth type of 559 | telemetry data, and in doing so provide qualitatively deeper insights 560 | into traffic patterns and the root causes of network failures. 561 | 562 | For example, INT has been used to measure and record queuing delay 563 | individual packets experience while traversing a sequence of switches 564 | along an end-to-end path, making it possible to detect *microbursts* 565 | (queuing delays measured over millisecond or even sub-millisecond time 566 | scales). It is even possible to correlate this information across 567 | packet flows that followed different routes, so as to determine which 568 | flows shared buffer capacity at each switch. As another example, INT 569 | has been used to record the decision making process that directed how 570 | packets are delivered, that is, which forwarding rules were applied at 571 | each switch along the end-to-end path. This opens the door to using 572 | INT to verify that the data plane is faithfully executing the 573 | forwarding behavior the network operator intends. For more information 574 | about INT, we refer the reader to our companion SDN book. 575 | 576 | .. _reading_int: 577 | .. admonition:: Further Reading 578 | 579 | L. Peterson, *et al.* `Software-Defined Networking: A Systems Approach 580 | `__. November 2021. 581 | 582 | The second is the emergence of *Service Meshes* mentioned in 583 | Chapter 1. A Service Mesh framework such as Istio provides a means to 584 | enforce fine-grained security policies and collect telemetry data in 585 | cloud native applications by injecting "observation/enforcement 586 | points" between microservices. These injection points, called 587 | *sidecars*, are typically implemented by a container that "runs 588 | alongside" the containers that implement each microservice, with all 589 | RPC calls from Service A to Service B passing through their associated 590 | sidecars. 
As shown in :numref:`Figure %s `, these sidecars 591 | then implement whatever policies the operator wants to impose on the 592 | application, sending telemetry data to a global collector and 593 | receiving security directives from a global policy engine. 594 | 595 | .. _fig-mesh: 596 | .. figure:: figures/Slide27.png 597 | :width: 300px 598 | :align: center 599 | 600 | Overview of a Service Mesh framework, with sidecars intercepting 601 | messages flowing between Services A and B. Each sidecar enforces 602 | security policy received from the central controller and sends 603 | telemetry data to the central controller. 604 | 605 | From the perspective of observability, sidecars can be programmed to 606 | record whatever information operators might want to collect, and in 607 | principle, they can even be dynamically updated as conditions warrant. 608 | This provides a general way for operators to define how the system is 609 | observed without having to rely on instrumentation developers might 610 | include in their services. The downside is that sidecars impose a 611 | nontrivial amount of overhead on inter-service communication. For that 612 | reason, alternative approaches to sidecars are gaining traction, 613 | notably Cilium, which uses eBPF (extended Berkeley Packet Filters) to 614 | implement observability, security and networking data plane features 615 | inside the kernel rather than in a sidecar. 616 | 617 | For more information about the Istio Service Mesh, we recommend 618 | Calcote and Butcher's book. The Cilium project has extensive 619 | documentation and tutorials at its web site. 620 | 621 | .. _reading_mesh: 622 | .. admonition:: Further Reading 623 | 624 | L. Calcote and Z. Butcher `Istio: Up and Running 625 | `__. October 2019. 626 | 627 | `Cilium: eBPF-based Networking, Observability, Security `__. 628 | 629 | 630 | -------------------------------------------------------------------------------- /preface.rst: -------------------------------------------------------------------------------- 1 | Preface 2 | ======= 3 | 4 | The cloud is ubiquitous. Everyone uses the cloud to either access or 5 | deliver services, but not everyone will build and operate a cloud. So 6 | why should anyone care about how to turn a pile of servers and 7 | switches into a 24/7 service delivery platform? That's what Google, 8 | Microsoft, Amazon and the other cloud providers do for us, and they do 9 | a perfectly good job of it. 10 | 11 | The answer, we believe, is that the cloud is becoming ubiquitous in 12 | another way, as distributed applications increasingly run not just in 13 | large, central datacenters but at the edge. As applications are 14 | disaggregated, the cloud is expanding from hundreds of datacenters to 15 | tens of thousands of enterprises. And while it is clear that the 16 | commodity cloud providers are eager to manage those edge clouds as a 17 | logical extension of their datacenters, they do not have a monopoly on 18 | the know-how for making that happen. 19 | 20 | At the same time edge applications are moving to the forefront, 21 | increasing importance is also being placed on *digital sovereignty*, 22 | the ability of countries and organizations to control their destiny and 23 | their data. Cloud technology is important for running today's 24 | workloads, but access to that technology does not necessarily have to 25 | be bundled with outsourcing operational control. 
26 | 27 | This book lays out a roadmap that a small team of engineers followed 28 | over the course of a year to stand up and operationalize an edge cloud 29 | and then operate it 24/7. This edge cloud spans a dozen 30 | enterprises, and hosts a non-trivial cloud native service—5G 31 | connectivity in our case, but that’s just an example. The team was 32 | able to do this by leveraging 20+ open source components, but 33 | selecting those components is just a start. There were dozens of 34 | technical decisions to make along the way, and a few thousand lines of 35 | configuration code to write. We believe this is a repeatable exercise, 36 | which we report in this book. The code for those configuration 37 | files is open source, for those who want to pursue the topic in more 38 | detail. 39 | 40 | What do we mean by an edge cloud? We're drawing a distinction between 41 | clouds run by the hyperscale cloud providers in their massive data 42 | centers, which we think of as the core, and those run by enterprises 43 | (or managed for them) at the edge. The edge is where the real, physical 44 | world meets the cloud. For example, it is the place where data from 45 | sensors is likely to be gathered and processed, and where services 46 | that need to be close to the end user for reasons of latency or 47 | bandwidth are delivered. 48 | 49 | Our roadmap may not be the right one for all circumstances, but it 50 | does shine a light on the fundamental challenges and trade-offs 51 | involved in operationalizing a cloud. As we can attest based on our 52 | experience, it’s a complicated design space with an overabundance of 53 | terminology and storylines to untangle. 54 | 55 | Intended Audience 56 | ------------------ 57 | 58 | We hope this book makes valuable reading for anyone who is trying to 59 | stand up and operationalize their own edge cloud infrastructure, but 60 | we also aim to provide useful information for at least two other broad 61 | groups. 62 | 63 | First, there will be a set of readers who need to evaluate the 64 | options available, particularly to decide between using the cloud 65 | services offered by one of the hyperscalers or building their own edge 66 | cloud (or some combination of these). We hope to demystify the 67 | landscape of edge clouds for this audience to help inform those 68 | decisions. 69 | 70 | Secondly, there will be a group of application and service 71 | developers who need to build on top of whatever cloud infrastructure 72 | their organization has chosen to use. We believe it is important for 73 | these developers to understand what goes on "under the hood" of the 74 | cloud at least at a high level, so that they can make their 75 | applications manageable and reliable. There is increasingly close 76 | interaction between developers and operators (as evidenced by the 77 | DevOps movement) and we aim to facilitate that collaboration. Topics 78 | such as monitoring and observability are particularly important for 79 | this audience. 80 | 81 | Guided Tour of Open Source 82 | -------------------------- 83 | 84 | The good news is that there is a wealth of open source components that 85 | can be assembled to help manage cloud platforms and scalable 86 | applications built on those platforms. That's also the bad news. 
With 87 | several dozen cloud-related projects available at open source 88 | consortia such as the Linux Foundation, Cloud Native Computing 89 | Foundation, and Apache Foundation, navigating the project space is one 90 | of the biggest challenges we faced in putting together a cloud 91 | management platform. This is in large part because these projects are 92 | competing for mindshare, with both significant overlap in the 93 | functionality they offer and dependencies on each other. 94 | 95 | One way to read this book is as a guided tour of the open source 96 | landscape for cloud control and management. And in that spirit, we do 97 | not replicate the excellent documentation those projects already 98 | provide, but instead include links to project-specific documentation 99 | (which often includes tutorials that we encourage you to try). We also 100 | include snippets of code from those projects, but these examples are 101 | chosen to help solidify the main points we're trying to make about the 102 | management platform as a whole; they should not be interpreted as an 103 | attempt to document the inner working of the individual projects. Our 104 | goal is to explain how the various puzzle pieces fit together to build 105 | an end-to-end management system, and in doing so, identify both 106 | various tools that help and the hard problems that no amount of 107 | tooling can eliminate. 108 | 109 | It should come as no surprise that there are challenging technical 110 | issues to address (despite marketing claims to the contrary). After 111 | all, how to operationalize a computing system is a question that’s as 112 | old as the field of *Operating Systems*. Operationalizing a cloud is 113 | just today’s version of that fundamental problem, which has become all 114 | the more interesting as we move up the stack, from managing *devices* 115 | to managing *services*. This topic is both timely and 116 | foundational. 117 | 118 | Acknowledgements 119 | ------------------ 120 | 121 | *Aether*, the example edge cloud this book uses to illustrate how to 122 | operationalize a cloud, was built by the Open Networking Foundation 123 | (ONF) engineering team and the open source community that worked with 124 | them. We acknowledge their contributions, with a special thank-you to 125 | Hyunsun Moon, Sean Condon, and HungWei Chiu for their significant 126 | contributions to Aether's control and management platform, and to Oguz 127 | Sunay for his influence on Aether's overall design. Suchitra Vemuri's 128 | insights into testing and quality assurance were also invaluable. 129 | 130 | The ONF is no longer active, but Aether continues as an open source 131 | project of the Linux Foundation. Visit https://aetherproject.org to 132 | learn about the ongoing project. We will also happily accept feedback 133 | to this book. Please send us your comments using the `Issues Link 134 | `__, or submit a Pull 135 | Request with suggested changes. 136 | 137 | | Larry Peterson, Scott Baker, Andy Bavier, Zack Williams, and Bruce Davie 138 | | June 2025 139 | 140 | -------------------------------------------------------------------------------- /print.rst: -------------------------------------------------------------------------------- 1 | .. role:: pop 2 | 3 | :pop:`Print Copies` 4 | =========================== 5 | 6 | We make all books in the *Systems Approach* series available as both 7 | print and e-books. 
This book is available via Amazon: `Edge Cloud Operations: A Systems Approach `__ 8 | 9 | `Book Series: `__ Also check out 10 | our companion books that cover networking and emerging topics in more depth. 11 | 12 | * `Computer Networks: A Systems Approach `__ 13 | 14 | * `Private 5G: A Systems Approach `__ 15 | 16 | * `Software-Defined Networks: A Systems Approach `__ 17 | 18 | * `TCP Congestion Control: A Systems Approach `__ 19 | 20 | .. * `Edge Cloud Operations: A Systems Approach `__ 21 | 22 | As participants in the Amazon Associate program we may earn income from qualifying purchases using the links above. 23 | -------------------------------------------------------------------------------- /provision.rst: -------------------------------------------------------------------------------- 1 | Chapter 3: Resource Provisioning 2 | ================================= 3 | 4 | Resource Provisioning is the process of bringing virtual and physical 5 | resources online. It has both a hands-on component (racking and 6 | connecting devices) and a bootstrap component (configuring how the 7 | resources boot into a "ready" state). Resource Provisioning happens 8 | when a cloud deployment is first installed—i.e., an initial set of 9 | resources are provisioned—but also incrementally over time as new 10 | resources are added, obsolete resources are removed, and out-of-date 11 | resources are upgraded. 12 | 13 | The goal of Resource Provisioning is to be zero-touch, which is 14 | impossible for hardware resources because it includes an intrinsically 15 | manual step. (We take up the issue of provisioning virtual resources 16 | in a moment.) Realistically, the goal is to minimize the number and 17 | complexity of configuration steps required beyond physically 18 | connecting the device, keeping in mind that we are starting with 19 | commodity hardware received directly from a vendor, and not a 20 | plug-and-play appliance that has already been prepped. 21 | 22 | When a cloud is built from virtual resources (e.g., VMs instantiated 23 | on a commercial cloud) the "rack and connect" step is carried out by a 24 | sequence of API calls rather than a hands-on technician. Of course, 25 | we want to automate the sequence of calls needed to activate virtual 26 | infrastructure, which has inspired an approach known as 27 | *Infrastructure-as-Code*, a special case of the 28 | *Configuration-as-Code* concept introduced in Chapter 2. The general 29 | idea is to document, in a declarative format that can be "executed", 30 | exactly what our infrastructure is to look like; how it is to be 31 | configured. Aether uses Terraform as its approach to 32 | Infrastructure-as-Code. 33 | 34 | When a cloud is built from a combination of virtual and physical 35 | resources, as is the case for a hybrid cloud like Aether, we need a 36 | seamless way to accommodate both. To this end, our approach is to 37 | first overlay a *logical structure* on top of hardware resources, 38 | making them roughly equivalent to the virtual resources we get from a 39 | commercial cloud provider. This results in a hybrid scenario similar 40 | to the one shown in :numref:`Figure %s `. NetBox is 41 | our open source solution for layering this logical structure on top of 42 | physical hardware. NetBox also helps us address the requirement of 43 | tracking physical inventory. 44 | 45 | .. _fig-infra: 46 | .. 
figure:: figures/Slide19.png 47 | :width: 450px 48 | :align: center 49 | 50 | Resource Provisioning in a hybrid cloud that includes both 51 | physical and virtual resources. 52 | 53 | Note that the Provisioning API shown on the right in :numref:`Figure 54 | %s ` is *not* the NetBox API. Terraform does not interact 55 | directly with NetBox, but instead with artifacts left behind by the 56 | hardware provisioning process described in Section 3.1. One way to 57 | think about this is that the task of booting hardware into the "ready" 58 | state involves installing and configuring several subsystems that 59 | collectively form the cloud platform. It is this platform that 60 | Terraform interacts with, using an API we describe at the end of 61 | Section 3.1. 62 | 63 | This chapter describes both sides of :numref:`Figure %s ` 64 | starting with provisioning physical infrastructure. Our approach is to 65 | focus on the challenge of provisioning an entire site the first time. 66 | We comment on the simpler problem of incrementally provisioning 67 | individual resources as relevant details emerge. 68 | 69 | 3.1 Physical Infrastructure 70 | --------------------------- 71 | 72 | The process of stacking and racking hardware is inherently 73 | human-intensive, and includes considerations such as airflow and cable 74 | management. These issues are beyond the scope of this book. We focus 75 | instead on the "physical/virtual" boundary, which starts with the 76 | cabling plan that a hands-on technician uses as a blueprint. The 77 | details of such a plan are highly deployment-specific, but we use the 78 | example shown in :numref:`Figure %s ` to help 79 | illustrate all the steps involved. The example is based on Aether 80 | clusters deployed in enterprises, which serves to highlight the 81 | required level of specificity. Considerable planning is required to 82 | specify an appropriate *Bill of Materials (BOM)*, including details 83 | about individual device models, but this aspect of the problem is also 84 | outside our scope. 85 | 86 | .. _fig-cable_plan: 87 | .. figure:: figures/pronto_logical_diagram.png 88 | :width: 700px 89 | :align: center 90 | 91 | Example network cable plan for an edge cluster. 92 | 93 | The blueprint shown in :numref:`Figure %s ` actually 94 | includes two logical clusters sharing a Management Switch and a 95 | Management Server. The upper cluster corresponds to a production 96 | deployment, and includes five servers and a 2x2 leaf-spine switching 97 | fabric. The lower cluster is for development, and includes two servers 98 | and a single switch. Defining such logical groupings of hardware 99 | resources is not unique to Aether; we can ask a commercial cloud 100 | provider to provision multiple logical clusters, so being able to do 101 | the same on physical resources is a natural requirement. 102 | 103 | In addition to following this blueprint, the technician also enters 104 | various facts about the physical infrastructure into a database. This 105 | information, which is used in later provisioning steps, is where we 106 | pick up the story. 107 | 108 | 3.1.1 Document Infrastructure 109 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 110 | 111 | Documenting the physical infrastructure's logical structure in a 112 | database is how we cross the physical-to-virtual divide. 
It involves 113 | both defining a set of models for the information being collected 114 | (this schema effectively represents the logical structure shown in 115 | :numref:`Figure %s `), and entering the corresponding facts 116 | about the physical devices. This process is familiar to anyone who is 117 | responsible for managing a network of devices, whether it is the first 118 | stage in a larger automated framework (such as the one described in 119 | this book) or simply a place to record what IP address has been 120 | assigned to each network appliance. 121 | 122 | There are several open source tools available for this task. Our choice 123 | is NetBox. It supports IP address management (IPAM); inventory-related 124 | information about types of devices and where they are installed; how 125 | infrastructure is organized (racked) by group and site; and how 126 | devices are connected to consoles, networks, and power sources. More 127 | information is readily available on the NetBox web site. 128 | 129 | .. _reading_netbox: 130 | .. admonition:: Further Reading 131 | 132 | `NetBox: `_ Information Resource Modeling 133 | Application. 134 | 135 | One of the key features of NetBox is the ability to customize the set 136 | of models used to organize all the information that is collected. For 137 | example, an operator can define physical groupings like *Rack* and 138 | *Site*, but also logical groupings like *Organization* and 139 | *Deployment*.\ [#]_ In the following we use the Aether cable plan shown in 140 | :numref:`Figure %s ` as an illustrative example, 141 | focusing on what happens when provisioning a single Aether site (but 142 | keeping in mind that Aether spans multiple sites, as outlined in 143 | Chapter 2). 144 | 145 | .. [#] In this section, we denote models and model fields in italics 146 | (e.g., *Site*, *Address*) and specific values assigned to an 147 | instance of a model as a constant (e.g., ``10.0.0.0/22``). 148 | 149 | The first step is to create a record for the site being provisioned, 150 | and document all the relevant metadata for that site. This includes 151 | the *Name* and *Location* of the *Site*, along with the *Organization* 152 | the site belongs to. An *Organization* can have more than one *Site*, 153 | while a *Site* can (a) span one or more *Racks*, and (b) host one or 154 | more *Deployments*. A *Deployment* is a logical cluster, 155 | corresponding, for example, to ``Production``, ``Staging``, and 156 | ``Development``. The cabling plan shown in :numref:`Figure %s 157 | ` includes two such deployments. 158 | 159 | This is also the time to specify the VLANs and IP Prefixes assigned to 160 | this particular edge deployment. Because it is important to maintain a 161 | clear relationship between VLANs, IP Prefixes, and DNS names (the last 162 | of which are auto-generated), it is helpful to walk through the 163 | following concrete example. We start with the minimal set of VLANs 164 | needed per Site: 165 | 166 | * ADMIN 1 167 | * UPLINK 10 168 | * MGMT 800 169 | * FABRIC 801 170 | 171 | These are Aether-specific, but they illustrate the set of VLANs a 172 | cluster might need. Minimally, one would expect to see a "management" 173 | network (MGMT in this example) and a "data" network (FABRIC in this 174 | example) in any cluster. Also specific to Aether (but generally 175 | applicable), if there are multiple Deployments at a Site sharing a 176 | single management server, additional VLANs (incremented by 10 for 177 | MGMT/FABRIC) are added. 
For example, a second ``Development`` 178 | deployment might define: 179 | 180 | * DEVMGMT 810 181 | * DEVFABRIC 811 182 | 183 | IP Prefixes are then associated with VLANs, with all edge IP prefixes 184 | fitting into a ``/22`` sized block. This block is then partitioned in 185 | a way that works in concert with how DNS names are managed; i.e., 186 | names are generated by combining the first ```` component of 187 | the *Device* names (see below) with this suffix. Using ``10.0.0.0/22`` 188 | as an example, there are four edge prefixes, with the following 189 | purposes: 190 | 191 | * ADMIN Prefix ``10.0.0.0/25`` (for IPMI) 192 | 193 | * Has the Management Server and Management Switch 194 | * Assign the ADMIN 1 VLAN 195 | * Set domain to ``admin...aetherproject.net`` 196 | 197 | * MGMT Prefix ``10.0.0.128/25`` (for infrastructure control plane) 198 | 199 | * Has the Server Management plane, Fabric Switch Management 200 | * Assign MGMT 800 VLAN 201 | * Set domain to ``mgmt...aetherproject.net`` 202 | 203 | * FABRIC Prefix ``10.0.1.0/25`` (for infrastructure data plane) 204 | 205 | * IP addresses of the ``qsfp0`` port of the Compute Nodes to Fabric 206 | switches, plus other Fabric-connected devices (e.g., eNB) 207 | * Assign FABRIC 801 VLAN 208 | * Set domain to ``fab1...aetherproject.net`` 209 | 210 | * FABRIC Prefix ``10.0.1.128/25`` (for infrastructure data plane) 211 | 212 | * IP addresses of the ``qsfp1`` port of the Compute Nodes to fabric switches 213 | * Assign FABRIC 801 VLAN 214 | * Set domain to ``fab2...aetherproject.net`` 215 | 216 | There are other edge prefixes used by Kubernetes, but they do not need 217 | to be created in NetBox. Note that ``qsfp0`` and ``qsfp1`` in this 218 | example denote transceiver ports connecting the switching fabric, 219 | where *QSFP* stands for Quad (4-channel) Small Form-factor Pluggable. 220 | 221 | With this site-wide information recorded, the next step is to install 222 | and document each *Device*. This includes entering a ````, 223 | which is subsequently used to generate a fully qualified domain name 224 | for the device: ``...aetherproject.net``. 225 | The following fields are also filled in when creating a Device: 226 | 227 | * Site 228 | * Rack & Rack Position 229 | * Manufacturer 230 | * Model 231 | * Serial number 232 | * Device Type 233 | * MAC Addresses 234 | 235 | Note there is typically both a primary and a management (e.g., BMC/IPMI) 236 | interface. One convenience feature of NetBox is to use the *Device Type* as a 237 | template that sets the default naming of interfaces, power connections, and 238 | other equipment model specific attributes. 239 | 240 | Finally, the virtual interfaces for the Device must be specified, with its 241 | *Label* field set to the physical network interface that it is assigned. IP 242 | addresses are then assigned to the physical and virtual interfaces we have 243 | defined. 
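As a sketch of how one of these records might be entered programmatically rather than through the NetBox GUI (assuming the ``pynetbox`` client; the token, slugs, and interface type below are illustrative, and field names vary somewhat across NetBox releases), a compute server, its management interface, and the interface's IP address could be created as follows:

.. code-block:: python

   import pynetbox

   nb = pynetbox.api("https://netbox.example.com", token="PLACEHOLDER_TOKEN")

   # Create the Device record (slugs below are illustrative; in older
   # NetBox releases the "role" field is named "device_role").
   device = nb.dcim.devices.create(
       name="node1",
       site={"slug": "site-x"},
       role={"slug": "server"},
       device_type={"slug": "compute-server"},
       serial="ABC1234",
   )

   # Add its management interface, then assign an address to it.
   iface = nb.dcim.interfaces.create(device=device.id, name="eth0", type="1000base-t")
   nb.ipam.ip_addresses.create(
       address="10.0.0.132/25",
       assigned_object_type="dcim.interface",
       assigned_object_id=iface.id,
   )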
The Management Server should always have the first IP address within 244 | each prefix, and by convention they are assigned incrementally as follows: 245 | 246 | * Management Server 247 | 248 | * ``eno1`` - site provided public IP address, or blank if DHCP provided 249 | * ``eno2`` - 10.0.0.1/25 (first of ADMIN) - set as primary IP 250 | * ``bmc`` - 10.0.0.2/25 (next of ADMIN) 251 | * ``mgmt800`` - 10.0.0.129/25 (first of MGMT, on VLAN 800) 252 | * ``fab801`` - 10.0.1.1/25 (first of FABRIC, on VLAN 801) 253 | 254 | * Management Switch 255 | 256 | * ``gbe1`` - 10.0.0.3/25 (next of ADMIN) - set as primary IP 257 | 258 | * Fabric Switch 259 | 260 | * ``eth0`` - 10.0.0.130/25 (next of MGMT), set as primary IP 261 | * ``bmc`` - 10.0.0.131/25 262 | 263 | * Compute Server 264 | 265 | * ``eth0`` - 10.0.0.132/25 (next of MGMT), set as primary IP 266 | * ``bmc`` - 10.0.0.4/25 (next of ADMIN) 267 | * ``qsfp0`` - 10.0.1.2/25 (next of FABRIC) 268 | * ``qsfp1`` - 10.0.1.3/25 269 | 270 | * Other Fabric devices (eNB, etc.) 271 | 272 | * ``eth0`` or other primary interface - 10.0.1.4/25 (next of FABRIC) 273 | 274 | Once this data is entered into NetBox, it can be used to generate a 275 | rack diagram, similar to the one shown in :numref:`Figure %s 276 | `, corresponding to the cabling diagram shown in 277 | :numref:`Figure %s `. Note that the diagram shows two 278 | logical *Deployments* (``Production`` and ``Development``), co-located 279 | in one physical rack. 280 | 281 | .. _fig-rack_diagram: 282 | .. figure:: figures/rack_diagram.png 283 | :width: 500px 284 | :align: center 285 | 286 | NetBox rendering of rack configuration. 287 | 288 | It is also possible to generate other useful specifications for the 289 | deployment, helping the technician confirm the recorded logical specification 290 | matches the actual physical representation. For example, 291 | :numref:`Figure %s ` shows the set of cables and how 292 | they connect the set of hardware in our example deployment. 293 | 294 | .. _fig-cable_list: 295 | .. figure:: figures/cable_list.png 296 | :width: 700px 297 | :align: center 298 | 299 | NetBox report of cabling. 300 | 301 | If all of this seems like a tedious amount of detail, then you get the 302 | main point of this section. Everything about automating the control 303 | and management of a cloud hinges on having complete and accurate data 304 | about its resources. Keeping this information in sync with the reality 305 | of the physical infrastructure is often the weakest link in this 306 | process. The only saving grace is that the information is highly 307 | structured, and tools like NetBox help us codify this structure. 308 | 309 | 3.1.2 Configure and Boot 310 | ~~~~~~~~~~~~~~~~~~~~~~~~ 311 | 312 | After installing the hardware and recording the relevant facts about 313 | the installation, the next step is to configure and boot the hardware 314 | so that it is "ready" for the automated procedures that follow. The 315 | goal is to minimize manual configuration required to onboard physical 316 | infrastructure like that shown in :numref:`Figure %s 317 | `, but *zero-touch* is a high bar. To illustrate, the 318 | bootstrapping steps needed to complete provisioning for our example 319 | deployment include: 320 | 321 | * Configure the Management Switch to know the set of VLANs being 322 | used. 323 | 324 | * Configure the Management Server so it boots from a provided USB key. 325 | 326 | * Run Ansible playbooks needed to complete configuration 327 | onto the Management Server. 
328 | 329 | * Configure the Compute Servers so they boot from the Management 330 | Server (via iPXE). 331 | 332 | * Configure the Fabric Switches so they boot from the Management 333 | Server (via Nginx). 334 | 335 | * Configure the eNBs (mobile base stations) so they know their IP 336 | addresses. 337 | 338 | These are all manual configuration steps, requiring either console 339 | access or entering information into a device web interface, such that 340 | any subsequent configuration steps can be both fully automated and 341 | resilient. Note that while these steps cannot be automated away, they 342 | do not necessarily have to be performed in the field; hardware shipped 343 | to a remote site can first be prepped accordingly. Also note that care 344 | should be taken to *not* overload this step with configuration that 345 | can be done later. For example, various radio parameters can be set on 346 | the eNBs when they are physically installed, but those parameters will 347 | become settable through the Management Platform once the cluster is 348 | brought online. 349 | 350 | Manual configuration work done at this stage should be minimized, and 351 | most systems should use automated means of configuration. For example, 352 | using DHCP pervasively with MAC reservations for IP address assignment 353 | instead of manual configuration of each interface allows for 354 | management to be zero-touch and simplifies future reconfiguration. 355 | 356 | The automated aspects of configuration are implemented as a set of 357 | Ansible *roles* and *playbooks*, which in terms of the high-level 358 | overview shown in :numref:`Figure %s ` of Chapter 2, 359 | corresponds to the box representing the *"Zero-Touch Provision 360 | (System)"*. Said another way, there is no off-the-shelf ZTP solution 361 | we can use (i.e., someone has to write the playbooks), but the problem 362 | is greatly simplified by having access to all the configuration 363 | parameters that NetBox maintains. 364 | 365 | The general idea is as follows. For every network service (e.g., DNS, 366 | DHCP, iPXE, Nginx) and every per-device subsystem (e.g., network 367 | interfaces, Docker) that needs to be configured, there is a 368 | corresponding Ansible role (set of related playbooks). These 369 | configurations are applied to the Management Server during the manual 370 | configuration stage summarized above, once the management network is 371 | online. 372 | 373 | The Ansible playbooks install and configure the network services on the 374 | Management Server. The roles of DNS and DHCP are obvious. As for iPXE and Nginx, 375 | they are used to bootstrap the rest of the infrastructure. The compute servers 376 | are configured by iPXE delivered over DHCP/TFTP, and then load the scripted OS 377 | installation from an Nginx web server. The fabric switches load their 378 | Stratum OS package from Nginx. 379 | 380 | In many cases, the playbooks use parameters—such as VLANs, IP 381 | addresses, DNS names, and so on—extracted from NetBox. :numref:`Figure 382 | %s ` illustrates the approach, and fills in a few 383 | details. For example, a home-grown Python program (``edgeconfig.py``) 384 | extracts data from NetBox using the REST API and outputs a corresponding 385 | set of YAML files, crafted to serve as input to Ansible, which creates yet 386 | more configuration on the management and compute systems. One example of this 387 | is the *Netplan* file, which is used in Ubuntu to manage network interfaces.
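We do not reproduce ``edgeconfig.py`` here, but the following sketch conveys the general pattern under the same assumptions as before (the ``pynetbox`` client, a placeholder endpoint and site slug, and an output structure of our own choosing): query NetBox for the addresses recorded for a site, and emit a YAML variables file for Ansible to consume.

.. code-block:: python

   import pynetbox
   import yaml

   nb = pynetbox.api("https://netbox.example.com", token="PLACEHOLDER_TOKEN")

   site = "site-x"                      # illustrative site slug
   addresses = {}
   for device in nb.dcim.devices.filter(site=site):
       addresses[str(device.name)] = [
           str(ip.address) for ip in nb.ipam.ip_addresses.filter(device_id=device.id)
       ]

   # Write an Ansible variables file; the structure here is ours, not Aether's.
   with open(f"{site}-vars.yaml", "w") as f:
       yaml.safe_dump({"site": site, "addresses": addresses}, f, default_flow_style=False)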
388 | More information about Ansible and Netplan can be found on their respective web 389 | sites. 390 | 391 | .. _reading_ansible: 392 | .. admonition:: Further Reading 393 | 394 | `Ansible: `_ Automation Platform. 395 | 396 | `Netplan: `_ Network Configuration Abstraction Renderer. 397 | 398 | .. _fig-ansible: 399 | .. figure:: figures/Slide20.png 400 | :width: 550px 401 | :align: center 402 | 403 | Configuring network services and OS-level subsystems using NetBox data. 404 | 405 | While :numref:`Figure %s ` highlights how Ansible is 406 | paired with Netplan to configure kernel-level details, there is also 407 | an Ansible playbook that installs Docker on each compute server and 408 | fabric switch, and then launches a Docker container running a 409 | "finalize" image. This image makes calls into the next layer of the 410 | provisioning stack, effectively signaling that the cluster is running and 411 | ready for further instructions. We are now ready to describe that next 412 | layer of the stack. 413 | 414 | 415 | 3.1.3 Provisioning API 416 | ~~~~~~~~~~~~~~~~~~~~~~~~ 417 | 418 | As a result of the steps described so far, we can assume each server 419 | and switch is up and running, but we still have a little work to do to 420 | prepare our bare-metal clusters for the next layer in the provisioning 421 | stack, essentially establishing parity between the left- and 422 | right-hand sides of the hybrid cloud shown in :numref:`Figure %s 423 | `. If you ask yourself *"What would Google do?"* this 424 | reduces to the task of setting up a GCP-like API for the bare-metal 425 | edge clouds. This API primarily subsumes the Kubernetes API, but it 426 | goes beyond providing a way to *use* Kubernetes to also include calls 427 | to *manage* Kubernetes. 428 | 429 | In short, this "manage Kubernetes" task is to turn a set of 430 | interconnected servers and switches into a fully-instantiated 431 | Kubernetes cluster. For starters, the API needs to provide a means to 432 | install and configure Kubernetes on each physical cluster. This 433 | includes specifying which version of Kubernetes to run, selecting the 434 | right combination of Container Network Interface (CNI) plugins 435 | (virtual network adaptors), and connecting Kubernetes to the local 436 | network (and any VPNs it might need). This layer also needs to provide 437 | a means to set up accounts (and associated credentials) for accessing 438 | and using each Kubernetes cluster, and a way to manage 439 | independent projects that are to be deployed on a given cluster (i.e., 440 | manage namespaces for multiple applications). 441 | 442 | As an example, Aether uses Rancher to manage Kubernetes on 443 | the bare-metal clusters, with one centralized instance of Rancher 444 | being responsible for managing all the edge sites. This results in the 445 | configuration shown in :numref:`Figure %s `, which to 446 | emphasize Rancher's scope, shows multiple edge clusters. Although not 447 | shown in the Figure, the GCP-provided API, just like Rancher, also 448 | spans multiple physical sites (e.g., ``us-west1-a``, 449 | ``europe-north1-b``, ``asia-south2-c``, and so on). 450 | 451 | .. _fig-rancher: 452 | .. figure:: figures/Slide21.png 453 | :width: 450px 454 | :align: center 455 | 456 | Provisioning in a hybrid cloud that includes an API layer 457 | for managing Kubernetes running on multiple bare-metal clusters. 
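As a small illustration of what being "ready for further instructions" means at this layer, once Rancher (or GKE) has instantiated a cluster and we have a kubeconfig for it, a few calls against the standard Kubernetes API are enough to confirm that the nodes came up with the expected version. The sketch below uses the official Python client; the context name is illustrative.

.. code-block:: python

   from kubernetes import client, config

   # Load credentials for one provisioned edge cluster (context name is illustrative).
   config.load_kube_config(context="ace-site-x")

   v1 = client.CoreV1Api()
   for node in v1.list_node().items:
       ready = any(c.type == "Ready" and c.status == "True"
                   for c in node.status.conditions)
       print(node.metadata.name,
             node.status.node_info.kubelet_version,
             "Ready" if ready else "NotReady")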
457 | 458 | We conclude this discussion by noting that while we often treat 459 | Kubernetes as though it is an industry-wide standard, that is not 460 | quite the reality of the situation. Each cloud provider offers its own 461 | customized version: 462 | 463 | * Microsoft Azure offers the Azure Kubernetes Service (AKS) 464 | * AWS offers the Amazon Elastic Kubernetes Service (EKS) 465 | * Google Cloud offers the Google Kubernetes Engine (GKE) 466 | * Aether edges run the Rancher-certified version of Kubernetes (RKE) 467 | 468 | Although the *CNCF (Cloud Native Computing Foundation)*—the open 469 | source organization responsible for shepherding the Kubernetes 470 | project—certifies these and other versions of Kubernetes, this only 471 | establishes baseline compliance. Each version is free to enhance its 472 | offering beyond this baseline, and these enhancements often take the 473 | form of additional features for provisioning and controlling a 474 | Kubernetes cluster. Our job at the cloud management layer is to 475 | provide operators with a means to manage this heterogeneity. And as 476 | we'll see in Section 3.2, this is the primary challenge addressed by 477 | the Infrastructure-as-Code layer. 478 | 479 | 3.1.4 Provisioning VMs 480 | ~~~~~~~~~~~~~~~~~~~~~~ 481 | 482 | We conclude our discussion of the steps required to provision physical 483 | machines by considering the implications of provisioning virtual 484 | machines, or VMs. That's something that happens "behind the scenes" 485 | when you request a Kubernetes cluster from AKS, EKS, or GKE, but 486 | that's because the hyperscalers have the option of layering their 487 | Kubernetes service on top of their Infrastructure-as-a-Service 488 | (IaaS). Do we need something similar for the edge cloud we're 489 | building? 490 | 491 | Not necessarily. Because our goal is to support a curated set of edge 492 | services that provide value to our enterprise users, and not to 493 | support Container-as-a-Service so untrusted third-parties can spin up 494 | whatever applications they want, we do not need to manage VMs "as a 495 | service." But we still may want to use VMs as a way to isolate 496 | Kubernetes workloads on a limited number of physical servers. This can 497 | be done as a provisioning step, akin to connecting and booting a 498 | physical machine, but using virtualization mechanisms like KVM and 499 | Proxmox. There is no need for a full-fledged IaaS mechanism, such as 500 | OpenStack. These VMs would then be recorded as first-class cloud 501 | resources in NetBox and the other tools described in this section, no 502 | different from a physical machine. 503 | 504 | The unanswered question is why one might decide to do that, 505 | considering that Kubernetes already allows us to deploy multiple 506 | applications on a single cluster. One reason is to support fine-grained 507 | resource isolation, making it possible to (a) ensure that each 508 | Kubernetes application receives the processor, memory, and storage 509 | resources it needs to do its job, and (b) reduce the risk of 510 | information leaking between the applications. Suppose, for example, 511 | that in addition to the SD-Fabric, SD-RAN, and SD-Core workloads that run 512 | (by default) on each edge site, we also want to run one or more other 513 | edge apps, such as the OpenVINO platform introduced in Section 2.3. To 514 | ensure that there is no interference between these applications, we 515 | could dedicate a subset of physical servers to each of them.
Physical 516 | partitioning is a coarse-grained way to share the physical cluster. 517 | Being able to "split" one or more servers between multiple uses—by 518 | instantiating VMs—gives the operator more flexibility in allocating 519 | resources, which usually translates into requiring fewer overall 520 | resources. Note that there are other ways to specify how cluster 521 | resources are shared between applications (which we will see in 522 | Section 4.4), but the provisioning layer is one place where the issue 523 | can be addressed. 524 | 525 | 3.2 Infrastructure-as-Code 526 | -------------------------- 527 | 528 | The provisioning interface for each of the Kubernetes variants just 529 | described includes a programmatic API, a Command Line Interface (CLI), 530 | and a Graphical User Interface (GUI). If you try any of the 531 | tutorials we recommended throughout this book, you'll likely use one 532 | of the latter two. For operational deployments, however, having a 533 | human operator interact with a CLI or GUI is problematic. This is not 534 | only because humans are error-prone, but also because it's nearly 535 | impossible to consistently repeat a sequence of configuration steps. 536 | Being able to continuously repeat the process is at the heart of 537 | Lifecycle Management described in the next chapter. 538 | 539 | The solution is to find a declarative way of saying what your 540 | infrastructure is to look like—what set of Kubernetes clusters (e.g., 541 | some running at the edges on bare-metal and some instantiated in GCP) 542 | are to be instantiated, and how each is to be configured—and then 543 | automate the task of making calls against the programmatic API to make 544 | it so. This is the essence of Infrastructure-as-Code, and as we've 545 | already said, Terraform is our open source example. 546 | 547 | Since Terraform specifications are declarative, the best way to 548 | understand them is to walk through a specific example. In doing so, 549 | our goal isn't to document Terraform (online documentation and 550 | step-by-step tutorials are available for those interested in 551 | more detail), but rather, to build some intuition about the role this 552 | layer plays in managing a cloud. 553 | 554 | .. _reading_terraform: 555 | .. admonition:: Further Reading 556 | 557 | `Terraform Documentation `_. 558 | 559 | `Terraform Getting Started Tutorials 560 | `__. 561 | 562 | To make sense of the example, the main thing you need to know about 563 | the Terraform configuration language is that it provides a means to 564 | both (1) specify *templates* for different kinds of resources (these 565 | are ``.tf`` files), and (2) fill in the *variables* for specific 566 | instances of those resource templates (these are ``.tfvars`` files). 567 | Then given a set of ``.tf`` and ``.tfvars`` files, Terraform implements 568 | a two-stage process. In the first stage it constructs an execution 569 | plan, based on what has changed since the previous plan it 570 | executed. In the second stage, Terraform carries out the sequence of 571 | tasks required to bring the underlying infrastructure "up to spec" 572 | with the latest definition. Note that our job, for now, is to write 573 | these specification files and check them into the Config Repo. 574 | Terraform gets invoked as part of the CI/CD pipeline described in 575 | Chapter 4. 576 | 577 | Now to the specific files.
At the top-most level, the operator defines 579 | the set of *providers* they plan to incorporate into their 580 | infrastructure. We can think of each provider as corresponding to a 581 | cloud backend, including the corresponding provisioning API depicted 582 | in :numref:`Figure %s `. In our example, we show only two 583 | providers: the Rancher-managed edge clusters and the GCP-managed 584 | centralized clusters. Note that the example file declares a set of 585 | relevant variables for each provider (e.g., ``url``, ``access-key``), 586 | which are "filled in" by instance-specific variable files described 587 | next. 588 | 589 | .. literalinclude:: code/provider.tf 590 | 591 | The next step is to fill in the details (define values) for the actual 592 | set of clusters we want to provision. Let's look at two examples, 593 | corresponding to the two providers we just specified. The first shows 594 | a GCP-provided cluster (named ``amp-gcp``) that is to host the AMP 595 | workload. (There's a similar ``sdcore-gcp`` that hosts an instance of 596 | the SD-Core.) The labels associated with this particular cluster 597 | (e.g., ``env = "production"``) establish linkage between Terraform 598 | (which assigns the label to each cluster it instantiates) and other 599 | layers of the management stack (which selectively take different 600 | actions based on the associated labels). We'll see an example of these 601 | labels being used in Section 4.4. 602 | 603 | .. literalinclude:: code/cluster-gcp_val.tfvars 604 | 605 | The second example shows an edge cluster (named ``ace-X``) to be 606 | instantiated at *Site X*. As shown in the example code, this is a 607 | bare-metal cluster consisting of five servers and four switches (two 608 | leaf switches and two spine switches). The address for each device 609 | must match the one assigned during the hardware-provisioning stage 610 | outlined in Section 3.1. Ideally, the NetBox (and related) tool chain 611 | described in that section would auto-generate these Terraform 612 | variable files, but in practice, manually entering the data is often 613 | still necessary. 614 | 615 | .. literalinclude:: code/cluster-edge_val.tfvars 616 | 617 | The final piece of the puzzle is to fill in the remaining details 618 | about exactly how each Kubernetes cluster is to be instantiated. In 619 | this case, we show just the RKE-specific module used to configure the 620 | edge clusters, where most of the details are straightforward if you 621 | understand Kubernetes. For example, the module specifies that each 622 | edge cluster should load the ``calico`` and ``multus`` CNI plugins. It 623 | also defines how to invoke ``kubectl`` to configure Kubernetes 624 | according to these specifications. Less familiar are the references to 625 | ``SCTPSupport``, which indicate whether or not that particular Kubernetes 626 | cluster needs to support SCTP, a Telco-oriented network protocol that 627 | is not included in a vanilla Kubernetes deployment, but is needed by 628 | the SD-Core. 629 | 630 | .. literalinclude:: code/main-rke.tf 631 | 632 | There are other loose ends that need to be tied up, such as defining 633 | the VPN to be used to connect edge clusters to their counterparts in 634 | GCP, but the above examples are sufficient to illustrate the role 635 | Infrastructure-as-Code plays in the cloud management stack.
The key 636 | takeaway is that everything Terraform handles could have been done by 637 | a human operator making a sequence of CLI calls (or GUI clicks) on the 638 | backend Provisioning APIs, but experience has shown that approach to 639 | be error-prone and difficult to make consistently repeatable. 640 | Starting with a declarative language and auto-generating the right 641 | sequence of API calls is a proven way to overcome that problem. 642 | 643 | 644 | We conclude by noting that while we now have a 645 | declarative specification for our cloud infrastructure, which we refer 646 | to as the *Aether Platform*, these specification files are yet another 647 | software artifact that we check into the Config Repo. This is what we 648 | mean by Infrastructure-as-Code: infrastructure specifications are 649 | checked into a repo and version-controlled like any other code. This 650 | repo, in turn, feeds the lifecycle management pipeline described in 651 | the next chapter. The physical provisioning steps described in Section 652 | 3.1 happen "outside" the pipeline (which is why we don't just fold 653 | resource provisioning into Lifecycle Management), but it is fair to 654 | think of resource provisioning as "Stage 0" of lifecycle management. 655 | 656 | 3.3 Platform Definition 657 | ------------------------ 658 | 659 | The art of defining a system architecture, in our case a management 660 | framework for a hybrid cloud, is deciding where to draw the line 661 | between what's included inside the platform and what is considered an 662 | application running on top of the platform. For Aether, we have 663 | decided to include SD-Fabric inside the platform (along with 664 | Kubernetes), with SD-Core and SD-RAN treated as applications, even 665 | though all three are implemented as Kubernetes-based microservices. 666 | One consequence of this decision is that SD-Fabric is initialized as 667 | part of the provisioning system described in this chapter (with 668 | NetBox, Ansible, Rancher, and Terraform playing a role), whereas 669 | SD-Core and SD-RAN are deployed using the application-level mechanisms 670 | described in Chapter 4. 671 | 672 | There may also be other edge applications running as Kubernetes 673 | workloads, which complicates the story because, from their perspective, 674 | all of Aether (including the 5G connectivity that SD-Core and SD-RAN 675 | implement) is assumed to be part of the platform. In other words, 676 | Aether draws two lines, one demarcating Aether's base platform 677 | (Kubernetes plus SD-Fabric) and a second demarcating the Aether PaaS 678 | (which includes SD-Core and SD-RAN running on top of the platform, 679 | plus AMP managing the whole system). The distinction between "base 680 | platform" and "PaaS" is subtle, but essentially corresponds to the 681 | difference between a software stack and a managed service, 682 | respectively. 683 | 684 | In some respects this is just a matter of terminology, which is 685 | certainly important, but the relevance to our discussion is that 686 | because we have multiple overlapping mechanisms at our disposal, 687 | giving us more than one way to solve each engineering problem we 688 | encounter, it is easy to end up with an implementation that 689 | unnecessarily conflates separable concerns. Being explicit and 690 | consistent about what is platform and what is application is a 691 | prerequisite for a sound overall design.
It is also important to 692 | recognize the difference between an internal engineering decision 693 | (e.g., what mechanism is used to deploy a given component), and an 694 | externally-visible architectural decision (e.g., what functionality to 695 | expose through a public API). 696 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Sphinx~=5.3.0 2 | doc8~=0.10.1 3 | docutils~=0.17.1 4 | reuse~=0.14.0 5 | sphinx-rtd-theme~=1.0.0 6 | sphinxcontrib-spelling~=7.3.2 7 | sphinx-multiversion~=0.2.4 8 | pytz~=2023.3 9 | --------------------------------------------------------------------------------