├── .gitignore
├── .gitreview
├── .zuul.yaml
├── LICENSE
├── README.rst
├── doc
│   ├── requirements.txt
│   └── source
│       ├── common
│       │   ├── app-support.rst
│       │   ├── appendix.rst
│       │   ├── conventions.rst
│       │   └── glossary.rst
│       ├── compute-node-ha.rst
│       ├── conf.py
│       ├── control-plane-stateful.rst
│       ├── control-plane-stateless.rst
│       ├── control-plane.rst
│       ├── figures
│       │   ├── Cluster-deployment-collapsed.png
│       │   └── Cluster-deployment-segregated.png
│       ├── ha-community.rst
│       ├── index.rst
│       ├── intro-ha-common-tech.rst
│       ├── intro-ha-key-concepts.rst
│       ├── intro-ha.rst
│       ├── intro-os-ha-cluster.rst
│       ├── intro-os-ha-memcached.rst
│       ├── intro-os-ha-state.rst
│       ├── intro-os-ha.rst
│       ├── monitoring.rst
│       ├── networking-ha-l3-agent.rst
│       ├── networking-ha-neutron-l3-analysis.rst
│       ├── networking-ha-neutron-server.rst
│       ├── networking-ha.rst
│       ├── overview.rst
│       ├── ref-arch-examples.rst
│       ├── storage-ha-backend.rst
│       ├── storage-ha-block.rst
│       ├── storage-ha-file-systems.rst
│       ├── storage-ha-image.rst
│       ├── storage-ha.rst
│       └── testing.rst
├── setup.cfg
└── tox.ini
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | *.xpr
3 |
4 | # Packages
5 | .venv
6 | *.egg
7 | *.egg-info
8 |
9 | # Test environment
10 | .tox
11 |
12 | # Build directories
13 | doc/build
14 |
15 | # Transifex Client Setting
16 | .tx
17 |
18 | # Editors
19 | *~
20 | .*.swp
21 | .bak
22 | *.pyc
23 |
24 | doc/source/.doctrees
25 |
--------------------------------------------------------------------------------
/.gitreview:
--------------------------------------------------------------------------------
1 | [gerrit]
2 | host=review.opendev.org
3 | port=29418
4 | project=openstack/ha-guide.git
5 |
--------------------------------------------------------------------------------
/.zuul.yaml:
--------------------------------------------------------------------------------
1 | - project:
2 | templates:
3 | - build-openstack-docs-pti
4 | promote:
5 | jobs:
6 | - promote-openstack-tox-docs-direct
7 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 |
2 | Apache License
3 | Version 2.0, January 2004
4 | http://www.apache.org/licenses/
5 |
6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7 |
8 | 1. Definitions.
9 |
10 | "License" shall mean the terms and conditions for use, reproduction,
11 | and distribution as defined by Sections 1 through 9 of this document.
12 |
13 | "Licensor" shall mean the copyright owner or entity authorized by
14 | the copyright owner that is granting the License.
15 |
16 | "Legal Entity" shall mean the union of the acting entity and all
17 | other entities that control, are controlled by, or are under common
18 | control with that entity. For the purposes of this definition,
19 | "control" means (i) the power, direct or indirect, to cause the
20 | direction or management of such entity, whether by contract or
21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
22 | outstanding shares, or (iii) beneficial ownership of such entity.
23 |
24 | "You" (or "Your") shall mean an individual or Legal Entity
25 | exercising permissions granted by this License.
26 |
27 | "Source" form shall mean the preferred form for making modifications,
28 | including but not limited to software source code, documentation
29 | source, and configuration files.
30 |
31 | "Object" form shall mean any form resulting from mechanical
32 | transformation or translation of a Source form, including but
33 | not limited to compiled object code, generated documentation,
34 | and conversions to other media types.
35 |
36 | "Work" shall mean the work of authorship, whether in Source or
37 | Object form, made available under the License, as indicated by a
38 | copyright notice that is included in or attached to the work
39 | (an example is provided in the Appendix below).
40 |
41 | "Derivative Works" shall mean any work, whether in Source or Object
42 | form, that is based on (or derived from) the Work and for which the
43 | editorial revisions, annotations, elaborations, or other modifications
44 | represent, as a whole, an original work of authorship. For the purposes
45 | of this License, Derivative Works shall not include works that remain
46 | separable from, or merely link (or bind by name) to the interfaces of,
47 | the Work and Derivative Works thereof.
48 |
49 | "Contribution" shall mean any work of authorship, including
50 | the original version of the Work and any modifications or additions
51 | to that Work or Derivative Works thereof, that is intentionally
52 | submitted to Licensor for inclusion in the Work by the copyright owner
53 | or by an individual or Legal Entity authorized to submit on behalf of
54 | the copyright owner. For the purposes of this definition, "submitted"
55 | means any form of electronic, verbal, or written communication sent
56 | to the Licensor or its representatives, including but not limited to
57 | communication on electronic mailing lists, source code control systems,
58 | and issue tracking systems that are managed by, or on behalf of, the
59 | Licensor for the purpose of discussing and improving the Work, but
60 | excluding communication that is conspicuously marked or otherwise
61 | designated in writing by the copyright owner as "Not a Contribution."
62 |
63 | "Contributor" shall mean Licensor and any individual or Legal Entity
64 | on behalf of whom a Contribution has been received by Licensor and
65 | subsequently incorporated within the Work.
66 |
67 | 2. Grant of Copyright License. Subject to the terms and conditions of
68 | this License, each Contributor hereby grants to You a perpetual,
69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
70 | copyright license to reproduce, prepare Derivative Works of,
71 | publicly display, publicly perform, sublicense, and distribute the
72 | Work and such Derivative Works in Source or Object form.
73 |
74 | 3. Grant of Patent License. Subject to the terms and conditions of
75 | this License, each Contributor hereby grants to You a perpetual,
76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
77 | (except as stated in this section) patent license to make, have made,
78 | use, offer to sell, sell, import, and otherwise transfer the Work,
79 | where such license applies only to those patent claims licensable
80 | by such Contributor that are necessarily infringed by their
81 | Contribution(s) alone or by combination of their Contribution(s)
82 | with the Work to which such Contribution(s) was submitted. If You
83 | institute patent litigation against any entity (including a
84 | cross-claim or counterclaim in a lawsuit) alleging that the Work
85 | or a Contribution incorporated within the Work constitutes direct
86 | or contributory patent infringement, then any patent licenses
87 | granted to You under this License for that Work shall terminate
88 | as of the date such litigation is filed.
89 |
90 | 4. Redistribution. You may reproduce and distribute copies of the
91 | Work or Derivative Works thereof in any medium, with or without
92 | modifications, and in Source or Object form, provided that You
93 | meet the following conditions:
94 |
95 | (a) You must give any other recipients of the Work or
96 | Derivative Works a copy of this License; and
97 |
98 | (b) You must cause any modified files to carry prominent notices
99 | stating that You changed the files; and
100 |
101 | (c) You must retain, in the Source form of any Derivative Works
102 | that You distribute, all copyright, patent, trademark, and
103 | attribution notices from the Source form of the Work,
104 | excluding those notices that do not pertain to any part of
105 | the Derivative Works; and
106 |
107 | (d) If the Work includes a "NOTICE" text file as part of its
108 | distribution, then any Derivative Works that You distribute must
109 | include a readable copy of the attribution notices contained
110 | within such NOTICE file, excluding those notices that do not
111 | pertain to any part of the Derivative Works, in at least one
112 | of the following places: within a NOTICE text file distributed
113 | as part of the Derivative Works; within the Source form or
114 | documentation, if provided along with the Derivative Works; or,
115 | within a display generated by the Derivative Works, if and
116 | wherever such third-party notices normally appear. The contents
117 | of the NOTICE file are for informational purposes only and
118 | do not modify the License. You may add Your own attribution
119 | notices within Derivative Works that You distribute, alongside
120 | or as an addendum to the NOTICE text from the Work, provided
121 | that such additional attribution notices cannot be construed
122 | as modifying the License.
123 |
124 | You may add Your own copyright statement to Your modifications and
125 | may provide additional or different license terms and conditions
126 | for use, reproduction, or distribution of Your modifications, or
127 | for any such Derivative Works as a whole, provided Your use,
128 | reproduction, and distribution of the Work otherwise complies with
129 | the conditions stated in this License.
130 |
131 | 5. Submission of Contributions. Unless You explicitly state otherwise,
132 | any Contribution intentionally submitted for inclusion in the Work
133 | by You to the Licensor shall be under the terms and conditions of
134 | this License, without any additional terms or conditions.
135 | Notwithstanding the above, nothing herein shall supersede or modify
136 | the terms of any separate license agreement you may have executed
137 | with Licensor regarding such Contributions.
138 |
139 | 6. Trademarks. This License does not grant permission to use the trade
140 | names, trademarks, service marks, or product names of the Licensor,
141 | except as required for reasonable and customary use in describing the
142 | origin of the Work and reproducing the content of the NOTICE file.
143 |
144 | 7. Disclaimer of Warranty. Unless required by applicable law or
145 | agreed to in writing, Licensor provides the Work (and each
146 | Contributor provides its Contributions) on an "AS IS" BASIS,
147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 | implied, including, without limitation, any warranties or conditions
149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 | PARTICULAR PURPOSE. You are solely responsible for determining the
151 | appropriateness of using or redistributing the Work and assume any
152 | risks associated with Your exercise of permissions under this License.
153 |
154 | 8. Limitation of Liability. In no event and under no legal theory,
155 | whether in tort (including negligence), contract, or otherwise,
156 | unless required by applicable law (such as deliberate and grossly
157 | negligent acts) or agreed to in writing, shall any Contributor be
158 | liable to You for damages, including any direct, indirect, special,
159 | incidental, or consequential damages of any character arising as a
160 | result of this License or out of the use or inability to use the
161 | Work (including but not limited to damages for loss of goodwill,
162 | work stoppage, computer failure or malfunction, or any and all
163 | other commercial damages or losses), even if such Contributor
164 | has been advised of the possibility of such damages.
165 |
166 | 9. Accepting Warranty or Additional Liability. While redistributing
167 | the Work or Derivative Works thereof, You may choose to offer,
168 | and charge a fee for, acceptance of support, warranty, indemnity,
169 | or other liability obligations and/or rights consistent with this
170 | License. However, in accepting such obligations, You may act only
171 | on Your own behalf and on Your sole responsibility, not on behalf
172 | of any other Contributor, and only if You agree to indemnify,
173 | defend, and hold each Contributor harmless for any liability
174 | incurred by, or claims asserted against, such Contributor by reason
175 | of your accepting any such warranty or additional liability.
176 |
177 |
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | =================================
2 | OpenStack High Availability Guide
3 | =================================
4 |
5 | This repository contains the source files for the OpenStack High Availability
6 | Guide.
7 |
8 | You can read this guide at `docs.openstack.org/ha-guide
9 | <https://docs.openstack.org/ha-guide/>`_.
10 |
11 | Prerequisites
12 | -------------
13 |
14 | At a minimum, you will need git and the git-review tool installed in order to
15 | contribute documentation. You will also need a `Gerrit account
16 | `_ to
17 | submit the change.
18 |
19 | Git is available for Linux, Mac, and Windows environments. Some platforms come
20 | with it preinstalled, but you can review the `installation instructions
21 | `_ if you
22 | do not have it by default.
23 |
24 | Once git is installed, you can follow the instructions for your platform to
25 | `install git-review `_.
26 |
27 | The last step is to configure git with the name and email address used for
28 | your Gerrit account so that your patches can be linked to your user. Run the
29 | following to set these values:
30 |
31 | .. code-block:: console
32 |
33 | git config --global user.name "First Last"
34 | git config --global user.email "your_email@youremail.com"
35 |
36 |
37 | Submitting Updates
38 | ------------------
39 |
40 | Proposing updates to the documentation is fairly straightforward once you have
41 | done it, but a few of the steps can appear intimidating the first couple of
42 | times through. Here is a suggested workflow to help you along the way.
43 |
44 | .. code-block:: console
45 |
46 | git clone https://opendev.org/openstack/ha-guide
47 | cd ha-guide
48 |
49 | # it is useful to make changes on a separate branch in case you need to make
50 | # other changes
51 | git checkout -b my-topic
52 |
53 | # edit your files
54 | git add .
55 | git commit # Add a descriptive commit message
56 |
57 | # submit your changes for review
58 | git review
59 |
60 | The changes will then be run through a few tests to make sure the docs build,
61 | and the patch will be ready for review. Once reviewed, if no problems are
62 | found, the changes will be merged to the repository and published
63 | to the docs.openstack.org site.
64 |
65 | Local Testing
66 | -------------
67 |
68 | If you would like to build the docs locally to make sure there are no issues
69 | with the changes, and to view locally generated HTML files, you will need to do
70 | a couple of extra steps.
71 |
72 | The jobs are run using a tool called `tox`. You will need to install tox on
73 | your platform first following its `installation guide
74 | `_.
75 |
76 | You can then run the following to perform a local build with some tests:
77 |
78 | .. code-block:: console
79 |
80 | tox -e docs
81 |
82 | If you have any questions, please reach out on the #openstack-operators IRC
83 | channel or through the openstack-ops mailing list.
84 |
--------------------------------------------------------------------------------
/doc/requirements.txt:
--------------------------------------------------------------------------------
1 | # The order of packages is significant, because pip processes them in the order
2 | # of appearance. Changing the order has an impact on the overall integration
3 | # process, which may cause wedges in the gate later.
4 | openstackdocstheme>=3.5.0 # Apache-2.0
5 | sphinx>=8.1.3 # BSD
6 | doc8>=1.1.2 # Apache-2.0
7 |
--------------------------------------------------------------------------------
/doc/source/common/app-support.rst:
--------------------------------------------------------------------------------
1 | .. ## WARNING ##########################################################
2 | .. This file is synced from openstack/openstack-manuals repository to
3 | .. other related repositories. If you need to make changes to this file,
4 | .. make the changes in openstack-manuals. After any change merged to,
5 | .. openstack-manuals, automatically a patch for others will be proposed.
6 | .. #####################################################################
7 |
8 | =================
9 | Community support
10 | =================
11 |
12 | The following resources are available to help you run and use OpenStack.
13 | The OpenStack community constantly improves and adds to the main
14 | features of OpenStack, but if you have any questions, do not hesitate to
15 | ask. Use the following resources to get OpenStack support and
16 | troubleshoot your installations.
17 |
18 | Documentation
19 | ~~~~~~~~~~~~~
20 |
21 | For the available OpenStack documentation, see
22 | `docs.openstack.org <https://docs.openstack.org>`_.
23 |
24 | The following guides explain how to install a Proof-of-Concept OpenStack cloud
25 | and its associated components:
26 |
27 | * `2025.1 Epoxy Installation Guides `_
28 |
29 | The following books explain how to configure and run an OpenStack cloud:
30 |
31 | * `Architecture Design Guide `_
32 |
33 | * `Administrator Guides `_
34 |
35 | * `Configuration Guides `_
36 |
37 | * `Networking Guide `_
38 |
39 | * `High Availability Guide `_
40 |
41 | * `Security Guide `_
42 |
43 | * `Virtual Machine Image Guide `_
44 |
45 | The following book explains how to use the command-line clients:
46 |
47 | * `API Bindings
48 | `_
49 |
50 | The following documentation provides reference and guidance information
51 | for the OpenStack APIs:
52 |
53 | * `API Documentation `_
54 |
55 | The following guide provides information on how to contribute to OpenStack
56 | documentation:
57 |
58 | * `Documentation Contributor Guide `_
59 |
60 | The OpenStack wiki
61 | ~~~~~~~~~~~~~~~~~~
62 |
63 | The `OpenStack wiki `_ contains a broad
64 | range of topics but some of the information can be difficult to find or
65 | is a few pages deep. Fortunately, the wiki search feature enables you to
66 | search by title or content. If you search for specific information, such
67 | as about networking or OpenStack Compute, you can find a large amount
68 | of relevant material. More is being added all the time, so be sure to
69 | check back often. You can find the search box in the upper-right corner
70 | of any OpenStack wiki page.
71 |
72 | The Launchpad bugs area
73 | ~~~~~~~~~~~~~~~~~~~~~~~
74 |
75 | The OpenStack community values your setup and testing efforts and wants
76 | your feedback. To log a bug, you must `sign up for a Launchpad account
77 | `_. You can view existing bugs and report bugs
78 | in the Launchpad Bugs area. Use the search feature to determine whether
79 | the bug has already been reported or already been fixed. If it still
80 | seems like your bug is unreported, fill out a bug report.
81 |
82 | Some tips:
83 |
84 | * Give a clear, concise summary.
85 |
86 | * Provide as much detail as possible in the description. Paste in your
87 | command output or stack traces, links to screen shots, and any other
88 | information which might be useful.
89 |
90 | * Be sure to include the software and package versions that you are
91 | using, especially if you are using a development branch, such as
92 | ``"Kilo release" vs git commit bc79c3ecc55929bac585d04a03475b72e06a3208``.
93 |
94 | * Any deployment-specific information is helpful, such as whether you
95 | are using CentOS Stream 9 or are performing a multi-node installation.
96 |
97 | The following Launchpad Bugs areas are available:
98 |
99 | * `Bugs: OpenStack Block Storage
100 | (cinder) `_
101 |
102 | * `Bugs: OpenStack Compute (nova) `_
103 |
104 | * `Bugs: OpenStack Dashboard
105 | (horizon) `_
106 |
107 | * `Bugs: OpenStack Identity
108 | (keystone) `_
109 |
110 | * `Bugs: OpenStack Image service
111 | (glance) `_
112 |
113 | * `Bugs: OpenStack Networking
114 | (neutron) `_
115 |
116 | * `Bugs: OpenStack Object Storage
117 | (swift) `_
118 |
119 | * `Bugs: Bare metal service (ironic) `_
120 |
121 | * `Bugs: Clustering service (senlin) `_
122 |
123 | * `Bugs: Container Infrastructure Management service (magnum) `_
124 |
125 | * `Bugs: Database service (trove) `_
126 |
127 | * `Bugs: DNS service (designate) `_
128 |
129 | * `Bugs: Key Manager Service (barbican) `_
130 |
131 | * `Bugs: Monitoring (monasca) `_
132 |
133 | * `Bugs: Orchestration (heat) `_
134 |
135 | * `Bugs: Rating (cloudkitty) `_
136 |
137 | * `Bugs: Shared file systems (manila) `_
138 |
139 | * `Bugs: Telemetry
140 | (ceilometer) `_
141 |
142 | * `Bugs: Workflow service
143 | (mistral) `_
144 |
145 | * `Bugs: Messaging service
146 | (zaqar) `_
147 |
148 | * `Bugs: Container service
149 | (zun) `_
150 |
151 | * `Bugs: OpenStack API Documentation
152 | (developer.openstack.org) `_
153 |
154 | * `Bugs: OpenStack Documentation
155 | (docs.openstack.org) `_
156 |
157 | Documentation feedback
158 | ~~~~~~~~~~~~~~~~~~~~~~
159 |
160 | To provide feedback on documentation, join our IRC channel ``#openstack-doc``
161 | on the OFTC IRC network, or `report a bug in Launchpad
162 | `_ and choose the particular
163 | project that the documentation is a part of.
164 |
165 | The OpenStack IRC channel
166 | ~~~~~~~~~~~~~~~~~~~~~~~~~
167 |
168 | The OpenStack community lives in the #openstack IRC channel on the
169 | OFTC network. You can hang out, ask questions, or get immediate
170 | feedback for urgent and pressing issues. To install an IRC client or use
171 | a browser-based client, go to
172 | `https://webchat.oftc.net/ `_. You can
173 | also use `Colloquy `_ (Mac OS X),
174 | `mIRC `_ (Windows),
175 | or XChat (Linux). When you are in the IRC channel
176 | and want to share code or command output, the generally accepted method
177 | is to use a Paste Bin. The OpenStack project has one at `Paste
178 | `_. Just paste your longer amounts of text or
179 | logs in the web form and you get a URL that you can paste into the
180 | channel. The OpenStack IRC channel is ``#openstack`` on
181 | ``irc.oftc.net``. You can find a list of all OpenStack IRC channels on
182 | the `IRC page on the wiki `_.
183 |
184 | OpenStack mailing lists
185 | ~~~~~~~~~~~~~~~~~~~~~~~
186 |
187 | A great way to get answers and insights is to post your question or
188 | problematic scenario to the OpenStack mailing list. You can learn from
189 | and help others who might have similar issues. To subscribe or view the
190 | archives, go to the `general OpenStack mailing list
191 | `_. If you are
192 | interested in the other mailing lists for specific projects or development,
193 | refer to `Mailing Lists `_.
194 |
195 | OpenStack distribution packages
196 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
197 |
198 | The following Linux distributions provide community-supported packages
199 | for OpenStack:
200 |
201 | * **CentOS Stream and Red Hat Enterprise Linux:**
202 | https://www.rdoproject.org/
203 |
204 | * **Ubuntu:** https://wiki.ubuntu.com/OpenStack/CloudArchive
205 |
--------------------------------------------------------------------------------
/doc/source/common/appendix.rst:
--------------------------------------------------------------------------------
1 | Appendix
2 | ~~~~~~~~
3 |
4 | .. toctree::
5 | :maxdepth: 1
6 |
7 | app-support.rst
8 | glossary.rst
9 |
--------------------------------------------------------------------------------
/doc/source/common/conventions.rst:
--------------------------------------------------------------------------------
1 | .. ## WARNING ##########################################################
2 | .. This file is synced from openstack/openstack-manuals repository to
3 | .. other related repositories. If you need to make changes to this file,
4 | .. make the changes in openstack-manuals. After any change merged to,
5 | .. openstack-manuals, automatically a patch for others will be proposed.
6 | .. #####################################################################
7 |
8 | ===========
9 | Conventions
10 | ===========
11 |
12 | The OpenStack documentation uses several typesetting conventions.
13 |
14 | Notices
15 | ~~~~~~~
16 |
17 | Notices take these forms:
18 |
19 | .. note:: A comment with additional information that explains a part of the
20 | text.
21 |
22 | .. important:: Something you must be aware of before proceeding.
23 |
24 | .. tip:: An extra but helpful piece of practical advice.
25 |
26 | .. caution:: Helpful information that prevents the user from making mistakes.
27 |
28 | .. warning:: Critical information about the risk of data loss or security
29 | issues.
30 |
31 | Command prompts
32 | ~~~~~~~~~~~~~~~
33 |
34 | .. code-block:: console
35 |
36 | $ command
37 |
38 | Any user, including the ``root`` user, can run commands that are
39 | prefixed with the ``$`` prompt.
40 |
41 | .. code-block:: console
42 |
43 | # command
44 |
45 | The ``root`` user must run commands that are prefixed with the ``#``
46 | prompt. You can also prefix these commands with the :command:`sudo`
47 | command, if available, to run them.
48 |
--------------------------------------------------------------------------------
/doc/source/compute-node-ha.rst:
--------------------------------------------------------------------------------
1 | ============================
2 | Configuring the compute node
3 | ============================
4 |
5 | The `Installation Guides
6 | `_
7 | provide instructions for installing multiple compute nodes.
8 | To make the compute nodes highly available, you must configure the
9 | environment to include multiple instances of the API and other services.
10 |
11 | Configuring high availability for instances
12 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
13 |
14 | As of September 2016, the OpenStack High Availability community is
15 | designing and developing an official and unified way to provide high
16 | availability for instances. We are developing automatic
17 | recovery from failures of hardware or hypervisor-related software on
18 | the compute node, or other failures that could prevent instances from
19 | functioning correctly, such as issues with a cinder volume I/O path.
20 |
21 | More details are available in the `user story
22 | `_
23 | co-authored by OpenStack's HA community and `Product Working Group
24 | `_ (PWG), where this feature is
25 | identified as missing functionality in OpenStack, which
26 | should be addressed with high priority.
27 |
28 | Existing solutions
29 | ~~~~~~~~~~~~~~~~~~
30 |
31 | The architectural challenges of instance HA and several currently
32 | existing solutions were presented in `a talk at the Austin summit
33 | `_,
34 | for which `slides are also available `_.
35 |
36 | The code for three of these solutions can be found online at the following
37 | links:
38 |
39 | * `a mistral-based auto-recovery workflow
40 | `_, by Intel
41 | * `masakari `_, by NTT
42 | * `OCF RAs
43 | `_,
44 | as used by Red Hat and SUSE
45 |
46 | Current upstream work
47 | ~~~~~~~~~~~~~~~~~~~~~
48 |
49 | Work is in progress on a unified approach, which combines the best
50 | aspects of existing upstream solutions. More details are available on
51 | `the HA VMs user story wiki
52 | `_.
53 |
54 | To get involved with this work, see the section on the
55 | :doc:`ha-community`.
56 |
--------------------------------------------------------------------------------
/doc/source/conf.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
10 | # implied.
11 | # See the License for the specific language governing permissions and
12 | # limitations under the License.
13 |
14 | # This file is execfile()d with the current directory set to its
15 | # containing dir.
16 | #
17 | # Note that not all possible configuration values are present in this
18 | # autogenerated file.
19 | #
20 | # All configuration values have a default; values that are commented out
21 | # serve to show the default.
22 |
23 | import os
24 | # import sys
25 |
26 | # If extensions (or modules to document with autodoc) are in another directory,
27 | # add these directories to sys.path here. If the directory is relative to the
28 | # documentation root, use os.path.abspath to make it absolute, like shown here.
29 | # sys.path.insert(0, os.path.abspath('.'))
30 |
31 | # -- General configuration ------------------------------------------------
32 |
33 | # If your documentation needs a minimal Sphinx version, state it here.
34 | # needs_sphinx = '1.0'
35 |
36 | # Add any Sphinx extension module names here, as strings. They can be
37 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
38 | # ones.
39 | extensions = ['openstackdocstheme']
40 |
41 | # Add any paths that contain templates here, relative to this directory.
42 | # templates_path = ['_templates']
43 |
44 | # The suffix of source filenames.
45 | source_suffix = '.rst'
46 |
47 | # The encoding of source files.
48 | # source_encoding = 'utf-8-sig'
49 |
50 | # The master toctree document.
51 | master_doc = 'index'
52 |
53 | # General information about the project.
54 | openstackdocs_repo_name = "openstack/ha-guide"
55 | openstackdocs_use_storyboard = True
56 | copyright = '2016-present, OpenStack contributors'
57 |
58 | # The version info for the project you're documenting, acts as replacement for
59 | # |version| and |release|, also used in various other places throughout the
60 | # built documents.
61 | #
62 | # The short X.Y version.
63 | version = ''
64 | # The full version, including alpha/beta/rc tags.
65 | release = ''
66 |
67 | # The language for content autogenerated by Sphinx. Refer to documentation
68 | # for a list of supported languages.
69 | # language = None
70 |
71 | # There are two options for replacing |today|: either, you set today to some
72 | # non-false value, then it is used:
73 | # today = ''
74 | # Else, today_fmt is used as the format for a strftime call.
75 | # today_fmt = '%B %d, %Y'
76 |
77 | # List of patterns, relative to source directory, that match files and
78 | # directories to ignore when looking for source files.
79 | exclude_patterns = ['common/cli*', 'common/nova*',
80 | 'common/get-started*', 'common/dashboard*']
81 |
82 | # The reST default role (used for this markup: `text`) to use for all
83 | # documents.
84 | # default_role = None
85 |
86 | # If true, '()' will be appended to :func: etc. cross-reference text.
87 | # add_function_parentheses = True
88 |
89 | # If true, the current module name will be prepended to all description
90 | # unit titles (such as .. function::).
91 | # add_module_names = True
92 |
93 | # If true, sectionauthor and moduleauthor directives will be shown in the
94 | # output. They are ignored by default.
95 | # show_authors = False
96 |
97 | # The name of the Pygments (syntax highlighting) style to use.
98 | pygments_style = 'sphinx'
99 |
100 | # A list of ignored prefixes for module index sorting.
101 | # modindex_common_prefix = []
102 |
103 | # If true, keep warnings as "system message" paragraphs in the built documents.
104 | # keep_warnings = False
105 |
106 |
107 | # -- Options for HTML output ----------------------------------------------
108 |
109 | # The theme to use for HTML and HTML Help pages. See the documentation for
110 | # a list of builtin themes.
111 | html_theme = 'openstackdocs'
112 |
113 | # Theme options are theme-specific and customize the look and feel of a theme
114 | # further. For a list of options available for each theme, see the
115 | # documentation.
116 | html_theme_options = {
117 | 'display_badge': False
118 | }
119 |
120 | # Add any paths that contain custom themes here, relative to this directory.
121 | # html_theme_path = [openstackdocstheme.get_html_theme_path()]
122 |
123 | # The name for this set of Sphinx documents. If None, it defaults to
124 | # " v documentation".
125 | # html_title = None
126 |
127 | # A shorter title for the navigation bar. Default is the same as html_title.
128 | # html_short_title = None
129 |
130 | # The name of an image file (relative to this directory) to place at the top
131 | # of the sidebar.
132 | # html_logo = None
133 |
134 | # The name of an image file (within the static path) to use as favicon of the
135 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
136 | # pixels large.
137 | # html_favicon = None
138 |
139 | # Add any paths that contain custom static files (such as style sheets) here,
140 | # relative to this directory. They are copied after the builtin static files,
141 | # so a file named "default.css" will overwrite the builtin "default.css".
142 | # html_static_path = []
143 |
144 | # Add any extra paths that contain custom files (such as robots.txt or
145 | # .htaccess) here, relative to this directory. These files are copied
146 | # directly to the root of the documentation.
147 | # html_extra_path = []
148 |
149 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
150 | # using the given strftime format.
151 | # So that we can enable "log-a-bug" links from each output HTML page, this
152 | # variable must be set to a format that includes year, month, day, hours and
153 | # minutes.
154 | # html_last_updated_fmt = '%Y-%m-%d %H:%M'
155 |
156 | # If true, SmartyPants will be used to convert quotes and dashes to
157 | # typographically correct entities.
158 | # html_use_smartypants = True
159 |
160 | # Custom sidebar templates, maps document names to template names.
161 | # html_sidebars = {}
162 |
163 | # Additional templates that should be rendered to pages, maps page names to
164 | # template names.
165 | # html_additional_pages = {}
166 |
167 | # If false, no module index is generated.
168 | # html_domain_indices = True
169 |
170 | # If false, no index is generated.
171 | html_use_index = False
172 |
173 | # If true, the index is split into individual pages for each letter.
174 | # html_split_index = False
175 |
176 | # If true, links to the reST sources are added to the pages.
177 | html_show_sourcelink = False
178 |
179 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
180 | # html_show_sphinx = True
181 |
182 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
183 | # html_show_copyright = True
184 |
185 | # If true, an OpenSearch description file will be output, and all pages will
186 | # contain a tag referring to it. The value of this option must be the
187 | # base URL from which the finished HTML is served.
188 | # html_use_opensearch = ''
189 |
190 | # This is the file name suffix for HTML files (e.g. ".xhtml").
191 | # html_file_suffix = None
192 |
193 | # Output file base name for HTML help builder.
194 | htmlhelp_basename = 'ha-guide'
195 |
196 | # If true, publish source files
197 | html_copy_source = False
198 |
199 | # -- Options for LaTeX output ---------------------------------------------
200 |
201 | # Grouping the document tree into LaTeX files. List of tuples
202 | # (source start file, target name, title,
203 | # author, documentclass [howto, manual, or own class]).
204 | latex_documents = [
205 | ('index', 'HAGuide.tex', 'HA Guide',
206 | 'OpenStack contributors', 'manual'),
207 | ]
208 |
209 | # The name of an image file (relative to this directory) to place at the top of
210 | # the title page.
211 | # latex_logo = None
212 |
213 | # For "manual" documents, if this is true, then toplevel headings are parts,
214 | # not chapters.
215 | # latex_use_parts = False
216 |
217 | # If true, show page references after internal links.
218 | # latex_show_pagerefs = False
219 |
220 | # If true, show URL addresses after external links.
221 | # latex_show_urls = False
222 |
223 | # Documents to append as an appendix to all manuals.
224 | # latex_appendices = []
225 |
226 | # If false, no module index is generated.
227 | # latex_domain_indices = True
228 |
229 |
230 | # -- Options for manual page output ---------------------------------------
231 |
232 | # One entry per manual page. List of tuples
233 | # (source start file, name, description, authors, manual section).
234 | man_pages = [
235 | ('index', 'haguide', 'High Availability Guide',
236 | ['OpenStack contributors'], 1)
237 | ]
238 |
239 | # If true, show URL addresses after external links.
240 | # man_show_urls = False
241 |
242 |
243 | # -- Options for Texinfo output -------------------------------------------
244 |
245 | # Grouping the document tree into Texinfo files. List of tuples
246 | # (source start file, target name, title, author,
247 | # dir menu entry, description, category)
248 | texinfo_documents = [
249 | ('index', 'HAGuide', 'High Availability Guide',
250 | 'OpenStack contributors', 'HAGuide',
251 | 'This guide shows OpenStack operators and deployers how to configure '
252 | 'OpenStack to be robust and fault-tolerant.', 'Miscellaneous'),
253 | ]
254 |
255 | # Documents to append as an appendix to all manuals.
256 | # texinfo_appendices = []
257 |
258 | # If false, no module index is generated.
259 | # texinfo_domain_indices = True
260 |
261 | # How to display URL addresses: 'footnote', 'no', or 'inline'.
262 | # texinfo_show_urls = 'footnote'
263 |
264 | # If true, do not generate a @detailmenu in the "Top" node's menu.
265 | # texinfo_no_detailmenu = False
266 |
267 | # -- Options for Internationalization output ------------------------------
268 | locale_dirs = ['locale/']
269 |
--------------------------------------------------------------------------------
/doc/source/control-plane-stateful.rst:
--------------------------------------------------------------------------------
1 | =================================
2 | Configuring the stateful services
3 | =================================
4 | .. to do: scope how in depth we want these sections to be
5 |
6 | Database for high availability
7 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
8 |
9 | Galera
10 | ------
11 |
12 | The first step is to install the database that sits at the heart of the
13 | cluster. To implement high availability, run an instance of the database on
14 | each controller node and use Galera Cluster to provide replication between
15 | them. Galera Cluster is a synchronous multi-master database cluster, based
16 | on MySQL and the InnoDB storage engine. It is a high-availability service
17 | that provides high system uptime, no data loss, and scalability for growth.
18 |
19 | You can achieve high availability for the OpenStack database in many
20 | different ways, depending on the type of database that you want to use.
21 | There are three implementations of Galera Cluster available to you:
22 |
23 | - `Galera Cluster for MySQL `_: The MySQL
24 | reference implementation from Codership, Oy.
25 | - `MariaDB Galera Cluster `_: The MariaDB
26 | implementation of Galera Cluster, which is commonly supported in
27 | environments based on Red Hat distributions.
28 | - `Percona XtraDB Cluster `_: The XtraDB
29 | implementation of Galera Cluster from Percona.
30 |
31 | In addition to Galera Cluster, you can also achieve high availability
32 | through other database options, such as PostgreSQL, which has its own
33 | replication system.
34 |
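As a minimal sketch, the Galera-specific settings on each controller node could
look like the following. The file location, provider library path, node names,
and addresses are assumptions and vary by distribution and by the Galera
implementation you choose:

.. code-block:: ini

   # Example Galera settings (file location varies by distribution,
   # for example /etc/mysql/conf.d/galera.cnf)
   [mysqld]
   binlog_format = ROW
   default_storage_engine = InnoDB
   innodb_autoinc_lock_mode = 2

   wsrep_provider = /usr/lib64/galera/libgalera_smm.so
   wsrep_cluster_name = "openstack_db_cluster"
   wsrep_cluster_address = "gcomm://controller1,controller2,controller3"
   wsrep_node_name = "controller1"
   wsrep_node_address = "10.0.0.12"
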
35 | Pacemaker active/passive with HAProxy
36 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
37 |
38 | Replicated storage
39 | ------------------
40 |
41 | For example: DRBD
42 |
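A sketch of what a DRBD-backed database could look like in ``crmsh`` syntax,
assuming a DRBD resource named ``mysql`` has already been defined under
``/etc/drbd.d/``:

.. code-block:: console

   $ crm configure primitive p_drbd_mysql ocf:linbit:drbd \
       params drbd_resource="mysql" op monitor interval="30s"
   $ crm configure ms ms_drbd_mysql p_drbd_mysql \
       meta master-max="1" master-node-max="1" \
       clone-max="2" clone-node-max="1" notify="true"

Pacemaker then promotes the DRBD device on one node at a time; the database
service and a file system resource would be colocated with the DRBD master.
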
43 | Shared storage
44 | --------------
45 |
46 | Messaging service for high availability
47 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
48 |
49 | RabbitMQ
50 | --------
51 |
52 | An AMQP (Advanced Message Queuing Protocol) compliant message bus is
53 | required for most OpenStack components in order to coordinate the
54 | execution of jobs entered into the system.
55 |
56 | The most popular AMQP implementation used in OpenStack installations
57 | is RabbitMQ.
58 |
59 | RabbitMQ nodes fail over at both the application and the infrastructure layer.
60 |
61 | The application layer is controlled by the ``oslo.messaging``
62 | configuration options for multiple AMQP hosts. If the AMQP node fails,
63 | the application reconnects to the next one configured within the
64 | specified reconnect interval. The specified reconnect interval
65 | constitutes its SLA.
66 |
67 | On the infrastructure layer, the SLA is the time it takes for the RabbitMQ
68 | cluster to reassemble. Several cases are possible. The Mnesia keeper
69 | node is the master of the corresponding Pacemaker resource for
70 | RabbitMQ. When it fails, the result is a full AMQP cluster downtime
71 | interval. Normally, its SLA is no more than several minutes. Failure
72 | of another node that is a slave of the corresponding Pacemaker
73 | resource for RabbitMQ results in no AMQP cluster downtime at all.
74 |
75 | .. until we've determined the content depth, I've transferred RabbitMQ
76 | configuration below from the old HA guide (darrenc)
77 |
78 | Making the RabbitMQ service highly available involves the following steps:
79 |
80 | - :ref:`Install RabbitMQ`
81 |
82 | - :ref:`Configure RabbitMQ for HA queues`
83 |
84 | - :ref:`Configure OpenStack services to use RabbitMQ HA queues
85 | `
86 |
87 | .. note::
88 |
89 | Access to RabbitMQ is not normally handled by HAProxy. Instead,
90 | consumers must be supplied with the full list of hosts running
91 | RabbitMQ with ``rabbit_hosts`` and must turn on the ``rabbit_ha_queues``
92 | option. For more information, read the `core issue
93 | `_.
94 | For more detail, read the `history and solution
95 | `_.
96 |
97 | .. _rabbitmq-install:
98 |
99 | Install RabbitMQ
100 | ^^^^^^^^^^^^^^^^
101 |
102 | The commands for installing RabbitMQ are specific to the Linux distribution
103 | you are using.
104 |
105 | For Ubuntu or Debian:
106 |
107 | .. code-block:: console
108 |
109 | # apt-get install rabbitmq-server
110 |
111 | For RHEL, Fedora, or CentOS:
112 |
113 | .. code-block:: console
114 |
115 | # yum install rabbitmq-server
116 |
117 | For openSUSE:
118 |
119 | .. code-block:: console
120 |
121 | # zypper install rabbitmq-server
122 |
123 | For SLES 12:
124 |
125 | .. code-block:: console
126 |
127 | # zypper addrepo -f obs://Cloud:OpenStack:Kilo/SLE_12 Kilo
128 | [Verify the fingerprint of the imported GPG key. See below.]
129 | # zypper install rabbitmq-server
130 |
131 | .. note::
132 |
133 | For SLES 12, the packages are signed by GPG key 893A90DAD85F9316.
134 | You should verify the fingerprint of the imported GPG key before using it.
135 |
136 | .. code-block:: none
137 |
138 | Key ID: 893A90DAD85F9316
139 | Key Name: Cloud:OpenStack OBS Project
140 | Key Fingerprint: 35B34E18ABC1076D66D5A86B893A90DAD85F9316
141 | Key Created: Tue Oct 8 13:34:21 2013
142 | Key Expires: Thu Dec 17 13:34:21 2015
143 |
144 | For more information, see the official installation manual for the
145 | distribution:
146 |
147 | - `Debian and Ubuntu `_
148 | - `RPM based `_
149 | (RHEL, Fedora, CentOS, openSUSE)
150 |
151 | .. _rabbitmq-configure:
152 |
153 | Configure RabbitMQ for HA queues
154 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
155 |
156 | .. [TODO: This section should begin with a brief mention
157 | .. about what HA queues are and why they are valuable, etc]
158 |
159 | .. [TODO: replace "currently" with specific release names]
160 |
161 | .. [TODO: Does this list need to be updated? Perhaps we need a table
162 | .. that shows each component and the earliest release that allows it
163 | .. to work with HA queues.]
164 |
165 | The following components/services can work with HA queues:
166 |
167 | - OpenStack Compute
168 | - OpenStack Block Storage
169 | - OpenStack Networking
170 | - Telemetry
171 |
172 | Consider that, while exchanges and bindings survive the loss of individual
173 | nodes, queues and their messages do not because a queue and its contents
174 | are located on one node. If we lose this node, we also lose the queue.
175 |
176 | Mirrored queues in RabbitMQ improve service availability because they
177 | are resilient to failures.
178 |
179 | Production servers should run (at least) three RabbitMQ servers; for testing
180 | and demonstration purposes, it is possible to run only two servers.
181 | In this section, we configure two nodes, called ``rabbit1`` and ``rabbit2``.
182 | To build a broker, ensure that all nodes have the same Erlang cookie file.
183 |
184 | .. [TODO: Should the example instead use a minimum of three nodes?]
185 |
186 | #. Stop RabbitMQ and copy the cookie from the first node to each of the
187 | other node(s):
188 |
189 | .. code-block:: console
190 |
191 | # scp /var/lib/rabbitmq/.erlang.cookie root@NODE:/var/lib/rabbitmq/.erlang.cookie
192 |
193 | #. On each target node, verify the correct owner,
194 | group, and permissions of the file :file:`.erlang.cookie`:
195 |
196 | .. code-block:: console
197 |
198 | # chown rabbitmq:rabbitmq /var/lib/rabbitmq/.erlang.cookie
199 | # chmod 400 /var/lib/rabbitmq/.erlang.cookie
200 |
201 | #. Start the message queue service on all nodes and configure it to start
202 | when the system boots. On Ubuntu, it is configured by default.
203 |
204 | On CentOS, RHEL, openSUSE, and SLES:
205 |
206 | .. code-block:: console
207 |
208 | # systemctl enable rabbitmq-server.service
209 | # systemctl start rabbitmq-server.service
210 |
211 | #. Verify that the nodes are running:
212 |
213 | .. code-block:: console
214 |
215 | # rabbitmqctl cluster_status
216 | Cluster status of node rabbit@NODE...
217 | [{nodes,[{disc,[rabbit@NODE]}]},
218 | {running_nodes,[rabbit@NODE]},
219 | {partitions,[]}]
220 | ...done.
221 |
222 | #. Run the following commands on each node except the first one:
223 |
224 | .. code-block:: console
225 |
226 | # rabbitmqctl stop_app
227 | Stopping node rabbit@NODE...
228 | ...done.
229 | # rabbitmqctl join_cluster --ram rabbit@rabbit1
230 | # rabbitmqctl start_app
231 | Starting node rabbit@NODE ...
232 | ...done.
233 |
234 | .. note::
235 |
236 | The default node type is a disc node. In this guide, nodes
237 | join the cluster as RAM nodes.
238 |
239 | #. Verify the cluster status:
240 |
241 | .. code-block:: console
242 |
243 | # rabbitmqctl cluster_status
244 | Cluster status of node rabbit@NODE...
245 | [{nodes,[{disc,[rabbit@rabbit1]},{ram,[rabbit@NODE]}]}, \
246 | {running_nodes,[rabbit@NODE,rabbit@rabbit1]}]
247 |
248 | If the cluster is working, you can create usernames and passwords
249 | for the queues.
250 |
251 | #. To ensure that all queues except those with auto-generated names
252 | are mirrored across all running nodes,
253 | set the ``ha-mode`` policy key to all
254 | by running the following command on one of the nodes:
255 |
256 | .. code-block:: console
257 |
258 | # rabbitmqctl set_policy ha-all '^(?!amq\.).*' '{"ha-mode": "all"}'
259 |
260 | More information is available in the RabbitMQ documentation:
261 |
262 | - `Highly Available Queues `_
263 | - `Clustering Guide `_
264 |
265 | .. note::
266 |
267 | As another option to make RabbitMQ highly available, RabbitMQ has shipped
268 | OCF scripts for the Pacemaker cluster resource agents since version 3.5.7.
269 | These provide an active/active RabbitMQ cluster with mirrored queues.
270 | For more information, see `Auto-configuration of a cluster with
271 | a Pacemaker `_.
272 |
273 | .. _rabbitmq-services:
274 |
275 | Configure OpenStack services to use RabbitMQ HA queues
276 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
277 |
278 | Configure the OpenStack components to use at least two RabbitMQ nodes.
279 |
280 | Use these steps to configure all services that use RabbitMQ:
281 |
282 | #. RabbitMQ HA cluster ``host:port`` pairs:
283 |
284 | .. code-block:: console
285 |
286 | rabbit_hosts=rabbit1:5672,rabbit2:5672,rabbit3:5672
287 |
288 | #. Retry connecting with RabbitMQ:
289 |
290 | .. code-block:: console
291 |
292 | rabbit_retry_interval=1
293 |
294 | #. How long to back-off for between retries when connecting to RabbitMQ:
295 |
296 | .. code-block:: console
297 |
298 | rabbit_retry_backoff=2
299 |
300 | #. Maximum retries with trying to connect to RabbitMQ (infinite by default):
301 |
302 | .. code-block:: console
303 |
304 | rabbit_max_retries=0
305 |
306 | #. Use durable queues in RabbitMQ:
307 |
308 | .. code-block:: console
309 |
310 | rabbit_durable_queues=true
311 |
312 | #. Use HA queues in RabbitMQ (``x-ha-policy: all``):
313 |
314 | .. code-block:: console
315 |
316 | rabbit_ha_queues=true
317 |
318 | .. note::
319 |
320 | If you change the configuration from an old set-up
321 | that did not use HA queues, restart the service:
322 |
323 | .. code-block:: console
324 |
325 | # rabbitmqctl stop_app
326 | # rabbitmqctl reset
327 | # rabbitmqctl start_app
328 |
329 |
330 |
331 |
332 |
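Taken together, the options above would appear in each service's configuration
file (``nova.conf``, ``cinder.conf``, and so on) roughly as in the following
sketch; depending on your release, the options live in the ``[DEFAULT]`` or in
the ``[oslo_messaging_rabbit]`` section:

.. code-block:: ini

   [oslo_messaging_rabbit]
   rabbit_hosts = rabbit1:5672,rabbit2:5672,rabbit3:5672
   rabbit_retry_interval = 1
   rabbit_retry_backoff = 2
   rabbit_max_retries = 0
   rabbit_durable_queues = true
   rabbit_ha_queues = true
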
333 | Pacemaker active/passive
334 | ------------------------
335 |
336 |
337 |
338 | Mirrored queues
339 | ---------------
340 |
341 | Qpid
342 | ----
343 |
--------------------------------------------------------------------------------
/doc/source/control-plane-stateless.rst:
--------------------------------------------------------------------------------
1 | ==============================
2 | Configuring stateless services
3 | ==============================
4 |
5 | .. to do: scope what details we want on the following services
6 |
7 | API services
8 | ~~~~~~~~~~~~
9 |
10 | Load-balancer
11 | ~~~~~~~~~~~~~
12 |
13 | HAProxy
14 | -------
15 |
16 | HAProxy provides a fast and reliable HTTP reverse proxy and load balancer
17 | for TCP or HTTP applications. It is particularly suited for web sites with
18 | very high loads that need persistence or Layer 7 processing.
19 | It realistically supports tens of thousands of connections with recent
20 | hardware.
21 |
22 | Each instance of HAProxy configures its front end to accept connections only
23 | to the virtual IP (VIP) address. The HAProxy back end (termination
24 | point) is a list of all the IP addresses of instances for load balancing.
25 |
26 | .. note::
27 |
28 | Ensure that your HAProxy installation is not a single point of failure;
29 | it is advisable to run multiple HAProxy instances.
30 |
31 | You can also ensure availability by other means, such as Keepalived
32 | or Pacemaker (see the sketch below).
33 |
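For example, a minimal Keepalived configuration that floats a VIP between two
HAProxy nodes could look like the following sketch. The interface name,
``virtual_router_id``, and VIP address are assumptions; the second node would
use ``state BACKUP`` and a lower ``priority``:

.. code-block:: none

   vrrp_instance haproxy_vip {
       state MASTER
       interface eth0
       virtual_router_id 51
       priority 101
       advert_int 1
       virtual_ipaddress {
           10.0.0.11/24
       }
   }
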
34 | Alternatively, you can use a commercial load balancer, either hardware-based
35 | or software-based. We recommend a hardware load balancer as it generally has
36 | good performance.
37 |
38 | For detailed instructions about installing HAProxy on your nodes,
39 | see the HAProxy `official documentation `_.
40 |
41 | Configuring HAProxy
42 | ^^^^^^^^^^^^^^^^^^^
43 |
44 | #. Configure HAProxy on each OpenStack controller node in your
45 | environment. The following is an example ``/etc/haproxy/haproxy.cfg``
46 | configuration file; you will need a copy of it on each
47 | controller node. Restart the HAProxy service after the
48 | configuration is in place.
51 |
52 |
53 | .. code-block:: none
54 |
55 | global
56 | chroot /var/lib/haproxy
57 | daemon
58 | group haproxy
59 | maxconn 4000
60 | pidfile /var/run/haproxy.pid
61 | user haproxy
62 |
63 | defaults
64 | log global
65 | maxconn 4000
66 | option redispatch
67 | retries 3
68 | timeout http-request 10s
69 | timeout queue 1m
70 | timeout connect 10s
71 | timeout client 1m
72 | timeout server 1m
73 | timeout check 10s
74 |
75 | listen dashboard_cluster
76 | bind :443
77 | balance source
78 | option tcpka
79 | option httpchk
80 | option tcplog
81 | server controller1 10.0.0.12:443 check inter 2000 rise 2 fall 5
82 | server controller2 10.0.0.13:443 check inter 2000 rise 2 fall 5
83 | server controller3 10.0.0.14:443 check inter 2000 rise 2 fall 5
84 |
85 | listen galera_cluster
86 | bind :3306
87 | balance source
88 | option mysql-check
89 | server controller1 10.0.0.12:3306 check port 9200 inter 2000 rise 2 fall 5
90 | server controller2 10.0.0.13:3306 backup check port 9200 inter 2000 rise 2 fall 5
91 | server controller3 10.0.0.14:3306 backup check port 9200 inter 2000 rise 2 fall 5
92 |
93 | listen glance_api_cluster
94 | bind :9292
95 | balance source
96 | option tcpka
97 | option httpchk
98 | option tcplog
99 | server controller1 10.0.0.12:9292 check inter 2000 rise 2 fall 5
100 | server controller2 10.0.0.13:9292 check inter 2000 rise 2 fall 5
101 | server controller3 10.0.0.14:9292 check inter 2000 rise 2 fall 5
102 |
103 | listen glance_registry_cluster
104 | bind :9191
105 | balance source
106 | option tcpka
107 | option tcplog
108 | server controller1 10.0.0.12:9191 check inter 2000 rise 2 fall 5
109 | server controller2 10.0.0.13:9191 check inter 2000 rise 2 fall 5
110 | server controller3 10.0.0.14:9191 check inter 2000 rise 2 fall 5
111 |
112 | listen keystone_admin_cluster
113 | bind :35357
114 | balance source
115 | option tcpka
116 | option httpchk
117 | option tcplog
118 | server controller1 10.0.0.12:35357 check inter 2000 rise 2 fall 5
119 | server controller2 10.0.0.13:35357 check inter 2000 rise 2 fall 5
120 | server controller3 10.0.0.14:35357 check inter 2000 rise 2 fall 5
121 |
122 | listen keystone_public_internal_cluster
123 | bind :5000
124 | balance source
125 | option tcpka
126 | option httpchk
127 | option tcplog
128 | server controller1 10.0.0.12:5000 check inter 2000 rise 2 fall 5
129 | server controller2 10.0.0.13:5000 check inter 2000 rise 2 fall 5
130 | server controller3 10.0.0.14:5000 check inter 2000 rise 2 fall 5
131 |
132 | listen nova_ec2_api_cluster
133 | bind :8773
134 | balance source
135 | option tcpka
136 | option tcplog
137 | server controller1 10.0.0.12:8773 check inter 2000 rise 2 fall 5
138 | server controller2 10.0.0.13:8773 check inter 2000 rise 2 fall 5
139 | server controller3 10.0.0.14:8773 check inter 2000 rise 2 fall 5
140 |
141 | listen nova_compute_api_cluster
142 | bind :8774
143 | balance source
144 | option tcpka
145 | option httpchk
146 | option tcplog
147 | server controller1 10.0.0.12:8774 check inter 2000 rise 2 fall 5
148 | server controller2 10.0.0.13:8774 check inter 2000 rise 2 fall 5
149 | server controller3 10.0.0.14:8774 check inter 2000 rise 2 fall 5
150 |
151 | listen nova_metadata_api_cluster
152 | bind :8775
153 | balance source
154 | option tcpka
155 | option tcplog
156 | server controller1 10.0.0.12:8775 check inter 2000 rise 2 fall 5
157 | server controller2 10.0.0.13:8775 check inter 2000 rise 2 fall 5
158 | server controller3 10.0.0.14:8775 check inter 2000 rise 2 fall 5
159 |
160 | listen cinder_api_cluster
161 | bind :8776
162 | balance source
163 | option tcpka
164 | option httpchk
165 | option tcplog
166 | server controller1 10.0.0.12:8776 check inter 2000 rise 2 fall 5
167 | server controller2 10.0.0.13:8776 check inter 2000 rise 2 fall 5
168 | server controller3 10.0.0.14:8776 check inter 2000 rise 2 fall 5
169 |
170 | listen ceilometer_api_cluster
171 | bind :8777
172 | balance source
173 | option tcpka
174 | option tcplog
175 | server controller1 10.0.0.12:8777 check inter 2000 rise 2 fall 5
176 | server controller2 10.0.0.13:8777 check inter 2000 rise 2 fall 5
177 | server controller3 10.0.0.14:8777 check inter 2000 rise 2 fall 5
178 |
179 | listen nova_vncproxy_cluster
180 | bind :6080
181 | balance source
182 | option tcpka
183 | option tcplog
184 | server controller1 10.0.0.12:6080 check inter 2000 rise 2 fall 5
185 | server controller2 10.0.0.13:6080 check inter 2000 rise 2 fall 5
186 | server controller3 10.0.0.14:6080 check inter 2000 rise 2 fall 5
187 |
188 | listen neutron_api_cluster
189 | bind :9696
190 | balance source
191 | option tcpka
192 | option httpchk
193 | option tcplog
194 | server controller1 10.0.0.12:9696 check inter 2000 rise 2 fall 5
195 | server controller2 10.0.0.13:9696 check inter 2000 rise 2 fall 5
196 | server controller3 10.0.0.14:9696 check inter 2000 rise 2 fall 5
197 |
198 | listen swift_proxy_cluster
199 | bind :8080
200 | balance source
201 | option tcplog
202 | option tcpka
203 | server controller1 10.0.0.12:8080 check inter 2000 rise 2 fall 5
204 | server controller2 10.0.0.13:8080 check inter 2000 rise 2 fall 5
205 | server controller3 10.0.0.14:8080 check inter 2000 rise 2 fall 5
206 |
207 | .. note::
208 |
209 | The Galera cluster configuration directive ``backup`` indicates
210 | that two of the three controllers are standby nodes.
211 | This ensures that only one node services write requests
212 | because OpenStack support for multi-node writes is not yet production-ready.
213 |
214 | .. note::
215 |
216 |    The Telemetry API service configuration does not include the ``option httpchk``
217 |    directive because the Telemetry API cannot process this health check properly.
218 |
219 | .. TODO: explain why the Telemetry API is so special
220 |
221 | #. Configure the kernel parameter to allow non-local IP binding. This allows
222 |    running HAProxy instances to bind to a VIP for failover. Add the following
223 |    line to ``/etc/sysctl.conf``:
224 |
225 | .. code-block:: none
226 |
227 | net.ipv4.ip_nonlocal_bind = 1
228 |
229 | #. Restart the host or, to apply the change immediately, run:
230 |
231 | .. code-block:: console
232 |
233 | $ sysctl -p
234 |
235 | #. Add HAProxy to the cluster and ensure the VIPs can only run on machines
236 | where HAProxy is active:
237 |
238 | ``pcs``
239 |
240 | .. code-block:: console
241 |
242 | $ pcs resource create lb-haproxy systemd:haproxy --clone
243 | $ pcs constraint order start vip then lb-haproxy-clone kind=Optional
244 | $ pcs constraint colocation add lb-haproxy-clone with vip
245 |
246 | ``crmsh``
247 |
248 | .. code-block:: console
249 |
250 | $ crm cib new conf-haproxy
251 | $ crm configure primitive haproxy lsb:haproxy op monitor interval="1s"
252 | $ crm configure clone haproxy-clone haproxy
253 | $ crm configure colocation vip-with-haproxy inf: vip haproxy-clone
254 | $ crm configure order haproxy-after-vip mandatory: vip haproxy-clone
255 |
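256 | Whichever toolset you use, you can verify that the VIP and the HAProxy
257 | clone are running together by inspecting the cluster status, for example:
258 |
259 | .. code-block:: console
260 |
261 |    $ pcs status
262 |    $ crm_mon -1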
256 |
257 | Pacemaker versus systemd
258 | ------------------------
259 |
260 | Memcached
261 | ---------
262 |
263 | Memcached is a general-purpose distributed memory caching system. It
264 | is used to speed up dynamic database-driven websites by caching data
265 | and objects in RAM to reduce the number of times an external data
266 | source must be read.
267 |
268 | Memcached is a memory cache daemon that can be used by most OpenStack
269 | services to store ephemeral data, such as tokens.
270 |
271 | Access to Memcached is not handled by HAProxy because replicated
272 | access is currently in an experimental state. Instead, OpenStack
273 | services must be supplied with the full list of hosts running
274 | Memcached.
275 |
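276 | For example, a service that uses oslo.cache might list all Memcached hosts
277 | in its configuration file. The following is a minimal sketch, assuming three
278 | controllers named ``controller1`` through ``controller3``:
279 |
280 | .. code-block:: ini
281 |
282 |    [cache]
283 |    enabled = true
284 |    backend = oslo_cache.memcache_pool
285 |    memcache_servers = controller1:11211,controller2:11211,controller3:11211
286 |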
276 | The Memcached client implements hashing to balance objects among the
277 | instances. Failure of an instance impacts only a percentage of the
278 | objects and the client automatically removes it from the list of
279 | instances. The expected recovery time (SLA) is several minutes.
280 |
281 |
282 | Highly available API services
283 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
284 |
285 | Identity API
286 | ------------
287 |
288 | Ensure you have read the
289 | `OpenStack Identity service getting started documentation
290 | `_.
291 |
292 | .. to do: reference controller-ha-identity and see if section involving
293 | adding to pacemaker is in scope
294 |
295 |
296 | Add OpenStack Identity resource to Pacemaker
297 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
298 |
299 | The following sections detail how to add the Identity service
300 | to Pacemaker on SUSE-based and Red Hat-based systems.
301 |
302 | SUSE
303 | ----
304 |
305 | SUSE Linux Enterprise and SUSE-based distributions, such as openSUSE,
306 | use a set of OCF agents for controlling OpenStack services.
307 |
308 | #. Run the following commands to download the OpenStack Identity resource
309 |    agent for Pacemaker:
310 |
311 | .. code-block:: console
312 |
313 | # cd /usr/lib/ocf/resource.d
314 | # mkdir openstack
315 | # cd openstack
316 | # wget https://opendev.org/x/openstack-resource-agents/raw/branch/master/ocf/keystone
317 | # chmod a+rx *
318 |
319 | #. Add the Pacemaker configuration for the OpenStack Identity resource
320 | by running the following command to connect to the Pacemaker cluster:
321 |
322 | .. code-block:: console
323 |
324 | # crm configure
325 |
326 | #. Add the following cluster resources:
327 |
328 | .. code-block:: console
329 |
330 |      primitive p_keystone ocf:openstack:keystone \
331 |        params config="/etc/keystone/keystone.conf" os_password="secretsecret" \
332 |        os_username="admin" os_tenant_name="admin" \
333 |        os_auth_url="http://10.0.0.11:5000/v2.0/" \
334 |        op monitor interval="30s" timeout="30s"
335 |      clone cl_keystone p_keystone
333 |
334 | .. note::
335 |
336 |       This configuration creates ``p_keystone``, a resource for managing the
337 |       OpenStack Identity service, and clones it so that the service runs on
338 |       all of your controller nodes.
338 |
339 | #. Commit your configuration changes from the :command:`crm configure` menu
340 | with the following command:
341 |
342 | .. code-block:: console
343 |
344 | # commit
345 |
346 | The :command:`crm configure` command supports batch input. Copy and paste
347 | the above lines into your live Pacemaker configuration, and then make
348 | changes as required.
349 |
350 | For example, you may enter ``edit p_ip_keystone`` from the
351 | :command:`crm configure` menu and edit the resource to match your preferred
352 | virtual IP address.
353 |
354 | Pacemaker now starts the OpenStack Identity service and its dependent
355 | resources on all of your nodes.
356 |
357 | Red Hat
358 | --------
359 |
360 | For Red Hat Enterprise Linux and Red Hat-based Linux distributions,
361 | the following process uses Systemd unit files.
362 |
363 | .. code-block:: console
364 |
365 | # pcs resource create openstack-keystone systemd:openstack-keystone --clone interleave=true
366 |
367 | .. _identity-config-identity:
368 |
369 | Configure OpenStack Identity service
370 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
371 |
372 | #. Edit the :file:`keystone.conf` file
373 | to change the values of the :manpage:`bind(2)` parameters:
374 |
375 | .. code-block:: ini
376 |
377 | bind_host = 10.0.0.12
378 | public_bind_host = 10.0.0.12
379 | admin_bind_host = 10.0.0.12
380 |
381 | The ``admin_bind_host`` parameter
382 | lets you use a private network for admin access.
383 |
384 | #. To be sure that all data is highly available,
385 | ensure that everything is stored in the MySQL database
386 | (which is also highly available):
387 |
388 | .. code-block:: ini
389 |
390 | [catalog]
391 | driver = keystone.catalog.backends.sql.Catalog
392 | # ...
393 | [identity]
394 | driver = keystone.identity.backends.sql.Identity
395 | # ...
396 |
397 | #. If the Identity service will be sending ceilometer notifications
398 |    and your message bus is configured for high availability, ensure
399 |    that the Identity service is correctly configured to use that
400 |    highly available message bus (see the sketch after this list).
401 |
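402 | The exact options depend on your oslo.messaging version. A minimal sketch
403 | for :file:`keystone.conf`, assuming RabbitMQ, a ``RABBIT_PASS`` placeholder,
404 | and the controller host names used elsewhere in this guide:
405 |
406 | .. code-block:: ini
407 |
408 |    [DEFAULT]
409 |    transport_url = rabbit://openstack:RABBIT_PASS@controller1:5672,openstack:RABBIT_PASS@controller2:5672,openstack:RABBIT_PASS@controller3:5672/
410 |
411 |    [oslo_messaging_notifications]
412 |    driver = messagingv2
413 |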
402 | .. _identity-services-config:
403 |
404 | Configure OpenStack services to use the highly available OpenStack Identity
405 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
406 |
407 | Your OpenStack services must now point their OpenStack Identity configuration
408 | to the highly available virtual cluster IP address rather than to the physical
409 | IP address of an individual controller.
409 |
410 | #. For the OpenStack Compute service, if your OpenStack Identity service
411 |    VIP is 10.0.0.11, use the following configuration in the
412 |    :file:`api-paste.ini` file (a newer-style alternative is sketched
413 |    after this list):
413 |
414 | .. code-block:: ini
415 |
416 | auth_host = 10.0.0.11
417 |
418 | #. Create the OpenStack Identity Endpoint with this IP address.
419 |
420 | .. note::
421 |
422 | If you are using both private and public IP addresses,
423 | create two virtual IP addresses and define the endpoint. For
424 | example:
425 |
426 | .. code-block:: console
427 |
428 | $ openstack endpoint create --region $KEYSTONE_REGION \
429 | $service-type public http://PUBLIC_VIP:5000/v2.0
430 | $ openstack endpoint create --region $KEYSTONE_REGION \
431 | $service-type admin http://10.0.0.11:35357/v2.0
432 | $ openstack endpoint create --region $KEYSTONE_REGION \
433 | $service-type internal http://10.0.0.11:5000/v2.0
434 |
435 | #. If you are using Dashboard (horizon), edit the :file:`local_settings.py`
436 | file to include the following:
437 |
438 | .. code-block:: ini
439 |
440 | OPENSTACK_HOST = 10.0.0.11
441 |
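442 | On more recent releases, the Identity middleware is typically configured in
443 | the service's own configuration file (for example, :file:`nova.conf`) rather
444 | than in :file:`api-paste.ini`. A minimal sketch, assuming the 10.0.0.11 VIP
445 | and a ``NOVA_PASS`` placeholder:
446 |
447 | .. code-block:: ini
448 |
449 |    [keystone_authtoken]
450 |    www_authenticate_uri = http://10.0.0.11:5000/
451 |    auth_url = http://10.0.0.11:5000/
452 |    memcached_servers = controller1:11211,controller2:11211,controller3:11211
453 |    auth_type = password
454 |    project_domain_name = Default
455 |    user_domain_name = Default
456 |    project_name = service
457 |    username = nova
458 |    password = NOVA_PASS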
442 |
443 | Telemetry API
444 | -------------
445 |
446 | The Telemetry polling agent can be configured to partition its polling
447 | workload between multiple agents. This enables high availability (HA).
448 |
449 | Both the central and the compute agent can run in an HA deployment.
450 | This means that multiple instances of these services can run in
451 | parallel with workload partitioning among these running instances.
452 |
453 | The `Tooz `_ library provides
454 | the coordination within the groups of service instances.
455 | It provides an API above several back ends that can be used for building
456 | distributed applications.
457 |
458 | Tooz supports
459 | `various drivers `_
460 | including the following back end solutions:
461 |
462 | * `Zookeeper `_:
463 | Recommended solution by the Tooz project.
464 |
465 | * `Redis `_:
466 | Recommended solution by the Tooz project.
467 |
468 | * `Memcached `_:
469 | Recommended for testing.
470 |
471 | You must configure a supported Tooz driver for the HA deployment of
472 | the Telemetry services.
473 |
474 | For information about the required configuration options
475 | to set in the :file:`ceilometer.conf`, see the `coordination section
476 | `_
477 | in the OpenStack Configuration Reference.
478 |
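479 | For example, a minimal sketch of the relevant :file:`ceilometer.conf`
480 | setting, assuming a Redis back end reachable through the 10.0.0.11 VIP on
481 | its default port:
482 |
483 | .. code-block:: ini
484 |
485 |    [coordination]
486 |    backend_url = redis://10.0.0.11:6379
487 |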
479 | .. note::
480 |
481 |    If the ``backend_url`` option is not set, only one instance of the
482 |    central and compute agent services can run and function correctly.
483 |
484 | The availability check of the instances is provided by heartbeat messages.
485 | When the connection with an instance is lost, the workload is
486 | reassigned among the remaining instances in the next polling cycle.
487 |
488 | .. note::
489 |
490 | Memcached uses a timeout value, which should always be set to
491 | a value that is higher than the heartbeat value set for Telemetry.
492 |
493 | For backward compatibility and to support existing deployments, the central
494 | agent configuration supports using different configuration files for
495 | groups of service instances that are running in parallel.
496 | To enable this configuration, set a value for the
497 | ``partitioning_group_prefix`` option in the
498 | `polling section `_
499 | in the OpenStack Configuration Reference.
500 |
501 | .. warning::
502 |
503 | For each sub-group of the central agent pool with the same
504 | ``partitioning_group_prefix``, a disjoint subset of meters must be polled
505 | to avoid samples being missing or duplicated. The list of meters to poll
506 | can be set in the :file:`/etc/ceilometer/pipeline.yaml` configuration file.
507 | For more information about pipelines see the `Data processing and pipelines
508 | `_
509 | section.
510 |
511 | To enable the compute agent to run multiple instances simultaneously with
512 | workload partitioning, the ``workload_partitioning`` option must be set to
513 | ``True`` under the `compute section `_
514 | in the :file:`ceilometer.conf` configuration file.
515 |
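516 | A minimal sketch of that setting, assuming the rest of the ``[compute]``
517 | section is left unchanged:
518 |
519 | .. code-block:: ini
520 |
521 |    [compute]
522 |    workload_partitioning = True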
516 |
517 | .. To Do: Cover any other projects here with API services which require specific
518 | HA details.
519 |
--------------------------------------------------------------------------------
/doc/source/control-plane.rst:
--------------------------------------------------------------------------------
1 | ===========================
2 | Configuring a control plane
3 | ===========================
4 |
5 | .. toctree::
6 | :maxdepth: 2
7 |
8 | control-plane-stateless.rst
9 | control-plane-stateful.rst
10 |
--------------------------------------------------------------------------------
/doc/source/figures/Cluster-deployment-collapsed.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openstack/ha-guide/7154c29fb7c810496f4913c9d3ffa738f56d3afe/doc/source/figures/Cluster-deployment-collapsed.png
--------------------------------------------------------------------------------
/doc/source/figures/Cluster-deployment-segregated.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openstack/ha-guide/7154c29fb7c810496f4913c9d3ffa738f56d3afe/doc/source/figures/Cluster-deployment-segregated.png
--------------------------------------------------------------------------------
/doc/source/ha-community.rst:
--------------------------------------------------------------------------------
1 | ============
2 | HA community
3 | ============
4 |
5 | The OpenStack HA community holds `weekly IRC meetings
6 | `_ to discuss
7 | a range of topics relating to HA in OpenStack. Everyone interested is
8 | encouraged to attend. The `logs of all previous meetings
9 | `_ are available to read.
10 |
11 | You can contact the HA community directly in `the #openstack-ha
12 | channel on Freenode IRC `_, or by
13 | sending mail to the `openstack-dev
14 | `_
15 | mailing list with the ``[HA]`` prefix in the ``Subject`` header.
16 |
--------------------------------------------------------------------------------
/doc/source/index.rst:
--------------------------------------------------------------------------------
1 | =================================
2 | OpenStack High Availability Guide
3 | =================================
4 |
5 | Abstract
6 | ~~~~~~~~
7 |
8 | This guide describes how to install and configure OpenStack for high
9 | availability. It supplements the Installation Guides
10 | and assumes that you are familiar with the material in those guides.
11 |
12 | .. warning::
13 |
14 | This guide is a work-in-progress and changing rapidly while we
15 | continue to test and enhance the guidance. There are open TODO
16 | items throughout the guide which will be tracked on
17 | `the ha-guide Storyboard site
18 | `_.
19 | There is also a `bug list corresponding to the old version of the
20 | guide
21 | `_
22 |    which needs to be triaged, as some of those bugs may still be
23 |    relevant, in which case they should be ported over to Storyboard.
24 | Please help where you are able.
25 |
26 | .. toctree::
27 | :maxdepth: 1
28 |
29 | common/conventions.rst
30 | overview.rst
31 | intro-ha.rst
32 | intro-os-ha.rst
33 | control-plane.rst
34 | networking-ha.rst
35 | storage-ha.rst
36 | compute-node-ha.rst
37 | monitoring.rst
38 | testing.rst
39 | ref-arch-examples.rst
40 | ha-community.rst
41 | common/appendix.rst
42 |
--------------------------------------------------------------------------------
/doc/source/intro-ha-common-tech.rst:
--------------------------------------------------------------------------------
1 | ========================
2 | Commonly used technology
3 | ========================
4 | High availability can only be achieved at the system level: both hardware and
5 | software components contribute to the overall availability of the system.
6 | This document lists the most common hardware and software technologies
7 | that can be used to build a highly available system.
8 |
9 | Hardware
10 | ~~~~~~~~
11 | Using different technologies to enable high availability at the hardware
12 | level provides a good basis for building a highly available system. The
13 | following sections discuss the most common technologies used in this field.
14 |
15 | Redundant switches
16 | ------------------
17 | Network switches are single points of failure because networking is critical
18 | to the operation of all other basic domains of the infrastructure, such as
19 | compute and storage. Network switches must be able to keep forwarding traffic
20 | and to forward it to a working next hop.
21 | For these reasons, consider the following two factors when making a network
22 | switch redundant:
23 |
24 | #. The network switch itself should synchronize its internal state to a
25 | redundant switch either in active/active or active/passive way.
26 |
27 | #. The network topology should be designed in a way that the network router can
28 | use at least two paths in every critical direction.
29 |
30 | Bonded interfaces
31 | -----------------
32 | Bonded interfaces are two independent physical network interfaces handled as
33 | one interface in active/passive or in active/active redundancy mode. In
34 | active/passive mode, if an error occurs on the active network interface or at
35 | its remote end, traffic fails over to the passive interface. In active/active
36 | mode, when an error occurs on an interface or at its remote end, that
37 | interface is marked as unavailable and is no longer used, while traffic
38 | continues on the remaining interface.
39 |
40 | Load balancers
41 | --------------
42 | Physical load balancers are specialized routers that direct traffic based on
43 | a set of rules. Like physical switches, load balancers can be deployed
44 | redundantly.
45 | Load balancers are also important for distributing traffic across the
46 | active/active components of the system.
47 |
48 | Storage
49 | -------
50 | Physical storage high availability can be achieved with different scopes:
51 |
52 | #. High availability within a hardware unit with redundant disks (mostly
53 | organized into different RAID configurations), redundant control components,
54 | redundant I/O interfaces and redundant power supply.
55 |
56 | #. System level high availability with redundant hardware units with data
57 | replication.
58 |
59 | Software
60 | ~~~~~~~~
61 |
62 | HAProxy
63 | -------
64 |
65 | HAProxy provides a fast and reliable HTTP reverse proxy and load balancer
66 | for TCP or HTTP applications. It is particularly suited for web sites that
67 | crawl under very high loads while needing persistence or Layer 7 processing.
68 | It realistically supports tens of thousands of connections with recent
69 | hardware.
70 |
71 | .. note::
72 |
73 |    Ensure that your HAProxy installation is not a single point of failure;
74 |    it is advisable to run multiple HAProxy instances.
75 |
76 |    You can also ensure availability by other means, such as Keepalived
77 |    or Pacemaker.
78 |
79 | Alternatively, you can use a commercial load balancer, either hardware or
80 | software based. We recommend a hardware load balancer because it generally
81 | offers better performance.
82 |
83 | For detailed instructions about installing HAProxy on your nodes,
84 | see the HAProxy `official documentation `_.
85 |
86 | keepalived
87 | ----------
88 |
89 | `keepalived `_ is routing software that
90 | provides load balancing and high availability facilities to Linux
91 | systems and Linux-based infrastructures.
92 |
93 | Keepalived implements a set of checkers to dynamically and
94 | adaptively maintain and manage a load-balanced server pool according
95 | to the health of its members.
96 |
97 | The keepalived daemon can be used to monitor services or systems and
98 | to automatically fail over to a standby if problems occur.
99 |
100 | Pacemaker
101 | ---------
102 |
103 | The `Pacemaker `_ cluster stack is a state-of-the-art
104 | high availability and load balancing stack for the Linux platform.
105 | Pacemaker is used to make the OpenStack infrastructure highly available.
106 |
107 | Pacemaker relies on the
108 | `Corosync `_ messaging layer
109 | for reliable cluster communications. Corosync implements the Totem single-ring
110 | ordering and membership protocol. It also provides UDP and InfiniBand based
111 | messaging, quorum, and cluster membership to Pacemaker.
112 |
113 | Pacemaker does not inherently understand the applications it manages.
114 | Instead, it relies on resource agents (RAs) that are scripts that encapsulate
115 | the knowledge of how to start, stop, and check the health of each application
116 | managed by the cluster.
117 |
118 | These agents must conform to one of the `OCF `_,
120 | `SysV Init `_, Upstart, or Systemd standards.
122 |
123 | Pacemaker ships with a large set of OCF agents (such as those managing
124 | MySQL databases, virtual IP addresses, and RabbitMQ), but can also use
125 | any agents already installed on your system and can be extended with
126 | your own (see the
127 | `developer guide `_).
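128 |
129 | For example, the virtual IP addresses used throughout this guide are
130 | typically managed by the ``ocf:heartbeat:IPaddr2`` agent. A minimal crmsh
131 | sketch, assuming the 10.0.0.11 VIP and a /24 netmask:
132 |
133 | .. code-block:: none
134 |
135 |    primitive vip ocf:heartbeat:IPaddr2 \
136 |      params ip="10.0.0.11" cidr_netmask="24" \
137 |      op monitor interval="30s"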
128 |
--------------------------------------------------------------------------------
/doc/source/intro-ha-key-concepts.rst:
--------------------------------------------------------------------------------
1 | ============
2 | Key concepts
3 | ============
4 |
5 | Redundancy and failover
6 | ~~~~~~~~~~~~~~~~~~~~~~~
7 |
8 | High availability is implemented with redundant hardware
9 | running redundant instances of each service.
10 | If one piece of hardware running one instance of a service fails,
11 | the system can then failover to use another instance of a service
12 | that is running on hardware that did not fail.
13 |
14 | A crucial aspect of high availability
15 | is the elimination of single points of failure (SPOFs).
16 | A SPOF is an individual piece of equipment or software
17 | that causes system downtime or data loss if it fails.
18 | In order to eliminate SPOFs, check that mechanisms exist for redundancy of:
19 |
20 | - Network components, such as switches and routers
21 |
22 | - Applications and automatic service migration
23 |
24 | - Storage components
25 |
26 | - Facility services such as power, air conditioning, and fire protection
27 |
28 | In the event that a component fails and a back-up system must take on
29 | its load, most high availability systems will replace the failed
30 | component as quickly as possible to maintain necessary redundancy. This
31 | way time spent in a degraded protection state is minimized.
32 |
33 | Most high availability systems fail in the event of multiple
34 | independent (non-consequential) failures. In this case, most
35 | implementations favor protecting data over maintaining availability.
36 |
37 | High availability systems typically achieve an uptime percentage of
38 | 99.99% or more, which roughly equates to less than an hour of
39 | cumulative downtime per year. In order to achieve this, high
40 | availability systems should keep recovery times after a failure to
41 | about one to two minutes, sometimes significantly less.
42 |
43 | OpenStack currently meets such availability requirements for its own
44 | infrastructure services, meaning that an uptime of 99.99% is feasible
45 | for the OpenStack infrastructure proper. However, OpenStack does not
46 | guarantee 99.99% availability for individual guest instances.
47 |
48 | This document discusses some common methods of implementing highly
49 | available systems, with an emphasis on the core OpenStack services and
50 | other open source services that are closely aligned with OpenStack.
51 |
52 | You will need to address high availability concerns for any applications
53 | software that you run on your OpenStack environment. The important thing is
54 | to make sure that your services are redundant and available.
55 | How you achieve that is up to you.
56 |
57 | Active/passive versus active/active
58 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
59 |
60 | Stateful services can be configured as active/passive or active/active,
61 | which are defined as follows:
62 |
63 | :term:`active/passive configuration`
64 | Maintains a redundant instance
65 | that can be brought online when the active service fails.
66 | For example, OpenStack writes to the main database
67 | while maintaining a disaster recovery database that can be brought online
68 | if the main database fails.
69 |
70 | A typical active/passive installation for a stateful service maintains
71 | a replacement resource that can be brought online when required.
72 | Requests are handled using a :term:`virtual IP address (VIP)` that
73 | facilitates returning to service with minimal reconfiguration.
74 | A separate application (such as Pacemaker or Corosync) monitors
75 | these services, bringing the backup online as necessary.
76 |
77 | :term:`active/active configuration`
78 | Each service also has a backup but manages both the main and
79 | redundant systems concurrently.
80 | This way, if there is a failure, the user is unlikely to notice.
81 | The backup system is already online and takes on increased load
82 | while the main system is fixed and brought back online.
83 |
84 | Typically, an active/active installation for a stateless service
85 | maintains a redundant instance, and requests are load balanced using
86 | a virtual IP address and a load balancer such as HAProxy.
87 |
88 | A typical active/active installation for a stateful service includes
89 | redundant services, with all instances having an identical state. In
90 | other words, updates to one instance of a database update all other
91 | instances. This way a request to one instance is the same as a
92 | request to any other. A load balancer manages the traffic to these
93 | systems, ensuring that operational systems always handle the
94 | request.
95 |
96 | Clusters and quorums
97 | ~~~~~~~~~~~~~~~~~~~~
98 |
99 | The quorum specifies the minimal number of nodes
100 | that must be functional in a cluster of redundant nodes
101 | in order for the cluster to remain functional.
102 | When one node fails and failover transfers control to other nodes,
103 | the system must ensure that data and processes remain sane.
104 | To determine this, the contents of the remaining nodes are compared
105 | and, if there are discrepancies, a majority rules algorithm is implemented.
106 |
107 | For this reason, each cluster in a high availability environment should
108 | have an odd number of nodes and the quorum is defined as more than a half
109 | of the nodes.
110 | If multiple nodes fail so that the cluster size falls below the quorum
111 | value, the cluster itself fails.
112 |
113 | For example, in a seven-node cluster, the quorum should be set to
114 | ``floor(7/2) + 1 == 4``. If quorum is four and four nodes fail simultaneously,
115 | the cluster itself would fail, whereas it would continue to function if
116 | no more than three nodes fail. If split to partitions of three and four nodes
117 | respectively, the quorum of four nodes would continue to operate the majority
118 | partition and stop or fence the minority one (depending on the
119 | no-quorum-policy cluster configuration).
120 |
121 | As a contrasting configuration example, the quorum could instead have been
122 | set to three.
123 |
124 | .. note::
125 |
126 | We do not recommend setting the quorum to a value less than ``floor(n/2) + 1``
127 |    as it would likely cause a split-brain in the face of network partitions.
128 |
129 | With a quorum of three, the cluster would also continue to function when four
130 | nodes fail simultaneously. But if split into partitions of three and four
131 | nodes respectively, a quorum of three would cause both sides to attempt to
132 | fence the other and host resources. Without fencing enabled, the cluster
133 | would go straight to running two copies of each resource.
134 |
135 | This is why setting the quorum to a value less than ``floor(n/2) + 1`` is
136 | dangerous. However, it may be required in some specific cases, such as a
137 | temporary measure when it is known with 100% certainty that the other
138 | nodes are down.
139 |
140 | When configuring an OpenStack environment for study or demonstration purposes,
141 | it is possible to turn off the quorum checking. Production systems should
142 | always run with quorum enabled.
143 |
144 | Load balancing
145 | ~~~~~~~~~~~~~~
146 |
147 | .. to do: definition and description of need within HA
148 |
--------------------------------------------------------------------------------
/doc/source/intro-ha.rst:
--------------------------------------------------------------------------------
1 | =================================
2 | Introduction to high availability
3 | =================================
4 |
5 | High availability systems seek to minimize the following issues:
6 |
7 | #. System downtime: Occurs when a user-facing service is unavailable
8 | beyond a specified maximum amount of time.
9 |
10 | #. Data loss: Accidental deletion or destruction of data.
11 |
12 | Most high availability systems guarantee protection against system downtime
13 | and data loss only in the event of a single failure.
14 | However, they are also expected to protect against cascading failures,
15 | where a single failure deteriorates into a series of consequential failures.
16 | Many service providers guarantee a :term:`Service Level Agreement (SLA)`
17 | including uptime percentage of computing service, which is calculated based
18 | on the available time and system downtime excluding planned outage time.
19 |
20 | .. toctree::
21 | :maxdepth: 2
22 |
23 | intro-ha-key-concepts.rst
24 | intro-ha-common-tech.rst
25 |
--------------------------------------------------------------------------------
/doc/source/intro-os-ha-cluster.rst:
--------------------------------------------------------------------------------
1 | ================
2 | Cluster managers
3 | ================
4 |
5 | At its core, a cluster is a distributed finite state machine capable
6 | of co-ordinating the startup and recovery of inter-related services
7 | across a set of machines.
8 |
9 | Even a distributed or replicated application that is able to survive failures
10 | on one or more machines can benefit from a cluster manager because a cluster
11 | manager has the following capabilities:
12 |
13 | #. Awareness of other applications in the stack
14 |
15 | While SYS-V init replacements like systemd can provide
16 | deterministic recovery of a complex stack of services, the
17 | recovery is limited to one machine and lacks the context of what
18 | is happening on other machines. This context is crucial to
19 | determine the difference between a local failure, and clean startup
20 | and recovery after a total site failure.
21 |
22 | #. Awareness of instances on other machines
23 |
24 | Services like RabbitMQ and Galera have complicated boot-up
25 | sequences that require co-ordination, and often serialization, of
26 | startup operations across all machines in the cluster. This is
27 | especially true after a site-wide failure or shutdown where you must
28 | first determine the last machine to be active.
29 |
30 | #. A shared implementation and calculation of `quorum
31 | `_
32 |
33 | It is very important that all members of the system share the same
34 | view of who their peers are and whether or not they are in the
35 | majority. Failure to do this leads very quickly to an internal
36 | `split-brain `_
37 | state. This is where different parts of the system are pulling in
38 | different and incompatible directions.
39 |
40 | #. Data integrity through fencing (a non-responsive process does not
41 | imply it is not doing anything)
42 |
43 | A single application does not have sufficient context to know the
44 | difference between failure of a machine and failure of the
45 |    application on a machine. The usual practice is to assume the
46 |    machine is dead and continue working; however, this is highly risky. A
47 | rogue process or machine could still be responding to requests and
48 | generally causing havoc. The safer approach is to make use of
49 | remotely accessible power switches and/or network switches and SAN
50 | controllers to fence (isolate) the machine before continuing.
51 |
52 | #. Automated recovery of failed instances
53 |
54 | While the application can still run after the failure of several
55 | instances, it may not have sufficient capacity to serve the
56 | required volume of requests. A cluster can automatically recover
57 | failed instances to prevent additional load induced failures.
58 |
59 | Pacemaker
60 | ~~~~~~~~~
61 | .. to do: description and point to ref arch example using pacemaker
62 |
63 | `Pacemaker `_.
64 |
65 | Systemd
66 | ~~~~~~~
67 | .. to do: description and point to ref arch example using Systemd and link
68 |
--------------------------------------------------------------------------------
/doc/source/intro-os-ha-memcached.rst:
--------------------------------------------------------------------------------
1 | =========
2 | Memcached
3 | =========
4 |
5 | Most OpenStack services can use Memcached to store ephemeral data such as
6 | tokens. Although Memcached does not support typical forms of redundancy such
7 | as clustering, OpenStack services can use almost any number of instances
8 | by configuring multiple hostnames or IP addresses.
9 |
10 | The Memcached client implements hashing to balance objects among the instances.
11 | Failure of an instance only impacts a percentage of the objects,
12 | and the client automatically removes it from the list of instances.
13 |
14 | Installation
15 | ~~~~~~~~~~~~
16 |
17 | To install and configure Memcached, read the
18 | `official documentation `_.
19 |
20 | Memory caching is managed by `oslo.cache
21 | `_.
22 | This ensures consistency across all projects when using multiple Memcached
23 | servers. The following is an example configuration with three hosts:
24 |
25 | .. code-block:: ini
26 |
27 |     memcache_servers = controller1:11211,controller2:11211,controller3:11211
28 |
29 | By default, ``controller1`` handles the caching service. If ``controller1``
30 | goes down, ``controller2`` or ``controller3`` takes over the service.
31 |
32 | For more information about Memcached installation, see the
33 | *Environment -> Memcached* section in the
34 | `Installation Guides `_
35 | depending on your distribution.
36 |
--------------------------------------------------------------------------------
/doc/source/intro-os-ha-state.rst:
--------------------------------------------------------------------------------
1 | ==================================
2 | Stateless versus stateful services
3 | ==================================
4 |
5 | OpenStack components can be divided into three categories:
6 |
7 | - OpenStack APIs: HTTP(S) stateless services written in Python,
8 |   easy to duplicate and mostly easy to load balance.
9 |
10 | - The SQL relational database server provides stateful services consumed by
11 |   other components. Supported databases are MySQL, MariaDB, and PostgreSQL.
12 |   Making the SQL database redundant is complex.
13 |
14 | - :term:`Advanced Message Queuing Protocol (AMQP)` provides OpenStack
15 | internal stateful communication service.
16 |
17 | .. to do: Ensure the difference between stateless and stateful services
18 | .. is clear
19 |
20 | Stateless services
21 | ~~~~~~~~~~~~~~~~~~
22 |
23 | A service that provides a response after your request and then
24 | requires no further attention. To make a stateless service highly
25 | available, you need to provide redundant instances and load balance them.
26 |
27 | Stateless OpenStack services
28 | ----------------------------
29 |
30 | OpenStack services that are stateless include ``nova-api``,
31 | ``nova-conductor``, ``glance-api``, ``keystone-api``, ``neutron-api``,
32 | and ``nova-scheduler``.
33 |
34 | Stateful services
35 | ~~~~~~~~~~~~~~~~~
36 |
37 | A service where subsequent requests to the service
38 | depend on the results of the first request.
39 | Stateful services are more difficult to manage because a single
40 | action typically involves more than one request. Providing
41 | additional instances and load balancing does not solve the problem.
42 | For example, if the horizon user interface reset itself every time
43 | you went to a new page, it would not be very useful.
44 | OpenStack services that are stateful include the OpenStack database
45 | and message queue.
46 | Making stateful services highly available can depend on whether you choose
47 | an active/passive or active/active configuration.
48 |
49 | Stateful OpenStack services
50 | ----------------------------
51 |
52 | .. to do: create list of stateful services
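53 |
54 | Based on the services discussed in this guide, the stateful components
55 | include at least:
56 |
57 | - The SQL database (MySQL, MariaDB, or PostgreSQL)
58 | - The message queue (RabbitMQ)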
53 |
--------------------------------------------------------------------------------
/doc/source/intro-os-ha.rst:
--------------------------------------------------------------------------------
1 | ================================================
2 | Introduction to high availability with OpenStack
3 | ================================================
4 |
5 | .. to do: description of section & improvement of title (intro to OS HA)
6 |
7 | .. toctree::
8 | :maxdepth: 2
9 |
10 | intro-os-ha-state.rst
11 | intro-os-ha-cluster.rst
12 | intro-os-ha-memcached.rst
13 |
--------------------------------------------------------------------------------
/doc/source/monitoring.rst:
--------------------------------------------------------------------------------
1 | ==========
2 | Monitoring
3 | ==========
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/doc/source/networking-ha-l3-agent.rst:
--------------------------------------------------------------------------------
1 | ========
2 | L3 Agent
3 | ========
4 | .. TODO: Introduce L3 agent
5 |
6 | HA Routers
7 | ~~~~~~~~~~
8 | .. TODO: content for HA routers
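9 |
10 | Neutron supports VRRP-based HA routers, where standby copies of a router are
11 | scheduled to additional L3 agents. A minimal sketch of the relevant options
12 | in :file:`/etc/neutron/neutron.conf`, assuming the defaults otherwise:
13 |
14 | .. code-block:: ini
15 |
16 |    [DEFAULT]
17 |    l3_ha = True
18 |    max_l3_agents_per_router = 3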
9 |
10 | Networking DHCP agent
11 | ~~~~~~~~~~~~~~~~~~~~~
12 | The OpenStack Networking (neutron) service has a scheduler that lets you run
13 | multiple agents across nodes. The DHCP agent can be natively highly available.
14 |
15 | To configure the number of DHCP agents per network, modify the
16 | ``dhcp_agents_per_network`` parameter in the :file:`/etc/neutron/neutron.conf`
17 | file. By default this is set to 1. To achieve high availability, assign more
18 | than one DHCP agent per network. For more information, see
19 | `High-availability for DHCP
20 | `_.
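21 |
22 | A minimal sketch of that setting in :file:`/etc/neutron/neutron.conf`,
23 | assuming three network nodes:
24 |
25 | .. code-block:: ini
26 |
27 |    [DEFAULT]
28 |    dhcp_agents_per_network = 3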
21 |
--------------------------------------------------------------------------------
/doc/source/networking-ha-neutron-l3-analysis.rst:
--------------------------------------------------------------------------------
1 | ==========
2 | Neutron L3
3 | ==========
4 |
5 | .. TODO: create and import Neutron L3 analysis
6 | Introduce the Networking (neutron) service L3 agent
7 |
--------------------------------------------------------------------------------
/doc/source/networking-ha-neutron-server.rst:
--------------------------------------------------------------------------------
1 | =========================
2 | Neutron Networking server
3 | =========================
4 |
5 | .. TODO: Create content similar to other API sections
6 |
--------------------------------------------------------------------------------
/doc/source/networking-ha.rst:
--------------------------------------------------------------------------------
1 | ===================================
2 | Configuring the networking services
3 | ===================================
4 |
5 | Configure networking on each node. See the basic information about
6 | configuring networking in the Networking service section of the
7 | `Install Guides `_,
8 | depending on your distribution.
9 |
10 | OpenStack network nodes contain:
11 |
12 | - Networking DHCP agent
13 | - Neutron L3 agent
14 | - Networking L2 agent
15 |
16 | .. note::
17 |
18 | The L2 agent cannot be distributed and highly available. Instead, it
19 | must be installed on each data forwarding node to control the virtual
20 | network driver such as Open vSwitch or Linux Bridge. One L2 agent runs
21 | per node and controls its virtual interfaces.
22 |
23 | .. toctree::
24 | :maxdepth: 2
25 |
26 | networking-ha-neutron-server.rst
27 | networking-ha-neutron-l3-analysis.rst
28 | networking-ha-l3-agent.rst
29 |
30 |
--------------------------------------------------------------------------------
/doc/source/overview.rst:
--------------------------------------------------------------------------------
1 | ========
2 | Overview
3 | ========
4 |
5 | This guide can be split into two parts:
6 |
7 | #. High level architecture
8 | #. Reference architecture examples, monitoring, and testing
9 |
10 | .. warning::
11 | We recommend using this guide for assistance when considering your HA cloud.
12 | We do not recommend using this guide for manually building your HA cloud.
13 | We recommend starting with a pre-validated solution and adjusting to your
14 | needs.
15 |
16 | High availability is not for every user, and it presents some challenges.
17 | It may be too complex for databases or systems with large amounts of data,
18 | and replication can slow large systems down. Different setups have different
19 | prerequisites, so read the guidelines for each setup.
21 |
22 | .. important::
23 |
24 |    High availability is not enabled by default in OpenStack setups.
25 |
--------------------------------------------------------------------------------
/doc/source/ref-arch-examples.rst:
--------------------------------------------------------------------------------
1 | ======================
2 | Reference Architecture
3 | ======================
4 |
--------------------------------------------------------------------------------
/doc/source/storage-ha-backend.rst:
--------------------------------------------------------------------------------
1 |
2 | .. _storage-ha-backend:
3 |
4 | ================
5 | Storage back end
6 | ================
7 |
8 | An OpenStack environment includes multiple data pools for the VMs:
9 |
10 | - Ephemeral storage is allocated for an instance and is deleted when the
11 | instance is deleted. The Compute service manages ephemeral storage and
12 | by default, Compute stores ephemeral drives as files on local disks on the
13 | compute node. As an alternative, you can use Ceph RBD as the storage back
14 | end for ephemeral storage.
15 |
16 | - Persistent storage exists outside all instances. Two types of persistent
17 | storage are provided:
18 |
19 | - The Block Storage service (cinder) that can use LVM or Ceph RBD as the
20 | storage back end.
21 | - The Image service (glance) that can use the Object Storage service (swift)
22 | or Ceph RBD as the storage back end.
23 |
24 | For more information about configuring storage back ends for
25 | the different storage options, see `Manage volumes
26 | `_
27 | in the OpenStack Administrator Guide.
28 |
29 | This section discusses ways to protect against data loss in your OpenStack
30 | environment.
31 |
32 | RAID drives
33 | -----------
34 |
35 | Configuring RAID on the hard drives that implement storage protects your data
36 | against a hard drive failure. If the node itself fails, data may be lost.
37 | In particular, all volumes stored on an LVM node can be lost.
38 |
39 | Ceph
40 | ----
41 |
42 | `Ceph RBD `_ is an innately highly available storage back
43 | end. It creates a storage cluster with multiple nodes that communicate with
44 | each other to replicate and redistribute data dynamically.
45 | A Ceph RBD storage cluster provides a single shared set of storage nodes that
46 | can handle all classes of persistent and ephemeral data (glance, cinder, and
47 | nova) that are required for OpenStack instances.
48 |
49 | Ceph RBD provides object replication capabilities by storing Block Storage
50 | volumes as Ceph RBD objects. Ceph RBD ensures that each replica of an object
51 | is stored on a different node. This means that your volumes are protected
52 | against hard drive and node failures, or even the failure of the data center
53 | itself.
54 |
55 | When Ceph RBD is used for ephemeral volumes as well as block and image storage,
56 | it supports `live migration
57 | `_
58 | of VMs with ephemeral drives. LVM only supports live migration of
59 | volume-backed VMs.
60 |
--------------------------------------------------------------------------------
/doc/source/storage-ha-block.rst:
--------------------------------------------------------------------------------
1 | ==================================
2 | Highly available Block Storage API
3 | ==================================
4 |
5 | Cinder provides Block-Storage-as-a-Service suitable for performance
6 | sensitive scenarios such as databases, expandable file systems, or
7 | providing a server with access to raw block level storage.
8 |
9 | Persistent block storage can survive instance termination and can also
10 | be moved across instances like any external storage device. Cinder
11 | also has volume snapshots capability for backing up the volumes.
12 |
13 | Making the Block Storage API service highly available in
14 | active/passive mode involves:
15 |
16 | - :ref:`ha-blockstorage-pacemaker`
17 | - :ref:`ha-blockstorage-configure`
18 | - :ref:`ha-blockstorage-services`
19 |
20 | In theory, you can run the Block Storage service as active/active.
21 | However, because of a number of outstanding concerns, we recommend running
22 | the volume component as active/passive only.
23 |
24 | You can read more about these concerns on the
25 | `Red Hat Bugzilla `_
26 | and there is a
27 | `pseudo roadmap `_
28 | for addressing them upstream.
29 |
30 | .. _ha-blockstorage-pacemaker:
31 |
32 | Add Block Storage API resource to Pacemaker
33 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
34 |
35 | On RHEL-based systems, create resources for cinder's systemd agents and create
36 | constraints to enforce startup/shutdown ordering:
37 |
38 | .. code-block:: console
39 |
40 | pcs resource create openstack-cinder-api systemd:openstack-cinder-api --clone interleave=true
41 | pcs resource create openstack-cinder-scheduler systemd:openstack-cinder-scheduler --clone interleave=true
42 | pcs resource create openstack-cinder-volume systemd:openstack-cinder-volume
43 |
44 | pcs constraint order start openstack-cinder-api-clone then openstack-cinder-scheduler-clone
45 | pcs constraint colocation add openstack-cinder-scheduler-clone with openstack-cinder-api-clone
46 | pcs constraint order start openstack-cinder-scheduler-clone then openstack-cinder-volume
47 | pcs constraint colocation add openstack-cinder-volume with openstack-cinder-scheduler-clone
48 |
49 |
50 | If the Block Storage service runs on the same nodes as the other services,
51 | then it is advisable to also include:
52 |
53 | .. code-block:: console
54 |
55 | pcs constraint order start openstack-keystone-clone then openstack-cinder-api-clone
56 |
57 | Alternatively, instead of using systemd agents, download and
58 | install the OCF resource agent:
59 |
60 | .. code-block:: console
61 |
62 | # cd /usr/lib/ocf/resource.d/openstack
63 | # wget https://opendev.org/x/openstack-resource-agents/raw/branch/master/ocf/cinder-api
64 | # chmod a+rx *
65 |
66 | You can now add the Pacemaker configuration for Block Storage API resource.
67 | Connect to the Pacemaker cluster with the :command:`crm configure` command
68 | and add the following cluster resources:
69 |
70 | .. code-block:: none
71 |
72 | primitive p_cinder-api ocf:openstack:cinder-api \
73 | params config="/etc/cinder/cinder.conf" \
74 | os_password="secretsecret" \
75 | os_username="admin" \
76 | os_tenant_name="admin" \
77 | keystone_get_token_url="http://10.0.0.11:5000/v2.0/tokens" \
78 | op monitor interval="30s" timeout="30s"
79 |
80 | This configuration creates ``p_cinder-api``, a resource for managing the
81 | Block Storage API service.
82 |
83 | The command :command:`crm configure` supports batch input, copy and paste the
84 | lines above into your live Pacemaker configuration and then make changes as
85 | required. For example, you may enter ``edit p_ip_cinder-api`` from the
86 | :command:`crm configure` menu and edit the resource to match your preferred
87 | virtual IP address.
88 |
89 | Once completed, commit your configuration changes by entering :command:`commit`
90 | from the :command:`crm configure` menu. Pacemaker then starts the Block Storage
91 | API service and its dependent resources on one of your nodes.
92 |
93 | .. _ha-blockstorage-configure:
94 |
95 | Configure Block Storage API service
96 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
97 |
98 | Edit the ``/etc/cinder/cinder.conf`` file. For example, on a RHEL-based system:
99 |
100 | .. code-block:: ini
101 | :linenos:
102 |
103 | [DEFAULT]
104 | # This is the name which we should advertise ourselves as and for
105 | # A/P installations it should be the same everywhere
106 | host = cinder-cluster-1
107 |
108 | # Listen on the Block Storage VIP
109 | osapi_volume_listen = 10.0.0.11
110 |
111 | auth_strategy = keystone
112 | control_exchange = cinder
113 |
114 | volume_driver = cinder.volume.drivers.nfs.NfsDriver
115 | nfs_shares_config = /etc/cinder/nfs_exports
116 | nfs_sparsed_volumes = true
117 | nfs_mount_options = v3
118 |
119 | [database]
120 | connection = mysql+pymysql://cinder:CINDER_DBPASS@10.0.0.11/cinder
121 | max_retries = -1
122 |
123 | [keystone_authtoken]
124 | # 10.0.0.11 is the Keystone VIP
125 | identity_uri = http://10.0.0.11:35357/
126 | www_authenticate_uri = http://10.0.0.11:5000/
127 | admin_tenant_name = service
128 | admin_user = cinder
129 | admin_password = CINDER_PASS
130 |
131 | [oslo_messaging_rabbit]
132 | # Explicitly list the rabbit hosts as it doesn't play well with HAProxy
133 | rabbit_hosts = 10.0.0.12,10.0.0.13,10.0.0.14
134 | # As a consequence, we also need HA queues
135 | rabbit_ha_queues = True
136 | heartbeat_timeout_threshold = 60
137 | heartbeat_rate = 2
138 |
139 | Replace ``CINDER_DBPASS`` with the password you chose for the Block Storage
140 | database. Replace ``CINDER_PASS`` with the password you chose for the
141 | ``cinder`` user in the Identity service.
142 |
143 | This example assumes that you are using NFS for the physical storage, which
144 | will almost never be true in a production installation.
145 |
146 | If you are using the Block Storage service OCF agent, some settings will
147 | be filled in for you, resulting in a shorter configuration file:
148 |
149 | .. code-block:: ini
150 | :linenos:
151 |
152 | # We have to use MySQL connection to store data:
153 | connection = mysql+pymysql://cinder:CINDER_DBPASS@10.0.0.11/cinder
154 | # Alternatively, you can switch to pymysql,
155 | # a new Python 3 compatible library and use
156 | # sql_connection = mysql+pymysql://cinder:CINDER_DBPASS@10.0.0.11/cinder
157 | # and be ready when everything moves to Python 3.
158 | # Ref: https://wiki.openstack.org/wiki/PyMySQL_evaluation
159 |
160 | # We bind Block Storage API to the VIP:
161 | osapi_volume_listen = 10.0.0.11
162 |
163 | # We send notifications to High Available RabbitMQ:
164 | notifier_strategy = rabbit
165 | rabbit_host = 10.0.0.11
166 |
167 | Replace ``CINDER_DBPASS`` with the password you chose for the Block Storage
168 | database.
169 |
170 | .. _ha-blockstorage-services:
171 |
172 | Configure OpenStack services to use the highly available Block Storage API
173 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
174 |
175 | Your OpenStack services must now point their Block Storage API configuration
176 | to the highly available, virtual cluster IP address rather than a Block Storage
177 | API server’s physical IP address as you would for a non-HA environment.
178 |
179 | Create the Block Storage API endpoint with this IP.
180 |
181 | If you are using both private and public IP addresses, create two virtual IPs
182 | and define your endpoint. For example:
183 |
184 | .. code-block:: console
185 |
186 | $ openstack endpoint create --region $KEYSTONE_REGION \
187 | volumev2 public http://PUBLIC_VIP:8776/v2/%\(project_id\)s
188 | $ openstack endpoint create --region $KEYSTONE_REGION \
189 | volumev2 admin http://10.0.0.11:8776/v2/%\(project_id\)s
190 | $ openstack endpoint create --region $KEYSTONE_REGION \
191 | volumev2 internal http://10.0.0.11:8776/v2/%\(project_id\)s
192 |
193 | Use Cinder volume backup and restore service
194 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
195 |
196 | Cinder provides a feature to backup and restore volumes and snapshots.
197 | The first backup of a volume must be handled as a full backup.
198 | Subsequent backups may be either full or incremental backups from the
199 | last full backup. See also `the Cinder Block Storage Administration
200 | Guide's section on backing up and restoring volumes and snapshots
201 | `_.
202 |
--------------------------------------------------------------------------------
/doc/source/storage-ha-file-systems.rst:
--------------------------------------------------------------------------------
1 | ========================================
2 | Highly available Shared File Systems API
3 | ========================================
4 |
5 | Making the Shared File Systems (manila) API service highly available
6 | in active/passive mode involves:
7 |
8 | - :ref:`ha-sharedfilesystems-configure`
9 | - :ref:`ha-sharedfilesystems-services`
10 | - :ref:`ha-sharedfilesystems-pacemaker`
11 |
12 | .. _ha-sharedfilesystems-configure:
13 |
14 | Configure Shared File Systems API service
15 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
16 |
17 | Edit the :file:`/etc/manila/manila.conf` file:
18 |
19 | .. code-block:: ini
20 | :linenos:
21 |
22 | # We have to use MySQL connection to store data:
23 | sql_connection = mysql+pymysql://manila:password@10.0.0.11/manila?charset=utf8
24 |
25 | # We bind Shared File Systems API to the VIP:
26 |     osapi_share_listen = 10.0.0.11
27 |
28 | # We send notifications to High Available RabbitMQ:
29 | notifier_strategy = rabbit
30 | rabbit_host = 10.0.0.11
31 |
32 |
33 | .. _ha-sharedfilesystems-services:
34 |
35 | Configure OpenStack services to use Shared File Systems API
36 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
37 |
38 | Your OpenStack services must now point their Shared File Systems API
39 | configuration to the highly available, virtual cluster IP address rather than
40 | a Shared File Systems API server’s physical IP address as you would
41 | for a non-HA environment.
42 |
43 | You must create the Shared File Systems API endpoint with this IP.
44 |
45 | If you are using both private and public IP addresses, you should create two
46 | virtual IPs and define your endpoints like this:
47 |
48 | .. code-block:: console
49 |
50 | $ openstack endpoint create --region RegionOne \
51 | sharev2 public 'http://PUBLIC_VIP:8786/v2/%(tenant_id)s'
52 |
53 | $ openstack endpoint create --region RegionOne \
54 | sharev2 internal 'http://10.0.0.11:8786/v2/%(tenant_id)s'
55 |
56 | $ openstack endpoint create --region RegionOne \
57 | sharev2 admin 'http://10.0.0.11:8786/v2/%(tenant_id)s'
58 |
59 | .. _ha-sharedfilesystems-pacemaker:
60 |
61 | Add Shared File Systems API resource to Pacemaker
62 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
63 |
64 | #. Download the resource agent to your system:
65 |
66 | .. code-block:: console
67 |
68 | # cd /usr/lib/ocf/resource.d/openstack
69 | # wget https://opendev.org/x/openstack-resource-agents/raw/branch/master/ocf/manila-api
70 | # chmod a+rx *
71 |
72 | #. Add the Pacemaker configuration for the Shared File Systems
73 | API resource. Connect to the Pacemaker cluster with the following
74 | command:
75 |
76 | .. code-block:: console
77 |
78 | # crm configure
79 |
80 | .. note::
81 |
82 | The :command:`crm configure` supports batch input. Copy and paste
83 | the lines in the next step into your live Pacemaker configuration and then
84 | make changes as required.
85 |
86 | For example, you may enter ``edit p_ip_manila-api`` from the
87 | :command:`crm configure` menu and edit the resource to match your preferred
88 | virtual IP address.
89 |
90 | #. Add the following cluster resources:
91 |
92 | .. code-block:: none
93 |
94 | primitive p_manila-api ocf:openstack:manila-api \
95 | params config="/etc/manila/manila.conf" \
96 | os_password="secretsecret" \
97 | os_username="admin" \
98 | os_tenant_name="admin" \
99 | keystone_get_token_url="http://10.0.0.11:5000/v2.0/tokens" \
100 | op monitor interval="30s" timeout="30s"
101 |
102 | This configuration creates ``p_manila-api``, a resource for managing the
103 | Shared File Systems API service.
104 |
105 | #. Commit your configuration changes by entering the following command
106 | from the :command:`crm configure` menu:
107 |
108 | .. code-block:: console
109 |
110 | # commit
111 |
112 | Pacemaker now starts the Shared File Systems API service and its
113 | dependent resources on one of your nodes.
114 |
115 |
--------------------------------------------------------------------------------
/doc/source/storage-ha-image.rst:
--------------------------------------------------------------------------------
1 | ==========================
2 | Highly available Image API
3 | ==========================
4 |
5 | The OpenStack Image service offers a service for discovering, registering, and
6 | retrieving virtual machine images. To make the OpenStack Image API service
7 | highly available in active/passive mode, you must:
8 |
9 | - :ref:`glance-api-pacemaker`
10 | - :ref:`glance-api-configure`
11 | - :ref:`glance-services`
12 |
13 | Prerequisites
14 | ~~~~~~~~~~~~~
15 |
16 | Before beginning, ensure that you are familiar with the
17 | documentation for installing the OpenStack Image API service.
18 | See the *Image service* section in the
19 | `Installation Guides `_,
20 | depending on your distribution.
21 |
22 | .. _glance-api-pacemaker:
23 |
24 | Add OpenStack Image API resource to Pacemaker
25 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
26 |
27 | #. Download the resource agent to your system:
28 |
29 | .. code-block:: console
30 |
31 | # cd /usr/lib/ocf/resource.d/openstack
32 | # wget https://opendev.org/x/openstack-resource-agents/raw/branch/master/ocf/glance-api
33 | # chmod a+rx *
34 |
35 | #. Add the Pacemaker configuration for the OpenStack Image API resource.
36 | Use the following command to connect to the Pacemaker cluster:
37 |
38 | .. code-block:: console
39 |
40 | # crm configure
41 |
42 | .. note::
43 |
44 | The :command:`crm configure` command supports batch input. Copy and paste
45 | the lines in the next step into your live Pacemaker configuration and
46 | then make changes as required.
47 |
48 | For example, you may enter ``edit p_ip_glance-api`` from the
49 | :command:`crm configure` menu and edit the resource to match your preferred
50 | virtual IP address (a sketch of such a VIP resource follows this procedure).
51 |
52 | #. Add the following cluster resources:
53 |
54 | .. code-block:: none
55 |
56 | primitive p_glance-api ocf:openstack:glance-api \
57 | params config="/etc/glance/glance-api.conf" \
58 | os_password="secretsecret" \
59 | os_username="admin" os_tenant_name="admin" \
60 | os_auth_url="http://10.0.0.11:5000/v2.0/" \
61 | op monitor interval="30s" timeout="30s"
62 |
63 | This configuration creates ``p_glance-api``, a resource for managing the
64 | OpenStack Image API service.
65 |
66 | #. Commit your configuration changes by entering the following command from
67 | the :command:`crm configure` menu:
68 |
69 | .. code-block:: console
70 |
71 | # commit
72 |
73 | Pacemaker then starts the OpenStack Image API service and its dependent
74 | resources on one of your nodes.
75 |
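76 | The ``p_ip_glance-api`` resource referenced in the note above is not defined
77 | in this guide. The following is a minimal sketch of what such a virtual IP
78 | resource might look like, grouped with the API resource so that both always
79 | run on the same node; the IP address and netmask are assumptions that you
80 | must adapt to your environment:
81 |
82 | .. code-block:: none
83 |
84 |    primitive p_ip_glance-api ocf:heartbeat:IPaddr2 \
85 |      params ip="10.0.0.11" cidr_netmask="24" \
86 |      op monitor interval="30s"
87 |    group g_glance p_ip_glance-api p_glance-api
88 |
89 | Because a group starts its members in order and keeps them on the same node,
90 | it replaces separate colocation and ordering constraints in this simple case.
91 |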
76 | .. _glance-api-configure:
77 |
78 | Configure OpenStack Image service API
79 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
80 |
81 | Edit the :file:`/etc/glance/glance-api.conf` file
82 | to configure the OpenStack Image service:
83 |
84 | .. code-block:: ini
85 |
86 | # We use a MySQL database connection to store data:
87 | sql_connection=mysql://glance:password@10.0.0.11/glance
88 | # Alternatively, you can switch to pymysql, a Python 3 compatible
89 | # library, and use
90 | # sql_connection=mysql+pymysql://glance:password@10.0.0.11/glance
91 | # to be ready when everything moves to Python 3.
92 | # Ref: https://wiki.openstack.org/wiki/PyMySQL_evaluation
93 |
94 | # We bind the OpenStack Image API to the VIP:
95 | bind_host = 10.0.0.11
96 |
97 | # Connect to the OpenStack Image registry service:
98 | registry_host = 10.0.0.11
99 |
100 | # We send notifications to the highly available RabbitMQ:
101 | notifier_strategy = rabbit
102 | rabbit_host = 10.0.0.11
103 |
104 | [TODO: need more discussion of these parameters]
105 |
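106 | After changing the configuration file on each node, restart the service
107 | through Pacemaker rather than through the init system. A minimal sketch,
108 | assuming the ``p_glance-api`` resource created earlier in this chapter:
109 |
110 | .. code-block:: console
111 |
112 |    # crm resource restart p_glance-api
113 |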
106 | .. _glance-services:
107 |
108 | Configure OpenStack services to use the highly available OpenStack Image API
109 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
110 |
111 | Your OpenStack services must now point their OpenStack Image API configuration
112 | to the highly available, virtual cluster IP address instead of pointing to the
113 | physical IP address of an OpenStack Image API server as you would in a non-HA
114 | cluster.
115 |
116 | For example, if your OpenStack Image API service IP address is 10.0.0.11
117 | (as in the configuration explained here), you would use the following
118 | configuration in your :file:`nova.conf` file:
119 |
120 | .. code-block:: ini
121 |
122 | [glance]
123 | # ...
124 | api_servers = 10.0.0.11
125 | # ...
126 |
127 |
128 | You must also create the OpenStack Image API endpoint with this IP address.
129 | If you are using both private and public IP addresses, create two virtual IP
130 | addresses and define your endpoints. For example:
131 |
132 | .. code-block:: console
133 |
134 | $ openstack endpoint create --region $KEYSTONE_REGION \
135 | image public http://PUBLIC_VIP:9292
136 |
137 | $ openstack endpoint create --region $KEYSTONE_REGION \
138 | image admin http://10.0.0.11:9292
139 |
140 | $ openstack endpoint create --region $KEYSTONE_REGION \
141 | image internal http://10.0.0.11:9292
142 |
--------------------------------------------------------------------------------
/doc/source/storage-ha.rst:
--------------------------------------------------------------------------------
1 | ===================
2 | Configuring storage
3 | ===================
4 |
5 | .. toctree::
6 | :maxdepth: 2
7 |
8 | storage-ha-image.rst
9 | storage-ha-block.rst
10 | storage-ha-file-systems.rst
11 | storage-ha-backend.rst
12 |
13 | Making the Block Storage (cinder) API service highly available in
14 | active/active mode involves:
15 |
16 | * Configuring Block Storage to listen on the VIP address, as sketched below
17 |
18 | * Managing the Block Storage API daemon with the Pacemaker cluster manager
19 |
20 | * Configuring OpenStack services to use this IP address
21 |
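22 | As an illustration of the first item, a minimal sketch of the relevant
23 | ``cinder.conf`` setting, assuming the ``10.0.0.11`` VIP address used
24 | elsewhere in this guide:
25 |
26 | .. code-block:: ini
27 |
28 |    [DEFAULT]
29 |    # Bind the Block Storage API to the virtual IP address:
30 |    osapi_volume_listen = 10.0.0.11
31 |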
22 | .. To Do: HA without Pacemaker
23 |
--------------------------------------------------------------------------------
/doc/source/testing.rst:
--------------------------------------------------------------------------------
1 | =======
2 | Testing
3 | =======
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | name = openstackhaguide
3 | summary = OpenStack High Availability Guide
4 | author = OpenStack
5 | author_email = openstack-discuss@lists.openstack.org
6 | home_page = https://docs.openstack.org/ha-guide/
7 | classifier =
8 | Environment :: OpenStack
9 | Intended Audience :: Information Technology
10 | Intended Audience :: System Administrators
11 | License :: OSI Approved :: Apache Software License
12 | Operating System :: POSIX :: Linux
13 | Topic :: Documentation
14 |
--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
1 | [tox]
2 | minversion = 4.4
3 | skipsdist = True
4 | envlist = docs
5 |
6 | [testenv:docs]
7 | deps =
8 | -r{toxinidir}/doc/requirements.txt
9 | commands =
10 | doc8 doc/source -e txt -e rst
11 | sphinx-build -E -W -b html doc/source doc/build/html
12 |
13 | [testenv:pdf-docs]
14 | deps = {[testenv:docs]deps}
15 | allowlist_externals =
16 | make
17 | commands =
18 | sphinx-build -j auto -W -b latex doc/source doc/build/pdf
19 | make -C doc/build/pdf
20 |
21 | [doc8]
22 | # Settings for doc8:
23 | # Ignore target directories and autogenerated files
24 | ignore-path = doc/*/target,doc/*/build*
25 | # File extensions to use
26 | extensions = .rst,.txt
27 | # Maximum line length should be 79, but some existing lines are longer.
28 | # Avoid adding more overlong lines.
29 | max-line-length = 79
30 | # Disable some doc8 checks:
31 | # D000: Check RST validity (cannot handle the "linenos" directive)
32 | ignore = D000
33 |
--------------------------------------------------------------------------------