├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── ansible.cfg ├── config.yml ├── doc ├── README.rst └── generate_autodoc_index.sh ├── drill.sh ├── files └── get-random-osd.py ├── gremlin.png ├── inventory ├── group_vars │ └── all ├── hosts └── structure ├── playbooks ├── case.yml ├── cases │ ├── compute │ │ └── 001.yml │ ├── control │ │ └── 001.yml │ ├── network │ │ └── 001.yml │ ├── storage │ │ └── 001.yml │ └── system │ │ └── 001.yml ├── common │ ├── ask.yml │ ├── create_auth.yml │ ├── next.yml │ ├── port.yml │ ├── remove_auth.yml │ └── service.yml ├── compute │ ├── service.yml │ └── system.yml ├── control │ ├── cinder │ │ └── service.yml │ ├── drill_api.yml │ ├── drill_db.yml │ ├── drill_hy.yml │ ├── drill_lb.yml │ ├── drill_mc.yml │ ├── drill_mq.yml │ ├── glance │ │ └── service.yml │ ├── haproxy │ │ └── service.yml │ ├── keystone │ │ └── service.yml │ ├── memcache │ │ └── service.yml │ ├── mysql │ │ ├── service.yml │ │ ├── stress.yml │ │ └── system.yml │ ├── neutron │ │ └── service.yml │ ├── nova │ │ └── service.yml │ └── rabbitmq │ │ ├── service.yml │ │ ├── stress.yml │ │ └── system.yml ├── drill.yml ├── drill_compute.yml ├── drill_control.yml ├── drill_network.yml ├── drill_storage.yml ├── network │ ├── service.yml │ └── system.yml ├── storage │ ├── drill_mon.yml │ ├── drill_osd.yml │ ├── drill_rgw.yml │ ├── mon │ │ ├── damage_mon.yml │ │ └── kill_mon.yml │ ├── osd │ │ ├── del_osd_partition.yml │ │ └── kill_osd.yml │ └── rgw │ │ └── kill_rgw.yml └── system │ ├── base.yml │ ├── cpu_load.yml │ ├── disk_load.yml │ ├── mem_load.yml │ ├── nic.yml │ ├── nic_delay.yml │ ├── nic_down.yml │ └── nic_loss.yml └── roles ├── common ├── defaults │ └── main.yml └── tasks │ ├── port_add.yml │ ├── port_del.yml │ ├── start_service.yml │ └── stop_service.yml ├── compute ├── README.md ├── defaults │ └── main.yml ├── meta │ └── main.yml └── service │ └── main.yml ├── control ├── README.md ├── defaults │ └── main.yml ├── files │ └── stress_mq.py ├── meta │ └── main.yml 
└── tasks │ ├── purge_queue.yml │ ├── stress_db.yml │ └── stress_mq.yml ├── network ├── README.md ├── defaults │ └── main.yml ├── meta │ └── main.yml └── service │ └── tasks │ └── main.yml ├── provision ├── README.md ├── defaults │ └── main.yml ├── local │ └── tasks │ │ └── main.yml ├── meta │ └── main.yml ├── os_auth │ ├── defaults │ │ └── main.yml │ └── tasks │ │ ├── create_auth.yml │ │ └── remove_auth.yml ├── os_stack │ ├── defaults │ │ └── main.yml │ └── tasks │ │ ├── create_stack.yml │ │ └── remove_stack.yml ├── teardown │ ├── meta │ │ └── main.yml │ └── tasks │ │ └── main.yml └── user │ ├── meta │ └── main.yml │ └── tasks │ └── main.yml ├── storage ├── README.md ├── defaults │ └── main.yml ├── meta │ └── main.yml └── tasks │ ├── damage_mon.yml │ ├── del_osd_partition.yml │ ├── kill_mon.yml │ ├── kill_osd.yml │ ├── kill_rgw.yml │ ├── recover_damage_mon.yml │ ├── recover_osd_partition.yml │ ├── start_mon.yml │ ├── start_osd.yml │ ├── start_rgw.yml │ └── stop_mon.yml └── system ├── defaults └── main.yml ├── meta └── main.yml └── tasks ├── clear_tc.yml ├── cpu_load.yml ├── disk_load.yml ├── mem_load.yml ├── nic_delay.yml ├── nic_down.yml ├── nic_down_async.yml └── nic_loss.yml /.gitignore: -------------------------------------------------------------------------------- 1 | .gremlin 2 | inventory/hosts 3 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | --- 2 | language: python 3 | python: "2.7" 4 | 5 | install: 6 | - sudo pip install ansible 7 | 8 | script: 9 | - ./drill.sh -p playbooks/drill.yml -i inventory/structure -t all --syntax-check 10 | 11 | notifications: 12 | email: false 13 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | 
http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Gremlin 2 | 3 | [![Build Status](https://travis-ci.org/unitedstack/gremlin.svg?branch=master)](https://travis-ci.org/unitedstack/gremlin) 4 | 5 | OpenStack reliability verification and fault drill system. 6 | 7 | ![](./gremlin.png) 8 | 9 | ## Background 10 | 11 | IaaS is the cornerstone of building IT systems, and the stability and reliability of 12 | the IaaS system are critical for customers. But how do we evaluate its stability and 13 | reliability after an IaaS system has been deployed? This needs to be actually 14 | VERIFIED. And how do we quickly locate the fault when the system fails? 15 | This needs an actual FAULT DRILL. When our customers know when, how and why 16 | the system will fail, and know how to handle this situation, it will be 17 | very helpful to grow their confidence in their system. 
18 | 19 | So, we designed the OpenStack Reliability Verification and Fault Drill program, 20 | it will do reliability verification from multiple dimensions of cloud platform, 21 | and will introduce man-made failures by using some operation tools, thus we can 22 | carry on fault drill along with monitoring system and logging system. 23 | 24 | ## Principle 25 | 26 | The program should follow the principles below: 27 | 28 | 1. All faults introduced should be alerted by monitoring system. 29 | 2. All faults introduced can do fallback. 30 | 3. All faults introduced should do cleanup when a fault drill is done. 31 | 32 | 33 | ## Design 34 | 35 | To cover more fault drill cases, the design will be formed from two aspects: 36 | 37 | 1. Horizontally, from node role, such as controller, network, compute, storage 38 | 2. Vertically, from system level, service level, and physical level 39 | 40 | A broad set of fault drill cases can be designed by combining these two dimensions. 41 | The test cases of system level and service level can be automated, but part of physical 42 | level test cases should be operated by a human. 43 | 44 | 45 | ## Usage 46 | 47 | There are two modes when running gremlin: 48 | 49 | * auto: All test cases will run automatically. It will introduce a fault and recover 50 | this fault automatically. The default mode is auto. 51 | * manual: Will run in interactive mode; when every test case is done, it will prompt 52 | to ask whether to execute the next one. And after introducing a fault, it will 53 | ask whether to recover this fault automatically. 54 | 55 | Before running gremlin, ensure the host running gremlin can ssh to the target hosts 56 | without password. 57 | 58 | Now, follow the steps below to get started: 59 | 60 | 1. Get the code 61 | 62 | ``` 63 | git clone https://github.com/unitedstack/gremlin.git 64 | ``` 65 | 66 | 2. Install dependencies 67 | 68 | ``` 69 | ./drill.sh --install-deps 70 | ``` 71 | 72 | 3. 
Define your inventory 73 | 74 | You should define your inventory according your environments. Modify the 75 | inventory/hosts file. 76 | 77 | 4. Define your configuration 78 | 79 | Edit the config.yml to fit your environments. 80 | 81 | 5. Run your test cases 82 | 83 | 5.1 Run all test cases automatically: 84 | 85 | ``` 86 | ./drill.sh -t all 87 | ``` 88 | 89 | 5.2 Run all test cases manually: 90 | 91 | ``` 92 | ./drill.sh -t all --mode manual 93 | ``` 94 | 95 | 5.3 Run some specified test cases manually 96 | 97 | ``` 98 | ./drill.sh -t mon-pre,mon-down --mode manual 99 | ``` 100 | 101 | 6. To get more help info 102 | 103 | ``` 104 | ./drill.sh -h 105 | ``` 106 | 107 | ## More 108 | 109 | * Documentation: https://docs.openstack.org/gremlin 110 | -------------------------------------------------------------------------------- /ansible.cfg: -------------------------------------------------------------------------------- 1 | # config file for ansible -- https://ansible.com/ 2 | # =============================================== 3 | 4 | # nearly all parameters can be overridden in ansible-playbook 5 | # or with command line flags. ansible will read ANSIBLE_CONFIG, 6 | # ansible.cfg in the current working directory, .ansible.cfg in 7 | # the home directory or /etc/ansible/ansible.cfg, whichever it 8 | # finds first 9 | 10 | [defaults] 11 | 12 | # some basic default values... 13 | 14 | inventory = inventory/ 15 | #library = /usr/share/my_modules/ 16 | #module_utils = /usr/share/my_module_utils/ 17 | #remote_tmp = ~/.ansible/tmp 18 | #local_tmp = ~/.ansible/tmp 19 | #forks = 5 20 | #poll_interval = 15 21 | #sudo_user = root 22 | #ask_sudo_pass = True 23 | #ask_pass = True 24 | #transport = smart 25 | #remote_port = 22 26 | #module_lang = C 27 | #module_set_locale = False 28 | 29 | # plays will gather facts by default, which contain information about 30 | # the remote system. 
31 | # 32 | # smart - gather by default, but don't regather if already gathered 33 | # implicit - gather by default, turn off with gather_facts: False 34 | # explicit - do not gather by default, must say gather_facts: True 35 | gathering = smart 36 | 37 | # This only affects the gathering done by a play's gather_facts directive, 38 | # by default gathering retrieves all facts subsets 39 | # all - gather all subsets 40 | # network - gather min and network facts 41 | # hardware - gather hardware facts (longest facts to retrieve) 42 | # virtual - gather min and virtual facts 43 | # facter - import facts from facter 44 | # ohai - import facts from ohai 45 | # You can combine them using comma (ex: network,virtual) 46 | # You can negate them using ! (ex: !hardware,!facter,!ohai) 47 | # A minimal set of facts is always gathered. 48 | #gather_subset = all 49 | 50 | # some hardware related facts are collected 51 | # with a maximum timeout of 10 seconds. This 52 | # option lets you increase or decrease that 53 | # timeout to something more suitable for the 54 | # environment. 55 | # gather_timeout = 10 56 | 57 | # additional paths to search for roles in, colon separated 58 | roles_path = roles 59 | 60 | # uncomment this to disable SSH key host checking 61 | host_key_checking = False 62 | 63 | # change the default callback, you can only have one 'stdout' type enabled at a time. 64 | #stdout_callback = skippy 65 | 66 | 67 | ## Ansible ships with some plugins that require whitelisting, 68 | ## this is done to avoid running all of a type by default. 69 | ## These setting lists those that you want enabled for your system. 70 | ## Custom plugins should not need this unless plugin author specifies it. 71 | 72 | # enable callback plugins, they can output to stdout but cannot be 'stdout' type. 
73 | #callback_whitelist = timer, mail 74 | 75 | # enable inventory plugins, default: 'host_list', 'script', 'yaml', 'ini' 76 | #inventory_enabled = host_list, aws, openstack, docker 77 | 78 | # Determine whether includes in tasks and handlers are "static" by 79 | # default. As of 2.0, includes are dynamic by default. Setting these 80 | # values to True will make includes behave more like they did in the 81 | # 1.x versions. 82 | #task_includes_static = True 83 | #handler_includes_static = True 84 | 85 | # Controls if a missing handler for a notification event is an error or a warning 86 | #error_on_missing_handler = True 87 | 88 | # change this for alternative sudo implementations 89 | #sudo_exe = sudo 90 | 91 | # What flags to pass to sudo 92 | # WARNING: leaving out the defaults might create unexpected behaviours 93 | #sudo_flags = -H -S -n 94 | 95 | # SSH timeout 96 | #timeout = 10 97 | 98 | # default user to use for playbooks if user is not specified 99 | # (/usr/bin/ansible will use current user as default) 100 | #remote_user = root 101 | 102 | # logging is off by default unless this path is defined 103 | # if so defined, consider logrotate 104 | # log_path = drill.log 105 | 106 | # default module name for /usr/bin/ansible 107 | #module_name = command 108 | 109 | # use this shell for commands executed under sudo 110 | # you may need to change this to bin/bash in rare instances 111 | # if sudo is constrained 112 | #executable = /bin/sh 113 | 114 | # if inventory variables overlap, does the higher precedence one win 115 | # or are hash values merged together? The default is 'replace' but 116 | # this can also be set to 'merge'. 117 | #hash_behaviour = replace 118 | 119 | # by default, variables from roles will be visible in the global variable 120 | # scope. 
To prevent this, the following option can be enabled, and only 121 | # tasks and handlers within the role will see the variables there 122 | #private_role_vars = yes 123 | 124 | # list any Jinja2 extensions to enable here: 125 | #jinja2_extensions = jinja2.ext.do,jinja2.ext.i18n 126 | 127 | # if set, always use this private key file for authentication, same as 128 | # if passing --private-key to ansible or ansible-playbook 129 | #private_key_file = /path/to/file 130 | 131 | # If set, configures the path to the Vault password file as an alternative to 132 | # specifying --vault-password-file on the command line. 133 | #vault_password_file = /path/to/vault_password_file 134 | 135 | # format of string {{ ansible_managed }} available within Jinja2 136 | # templates indicates to users editing templates files will be replaced. 137 | # replacing {file}, {host} and {uid} and strftime codes with proper values. 138 | #ansible_managed = Ansible managed: {file} modified on %Y-%m-%d %H:%M:%S by {uid} on {host} 139 | # {file}, {host}, {uid}, and the timestamp can all interfere with idempotence 140 | # in some situations so the default is a static string: 141 | #ansible_managed = Ansible managed 142 | 143 | # by default, ansible-playbook will display "Skipping [host]" if it determines a task 144 | # should not be run on a host. Set this to "False" if you don't want to see these "Skipping" 145 | # messages. NOTE: the task header will still be shown regardless of whether or not the 146 | # task is skipped. 147 | display_skipped_hosts = False 148 | 149 | # by default, if a task in a playbook does not include a name: field then 150 | # ansible-playbook will construct a header that includes the task's action but 151 | # not the task's args. This is a security feature because ansible cannot know 152 | # if the *module* considers an argument to be no_log at the time that the 153 | # header is printed. 
If your environment doesn't have a problem securing 154 | # stdout from ansible-playbook (or you have manually specified no_log in your 155 | # playbook on all of the tasks where you have secret information) then you can 156 | # safely set this to True to get more informative messages. 157 | display_args_to_stdout = False 158 | 159 | # by default (as of 1.3), Ansible will raise errors when attempting to dereference 160 | # Jinja2 variables that are not set in templates or action lines. Uncomment this line 161 | # to revert the behavior to pre-1.3. 162 | #error_on_undefined_vars = False 163 | 164 | # by default (as of 1.6), Ansible may display warnings based on the configuration of the 165 | # system running ansible itself. This may include warnings about 3rd party packages or 166 | # other conditions that should be resolved if possible. 167 | # to disable these warnings, set the following value to False: 168 | #system_warnings = True 169 | 170 | # by default (as of 1.4), Ansible may display deprecation warnings for language 171 | # features that should no longer be used and will be removed in future versions. 172 | # to disable these warnings, set the following value to False: 173 | #deprecation_warnings = True 174 | 175 | # (as of 1.8), Ansible can optionally warn when usage of the shell and 176 | # command module appear to be simplified by using a default Ansible module 177 | # instead. These warnings can be silenced by adjusting the following 178 | # setting or adding warn=yes or warn=no to the end of the command line 179 | # parameter string. This will for example suggest using the git module 180 | # instead of shelling out to the git command. 
181 | command_warnings = False 182 | 183 | 184 | # set plugin path directories here, separate with colons 185 | #action_plugins = /usr/share/ansible/plugins/action 186 | #cache_plugins = /usr/share/ansible/plugins/cache 187 | callback_plugins = callback_plugins 188 | #connection_plugins = /usr/share/ansible/plugins/connection 189 | #lookup_plugins = /usr/share/ansible/plugins/lookup 190 | #inventory_plugins = /usr/share/ansible/plugins/inventory 191 | #vars_plugins = /usr/share/ansible/plugins/vars 192 | #filter_plugins = /usr/share/ansible/plugins/filter 193 | #test_plugins = /usr/share/ansible/plugins/test 194 | #terminal_plugins = /usr/share/ansible/plugins/terminal 195 | #strategy_plugins = /usr/share/ansible/plugins/strategy 196 | 197 | 198 | # by default, ansible will use the 'linear' strategy but you may want to try 199 | # another one 200 | #strategy = free 201 | 202 | # by default callbacks are not loaded for /bin/ansible, enable this if you 203 | # want, for example, a notification or logging callback to also apply to 204 | # /bin/ansible runs 205 | #bin_ansible_callbacks = False 206 | 207 | 208 | # don't like cows? that's unfortunate. 209 | # set to 1 if you don't want cowsay support or export ANSIBLE_NOCOWS=1 210 | #nocows = 1 211 | 212 | # set which cowsay stencil you'd like to use by default. When set to 'random', 213 | # a random stencil will be selected for each task. The selection will be filtered 214 | # against the `cow_whitelist` option below. 215 | #cow_selection = default 216 | #cow_selection = random 217 | 218 | # when using the 'random' option for cowsay, stencils will be restricted to this list. 219 | # it should be formatted as a comma-separated list with no spaces between names. 220 | # NOTE: line continuations here are for formatting purposes only, as the INI parser 221 | # in python does not support them. 
222 | #cow_whitelist=bud-frogs,bunny,cheese,daemon,default,dragon,elephant-in-snake,elephant,eyes,\ 223 | # hellokitty,kitty,luke-koala,meow,milk,moofasa,moose,ren,sheep,small,stegosaurus,\ 224 | # stimpy,supermilker,three-eyes,turkey,turtle,tux,udder,vader-koala,vader,www 225 | 226 | # don't like colors either? 227 | # set to 1 if you don't want colors, or export ANSIBLE_NOCOLOR=1 228 | #nocolor = 1 229 | 230 | # if set to a persistent type (not 'memory', for example 'redis') fact values 231 | # from previous runs in Ansible will be stored. This may be useful when 232 | # wanting to use, for example, IP information from one group of servers 233 | # without having to talk to them in the same playbook run to get their 234 | # current IP information. 235 | #fact_caching = memory 236 | 237 | 238 | # retry files 239 | # When a playbook fails by default a .retry file will be created in ~/ 240 | # You can disable this feature by setting retry_files_enabled to False 241 | # and you can change the location of the files by setting retry_files_save_path 242 | 243 | retry_files_enabled = False 244 | #retry_files_save_path = ~/.ansible-retry 245 | 246 | # squash actions 247 | # Ansible can optimise actions that call modules with list parameters 248 | # when looping. Instead of calling the module once per with_ item, the 249 | # module is called once with all items at once. Currently this only works 250 | # under limited circumstances, and only with parameters named 'name'. 251 | #squash_actions = apk,apt,dnf,homebrew,pacman,pkgng,yum,zypper 252 | 253 | # prevents logging of task data, off by default 254 | #no_log = False 255 | 256 | # prevents logging of tasks, but only on the targets, data is still logged on the master/controller 257 | #no_target_syslog = False 258 | 259 | # controls whether Ansible will raise an error or warning if a task has no 260 | # choice but to create world readable temporary files to execute a module on 261 | # the remote machine. 
This option is False by default for security. Users may 262 | # turn this on to have behaviour more like Ansible prior to 2.1.x. See 263 | # https://docs.ansible.com/ansible/become.html#becoming-an-unprivileged-user 264 | # for more secure ways to fix this than enabling this option. 265 | #allow_world_readable_tmpfiles = False 266 | 267 | # controls the compression level of variables sent to 268 | # worker processes. At the default of 0, no compression 269 | # is used. This value must be an integer from 0 to 9. 270 | #var_compression_level = 9 271 | 272 | # controls what compression method is used for new-style ansible modules when 273 | # they are sent to the remote system. The compression types depend on having 274 | # support compiled into both the controller's python and the client's python. 275 | # The names should match with the python Zipfile compression types: 276 | # * ZIP_STORED (no compression. available everywhere) 277 | # * ZIP_DEFLATED (uses zlib, the default) 278 | # These values may be set per host via the ansible_module_compression inventory 279 | # variable 280 | #module_compression = 'ZIP_DEFLATED' 281 | 282 | # This controls the cutoff point (in bytes) on --diff for files 283 | # set to 0 for unlimited (RAM may suffer!). 284 | #max_diff_size = 1048576 285 | 286 | # This controls how ansible handles multiple --tags and --skip-tags arguments 287 | # on the CLI. If this is True then multiple arguments are merged together. If 288 | # it is False, then the last specified argument is used and the others are ignored. 289 | # This option will be removed in 2.8. 
290 | #merge_multiple_cli_flags = True 291 | 292 | # Controls showing custom stats at the end, off by default 293 | #show_custom_stats = True 294 | 295 | # Controls which files to ignore when using a directory as inventory with 296 | # possibly multiple sources (both static and dynamic) 297 | #inventory_ignore_extensions = ~, .orig, .bak, .ini, .cfg, .retry, .pyc, .pyo 298 | 299 | # This family of modules use an alternative execution path optimized for network appliances 300 | # only update this setting if you know how this works, otherwise it can break module execution 301 | #network_group_modules=['eos', 'nxos', 'ios', 'iosxr', 'junos', 'vyos'] 302 | 303 | # This keeps facts from polluting the main namespace as variables. 304 | # Setting to True keeps them under the ansible_facts namespace, the default is False 305 | #restrict_facts_namespace: True 306 | 307 | # When enabled, this option allows lookups (via variables like {{lookup('foo')}} or when used as 308 | # a loop with `with_foo`) to return data that is not marked "unsafe". This means the data may contain 309 | # jinja2 templating language which will be run through the templating engine. 310 | # ENABLING THIS COULD BE A SECURITY RISK 311 | #allow_unsafe_lookups = False 312 | 313 | # set default errors for all plays 314 | #any_errors_fatal = False 315 | 316 | [privilege_escalation] 317 | #become=True 318 | #become_method=sudo 319 | #become_user=root 320 | #become_ask_pass=False 321 | 322 | [paramiko_connection] 323 | 324 | # uncomment this line to cause the paramiko connection plugin to not record new host 325 | # keys encountered. Increases performance on new host additions. Setting works independently of the 326 | # host key checking setting above. 327 | #record_host_keys=False 328 | 329 | # by default, Ansible requests a pseudo-terminal for commands executed under sudo. Uncomment this 330 | # line to disable this behaviour. 
331 | #pty=False 332 | 333 | # paramiko will default to looking for SSH keys initially when trying to 334 | # authenticate to remote devices. This is a problem for some network devices 335 | # that close the connection after a key failure. Uncomment this line to 336 | # disable the Paramiko look for keys function 337 | #look_for_keys = False 338 | 339 | # When using persistent connections with Paramiko, the connection runs in a 340 | # background process. If the host doesn't already have a valid SSH key, by 341 | # default Ansible will prompt to add the host key. This will cause connections 342 | # running in background processes to fail. Uncomment this line to have 343 | # Paramiko automatically add host keys. 344 | #host_key_auto_add = True 345 | 346 | [ssh_connection] 347 | 348 | # ssh arguments to use 349 | # Leaving off ControlPersist will result in poor performance, so use 350 | # paramiko on older platforms rather than removing it, -C controls compression use 351 | #ssh_args = -C -o ControlMaster=auto -o ControlPersist=60s 352 | 353 | # The base directory for the ControlPath sockets. 354 | # This is the "%(directory)s" in the control_path option 355 | # 356 | # Example: 357 | # control_path_dir = /tmp/.ansible/cp 358 | #control_path_dir = ~/.ansible/cp 359 | 360 | # The path to use for the ControlPath sockets. This defaults to a hashed string of the hostname, 361 | # port and username (empty string in the config). The hash mitigates a common problem users 362 | # found with long hostames and the conventional %(directory)s/ansible-ssh-%%h-%%p-%%r format. 363 | # In those cases, a "too long for Unix domain socket" ssh error would occur. 364 | # 365 | # Example: 366 | # control_path = %(directory)s/%%h-%%r 367 | #control_path = 368 | 369 | # Enabling pipelining reduces the number of SSH operations required to 370 | # execute a module on the remote server. 
This can result in a significant 371 | # performance improvement when enabled, however when using "sudo:" you must 372 | # first disable 'requiretty' in /etc/sudoers 373 | # 374 | # By default, this option is disabled to preserve compatibility with 375 | # sudoers configurations that have requiretty (the default on many distros). 376 | # 377 | #pipelining = False 378 | 379 | # Control the mechanism for transferring files (old) 380 | # * smart = try sftp and then try scp [default] 381 | # * True = use scp only 382 | # * False = use sftp only 383 | #scp_if_ssh = smart 384 | 385 | # Control the mechanism for transferring files (new) 386 | # If set, this will override the scp_if_ssh option 387 | # * sftp = use sftp to transfer files 388 | # * scp = use scp to transfer files 389 | # * piped = use 'dd' over SSH to transfer files 390 | # * smart = try sftp, scp, and piped, in that order [default] 391 | #transfer_method = smart 392 | 393 | # if False, sftp will not use batch mode to transfer files. This may cause some 394 | # types of file transfer failures impossible to catch however, and should 395 | # only be disabled if your sftp version has problems with batch mode 396 | #sftp_batch_mode = False 397 | 398 | [persistent_connection] 399 | 400 | # Configures the persistent connection timeout value in seconds. This value is 401 | # how long the persistent connection will remain idle before it is destroyed. 402 | # If the connection doesn't receive a request before the timeout value 403 | # expires, the connection is shutdown. The default value is 30 seconds. 404 | #connect_timeout = 30 405 | 406 | # Configures the persistent connection retry timeout. This value configures the 407 | # the retry timeout that ansible-connection will wait to connect 408 | # to the local domain socket. This value must be larger than the 409 | # ssh timeout (timeout) and less than persistent connection idle timeout (connect_timeout). 410 | # The default value is 15 seconds. 
411 | #connect_retry_timeout = 15 412 | 413 | # The command timeout value defines the amount of time to wait for a command 414 | # or RPC call before timing out. The value for the command timeout must 415 | # be less than the value of the persistent connection idle timeout (connect_timeout) 416 | # The default value is 10 second. 417 | #command_timeout = 10 418 | 419 | [accelerate] 420 | #accelerate_port = 5099 421 | #accelerate_timeout = 30 422 | #accelerate_connect_timeout = 5.0 423 | 424 | # The daemon timeout is measured in minutes. This time is measured 425 | # from the last activity to the accelerate daemon. 426 | #accelerate_daemon_timeout = 30 427 | 428 | # If set to yes, accelerate_multi_key will allow multiple 429 | # private keys to be uploaded to it, though each user must 430 | # have access to the system via SSH to add a new key. The default 431 | # is "no". 432 | #accelerate_multi_key = yes 433 | 434 | [selinux] 435 | # file systems that require special treatment when dealing with security context 436 | # the default behaviour that copies the existing context or uses the user default 437 | # needs to be changed to use the file system dependent context. 438 | #special_context_filesystems=nfs,vboxsf,fuse,ramfs,9p 439 | 440 | # Set this to yes to allow libvirt_lxc connections to work without SELinux. 
441 | #libvirt_lxc_noseclabel = yes 442 | 443 | [colors] 444 | #highlight = white 445 | #verbose = blue 446 | #warn = bright purple 447 | #error = red 448 | #debug = dark gray 449 | #deprecate = purple 450 | #skip = cyan 451 | #unreachable = red 452 | #ok = green 453 | #changed = yellow 454 | #diff_add = green 455 | #diff_remove = red 456 | #diff_lines = cyan 457 | 458 | 459 | [diff] 460 | # Always print diff when running ( same as always running with -D/--diff ) 461 | # always = no 462 | 463 | # Set how many context lines to show in diff 464 | # context = 3 465 | -------------------------------------------------------------------------------- /config.yml: -------------------------------------------------------------------------------- 1 | manage_packages: true 2 | mgmt_nic_name: "eth0" 3 | tenant_nic_name: "eth0" 4 | provider_nic_name: "eth0" 5 | external_nic_name: "eth0" 6 | storage_nic_name: "eth0" 7 | storage_mgmt_nic_name: "eth0" 8 | 9 | stress_mysql_time: 300 10 | stress_mysql_host: localhost 11 | sysbench_threads: 1000 12 | sysbench_user: sysbench 13 | sysbench_password: sysbench 14 | sysbench_database: sysbench 15 | 16 | physical_network_bridge: ovsbr3 17 | physical_network_bridge_port: eth0 18 | 19 | external_network_bridge: br-ex 20 | external_network_bridge_port: eth0 21 | 22 | rabbit_host: localhost 23 | rabbit_username: guest 24 | rabbit_password: guest 25 | 26 | # Auth for OpenStack 27 | os_auth_url: 'http://127.0.0.1:5000/v3' 28 | os_project_domain_name: Default 29 | os_user_domain_name: Default 30 | os_admin_project: admin 31 | os_admin_username: admin 32 | os_admin_password: admin 33 | os_gremlin_role: member 34 | -------------------------------------------------------------------------------- /doc/README.rst: -------------------------------------------------------------------------------- 1 | ================= 2 | Building the docs 3 | ================= 4 | 5 | Dependencies 6 | ============ 7 | 8 | Sphinx_ 9 | You'll need sphinx (the python one) 
and if you are 10 | using the virtualenv you'll need to install it in the virtualenv 11 | specifically so that it can load the cinder modules. 12 | 13 | :: 14 | 15 | pip install Sphinx 16 | 17 | Graphviz_ 18 | Some of the diagrams are generated using the ``dot`` language 19 | from Graphviz. 20 | 21 | :: 22 | 23 | sudo apt-get install graphviz 24 | 25 | .. _Sphinx: http://sphinx.pocoo.org 26 | 27 | .. _Graphviz: http://www.graphviz.org/ 28 | 29 | 30 | Use `make` 31 | ========== 32 | 33 | Just type make:: 34 | 35 | % make 36 | 37 | Look in the Makefile for more targets. 38 | 39 | 40 | Manually 41 | ======== 42 | 43 | 1. Generate the code.rst file so that Sphinx will pull in our docstrings:: 44 | 45 | % ./generate_autodoc_index.sh > source/code.rst 46 | 47 | 2. Run `sphinx_build`:: 48 | 49 | % sphinx-build -b html source build/html 50 | 51 | 52 | The docs have been built 53 | ======================== 54 | 55 | Check out the `build` directory to find them. Yay! 56 | -------------------------------------------------------------------------------- /doc/generate_autodoc_index.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | SOURCEDIR=doc/source/api 4 | 5 | if [ ! -d ${SOURCEDIR} ] ; then 6 | mkdir -p ${SOURCEDIR} 7 | fi 8 | 9 | for x in `./doc/find_autodoc_modules.sh`; 10 | do 11 | echo "Generating ${SOURCEDIR}/${x}.rst" 12 | echo "${SOURCEDIR}/${x}.rst" >> .autogenerated 13 | heading="The :mod:\`${x}\` Module" 14 | # Figure out how long the heading is 15 | # and make sure to emit that many '=' under 16 | # it to avoid heading format errors 17 | # in Sphinx. 18 | heading_len=$(echo "$heading" | wc -c) 19 | underline=$(head -c $heading_len < /dev/zero | tr '\0' '=') 20 | ( cat < ${SOURCEDIR}/${x}.rst 30 | 31 | done 32 | 33 | if [ ! 
-f ${SOURCEDIR}/autoindex.rst ] ; then 34 | 35 | cat > ${SOURCEDIR}/autoindex.rst <> ${SOURCEDIR}/autoindex.rst 43 | done 44 | 45 | echo ${SOURCEDIR}/autoindex.rst >> .autogenerated 46 | fi 47 | -------------------------------------------------------------------------------- /drill.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # With LANG set to everything else than C completely undercipherable errors 4 | # like "file not found" and decoding errors will start to appear during scripts 5 | # or even ansible modules 6 | LANG=C 7 | 8 | GREM_DIR=$(dirname $( readlink -f "${BASH_SOURCE[0]}" )) 9 | DEFAULT_OPT_TAGS="untagged" 10 | 11 | 12 | : ${OPT_TAGS:=$DEFAULT_OPT_TAGS} 13 | : ${OPT_PLAYBOOK:=$GREM_DIR/playbooks/drill.yml} 14 | : ${OPT_WORKDIR:=$GREM_DIR/.gremlin} 15 | : ${OPT_CONFIG:=$GREM_DIR/config.yml} 16 | : ${OPT_MODE:=auto} 17 | 18 | 19 | install_deps () { 20 | sudo yum -y install epel-release 21 | sudo yum clean all 22 | sudo yum makecache 23 | sudo yum -y install ansible git 24 | } 25 | 26 | usage () { 27 | echo "Usage: $0 --install-deps" 28 | echo " install quickstart package dependencies and exit" 29 | echo "" 30 | echo "Usage: $0 [options]" 31 | echo "" 32 | echo "Basic options:" 33 | echo " -p, --playbook " 34 | echo " playbook to run(default=$OPT_PLAYBOOK)" 35 | echo " -i, --inventory " 36 | echo " specify inventory host path" 37 | echo " (default=./inventory/hosts) or comma separated host list" 38 | echo " -c, --config " 39 | echo " specify the config file that contains the node" 40 | echo " configuration, can be used only once" 41 | echo " (default=$OPT_CONFIG)" 42 | echo " -m, --mode " 43 | echo " specify mode to run, there are two modes: manual, auto" 44 | echo " (default=$OPT_MODE)" 45 | echo " -S, --step" 46 | echo " execute playbooks or tasks step by step" 47 | echo " --syntax-check" 48 | echo " perform a syntax check on the playbook, but do not" 49 | echo " execute it" 50 | echo "" 51 
| echo "Advanced options:" 52 | echo " -v, --ansible-debug" 53 | echo " invoke ansible-playbook with -vvvv" 54 | echo " -e, --extra-vars =" 55 | echo " additional ansible variables, can be used multiple times" 56 | echo " -t, --tags [,,...]" 57 | echo " only run plays and tasks tagged with these values," 58 | echo " specify 'all' to run everything" 59 | echo " (default=$OPT_TAGS)" 60 | echo " -s, --skip-tags [,,...]" 61 | echo " only run plays and tasks whose tags do" 62 | echo " not match these values" 63 | echo " -w, --working-dir " 64 | echo " directory where the inventory, config files, etc." 65 | echo " are created (default=$OPT_WORKDIR)" 66 | echo " -h, --help print this help and exit" 67 | } 68 | 69 | OPT_VARS=() 70 | 71 | while [ "x$1" != "x" ]; do 72 | case "$1" in 73 | --install-deps) 74 | OPT_INSTALL_DEPS=1 75 | ;; 76 | --inventory|-i) 77 | OPT_INVENTORY=$2 78 | shift 79 | ;; 80 | --playbook|-p) 81 | OPT_PLAYBOOK=$2 82 | shift 83 | ;; 84 | --extra-vars|-e) 85 | OPT_VARS+=("-e") 86 | OPT_VARS+=("$2") 87 | shift 88 | ;; 89 | --config|-c) 90 | OPT_CONFIG=$2 91 | shift 92 | ;; 93 | --mode|-m) 94 | OPT_MODE=$2 95 | shift 96 | ;; 97 | --step|-s) 98 | OPT_STEP=1 99 | ;; 100 | --syntax-check) 101 | OPT_SYNTAX_CHECK=1 102 | ;; 103 | --ansible-debug|-v) 104 | OPT_DEBUG_ANSIBLE=1 105 | ;; 106 | --tags|-t) 107 | OPT_TAGS=$2 108 | shift 109 | ;; 110 | --skip-tags|-S) 111 | OPT_SKIP_TAGS=$2 112 | shift 113 | ;; 114 | --working-dir|-w) 115 | OPT_WORKDIR=$(realpath $2) 116 | shift 117 | ;; 118 | --help|-h) 119 | usage 120 | exit 121 | ;; 122 | --) 123 | shift 124 | break 125 | ;; 126 | *) 127 | break 128 | ;; 129 | esac 130 | shift 131 | done 132 | 133 | 134 | if [ "$OPT_INSTALL_DEPS" = 1 ]; then 135 | echo "NOTICE: installing dependencies" 136 | install_deps 137 | exit $? 
138 | fi 139 | 140 | 141 | if [ "$#" -gt 2 ]; then 142 | usage >&2 143 | exit 2 144 | fi 145 | 146 | 147 | set -ex 148 | 149 | export ANSIBLE_CONFIG=$GREM_DIR/ansible.cfg 150 | export ANSIBLE_INVENTORY=$GREM_DIR/inventory/hosts 151 | 152 | if [ "$OPT_DEBUG_ANSIBLE" = 1 ]; then 153 | VERBOSITY=vvvv 154 | else 155 | VERBOSITY=vv 156 | fi 157 | 158 | ansible-playbook -$VERBOSITY $OPT_PLAYBOOK \ 159 | -e @$OPT_CONFIG \ 160 | -e local_working_dir=$OPT_WORKDIR \ 161 | -e mode=$OPT_MODE \ 162 | ${OPT_VARS[@]} \ 163 | ${OPT_INVENTORY:+-i $OPT_INVENTORY} \ 164 | ${OPT_TAGS:+-t $OPT_TAGS} \ 165 | ${OPT_SKIP_TAGS:+--skip-tags $OPT_SKIP_TAGS} \ 166 | ${OPT_STEP:+--step} \ 167 | ${OPT_SYNTAX_CHECK:+--syntax-check} 168 | 169 | set +x 170 | -------------------------------------------------------------------------------- /files/get-random-osd.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from __future__ import print_function 4 | 5 | import subprocess 6 | import json 7 | import socket 8 | import random 9 | import time 10 | import argparse 11 | 12 | bucket_id_map = {} 13 | host_osd_map = {} 14 | osd_host_map = {} 15 | pg_map = {} 16 | 17 | def init_osd_map(): 18 | 19 | CMD = ['ceph', 'osd', 'crush', 'dump', '--format', 'json'] 20 | 21 | ceph_osd_crush_dump_json = subprocess.check_output(CMD) 22 | ceph_osd_crush_dump = json.loads(ceph_osd_crush_dump_json) 23 | 24 | global bucket_id_map 25 | for bucket in ceph_osd_crush_dump['buckets']: 26 | bucket_id_map[bucket['id']] = dict({'type': bucket['type_name'], 27 | 'name': bucket['name'], 28 | 'items': [ item['id'] for item in bucket['items']]}) 29 | 30 | host_id_list = [ item for item in bucket_id_map.keys() 31 | if bucket_id_map[item]['type'] == 'host'] 32 | global host_osd_map 33 | for item in host_id_list: 34 | host_name = bucket_id_map[item]['name'] 35 | host_ip = socket.gethostbyname(host_name) 36 | host_osds = bucket_id_map[item]['items'] 37 |
host_osd_map[host_name] = dict({'mgmt': host_ip, 38 | 'id': item, 39 | 'osds': host_osds}) 40 | 41 | global osd_host_map 42 | for host in host_osd_map.keys(): 43 | for osd in host_osd_map[host]['osds']: 44 | osd_name = ceph_osd_crush_dump['devices'][osd]['name'] 45 | osd_mgmt = host_osd_map[host]['mgmt'] 46 | osd_host_map[osd] = dict({'host': host, 47 | 'name': osd_name, 48 | 'mgmt': osd_mgmt}) 49 | 50 | def init_pg_map(): 51 | 52 | CMD = ['ceph', 'pg', 'dump', '--format', 'json'] 53 | 54 | ceph_pg_dump_json = subprocess.check_output(CMD) 55 | ceph_pg_dump = json.loads(ceph_pg_dump_json) 56 | 57 | global pg_map 58 | for item in ceph_pg_dump['pg_stats']: 59 | pg_id = item['pgid'] 60 | pg_map[pg_id] = item['acting'] 61 | 62 | def get_random_osd(map_data, num=1): 63 | 64 | random_osd = {} 65 | 66 | random.seed(time.time()) 67 | try: 68 | random_osd_list = random.sample(map_data.keys(), num) 69 | except ValueError: 70 | print("The total osds is {TOTAL}".format(TOTAL=len(map_data.keys()))) 71 | random_osd_list = map_data.keys() 72 | random_osd['size'] = len(random_osd_list) 73 | random_osd['items'] = {} 74 | 75 | for osd in random_osd_list: 76 | random_osd['items'][osd] = map_data[osd] 77 | 78 | return random_osd 79 | 80 | def get_random_osd_from_pg(pg_map, osd_map): 81 | 82 | random_osd = {} 83 | 84 | random_pg_id = random.choice(pg_map.keys()) 85 | random_osd['pgid'] = random_pg_id 86 | 87 | random_osd_list = pg_map[random_pg_id] 88 | random_osd['size'] = len(random_osd_list) 89 | random_osd['items'] = {} 90 | 91 | for osd in random_osd_list: 92 | random_osd['items'][osd] = osd_map[osd] 93 | 94 | return random_osd 95 | 96 | def reformat_mgmt_osd_map(osd_map): 97 | 98 | mgmt_osd_map = {} 99 | 100 | for osd in osd_map: 101 | mgmt_addr = osd_map[osd]['mgmt'] 102 | if mgmt_addr not in mgmt_osd_map.keys(): 103 | mgmt_osd_map[mgmt_addr] = [] 104 | mgmt_osd_map[mgmt_addr].append(osd) 105 | 106 | return mgmt_osd_map 107 | 108 | def output_data(data, output_format='plain', 
filename=None): 109 | 110 | if filename: 111 | if output_format == 'plain': 112 | file_handler = open(filename, 'w') 113 | file_handler.write(data) 114 | elif output_format == 'json': 115 | file_handler = open(filename + '.json', 'w') 116 | file_handler.write(json.dumps(data)) 117 | elif output_format == 'json-pretty': 118 | file_handler = open(filename + '.json', 'w') 119 | file_handler.write(json.dumps(data, indent=2)) 120 | else: 121 | print("Unsupport this {FORMAT} format".format(FORMAT=output_format)) 122 | file_handler.close() 123 | else: 124 | if output_format == 'plain': 125 | print(data) 126 | elif output_format == 'json': 127 | print(json.dumps(data)) 128 | elif output_format == 'json-pretty': 129 | print(json.dumps(data, indent=2)) 130 | else: 131 | print("Unsupport this {FORMAT} format!".format(FORMAT=output_format)) 132 | 133 | def init_argument(parser): 134 | 135 | parser.add_argument('-n', '--number', nargs=1, type=int) 136 | parser.add_argument('-p', '--percentage', nargs=1, type=int) 137 | parser.add_argument('--list-host-map', action='store_true') 138 | parser.add_argument('--list-osd-map', action='store_true') 139 | parser.add_argument('--list-pg-map', action='store_true') 140 | parser.add_argument('-F', '--format', nargs=1) 141 | parser.add_argument('-f', '--file', nargs=1) 142 | parser.add_argument('--get-random-osd', action='store_true') 143 | parser.add_argument('--get-pg-osd', action='store_true') 144 | parser.add_argument('--mgmt-osd', action='store_true') 145 | 146 | args = parser.parse_args() 147 | 148 | return args 149 | 150 | def take_action(args): 151 | 152 | if isinstance(args.format, list): 153 | output_format = args.format[0] 154 | else: 155 | output_format = 'plain' 156 | 157 | if isinstance(args.file, list): 158 | output_filename = args.file[0] 159 | else: 160 | output_filename = None 161 | 162 | if isinstance(args.percentage, list): 163 | percentage = args.percentage[0] 164 | osd_number = int(len(osd_host_map.keys()) * 
(percentage / 100.0)) 165 | elif isinstance(args.number, list): 166 | osd_number = args.number[0] 167 | else: 168 | osd_number = 1 169 | 170 | if args.mgmt_osd: 171 | mgmt_osd_enabled = True 172 | else: 173 | mgmt_osd_enabled = False 174 | 175 | if args.list_host_map: 176 | output_data(host_osd_map, output_format, output_filename) 177 | 178 | if args.list_osd_map: 179 | if mgmt_osd_enabled: 180 | output_map = reformat_mgmt_osd_map(osd_host_map) 181 | else: 182 | output_map = osd_host_map 183 | output_data(output_map, output_format, output_filename) 184 | 185 | if args.list_pg_map: 186 | output_data(pg_map, output_format, output_filename) 187 | 188 | if args.get_random_osd: 189 | random_osd = get_random_osd(osd_host_map, osd_number) 190 | if mgmt_osd_enabled: 191 | output_map_temp = reformat_mgmt_osd_map(random_osd['items']) 192 | output_map = dict({'size': random_osd['size'], 193 | 'items': output_map_temp}) 194 | else: 195 | output_map = random_osd 196 | output_data(output_map, output_format, output_filename) 197 | 198 | if args.get_pg_osd: 199 | random_osd = get_random_osd_from_pg(pg_map, osd_host_map) 200 | if mgmt_osd_enabled: 201 | osd_list = random_osd['items'].keys() 202 | output_list = [] 203 | for num in range(1,random_osd['size'] + 1): 204 | random_osd_temp = {} 205 | for osd in osd_list[:num]: 206 | random_osd_temp[osd] = random_osd['items'][osd] 207 | output_list.append(reformat_mgmt_osd_map(random_osd_temp)) 208 | output_map = dict({'size': random_osd['size'], 209 | 'items': output_list}) 210 | else: 211 | output_map = random_osd 212 | output_data(output_map, output_format, output_filename) 213 | 214 | if __name__ == '__main__': 215 | 216 | init_osd_map() 217 | init_pg_map() 218 | 219 | parser = argparse.ArgumentParser() 220 | args = init_argument(parser) 221 | 222 | take_action(args) 223 | -------------------------------------------------------------------------------- /gremlin.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/unitedstack/gremlin/017fe09d80040019df7ed387bf1001114944f4c2/gremlin.png -------------------------------------------------------------------------------- /inventory/group_vars/all: -------------------------------------------------------------------------------- 1 | manage_packages: true 2 | mgmt_nic_name: "eth0" 3 | tenant_nic_name: "eth0" 4 | provider_nic_name: "eth0" 5 | external_nic_name: "eth0" 6 | storage_nic_name: "eth0" 7 | storage_mgmt_nic_name: "eth0" 8 | 9 | stress_mysql_time: 300 10 | stress_mysql_host: localhost 11 | sysbench_threads: 1000 12 | sysbench_user: sysbench 13 | sysbench_password: sysbench 14 | sysbench_database: sysbench 15 | 16 | physical_network_bridge: ovsbr3 17 | physical_network_bridge_port: eth0 18 | 19 | external_network_bridge: br-ex 20 | external_network_bridge_port: eth0 21 | 22 | case_prefix_map: 23 | ctl: control 24 | com: compute 25 | net: network 26 | sto: storage 27 | sys: system 28 | -------------------------------------------------------------------------------- /inventory/hosts: -------------------------------------------------------------------------------- 1 | # ------------------------------------------ 2 | # # High-level hostgroups 3 | # # 4 | # # Add hosts to these groups (ideally by creating an additional inventory 5 | # # file in the inventory directory, rather than editing this file) to 6 | # # set up typical groups of services. 
7 | 8 | myip ansible_connection=local ansible_become=true 9 | 10 | # Node role 11 | [api] 12 | myip 13 | 14 | [db] 15 | myip 16 | 17 | [mq] 18 | myip 19 | 20 | [mc] 21 | myip 22 | 23 | [lb] 24 | myip 25 | 26 | [mon] 27 | myip 28 | 29 | [osd] 30 | myip 31 | 32 | [rgw] 33 | myip 34 | 35 | 36 | # Node role group 37 | [control:children] 38 | api 39 | db 40 | mq 41 | mc 42 | lb 43 | 44 | [network] 45 | myip 46 | 47 | [compute] 48 | myip 49 | 50 | [storage:children] 51 | mon 52 | osd 53 | rgw 54 | -------------------------------------------------------------------------------- /inventory/structure: -------------------------------------------------------------------------------- 1 | # ------------------------------------------ 2 | # # High-level hostgroups 3 | # # 4 | # # Add hosts to these groups (ideally by creating an additional inventory 5 | # # file in the inventory directory, rather than editing this file) to 6 | # # set up typical groups of services. 7 | 8 | myip ansible_connection=local ansible_become=true 9 | 10 | # Node role 11 | [api] 12 | myip 13 | 14 | [db] 15 | myip 16 | 17 | [mq] 18 | myip 19 | 20 | [mc] 21 | myip 22 | 23 | [lb] 24 | myip 25 | 26 | [mon] 27 | myip 28 | 29 | [osd] 30 | myip 31 | 32 | [rgw] 33 | myip 34 | 35 | 36 | # Node role group 37 | [control:children] 38 | api 39 | db 40 | mq 41 | mc 42 | lb 43 | 44 | [network] 45 | myip 46 | 47 | [compute] 48 | myip 49 | 50 | [storage:children] 51 | mon 52 | osd 53 | rgw 54 | -------------------------------------------------------------------------------- /playbooks/case.yml: -------------------------------------------------------------------------------- 1 | # This is the entry point of case, you can specify the 2 | # case number to be executed. 
3 | 4 | # The case number will be prefixed with the following short name: 5 | 6 | # ctl: stands for control services related case 7 | # com: stands for compute services related case 8 | # net: stands for network services related case 9 | # sto: stands for storage services related case 10 | # sys: stands for system services related case 11 | 12 | - name: Parse the case number to playbook path 13 | hosts: localhost 14 | connection: local 15 | gather_facts: false 16 | tasks: 17 | - name: Parse cid to get case and id 18 | set_fact: 19 | case: "{{ case_prefix_map[cid.split('-')[0]] }}" 20 | id: "{{ cid.split('-')[1] }}" 21 | when: 22 | - cid.split('-') | length == 2 23 | 24 | - name: Validate CID 25 | fail: 26 | msg: "Wrong cid: {{ cid }}, the cid format should be like: net-001" 27 | when: 28 | - case is not defined or id is not defined 29 | 30 | - name: Generate case path 31 | set_fact: 32 | case_path: "cases/{{ case }}/{{ id }}.yml" 33 | 34 | - name: Check if case playbook exists 35 | stat: 36 | path: "{{ case_path }}" 37 | register: state_result 38 | 39 | - name: Exit if case path does not exist 40 | fail: 41 | msg: "The case {{ case_path }} does not exist, please check it" 42 | when: 43 | - state_result.stat.exists == false 44 | 45 | - name: Print the case path 46 | debug: 47 | msg: The {{ case_path }} will be executed 48 | -------------------------------------------------------------------------------- /playbooks/cases/compute/001.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unitedstack/gremlin/017fe09d80040019df7ed387bf1001114944f4c2/playbooks/cases/compute/001.yml -------------------------------------------------------------------------------- /playbooks/cases/control/001.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unitedstack/gremlin/017fe09d80040019df7ed387bf1001114944f4c2/playbooks/cases/control/001.yml
-------------------------------------------------------------------------------- /playbooks/cases/network/001.yml: -------------------------------------------------------------------------------- 1 | - name: "Network Case 001: Router unavailable when host in neutron changed" 2 | hosts: networker 3 | gather_facts: false 4 | tasks: 5 | - name: Test 6 | debug: 7 | msg: "The case 001 is executing" 8 | -------------------------------------------------------------------------------- /playbooks/cases/storage/001.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unitedstack/gremlin/017fe09d80040019df7ed387bf1001114944f4c2/playbooks/cases/storage/001.yml -------------------------------------------------------------------------------- /playbooks/cases/system/001.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unitedstack/gremlin/017fe09d80040019df7ed387bf1001114944f4c2/playbooks/cases/system/001.yml -------------------------------------------------------------------------------- /playbooks/common/ask.yml: -------------------------------------------------------------------------------- 1 | - name: Ask if recover this fault when in manual mode 2 | hosts: localhost 3 | gather_facts: false 4 | vars: 5 | execute: "{{ hostvars['localhost']['execute'] }}" 6 | tasks: 7 | - set_fact: 8 | recovery: true 9 | - block: 10 | - pause: 11 | prompt: "Will recover this fault? 
(y|yes, n|no)" 12 | register: _recovery 13 | - set_fact: 14 | recovery: "{{ _recovery.user_input }}" 15 | when: 16 | - mode == "manual" 17 | - execute == "y" or execute == true 18 | -------------------------------------------------------------------------------- /playbooks/common/create_auth.yml: -------------------------------------------------------------------------------- 1 | - name: Create Auth 2 | hosts: "{{ api_host }}" 3 | gather_facts: false 4 | tasks: 5 | - include_role: 6 | name: provision/os_auth 7 | tasks_from: create_auth 8 | -------------------------------------------------------------------------------- /playbooks/common/next.yml: -------------------------------------------------------------------------------- 1 | - name: "{{ case_name }}" 2 | hosts: localhost 3 | gather_facts: false 4 | tasks: 5 | - set_fact: 6 | execute: true 7 | - block: 8 | - pause: 9 | prompt: "Will execute this case? (y|yes, n|no)" 10 | register: _execute 11 | - set_fact: 12 | execute: "{{ _execute.user_input }}" 13 | when: 14 | - mode == "manual" 15 | -------------------------------------------------------------------------------- /playbooks/common/port.yml: -------------------------------------------------------------------------------- 1 | # This playbook will do specified port drill test cases, and if 2 | # in manual mode, it will prompt to ask if you want to recover it. 
3 | 4 | - include: next.yml 5 | vars: 6 | case_name: "CASE: Delete {{ ovs_port }} port from {{ ovs_bridge }} on {{ random_hosts }}" 7 | 8 | - name: "Drill the case" 9 | hosts: "{{ random_hosts }}" 10 | gather_facts: false 11 | vars: 12 | execute: "{{ hostvars['localhost']['execute'] }}" 13 | tasks: 14 | - include_role: 15 | name: common 16 | tasks_from: port_del 17 | when: 18 | - execute == "y" or execute == true 19 | 20 | - include: ask.yml 21 | 22 | - name: "Recover the fault" 23 | hosts: "{{ random_hosts }}" 24 | gather_facts: false 25 | vars: 26 | execute: "{{ hostvars['localhost']['execute'] }}" 27 | recovery: "{{ hostvars['localhost']['recovery'] }}" 28 | tasks: 29 | - include_role: 30 | name: common 31 | tasks_from: port_add 32 | when: 33 | - mode == "auto" or recovery == "y" or recovery == true 34 | - execute == "y" or execute == true 35 | -------------------------------------------------------------------------------- /playbooks/common/remove_auth.yml: -------------------------------------------------------------------------------- 1 | - name: Remove Auth 2 | hosts: "{{ api_host }}" 3 | gather_facts: false 4 | tasks: 5 | - include_role: 6 | name: provision/os_auth 7 | tasks_from: remove_auth 8 | -------------------------------------------------------------------------------- /playbooks/common/service.yml: -------------------------------------------------------------------------------- 1 | # This playbook will do specified service drill test cases, and if 2 | # in manual mode, it will prompt to ask if you want to recover it. 
3 | 4 | - include: next.yml 5 | vars: 6 | case_name: "CASE: Stop {{ service_name }} service on {{ random_hosts }}" 7 | 8 | - name: "Drill the case" 9 | hosts: "{{ random_hosts }}" 10 | gather_facts: false 11 | vars: 12 | execute: "{{ hostvars['localhost']['execute'] }}" 13 | tasks: 14 | - include_role: 15 | name: common 16 | tasks_from: stop_service 17 | when: 18 | - execute == "y" or execute == true 19 | 20 | - include: ask.yml 21 | 22 | - name: "Recover the fault" 23 | hosts: "{{ random_hosts }}" 24 | gather_facts: false 25 | vars: 26 | execute: "{{ hostvars['localhost']['execute'] }}" 27 | recovery: "{{ hostvars['localhost']['recovery'] }}" 28 | tasks: 29 | - include_role: 30 | name: common 31 | tasks_from: start_service 32 | when: 33 | - mode == "auto" or recovery == "y" or recovery == true 34 | - execute == "y" or execute == true 35 | -------------------------------------------------------------------------------- /playbooks/compute/service.yml: -------------------------------------------------------------------------------- 1 | ## Service Level 2 | 3 | # Stop and start nova releated services on compute node 4 | - include: ../common/service.yml 5 | vars: 6 | random_hosts: random_one_compute_host 7 | service_name: openstack-nova-compute 8 | tags: 9 | - nova 10 | - nova-compute-down 11 | 12 | 13 | # Stop and start libvirt releated services on compute node 14 | - include: ../common/service.yml 15 | vars: 16 | random_hosts: random_one_compute_host 17 | service_name: libvirtd 18 | tags: 19 | - libvirtd 20 | - libvirtd-down 21 | 22 | 23 | # Stop and start Neutron Open vSwitch agent services on compute node 24 | - include: ../common/service.yml 25 | vars: 26 | random_hosts: random_one_compute_host 27 | service_name: neutron-openvswitch-agent 28 | tags: 29 | - ovs-agent 30 | - compute-ovs-agent-down 31 | 32 | 33 | # Stop and start Open vSwitch services on compute node 34 | - include: ../common/service.yml 35 | vars: 36 | random_hosts: random_one_compute_host 37 | 
service_name: openvswitch-nonetwork 38 | tags: 39 | - ovs 40 | - compute-ovs-down 41 | -------------------------------------------------------------------------------- /playbooks/compute/system.yml: -------------------------------------------------------------------------------- 1 | # System Level 2 | - include: ../system/base.yml 3 | vars: 4 | random_hosts: random_one_compute_host 5 | node_group: compute 6 | 7 | - include: ../system/nic.yml 8 | vars: 9 | random_hosts: random_one_compute_host 10 | node_group: compute 11 | nic_type: tenant 12 | nic: "{{ tenant_nic_name }}" 13 | tags: 14 | - compute-tenant-nic 15 | 16 | - include: ../system/nic.yml 17 | vars: 18 | random_hosts: random_one_compute_host 19 | node_group: compute 20 | nic_type: provider 21 | nic: "{{ provider_nic_name }}" 22 | tags: 23 | - compute-provider-nic 24 | 25 | - include: ../system/nic.yml 26 | vars: 27 | random_hosts: random_one_compute_host 28 | node_group: compute 29 | nic_type: storage 30 | nic: "{{ storage_nic_name }}" 31 | tags: 32 | - compute-storage-nic 33 | -------------------------------------------------------------------------------- /playbooks/control/cinder/service.yml: -------------------------------------------------------------------------------- 1 | # Stop and start cinder releated services on control node 2 | 3 | # cinder-api 4 | - include: ../../common/service.yml 5 | vars: 6 | random_hosts: random_one_api_host 7 | service_name: openstack-cinder-api 8 | 9 | - include: ../../common/service.yml 10 | vars: 11 | random_hosts: random_two_api_hosts 12 | service_name: openstack-cinder-api 13 | 14 | - include: ../../common/service.yml 15 | vars: 16 | random_hosts: random_three_api_hosts 17 | service_name: openstack-cinder-api 18 | 19 | # cinder-volume 20 | - include: ../../common/service.yml 21 | vars: 22 | random_hosts: random_one_api_host 23 | service_name: openstack-cinder-volume 24 | 25 | - include: ../../common/service.yml 26 | vars: 27 | random_hosts: random_two_api_hosts 28 | 
service_name: openstack-cinder-volume 29 | 30 | - include: ../../common/service.yml 31 | vars: 32 | random_hosts: random_three_api_hosts 33 | service_name: openstack-cinder-volume 34 | 35 | # cinder-scheduler 36 | - include: ../../common/service.yml 37 | vars: 38 | random_hosts: random_one_api_host 39 | service_name: openstack-cinder-scheduler 40 | 41 | - include: ../../common/service.yml 42 | vars: 43 | random_hosts: random_two_api_hosts 44 | service_name: openstack-cinder-scheduler 45 | 46 | - include: ../../common/service.yml 47 | vars: 48 | random_hosts: random_three_api_hosts 49 | service_name: openstack-cinder-scheduler 50 | -------------------------------------------------------------------------------- /playbooks/control/drill_api.yml: -------------------------------------------------------------------------------- 1 | - name: Preparing for api node tests 2 | hosts: localhost 3 | connection: local 4 | gather_facts: false 5 | vars: 6 | shuffle_api_hosts: "{{ groups['api'] | shuffle }}" 7 | api_group_num: "{{ groups['api'] | length }}" 8 | tasks: 9 | - name: Random select one api host 10 | add_host: 11 | groups: random_one_api_host 12 | name: "{{ groups['api'] | random }}" 13 | when: 14 | - api_group_num | int >= 1 15 | 16 | - name: Random select two api hosts 17 | add_host: 18 | groups: random_two_api_hosts 19 | name: "{{ item }}" 20 | with_items: 21 | - "{{ shuffle_api_hosts[:2] }}" 22 | when: 23 | - api_group_num | int >= 2 24 | 25 | - name: Random select three api hosts 26 | add_host: 27 | groups: random_three_api_hosts 28 | name: "{{ item }}" 29 | with_items: 30 | - "{{ shuffle_api_hosts[:3] }}" 31 | when: 32 | - api_group_num | int >= 3 33 | tags: 34 | - api-pre 35 | 36 | ## Service Level 37 | 38 | # Nova 39 | - include: nova/service.yml 40 | tags: 41 | - service 42 | - api 43 | - api-service 44 | - control 45 | - nova 46 | 47 | # Cinder 48 | - include: cinder/service.yml 49 | tags: 50 | - service 51 | - api 52 | - api-service 53 | - control 54 | - 
cinder 55 | 56 | # Glance 57 | - include: glance/service.yml 58 | tags: 59 | - service 60 | - api 61 | - api-service 62 | - control 63 | - glance 64 | 65 | # Neutron 66 | - include: neutron/service.yml 67 | tags: 68 | - service 69 | - api 70 | - api-service 71 | - control 72 | - neutron 73 | 74 | # Keystone 75 | - include: keystone/service.yml 76 | tags: 77 | - service 78 | - api 79 | - api-service 80 | - control 81 | - keystone 82 | -------------------------------------------------------------------------------- /playbooks/control/drill_db.yml: -------------------------------------------------------------------------------- 1 | - name: Preparing for db node tests 2 | hosts: localhost 3 | connection: local 4 | gather_facts: false 5 | vars: 6 | shuffle_db_hosts: "{{ groups['db'] | shuffle }}" 7 | db_group_num: "{{ groups['db'] | length }}" 8 | tasks: 9 | - name: Random select one db host 10 | add_host: 11 | groups: random_one_db_host 12 | name: "{{ groups['db'] | random }}" 13 | when: 14 | - db_group_num | int >= 1 15 | 16 | - name: Random select two db hosts 17 | add_host: 18 | groups: random_two_db_hosts 19 | name: "{{ item }}" 20 | with_items: 21 | - "{{ shuffle_db_hosts[:2] }}" 22 | when: 23 | - db_group_num | int >= 2 24 | 25 | - name: Random select three db hosts 26 | add_host: 27 | groups: random_three_db_hosts 28 | name: "{{ item }}" 29 | with_items: 30 | - "{{ shuffle_db_hosts[:3] }}" 31 | when: 32 | - db_group_num | int >= 3 33 | tags: 34 | - db-pre 35 | 36 | 37 | # Service down 38 | - include: mysql/service.yml 39 | tags: 40 | - service 41 | - db 42 | - db-service 43 | - control 44 | 45 | 46 | # We mainly test the impact to the MySQL cluster when 47 | # there are network problems 48 | - include: mysql/system.yml 49 | tags: 50 | - system 51 | - db 52 | - db-system 53 | - control 54 | 55 | # Stress MySQL Cluster 56 | - include: mysql/stress.yml 57 | tags: 58 | - service 59 | - db 60 | - db-service 61 | - control 62 | - db-stress 63 | 
-------------------------------------------------------------------------------- /playbooks/control/drill_hy.yml: -------------------------------------------------------------------------------- 1 | - name: Preparing for control hyper node tests 2 | hosts: localhost 3 | connection: local 4 | gather_facts: false 5 | vars: 6 | shuffle_control_hosts: "{{ groups['control'] | shuffle }}" 7 | control_group_num: "{{ groups['control'] | length }}" 8 | tasks: 9 | - name: Random select one control host 10 | add_host: 11 | groups: random_one_control_host 12 | name: "{{ groups['control'] | random }}" 13 | when: 14 | - control_group_num | int >= 1 15 | tags: 16 | - control-pre 17 | 18 | 19 | ## System Level 20 | - include: ../system/base.yml 21 | vars: 22 | random_hosts: random_one_control_host 23 | node_group: control 24 | tags: 25 | - system 26 | - control 27 | - control-system 28 | -------------------------------------------------------------------------------- /playbooks/control/drill_lb.yml: -------------------------------------------------------------------------------- 1 | - name: Preparing for lb node tests 2 | hosts: localhost 3 | connection: local 4 | gather_facts: false 5 | vars: 6 | shuffle_lb_hosts: "{{ groups['lb'] | shuffle }}" 7 | lb_group_num: "{{ groups['lb'] | length }}" 8 | tasks: 9 | - name: Random select one lb host 10 | add_host: 11 | groups: random_one_lb_host 12 | name: "{{ groups['lb'] | random }}" 13 | when: 14 | - lb_group_num | int >= 1 15 | 16 | - name: Random select two lb hosts 17 | add_host: 18 | groups: random_two_lb_hosts 19 | name: "{{ item }}" 20 | with_items: 21 | - "{{ shuffle_lb_hosts[:2] }}" 22 | when: 23 | - lb_group_num | int >= 2 24 | 25 | - name: Random select three lb hosts 26 | add_host: 27 | groups: random_three_lb_hosts 28 | name: "{{ item }}" 29 | with_items: 30 | - "{{ shuffle_lb_hosts[:3] }}" 31 | when: 32 | - lb_group_num | int >= 3 33 | tags: 34 | - lb-pre 35 | 36 | - include: haproxy/service.yml 37 | tags: 38 | - service 39 
| - lb 40 | - lb-service 41 | - control 42 | -------------------------------------------------------------------------------- /playbooks/control/drill_mc.yml: -------------------------------------------------------------------------------- 1 | - name: Preparing for mc node tests 2 | hosts: localhost 3 | connection: local 4 | gather_facts: false 5 | vars: 6 | shuffle_mc_hosts: "{{ groups['mc'] | shuffle }}" 7 | mc_group_num: "{{ groups['mc'] | length }}" 8 | tasks: 9 | - name: Random select one mc host 10 | add_host: 11 | groups: random_one_mc_host 12 | name: "{{ groups['mc'] | random }}" 13 | when: 14 | - mc_group_num | int >= 1 15 | 16 | - name: Random select two mc hosts 17 | add_host: 18 | groups: random_two_mc_hosts 19 | name: "{{ item }}" 20 | with_items: 21 | - "{{ shuffle_mc_hosts[:2] }}" 22 | when: 23 | - mc_group_num | int >= 2 24 | 25 | - name: Random select three mc hosts 26 | add_host: 27 | groups: random_three_mc_hosts 28 | name: "{{ item }}" 29 | with_items: 30 | - "{{ shuffle_mc_hosts[:3] }}" 31 | when: 32 | - mc_group_num | int >= 3 33 | tags: 34 | - mc-pre 35 | 36 | - include: memcache/service.yml 37 | tags: 38 | - service 39 | - mc 40 | - mc-service 41 | - control 42 | -------------------------------------------------------------------------------- /playbooks/control/drill_mq.yml: -------------------------------------------------------------------------------- 1 | - name: Preparing for mq node tests 2 | hosts: localhost 3 | connection: local 4 | gather_facts: false 5 | vars: 6 | shuffle_mq_hosts: "{{ groups['mq'] | shuffle }}" 7 | mq_group_num: "{{ groups['mq'] | length }}" 8 | tasks: 9 | - name: Random select one mq host 10 | add_host: 11 | groups: random_one_mq_host 12 | name: "{{ groups['mq'] | random }}" 13 | when: 14 | - mq_group_num | int >= 1 15 | 16 | - name: Random select two mq hosts 17 | add_host: 18 | groups: random_two_mq_hosts 19 | name: "{{ item }}" 20 | with_items: 21 | - "{{ shuffle_mq_hosts[:2] }}" 22 | when: 23 | - mq_group_num 
| int >= 2 24 | 25 | - name: Random select three mq hosts 26 | add_host: 27 | groups: random_three_mq_hosts 28 | name: "{{ item }}" 29 | with_items: 30 | - "{{ shuffle_mq_hosts[:3] }}" 31 | when: 32 | - mq_group_num | int >= 3 33 | tags: 34 | - mq-pre 35 | 36 | - include: rabbitmq/service.yml 37 | tags: 38 | - service 39 | - mq 40 | - mq-service 41 | - control 42 | 43 | # We mainly test the impact to the RabbitMQ cluster when 44 | # there are network problems 45 | - include: rabbitmq/system.yml 46 | tags: 47 | - system 48 | - mq 49 | - mq-system 50 | - control 51 | 52 | # Stress RabbitMQ Cluster 53 | - include: rabbitmq/stress.yml 54 | tags: 55 | - service 56 | - mq 57 | - mq-service 58 | - control 59 | - mq-stress 60 | -------------------------------------------------------------------------------- /playbooks/control/glance/service.yml: -------------------------------------------------------------------------------- 1 | # Stop and start glance releated services on control node 2 | 3 | # glance-api 4 | - include: ../../common/service.yml 5 | vars: 6 | random_hosts: random_one_api_host 7 | service_name: openstack-glance-api 8 | 9 | - include: ../../common/service.yml 10 | vars: 11 | random_hosts: random_two_api_hosts 12 | service_name: openstack-glance-api 13 | 14 | - include: ../../common/service.yml 15 | vars: 16 | random_hosts: random_three_api_hosts 17 | service_name: openstack-glance-api 18 | 19 | # glance-registry 20 | - include: ../../common/service.yml 21 | vars: 22 | random_hosts: random_one_api_host 23 | service_name: openstack-glance-registry 24 | 25 | - include: ../../common/service.yml 26 | vars: 27 | random_hosts: random_two_api_hosts 28 | service_name: openstack-glance-registry 29 | 30 | - include: ../../common/service.yml 31 | vars: 32 | random_hosts: random_three_api_hosts 33 | service_name: openstack-glance-registry 34 | -------------------------------------------------------------------------------- /playbooks/control/haproxy/service.yml: 
-------------------------------------------------------------------------------- 1 | # Stop and start haproxy related services on control node 2 | 3 | - include: ../../common/service.yml 4 | vars: 5 | random_hosts: random_one_lb_host 6 | service_name: haproxy 7 | 8 | - include: ../../common/service.yml 9 | vars: 10 | random_hosts: random_two_lb_hosts 11 | service_name: haproxy 12 | 13 | - include: ../../common/service.yml 14 | vars: 15 | random_hosts: random_three_lb_hosts 16 | service_name: haproxy 17 | -------------------------------------------------------------------------------- /playbooks/control/keystone/service.yml: -------------------------------------------------------------------------------- 1 | # Stop and start keystone related services on control node 2 | 3 | - include: ../../common/service.yml 4 | vars: 5 | random_hosts: random_one_api_host 6 | service_name: httpd 7 | 8 | - include: ../../common/service.yml 9 | vars: 10 | random_hosts: random_two_api_hosts 11 | service_name: httpd 12 | 13 | - include: ../../common/service.yml 14 | vars: 15 | random_hosts: random_three_api_hosts 16 | service_name: httpd 17 | -------------------------------------------------------------------------------- /playbooks/control/memcache/service.yml: -------------------------------------------------------------------------------- 1 | # Stop and start memcache related services on control node 2 | 3 | - include: ../../common/service.yml 4 | vars: 5 | random_hosts: random_one_mc_host 6 | service_name: memcached 7 | 8 | - include: ../../common/service.yml 9 | vars: 10 | random_hosts: random_two_mc_hosts 11 | service_name: memcached 12 | 13 | - include: ../../common/service.yml 14 | vars: 15 | random_hosts: random_three_mc_hosts 16 | service_name: memcached 17 | -------------------------------------------------------------------------------- /playbooks/control/mysql/service.yml: -------------------------------------------------------------------------------- 1 | # Stop and
start mariadb related services on control node 2 | 3 | - include: ../../common/service.yml 4 | vars: 5 | random_hosts: random_one_db_host 6 | service_name: mysql 7 | 8 | - include: ../../common/service.yml 9 | vars: 10 | random_hosts: random_two_db_hosts 11 | service_name: mysql 12 | 13 | - include: ../../common/service.yml 14 | vars: 15 | random_hosts: random_three_db_hosts 16 | service_name: mysql 17 | -------------------------------------------------------------------------------- /playbooks/control/mysql/stress.yml: -------------------------------------------------------------------------------- 1 | - include: ../../common/next.yml 2 | vars: 3 | case_name: "Stress MySQL Cluster using sysbench" 4 | 5 | - name: "Drill the case" 6 | hosts: random_one_db_host 7 | gather_facts: false 8 | vars: 9 | execute: "{{ hostvars['localhost']['execute'] }}" 10 | tasks: 11 | - include_role: 12 | name: control 13 | tasks_from: stress_db 14 | when: 15 | - execute == "y" or execute == true 16 | -------------------------------------------------------------------------------- /playbooks/control/mysql/system.yml: -------------------------------------------------------------------------------- 1 | # For MySQL Cluster, we mainly test network partition 2 | - include: ../../system/nic.yml 3 | vars: 4 | random_hosts: random_one_db_host 5 | node_group: db 6 | nic_type: mgmt 7 | nic: "{{ mgmt_nic_name }}" 8 | tags: 9 | - db-mgmt-nic 10 | -------------------------------------------------------------------------------- /playbooks/control/neutron/service.yml: -------------------------------------------------------------------------------- 1 | # Stop and start neutron related services on control node 2 | 3 | - include: ../../common/service.yml 4 | vars: 5 | random_hosts: random_one_api_host 6 | service_name: neutron-server 7 | 8 | - include: ../../common/service.yml 9 | vars: 10 | random_hosts: random_two_api_hosts 11 | service_name: neutron-server 12 | 13 | - include:
../../common/service.yml 14 | vars: 15 | random_hosts: random_three_api_hosts 16 | service_name: neutron-server 17 | -------------------------------------------------------------------------------- /playbooks/control/nova/service.yml: -------------------------------------------------------------------------------- 1 | # Stop and start nova related services on control node 2 | 3 | # nova-api 4 | - include: ../../common/service.yml 5 | vars: 6 | random_hosts: random_one_api_host 7 | service_name: openstack-nova-api 8 | 9 | - include: ../../common/service.yml 10 | vars: 11 | random_hosts: random_two_api_hosts 12 | service_name: openstack-nova-api 13 | 14 | - include: ../../common/service.yml 15 | vars: 16 | random_hosts: random_three_api_hosts 17 | service_name: openstack-nova-api 18 | 19 | # nova-scheduler 20 | - include: ../../common/service.yml 21 | vars: 22 | random_hosts: random_one_api_host 23 | service_name: openstack-nova-scheduler 24 | 25 | - include: ../../common/service.yml 26 | vars: 27 | random_hosts: random_two_api_hosts 28 | service_name: openstack-nova-scheduler 29 | 30 | - include: ../../common/service.yml 31 | vars: 32 | random_hosts: random_three_api_hosts 33 | service_name: openstack-nova-scheduler 34 | 35 | # nova-conductor 36 | - include: ../../common/service.yml 37 | vars: 38 | random_hosts: random_one_api_host 39 | service_name: openstack-nova-conductor 40 | 41 | - include: ../../common/service.yml 42 | vars: 43 | random_hosts: random_two_api_hosts 44 | service_name: openstack-nova-conductor 45 | 46 | - include: ../../common/service.yml 47 | vars: 48 | random_hosts: random_three_api_hosts 49 | service_name: openstack-nova-conductor 50 | -------------------------------------------------------------------------------- /playbooks/control/rabbitmq/service.yml: -------------------------------------------------------------------------------- 1 | # Stop and start rabbitmq related services on control node 2 | 3 | - include: ../../common/service.yml 4 
| vars: 5 | random_hosts: random_one_mq_host 6 | service_name: rabbitmq-server 7 | 8 | - include: ../../common/service.yml 9 | vars: 10 | random_hosts: random_two_mq_hosts 11 | service_name: rabbitmq-server 12 | 13 | - include: ../../common/service.yml 14 | vars: 15 | random_hosts: random_three_mq_hosts 16 | service_name: rabbitmq-server 17 | -------------------------------------------------------------------------------- /playbooks/control/rabbitmq/stress.yml: -------------------------------------------------------------------------------- 1 | # This playbook will do specified service drill test cases, and if 2 | # in manual mode, it will prompt to ask if you want to recover it. 3 | 4 | - include: ../../common/next.yml 5 | vars: 6 | case_name: "CASE: Start to stress RabbitMQ" 7 | 8 | - name: "Drill the case" 9 | hosts: localhost 10 | gather_facts: true 11 | vars: 12 | execute: "{{ hostvars['localhost']['execute'] }}" 13 | tasks: 14 | - include_role: 15 | name: control 16 | tasks_from: stress_mq 17 | when: 18 | - execute == "y" or execute == true 19 | 20 | - include: ../../common/ask.yml 21 | 22 | - name: "Recover the fault" 23 | hosts: random_one_mq_host 24 | gather_facts: false 25 | vars: 26 | execute: "{{ hostvars['localhost']['execute'] }}" 27 | recovery: "{{ hostvars['localhost']['recovery'] }}" 28 | tasks: 29 | - include_role: 30 | name: control 31 | tasks_from: purge_queue 32 | when: 33 | - mode == "auto" or recovery == "y" or recovery == true 34 | - execute == "y" or execute == true 35 | -------------------------------------------------------------------------------- /playbooks/control/rabbitmq/system.yml: -------------------------------------------------------------------------------- 1 | # For RabbitMQ Cluster, we mainly test network partition 2 | - include: ../../system/nic.yml 3 | vars: 4 | random_hosts: random_one_mq_host 5 | node_group: mq 6 | nic_type: mgmt 7 | nic: "{{ mgmt_nic_name }}" 8 | tags: 9 | - mq-mgmt-nic 10 | 
-------------------------------------------------------------------------------- /playbooks/drill.yml: -------------------------------------------------------------------------------- 1 | - include: drill_storage.yml 2 | - include: drill_control.yml 3 | - include: drill_compute.yml 4 | - include: drill_network.yml 5 | -------------------------------------------------------------------------------- /playbooks/drill_compute.yml: -------------------------------------------------------------------------------- 1 | - name: Preparing for compute node tests 2 | hosts: localhost 3 | connection: local 4 | gather_facts: false 5 | vars: 6 | compute_group_num: "{{ groups['compute'] | length }}" 7 | shuffle_compute_hosts: "{{ groups['compute'] | shuffle }}" 8 | tasks: 9 | - name: Random select one compute host 10 | add_host: 11 | groups: random_one_compute_host 12 | name: "{{ groups['compute'] | random }}" 13 | when: 14 | - compute_group_num | int >= 1 15 | 16 | - name: Random select two compute hosts 17 | add_host: 18 | groups: random_two_compute_hosts 19 | name: "{{ item }}" 20 | with_items: 21 | - "{{ shuffle_compute_hosts[:2] }}" 22 | when: 23 | - compute_group_num | int >= 2 24 | 25 | - name: Random select three compute hosts 26 | add_host: 27 | groups: random_three_compute_hosts 28 | name: "{{ item }}" 29 | with_items: 30 | - "{{ shuffle_compute_hosts[:3] }}" 31 | when: 32 | - compute_group_num | int >= 3 33 | tags: 34 | - compute-pre 35 | 36 | - include: compute/system.yml 37 | tags: 38 | - system 39 | - compute 40 | - compute-system 41 | 42 | - include: compute/service.yml 43 | tags: 44 | - service 45 | - compute 46 | - compute-service 47 | -------------------------------------------------------------------------------- /playbooks/drill_control.yml: -------------------------------------------------------------------------------- 1 | - include: control/drill_hy.yml 2 | - include: control/drill_lb.yml 3 | - include: control/drill_api.yml 4 | - include: 
control/drill_mc.yml 5 | - include: control/drill_mq.yml 6 | - include: control/drill_db.yml 7 | -------------------------------------------------------------------------------- /playbooks/drill_network.yml: -------------------------------------------------------------------------------- 1 | - name: Preparing for network node tests 2 | hosts: localhost 3 | connection: local 4 | gather_facts: false 5 | vars: 6 | shuffle_network_hosts: "{{ groups['network'] | shuffle }}" 7 | network_group_num: "{{ groups['network'] | length }}" 8 | tasks: 9 | - name: Random select one network host 10 | add_host: 11 | groups: random_one_network_host 12 | name: "{{ groups['network'] | random }}" 13 | when: 14 | - network_group_num | int >= 1 15 | 16 | - name: Random select two network hosts 17 | add_host: 18 | groups: random_two_network_hosts 19 | name: "{{ item }}" 20 | with_items: 21 | - "{{ shuffle_network_hosts[:2] }}" 22 | when: 23 | - network_group_num | int >= 2 24 | 25 | - name: Random select three network hosts 26 | add_host: 27 | groups: random_three_network_hosts 28 | name: "{{ item }}" 29 | with_items: 30 | - "{{ shuffle_network_hosts[:3] }}" 31 | when: 32 | - network_group_num | int >= 3 33 | tags: 34 | - network-pre 35 | 36 | - include: network/system.yml 37 | tags: 38 | - system 39 | - network 40 | - network-system 41 | 42 | - include: network/service.yml 43 | tags: 44 | - service 45 | - network 46 | - network-service 47 | -------------------------------------------------------------------------------- /playbooks/drill_storage.yml: -------------------------------------------------------------------------------- 1 | - include: storage/drill_mon.yml 2 | - include: storage/drill_rgw.yml 3 | - include: storage/drill_osd.yml 4 | -------------------------------------------------------------------------------- /playbooks/network/service.yml: -------------------------------------------------------------------------------- 1 | ## Service Level 2 | # Stop and start neutron related
services on network node 3 | 4 | # neutron-dhcp-agent 5 | - include: ../common/service.yml 6 | vars: 7 | random_hosts: random_one_network_host 8 | service_name: neutron-dhcp-agent 9 | 10 | - include: ../common/service.yml 11 | vars: 12 | random_hosts: random_two_network_hosts 13 | service_name: neutron-dhcp-agent 14 | 15 | - include: ../common/service.yml 16 | vars: 17 | random_hosts: random_three_network_hosts 18 | service_name: neutron-dhcp-agent 19 | 20 | # neutron-lbaasv2-agent 21 | - include: ../common/service.yml 22 | vars: 23 | random_hosts: random_one_network_host 24 | service_name: neutron-lbaasv2-agent 25 | 26 | - include: ../common/service.yml 27 | vars: 28 | random_hosts: random_two_network_hosts 29 | service_name: neutron-lbaasv2-agent 30 | 31 | - include: ../common/service.yml 32 | vars: 33 | random_hosts: random_three_network_hosts 34 | service_name: neutron-lbaasv2-agent 35 | 36 | # neutron-metadata-agent 37 | - include: ../common/service.yml 38 | vars: 39 | random_hosts: random_one_network_host 40 | service_name: neutron-metadata-agent 41 | 42 | - include: ../common/service.yml 43 | vars: 44 | random_hosts: random_two_network_hosts 45 | service_name: neutron-metadata-agent 46 | 47 | - include: ../common/service.yml 48 | vars: 49 | random_hosts: random_three_network_hosts 50 | service_name: neutron-metadata-agent 51 | 52 | # neutron-openvswitch-agent 53 | - include: ../common/service.yml 54 | vars: 55 | random_hosts: random_one_network_host 56 | service_name: neutron-openvswitch-agent 57 | 58 | - include: ../common/service.yml 59 | vars: 60 | random_hosts: random_two_network_hosts 61 | service_name: neutron-openvswitch-agent 62 | 63 | - include: ../common/service.yml 64 | vars: 65 | random_hosts: random_three_network_hosts 66 | service_name: neutron-openvswitch-agent 67 | 68 | # neutron-vpn-agent 69 | - include: ../common/service.yml 70 | vars: 71 | random_hosts: random_one_network_host 72 | service_name: neutron-vpn-agent 73 | 74 | - include: 
../common/service.yml 75 | vars: 76 | random_hosts: random_two_network_hosts 77 | service_name: neutron-vpn-agent 78 | 79 | - include: ../common/service.yml 80 | vars: 81 | random_hosts: random_three_network_hosts 82 | service_name: neutron-vpn-agent 83 | 84 | # openvswitch-nonetwork 85 | - include: ../common/service.yml 86 | vars: 87 | random_hosts: random_one_network_host 88 | service_name: openvswitch-nonetwork # ovs-xxx 89 | -------------------------------------------------------------------------------- /playbooks/network/system.yml: -------------------------------------------------------------------------------- 1 | ## System Level 2 | - include: ../system/base.yml 3 | vars: 4 | random_hosts: random_one_network_host 5 | node_group: network 6 | 7 | - include: ../system/nic.yml 8 | vars: 9 | random_hosts: random_one_network_host 10 | node_group: network 11 | nic_type: tenant 12 | nic: "{{ tenant_nic_name }}" 13 | tags: 14 | - network-tenant-nic 15 | 16 | - include: ../system/nic.yml 17 | vars: 18 | random_hosts: random_one_network_host 19 | node_group: network 20 | nic_type: provider 21 | nic: "{{ provider_nic_name }}" 22 | tags: 23 | - network-provider-nic 24 | 25 | - include: ../system/nic.yml 26 | vars: 27 | random_hosts: random_one_network_host 28 | node_group: network 29 | nic_type: external 30 | nic: "{{ external_nic_name }}" 31 | tags: 32 | - network-external-nic 33 | 34 | - include: ../common/port.yml 35 | vars: 36 | random_hosts: random_one_network_host 37 | ovs_port: "{{ physical_network_bridge_port }}" 38 | ovs_bridge: "{{ physical_network_bridge }}" 39 | tags: 40 | - network-physical-network-port 41 | 42 | - include: ../common/port.yml 43 | vars: 44 | random_hosts: random_one_network_host 45 | ovs_port: "{{ external_network_bridge_port }}" 46 | ovs_bridge: "{{ external_network_bridge }}" 47 | tags: 48 | - network-external-network-port 49 | -------------------------------------------------------------------------------- 
/playbooks/storage/drill_mon.yml: -------------------------------------------------------------------------------- 1 | - name: Preparing for mon node tests 2 | hosts: localhost 3 | connection: local 4 | gather_facts: false 5 | vars: 6 | shuffle_mon_hosts: "{{ groups['mon'] | shuffle }}" 7 | mon_group_num: "{{ groups['mon'] | length }}" 8 | tasks: 9 | - name: Random select one mon host 10 | add_host: 11 | groups: random_one_mon_host 12 | name: "{{ groups['mon'] | random }}" 13 | when: 14 | - mon_group_num | int >= 1 15 | 16 | - name: Random select two mon hosts 17 | add_host: 18 | groups: random_two_mon_hosts 19 | name: "{{ item }}" 20 | with_items: 21 | - "{{ shuffle_mon_hosts[:2] }}" 22 | when: 23 | - mon_group_num | int >= 2 24 | 25 | - name: Random select three mon hosts 26 | add_host: 27 | groups: random_three_mon_hosts 28 | name: "{{ item }}" 29 | with_items: 30 | - "{{ shuffle_mon_hosts[:3] }}" 31 | when: 32 | - mon_group_num | int >= 3 33 | tags: 34 | - mon-pre 35 | 36 | 37 | ## System Level 38 | - include: ../system/base.yml 39 | vars: 40 | random_hosts: random_one_mon_host 41 | node_group: mon 42 | tags: 43 | - system 44 | - mon 45 | - mon-system 46 | 47 | ## Service Level 48 | - include: mon/kill_mon.yml 49 | vars: 50 | random_hosts: random_one_mon_host 51 | tags: 52 | - service 53 | - mon 54 | - mon-service 55 | - mon-down 56 | 57 | - include: mon/damage_mon.yml 58 | vars: 59 | random_hosts: random_one_mon_host 60 | tags: 61 | - service 62 | - mon 63 | - mon-service 64 | - mon-down 65 | 66 | - include: mon/kill_mon.yml 67 | vars: 68 | random_hosts: random_two_mon_hosts 69 | tags: 70 | - service 71 | - mon 72 | - mon-service 73 | - mon-down 74 | 75 | - include: mon/kill_mon.yml 76 | vars: 77 | random_hosts: random_three_mon_hosts 78 | tags: 79 | - service 80 | - mon 81 | - mon-service 82 | - mon-down 83 | -------------------------------------------------------------------------------- /playbooks/storage/drill_osd.yml: 
-------------------------------------------------------------------------------- 1 | - name: Preparing for osd node tests 2 | hosts: localhost 3 | connection: local 4 | gather_facts: false 5 | vars: 6 | osd_group_num: "{{ groups['osd'] | length }}" 7 | get_osd_script_path: "../../files/get-random-osd.py" 8 | ceph_client_host: "{{ groups['storage'][0] }}" 9 | osd_down_precentage: 30 10 | tasks: 11 | - name: Random select one osd host 12 | add_host: 13 | groups: random_one_osd_host 14 | name: "{{ groups['osd'] | random }}" 15 | when: 16 | - osd_group_num | int >= 1 17 | 18 | - name: Copy get-random-osd.py to storage node 19 | copy: 20 | src: "{{ get_osd_script_path }}" 21 | dest: /tmp/get-random-osd.py 22 | mode: 0644 23 | delegate_to: "{{ ceph_client_host }}" 24 | 25 | - shell: "python get-random-osd.py --get-pg-osd --mgmt-osd 26 | --format json --file /tmp/get-pg-osd" 27 | args: 28 | chdir: /tmp 29 | delegate_to: "{{ ceph_client_host }}" 30 | 31 | - fetch: 32 | src: "/tmp/get-pg-osd.json" 33 | dest: "/tmp/get-pg-osd.json" 34 | flat: yes 35 | fail_on_missing: yes 36 | delegate_to: "{{ ceph_client_host }}" 37 | 38 | - include_vars: 39 | name: _get_pg_osd 40 | file: "/tmp/get-pg-osd.json" 41 | 42 | - file: 43 | path: "/tmp/get-pg-osd.json" 44 | state: absent 45 | 46 | - name: Select one osd in random PG 47 | add_host: 48 | groups: one_osd_pg_hosts 49 | name: "{{ item.key }}" 50 | one_osd_pg_list: "{{ item.value }}" 51 | with_dict: "{{ _get_pg_osd['items'][0] }}" 52 | when: _get_pg_osd['size'] >= 1 53 | 54 | - name: Select two osds in random PG 55 | add_host: 56 | groups: two_osds_pg_hosts 57 | name: "{{ item.key }}" 58 | two_osds_pg_list: "{{ item.value }}" 59 | with_dict: "{{ _get_pg_osd['items'][1] }}" 60 | when: _get_pg_osd['size'] >= 2 61 | 62 | - name: Select three osds in random PG 63 | add_host: 64 | groups: three_osds_pg_hosts 65 | name: "{{ item.key }}" 66 | three_osds_pg_list: "{{ item.value }}" 67 | with_dict: "{{ _get_pg_osd['items'][2] }}" 68 | when: 
_get_pg_osd['size'] >= 3 69 | 70 | - shell: "python get-random-osd.py --get-random-osd 71 | --percentage {{ osd_down_precentage }} --mgmt-osd 72 | --format json --file /tmp/get-random-osd" 73 | args: 74 | chdir: /tmp 75 | delegate_to: "{{ ceph_client_host }}" 76 | 77 | - fetch: 78 | src: "/tmp/get-random-osd.json" 79 | dest: "/tmp/get-random-osd.json" 80 | flat: yes 81 | fail_on_missing: yes 82 | delegate_to: "{{ ceph_client_host }}" 83 | 84 | - include_vars: 85 | name: _get_random_osd 86 | file: "/tmp/get-random-osd.json" 87 | 88 | - file: 89 | path: "/tmp/get-random-osd.json" 90 | state: absent 91 | 92 | - name: Select some random osds 93 | add_host: 94 | groups: random_osd_hosts 95 | name: "{{ item.key }}" 96 | random_osd_list: "{{ item.value }}" 97 | with_dict: "{{ _get_random_osd['items'] }}" 98 | 99 | - name: Absent get-random-osd.py from storage node 100 | file: 101 | path: "/tmp/get-random-osd.py" 102 | state: absent 103 | delegate_to: "{{ ceph_client_host }}" 104 | tags: 105 | - osd-pre 106 | 107 | 108 | ## System Level 109 | 110 | - include: ../system/base.yml 111 | vars: 112 | random_hosts: random_one_osd_host 113 | node_group: osd 114 | tags: 115 | - system 116 | - osd 117 | - osd-system 118 | 119 | - include: ../system/nic.yml 120 | vars: 121 | random_hosts: random_one_osd_host 122 | node_group: osd 123 | nic_type: storage 124 | nic: "{{ storage_nic_name }}" 125 | tags: 126 | - system 127 | - osd 128 | - osd-system 129 | - osd-storage-nic 130 | 131 | ## Service Level 132 | - include: osd/kill_osd.yml 133 | vars: 134 | random_hosts: one_osd_pg_hosts 135 | osd_down_list: "{{ one_osd_pg_list }}" 136 | tags: 137 | - service 138 | - osd 139 | - osd-service 140 | - osd-down 141 | 142 | - include: osd/kill_osd.yml 143 | vars: 144 | random_hosts: two_osds_pg_hosts 145 | osd_down_list: "{{ two_osds_pg_list }}" 146 | tags: 147 | - service 148 | - osd 149 | - osd-service 150 | - osd-down 151 | 152 | - include: osd/kill_osd.yml 153 | vars: 154 | random_hosts: 
three_osds_pg_hosts 155 | osd_down_list: "{{ three_osds_pg_list }}" 156 | tags: 157 | - service 158 | - osd 159 | - osd-service 160 | - osd-down 161 | 162 | - include: osd/kill_osd.yml 163 | vars: 164 | random_hosts: random_osd_hosts 165 | osd_down_list: "{{ random_osd_list }}" 166 | tags: 167 | - service 168 | - osd 169 | - osd-service 170 | - osd-down 171 | 172 | - include: osd/del_osd_partition.yml 173 | vars: 174 | random_hosts: one_osd_pg_hosts 175 | osd_down_list: "{{ one_osd_pg_list }}" 176 | tags: 177 | - service 178 | - osd 179 | - osd-service 180 | - osd-partition-del 181 | -------------------------------------------------------------------------------- /playbooks/storage/drill_rgw.yml: -------------------------------------------------------------------------------- 1 | - name: Preparing for rgw node tests 2 | hosts: localhost 3 | connection: local 4 | gather_facts: false 5 | vars: 6 | rgw_group_num: "{{ groups['rgw'] | length }}" 7 | shuffle_rgw_hosts: "{{ groups['rgw'] | shuffle }}" 8 | tasks: 9 | - name: Random select one rgw host 10 | add_host: 11 | groups: random_one_rgw_host 12 | name: "{{ groups['rgw'] | random }}" 13 | when: 14 | - rgw_group_num | int >= 1 15 | 16 | - name: Random select two rgw hosts 17 | add_host: 18 | groups: random_two_rgw_hosts 19 | name: "{{ item }}" 20 | with_items: 21 | - "{{ shuffle_rgw_hosts[:2] }}" 22 | when: 23 | - rgw_group_num | int >= 2 24 | 25 | - name: Random select three rgw hosts 26 | add_host: 27 | groups: random_three_rgw_hosts 28 | name: "{{ item }}" 29 | with_items: 30 | - "{{ shuffle_rgw_hosts[:3] }}" 31 | when: 32 | - rgw_group_num | int >= 3 33 | tags: 34 | - rgw-pre 35 | 36 | 37 | ## System Level 38 | - include: ../system/base.yml 39 | vars: 40 | random_hosts: random_one_rgw_host 41 | node_group: rgw 42 | tags: 43 | - system 44 | - rgw 45 | - rgw-system 46 | 47 | 48 | ## Service Level 49 | - include: rgw/kill_rgw.yml 50 | vars: 51 | random_hosts: random_one_rgw_host 52 | tags: 53 | - service 54 | - rgw 
55 | - rgw-service 56 | - rgw-down 57 | 58 | - include: rgw/kill_rgw.yml 59 | vars: 60 | random_hosts: random_two_rgw_hosts 61 | tags: 62 | - service 63 | - rgw 64 | - rgw-service 65 | - rgw-down 66 | 67 | - include: rgw/kill_rgw.yml 68 | vars: 69 | random_hosts: random_three_rgw_hosts 70 | tags: 71 | - service 72 | - rgw 73 | - rgw-service 74 | - rgw-down 75 | -------------------------------------------------------------------------------- /playbooks/storage/mon/damage_mon.yml: -------------------------------------------------------------------------------- 1 | # This playbook will do mon kill drill test case, and if in manual 2 | # mode, it will prompt to ask if you want to recover it. 3 | 4 | - include: ../../common/next.yml 5 | vars: 6 | case_name: "CASE: Ceph monitor down by error opening data directory on {{ random_hosts }}" 7 | 8 | - name: "Drill the case" 9 | hosts: "{{ random_hosts }}" 10 | gather_facts: false 11 | vars: 12 | execute: "{{ hostvars['localhost']['execute'] }}" 13 | tasks: 14 | - include_role: 15 | name: storage 16 | tasks_from: damage_mon 17 | when: 18 | - execute == "y" or execute == true 19 | 20 | - include: ../../common/ask.yml 21 | 22 | - name: "Recover the fault" 23 | hosts: "{{ random_hosts }}" 24 | gather_facts: true 25 | vars: 26 | execute: "{{ hostvars['localhost']['execute'] }}" 27 | recovery: "{{ hostvars['localhost']['recovery'] }}" 28 | tasks: 29 | - include_role: 30 | name: storage 31 | tasks_from: recover_damage_mon 32 | when: 33 | - mode == "auto" or recovery == "y" or recovery == true 34 | - execute == "y" or execute == true 35 | -------------------------------------------------------------------------------- /playbooks/storage/mon/kill_mon.yml: -------------------------------------------------------------------------------- 1 | # This playbook will do mon kill drill test case, and if in manual 2 | # mode, it will prompt to ask if you want to recover it. 
3 | 4 | - include: ../../common/next.yml 5 | vars: 6 | case_name: "CASE: Ceph monitor down on {{ random_hosts }}" 7 | 8 | - name: "Drill the case" 9 | hosts: "{{ random_hosts }}" 10 | gather_facts: false 11 | vars: 12 | execute: "{{ hostvars['localhost']['execute'] }}" 13 | tasks: 14 | - include_role: 15 | name: storage 16 | tasks_from: kill_mon 17 | when: 18 | - execute == "y" or execute == true 19 | 20 | - include: ../../common/ask.yml 21 | 22 | - name: "Recover the fault" 23 | hosts: "{{ random_hosts }}" 24 | gather_facts: true 25 | vars: 26 | execute: "{{ hostvars['localhost']['execute'] }}" 27 | recovery: "{{ hostvars['localhost']['recovery'] }}" 28 | tasks: 29 | - include_role: 30 | name: storage 31 | tasks_from: start_mon 32 | when: 33 | - mode == "auto" or recovery == "y" or recovery == true 34 | - execute == "y" or execute == true 35 | -------------------------------------------------------------------------------- /playbooks/storage/osd/del_osd_partition.yml: -------------------------------------------------------------------------------- 1 | - include: ../../common/next.yml 2 | vars: 3 | case_name: "CASE: Storage - Ceph OSD partition-table delete on {{ random_hosts }}" 4 | 5 | - name: "Drill the case" 6 | hosts: "{{ random_hosts }}" 7 | gather_facts: true 8 | vars: 9 | execute: "{{ hostvars['localhost']['execute'] }}" 10 | tasks: 11 | - block: 12 | - include_role: 13 | name: storage 14 | tasks_from: kill_osd 15 | - include_role: 16 | name: storage 17 | tasks_from: del_osd_partition 18 | when: 19 | - execute == "y" or execute == true 20 | 21 | - include: ../../common/ask.yml 22 | 23 | - name: "Recover the fault" 24 | hosts: "{{ random_hosts }}" 25 | gather_facts: true 26 | vars: 27 | execute: "{{ hostvars['localhost']['execute'] }}" 28 | recovery: "{{ hostvars['localhost']['recovery'] }}" 29 | tasks: 30 | - block: 31 | - include_role: 32 | name: storage 33 | tasks_from: recover_osd_partition 34 | - include_role: 35 | name: storage 36 | tasks_from: 
start_osd 37 | when: 38 | - mode == "auto" or recovery == "y" or recovery == true 39 | - execute == "y" or execute == true 40 | -------------------------------------------------------------------------------- /playbooks/storage/osd/kill_osd.yml: -------------------------------------------------------------------------------- 1 | # This playbook will do osd kill drill test case, and if in manual 2 | # mode, it will prompt to ask if you want to recover it. 3 | 4 | - include: ../../common/next.yml 5 | vars: 6 | case_name: "CASE: Storage - Ceph OSD down on {{ random_hosts }}" 7 | 8 | - name: "Drill the case" 9 | hosts: "{{ random_hosts }}" 10 | gather_facts: true 11 | vars: 12 | execute: "{{ hostvars['localhost']['execute'] }}" 13 | tasks: 14 | - include_role: 15 | name: storage 16 | tasks_from: kill_osd 17 | when: 18 | - execute == "y" or execute == true 19 | 20 | - include: ../../common/ask.yml 21 | 22 | - name: "Recover the fault" 23 | hosts: "{{ random_hosts }}" 24 | gather_facts: true 25 | vars: 26 | execute: "{{ hostvars['localhost']['execute'] }}" 27 | recovery: "{{ hostvars['localhost']['recovery'] }}" 28 | tasks: 29 | - include_role: 30 | name: storage 31 | tasks_from: start_osd 32 | when: 33 | - mode == "auto" or recovery == "y" or recovery == true 34 | - execute == "y" or execute == true 35 | -------------------------------------------------------------------------------- /playbooks/storage/rgw/kill_rgw.yml: -------------------------------------------------------------------------------- 1 | # This playbook will do radosgw kill drill test case, and if 2 | # in manual mode, it will prompt to ask if you want to recover 3 | # it. 
4 | 5 | - include: ../../common/next.yml 6 | vars: 7 | case_name: "CASE: Ceph radosgw down on {{ random_hosts }}" 8 | 9 | - name: "Drill the case" 10 | hosts: "{{ random_hosts }}" 11 | gather_facts: false 12 | vars: 13 | execute: "{{ hostvars['localhost']['execute'] }}" 14 | tasks: 15 | - include_role: 16 | name: storage 17 | tasks_from: kill_rgw 18 | when: 19 | - execute == "y" or execute == true 20 | 21 | - include: ../../common/ask.yml 22 | 23 | - name: "Recover the fault" 24 | hosts: "{{ random_hosts }}" 25 | gather_facts: true 26 | vars: 27 | execute: "{{ hostvars['localhost']['execute'] }}" 28 | recovery: "{{ hostvars['localhost']['recovery'] }}" 29 | tasks: 30 | - include_role: 31 | name: storage 32 | tasks_from: start_rgw 33 | when: 34 | - mode == "auto" or recovery == "y" or recovery == true 35 | - execute == "y" or execute == true 36 | -------------------------------------------------------------------------------- /playbooks/system/base.yml: -------------------------------------------------------------------------------- 1 | # CPU 2 | - include: cpu_load.yml 3 | vars: 4 | cpu_stress_load: "80%" 5 | cpu_stress_timeout: "5m" 6 | tags: 7 | - "{{ node_group }}-cpu" 8 | 9 | - include: cpu_load.yml 10 | vars: 11 | cpu_stress_load: "90%" 12 | cpu_stress_timeout: "5m" 13 | tags: 14 | - "{{ node_group }}-cpu" 15 | 16 | - include: cpu_load.yml 17 | vars: 18 | cpu_stress_load: "100%" 19 | cpu_stress_timeout: "5m" 20 | tags: 21 | - "{{ node_group }}-cpu" 22 | 23 | # Memory 24 | - include: mem_load.yml 25 | vars: 26 | mem_stress_load: "80%" 27 | mem_stress_timeout: "5m" 28 | tags: 29 | - "{{ node_group }}-mem" 30 | 31 | - include: mem_load.yml 32 | vars: 33 | mem_stress_load: "90%" 34 | mem_stress_timeout: "5m" 35 | tags: 36 | - "{{ node_group }}-mem" 37 | 38 | - include: mem_load.yml 39 | vars: 40 | mem_stress_load: "100%" 41 | mem_stress_timeout: "5m" 42 | tags: 43 | - "{{ node_group }}-mem" 44 | 45 | # Disk(root disk) 46 | - include: disk_load.yml 47 | vars: 48 | 
disk_stress_load: "80%" 49 | disk_stress_timeout: "5m" 50 | tags: 51 | - "{{ node_group }}-root-disk" 52 | 53 | - include: disk_load.yml 54 | vars: 55 | disk_stress_load: "90%" 56 | disk_stress_timeout: "5m" 57 | tags: 58 | - "{{ node_group }}-root-disk" 59 | 60 | # Nic delay(Management Network) 61 | - include: nic_delay.yml 62 | vars: 63 | nic_delay_time: "200ms" 64 | nic_delay_timeout: "5m" 65 | nic: "{{ mgmt_nic_name }}" 66 | tags: 67 | - "{{ node_group }}-mgmt-nic-delay" 68 | 69 | - include: nic_delay.yml 70 | vars: 71 | nic_delay_time: "300ms" 72 | nic_delay_timeout: "5m" 73 | nic: "{{ mgmt_nic_name }}" 74 | tags: 75 | - "{{ node_group }}-mgmt-nic-delay" 76 | 77 | # Nic package loss(Management Network) 78 | - include: nic_loss.yml 79 | vars: 80 | nic_loss_percent: "80%" 81 | nic_loss_timeout: "5m" 82 | nic: "{{ mgmt_nic_name }}" 83 | tags: 84 | - "{{ node_group }}-mgmt-nic-loss" 85 | 86 | - include: nic_loss.yml 87 | vars: 88 | nic_loss_percent: "90%" 89 | nic_loss_timeout: "5m" 90 | nic: "{{ mgmt_nic_name }}" 91 | tags: 92 | - "{{ node_group }}-mgmt-nic-loss" 93 | 94 | # Nic down(Management Network) 95 | - include: nic_down.yml 96 | vars: 97 | nic_down_timeout: "5m" 98 | nic: "{{ mgmt_nic_name }}" 99 | tags: 100 | - "{{ node_group }}-mgmt-nic-down" 101 | -------------------------------------------------------------------------------- /playbooks/system/cpu_load.yml: -------------------------------------------------------------------------------- 1 | # This playbook will do cpu stress load on specified node 2 | 3 | - include: ../common/next.yml 4 | vars: 5 | case_name: "CASE: Stress {{ node_group }} host CPU load to {{ cpu_stress_load }} for {{ cpu_stress_timeout }} on {{ random_hosts }}" 6 | 7 | - name: "Drill the case" 8 | hosts: "{{ random_hosts }}" 9 | gather_facts: true 10 | vars: 11 | execute: "{{ hostvars['localhost']['execute'] }}" 12 | tasks: 13 | - include_role: 14 | name: system 15 | tasks_from: cpu_load 16 | when: 17 | - execute == "y" or execute == 
true 18 | -------------------------------------------------------------------------------- /playbooks/system/disk_load.yml: -------------------------------------------------------------------------------- 1 | # This playbook will do root file system stress load on specified node 2 | 3 | - include: ../common/next.yml 4 | vars: 5 | case_name: "CASE: Stress {{ node_group }} host root file system load to {{ disk_stress_load }} for {{ disk_stress_timeout }} on {{ random_hosts }}" 6 | 7 | - name: "Drill the case" 8 | hosts: "{{ random_hosts }}" 9 | gather_facts: true 10 | vars: 11 | execute: "{{ hostvars['localhost']['execute'] }}" 12 | tasks: 13 | - include_role: 14 | name: system 15 | tasks_from: disk_load 16 | when: 17 | - execute == "y" or execute == true 18 | -------------------------------------------------------------------------------- /playbooks/system/mem_load.yml: -------------------------------------------------------------------------------- 1 | # This playbook will do mem stress load on specified node 2 | 3 | - include: ../common/next.yml 4 | vars: 5 | case_name: "CASE: Stress {{ node_group }} host memory load to {{ mem_stress_load }} for {{ mem_stress_timeout }} on {{ random_hosts }}" 6 | 7 | - name: "Drill the case" 8 | hosts: "{{ random_hosts }}" 9 | gather_facts: true 10 | vars: 11 | execute: "{{ hostvars['localhost']['execute'] }}" 12 | tasks: 13 | - include_role: 14 | name: system 15 | tasks_from: mem_load 16 | when: 17 | - execute == "y" or execute == true 18 | -------------------------------------------------------------------------------- /playbooks/system/nic.yml: -------------------------------------------------------------------------------- 1 | # This playbook is generally to test a nic in data-plance 2 | 3 | # Nic delay(Storage Network) 4 | - include: nic_delay.yml 5 | vars: 6 | nic_delay_time: "300ms" 7 | nic_delay_timeout: "5m" 8 | tags: 9 | - "{{ node_group }}-{{ nic_type }}-nic-delay" 10 | 11 | # Nic package loss(Storage Network) 12 | - 
include: nic_loss.yml 13 | vars: 14 | nic_loss_percent: "80%" 15 | nic_loss_timeout: "5m" 16 | tags: 17 | - "{{ node_group }}-{{ nic_type }}-nic-loss" 18 | 19 | # Nic down(Storage Network) 20 | - include: nic_down.yml 21 | vars: 22 | nic_down_timeout: "5m" 23 | tags: 24 | - "{{ node_group }}-{{ nic_type }}-nic-down" 25 | -------------------------------------------------------------------------------- /playbooks/system/nic_delay.yml: -------------------------------------------------------------------------------- 1 | # This playbook will do nic delay of specified nic on specified node 2 | 3 | - include: ../common/next.yml 4 | vars: 5 | case_name: "CASE: Set {{ node_group }} host {{ nic }} delay to {{ nic_delay_time }} for {{ nic_delay_timeout }} on {{ random_hosts }}" 6 | 7 | - name: "Drill the case" 8 | hosts: "{{ random_hosts }}" 9 | gather_facts: false 10 | vars: 11 | execute: "{{ hostvars['localhost']['execute'] }}" 12 | tasks: 13 | - include_role: 14 | name: system 15 | tasks_from: nic_delay 16 | when: 17 | - execute == "y" or execute == true 18 | -------------------------------------------------------------------------------- /playbooks/system/nic_down.yml: -------------------------------------------------------------------------------- 1 | # This playbook will down specified nic on specified node 2 | 3 | - include: ../common/next.yml 4 | vars: 5 | case_name: "CASE: Set {{ node_group }} host {{ nic }} down for {{ nic_down_timeout }} on {{ random_hosts }}" 6 | 7 | - name: "Drill the case" 8 | hosts: "{{ random_hosts }}" 9 | gather_facts: false 10 | vars: 11 | execute: "{{ hostvars['localhost']['execute'] }}" 12 | tasks: 13 | - include_role: 14 | name: system 15 | tasks_from: nic_down 16 | when: 17 | - execute == "y" or execute == true 18 | -------------------------------------------------------------------------------- /playbooks/system/nic_loss.yml: -------------------------------------------------------------------------------- 1 | # This playbook will do nic 
loss of specified nic on specified node 2 | 3 | - include: ../common/next.yml 4 | vars: 5 | case_name: "CASE: Set {{ node_group }} host {{ nic }} loss to {{ nic_loss_percent }} for {{ nic_loss_timeout }} on {{ random_hosts }}" 6 | 7 | - name: "Drill the case" 8 | hosts: "{{ random_hosts }}" 9 | gather_facts: false 10 | vars: 11 | execute: "{{ hostvars['localhost']['execute'] }}" 12 | tasks: 13 | - include_role: 14 | name: system 15 | tasks_from: nic_loss 16 | when: 17 | - execute == "y" or execute == true 18 | -------------------------------------------------------------------------------- /roles/common/defaults/main.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unitedstack/gremlin/017fe09d80040019df7ed387bf1001114944f4c2/roles/common/defaults/main.yml -------------------------------------------------------------------------------- /roles/common/tasks/port_add.yml: -------------------------------------------------------------------------------- 1 | - name: Add a port to a ovs bridge 2 | shell: > 3 | ovs-vsctl add-port {{ ovs_bridge }} {{ ovs_port }} 4 | become: true 5 | -------------------------------------------------------------------------------- /roles/common/tasks/port_del.yml: -------------------------------------------------------------------------------- 1 | - name: Delete a port from a ovs bridge 2 | shell: > 3 | ovs-vsctl del-port {{ ovs_bridge }} {{ ovs_port }} 4 | become: true 5 | -------------------------------------------------------------------------------- /roles/common/tasks/start_service.yml: -------------------------------------------------------------------------------- 1 | # Start a service 2 | 3 | - block: 4 | - name: "Try to start service {{ service_name }} using systemctl" 5 | service: 6 | name: "{{ service_name }}" 7 | state: started 8 | become: true 9 | 10 | rescue: 11 | - name: "Try to start service {{ service_name }} using service" 12 | shell: service {{ 
service_name }} start 13 | become: true 14 | ignore_errors: yes 15 | 16 | always: 17 | - name: Waiting 10 seconds to warm up 18 | pause: 19 | seconds: 10 20 | -------------------------------------------------------------------------------- /roles/common/tasks/stop_service.yml: -------------------------------------------------------------------------------- 1 | # Stop a service 2 | 3 | - block: 4 | - name: "Try to stop service {{ service_name }} using systemctl" 5 | service: 6 | name: "{{ service_name }}" 7 | state: stopped 8 | become: true 9 | 10 | rescue: 11 | - name: "Try to stop service {{ service_name }} using service" 12 | shell: service {{ service_name }} stop 13 | become: true 14 | ignore_errors: yes 15 | 16 | always: 17 | - name: Waiting 10 seconds to warm down 18 | pause: 19 | seconds: 10 20 | -------------------------------------------------------------------------------- /roles/compute/README.md: -------------------------------------------------------------------------------- 1 | # Compute reliability verification and fault drill role 2 | 3 | We design test cases from the following two levels: 4 | 5 | ## System Level 6 | 7 | The compute node is for running instance, which has high demands for cpu, memory, and network. 
8 | 9 | There are four key networks used by compute node: 10 | 11 | * management 12 | * storage 13 | * vlan 14 | * tunnel 15 | 16 | ### CPU 17 | 18 | * Stress CPU load to 80% for 5 minutes 19 | * Stress CPU load to 100% for 5 minutes 20 | 21 | ### Memory 22 | 23 | * Stress memory load to 80% for 5 minutes 24 | * Stress memory load to 100% for 5 minutes 25 | 26 | ### Disk 27 | 28 | * Stress root disk util to 80% for 5 minutes 29 | * Stress root disk util to 100% for 5 minutes 30 | 31 | ### Network 32 | 33 | * Management network package loss to 80% 34 | * Management network package loss to 100% 35 | * Management network package delay to 10ms 36 | * Management network package delay to 100ms 37 | * Ifdown management nic 38 | 39 | * VLAN/Tunnel network package loss to 80% 40 | * VLAN/Tunnel network package loss to 100% 41 | * VLAN/Tunnel network package delay to 10ms 42 | * VLAN/Tunnel network package delay to 100ms 43 | * VLAN/Tunnel network package delay to 200ms 44 | * Ifdown vlan/tunnel nic 45 | 46 | * Storage network package loss to 80% 47 | * Storage network package loss to 100% 48 | * Storage network package delay to 10ms 49 | * Storage network package delay to 100ms 50 | * Storage network package delay to 200ms 51 | * Ifdown storage nic 52 | 53 | ## Service Level 54 | 55 | There are following processes running on compute node: 56 | 57 | on control plane: 58 | 59 | * nova-compute 60 | * neutron-openvswitch-agent 61 | * libvirtd 62 | 63 | on data plane: 64 | 65 | * kvm-qemu 66 | * ovsdb-server 67 | * ovs-vswitchd 68 | 69 | So we design the following test cases: 70 | 71 | * kill nova-compute 72 | * systemctl stop nova-compute 73 | * kill neutron-openvswitch-agent 74 | * systemctl stop neutron-openvswitch-agent 75 | * kill libvirtd 76 | * systemctl stop libvirtd 77 | * kill a kvm/qemu process 78 | * kill ovsdb-server 79 | * systemctl stop ovsdb-server 80 | * kill ovs-vswitchd 81 | * systemctl stop ovs-vswitchd 82 | 
-------------------------------------------------------------------------------- /roles/compute/defaults/main.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unitedstack/gremlin/017fe09d80040019df7ed387bf1001114944f4c2/roles/compute/defaults/main.yml -------------------------------------------------------------------------------- /roles/compute/meta/main.yml: -------------------------------------------------------------------------------- 1 | # Include the `common` role as a dependency. This makes sure the 2 | # # variables defined in that role are available here. 3 | dependencies: 4 | - common 5 | -------------------------------------------------------------------------------- /roles/compute/service/main.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unitedstack/gremlin/017fe09d80040019df7ed387bf1001114944f4c2/roles/compute/service/main.yml -------------------------------------------------------------------------------- /roles/control/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unitedstack/gremlin/017fe09d80040019df7ed387bf1001114944f4c2/roles/control/README.md -------------------------------------------------------------------------------- /roles/control/defaults/main.yml: -------------------------------------------------------------------------------- 1 | rabbit_host: localhost 2 | rabbit_username: guest 3 | rabbit_password: guest 4 | rabbit_exchange: gremlin 5 | exchange_durable: False 6 | exchange_auto_delete: False 7 | rabbit_queue: notifications.info 8 | queue_durable: True 9 | queue_auto_delete: False 10 | routing_key: notifications.info 11 | threads: 100 12 | msg_per_thread: 10000 13 | -------------------------------------------------------------------------------- /roles/control/files/stress_mq.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import argparse 3 | import pika 4 | import threading 5 | 6 | parser = argparse.ArgumentParser("./stress_mq.py", 7 | description='Publish messages to RabbitMQ') 8 | parser.add_argument('-H', '--rabbit-host', default='localhost', 9 | help="RabbitMQ Host Address") 10 | parser.add_argument('-u', '--rabbit-username', default='guest', 11 | help="RabbitMQ username") 12 | parser.add_argument('-p', '--rabbit-password', default='guest', 13 | help="RabbitMQ password") 14 | parser.add_argument('--rabbit-exchange', default='gremlin', 15 | help="The exchange to stress in RabbitMQ") 16 | parser.add_argument('--exchange-durable', default='False', 17 | help="Set exchange to durable or not") 18 | parser.add_argument('--exchange-auto-delete', default='False', 19 | help="Set exchange to auto_delete or not") 20 | parser.add_argument('--rabbit-queue', default='notifications.info', 21 | help="The queue to stress in RabbitMQ") 22 | parser.add_argument('--queue-durable', default='False', 23 | help="Set queue to durable or not") 24 | parser.add_argument('--queue-auto-delete', default='False', 25 | help="Set queue to auto_delete or not") 26 | parser.add_argument('--routing-key', default='notifications.info', 27 | help="The routing_key the queue will bind with") 28 | parser.add_argument('-t', '--threads', type=int, default=100, 29 | help="The threading number will spawned to do publish messages") 30 | parser.add_argument('-n', '--msg-per-thread', type=int, default=10000, 31 | help="Message number every thread will publish") 32 | args = parser.parse_args() 33 | 34 | credentials = pika.PlainCredentials(args.rabbit_username, args.rabbit_password) 35 | parameters = pika.ConnectionParameters(host=args.rabbit_host, 36 | credentials=credentials) 37 | 38 | def str2bool(v): 39 | return v.lower() in ('true', 'yes', '1') 40 | 41 | def publish(): 42 | connection = 
pika.BlockingConnection(parameters) 43 | channel = connection.channel() 44 | 45 | channel.exchange_declare(exchange=args.rabbit_exchange, 46 | durable=str2bool(args.exchange_durable), 47 | auto_delete=str2bool(args.exchange_auto_delete), 48 | type="topic") 49 | channel.queue_declare(queue=args.rabbit_queue, 50 | durable=str2bool(args.queue_durable), 51 | auto_delete=str2bool(args.queue_auto_delete)) 52 | channel.queue_bind(args.rabbit_queue, args.rabbit_exchange, 53 | args.routing_key) 54 | 55 | message = 'Gremlin Coming!' 56 | count = 0 57 | while count < args.msg_per_thread: 58 | channel.basic_publish(exchange=args.rabbit_exchange, 59 | routing_key=args.routing_key, 60 | body=message) 61 | count = count + 1 62 | connection.close() 63 | 64 | threads = [threading.Thread(target=publish) for i in range(args.threads)] 65 | 66 | for t in threads: 67 | t.start() 68 | 69 | for t in threads: 70 | t.join() 71 | -------------------------------------------------------------------------------- /roles/control/meta/main.yml: -------------------------------------------------------------------------------- 1 | # Include the `common` role as a dependency. This makes sure the 2 | # # variables defined in that role are available here. 
3 | dependencies: 4 | - common 5 | -------------------------------------------------------------------------------- /roles/control/tasks/purge_queue.yml: -------------------------------------------------------------------------------- 1 | - name: Purge RabbitMQ queue 2 | shell: > 3 | /usr/local/bin/rabbitmqadmin purge queue name={{ rabbit_queue }} 4 | --username={{ rabbit_username }} --password={{ rabbit_password }} 5 | become: true 6 | -------------------------------------------------------------------------------- /roles/control/tasks/stress_db.yml: -------------------------------------------------------------------------------- 1 | - name: Install sysbench 2 | package: 3 | name: sysbench 4 | state: present 5 | delegate_to: localhost 6 | become: true 7 | when: manage_packages|default(false) 8 | 9 | - name: Delete sysbench database, user and grant priviledges first 10 | shell: > 11 | mysql -e "revoke all on sysbench.* from {{ sysbench_user }}@'%'"; 12 | mysql -e "drop user {{ sysbench_user }}@'%'"; 13 | mysql -e "drop database sysbench"; 14 | become: true 15 | ignore_errors: true 16 | 17 | - name: Create sysbench database, user and grant priviledges first 18 | shell: > 19 | mysql -e "create database sysbench"; 20 | mysql -e "create user '{{ sysbench_user }}'@'%' identified by '{{ sysbench_password }}'"; 21 | mysql -e "grant all on {{ sysbench_user }}.* to sysbench@'%' identified by '{{ sysbench_password }}';"; 22 | become: true 23 | 24 | - name: Stress MySQL Cluster - Prepare 25 | shell: > 26 | sysbench --time={{ stress_mysql_time }} --threads={{ sysbench_threads }} --mysql-user={{ sysbench_user }} 27 | --mysql-password={{ sysbench_password }} --mysql-db={{ sysbench_database }} --mysql-host={{ stress_mysql_host}} 28 | --db-driver=mysql --table_size=500000 29 | /usr/share/sysbench/oltp_read_only.lua prepare 30 | delegate_to: localhost 31 | ignore_errors: true 32 | 33 | - name: Stress MySQL Cluster - Run 34 | shell: > 35 | sysbench --time={{ stress_mysql_time }} 
--threads={{ sysbench_threads }} --mysql-user={{ sysbench_user }} 36 | --mysql-password={{ sysbench_password }} --mysql-db={{ sysbench_database }} --mysql-host={{ stress_mysql_host}} 37 | --db-driver=mysql --table_size=500000 38 | /usr/share/sysbench/oltp_read_only.lua run 39 | delegate_to: localhost 40 | ignore_errors: true 41 | 42 | - name: Stress MySQL Cluster - Cleanup 43 | shell: > 44 | sysbench --time={{ stress_mysql_time }} --threads={{ sysbench_threads }} --mysql-user={{ sysbench_user }} 45 | --mysql-password={{ sysbench_password }} --mysql-db={{ sysbench_database }} --mysql-host={{ stress_mysql_host}} 46 | --db-driver=mysql --table_size=500000 47 | /usr/share/sysbench/oltp_read_only.lua cleanup 48 | delegate_to: localhost 49 | ignore_errors: true 50 | 51 | - name: Delete sysbench database, user and grant priviledges 52 | shell: > 53 | mysql -e "revoke all on sysbench.* from {{ sysbench_user }}@'%'"; 54 | mysql -e "drop user {{ sysbench_user }}@'%'"; 55 | mysql -e "drop database sysbench"; 56 | become: true 57 | ignore_errors: true 58 | -------------------------------------------------------------------------------- /roles/control/tasks/stress_mq.yml: -------------------------------------------------------------------------------- 1 | - name: Install pika 2 | package: 3 | name: python-pika 4 | state: present 5 | become: true 6 | when: manage_packages|default(false) 7 | 8 | - name: Stress RabbitMQ 9 | script: "stress_mq.py -H {{ rabbit_host }} -u {{ rabbit_username }} -p {{ rabbit_password }} 10 | --rabbit-exchange {{ rabbit_exchange }} --exchange-durable {{ exchange_durable }} 11 | --exchange-auto-delete {{ exchange_auto_delete}} --rabbit-queue {{ rabbit_queue }} 12 | --queue-durable {{ queue_durable }} --queue-auto-delete {{ queue_auto_delete }} 13 | --routing-key {{ routing_key }} -t {{ threads }} -n {{ msg_per_thread }}" 14 | become: true 15 | -------------------------------------------------------------------------------- /roles/network/README.md: 
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unitedstack/gremlin/017fe09d80040019df7ed387bf1001114944f4c2/roles/network/README.md
--------------------------------------------------------------------------------
/roles/network/defaults/main.yml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unitedstack/gremlin/017fe09d80040019df7ed387bf1001114944f4c2/roles/network/defaults/main.yml
--------------------------------------------------------------------------------
/roles/network/meta/main.yml:
--------------------------------------------------------------------------------
# Include the `common` role as a dependency. This makes sure the
# variables defined in that role are available here.
dependencies:
  - common
--------------------------------------------------------------------------------
/roles/network/service/tasks/main.yml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unitedstack/gremlin/017fe09d80040019df7ed387bf1001114944f4c2/roles/network/service/tasks/main.yml
--------------------------------------------------------------------------------
/roles/provision/README.md:
--------------------------------------------------------------------------------
Add a non-root user to the target host to execute fault drill commands; the
non-root user is removed when the test case is done.
--------------------------------------------------------------------------------
/roles/provision/defaults/main.yml:
--------------------------------------------------------------------------------
# The path to an SSH key (that we will generate) that can be used to
# log in to the target host.
gremlin_key: "{{ local_working_dir }}/id_rsa_gremlin"
--------------------------------------------------------------------------------
/roles/provision/local/tasks/main.yml:
--------------------------------------------------------------------------------
# Prepare the local working directory on the Ansible control host.

- name: Get current user group for localhost
  command: "id -gn"
  register: local_user_group
  changed_when: false

- name: Register fact for current user group
  set_fact:
    current_group_local: "{{ local_user_group.stdout }}"
  tags:
    - provision

# First try with privilege escalation; if that fails on this control
# host, the rescue repeats the same task without `become`.
- block:
    - name: Ensure local working dir exists
      file:
        path: "{{ local_working_dir }}"
        state: directory
        owner: "{{ ansible_env.USER }}"
        group: "{{ current_group_local }}"
      become: true
  rescue:
    - name: Ensure local working dir exists
      file:
        path: "{{ local_working_dir }}"
        state: directory
        owner: "{{ ansible_env.USER }}"
        group: "{{ current_group_local }}"
--------------------------------------------------------------------------------
/roles/provision/meta/main.yml:
--------------------------------------------------------------------------------
# Include the `common` role as a dependency. This makes sure the
# variables defined in that role are available here.
dependencies:
  - common
--------------------------------------------------------------------------------
/roles/provision/os_auth/defaults/main.yml:
--------------------------------------------------------------------------------
# Keystone endpoint and admin credentials used to provision the
# `gremlin` project/user.
os_auth_url: 'http://127.0.0.1:5000/v3'
os_project_domain_name: Default
os_user_domain_name: Default

os_admin_project: admin
os_admin_username: admin
os_admin_password: admin

os_gremlin_password: drill@gremlin
os_gremlin_role: member
--------------------------------------------------------------------------------
/roles/provision/os_auth/tasks/create_auth.yml:
--------------------------------------------------------------------------------
# Create the `gremlin` project and user and grant the configured role.

# Probe for an existing `gremlin` project; the rc is checked below.
- shell: "openstack --os-auth-url {{ os_auth_url }}
          --os-identity-api-version 3
          --os-project-domain-name {{ os_project_domain_name }}
          --os-project-name {{ os_admin_project }}
          --os-user-domain-name {{ os_user_domain_name }}
          --os-username {{ os_admin_username }}
          --os-password {{ os_admin_password }}
          project show gremlin"
  register: os_project_show
  ignore_errors: true

# Create the project only when the probe above failed.
- shell: "openstack --os-auth-url {{ os_auth_url }}
          --os-identity-api-version 3
          --os-project-domain-name {{ os_project_domain_name }}
          --os-project-name {{ os_admin_project }}
          --os-user-domain-name {{ os_user_domain_name }}
          --os-username {{ os_admin_username }}
          --os-password {{ os_admin_password }}
          project create --domain {{ os_project_domain_name }} gremlin"
  when: os_project_show.rc != 0

# `--or-show` makes the user creation idempotent.
- shell: "openstack --os-auth-url {{ os_auth_url }}
          --os-identity-api-version 3
          --os-project-domain-name {{ os_project_domain_name }}
          --os-project-name {{ os_admin_project }}
          --os-user-domain-name {{ os_user_domain_name }}
          --os-username {{ os_admin_username }}
          --os-password {{ os_admin_password }}
          user create --domain {{ os_project_domain_name }}
          --password {{ os_gremlin_password }}
          gremlin --or-show"

- shell: "openstack --os-auth-url {{ os_auth_url }}
          --os-identity-api-version 3
          --os-project-domain-name {{ os_project_domain_name }}
          --os-project-name {{ os_admin_project }}
          --os-user-domain-name {{ os_user_domain_name }}
          --os-username {{ os_admin_username }}
          --os-password {{ os_admin_password }}
          role add --project gremlin --user gremlin {{ os_gremlin_role }}"
--------------------------------------------------------------------------------
/roles/provision/os_auth/tasks/remove_auth.yml:
--------------------------------------------------------------------------------
# Remove the `gremlin` user and project created by create_auth.yml.

- shell: "openstack --os-auth-url {{ os_auth_url }}
          --os-identity-api-version 3
          --os-project-domain-name {{ os_project_domain_name }}
          --os-project-name {{ os_admin_project }}
          --os-user-domain-name {{ os_user_domain_name }}
          --os-username {{ os_admin_username }}
          --os-password {{ os_admin_password }}
          user show gremlin"
  register: os_user_show
  ignore_errors: true

- shell: "openstack --os-auth-url {{ os_auth_url }}
          --os-identity-api-version 3
          --os-project-domain-name {{ os_project_domain_name }}
          --os-project-name {{ os_admin_project }}
          --os-user-domain-name {{ os_user_domain_name }}
          --os-username {{ os_admin_username }}
          --os-password {{ os_admin_password }}
          user delete gremlin"
  when: os_user_show.rc == 0

- shell: "openstack --os-auth-url {{ os_auth_url }}
          --os-identity-api-version 3
          --os-project-domain-name {{ os_project_domain_name }}
          --os-project-name {{ os_admin_project }}
          --os-user-domain-name {{ os_user_domain_name }}
          --os-username {{ os_admin_username }}
          --os-password {{ os_admin_password }}
          project show gremlin"
  register: os_project_show
  ignore_errors: true

- shell: "openstack --os-auth-url {{ os_auth_url }}
          --os-identity-api-version 3
          --os-project-domain-name {{ os_project_domain_name }}
          --os-project-name {{ os_admin_project }}
          --os-user-domain-name {{ os_user_domain_name }}
          --os-username {{ os_admin_username }}
          --os-password {{ os_admin_password }}
          project delete gremlin"
  when: os_project_show.rc == 0
--------------------------------------------------------------------------------
/roles/provision/os_stack/defaults/main.yml:
--------------------------------------------------------------------------------
stack_template_path: stack-untitle.yml
stack_name: untitle
os_auth_url: 'http://127.0.0.1:5000/v3'
os_region: RegionOne
os_project_domain_name: Default
os_user_domain_name: Default
os_gremlin_password: drill@gremlin
--------------------------------------------------------------------------------
/roles/provision/os_stack/tasks/create_stack.yml:
--------------------------------------------------------------------------------
# Render the stack template locally, then create the Heat stack as the
# gremlin user.
- template:
    src: "{{ stack_template_path }}"
    dest: "/tmp/grem_stack_template.yml"
    force: true
    mode: 0644

- shell: "heat --os-auth-url {{ os_auth_url }}
          --os-region-name {{ os_region }}
          --os-project-domain-name {{ os_project_domain_name }}
          --os-project-name gremlin
          --os-user-domain-name {{ os_user_domain_name }}
          --os-username gremlin
          --os-password {{ os_gremlin_password }}
          stack-create --template-file /tmp/grem_stack_template.yml
          gremlin_{{ stack_name }}"
--------------------------------------------------------------------------------
/roles/provision/os_stack/tasks/remove_stack.yml:
--------------------------------------------------------------------------------
- shell: "heat --os-auth-url {{ os_auth_url }}
          --os-region-name {{ os_region }}
          --os-project-domain-name {{ os_project_domain_name }}
          --os-project-name gremlin
          --os-user-domain-name {{ os_user_domain_name }}
          --os-username gremlin
          --os-password {{ os_gremlin_password }}
          stack-delete gremlin_{{ stack_name }}"

# Remove the rendered template copy.
- file:
    path: "/tmp/grem_stack_template.yml"
    state: absent
--------------------------------------------------------------------------------
/roles/provision/teardown/meta/main.yml:
--------------------------------------------------------------------------------
dependencies:
  - provision
--------------------------------------------------------------------------------
/roles/provision/teardown/tasks/main.yml:
--------------------------------------------------------------------------------
# Check that the non-root user exists.
- name: Get UID of non-root user
  command: >
    id -u {{ non_root_user }}
  register: non_root_uid
  ignore_errors: true
  changed_when: false

# If the non-root user exists, perform a variety of cleanup tasks.
- when: non_root_uid|success
  block:
    # Look for and kill any processes owned by the non-root user.
    # This will let us remove the user later on.
    - name: Check for processes owned by non-root user
      command: >
        pgrep -u {{ non_root_user }}
      register: proc_exist
      ignore_errors: true
      become: true

    - name: Kill (SIGTERM) all processes owned by non-root user
      command: >
        pkill -u {{ non_root_user }}
      ignore_errors: true
      become: true
      when: proc_exist|success

    - name: Kill (SIGKILL) all processes owned by non-root user
      command: >
        pkill -9 -u {{ non_root_user }}
      when: proc_exist|success
      ignore_errors: true
      become: true

    # Now that we have taken care of any processes owned by this user
    # account we can delete it.
    - name: Remove non-root user account
      user:
        name: "{{ non_root_user }}"
        state: absent
        remove: true
      become: true
--------------------------------------------------------------------------------
/roles/provision/user/meta/main.yml:
--------------------------------------------------------------------------------
dependencies:
  - provision
--------------------------------------------------------------------------------
/roles/provision/user/tasks/main.yml:
--------------------------------------------------------------------------------
# Create `gremlin_key`, which we will use to log in to the target
# host. Note that this task runs on the ansible control host
# (because of the `delegate_to: localhost`), and we will later copy
# the public key to the appropriate location.

- name: Check if the host key exists
  delegate_to: localhost
  stat:
    path: "{{ gremlin_key }}"
  register: gremlin_key_stat

- name: Create target host access key
  delegate_to: localhost
  command: >
    ssh-keygen -f {{ gremlin_key }} -N ''
    -C 'ansible_generated_key'
    -t rsa -b 4096
  args:
    creates: "{{ gremlin_key }}"
  when: not gremlin_key_stat.stat.exists

# Create a non-root user on the target host. This is the user that
# will execute fault drill test cases on the target host.
- name: Create non-root group
  group:
    name: "{{ non_root_group }}"
    state: present
  # FIX: group creation needs root, matching the user task below,
  # which already escalates.
  become: true

- name: Create non-root user
  user:
    name: "{{ non_root_user }}"
    group: "{{ non_root_group }}"
    state: present
    shell: /bin/bash
  become: true

# Install the public component of `gremlin_key` in the
# `.ssh/authorized_keys` file for the non-root user.
- name: Configure non-root user authorized_keys
  authorized_key:
    user: "{{ non_root_user }}"
    key: "{{ item }}"
  with_file:
    - "{{ gremlin_key }}.pub"
  become: true

# I'm not always root, but when I am it's because of `sudo`.
- name: Grant sudo privileges to non-root user
  copy:
    content: |
      {{ non_root_user }} ALL=(ALL) NOPASSWD:ALL
    dest: /etc/sudoers.d/{{ non_root_user }}
    owner: root
    group: root
    mode: 0440
  become: true
--------------------------------------------------------------------------------
/roles/storage/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unitedstack/gremlin/017fe09d80040019df7ed387bf1001114944f4c2/roles/storage/README.md
--------------------------------------------------------------------------------
/roles/storage/defaults/main.yml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unitedstack/gremlin/017fe09d80040019df7ed387bf1001114944f4c2/roles/storage/defaults/main.yml
--------------------------------------------------------------------------------
/roles/storage/meta/main.yml:
--------------------------------------------------------------------------------
# Include the `common` role as a dependency. This makes sure the
# variables defined in that role are available here.
dependencies:
  - common
--------------------------------------------------------------------------------
/roles/storage/tasks/damage_mon.yml:
--------------------------------------------------------------------------------
# Stop a ceph monitor and corrupt its store.db CURRENT file; the
# original is saved to /tmp so recover_damage_mon.yml can restore it.
- block:
    - include: stop_mon.yml

    - name: Get mon name
      shell: ls /var/lib/ceph/mon/
      register: mon_name

    - name: Backup ceph db file
      shell: cp /var/lib/ceph/mon/{{ mon_name.stdout }}/store.db/CURRENT /tmp/

    - name: Change ceph db file
      shell: echo 0 > /var/lib/ceph/mon/{{ mon_name.stdout }}/store.db/CURRENT
--------------------------------------------------------------------------------
/roles/storage/tasks/del_osd_partition.yml:
--------------------------------------------------------------------------------
- block:
    - name: Get the OSD Disk
      shell: "df -h |grep ceph-{{ item }} |awk '{print $1}'|sed 's/[0-9]//g'"
      register: ceph_osd_disk
      become: true
      with_items:
        - "{{ osd_down_list }}"

    - name: Get the OSD Disk Partition
      shell: "df -h |grep ceph-{{ item }} |awk '{print $1}'"
      register: ceph_osd_disk_part
      become: true
      with_items:
        - "{{ osd_down_list }}"

    # FIX: with_together pairs each OSD id with its own disk result.
    # The previous with_nested built the full cross product, so with
    # more than one OSD in osd_down_list each backup file was
    # overwritten by unrelated disks.
    - name: Backup Ceph OSD Partition
      shell: "sgdisk {{ item[1].stdout }} -b /tmp/osd_{{ item[0] }}_partition.bak"
      become: true
      register: backup_result
      ignore_errors: false
      failed_when: " 'successfully' not in backup_result.stdout"
      with_together:
        - "{{ osd_down_list }}"
        - "{{ ceph_osd_disk.results }}"

    - name: Del OSD Partition Table
      command: "sgdisk -o {{ item.stdout }}"
      become: true
      register: del_result
      failed_when: " 'successfully' not in del_result.stdout"
      with_items:
        - "{{ ceph_osd_disk.results }}"

    - name: Umount Ceph OSD Disk
      shell: "umount /var/lib/ceph/osd/ceph-{{ item }}"
      become: true
      with_items:
        - "{{ osd_down_list }}"

    - name: Check OSD Umount Result
      shell: "df -h |grep ceph-{{ item }}"
      register: umount_result
      become: true
      failed_when: " 'ceph-' in umount_result.stdout"
      with_items:
        - "{{ osd_down_list }}"

  always:
    - name: Waiting 10 seconds to warm down
      pause:
        seconds: 10
--------------------------------------------------------------------------------
/roles/storage/tasks/kill_mon.yml:
--------------------------------------------------------------------------------
- block:
    - name: Kill ceph monitor daemon
      shell: pkill ceph-mon
      become: true
      ignore_errors: true

    # The check below matches the daemon binary path, so the grep
    # process itself does not count as a false positive.
    - name: Check if ceph monitor was killed
      shell: "ps -ef | grep ceph-mon"
      register: ps_result
      become: true

    - fail:
        msg: "Failed to kill ceph monitor daemon"
      when: "'/usr/bin/ceph-mon' in ps_result.stdout"

  always:
    - name: Waiting 10 seconds to warm down
      pause:
        seconds: 10
--------------------------------------------------------------------------------
/roles/storage/tasks/kill_osd.yml:
--------------------------------------------------------------------------------
- block:
    # `removes` makes this a no-op when the pidfile is absent.
    - name: Kill Ceph OSD daemon
      command: "pkill --pidfile /var/run/ceph/osd.{{ item }}.pid"
      args:
        removes: "/var/run/ceph/osd.{{ item }}.pid"
      become: true
      with_items:
        - "{{ osd_down_list }}"

    # Fallback for hosts without a pidfile: `creates` skips this task
    # whenever the pidfile exists (i.e. the task above already ran).
    - name: Use Systemd to kill Ceph OSD daemon
      command: "systemctl kill ceph-osd@{{ item }}"
      args:
        creates: "/var/run/ceph/osd.{{ item }}.pid"
      become: true
      with_items:
        - "{{ osd_down_list }}"

  always:
    - name: Waiting 10 seconds to warm down
      pause:
        seconds: 10
--------------------------------------------------------------------------------
/roles/storage/tasks/kill_rgw.yml:
--------------------------------------------------------------------------------
- block:
    - name: Kill Ceph radosgw daemon
      command: "pkill radosgw"
      become: true
      ignore_errors: true

    - name: Check if ceph radosgw was killed
      shell: "ps -ef | grep radosgw"
      register: ps_result
      ignore_errors: true

    - fail:
        msg: "Failed to kill Ceph radosgw daemon"
      when: "'/usr/bin/radosgw' in ps_result.stdout"

  always:
    - name: Waiting 10 seconds to warm down
      pause:
        seconds: 10
--------------------------------------------------------------------------------
/roles/storage/tasks/recover_damage_mon.yml:
--------------------------------------------------------------------------------
# Restore the store.db CURRENT file saved by damage_mon.yml and start
# the monitor again. NOTE(review): relies on the `mon_name` fact
# registered by damage_mon.yml earlier in the same play.
- block:
    - name: Recover ceph db file
      shell: mv /tmp/CURRENT /var/lib/ceph/mon/{{ mon_name.stdout }}/store.db/ -f

    - include: start_mon.yml
--------------------------------------------------------------------------------
/roles/storage/tasks/recover_osd_partition.yml:
--------------------------------------------------------------------------------
- block:
    # FIX: with_together pairs each OSD with its own backup/disk (see
    # del_osd_partition.yml); with_nested restored every backup onto
    # every disk.
    - name: Recover Ceph OSD Partition
      command: "sgdisk --load-backup /tmp/osd_{{ item[0] }}_partition.bak {{ item[1].stdout }}"
      register: recover_result
      become: true
      failed_when: " 'successfully' not in recover_result.stdout"
      with_together:
        - "{{ osd_down_list }}"
        - "{{ ceph_osd_disk.results }}"
      when: "ceph_osd_disk is defined"

    - name: Mount Ceph OSD Disk
      shell: "mount {{ item[1].stdout }} /var/lib/ceph/osd/ceph-{{ item[0] }} "
      become: true
      with_together:
        - "{{ osd_down_list }}"
        - "{{ ceph_osd_disk_part.results }}"
      when: "ceph_osd_disk_part is defined"

    - name: Check OSD Mount Result
      shell: "df -h |grep ceph-{{ item }}"
      register: check_result
      become: true
      failed_when: " 'ceph-' not in check_result.stdout"
      with_items:
        - "{{ osd_down_list }}"

  always:
    - name: Waiting 10 seconds to warm up
      pause:
        seconds: 10
--------------------------------------------------------------------------------
/roles/storage/tasks/start_mon.yml:
--------------------------------------------------------------------------------
- block:
    - name: "Reset failed ceph monitor status "
      shell: "systemctl reset-failed ceph-mon@{{ ansible_hostname }}"

    - name: "Start Ceph Mon"
      service:
        name: "ceph-mon@{{ ansible_hostname }}"
        state: started
      become: true

  # Fall back to sysvinit on hosts without the systemd unit.
  rescue:
    - shell: "/etc/init.d/ceph start mon"
      become: true

  always:
    - name: Waiting 10 seconds to warm up
      pause:
        seconds: 10
--------------------------------------------------------------------------------
/roles/storage/tasks/start_osd.yml:
--------------------------------------------------------------------------------
- block:
    # `creates: /etc/init.d/ceph` skips the systemd path on sysvinit hosts.
    - name: "Reset failed ceph osd status "
      shell: "systemctl reset-failed ceph-osd@{{ item }}"
      args:
        creates: "/etc/init.d/ceph"
      become: true
      with_items:
        - "{{ osd_down_list }}"

    - name: "Start Ceph OSD"
      shell: "systemctl start ceph-osd@{{ item }}"
      args:
        creates: "/etc/init.d/ceph"
      become: true
      with_items:
        - "{{ osd_down_list }}"

    # sysvinit path: only runs when /etc/init.d/ceph exists.
    - shell: "/etc/init.d/ceph start osd.{{ item }}"
      args:
        removes: "/etc/init.d/ceph"
      become: true
      with_items:
        - "{{ osd_down_list }}"

  always:
    - name: Waiting 10 seconds to warm up
      pause:
        seconds: 10
--------------------------------------------------------------------------------
/roles/storage/tasks/start_rgw.yml:
--------------------------------------------------------------------------------
- block:
    - name: "Reset failed ceph radosgw status "
      shell: "systemctl reset-failed ceph-radosgw@rgw.{{ ansible_hostname }}"
      become: true

    - name: "Start Ceph RGW"
      service:
        name: "ceph-radosgw@rgw.{{ ansible_hostname }}"
        state: started
      become: true

  rescue:
    - shell: "/etc/init.d/ceph-radosgw start"
      become: true

  always:
    - name: Waiting 10 seconds to warm up
      pause:
        seconds: 10
--------------------------------------------------------------------------------
/roles/storage/tasks/stop_mon.yml:
--------------------------------------------------------------------------------
- block:
    - name: "Stop ceph monitor"
      shell: "systemctl stop ceph-mon@{{ ansible_hostname }}"

    # FIX: the service state was misspelled "stoped", which is not a
    # valid value, so this task always errored into the sysvinit rescue.
    - name: "Stop Ceph Mon"
      service:
        name: "ceph-mon@{{ ansible_hostname }}"
        state: stopped
      become: true

  rescue:
    - shell: "/etc/init.d/ceph stop mon"
      become: true

  always:
    - name: Waiting 10 seconds to warm up
      pause:
        seconds: 10
--------------------------------------------------------------------------------
/roles/system/defaults/main.yml:
--------------------------------------------------------------------------------
# stress-ng knobs: loads are percentages, timeouts use stress-ng
# duration suffixes; a worker count of 0 lets stress-ng choose
# (see stress-ng(1)).
cpu_stress_timeout: "5m"
cpu_stress_workers: 0
cpu_stress_load: "50%"

mem_stress_timeout: "5m"
mem_stress_load: "50%"

disk_stress_timeout: "5m"
disk_stress_workers: 0
disk_stress_load: "50%"
--------------------------------------------------------------------------------
/roles/system/meta/main.yml:
--------------------------------------------------------------------------------
# Include the `common` role as a dependency. This makes sure the
# variables defined in that role are available here.
dependencies:
  - common
--------------------------------------------------------------------------------
/roles/system/tasks/clear_tc.yml:
--------------------------------------------------------------------------------
# Remove any qdisc installed on the NIC by the nic_* tasks.
- name: clear tc rule
  shell: tc qdisc del dev {{ nic }} root
  become: true
--------------------------------------------------------------------------------
/roles/system/tasks/cpu_load.yml:
--------------------------------------------------------------------------------
- name: Install stress-ng
  package:
    name: stress-ng
    state: present
  become: true
  when: manage_packages|default(false)

- name: Install htop
  package:
    name: htop
    state: present
  become: true
  when: manage_packages|default(false)

- name: Stress CPU load
  shell: stress-ng --cpu {{ cpu_stress_workers }} --cpu-load {{ cpu_stress_load }} --timeout {{ cpu_stress_timeout }}
  become: true
--------------------------------------------------------------------------------
/roles/system/tasks/disk_load.yml:
--------------------------------------------------------------------------------
- name: Install stress-ng
  package:
    name: stress-ng
    state: present
  become: true
  when: manage_packages|default(false)

- name: Install sysstat
  package:
    name: sysstat
    state: present
  become: true
  when: manage_packages|default(false)

- name: Stress root file system load(free space on the file system)
  shell: stress-ng --iomix {{ disk_stress_workers }} --iomix-bytes {{ disk_stress_load }} --timeout {{ disk_stress_timeout }}
  become: true
--------------------------------------------------------------------------------
/roles/system/tasks/mem_load.yml:
--------------------------------------------------------------------------------
- name: Install stress-ng
  package:
    name: stress-ng
    state: present
  become: true
  when: manage_packages|default(false)

- name: Install htop
  package:
    name: htop
    state: present
  become: true
  when: manage_packages|default(false)

# Convert mem_stress_load (e.g. "50%") into a kB figure based on
# MemAvailable; `[:-1]` strips the trailing '%'.
- name: Calculate stress vm_bytes
  shell: awk '/MemAvailable/{printf "%d\n", $2 * {{ mem_stress_load[:-1] | int }} / 100;}' < /proc/meminfo
  register: vm_bytes

- name: stress memory
  shell: stress-ng --vm 1 --vm-bytes {{ vm_bytes.stdout }}k --vm-keep --timeout {{ mem_stress_timeout }}
  become: true
--------------------------------------------------------------------------------
/roles/system/tasks/nic_delay.yml:
--------------------------------------------------------------------------------
- name: Apply nic delay tc rule
  shell: >
    tc qdisc add dev {{ nic }} root netem delay {{ nic_delay_time }};
    sleep {{ nic_delay_timeout }};
    tc qdisc del dev {{ nic }} root netem delay {{ nic_delay_time }};
  become: true
--------------------------------------------------------------------------------
/roles/system/tasks/nic_down.yml:
--------------------------------------------------------------------------------
- name: ifdown the nic
  shell: >
    ifdown {{ nic }};
    sleep {{ nic_down_timeout }};
    ifup {{ nic }};
  become: true
--------------------------------------------------------------------------------
/roles/system/tasks/nic_down_async.yml:
--------------------------------------------------------------------------------
# FIX: `async: 0` disables asynchronous execution in Ansible, so this
# task actually blocked (and could hang the play when {{ nic }} carries
# the SSH session). A positive `async` timeout with `poll: 0` is the
# fire-and-forget form.
- name: ifdown the nic asynchronously
  shell: ifdown {{ nic }}
  async: 100
  poll: 0
  become: true
--------------------------------------------------------------------------------
/roles/system/tasks/nic_loss.yml:
--------------------------------------------------------------------------------
- name: Apply nic loss tc rule
  shell: >
    tc qdisc add dev {{ nic }} root netem loss {{ nic_loss_percent }};
    sleep {{ nic_loss_timeout }};
    tc qdisc del dev {{ nic }} root netem loss {{ nic_loss_percent }};
  become: true
--------------------------------------------------------------------------------