├── .github
│   └── PULL_REQUEST_TEMPLATE.md
├── .travis.yml
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── JenkinsFile
├── LICENSE
├── NOTICE
├── README.md
├── cdi_test
│   ├── common
│   │   ├── cdi-policy.json
│   │   ├── cdi-scripts.sh
│   │   └── rxtx_cmd.txt
│   └── tests
│       └── run-cdi.sh
├── common.sh
├── efa-check.sh
├── fork_checker.c
├── install-aws-ofi-nccl.sh
├── install-fabtests.sh
├── install-libfabric-1.8.sh
├── install-libfabric.sh
├── install-nccl-tests.sh
├── install-nccl.sh
├── jenkins-ami
│   ├── packer-jenkins.json
│   └── prepare-ami.sh
├── mpi_common.sh
├── mpi_osu_test.sh
├── mpi_ring_c_test.sh
├── multi-node-efa-minimal.sh
├── multi-node.sh
├── multinode_runfabtests.sh
├── nccl
│   ├── common
│   │   ├── nccl-common.sh
│   │   └── prep_ami.sh
│   └── tests
│       ├── nccl-multi-node.sh
│       └── nccl-single-node.sh
├── run-nccl-tests.sh
├── single-node.sh
├── test
│   └── setup.sh
└── wget_check.sh
/.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | *Issue #, if available:* 2 | 3 | *Description of changes:* 4 | 5 | 6 | By submitting this pull request, I confirm that my contribution is made under the terms of the Apache 2.0 license. 7 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: required 2 | dist: bionic 3 | language: bash 4 | 5 | addons: 6 | apt: 7 | sources: 8 | - sourceline: 'deb http://archive.ubuntu.com/ubuntu trusty-backports main restricted universe multiverse' 9 | packages: 10 | - shellcheck 11 | 12 | before_install: 13 | - sudo pip install bashate 14 | 15 | script: 16 | - bashate -i E006 *.sh 17 | - bashate nccl/*/*.sh 18 | - # Commenting out shell-check till we fully resolve existing warnings 19 | - # shellcheck -S warning *.sh 20 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 5 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check [existing open](https://github.com/awslabs/libfabric-ci-scripts/issues), or [recently closed](https://github.com/awslabs/libfabric-ci-scripts/issues?utf8=%E2%9C%93&q=is%3Aissue%20is%3Aclosed%20), issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can.
Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *master* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional documentation on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/); a minimal command-line sketch of this workflow appears at the end of this guide. 41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to contribute to. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any ['help wanted'](https://github.com/awslabs/libfabric-ci-scripts/labels/help%20wanted) issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project, we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public GitHub issue. 55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](https://github.com/awslabs/libfabric-ci-scripts/blob/master/LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 60 | 61 | We may ask you to sign a [Contributor License Agreement (CLA)](http://en.wikipedia.org/wiki/Contributor_License_Agreement) for larger changes.
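As referenced above, here is a minimal sketch of the fork-and-pull-request workflow from the command line; the fork URL, branch name, and commit message are placeholder values, not part of this repository:

    # Fork the repository on GitHub first, then:
    git clone https://github.com/<your-username>/libfabric-ci-scripts.git
    cd libfabric-ci-scripts
    git checkout -b my-fix master       # work against the latest master
    # ...make your focused change, then run the local style check used by CI:
    bashate -i E006 *.sh
    git commit -am "Describe the specific change"
    git push origin my-fix              # then open the pull request on GitHub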
62 | -------------------------------------------------------------------------------- /JenkinsFile: -------------------------------------------------------------------------------- 1 | pipeline { 2 | agent { 3 | node { 4 | label 'master' 5 | } 6 | } 7 | stages { 8 | stage('Testing') { 9 | parallel { 10 | stage('Single Node') { 11 | steps { 12 | sh 'echo "==>Beginning single node tests"' 13 | build job: 'single-node', parameters: [string(name: 'PULL_REQUEST_REF', value: sha1), string(name: 'PULL_REQUEST_ID', value: ghprbPullId), string(name: 'TARGET_BRANCH', value: ghprbTargetBranch)], propagate: true 14 | } 15 | } 16 | stage('Multi Node') { 17 | steps { 18 | sh 'echo "==> Beginning multi node tests"' 19 | build job: 'multi-node', parameters: [string(name: 'PULL_REQUEST_REF', value: sha1), string(name: 'PULL_REQUEST_ID', value: ghprbPullId), string(name: 'TARGET_BRANCH', value: ghprbTargetBranch)], propagate: true 20 | } 21 | } 22 | } 23 | } 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 
48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | Libfabric CI 2 | Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DEPRECATED - This Project is no longer supported, and should not be used. 
2 | 3 | ## Libfabric CI 4 | 5 | A place for the various scripts used in the libfabric CI project, such as pipelines and Packer files. 6 | 7 | ## License 8 | 9 | This library is licensed under the Apache 2.0 License. 10 | -------------------------------------------------------------------------------- /cdi_test/common/cdi-policy.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Action": "mediaconnect:*", 7 | "Resource": "*" 8 | } 9 | ] 10 | } 11 | -------------------------------------------------------------------------------- /cdi_test/common/cdi-scripts.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -xe 4 | 5 | LIBFABRIC_BRANCH=${LIBFABRIC_BRANCH:-"main"} 6 | PULL_REQUEST_ID=${PULL_REQUEST_ID:-"None"} 7 | 8 | # cdi_test cmd file arguments 9 | declare -A CDI_TEST_ARGS=( [LOG_DIR]="${HOME}/cdi_test_logs" \ 10 | [METRIC_NAME]="cdi_test_metric" \ 11 | [PAYLOAD_SIZE]="24883200" \ 12 | [LOCAL_IP]=$(curl -s http://169.254.169.254/latest/meta-data/local-ipv4) \ 13 | [NUM_LOOPS]="1" ) 14 | 15 | usage() { 16 | cat << EOF 17 | usage: $(basename "$0") [Options] 18 | 19 | Options: 20 | [-c] Command to run 21 | [configure_aws_iam_user, run_cdi_test_minimal, 22 | run_cdi_test, install_cdi_test] 23 | [-t] Connection type [tx|rx] 24 | [-r] Remote ip 25 | [-n] Number of full cdi-test loops to run (default: 1) 26 | [-f] The command file path for the full cdi-test 27 | [-l] Libfabric branch to install (default: main) 28 | [-a] AWS access key id 29 | [-s] AWS secret access key 30 | [-u] AWS IAM user used to post metrics 31 | [-y] Region to post metrics to 32 | [-h] Shows this help output 33 | EOF 34 | } 35 | 36 | while getopts c:t:r:n:f:l:a:s:u:y:h option; do 37 | case "${option}" in 38 | c) 39 | COMMAND=${OPTARG} 40 | ;; 41 | t) 42 | CONNECTION_TYPE=${OPTARG} 43 | ;; 44 | r) 45 | CDI_TEST_ARGS[REMOTE_IP]=${OPTARG} 46 | ;; 47 | n) 48 | CDI_TEST_ARGS[NUM_LOOPS]=${OPTARG} 49 | ;; 50 | f) 51 | COMMAND_FILE=${OPTARG} 52 | ;; 53 | l) 54 | LIBFABRIC_BRANCH=${OPTARG} 55 | ;; 56 | a) 57 | AWS_ACCESS_KEY_ID=${OPTARG} 58 | ;; 59 | s) 60 | AWS_SECRET_ACCESS_KEY=${OPTARG} 61 | ;; 62 | u) 63 | CDI_TEST_ARGS[CDI_TEST_IAM_USER]=${OPTARG} 64 | ;; 65 | y) 66 | CDI_TEST_ARGS[REGION]=${OPTARG} 67 | ;; 68 | h) 69 | usage 70 | exit 0 71 | ;; 72 | *) 73 | usage 74 | exit 1 75 | ;; 76 | esac 77 | done 78 | 79 | # CMake 80 | CMAKE_VERSION="3.15.3" 81 | CMAKE_URL="https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}.tar.gz" 82 | 83 | # EFA Installer 84 | EFA_INSTALLER_VERSION="latest" 85 | EFA_INSTALLER_URL="https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz" 86 | 87 | # AWS cdi sdk 88 | AWS_CDI_SDK_URL="https://github.com/aws/aws-cdi-sdk" 89 | 90 | # Libfabric 91 | LIBFABRIC_URL="https://github.com/ofiwg/libfabric.git" 92 | 93 | # AWS sdk cpp 94 | AWS_SDK_CPP_BRANCH="1.8.46" 95 | AWS_SDK_CPP_URL="https://github.com/aws/aws-sdk-cpp.git" 96 | 97 | # name of the test directory to place cdi_test, libfabric, and aws sdk cpp 98 | CDI_TEST_DIR="${HOME}/cdi_test_dir/" 99 | CDI_TEST_BIN="${CDI_TEST_DIR}aws-cdi-sdk/build/debug/bin/" 100 | CDI_TEST_SRC="${CDI_TEST_BIN}cdi_test" 101 | CDI_TEST_MIN_RX_SRC="${CDI_TEST_BIN}cdi_test_min_rx" 102 | CDI_TEST_MIN_TX_SRC="${CDI_TEST_BIN}cdi_test_min_tx" 103 | 104 | # Configure aws iam user 105 | # This is needed to store metrics 106 | # Must specify: 107
| # -c configure_aws_iam_user 108 | # -a <aws access key id> 109 | # -s <aws secret access key> 110 | configure_aws_iam_user() { 111 | mkdir -p "${HOME}/.aws" 112 | 113 | touch "${HOME}/.aws/credentials" 114 | echo "[default]" >> ${HOME}/.aws/credentials 115 | echo "aws_access_key_id=${AWS_ACCESS_KEY_ID}" >> ${HOME}/.aws/credentials 116 | echo "aws_secret_access_key=${AWS_SECRET_ACCESS_KEY}" >> ${HOME}/.aws/credentials 117 | 118 | 119 | } 120 | 121 | # Installs cdi_test dependencies 122 | install_cdi_test_deps() { 123 | # Install Dependencies for cdi_test 124 | sudo yum -y install gcc-c++ make libnl3-devel autoconf automake libtool doxygen ncurses-devel git 125 | # Install Dependencies for sdk-cpp 126 | sudo yum -y install libcurl-devel openssl-devel libuuid-devel pulseaudio-libs-devel 127 | } 128 | 129 | # Installs CMake 130 | install_cmake() { 131 | wget ${CMAKE_URL} 132 | tar -zxvf "cmake-${CMAKE_VERSION}.tar.gz" 133 | pushd "cmake-${CMAKE_VERSION}" 134 | ./bootstrap --prefix=/usr/local 135 | make 136 | sudo make install 137 | popd 138 | } 139 | 140 | # Installs minimal EFA drivers 141 | install_efa_minimal() { 142 | curl -O ${EFA_INSTALLER_URL} 143 | tar -xf "aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz" 144 | pushd "aws-efa-installer" 145 | sudo ./efa_installer.sh -y -m 146 | popd 147 | } 148 | 149 | # Sets up CDI directory 150 | setup_cdi_test_directory() { 151 | mkdir ${CDI_TEST_DIR} 152 | mkdir ${CDI_TEST_ARGS[LOG_DIR]} 153 | cd ${CDI_TEST_DIR} 154 | 155 | git clone ${AWS_CDI_SDK_URL} 156 | # cdi_test asserts libfabric version 1.9.x, 157 | # which prevents us from testing the main branch. 158 | # Remove the assert statement: 159 | sed -i 's/CDI_STATIC_ASSERT(FI_MAJOR_VERSION==[1-9]* && FI_MINOR_VERSION==[1-9]*.*//g' \ 160 | ${CDI_TEST_DIR}aws-cdi-sdk/src/cdi/adapter_efa.c 161 | # TODO: remove when the libfabric version restriction is lifted 162 | git clone ${LIBFABRIC_URL} 163 | pushd libfabric 164 | if [ ! "$PULL_REQUEST_ID" = "None" ]; then 165 | git fetch origin +refs/pull/$PULL_REQUEST_ID/*:refs/remotes/origin/pr/$PULL_REQUEST_ID/* 166 | git checkout $PULL_REQUEST_REF -b PRBranch 167 | else 168 | git checkout ${LIBFABRIC_BRANCH} 169 | fi 170 | popd 171 | git clone -b ${AWS_SDK_CPP_BRANCH} ${AWS_SDK_CPP_URL} 172 | 173 | # Build CDI libraries 174 | cd aws-cdi-sdk/ 175 | make docs docs_api 176 | make DEBUG=y AWS_SDK="${CDI_TEST_DIR}/aws-sdk-cpp/" 177 | } 178 | 179 | # Installs AWS CDI 180 | # Must specify: 181 | # -c install_cdi_test 182 | install_cdi_test() { 183 | install_cdi_test_deps 184 | install_cmake 185 | install_efa_minimal 186 | setup_cdi_test_directory 187 | } 188 | 189 | # Runs the full cdi-test with the given command file 190 | # Must specify: 191 | # -c run_cdi_test 192 | # -t [rx|tx] 193 | # -f <command file path> 194 | # -r <remote ip> 195 | # -u <iam user> 196 | # -y <region> 197 | run_cdi_test() { 198 | # Check that cmd_file exists 199 | if [[ !
-f ${COMMAND_FILE} ]]; then 200 | echo "cmd file does not exist: ${COMMAND_FILE}" 201 | exit 1 202 | fi 203 | 204 | # Set the cdi_test arguments 205 | if [[ ${CONNECTION_TYPE} == "rx" ]]; then 206 | CDI_TEST_ARGS[RX_DEST_PORT]="2000" 207 | CDI_TEST_ARGS[TX_DEST_PORT]="2100" 208 | else 209 | CDI_TEST_ARGS[TX_DEST_PORT]="2000" 210 | CDI_TEST_ARGS[RX_DEST_PORT]="2100" 211 | fi 212 | 213 | # Replace arguments in cmd file 214 | for args in "${!CDI_TEST_ARGS[@]}"; do 215 | sed -i "s,<${args}>,${CDI_TEST_ARGS[$args]},g" ${COMMAND_FILE} 216 | done 217 | 218 | ${CDI_TEST_SRC} "@${COMMAND_FILE}" 219 | } 220 | 221 | # Run a minimal version of cdi_test for basic tx/rx communication 222 | # Must specify: 223 | # -c run_cdi_test_minimal 224 | # -t [rx|tx] 225 | # If connection-type == tx: 226 | # -r <remote ip> 227 | run_cdi_test_minimal() { 228 | if [[ ${CONNECTION_TYPE} == "rx" ]]; then 229 | ${CDI_TEST_MIN_RX_SRC} --local_ip ${CDI_TEST_ARGS[LOCAL_IP]} \ 230 | --rx RAW \ 231 | --dest_port 2000 \ 232 | --num_transactions 100 \ 233 | --payload_size 5184000 234 | else 235 | ${CDI_TEST_MIN_TX_SRC} --local_ip ${CDI_TEST_ARGS[LOCAL_IP]} \ 236 | --tx RAW \ 237 | --remote_ip ${CDI_TEST_ARGS[REMOTE_IP]} \ 238 | --dest_port 2000 \ 239 | --rate 60 \ 240 | --num_transactions 100 \ 241 | --payload_size 5184000 242 | fi 243 | } 244 | 245 | ${COMMAND} 246 | -------------------------------------------------------------------------------- /cdi_test/common/rxtx_cmd.txt: -------------------------------------------------------------------------------- 1 | # change logs directory and prefix to whatever desired 2 | --logs <LOG_DIR>/rxtx 3 | --log_component "PROBE ENDPOINT_MANAGER PERFORMANCE_METRICS" 4 | 5 | # change local_ip to be for the efa being tested 6 | --local_ip <LOCAL_IP> 7 | --adapter EFA 8 | --stderr 9 | --num_loops <NUM_LOOPS> 10 | 11 | # change cloudwatch settings to whatever desired. The first is the metrics 12 | # name, the second is the region being published to, and the last is the user name.
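# For illustration, once cdi-scripts.sh has run its sed substitution over the
# <...> placeholders in this file, the option below expands to something like
# (the IAM user name here is a hypothetical example):
#   --stats_cloudwatch cdi_test_metric us-west-2 cdi-test-iam-user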
13 | --stats_cloudwatch <METRIC_NAME> <REGION> <CDI_TEST_IAM_USER> 14 | 15 | #--------------------------------------- 16 | # rx connection 0 17 | -X 18 | # connection name is user specified 19 | --rx RAW 20 | --connection_name rx_fr__ 21 | --dest_port <RX_DEST_PORT> 22 | --core 5 23 | --rate 60 24 | --stats_period 10 25 | -S 26 | --pattern SHL 27 | --pat_start 1C014D6DA44CE61A 28 | --payload_size <PAYLOAD_SIZE> 29 | --num_transactions 0 30 | 31 | #--------------------------------------- 32 | # tx connection 0 33 | -X 34 | --tx RAW 35 | --connection_name tx_to__ 36 | # change remote ip to efa on other instance 37 | --remote_ip <REMOTE_IP> 38 | --dest_port <TX_DEST_PORT> 39 | --tx_timeout 16666 40 | --keep_alive 41 | --core 6 42 | --rate 60 43 | --stats_period 10 44 | -S 45 | --pattern SHL 46 | --pat_start 1C014D6DA44CE61A 47 | --payload_size <PAYLOAD_SIZE> 48 | --num_transactions 0 49 | -------------------------------------------------------------------------------- /cdi_test/tests/run-cdi.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -xe 4 | exit_code=0 5 | REGION=${AWS_DEFAULT_REGION} 6 | 7 | RUN_MINIMAL=${RUN_MINIMAL:-0} 8 | RUN_FULL=${RUN_FULL:-0} 9 | 10 | CLIENT_LIBFABRIC_BRANCH=${CLIENT_LIBFABRIC_BRANCH:-"main"} 11 | SERVER_LIBFABRIC_BRANCH=${SERVER_LIBFABRIC_BRANCH:-"main"} 12 | 13 | PULL_REQUEST_ID=${PULL_REQUEST_ID:-"None"} 14 | 15 | CDI_COMMON="${WORKSPACE}/libfabric-ci-scripts/cdi_test/common" 16 | CDI_SCRIPT="${CDI_COMMON}/cdi-scripts.sh" 17 | CDI_CMD_FILE="${CDI_COMMON}/rxtx_cmd.txt" 18 | CDI_POLICY_DOCUMENT="${CDI_COMMON}/cdi-policy.json" 19 | 20 | ami_arch="x86_64" 21 | label="alinux" 22 | SSH_USER="ec2-user" 23 | NODES=2 24 | PROVIDER="efa" 25 | ENABLE_PLACEMENT_GROUP=1 26 | 27 | cdi_test_timeout=30m 28 | 29 | echo "'INFO' ==> Starting preparation for cdi_test" 30 | source "${WORKSPACE}/libfabric-ci-scripts/common.sh" 31 | 32 | cdi_on_exit() { 33 | on_exit 34 | } 35 | 36 | cdi_execute_cmd() { 37 | ip=$1 38 | cmd=$2 39 | timeout ${cdi_test_timeout} ssh -T -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \ 40 | -o BatchMode=yes -o TCPKeepAlive=yes \ 41 | -i ~/${slave_keypair} ${SSH_USER}@${ip} ${cmd} -- 42 | } 43 | 44 | 45 | trap 'cdi_on_exit' EXIT 46 | 47 | # Launch instances 48 | echo "==> Creating Nodes" 49 | 50 | ami=() 51 | ami[0]=$(aws --region $AWS_DEFAULT_REGION ssm get-parameters --names "/ec2-imagebuilder/alinux2-x86_64/latest" | jq -r ".Parameters[0].Value") 52 | ami[1]=${SSH_USER} 53 | create_instance || { echo "==>Unable to create instance"; exit 65; } 54 | set -x 55 | INSTANCE_IDS=($INSTANCE_IDS) 56 | 57 | get_instance_ip 58 | INSTANCE_IPS=($INSTANCE_IPS) 59 | 60 | pids="" 61 | # Wait until all instances have passed SSH connection check 62 | for IP in ${INSTANCE_IPS[@]}; do 63 | test_ssh "$IP" & 64 | pids="$pids $!"
65 | done 66 | for pid in $pids; do 67 | wait $pid || { echo "==>Instance ssh check failed"; exit 65; } 68 | done 69 | 70 | cdi_test_script="/home/${SSH_USER}/cdi-scripts.sh" 71 | 72 | # Put scripts on nodes 73 | for IP in ${INSTANCE_IPS[@]}; do 74 | scp -i ~/${slave_keypair} -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no \ 75 | ${CDI_SCRIPT} ${SSH_USER}@${IP}:/home/${SSH_USER}/ 76 | 77 | scp -i ~/${slave_keypair} -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no \ 78 | ${CDI_CMD_FILE} ${SSH_USER}@${IP}:/home/${SSH_USER}/ 79 | 80 | cdi_execute_cmd ${IP} \ 81 | "${cdi_test_script} -c configure_aws_iam_user -a ${CDI_ACCESS_KEY} -s ${CDI_SECRET_KEY}" 82 | done 83 | 84 | # Install cdi_test 85 | echo "==> Installing cdi_test on each node" 86 | 87 | set +e 88 | 89 | cdi_execute_cmd ${INSTANCE_IPS[0]} \ 90 | "export LIBFABRIC_BRANCH=${SERVER_LIBFABRIC_BRANCH}; \ 91 | export PULL_REQUEST_ID=${PULL_REQUEST_ID}; \ 92 | ${cdi_test_script} -c install_cdi_test" \ 93 | > server_install.out 2>&1 & 94 | 95 | server_pid=$! 96 | 97 | export LIBFABRIC_BRANCH=${CLIENT_LIBFABRIC_BRANCH} 98 | cdi_execute_cmd ${INSTANCE_IPS[1]} \ 99 | "export LIBFABRIC_BRANCH=${CLIENT_LIBFABRIC_BRANCH}; \ 100 | export PULL_REQUEST_ID=${PULL_REQUEST_ID}; \ 101 | ${cdi_test_script} -c install_cdi_test" \ 102 | > client_install.out 2>&1 & 103 | 104 | client_pid=$! 105 | 106 | wait ${server_pid} 107 | wait ${client_pid} 108 | if [[ $? -ne 0 ]]; then 109 | echo "cdi_test installation failed." 110 | exit 1 111 | fi 112 | 113 | set -e 114 | 115 | # Run cdi_test 116 | if [[ ${RUN_MINIMAL} -eq 1 ]]; then 117 | set +e 118 | echo "==> Running minimal cdi_test" 119 | 120 | cdi_execute_cmd ${INSTANCE_IPS[0]} \ 121 | "${cdi_test_script} -c run_cdi_test_minimal -t rx" \ 122 | > server_minimal.out 2>&1 & 123 | 124 | server_pid=$! 125 | 126 | cdi_execute_cmd ${INSTANCE_IPS[1]} \ 127 | "${cdi_test_script} -c run_cdi_test_minimal -t tx -r ${INSTANCE_IPS[0]}" \ 128 | > client_minimal.out 2>&1 & 129 | 130 | client_pid=$! 131 | 132 | wait ${server_pid} 133 | wait ${client_pid} 134 | error=$? 135 | if [[ $error -ne 0 ]]; then 136 | echo "Minimal cdi_test failed." 137 | exit_code=$error 138 | fi 139 | cat server_minimal.out 140 | set -e 141 | fi 142 | 143 | if [[ ${RUN_FULL} -eq 1 ]]; then 144 | set +e 145 | # Run full cdi_test 146 | echo "==> Running full cdi_test" 147 | 148 | cdi_execute_cmd ${INSTANCE_IPS[0]} \ 149 | "${cdi_test_script} -c run_cdi_test -t rx -f /home/${SSH_USER}/rxtx_cmd.txt \ 150 | -r ${INSTANCE_IPS[1]} -u ${USER_NAME} -y ${REGION}" \ 151 | > server_full.out 2>&1 & 152 | 153 | server_pid=$! 154 | 155 | cdi_execute_cmd ${INSTANCE_IPS[1]} \ 156 | "${cdi_test_script} -c run_cdi_test -t tx -f /home/${SSH_USER}/rxtx_cmd.txt \ 157 | -r ${INSTANCE_IPS[0]} -u ${USER_NAME} -y ${REGION}" \ 158 | > client_full.out 2>&1 & 159 | 160 | client_pid=$! 161 | 162 | wait ${server_pid} 163 | wait ${client_pid} 164 | error=$? 165 | if [[ $error -ne 0 ]]; then 166 | echo "Full cdi_test failed."
167 | exit_code=$error 168 | fi 169 | cat server_full.out 170 | set -e 171 | fi 172 | 173 | if [[ $exit_code -eq 0 ]]; then 174 | echo "==> cdi_test Tests Passed" 175 | exit 0 176 | else 177 | echo "==> cdi_test Tests Failed" 178 | exit $exit_code 179 | fi 180 | -------------------------------------------------------------------------------- /common.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source $WORKSPACE/libfabric-ci-scripts/wget_check.sh 4 | execution_seq=1 5 | BUILD_CODE=0 6 | output_dir=${output_dir:-$(mktemp -d -p $WORKSPACE)} 7 | tmp_script=${tmp_script:-$(mktemp -p $WORKSPACE)} 8 | # set default architecture of ami as x86_64 9 | ami_arch=${ami_arch:-"x86_64"} 10 | if [ ! "$ami_arch" = "x86_64" ] && [ ! "$ami_arch" = "aarch64" ]; then 11 | echo "Unknown architecture, ami_arch must be x86_64 or aarch64" 12 | exit 1 13 | fi 14 | RUN_IMPI_TESTS=${RUN_IMPI_TESTS:-1} 15 | RUN_CDI_TESTS=${RUN_CDI_TESTS:-0} 16 | ENABLE_PLACEMENT_GROUP=${ENABLE_PLACEMENT_GROUP:-0} 17 | TEST_SKIP_KMOD=${TEST_SKIP_KMOD:-0} 18 | BUILD_GDR=${BUILD_GDR:-0} 19 | 20 | create_pg() 21 | { 22 | if [ ${ENABLE_PLACEMENT_GROUP} -eq 0 ]; then 23 | return 0 24 | fi 25 | # Month - Day - Year - Hour - Minute - Second 26 | date_time=$(date +'%m-%d-%Y-%H-%M-%S') 27 | PLACEMENT_GROUP="compute-pg-${date_time}-${BUILD_NUMBER}-${RANDOM}" 28 | AWS_DEFAULT_REGION=us-west-2 aws ec2 create-placement-group \ 29 | --group-name ${PLACEMENT_GROUP} \ 30 | --strategy cluster 31 | return $? 32 | } 33 | 34 | delete_pg() 35 | { 36 | if [[ ${ENABLE_PLACEMENT_GROUP} -eq 0 || -z $PLACEMENT_GROUP ]]; then 37 | return 0 38 | fi 39 | local ret=0 40 | # The placement group may be in use because the attached 41 | # EC2 instances have not terminated completely. Keep 42 | # waiting and retrying within 20*30=600 seconds (10 minutes). 43 | local retry=20 44 | local sleep_time=30 45 | local bash_option=$- 46 | local restore_e=0 47 | if [[ $bash_option =~ e ]]; then 48 | restore_e=1 49 | set +e 50 | fi 51 | echo "Start deleting placement group ${PLACEMENT_GROUP}." 52 | while [[ $retry -ge 0 ]]; do 53 | delete_pg_response=$(AWS_DEFAULT_REGION=us-west-2 aws ec2 delete-placement-group \ 54 | --group-name ${PLACEMENT_GROUP} 2>&1) 55 | ret=$? 56 | if [[ $ret -ne 0 && "$delete_pg_response" == *"InvalidPlacementGroup.InUse"* ]]; then 57 | sleep $sleep_time 58 | else 59 | break 60 | fi 61 | retry=$((retry-1)) 62 | done 63 | if [[ $ret -eq 0 ]]; then 64 | echo "Successfully deleted placement group ${PLACEMENT_GROUP}." 65 | else 66 | echo "Failed to delete placement group ${PLACEMENT_GROUP}." 67 | fi 68 | if [[ $restore_e -eq 1 ]]; then 69 | set -e 70 | fi 71 | return $ret 72 | } 73 | 74 | # Launches EC2 instances. 75 | create_instance() 76 | { 77 | local retry=10 78 | local sleep_time=60 79 | # TODO: the labels need to be fixed in LibfabricCI and the stack 80 | # redeployed for PR testing 81 | # The ami-ids are stored in the SSM parameter store with names 82 | # "/ec2-imagebuilder/${os}-${arch}/latest".
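# As an illustration (mirroring the lookup that run-cdi.sh performs for the
# alinux2 x86_64 image), the latest AMI id for an OS/arch pair can be fetched
# with:
#   aws --region $AWS_DEFAULT_REGION ssm get-parameters \
#       --names "/ec2-imagebuilder/alinux2-x86_64/latest" | jq -r ".Parameters[0].Value"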
83 | if [[ $PULL_REQUEST_REF == *pr* ]]; then 84 | case "${label}" in 85 | rhel) 86 | ami[0]=$(aws --region $AWS_DEFAULT_REGION ssm get-parameters --names "/ec2-imagebuilder/rhel7-x86_64/latest" | jq -r ".Parameters[0].Value") 87 | ;; 88 | ubuntu) 89 | ami[0]=$(aws --region $AWS_DEFAULT_REGION ssm get-parameters --names "/ec2-imagebuilder/ubuntu1804-x86_64/latest" | jq -r ".Parameters[0].Value") 90 | ;; 91 | alinux) 92 | ami[0]=$(aws --region $AWS_DEFAULT_REGION ssm get-parameters --names "/ec2-imagebuilder/alinux2-x86_64/latest" | jq -r ".Parameters[0].Value") 93 | ;; 94 | *) 95 | exit 1 96 | esac 97 | fi 98 | # If a specific subnet ID is provided by the caller, use that instead of 99 | # querying the VPC for all subnets. 100 | if [[ -n ${BUILD_SUBNET_ID} ]]; then 101 | subnet_ids=${BUILD_SUBNET_ID} 102 | else 103 | # Get a list of subnets within the VPC relevant to the Slave SG 104 | vpc_id=$(AWS_DEFAULT_REGION=us-west-2 aws ec2 describe-security-groups \ 105 | --group-ids ${slave_security_group} \ 106 | --query SecurityGroups[0].VpcId --output=text) 107 | subnet_ids=$(AWS_DEFAULT_REGION=us-west-2 aws ec2 describe-subnets \ 108 | --filters "Name=availability-zone,Values=[us-west-2a,us-west-2b,us-west-2c]" \ 109 | "Name=vpc-id,Values=$vpc_id" \ 110 | --query "Subnets[*].SubnetId" --output=text) 111 | fi 112 | 113 | INSTANCE_IDS='' 114 | SERVER_ERROR=( 115 | InsufficientInstanceCapacity 116 | RequestLimitExceeded 117 | ServiceUnavailable 118 | Unavailable 119 | Unsupported 120 | ) 121 | create_instance_count=0 122 | error=1 123 | if [ $ami_arch = "x86_64" ] && [ $BUILD_GDR -eq 0 ]; then 124 | case "${PROVIDER}" in 125 | efa) 126 | instance_type=c5n.18xlarge 127 | network_interface="[{\"DeviceIndex\":0,\"DeleteOnTermination\":true,\"InterfaceType\":\"efa\",\"Groups\":[\"${slave_security_group}\"]" 128 | # Opensuse Leap AMI is not supported on m5n.24xlarge instance 129 | if [[ ${label} == "suse" ]]; then 130 | instance_type=c5n.18xlarge 131 | fi 132 | ;; 133 | tcp|udp|shm) 134 | instance_type=c5.18xlarge 135 | network_interface="[{\"DeviceIndex\":0,\"DeleteOnTermination\":true,\"Groups\":[\"${slave_security_group}\"]" 136 | ;; 137 | *) 138 | exit 1 139 | esac 140 | elif [ $BUILD_GDR -eq 1 ]; then 141 | instance_type=p4d.24xlarge 142 | network_interface="[{\"DeviceIndex\":0,\"DeleteOnTermination\":true,\"InterfaceType\":\"efa\",\"Groups\":[\"${slave_security_group}\"]" 143 | elif [ $ami_arch = "aarch64" ]; then 144 | case "${PROVIDER}" in 145 | efa) 146 | instance_type=c6gn.16xlarge 147 | network_interface="[{\"DeviceIndex\":0,\"DeleteOnTermination\":true,\"InterfaceType\":\"efa\",\"Groups\":[\"${slave_security_group}\"]" 148 | ;; 149 | tcp) 150 | instance_type=${instance_type:-"a1.4xlarge"} 151 | network_interface="[{\"DeviceIndex\":0,\"DeleteOnTermination\":true,\"Groups\":[\"${slave_security_group}\"]" 152 | ;; 153 | *) 154 | exit 1 155 | esac 156 | fi 157 | addl_args="" 158 | if [ ${ENABLE_PLACEMENT_GROUP} -eq 1 ]; then 159 | echo "==> Creating placement group" 160 | create_pg || return 1 161 | addl_args+=" --placement GroupName=${PLACEMENT_GROUP}" 162 | fi 163 | if [[ -n ${USER_DATA_FILE} && -f ${USER_DATA_FILE} ]]; then 164 | addl_args+=" --user-data file://${USER_DATA_FILE}" 165 | fi 166 | # NVIDIA drivers and CUDA toolkit are large, allocate more EBS space for them. 
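# For example, if describe-images reports a root device name of /dev/xvda
# (a hypothetical value for this illustration), the argument built below
# renders as:
#   --block-device-mapping=[{"DeviceName":"/dev/xvda","Ebs":{"VolumeSize":64}}]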
167 | if [ "$ami_arch" = "x86_64" ]; then 168 | dev_name=$(aws ec2 describe-images --image-id ${ami[0]} --query 'Images[*].RootDeviceName' --output text) 169 | addl_args="${addl_args} --block-device-mapping=[{\"DeviceName\":\"${dev_name}\",\"Ebs\":{\"VolumeSize\":64}}]" 170 | fi 171 | # Use capacity reservation if exists 172 | if [ -n "$CapacityReservationId" ]; then 173 | addl_args="${addl_args} --capacity-reservation-specification CapacityReservationPreference=open --capacity-reservation-specification CapacityReservationTarget={CapacityReservationId=${CapacityReservationId}}" 174 | fi 175 | 176 | echo "==> Creating instances" 177 | while [ ${error} -ne 0 ] && [ ${create_instance_count} -lt ${retry} ]; do 178 | for subnet in ${subnet_ids[@]}; do 179 | error=1 180 | set +e 181 | INSTANCE_IDS=$(AWS_DEFAULT_REGION=us-west-2 aws ec2 run-instances \ 182 | --tag-specification "ResourceType=instance,Tags=[{Key=Workspace,Value="${WORKSPACE}"},{Key=Name,Value=Slave},{Key=Build_Number,Value="${BUILD_NUMBER}"}]" \ 183 | --image-id ${ami[0]} \ 184 | --instance-type ${instance_type} \ 185 | --enable-api-termination \ 186 | --key-name ${slave_keypair} \ 187 | --network-interface ${network_interface}",\"SubnetId\":\"${subnet}\"}]" \ 188 | --count ${NODES}:${NODES} \ 189 | --query "Instances[*].InstanceId" \ 190 | --output=text ${addl_args} 2>&1) 191 | create_instance_exit_code=$? 192 | set -e 193 | echo "${INSTANCE_IDS}" 194 | # If run-instances is successful break from both the loops, else 195 | # find out whether the error was due to SERVER_ERROR or some other error 196 | if [ $create_instance_exit_code -ne 0 ]; then 197 | # If the error was due to SERVER_ERROR, set error=1 else for 198 | # some other error set error=0 199 | for code in ${SERVER_ERROR[@]}; do 200 | if [[ "${INSTANCE_IDS}" == *${code}* ]]; then 201 | error=1 202 | break 203 | else 204 | error=0 205 | fi 206 | done 207 | else 208 | break 2 209 | fi 210 | # If run-instances wasn't successful, and it was due to some other 211 | # error, exit and fail the test. 212 | if [ ${error} -eq 0 ]; then 213 | # Mark build as unstable, error code 65 has been used to 214 | # identify unstable build 215 | exit 65 216 | fi 217 | done 218 | sleep ${sleep_time} 219 | create_instance_count=$((create_instance_count+1)) 220 | done 221 | } 222 | 223 | # Get IP address for instances 224 | get_instance_ip() 225 | { 226 | execution_seq=$((${execution_seq}+1)) 227 | local retry=20 228 | local sleep_time=10 229 | local ret=0 230 | local bash_option=$- 231 | local restore_e=0 232 | local get_instance_ip_succeed=0 233 | local instance_ips_array=() 234 | if [[ $bash_option =~ e ]]; then 235 | restore_e=1 236 | set +e 237 | fi 238 | while [ $retry -ge 0 ]; do 239 | INSTANCE_IPS=$(aws ec2 describe-instances --instance-ids ${INSTANCE_IDS[@]} \ 240 | --query "Reservations[*].Instances[*].PrivateIpAddress" \ 241 | --output=text) 242 | ret=$? 243 | instance_ips_array=($INSTANCE_IPS) 244 | if [[ $ret -eq 0 && -n $INSTANCE_IPS && ${#instance_ips_array[@]} -eq ${#INSTANCE_IDS[@]} ]]; then 245 | get_instance_ip_succeed=1 246 | break 247 | else 248 | sleep $sleep_time 249 | fi 250 | retry=$((retry-1)) 251 | done 252 | if [[ $get_instance_ip_succeed -eq 1 ]]; then 253 | echo "Successfully get instance ips: ${INSTANCE_IPS}." 254 | else 255 | echo "Failed to get instance ips, exiting ..." 
256 | exit 1 257 | fi 258 | if [[ $restore_e -eq 1 ]]; then 259 | set -e 260 | fi 261 | } 262 | 263 | # Test SLES15 with unsupported modules allowed 264 | sles_allow_module() 265 | { 266 | cat <<-"EOF" >> ${tmp_script} 267 | if [[ $(grep -Po '(?<=^NAME=).*' /etc/os-release) =~ .*SLES.* ]]; then 268 | sudo cp /lib/modprobe.d/10-unsupported-modules.conf /etc/modprobe.d/ 269 | sudo sed -i 's/allow_unsupported_modules .*/allow_unsupported_modules 1/' /etc/modprobe.d/10-unsupported-modules.conf 270 | line_number=$(grep -n "exit_sles15_efa_unsupported_module" efa_installer.sh | cut -d":" -f1 | tail -n1) 271 | sed -i "${line_number}s/.*/echo \"Allow unsupported modules for testing\"/" efa_installer.sh 272 | fi 273 | EOF 274 | } 275 | # Creates a script, the script includes installation commands for 276 | # different AMIs and appends libfabric script 277 | script_builder() 278 | { 279 | type=$1 280 | set_var 281 | efa_software_components 282 | 283 | # Do not construct script to install libfabric and fabtests for 284 | # centos7-arm and rhel7-arm in EFAInstallerProdCanary and LibfabricMainCanary, 285 | # where provider=tcp and TEST_SKIP_KMOD=1 286 | if ! [ "$PROVIDER" = "tcp" -a "${TEST_SKIP_KMOD}" -eq 1 ]; then 287 | # The libfabric shm provider uses CMA for communication. By default ubuntu 288 | # disallows ptrace of non-child processes, which disables CMA. 289 | # Since libfabric 1.10, shm provider has a fallback solution, which will 290 | # be used when CMA is not available. Therefore, we turn off ptrace protection 291 | # for v1.9.x and v1.8.x. 292 | if [ ${label} == "ubuntu" ]; then 293 | if [ ${TARGET_BRANCH} == "v1.9.x" ] || [ ${TARGET_BRANCH} == "v1.8.x" ]; then 294 | echo "sudo sysctl -w kernel.yama.ptrace_scope=0" >> ${tmp_script} 295 | fi 296 | fi 297 | 298 | if [ -n "$LIBFABRIC_INSTALL_PATH" ]; then 299 | echo "LIBFABRIC_INSTALL_PATH=$LIBFABRIC_INSTALL_PATH" >> ${tmp_script} 300 | elif [ ${TARGET_BRANCH} == "v1.8.x" ]; then 301 | cat install-libfabric-1.8.sh >> ${tmp_script} 302 | else 303 | cat install-libfabric.sh >> ${tmp_script} 304 | fi 305 | 306 | cat install-fabtests.sh >> ${tmp_script} 307 | fi 308 | 309 | if [ $BUILD_GDR -eq 1 ]; then 310 | cat install-nccl.sh >> ${tmp_script} 311 | cat install-aws-ofi-nccl.sh >> ${tmp_script} 312 | cat install-nccl-tests.sh >> ${tmp_script} 313 | fi 314 | } 315 | 316 | # Initialize variables 317 | set_var() 318 | { 319 | cat <<-"EOF" > ${tmp_script} 320 | #!/bin/bash 321 | set -xe 322 | source ~/wget_check.sh 323 | PULL_REQUEST_ID=$1 324 | PULL_REQUEST_REF=$2 325 | PROVIDER=$3 326 | AMI_ARCH=$4 327 | LIBFABRIC_JOB_TYPE=$5 328 | echo "==>Installing OS specific packages" 329 | EOF 330 | } 331 | 332 | # Poll for the SSH daemon to come up before proceeding. 333 | # The SSH poll retries with exponential backoff. 334 | # The initial backoff is 30s, and doubles for each retry, until 16 minutes. 335 | test_ssh() 336 | { 337 | slave_ready=1 338 | ssh_backoff=30 339 | set +xe 340 | echo "Testing SSH connection of instance $1" 341 | while [ $ssh_backoff -le 960 ]; do 342 | sleep ${ssh_backoff}s 343 | ssh -T -o ConnectTimeout=30 -o StrictHostKeyChecking=no -o BatchMode=yes -i ~/${slave_keypair} ${ami[1]}@$1 hostname 344 | if [ $?
-eq 0 ]; then 345 | slave_ready=0 346 | echo "SSH connection of instance $1 is ready" 347 | set -xe 348 | return 0 349 | fi 350 | ssh_backoff=$((ssh_backoff * 2)) 351 | echo "SSH connection of instance $1 NOT ready, sleeping ${ssh_backoff} seconds and retry" 352 | done 353 | echo "The instance $1 failed SSH connection test" 354 | set -xe 355 | return 65 356 | } 357 | 358 | efa_software_components() 359 | { 360 | if [ -z "$EFA_INSTALLER_URL" ]; then 361 | if [ ${TARGET_BRANCH} == "v1.8.x" ]; then 362 | EFA_INSTALLER_URL="https://efa-installer.amazonaws.com/aws-efa-installer-1.7.1.tar.gz" 363 | else 364 | EFA_INSTALLER_URL="https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz" 365 | fi 366 | fi 367 | echo "EFA_INSTALLER_URL=$EFA_INSTALLER_URL" >> ${tmp_script} 368 | cat <<-"EOF" >> ${tmp_script} 369 | wget_check "$EFA_INSTALLER_URL" "efa-installer.tar.gz" 370 | tar -xf efa-installer.tar.gz 371 | cd ${HOME}/aws-efa-installer 372 | EOF 373 | # If we are not skipping the kernel module, then add a check for SLES 374 | if [ ${TEST_SKIP_KMOD} -eq 0 ]; then 375 | sles_allow_module 376 | fi 377 | if [ $TEST_SKIP_KMOD -eq 1 ]; then 378 | echo "sudo ./efa_installer.sh -y -k" >> ${tmp_script} 379 | elif [ $BUILD_GDR -eq 1 ]; then 380 | echo "sudo ./efa_installer.sh -y -g" >> ${tmp_script} 381 | else 382 | echo "sudo ./efa_installer.sh -y" >> ${tmp_script} 383 | fi 384 | echo ". /etc/profile.d/efa.sh" >> ${tmp_script} 385 | } 386 | 387 | # Download the fabtest parser file and modify it locally to show results for 388 | # Excluded files as skipped as well. Currently only Notrun files are displayed 389 | # as skipped 390 | get_rft_yaml_to_junit_xml() 391 | { 392 | pushd ${output_dir} 393 | # fabtests junit parser script 394 | wget_check "https://raw.githubusercontent.com/ofiwg/libfabric/master/fabtests/scripts/rft_yaml_to_junit_xml" "rft_yaml_to_junit_xml" 395 | # Add Excluded tag 396 | sed -i "s,,\n EOT\n when 'Excluded'\n puts <<-EOT\n ,g" rft_yaml_to_junit_xml 397 | sed -i "s,skipped += 1,skipped += 1\n when 'Excluded'\n skipped += 1,g" rft_yaml_to_junit_xml 398 | popd 399 | } 400 | 401 | # Split out output files into fabtest build and fabtests, this is done to 402 | # separate the output. As long as INSTANCE_IPS[0] is used, this can be 403 | # common for both single node and multinode 404 | split_files() 405 | { 406 | pushd ${output_dir} 407 | csplit -k temp_execute_runfabtests.txt '/- name/' 408 | # If the installation failed, fabtests will not have run. In that case, do 409 | # not split the file. 410 | if [ $? 
-ne 0 ]; then 411 | execution_seq=$((${execution_seq}+1)) 412 | mv temp_execute_runfabtests.txt ${execution_seq}_${INSTANCE_IPS[0]}_install_libfabric_or_fabtests_parameters.txt 413 | else 414 | execution_seq=$((${execution_seq}+1)) 415 | mv xx00 ${execution_seq}_${INSTANCE_IPS[0]}_install_libfabric_or_fabtests_parameters.txt 416 | execution_seq=$((${execution_seq}+1)) 417 | mv xx01 ${execution_seq}_${INSTANCE_IPS[0]}_fabtests.txt 418 | fi 419 | rm temp_execute_runfabtests.txt 420 | 421 | execution_seq=$((${execution_seq}+1)) 422 | mv temp_execute_ring_c_ompi.txt ${execution_seq}_${INSTANCE_IPS[0]}_ring_c_ompi.txt 423 | execution_seq=$((${execution_seq}+1)) 424 | mv temp_execute_osu_ompi.txt ${execution_seq}_${INSTANCE_IPS[0]}_osu_ompi.txt 425 | if [ ${RUN_IMPI_TESTS} -eq 1 ]; then 426 | execution_seq=$((${execution_seq}+1)) 427 | mv temp_execute_ring_c_impi.txt ${execution_seq}_${INSTANCE_IPS[0]}_ring_c_impi.txt 428 | execution_seq=$((${execution_seq}+1)) 429 | mv temp_execute_osu_impi.txt ${execution_seq}_${INSTANCE_IPS[0]}_osu_impi.txt 430 | fi 431 | if [ ${BUILD_GDR} -eq 1 ]; then 432 | execution_seq=$((${execution_seq}+1)) 433 | mv temp_execute_nccl_tests.txt ${execution_seq}_${INSTANCE_IPS[0]}_nccl_tests.txt 434 | fi 435 | popd 436 | } 437 | # Parses the output text file to yaml and then runs the rft_yaml_to_junit_xml script 438 | # to generate the junit xml file. Calls the parse_fabtests function for fabtests results. 439 | # For a general text file, commands are assigned yaml '- name' tags; the output of these 440 | # commands is assigned the server_stdout tag 441 | parse_txt_junit_xml() 442 | { 443 | exit_code=$? 444 | set +x 445 | pushd ${output_dir} 446 | get_rft_yaml_to_junit_xml 447 | # Read all .txt files 448 | for file in *.txt; do 449 | if [[ ${file} == '*.txt' ]]; then 450 | continue 451 | fi 452 | # Get instance id or instance ip from the file name 453 | instance_ip_or_id=($(echo ${file} | tr "_" "\n")) 454 | N=${#instance_ip_or_id[@]} 455 | file_name=${file/.txt/} 456 | # Line number to arrange commands sequentially 457 | line_no=1 458 | # If the first line of the file does not have a + (+ indicates command) 459 | # then insert ip/id and + only if it's not empty, this is only for non 460 | # fabtests.txt files 461 | if [[ ${instance_ip_or_id[$(($N-1))]} != 'fabtests.txt' ]]; then 462 | sed -i "1s/\(${instance_ip_or_id[1]} [+]\+ \)*\(.\+\)/${instance_ip_or_id[1]} + \2/g" ${file} 463 | else 464 | parse_fabtests ${file} 465 | continue 466 | fi 467 | while read line; do 468 | # If the line is a command indicated by + sign then assign name tag 469 | # to it, command is the testname used in the xml 470 | if [[ ${line} == *${instance_ip_or_id[1]}' +'* ]]; then 471 | # JUnit doesn't accept quotes, colons, or less-than signs in 472 | # the testname in the xml, convert them to underscores. Parse the 473 | # command to yaml, by inserting - name tag before the command 474 | echo ${line//[\"<:]/_} | sed "s/\(${instance_ip_or_id[1]} [+]\+\)\(.*\)/- name: $(printf '%08d\n' $line_no)-\2\n time: 0\n result:\n server_stdout: |/g" \ 475 | >> ${file_name} 476 | line_no=$((${line_no}+1)) 477 | else 478 | # These are output lines and are put under the server_stdout tag 479 | echo " "${line} >> ${file_name} 480 | fi 481 | done < ${file} 482 | junit_xml ${file_name} 483 | done 484 | popd 485 | set -x 486 | } 487 | 488 | # Parses the fabtests result to xml. One change has been done to accommodate yaml 489 | # file creation if fabtests fails. All output other than name, time, and result will be 490 | # grouped under server_stdout.
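# For illustration (hypothetical IP and test name), an input line such as
#   172.31.0.1 - name: fi_rdm_tagged_pingpong
# is emitted as the yaml entry "- name: fi_rdm_tagged_pingpong", and a
# following "172.31.0.1   time: 0.5" line becomes an indented "time: 0.5"
# followed by a "server_stdout: |" tag, per the sed expressions below.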
491 | parse_fabtests() 492 | { 493 | pushd ${output_dir} 494 | while read line; do 495 | # If the line has '- name:' it indicates it's a fabtests command and is 496 | # already in yaml format with a name tag. That is the testname 497 | # used in the xml 498 | if [[ ${line} == *${instance_ip_or_id[1]}' - 'name:* ]]; then 499 | echo ${line//[\"]/_} | sed "s/\(${instance_ip_or_id[1]} [-] name: \)\(.*\)/- name: \2/g" >> ${file_name} 500 | elif [[ ${line} == *'time: '* ]]; then 501 | echo ${line} | sed "s/\(${instance_ip_or_id[1]}\)\(.*time:.*\)/ \2\n server_stdout: |/g" >> ${file_name} 502 | else 503 | # Yaml spacing for the result tag should be aligned with the name, 504 | # time, server_stdout tags; whereas all others should be under the 505 | # server_stdout tag 506 | echo ${line} | sed "s/\(${instance_ip_or_id[1]}\)\(.*\(result\):.*\)*\(.*\)/ \2 \4/g" >> ${file_name} 507 | fi 508 | line_no=$((${line_no}+1)) 509 | done < $1 510 | junit_xml ${file_name} 511 | popd 512 | } 513 | 514 | # It updates the filename in rft_yaml_to_junit_xml on the fly to the file_name 515 | # which is the function_name. If the file is empty it doesn't call 516 | # rft_yaml_to_junit_xml and instead creates the xml itself 517 | junit_xml() 518 | { 519 | pushd ${output_dir} 520 | file_name=$1 521 | file_name_xml=${file_name//[.-]/_} 522 | # If the yaml file is not empty then convert it to xml using 523 | # rft_yaml_to_junit_xml else create an xml for the empty yaml 524 | if [ -s ${file_name} ]; then 525 | sed -i "s/\(testsuite name=\)\(.*\)\(tests=\)/\1\"${file_name_xml}\" \3/g" rft_yaml_to_junit_xml 526 | # TODO: change this, we should only use this ruby script for fabtests. 527 | ruby rft_yaml_to_junit_xml < ${file_name} > ${file_name_xml}.xml || true 528 | # Check MPI tests for pass/failure and update the xml if a failure 529 | # occurred. 530 | if [[ ${file_name} =~ "ompi" ]] || [[ ${file_name} =~ "impi" ]]; then 531 | if ! grep -q "Test Passed" ${file_name_xml}.xml; then 532 | sed -i 's/failures="0"/failures="1"/' ${file_name_xml}.xml 533 | fi 534 | fi 535 | else 536 | cat <<-EOF > ${file_name_xml}.xml 537 | 538 | 539 | 540 | 541 | EOF 542 | fi 543 | popd 544 | } 545 | 546 | terminate_instances() 547 | { 548 | # Terminates compute nodes 549 | local ret=0 550 | if [[ ! -z ${INSTANCE_IDS[@]} ]]; then 551 | echo "Start terminating instances ${INSTANCE_IDS[@]}." 552 | AWS_DEFAULT_REGION=us-west-2 aws ec2 terminate-instances --instance-ids ${INSTANCE_IDS[@]} 553 | # aws wait instance-terminated will poll every 15 seconds until a successful state has been reached. 554 | # It will exit with a return code of 255 after 40 failed checks, i.e. 10 minutes. Retry this API call 555 | # within $retry times in case some instances are not terminated within 10 minutes. 556 | local retry=5 557 | local bash_option=$- 558 | local restore_e=0 559 | if [[ $bash_option =~ e ]]; then 560 | restore_e=1 561 | set +e 562 | fi 563 | while [[ $retry -ge 0 ]]; do 564 | AWS_DEFAULT_REGION=us-west-2 aws ec2 wait instance-terminated --instance-ids ${INSTANCE_IDS[@]} 565 | ret=$? 566 | if [[ $ret -eq 0 ]]; then 567 | break 568 | fi 569 | retry=$((retry-1)) 570 | done 571 | if [[ $ret -eq 0 ]]; then 572 | echo "Successfully terminated instances ${INSTANCE_IDS[@]}." 573 | else 574 | echo "Failed to terminate instances ${INSTANCE_IDS[@]}." 575 | fi 576 | if [[ $restore_e -eq 1 ]]; then 577 | set -e 578 | fi 579 | fi 580 | return $ret 581 | } 582 | 583 | on_exit() 584 | { 585 | return_code=$?
586 | set +e 587 | # Some of the commands run are background procs, wait for them. 588 | wait 589 | split_files 590 | parse_txt_junit_xml 591 | terminate_instances 592 | delete_pg 593 | return $return_code 594 | } 595 | 596 | exit_status() 597 | { 598 | if [ $1 -ne 0 ]; then 599 | BUILD_CODE=1 600 | echo "Build failure on $2" 601 | else 602 | BUILD_CODE=0 603 | echo "Build success on $2" 604 | fi 605 | } 606 | -------------------------------------------------------------------------------- /efa-check.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | 4 | # Script to verify EFA configuration and assist with debugging. 5 | 6 | source ~/wget_check.sh 7 | VENDOR_ID="0x1d0f" 8 | ami_arch="$(uname -m)" 9 | if [ "${ami_arch}" == "x86_64" ]; then 10 | DEV_ID="0xefa0" 11 | elif [ "${ami_arch}" == "aarch64" ]; then 12 | DEV_ID="0xefa1" 13 | else 14 | echo "Unknown architecture, exiting."; exit 1 15 | fi 16 | usage() { 17 | cat << EOF 18 | usage: $(basename "$0") [options] 19 | 20 | Options: 21 | --skip-libfabric Skip libfabric checks 22 | --skip-mpi Skip MPI checks 23 | EOF 24 | } 25 | 26 | libfabric_checks() { 27 | # Check for libfabric and print the version. 28 | libfabric=$(sudo ldconfig -p | tr -d '\t' | grep '^libfabric.so.1') 29 | if [ $? -ne 0 ]; then 30 | cat >&2 << EOF 31 | Error: libfabric shared library not found. 32 | EOF 33 | return 1 34 | fi 35 | 36 | echo "libfabric in ldcache: $(readlink -m "$(echo "$libfabric" | awk '{print $4}')")" 37 | 38 | if command -v fi_info >/dev/null 2>&1; then 39 | echo "libfabric version:" 40 | fi_info --version 41 | echo "EFA libfabric providers:" 42 | fi_info -p efa 43 | if [ "$efa_gdr_enabled" -eq 1 ]; then 44 | if ! FI_EFA_USE_DEVICE_RDMA=1 fi_info -p efa -c FI_HMEM; then 45 | echo "EFA libfabric provider does not have FI_HMEM capability." 46 | return 1 47 | else 48 | echo "EFA libfabric provider has FI_HMEM capability." 49 | fi 50 | fi 51 | fi 52 | } 53 | 54 | mpi_checks() { 55 | # Print location of mpirun and its version 56 | mpirun=$(command -v mpirun) 57 | if [ $? -ne 0 ]; then 58 | cat >&2 << EOF 59 | Warning: mpirun not found in \$PATH. 60 | EOF 61 | return 1 62 | else 63 | echo "Current mpirun in \$PATH: $mpirun" 64 | $mpirun --version | grep -v "^Report" 65 | fi 66 | } 67 | 68 | ret=0 69 | skip_libfabric=0 70 | skip_mpi=0 71 | 72 | for arg in "$@"; do 73 | case "$arg" in 74 | --skip-libfabric) 75 | skip_libfabric=1 76 | ;; 77 | --skip-mpi) 78 | skip_mpi=1 79 | ;; 80 | -h|--help) 81 | usage 82 | exit 0 83 | ;; 84 | *) 85 | usage 86 | exit 1 87 | ;; 88 | esac 89 | done 90 | 91 | echo "======== Instance / Device check ========" 92 | # Get instance type 93 | if command -v curl >/dev/null 2>&1; then 94 | metadata_url="http://169.254.169.254/latest/meta-data/instance-type" 95 | wget_check "$metadata_url" "instance-type" 96 | instance=$(cat instance-type) 97 | echo "Instance type: ${instance}" 98 | fi 99 | 100 | # Determine if an EFA device is present and print device list. 101 | efa_detected=0 102 | for dev in /sys/class/infiniband/*/device; do 103 | if [ "$(cat "${dev}"/subsystem_vendor)" = "$VENDOR_ID" ] && \ 104 | [ "$(cat "${dev}"/subsystem_device)" = "$DEV_ID" ]; then 105 | efa_detected=1 106 | fi 107 | done 108 | if [ $efa_detected -ne 1 ]; then 109 | cat >&2 << EOF 110 | An EFA device was not detected. Please verify that EFA has been enabled 111 | for your Elastic Network Interface.
112 | EOF
113 |     exit 1
114 | fi
115 |
116 | echo "EFA device detected: "
117 | if command -v ibv_devices >/dev/null 2>&1; then
118 |     ibv_devices
119 | fi
120 |
121 | # Check if GPUDirect RDMA is supported by
122 | # reading the sysfs file "/sys/class/infiniband/<device>/gdr"
123 | efa_gdr_enabled=0
124 | for dev in /sys/class/infiniband/*/device; do
125 |     if [ "$(cat "${dev}"/gdr)" == "1" ]; then
126 |         echo "EFA GPUDirect RDMA support is enabled"
127 |         efa_gdr_enabled=1
128 |     else
129 |         echo "EFA GPUDirect RDMA support is not enabled"
130 |     fi
131 | done
132 |
133 | echo ""
134 | echo "======== Configuration check ========"
135 | # Check for memory lock limit and warn if less than 16GiB. 16GiB is enough for
136 | # bounce buffers for 128 cores with some extra for safety.
137 | if [ "$(ulimit -l)" != 'unlimited' ]; then
138 |     if [ "$(ulimit -l)" -lt "$((16*1024*1024))" ]; then
139 |         cat >&2 << EOF
140 | Warning: EFA requires memory locking and the current limit may be too low for
141 | your application.
142 | EOF
143 |         ret=1
144 |     fi
145 | fi
146 | echo "Current memory lock limit: $(ulimit -l)"
147 |
148 | huge_pages_size=$(grep "^Hugepagesize:" /proc/meminfo | awk '{print $2}')
149 | huge_pages_file="/sys/kernel/mm/hugepages/hugepages-${huge_pages_size}kB/nr_hugepages"
150 | hugepages=$(cat $huge_pages_file)
151 | efa_ep_huge_pages_memory=$((110 * 1024)) # convert to kB
152 | number_of_cores=$(lscpu | grep "^CPU(s):" | awk '{print $2}')
153 | efa_total_huge_pages_memory=$(($efa_ep_huge_pages_memory * $number_of_cores))
154 | efa_number_of_huge_pages=$(($efa_total_huge_pages_memory / $huge_pages_size + 1))
155 | # For each endpoint, the libfabric EFA provider creates two packet pools
156 | # backed by huge page memory; together they use 110 MB of
157 | # memory. We need to reserve at least cores * 110 MB worth of memory in huge
158 | # pages.
159 | # RHEL 8 on ARM has a different huge page configuration than other
160 | # OSes and defaults to a 512 MB huge page size. This might
161 | # 1. massively over-reserve huge pages, thus run the machine effectively OOM
162 | # 2. not reserve sufficient pages for a process per core worth of huge pages
163 | # To deal with this issue, the EFA-Config package sets a huge page size threshold of 16 MB (16384 kB).
164 | # If the default huge page size is larger than the threshold, huge pages are not used.
165 | huge_page_size_threshold=16384
166 | if [[ $huge_pages_size -le $huge_page_size_threshold && $hugepages -lt $efa_number_of_huge_pages ]]; then
167 |     cat >&2 << EOF
168 | Warning: Configuring huge pages is recommended for the best performance with
169 | EFA.
170 | EOF
171 |     ret=1
172 | fi
173 | echo "Current number of $huge_pages_size kB huge pages: $hugepages"
174 |
175 | echo ""
176 | echo "======== Software information ========"
177 | echo "Kernel version: $(uname -r)"
178 | # Verify that the EFA kernel driver and its dependencies are loaded.
179 | if [ "$(grep -c -E '^ib_uverbs|^ib_core' /proc/modules)" -ne 2 ]; then
180 |     cat >&2 << EOF
181 | Error: The ib_uverbs and ib_core kernel modules are required for the EFA kernel
182 | module to be loaded.
183 | EOF
184 |     exit 1
185 | fi
186 | echo "ib_uverbs and ib_core kernel modules are loaded"
187 |
188 | if ! grep -q '^efa' /proc/modules; then
189 |     cat >&2 << EOF
190 | Error: The EFA kernel module is not loaded. Please verify that the EFA kernel
191 | module is provided with the kernel or is installed using DKMS.
192 | EOF
193 |     exit 1
194 | fi
195 |
196 | if grep -q '^nvidia' /proc/modules; then
197 |     echo "NVIDIA kernel module is loaded, version: $(sudo modinfo -F version nvidia)"
198 | else
199 |     echo "NVIDIA kernel module is not loaded"
200 | fi
201 | echo "EFA kernel module is loaded, version: $(sudo modinfo -F version efa)"
202 |
203 | # Check for rdma-core and print the version.
204 | libibverbs=$(sudo ldconfig -p | tr -d '\t' | grep '^libibverbs.so.1')
205 | if [ $? -ne 0 ]; then
206 |     cat >&2 << EOF
207 | Error: libibverbs shared library not found and is required for the EFA
208 | libfabric provider.
209 | EOF
210 |     exit 1
211 | fi
212 |
213 | echo "libibverbs in ldcache: $(readlink -m "$(echo "$libibverbs" | awk '{print $4}')")"
214 |
215 | libefa=$(sudo ldconfig -p | tr -d '\t' | grep '^libefa.so.1')
216 | if [ $? -ne 0 ]; then
217 |     cat >&2 << EOF
218 | Error: libefa shared library not found and is required for the EFA
219 | libfabric provider.
220 | EOF
221 |     exit 1
222 | fi
223 |
224 | echo "libefa in ldcache: $(readlink -m "$(echo "$libefa" | awk '{print $4}')")"
225 |
226 | if [ $skip_libfabric -eq 0 ]; then
227 |     if ! libfabric_checks; then
228 |         ret=1
229 |     fi
230 | fi
231 |
232 | if [ $skip_mpi -eq 0 ]; then
233 |     if ! mpi_checks; then
234 |         ret=1
235 |     fi
236 | fi
237 |
238 | echo ""
239 | if [ $ret -ne 0 ]; then
240 |     echo "EFA check complete, please see output for warnings."
241 |     exit $ret
242 | fi
243 |
244 | echo "EFA check complete."
245 | exit 0
246 |
--------------------------------------------------------------------------------
/fork_checker.c:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include <infiniband/verbs.h>
3 |
4 | /*
5 |  * Check whether fork support is enabled on the instance by querying the rdma-core interface.
6 |  */
7 | int main()
8 | {
9 |     if (IBV_FORK_UNNEEDED != ibv_is_fork_initialized()) {
10 |         fprintf(stderr, "Kernel space fork support is not enabled \n");
11 |         return -1;
12 |     }
13 |
14 |     fprintf(stderr, "Kernel space fork support is enabled \n");
15 |     return 0;
16 | }
17 |
--------------------------------------------------------------------------------
/install-aws-ofi-nccl.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | AWS_OFI_NCCL_BRANCH="aws"
4 | cd $HOME
5 | git clone -b ${AWS_OFI_NCCL_BRANCH} https://github.com/aws/aws-ofi-nccl.git
6 | pushd aws-ofi-nccl
7 | echo "== aws-ofi-nccl commit info =="
8 | git log -1
9 | ./autogen.sh
10 | ./configure --prefix $HOME/aws-ofi-nccl/install \
11 |     --with-libfabric=$LIBFABRIC_INSTALL_PATH \
12 |     --with-cuda=/usr/local/cuda \
13 |     --with-nccl=$HOME/nccl/build \
14 |     --with-mpi=/opt/amazon/openmpi
15 | make
16 | make install
17 | popd
18 |
19 | echo "export LD_LIBRARY_PATH=$HOME/aws-ofi-nccl/install/lib/:\$LD_LIBRARY_PATH" >> ~/.bash_profile
20 | echo "export LD_LIBRARY_PATH=$HOME/aws-ofi-nccl/install/lib/:\$LD_LIBRARY_PATH" >> ~/.bashrc
--------------------------------------------------------------------------------
/install-fabtests.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | echo "==> Building fabtests"
4 | cd ${HOME}
5 | fi_info_bin=${LIBFABRIC_INSTALL_PATH}/bin/fi_info
6 | if [ ! -x ${fi_info_bin} ]; then
7 |     echo "fi_info not detected, exiting"
8 |     exit 1
9 | fi
10 | if [ ! -d libfabric ]; then
11 |     # Checkout libfabric bugfix branch so that fabtests is compatible with the
12 |     # installed version of libfabric.
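    # For example, if the installed fi_info reports "libfabric api: 1.15",
    # the v1.15.x branch is checked out below.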
13 |     git clone https://github.com/ofiwg/libfabric
14 |     ofi_ver=$(${fi_info_bin} --version | grep 'libfabric api' | awk '{print $3}')
15 |     if [ "${ofi_ver}" != "1.16" ]; then
16 |         pushd libfabric
17 |         git checkout "v${ofi_ver}.x"
18 |         popd
19 |     fi
20 | fi
21 | if [ ! -z "${target_fabtest_tag}" ]; then
22 |     cd ${HOME}/libfabric
23 |     git checkout tags/${target_fabtest_tag}
24 | fi
25 | cd ${HOME}/libfabric/fabtests
26 | ./autogen.sh
27 | configure_flags=(
28 |     --with-libfabric=${LIBFABRIC_INSTALL_PATH} \
29 |     --prefix=${HOME}/libfabric/fabtests/install/ \
30 |     --enable-debug
31 | )
32 | # Build fabtests with cuda on x86_64 platform only.
33 | if [ "$(uname -m)" == "x86_64" ]; then
34 |     configure_flags+=(--with-cuda=/usr/local/cuda)
35 | fi
36 |
37 | ./configure "${configure_flags[@]}"
38 | make -j 4
39 | make install
40 |
41 | # The exclude file filters tests when the full fabtests suite is run between two nodes.
42 | EXCLUDE=${HOME}/libfabric/fabtests/install/share/fabtests/test_configs/${PROVIDER}/${PROVIDER}.exclude
43 | if [ "${PROVIDER}" == "efa" ]; then
44 |     if [ ! -f ${EXCLUDE} ]; then
45 |         echo "exclude file for efa does not exist! Exiting ..."
46 |         exit 1
47 |     fi
48 |
49 |     # The fi_dgram_pingpong test assumes packet delivery, which the dgram
50 |     # endpoint does not guarantee. As a result, this test only works with out-of-
51 |     # band synchronization (-b option). However, runfabtests.sh runs all
52 |     # the tests with in-band synchronization (-E option).
53 |     # Therefore we exclude it from runfabtests.sh and run this test
54 |     # separately with the -b option in multinode_runfabtests.sh
55 |     echo "# skip dgram_pingpong test" >> ${EXCLUDE}
56 |     echo "dgram_pingpong" >> ${EXCLUDE}
57 |     echo "" >> ${EXCLUDE}
58 |
59 |     if [ "${AMI_ARCH}" == "aarch64" ]; then
60 |         # Temporarily exclude the fi_rdm test on c6gn to work around a firmware issue.
61 |         # We cannot simply add rdm into the exclude file because that would exclude
62 |         # all fi_rdm* tests.
63 |         sed -i '/\"fi_rdm\"/d' ${HOME}/libfabric/fabtests/install/bin/runfabtests.sh
64 |         if [ ${LIBFABRIC_JOB_TYPE} == "master" ]; then
65 |             # temporarily exclude fi_rdm_multi_client test
66 |             echo "# skip rdm_multi_client test" >> ${EXCLUDE}
67 |             echo "rdm_multi_client" >> ${EXCLUDE}
68 |             echo "" >> ${EXCLUDE}
69 |         fi
70 |     fi
71 | fi
72 | # .bashrc and .bash_profile are loaded differently depending on distro and
73 | # whether the shell is interactive or not, just do both to be safe.
74 | echo "export LD_LIBRARY_PATH=${LIBFABRIC_INSTALL_PATH}/lib/:\$LD_LIBRARY_PATH" >> ~/.bash_profile 75 | echo "export LD_LIBRARY_PATH=${LIBFABRIC_INSTALL_PATH}/lib/:\$LD_LIBRARY_PATH" >> ~/.bashrc 76 | echo "export PATH=${HOME}/libfabric/fabtests/install/bin:\$PATH" >> ~/.bash_profile 77 | echo "export PATH=${HOME}/libfabric/fabtests/install/bin:\$PATH" >> ~/.bashrc 78 | -------------------------------------------------------------------------------- /install-libfabric-1.8.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | echo "==> Building libfabric 1.8.x" 4 | cd ${HOME} 5 | git clone https://github.com/ofiwg/libfabric 6 | cd ${HOME}/libfabric 7 | git fetch origin +refs/pull/$PULL_REQUEST_ID/*:refs/remotes/origin/pr/$PULL_REQUEST_ID/* 8 | git checkout $PULL_REQUEST_REF -b PRBranch 9 | ./autogen.sh 10 | ./configure --prefix=${HOME}/libfabric/install/ \ 11 | --enable-debug \ 12 | --enable-mrail \ 13 | --enable-tcp \ 14 | --enable-rxm \ 15 | --disable-rxd \ 16 | --disable-verbs 17 | make -j 4 18 | make install 19 | LIBFABRIC_INSTALL_PATH=${HOME}/libfabric/install 20 | # ld.so.conf.d files are preferred in alphabetical order 21 | # this doesn't seem to be working for non-interactive shells 22 | sudo bash -c "echo ${LIBFABRIC_INSTALL_PATH} > /etc/ld.so.conf.d/aaaa-libfabric-testing.sh" 23 | -------------------------------------------------------------------------------- /install-libfabric.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | echo "==> Building libfabric" 4 | # Pulls the libfabric repository and checks out the pull request commit 5 | cd ${HOME} 6 | git clone https://github.com/ofiwg/libfabric 7 | cd ${HOME}/libfabric 8 | if [ ! "$PULL_REQUEST_ID" = "None" ]; then 9 | git fetch origin +refs/pull/$PULL_REQUEST_ID/*:refs/remotes/origin/pr/$PULL_REQUEST_ID/* 10 | git checkout $PULL_REQUEST_REF -b PRBranch 11 | fi 12 | ./autogen.sh 13 | configure_flags=(--prefix=${HOME}/libfabric/install/ \ 14 | --enable-debug \ 15 | --enable-mrail \ 16 | --enable-tcp \ 17 | --enable-rxm \ 18 | --disable-rxd \ 19 | --disable-verbs \ 20 | --enable-efa ) 21 | # Build libfabric with cuda on x86_64 platform only. 
22 | if [ "$(uname -m)" == "x86_64" ]; then 23 | with_cuda_option_available="$(./configure -h 2>&1 | grep '\-\-with\-cuda' || true)" 24 | enable_cuda_dlopen_option_available="$(./configure -h 2>&1 | grep '\-\-enable\-cuda\-dlopen' || true)" 25 | if [[ -n "$with_cuda_option_available" && -n "$enable_cuda_dlopen_option_available" ]]; then 26 | configure_flags+=(--with-cuda=/usr/local/cuda --enable-cuda-dlopen) 27 | fi 28 | fi 29 | ./configure "${configure_flags[@]}" 30 | make -j 4 31 | make install 32 | LIBFABRIC_INSTALL_PATH=${HOME}/libfabric/install 33 | # ld.so.conf.d files are preferred in alphabetical order 34 | # this doesn't seem to be working for non-interactive shells 35 | sudo bash -c "echo ${LIBFABRIC_INSTALL_PATH} > /etc/ld.so.conf.d/aaaa-libfabric-testing.sh" 36 | -------------------------------------------------------------------------------- /install-nccl-tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | NCCL_TESTS_VERSION="v2.0.0" 4 | cd $HOME 5 | git clone -b "$NCCL_TESTS_VERSION" https://github.com/NVIDIA/nccl-tests 6 | pushd nccl-tests 7 | # TODO: We need to apply the patch in commit https://github.com/NVIDIA/nccl-tests/commit/0f173234bb2837327d806e9e4de9af3dda9a7043 8 | # to add the LD_LIBRARY_PATH of openmpi shipped in efa installer (ended as lib64 on fedora distros). This commit is merged 9 | # in nccl-tests's main branch but not in any stable release. Update the version number when this fix is taken in and remove 10 | # this patch line. 11 | sed -i s/'NVLDFLAGS += -L$(MPI_HOME)\/lib -lmpi'/'NVLDFLAGS += -L$(MPI_HOME)\/lib -L$(MPI_HOME)\/lib64 -lmpi'/ src/Makefile 12 | make MPI=1 MPI_HOME=/opt/amazon/openmpi NCCL_HOME=$HOME/nccl/build NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80" 13 | popd 14 | -------------------------------------------------------------------------------- /install-nccl.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | NCCL_VERSION="v2.8.3-1" 3 | cd $HOME 4 | git clone -b ${NCCL_VERSION} https://github.com/NVIDIA/nccl.git 5 | pushd nccl 6 | make -j src.build CUDA_HOME=/usr/local/cuda NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80" 7 | popd 8 | 9 | echo "export LD_LIBRARY_PATH=$HOME/nccl/build/lib/:\$LD_LIBRARY_PATH" >> ~/.bash_profile 10 | echo "export LD_LIBRARY_PATH=$HOME/nccl/build/lib/:\$LD_LIBRARY_PATH" >> ~/.bashrc 11 | -------------------------------------------------------------------------------- /jenkins-ami/packer-jenkins.json: -------------------------------------------------------------------------------- 1 | { 2 | "builders": [{ 3 | "type": "amazon-ebs", 4 | "name" : "AmazonLinux17.09", 5 | "region": "us-west-2", 6 | "source_ami": "ami-f2d3638a", 7 | "instance_type": "t2.micro", 8 | "ssh_username": "ec2-user", 9 | "ami_name": "Jenkins Amazon Linux 17.09 {{timestamp}}", 10 | "ena_support": true, 11 | "associate_public_ip_address" : true 12 | },{ 13 | "type": "amazon-ebs", 14 | "name" : "Ubuntu16.04", 15 | "region": "us-west-2", 16 | "source_ami": "ami-79873901", 17 | "instance_type": "t2.micro", 18 | "ssh_username": "ubuntu", 19 | "ami_name": "Jenkins Ubuntu 16.04 {{timestamp}}", 20 | "ena_support": true, 21 | "associate_public_ip_address" : true 22 | },{ 23 | "type": "amazon-ebs", 24 | "name" : "RHEL7.4", 25 | "region": "us-west-2", 26 | "source_ami": "ami-223f945a", 27 | "instance_type": "t2.micro", 28 | "ssh_username": "ec2-user", 29 | "ami_name": "Jenkins RHEL 7.4 {{timestamp}}", 30 | "ena_support": true, 31 | 
"associate_public_ip_address" : true 32 | }], 33 | "provisioners": [{ 34 | "type": "shell", 35 | "script" : "prepare-ami.sh" 36 | }] 37 | } 38 | -------------------------------------------------------------------------------- /jenkins-ami/prepare-ami.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | set -e 4 | 5 | eval "PLATFORM_ID=`sed -n 's/^ID=//p' /etc/os-release`" 6 | eval "VERSION_ID=`sed -n 's/^VERSION_ID=//p' /etc/os-release`" 7 | 8 | echo "==> Platform: $PLATFORM_ID" 9 | echo "==> Version: $VERSION_ID" 10 | 11 | echo "==> Installing packages" 12 | 13 | case $PLATFORM_ID in 14 | rhel) 15 | sudo yum -y install deltarpm 16 | sudo yum -y update 17 | sudo yum -y groupinstall "Development Tools" 18 | sudo yum -y install libevent-devel java-1.8.0-openjdk-devel java-1.8.0-openjdk gdb 19 | sudo yum -y install wget 20 | sudo yum -y install https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm 21 | sudo yum -y install python-pip 22 | sudo pip install --upgrade pip 23 | sudo pip install awscli 24 | ;; 25 | amzn) 26 | sudo yum -y update 27 | sudo yum -y groupinstall "Development Tools" 28 | sudo yum -y groupinstall "Java Development" 29 | sudo yum -y install libevent-devel java-1.8.0-openjdk-devel java-1.8.0-openjdk gdb 30 | sudo update-alternatives --set java /usr/lib/jvm/jre-1.8.0-openjdk.x86_64/bin/java 31 | sudo pip install awscli 32 | ;; 33 | ubuntu) 34 | sudo apt-get update 35 | sudo apt -y install openjdk-8-jre-headless 36 | sudo apt -y install python 37 | sudo apt -y install autoconf 38 | sudo apt -y install libltdl-dev 39 | sudo apt -y install make 40 | sudo apt -y install python-pip 41 | sudo pip install --upgrade pip 42 | sudo pip install awscli 43 | ;; 44 | *) 45 | echo "ERROR: Unkonwn platform ${PLATFORM_ID}" 46 | exit 1 47 | esac 48 | 49 | echo "==> Cleaning instance" 50 | sudo rm -rf /tmp/* /var/tmp/* /var/log/* /etc/ssh/ssh_host* 51 | sudo rm -rf /root/* /root/.ssh /root/.history /root/.bash_history 52 | sudo rm -rf ~/* ~/.history ~/.bash_history ~/.cache 53 | 54 | echo "==> Generating key" 55 | ssh-keygen -f $HOME/.ssh/id_rsa -N "" 56 | cat $HOME/.ssh/id_rsa.pub > $HOME/.ssh/authorized_keys 57 | -------------------------------------------------------------------------------- /mpi_common.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # MPI helper shell functions 4 | function check_efa_ompi { 5 | out=$1 6 | if ! grep -q "mtl:ofi:prov: efa" $out; then 7 | echo "efa provider not used with Open MPI" 8 | exit 1 9 | fi 10 | } 11 | 12 | function check_efa_impi { 13 | out=$1 14 | if ! grep -q "libfabric provider: efa" $out; then 15 | echo "efa provider not used with Intel MPI" 16 | exit 1 17 | fi 18 | } 19 | 20 | function ompi_setup { 21 | provider=$1 22 | . /etc/profile.d/efa.sh 23 | if [ $provider = "efa" ]; then 24 | # Get the mtl:ofi:prov information in verbose output 25 | export OMPI_MCA_opal_common_ofi_verbose=1 26 | else 27 | # Get btl base verbose output for component 28 | export OMPI_MCA_btl_base_verbose=10 29 | fi 30 | # Pass LD_LIBRARY_PATH arg so that we use the right libfabric. Ubuntu 31 | # doesn't load .bashrc/.bash_profile for non-interactive shells. 32 | export MPI_ARGS="-x LD_LIBRARY_PATH" 33 | if [ $provider = "efa" ]; then 34 | # Only load the OFI component in MTL so MPI will fail if it cannot 35 | # be used. 
36 | export MPI_ARGS="$MPI_ARGS --mca pml cm --mca mtl ofi" 37 | 38 | # We have to disable the OpenIB BTL to avoid the call to ibv_fork_init 39 | # EFA installer 1.10.0 (and above) ships open mpi that does not have openib btl 40 | # enabled, therefore does not need the extra mca parameter 41 | cur_version=$(head -n 1 /opt/amazon/efa_installed_packages | awk '{print $5}') 42 | min_version=$(echo -e "$cur_version\n1.10.0" | sort --version-sort | head -n 1) 43 | if [ $min_version != "1.10.0" ]; then 44 | MPI_ARGS="$MPI_ARGS --mca btl ^openib" 45 | fi 46 | else 47 | # Only load the TCP component in BTL so MPI will fail if it cannot be used. 48 | export MPI_ARGS="$MPI_ARGS --mca pml ob1 --mca btl tcp,self" 49 | fi 50 | export MPIEXEC_TIMEOUT=1800 51 | } 52 | 53 | function impi_setup { 54 | LIBFABRIC_JOB_TYPE=$1 55 | if [ "$LIBFABRIC_JOB_TYPE" = "master" ]; then 56 | source /opt/intel/oneapi/mpi/latest/env/vars.sh -i_mpi_ofi_internal=0 57 | export LD_LIBRARY_PATH=${HOME}/libfabric/install/lib/:$LD_LIBRARY_PATH 58 | else 59 | # Use Intel MPI's internal libfabric (-i_mpi_ofi_internal=1 by default) 60 | source /opt/intel/oneapi/mpi/latest/env/vars.sh 61 | fi 62 | export I_MPI_DEBUG=1 63 | export MPI_ARGS="" 64 | export MPIEXEC_TIMEOUT=1800 65 | } 66 | 67 | function host_setup { 68 | hostfile=$1 69 | shift 70 | hosts=$@ 71 | for host in $hosts; do 72 | ssh-keyscan $host >> ~/.ssh/known_hosts 73 | echo $host >> $hostfile 74 | done 75 | 76 | export cpus=$(grep -c ^processor /proc/cpuinfo) 77 | export threads=$(lscpu | grep '^Thread(s) per core:' | awk '{ print $4 }') 78 | export ranks=$(( $cpus / $threads )) 79 | # Avoid non-interactive shell PATH issues on Ubuntu with MPI by using full 80 | # path, so it can find orted. 81 | export mpirun_cmd="$(which mpirun) $MPI_ARGS" 82 | } 83 | -------------------------------------------------------------------------------- /mpi_osu_test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | source ~/.bash_profile 4 | source ~/mpi_common.sh 5 | source ~/wget_check.sh 6 | set -x 7 | set -o pipefail 8 | mpi=$1 9 | shift 10 | libfabric_job_type=$1 11 | shift 12 | provider=$1 13 | shift 14 | hosts=$@ 15 | hostfile=$(mktemp) 16 | out=$(mktemp) 17 | 18 | if [ -f /softwares/osu-micro-benchmarks-5.6.tar.gz ]; then 19 | cp /softwares/osu-micro-benchmarks-5.6.tar.gz . 20 | osu_dir="osu-micro-benchmarks-5.6" 21 | else 22 | wget_check "http://mvapich.cse.ohio-state.edu/download/mvapich/osu-micro-benchmarks-5.6.2.tar.gz" "osu-micro-benchmarks-5.6.2.tar.gz" 23 | osu_dir="osu-micro-benchmarks-5.6.2" 24 | fi 25 | 26 | one_rank_per_node="" 27 | if [ "${mpi}" == "ompi" ]; then 28 | ompi_setup "${provider}" 29 | one_rank_per_node="-N 1" 30 | elif [ "${mpi}" == "impi" ]; then 31 | impi_setup "${libfabric_job_type}" 32 | one_rank_per_node="-ppn 1" 33 | else 34 | echo "unknown mpi type" 35 | exit 1 36 | fi 37 | 38 | host_setup ${hostfile} ${hosts} 39 | tar -xvf ${osu_dir}.tar.gz 40 | mv ${osu_dir} osu-micro-benchmarks-${mpi} 41 | osu_dir="osu-micro-benchmarks-${mpi}" 42 | pushd ${osu_dir} 43 | ./configure CC=mpicc CXX=mpicxx 44 | make -j 45 | popd 46 | for host in $hosts; do 47 | scp -r ${osu_dir} $host:/tmp 48 | done 49 | 50 | echo "$mpirun_cmd --version" 51 | $mpirun_cmd --version 52 | 53 | # TODO: split this output so that it shows up as three separate tests in the xml output 54 | $mpirun_cmd -n 2 ${one_rank_per_node} -hostfile $hostfile /tmp/${osu_dir}/mpi/pt2pt/osu_latency 2>&1 | tee $out 55 | if [ $? 
-ne 0 ]; then 56 | echo "osu_latency failed" 57 | exit 1 58 | fi 59 | 60 | if [ "${mpi}" == "ompi" ] && [ "$provider" == "efa" ]; then 61 | check_efa_ompi $out 62 | elif [ "${mpi}" == "impi" ] && [ "$provider" == "efa" ]; then 63 | check_efa_impi $out 64 | fi 65 | 66 | $mpirun_cmd -n 2 ${one_rank_per_node} -hostfile $hostfile /tmp/${osu_dir}/mpi/pt2pt/osu_bw 2>&1 | tee $out 67 | if [ $? -ne 0 ]; then 68 | echo "osu_bw failed" 69 | exit 1 70 | fi 71 | 72 | if [ "${mpi}" == "ompi" ] && [ "$provider" == "efa" ]; then 73 | check_efa_ompi $out 74 | elif [ "${mpi}" == "impi" ] && [ "$provider" == "efa" ]; then 75 | check_efa_impi $out 76 | fi 77 | 78 | # osu_mbw_mr test takes more than 30 min when running with tcp provider. 79 | # Increase its timeout limit to 3600 secs. 80 | MPIEXEC_TIMEOUT_ORIG=${MPIEXEC_TIMEOUT} 81 | if [ "$provider" = "tcp" ]; then 82 | MPIEXEC_TIMEOUT=3600 83 | fi 84 | $mpirun_cmd -n $(( $ranks * $# )) -hostfile $hostfile /tmp/${osu_dir}/mpi/pt2pt/osu_mbw_mr 2>&1 | tee $out 85 | if [ $? -ne 0 ]; then 86 | echo "osu_mbw_mr failed" 87 | exit 1 88 | fi 89 | MPIEXEC_TIMEOUT=${MPIEXEC_TIMEOUT_ORIG} 90 | 91 | if [ "${mpi}" == "ompi" ] && [ "$provider" == "efa" ]; then 92 | check_efa_ompi $out 93 | elif [ "${mpi}" == "impi" ] && [ "$provider" == "efa" ]; then 94 | check_efa_impi $out 95 | fi 96 | 97 | echo "Test Passed" 98 | -------------------------------------------------------------------------------- /mpi_ring_c_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source ~/.bash_profile 4 | source ~/mpi_common.sh 5 | source ~/wget_check.sh 6 | 7 | set -x 8 | set -o pipefail 9 | mpi=$1 10 | shift 11 | libfabric_job_type=$1 12 | shift 13 | provider=$1 14 | shift 15 | hosts=$@ 16 | hostfile=$(mktemp) 17 | out1=$(mktemp) 18 | out2=$(mktemp) 19 | 20 | wget_check "https://raw.githubusercontent.com/open-mpi/ompi/master/examples/ring_c.c" "ring_c.c" 21 | wget_check "https://raw.githubusercontent.com/open-mpi/ompi/master/examples/ring_usempi.f90" "ring_usempi.f90" 22 | 23 | if [ "${mpi}" == "ompi" ]; then 24 | ompi_setup "${provider}" 25 | elif [ "${mpi}" == "impi" ]; then 26 | impi_setup "${libfabric_job_type}" 27 | else 28 | echo "unknown mpi type" 29 | exit 1 30 | fi 31 | 32 | host_setup ${hostfile} ${hosts} 33 | mpicc -o /tmp/ring_c ring_c.c 34 | # Fortran compile test 35 | mpif90 -o /tmp/ring_fortran ring_usempi.f90 36 | for host in $hosts; do 37 | scp /tmp/ring_c $host:/tmp 38 | scp /tmp/ring_fortran $host:/tmp 39 | done 40 | 41 | $mpirun_cmd --version 42 | $mpirun_cmd -n $(( $ranks * $# )) -hostfile $hostfile /tmp/ring_c 2>&1 | tee $out1 43 | if [ $? -ne 0 ] || ! grep -q "Process 0 exiting" $out1; then 44 | echo "mpirun ring_c failed" 45 | exit 1 46 | fi 47 | 48 | $mpirun_cmd -n $(( $ranks * $# )) -hostfile $hostfile /tmp/ring_fortran 2>&1 | tee $out2 49 | if [ $? -ne 0 ] || ! 
grep -q "Process 0 exiting" $out2; then 50 | echo "mpirun ring_fortran (f90) failed" 51 | exit 1 52 | fi 53 | 54 | if [ "${mpi}" == "ompi" ] && [ "$provider" == "efa" ]; then 55 | check_efa_ompi $out1 56 | check_efa_ompi $out2 57 | elif [ "${mpi}" == "impi" ] && [ "$provider" == "efa" ]; then 58 | check_efa_impi $out1 59 | check_efa_impi $out2 60 | fi 61 | 62 | echo "Test Passed" 63 | -------------------------------------------------------------------------------- /multi-node-efa-minimal.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script test the minimal installation mode of the efa installer 4 | # The minimal installation mode will not install libfabric and openmpi 5 | # and will install EFA kernel module and rdma-core 6 | # The goal is to provide a minimal environment for intel MPI U6 and above 7 | # to work, which comes with a copy of libfabric 8 | # The libfabric comes with Intel MPI does not have headers, therefore 9 | # fabtests cannot be compiled against it and we are skipping fabtests 10 | # here and only test Intel MPI 11 | 12 | set -xe 13 | source $WORKSPACE/libfabric-ci-scripts/common.sh 14 | trap 'on_exit' EXIT 15 | slave_name=slave_$label 16 | slave_value=${!slave_name} 17 | ami=($slave_value) 18 | NODES=2 19 | export MINIMAL=1 20 | export RUN_IMPI_TESTS=1 21 | # Use Intel MPI's internal libfabric library. 22 | export LIBFABRIC_INSTALL_PATH=/opt/intel/oneapi/mpi/latest/libfabric/lib/ 23 | 24 | efa_software_components_minimal() 25 | { 26 | if [ -z "$EFA_INSTALLER_URL" ]; then 27 | EFA_INSTALLER_URL="https://s3-us-west-2.amazonaws.com/aws-efa-installer/aws-efa-installer-latest.tar.gz" 28 | fi 29 | echo "EFA_INSTALLER_URL=$EFA_INSTALLER_URL" >> ${tmp_script} 30 | cat <<-"EOF" >> ${tmp_script} 31 | wget_check "$EFA_INSTALLER_URL" "efa-installer.tar.gz" 32 | tar -xf efa-installer.tar.gz 33 | cd ${HOME}/aws-efa-installer 34 | EOF 35 | # If we are not skipping the kernel module, then add a check for SLES 36 | if [ ${TEST_SKIP_KMOD} -eq 0 ]; then 37 | sles_allow_module 38 | fi 39 | if [[ $TEST_SKIP_KMOD -eq 1 ]]; then 40 | echo "sudo ./efa_installer.sh -k -m -y" >> ${tmp_script} 41 | elif [ ${BUILD_GDR} -eq 1 ]; then 42 | echo "sudo ./efa_installer.sh -g -m -y" >> ${tmp_script} 43 | else 44 | echo "sudo ./efa_installer.sh -m -y" >> ${tmp_script} 45 | fi 46 | } 47 | 48 | multi_node_efa_minimal_script_builder() 49 | { 50 | type=$1 51 | set_var 52 | efa_software_components_minimal 53 | 54 | # Ubuntu disallows non-child process ptrace by default, which is 55 | # required for the use of CMA in the shared-memory codepath. 56 | if [ ${PROVIDER} == "efa" ] && [ ${label} == "ubuntu" ];then 57 | echo "sudo sysctl -w kernel.yama.ptrace_scope=0" >> ${tmp_script} 58 | fi 59 | } 60 | 61 | # Test whether the instance is ready for SSH or not. 
62 | # copy SSH keys from Jenkins and install libfabric
63 | install_libfabric()
64 | {
65 |     test_ssh "$1"
66 |     set +x
67 |     scp -o ConnectTimeout=30 -o StrictHostKeyChecking=no -i ~/${slave_keypair} $WORKSPACE/libfabric-ci-scripts/fabtests_${slave_keypair} ${ami[1]}@$1:~/.ssh/id_rsa
68 |     scp -o ConnectTimeout=30 -o StrictHostKeyChecking=no -i ~/${slave_keypair} $WORKSPACE/libfabric-ci-scripts/fabtests_${slave_keypair}.pub ${ami[1]}@$1:~/.ssh/id_rsa.pub
69 |     execution_seq=$((${execution_seq}+1))
70 |     (ssh -o ConnectTimeout=30 -o StrictHostKeyChecking=no -T -i ~/${slave_keypair} ${ami[1]}@$1 \
71 |         "bash -s" -- < ${tmp_script} \
72 |         "$PULL_REQUEST_ID" "$PULL_REQUEST_REF" "$PROVIDER" 2>&1; \
73 |         echo "EXIT_CODE=$?" > $WORKSPACE/libfabric-ci-scripts/$1_install_libfabric.sh) \
74 |         | tr \\r \\n | sed 's/\(.*\)/'$1' \1/' | tee ${output_dir}/${execution_seq}_$1_install_libfabric.txt
75 |     set -x
76 | }
77 |
78 | set +x
79 | create_instance || { echo "==>Unable to create instance"; exit 65; }
80 | set -x
81 | INSTANCE_IDS=($INSTANCE_IDS)
82 |
83 | get_instance_ip
84 | INSTANCE_IPS=($INSTANCE_IPS)
85 |
86 | execution_seq=$((${execution_seq}+1))
87 | # Wait until all instances have passed SSH connection check
88 | for IP in ${INSTANCE_IPS[@]}; do
89 |     test_ssh "$IP" &
90 |     pids="$pids $!"
91 | done
92 | for pid in $pids; do
93 |     wait $pid || { echo "==>Instance ssh check failed"; exit 65; }
94 | done
95 |
96 |
97 | # Prepare AMI specific libfabric installation script
98 | multi_node_efa_minimal_script_builder
99 |
100 | for IP in ${INSTANCE_IPS[@]}; do
101 |     scp -o ConnectTimeout=30 -o StrictHostKeyChecking=no -i ~/${slave_keypair} \
102 |         $WORKSPACE/libfabric-ci-scripts/wget_check.sh \
103 |         ${ami[1]}@${IP}:~/
104 | done
105 |
106 | # Generate ssh key for fabtests
107 | set +x
108 | if [ ! -f $WORKSPACE/libfabric-ci-scripts/fabtests_${slave_keypair} ]; then
109 |     ssh-keygen -f $WORKSPACE/libfabric-ci-scripts/fabtests_${slave_keypair} -N ""
110 | fi
111 | cat <<-"EOF" >>${tmp_script}
112 | set +x
113 | cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
114 | chmod 600 ~/.ssh/id_rsa
115 | set -x
116 | EOF
117 | set -x
118 |
119 | execution_seq=$((${execution_seq}+1))
120 | # SSH into nodes and install libfabric concurrently on all nodes
121 | for IP in ${INSTANCE_IPS[@]}; do
122 |     install_libfabric "$IP" &
123 | done
124 | wait
125 |
126 | # Run the efa-check.sh script now that the installer has completed. We need to
127 | # use a login shell so that $PATH is setup correctly for Debian variants.
128 | for IP in ${INSTANCE_IPS[@]}; do
129 |     echo "Running efa-check.sh on ${IP}"
130 |     scp -o ConnectTimeout=30 -o StrictHostKeyChecking=no -i ~/${slave_keypair} \
131 |         $WORKSPACE/libfabric-ci-scripts/efa-check.sh ${ami[1]}@${IP}:
132 |     ssh -o ConnectTimeout=30 -o StrictHostKeyChecking=no -T -i ~/${slave_keypair} ${ami[1]}@${IP} \
133 |         "bash --login efa-check.sh --skip-libfabric --skip-mpi" 2>&1 | tr \\r \\n | sed 's/\(.*\)/'$IP' \1/'
134 |     if [ ${PIPESTATUS[0]} -ne 0 ]; then
135 |         echo "EFA check failed on ${IP}"
136 |         exit 1
137 |     fi
138 | done
139 |
140 | # Run MPI tests only for EFA provider for now.
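# The MPI test scripts are staged on the first node (INSTANCE_IPS[0]), which
# then drives the runs across all instances over ssh.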
141 | scp -o ConnectTimeout=30 -o StrictHostKeyChecking=no -i ~/${slave_keypair} \ 142 | $WORKSPACE/libfabric-ci-scripts/mpi_ring_c_test.sh \ 143 | $WORKSPACE/libfabric-ci-scripts/mpi_osu_test.sh \ 144 | $WORKSPACE/libfabric-ci-scripts/mpi_common.sh \ 145 | ${ami[1]}@${INSTANCE_IPS[0]}: 146 | 147 | test_list="impi" 148 | 149 | for mpi in $test_list; do 150 | execution_seq=$((${execution_seq}+1)) 151 | ssh -o ConnectTimeout=30 -o StrictHostKeyChecking=no -T -i ~/${slave_keypair} ${ami[1]}@${INSTANCE_IPS[0]} \ 152 | bash mpi_ring_c_test.sh ${mpi} ${libfabric_job_type} ${PROVIDER} ${INSTANCE_IPS[@]} | tee ${output_dir}/temp_execute_ring_c_efa_minimal_${mpi}.txt 153 | 154 | set +e 155 | grep -q "Test Passed" ${output_dir}/temp_execute_ring_c_efa_minimal_${mpi}.txt 156 | if [ $? -ne 0 ]; then 157 | BUILD_CODE=1 158 | echo "${mpi} ring_c test failed." 159 | fi 160 | set -e 161 | 162 | ssh -o ConnectTimeout=30 -o StrictHostKeyChecking=no -T -i ~/${slave_keypair} ${ami[1]}@${INSTANCE_IPS[0]} \ 163 | bash mpi_osu_test.sh ${mpi} ${libfabric_job_type} ${PROVIDER} ${INSTANCE_IPS[@]} | tee ${output_dir}/temp_execute_osu_efa_minimal_${mpi}.txt 164 | 165 | set +e 166 | grep -q "Test Passed" ${output_dir}/temp_execute_osu_efa_minimal_${mpi}.txt 167 | if [ $? -ne 0 ]; then 168 | BUILD_CODE=1 169 | echo "${mpi} osu test failed." 170 | fi 171 | set -e 172 | done 173 | 174 | exit ${BUILD_CODE} 175 | -------------------------------------------------------------------------------- /multi-node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -xe 4 | source $WORKSPACE/libfabric-ci-scripts/common.sh 5 | trap 'on_exit' EXIT 6 | slave_name=slave_$label 7 | slave_value=${!slave_name} 8 | ami=($slave_value) 9 | NODES=2 10 | libfabric_job_type=${libfabric_job_type:-"master"} 11 | # Current LibfabricCI IAM permissions do not allow placement group creation, 12 | # enable this after it is fixed. 13 | # export ENABLE_PLACEMENT_GROUP=1 14 | export USER_DATA_FILE=${USER_DATA_FILE:-${JENKINS_HOME}/user_data_script.sh} 15 | # The tests running on GPUDirect instances will set this variable, other test stages will not. 16 | export BUILD_GDR=${BUILD_GDR:-0} 17 | 18 | # Test whether the instance is ready for SSH or not. Once the instance is ready, 19 | # copy SSH keys from Jenkins and install libfabric 20 | install_libfabric() 21 | { 22 | test_ssh "$1" 23 | set +x 24 | scp -o ConnectTimeout=30 -o StrictHostKeyChecking=no -i ~/${slave_keypair} $WORKSPACE/libfabric-ci-scripts/fabtests_${slave_keypair} ${ami[1]}@$1:~/.ssh/id_rsa 25 | scp -o ConnectTimeout=30 -o StrictHostKeyChecking=no -i ~/${slave_keypair} $WORKSPACE/libfabric-ci-scripts/fabtests_${slave_keypair}.pub ${ami[1]}@$1:~/.ssh/id_rsa.pub 26 | scp -o ConnectTimeout=30 -o StrictHostKeyChecking=no -i ~/${slave_keypair} $WORKSPACE/libfabric-ci-scripts/fork_checker.c ${ami[1]}@$1:~/fork_checker.c 27 | execution_seq=$((${execution_seq}+1)) 28 | (ssh -o ConnectTimeout=30 -o StrictHostKeyChecking=no -T -i ~/${slave_keypair} ${ami[1]}@$1 \ 29 | "bash -s" -- < ${tmp_script} \ 30 | "$PULL_REQUEST_ID" "$PULL_REQUEST_REF" "$PROVIDER" "$ami_arch" "$libfabric_job_type" 2>&1; \ 31 | echo "EXIT_CODE=$?" 
> $WORKSPACE/libfabric-ci-scripts/$1_install_libfabric.sh) \
32 |         | tr \\r \\n | sed 's/\(.*\)/'$1' \1/' | tee ${output_dir}/${execution_seq}_$1_install_libfabric.txt
33 |     set -x
34 | }
35 |
36 | # Runs fabtests on client nodes using INSTANCE_IPS[0] as server
37 | execute_runfabtests()
38 | {
39 |     if [ ${PROVIDER} == "efa" ];then
40 |         gid_c=$(ssh -o StrictHostKeyChecking=no -i ~/${slave_keypair} ${ami[1]}@${INSTANCE_IPS[$1]} ibv_devinfo -v | grep GID | awk '{print $3}')
41 |     else
42 |         gid_c=""
43 |     fi
44 |     set +x
45 |     execution_seq=$((${execution_seq}+1))
46 |     (ssh -o ConnectTimeout=30 -o StrictHostKeyChecking=no -T -i ~/${slave_keypair} ${ami[1]}@${INSTANCE_IPS[0]} \
47 |         "bash -s" -- < $WORKSPACE/libfabric-ci-scripts/multinode_runfabtests.sh \
48 |         "${PROVIDER}" "${INSTANCE_IPS[0]}" "${INSTANCE_IPS[$1]}" "${gid_c}" "${BUILD_GDR}" 2>&1; \
49 |         echo "EXIT_CODE=$?" > $WORKSPACE/libfabric-ci-scripts/${INSTANCE_IPS[$1]}_execute_runfabtests.sh) | \
50 |         tr \\r \\n | sed 's/\(.*\)/'${INSTANCE_IPS[0]}' \1/' | tee ${output_dir}/temp_execute_runfabtests.txt
51 |     set -x
52 | }
53 |
54 | set +x
55 | create_instance || { echo "==>Unable to create instance"; exit 65; }
56 | set -x
57 | INSTANCE_IDS=($INSTANCE_IDS)
58 |
59 | get_instance_ip
60 | INSTANCE_IPS=($INSTANCE_IPS)
61 |
62 | execution_seq=$((${execution_seq}+1))
63 | pids=""
64 | # Wait until all instances have passed SSH connection check
65 | for IP in ${INSTANCE_IPS[@]}; do
66 |     test_ssh "$IP" &
67 |     pids="$pids $!"
68 | done
69 | for pid in $pids; do
70 |     wait $pid || { echo "==>Instance ssh check failed"; exit 65; }
71 | done
72 |
73 |
74 | # Prepare AMI specific libfabric installation script
75 | script_builder multi-node
76 |
77 | for IP in ${INSTANCE_IPS[@]}; do
78 |     scp -o ConnectTimeout=30 -o StrictHostKeyChecking=no -i ~/${slave_keypair} \
79 |         $WORKSPACE/libfabric-ci-scripts/wget_check.sh \
80 |         ${ami[1]}@${IP}:~/
81 | done
82 | # Generate ssh key for fabtests
83 | set +x
84 | if [ ! -f $WORKSPACE/libfabric-ci-scripts/fabtests_${slave_keypair} ]; then
85 |     ssh-keygen -f $WORKSPACE/libfabric-ci-scripts/fabtests_${slave_keypair} -N ""
86 | fi
87 | cat <<-"EOF" >>${tmp_script}
88 | set +x
89 | cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
90 | chmod 600 ~/.ssh/id_rsa
91 | set -x
92 | EOF
93 | set -x
94 |
95 | execution_seq=$((${execution_seq}+1))
96 | # SSH into nodes and install libfabric concurrently on all nodes
97 | for IP in ${INSTANCE_IPS[@]}; do
98 |     install_libfabric "$IP" &
99 | done
100 | wait
101 |
102 | # Run the efa-check.sh script now that the installer has completed. We need to
103 | # use a login shell so that $PATH is setup correctly for Debian variants.
104 | if [ "${PROVIDER}" == "efa" ]; then
105 |     for IP in ${INSTANCE_IPS[@]}; do
106 |         echo "Running efa-check.sh on ${IP}"
107 |         scp -o ConnectTimeout=30 -o StrictHostKeyChecking=no -i ~/${slave_keypair} \
108 |             $WORKSPACE/libfabric-ci-scripts/efa-check.sh ${ami[1]}@${IP}:
109 |         ssh -o ConnectTimeout=30 -o StrictHostKeyChecking=no -T -i ~/${slave_keypair} ${ami[1]}@${IP} \
110 |             "bash --login efa-check.sh" 2>&1 | tr \\r \\n | sed 's/\(.*\)/'$IP' \1/'
111 |         if [ ${PIPESTATUS[0]} -ne 0 ]; then
112 |             echo "EFA check failed on ${IP}"
113 |             exit 1
114 |         fi
115 |     done
116 | fi
117 |
118 | # Skip fabtests only when testing the tcp provider on ARM instances
119 | if !
[[ "${PROVIDER}" == "tcp" && "$ami_arch" == "aarch64" ]]; then 120 | execution_seq=$((${execution_seq}+1)) 121 | # SSH into SERVER node and run fabtests 122 | N=$((${#INSTANCE_IPS[@]}-1)) 123 | for i in $(seq 1 $N); do 124 | execute_runfabtests "$i" 125 | done 126 | 127 | # Get build status 128 | for i in $(seq 1 $N); do 129 | source $WORKSPACE/libfabric-ci-scripts/${INSTANCE_IPS[$i]}_execute_runfabtests.sh 130 | exit_status "$EXIT_CODE" "${INSTANCE_IPS[$i]}" 131 | done 132 | fi 133 | 134 | # Run MPI tests for EFA provider. 135 | # For tests running on instances with ARM AMIs, also run MPI test if it's testing tcp. 136 | if [[ ${PROVIDER} == "efa" ]] || [[ ${ami_arch} == "aarch64" && ${PROVIDER} == "tcp" ]]; then 137 | scp -o ConnectTimeout=30 -o StrictHostKeyChecking=no -i ~/${slave_keypair} \ 138 | $WORKSPACE/libfabric-ci-scripts/mpi_ring_c_test.sh \ 139 | $WORKSPACE/libfabric-ci-scripts/mpi_osu_test.sh \ 140 | $WORKSPACE/libfabric-ci-scripts/mpi_common.sh \ 141 | ${ami[1]}@${INSTANCE_IPS[0]}: 142 | 143 | test_list="ompi" 144 | if [ ${RUN_IMPI_TESTS} -eq 1 ]; then 145 | test_list="$test_list impi" 146 | fi 147 | for mpi in $test_list; do 148 | execution_seq=$((${execution_seq}+1)) 149 | ssh -o ConnectTimeout=30 -o StrictHostKeyChecking=no -T -i ~/${slave_keypair} ${ami[1]}@${INSTANCE_IPS[0]} \ 150 | bash mpi_ring_c_test.sh ${mpi} ${libfabric_job_type} ${PROVIDER} ${INSTANCE_IPS[@]} | tee ${output_dir}/temp_execute_ring_c_${mpi}.txt 151 | 152 | set +e 153 | grep -q "Test Passed" ${output_dir}/temp_execute_ring_c_${mpi}.txt 154 | if [ $? -ne 0 ]; then 155 | BUILD_CODE=1 156 | echo "${mpi} ring_c test failed." 157 | fi 158 | set -e 159 | 160 | ssh -o ConnectTimeout=30 -o StrictHostKeyChecking=no -T -i ~/${slave_keypair} ${ami[1]}@${INSTANCE_IPS[0]} \ 161 | bash mpi_osu_test.sh ${mpi} ${libfabric_job_type} ${PROVIDER} ${INSTANCE_IPS[@]} | tee ${output_dir}/temp_execute_osu_${mpi}.txt 162 | 163 | set +e 164 | grep -q "Test Passed" ${output_dir}/temp_execute_osu_${mpi}.txt 165 | if [ $? -ne 0 ]; then 166 | BUILD_CODE=1 167 | echo "${mpi} osu test failed." 168 | fi 169 | set -e 170 | done 171 | fi 172 | 173 | # Run nccl-tests for GDR Test. 174 | if [ ${BUILD_GDR} -eq 1 ]; then 175 | scp -o ConnectTimeout=30 -o StrictHostKeyChecking=no -i ~/${slave_keypair} \ 176 | $WORKSPACE/libfabric-ci-scripts/run-nccl-tests.sh \ 177 | $WORKSPACE/libfabric-ci-scripts/mpi_common.sh \ 178 | ${ami[1]}@${INSTANCE_IPS[0]}: 179 | execution_seq=$((${execution_seq}+1)) 180 | ssh -o ConnectTimeout=30 -o StrictHostKeyChecking=no -T -i ~/${slave_keypair} ${ami[1]}@${INSTANCE_IPS[0]} \ 181 | bash run-nccl-tests.sh ${INSTANCE_IPS[@]} | tee ${output_dir}/temp_execute_nccl_tests.txt 182 | 183 | set +e 184 | grep -q "Test Passed" ${output_dir}/temp_execute_nccl_tests.txt 185 | if [ $? -ne 0 ]; then 186 | BUILD_CODE=1 187 | echo "GDR nccl-tests failed." 188 | fi 189 | set -e 190 | fi 191 | 192 | # Run cdi_test 193 | if [ ${RUN_CDI_TESTS} -eq 1 ]; then 194 | set +e 195 | export RUN_MINIMAL=1 196 | bash $WORKSPACE/libfabric-ci-scripts/cdi_test/tests/run-cdi.sh | tee ${output_dir}/temp_execute_cdi_test.txt 197 | grep -q "Test Passed" ${output_dir}/temp_execute_cdi_test.txt 198 | if [ $? -ne 0 ]; then 199 | BUILD_CODE=1 200 | echo "cdi_test tests failed." 201 | fi 202 | set -e 203 | fi 204 | 205 | exit ${BUILD_CODE} 206 | -------------------------------------------------------------------------------- /multinode_runfabtests.sh: -------------------------------------------------------------------------------- 1 | . 
~/.bash_profile
2 |
3 | check_kernel_has_fork_support()
4 | {
5 |     gcc -I/usr/include -o ~/fork_checker ~/fork_checker.c -lefa -libverbs
6 |     ~/fork_checker
7 |
8 |     if [ 0 -eq $? ]; then
9 |         return 1
10 |     fi
11 |
12 |     return 0
13 | }
14 |
15 | run_test_with_expected_ret()
16 | {
17 |     SERVER_IP=$1
18 |     CLIENT_IP=$2
19 |     SERVER_CMD=$3
20 |     CLIENT_CMD=$4
21 |     EXPECT_RESULT=$5
22 |
23 |     ssh -o ConnectTimeout=30 -o StrictHostKeyChecking=no ${SERVER_IP} ${SERVER_CMD} >& server.out &
24 |     server_pid=$!
25 |     sleep 1
26 |
27 |     ssh -o ConnectTimeout=30 -o StrictHostKeyChecking=no ${CLIENT_IP} ${CLIENT_CMD} ${SERVER_IP} >& client.out &
28 |     client_pid=$!
29 |
30 |     wait $client_pid
31 |     client_ret=$?
32 |
33 |     if [ $client_ret -ne 0 ]; then
34 |         kill -9 $server_pid
35 |     fi
36 |
37 |     wait $server_pid
38 |     server_ret=$?
39 |
40 |     ret=0
41 |     if [ ${EXPECT_RESULT} = "FAIL" ]; then
42 |         if [ $server_ret -ne 0 ] || [ $client_ret -ne 0 ]; then
43 |             echo "Test ${PROGRAM_TO_RUN} Passed!"
44 |         else
45 |             echo "Test ${PROGRAM_TO_RUN} Failed!"
46 |             ret=1
47 |         fi
48 |     else
49 |         if [ $server_ret -eq 0 ] && [ $client_ret -eq 0 ]; then
50 |             echo "Test ${PROGRAM_TO_RUN} Passed!"
51 |         else
52 |             echo "Test ${PROGRAM_TO_RUN} Failed!"
53 |             ret=1
54 |         fi
55 |     fi
56 |
57 |     echo "server output:"
58 |     cat server.out
59 |
60 |     echo "client output:"
61 |     cat client.out
62 |
63 |     return $ret
64 | }
65 |
66 | set -xe
67 | PROVIDER=$1
68 | SERVER_IP=$2
69 | CLIENT_IP=$3
70 | BUILD_GDR=$5
71 | # Runs all the tests in the fabtests suite while only expanding failed cases
72 | EXCLUDE=${HOME}/libfabric/fabtests/install/share/fabtests/test_configs/${PROVIDER}/${PROVIDER}.exclude
73 | if [ -f ${EXCLUDE} ]; then
74 |     EXCLUDE="-R -f ${EXCLUDE}"
75 | else
76 |     EXCLUDE=""
77 | fi
78 |
79 | # Each individual test has a "-b" option and an "-E" option. Both will
80 | # use out-of-band address exchange.
81 | # The difference is that "-b" uses out-of-band synchronization, while "-E"
82 | # does not.
83 | #
84 | # runfabtests.sh's "-b" option actually uses the -E option of each individual
85 | # test (for historical reasons).
86 | #
87 | runfabtests_script="${HOME}/libfabric/fabtests/install/bin/runfabtests.sh"
88 | b_option_available="$($runfabtests_script -h 2>&1 | grep '\-b' || true)"
89 | # Check if the '-P' option (run provider-specific fabtests) is available
90 | P_option_available="$($runfabtests_script -h 2>&1 | grep '\-P' || true)"
91 | FABTESTS_OPTS="-E LD_LIBRARY_PATH=\"$LD_LIBRARY_PATH\" -vvv ${EXCLUDE}"
92 | FABTESTS_OPTS+=" -p ${HOME}/libfabric/fabtests/install/bin/"
93 | if [ ${PROVIDER} == "efa" ]; then
94 |     if [ -n "$P_option_available" ]; then
95 |         FABTESTS_OPTS+=" -P"
96 |     fi
97 |     if [ -n "$b_option_available" ]; then
98 |         FABTESTS_OPTS+=" -b -t all"
99 |     else
100 |         gid_c=$4
101 |         gid_s=$(ibv_devinfo -v | grep GID | awk '{print $3}')
102 |         FABTESTS_OPTS+=" -C \"-P 0\" -s $gid_s -c $gid_c -t all"
103 |     fi
104 | fi
105 |
106 | bash -c "$runfabtests_script ${FABTESTS_OPTS} ${PROVIDER} ${SERVER_IP} ${CLIENT_IP}"
107 |
108 | if [ ${PROVIDER} == "efa" ]; then
109 |     # The dgram_pingpong test was excluded during installation
110 |     # (in install-fabtests.sh) because it does not work with the "-E" option,
111 |     # so here we run it separately using the "-b" option
112 |
113 |     bash_option=$-
114 |     restore_e=0
115 |     if [[ $bash_option =~ e ]]; then
116 |         restore_e=1
117 |         set +e
118 |     fi
119 |
120 |     exit_code=0
121 |     ami_arch=$(uname -m)
122 |     # Run fi_dgram_pingpong on x86 only as it currently does not work on c6gn instances.
123 |     # This change will be reverted once the issue is fixed.
124 |     if [[ "$ami_arch" == "x86_64" ]]; then
125 |         echo "Run fi_dgram_pingpong with out-of-band synchronization"
126 |         SERVER_CMD="${HOME}/libfabric/fabtests/install/bin/fi_dgram_pingpong -k -p efa -b"
127 |         CLIENT_CMD="${SERVER_CMD}"
128 |         run_test_with_expected_ret ${SERVER_IP} ${CLIENT_IP} "${SERVER_CMD}" "${CLIENT_CMD}" "PASS"
129 |         if [ "$?" -ne 0 ]; then
130 |             exit_code=1
131 |         fi
132 |     fi
133 |
134 |     # Run fi_rdm_tagged_bw with fork when different environment variables are set.
135 |     fork_option_available=$(${HOME}/libfabric/fabtests/install/bin/fi_rdm_tagged_bw -h 2>&1 | grep '\-K' || true)
136 |     if [ -n "$fork_option_available" ]; then
137 |         echo "Run fi_rdm_tagged_bw with fork"
138 |         SERVER_CMD="${HOME}/libfabric/fabtests/install/bin/fi_rdm_tagged_bw -p efa -K -E"
139 |         CLIENT_CMD="${SERVER_CMD}"
140 |         # If an application uses fork, it needs to enable rdma-core's user space fork support to prevent
141 |         # the kernel's copy-on-write mechanism from being applied to pinned memory, otherwise there will be data corruption.
142 |         # To make sure fork support is enabled properly, libfabric registers a fork handler, which will abort
143 |         # the application if fork support is not enabled.
144 |         #
145 |         # Kernel 5.13 and newer will not apply CoW on pinned memory, hence the user space fork support
146 |         # is unneeded. Libfabric will detect that support via rdma-core's ibv_is_fork_initialized() API, and
147 |         # will not register that fork handler on kernel 5.13 and newer.
148 |         #
149 |         # In all, the "fi_rdm_tagged_bw with fork" test is expected to pass on 5.13 and newer, but fail on
150 |         # older kernels.
151 |         check_kernel_has_fork_support ${SERVER_IP}
152 |         if [ $? -eq 1 ] ; then
153 |             expected_result="PASS"
154 |         else
155 |             expected_result="FAIL"
156 |         fi
157 |         run_test_with_expected_ret ${SERVER_IP} ${CLIENT_IP} "${SERVER_CMD}" "${CLIENT_CMD}" "$expected_result"
158 |         if [ "$?" -ne 0 ]; then
159 |             exit_code=1
160 |         fi
161 |
162 |         echo "Run fi_rdm_tagged_bw with fork and RDMAV_FORK_SAFE set"
163 |         SERVER_CMD="RDMAV_FORK_SAFE=1 ${HOME}/libfabric/fabtests/install/bin/fi_rdm_tagged_bw -v -p efa -K -E"
164 |         CLIENT_CMD="${SERVER_CMD}"
165 |         run_test_with_expected_ret ${SERVER_IP} ${CLIENT_IP} "${SERVER_CMD}" "${CLIENT_CMD}" "PASS"
166 |         if [ "$?" -ne 0 ]; then
167 |             exit_code=1
168 |         fi
169 |
170 |         echo "Run fi_rdm_tagged_bw with fork and FI_EFA_FORK_SAFE set"
171 |         SERVER_CMD="FI_EFA_FORK_SAFE=1 ${HOME}/libfabric/fabtests/install/bin/fi_rdm_tagged_bw -v -p efa -K -E"
172 |         CLIENT_CMD="${SERVER_CMD}"
173 |         run_test_with_expected_ret ${SERVER_IP} ${CLIENT_IP} "${SERVER_CMD}" "${CLIENT_CMD}" "PASS"
174 |         if [ "$?" -ne 0 ]; then
175 |             exit_code=1
176 |         fi
177 |     fi
178 |
179 |     if [[ ${BUILD_GDR} -eq 1 ]]; then
180 |         echo "Run fi_rdm_tagged_bw with server using device (GPU) memory and client using host memory"
181 |         CLIENT_CMD="FI_EFA_USE_DEVICE_RDMA=1 ${HOME}/libfabric/fabtests/install/bin/fi_rdm_tagged_bw -p efa -E"
182 |         SERVER_CMD="${CLIENT_CMD} -D cuda"
183 |         run_test_with_expected_ret ${SERVER_IP} ${CLIENT_IP} "${SERVER_CMD}" "${CLIENT_CMD}" "PASS"
184 |         if [ "$?"
-ne 0 ]; then 185 | exit_code=1 186 | fi 187 | fi 188 | 189 | if [ $restore_e -eq 1 ]; then 190 | set -e 191 | fi 192 | exit $exit_code 193 | fi 194 | -------------------------------------------------------------------------------- /nccl/common/nccl-common.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 4 | # Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 5 | # 6 | 7 | set -e 8 | 9 | # Generate universally unique identifier 10 | get_uniq_num() { 11 | echo $(uuidgen) 12 | } 13 | 14 | # AMIs dict 15 | declare -A AMIS 16 | 17 | # Placement groups dict 18 | declare -A PGS 19 | 20 | # List of aws regions where tests can be executed 21 | aws_regions=('us-west-2' 'us-east-1') 22 | 23 | # Number of GPUs on each test node 24 | TEST_NODE_GPUS=8 25 | 26 | NVIDIA_DRIVER_VERSION=450.80.02 27 | NVIDIA_BASE_URL='https://us.download.nvidia.com/tesla' 28 | NVIDIA_DRIVER_PATH="$NVIDIA_BASE_URL/$NVIDIA_DRIVER_VERSION/NVIDIA-Linux-x86_64-$NVIDIA_DRIVER_VERSION.run" 29 | 30 | # Components installation prefixes 31 | LIBFABRIC_INSTALL_PREFIX='$HOME/libfabric/install' 32 | AWS_OFI_NCCL_INSTALL_PREFIX='$HOME/aws-ofi-nccl/install' 33 | NCCL_INSTALL_PREFIX='$HOME/nccl' 34 | 35 | # LD_LIBRARY_PATH for nccl tests 36 | # TODO: Find alternative way for LD_LIBRARY_PATH construction 37 | # custom_ld_library_path should be updated in case 38 | # of any changes in components installation prefixes 39 | custom_ld_library_path="$AWS_OFI_NCCL_INSTALL_PREFIX/lib/:` 40 | `$NCCL_INSTALL_PREFIX/build/lib:` 41 | `$LIBFABRIC_INSTALL_PREFIX/lib/:` 42 | `/opt/amazon/openmpi/lib64:` 43 | `/opt/amazon/openmpi/lib:\$LD_LIBRARY_PATH" 44 | 45 | set_jenkins_variables() { 46 | 47 | tmp_script=${tmp_script:-$(mktemp -p $WORKSPACE)} 48 | tmp_out=${tmp_out:-$(mktemp -p $WORKSPACE)} 49 | } 50 | 51 | find_latest_ami() { 52 | 53 | ami=$(aws ec2 describe-images --owners amazon --filters \ 54 | "Name=name,Values=*$1*" \ 55 | "Name=state,Values=available" "Name=architecture,Values="x86_64"" \ 56 | --query 'reverse(sort_by(Images, &CreationDate)[].ImageId)' \ 57 | --output text | awk '{print $1;}') 58 | echo ${ami} 59 | } 60 | 61 | set_aws_defaults() { 62 | 63 | echo "==> Establishing default parameters for region: $1" 64 | 65 | export AWS_DEFAULT_REGION=$1 66 | #Use default vpc_id for each region 67 | export vpc_id_reg=$(aws ec2 describe-vpcs --query "Vpcs[*].VpcId" --filters Name=isDefault,Values=true --output=text) 68 | 69 | 70 | # The latest Deep Learning AMI (Amazon Linux 2) Image 71 | ami_amzn=$(find_latest_ami "Deep Learning Base AMI (Amazon Linux 2)") 72 | echo "==> Latest Deep Learning Base AMI (Amazon Linux): ${ami_amzn}" 73 | 74 | # The latest Deep Learning AMI Ubuntu 16.04 Image 75 | ami_ubuntu_16_04=$(find_latest_ami "Deep Learning AMI (Ubuntu 16.04)") 76 | echo "==> Latest Deep Learning AMI (Ubuntu 16.04): ${ami_ubuntu_16_04}" 77 | 78 | # The latest Deep Learning AMI Ubuntu 18.04 Image 79 | ami_ubuntu_18_04=$(find_latest_ami "Deep Learning Base AMI (Ubuntu 18.04)") 80 | echo "==> Latest Deep Learning Base AMI (Ubuntu 18.04): ${ami_ubuntu_18_04}" 81 | } 82 | 83 | define_parameters() { 84 | 85 | # Instance type for AMI preparation 86 | instance_ami_type='c5n.18xlarge' 87 | 88 | # Instance type for running NCCL tests 89 | # p3dn.24xlarge instance type was previously used 90 | # Changed to p4d.24xlarge due to capacity issue 91 | instance_test_type='p4d.24xlarge' 92 | create_instance_retries=10 93 | instance_check_retries=10 94 | ami_check_retries=20 95 | 
ssh_check_retries=40 96 | #Size in (B) used to filter busbw test result 97 | test_b_size='1073741824' 98 | 99 | if [[ "${label}" == 'alinux' ]]; then 100 | ssh_user='ec2-user' 101 | prep_ami=${ami_amzn} 102 | elif [[ "${label}" == 'ubuntu_16.04' ]]; then 103 | ssh_user='ubuntu' 104 | prep_ami=${ami_ubuntu_16_04} 105 | elif [[ "${label}" == 'ubuntu_18.04' ]]; then 106 | ssh_user='ubuntu' 107 | prep_ami=${ami_ubuntu_18_04} 108 | else 109 | echo "Unknown label" 110 | exit 1 111 | fi 112 | } 113 | 114 | # Create security group for NCCL testing 115 | create_efa_sg() { 116 | 117 | SGId=$(aws ec2 create-security-group --group-name "EFA-enabled-sg-$(get_uniq_num)" \ 118 | --tag-specification "ResourceType=security-group,Tags=[{Key=Workspace,Value="${WORKSPACE}"},{Key=Build_Number,Value="${BUILD_NUMBER}"}]" \ 119 | --description "EFA-enabled security group" --vpc-id ${vpc_id_reg} --query "GroupId" --output=text) 120 | echo "==> Setting rules for efa sg ${SGId}" 121 | aws ec2 authorize-security-group-egress --group-id ${SGId} --protocol all --source-group ${SGId} 122 | aws ec2 authorize-security-group-ingress --group-id ${SGId} --protocol all --source-group ${SGId} 123 | aws ec2 authorize-security-group-ingress --port 22 --cidr 0.0.0.0/0 --protocol tcp --group-id ${SGId} 124 | } 125 | 126 | define_subnets() { 127 | 128 | # Get a list of subnets within the VPC relevant to the SG 129 | vpc_id=$(aws ec2 describe-security-groups \ 130 | --group-ids ${SGId} \ 131 | --query SecurityGroups[0].VpcId --output=text) 132 | if [[ "${AWS_DEFAULT_REGION}" == 'us-west-2' ]]; then 133 | subnet_ids=$(aws ec2 describe-subnets \ 134 | --filters "Name=availability-zone,Values=[us-west-2a,us-west-2b,us-west-2c]" \ 135 | "Name=vpc-id,Values=$vpc_id" \ 136 | --query "Subnets[*].SubnetId" --output=text) 137 | elif [[ "${AWS_DEFAULT_REGION}" == 'us-east-1' ]]; then 138 | subnet_ids=$(aws ec2 describe-subnets \ 139 | --filters "Name=availability-zone,Values=[us-east-1a,us-east-1b]" \ 140 | "Name=vpc-id,Values=$vpc_id" \ 141 | --query "Subnets[*].SubnetId" --output=text) 142 | else 143 | subnet_ids=$(aws ec2 describe-subnets \ 144 | --filters "Name=vpc-id,Values=$vpc_id" \ 145 | --query "Subnets[*].SubnetId" --output=text) 146 | fi 147 | 148 | } 149 | 150 | custom_instance_preparation() { 151 | 152 | define_parameters 153 | create_efa_sg 154 | define_subnets 155 | } 156 | 157 | delete_sg() { 158 | 159 | echo "==> Deleting $1" 160 | if [ -z $1 ]; then 161 | echo "SG $1 does not exist" 162 | return 0 163 | fi 164 | aws ec2 delete-security-group --group-id $1 165 | } 166 | 167 | create_instance() { 168 | 169 | INSTANCE_IDS='' 170 | SERVER_ERROR=(InsufficientInstanceCapacity RequestLimitExceeded ServiceUnavailable Unavailable Unsupported) 171 | creation_attempts_count=0 172 | error=1 173 | network_interface="[{\"DeviceIndex\":0,\"DeleteOnTermination\":true,\"InterfaceType\":\"efa\",\"Groups\":[\"$1\"]" 174 | addl_args="" 175 | echo "==> Creating instances" 176 | while [ ${error} -ne 0 ] && [ ${creation_attempts_count} -lt ${create_instance_retries} ]; do 177 | for subnet in ${subnet_ids[@]}; do 178 | if [ ${ENABLE_PLACEMENT_GROUP} -eq 1 ]; then 179 | addl_args+=" --placement GroupName="${PGS["${subnet}"]} 180 | fi 181 | if [[ -n ${USER_DATA_FILE} && -f ${USER_DATA_FILE} ]]; then 182 | addl_args+=" --user-data file://${USER_DATA_FILE}" 183 | fi 184 | 185 | error=1 186 | set +e 187 | INSTANCE_IDS=$(aws ec2 run-instances \ 188 | --tag-specification 
"ResourceType=instance,Tags=[{Key=Workspace,Value="${WORKSPACE}"},{Key=Name,Value=Slave},{Key=Build_Number,Value="${BUILD_NUMBER}"}]" \ 189 | --image-id $3 \ 190 | --instance-type $4 \ 191 | --enable-api-termination \ 192 | --key-name ${slave_keypair} \ 193 | --network-interface ${network_interface}",\"SubnetId\":\"${subnet}\"}]" \ 194 | --count $2 \ 195 | --query "Instances[*].InstanceId" \ 196 | --output=text ${addl_args} 2>&1) 197 | create_instance_exit_code=$? 198 | echo "${INSTANCE_IDS}" 199 | set -e 200 | # If run-instances is successful break from both the loops, else 201 | # find out whether the error was due to SERVER_ERROR or some other error 202 | if [ $create_instance_exit_code -ne 0 ]; then 203 | # If the error was due to SERVER_ERROR, set error=1 else for 204 | # some other error set error=0 205 | for code in ${SERVER_ERROR[@]}; do 206 | if [[ "${INSTANCE_IDS}" == *${code}* ]]; then 207 | error=1 208 | break 209 | else 210 | error=0 211 | fi 212 | done 213 | else 214 | echo "==> Instances created: ${INSTANCE_IDS}" 215 | break 2 216 | fi 217 | # If run-instances wasn't successful, and it was due to some other 218 | # error, exit and fail the test. 219 | if [ ${error} -eq 0 ]; then 220 | exit ${create_instance_exit_code} 221 | fi 222 | done 223 | sleep 2m 224 | creation_attempts_count=$((creation_attempts_count+1)) 225 | done 226 | } 227 | 228 | prepare_instance() { 229 | 230 | for region in ${aws_regions[@]}; do 231 | # Set the default region 232 | set_aws_defaults ${region} 233 | custom_instance_preparation 234 | echo "==> Launching instance in region ${AWS_DEFAULT_REGION}" 235 | num_instances=$2 236 | # HW CUDA errors: https://docs.nvidia.com/deploy/xid-errors/index.html 237 | CUDA_HW_ERROR_CODES=(48 74) 238 | INSTANCES=() 239 | create_pg 240 | create_instance_attempts=0 241 | INSTANCE_STATE="unavailable" 242 | while [ ${INSTANCE_STATE} != 'running' ] && [ ${create_instance_attempts} -lt ${create_instance_retries} ] ; do 243 | if [ $1 == 'ami_instance' ] ; then 244 | create_instance ${SGId} 1 ${prep_ami} ${instance_ami_type} 245 | else 246 | create_instance ${SGId} ${num_instances} ${AMIS["${AWS_DEFAULT_REGION}"]} ${instance_test_type} 247 | fi 248 | if [ ${create_instance_exit_code} -ne 0 ]; then 249 | echo "==> Changing the region" 250 | delete_pg 251 | # Start over with new region 252 | continue 3 253 | else 254 | INSTANCES=(${INSTANCE_IDS}) 255 | for INSTANCE_ID in ${INSTANCES[@]};do 256 | test_instance_status $INSTANCE_ID 257 | if [ ${INSTANCE_STATE} != "running" ]; then 258 | terminate_instances 259 | break 260 | fi 261 | done 262 | if [ $1 != 'ami_instance' ] ; then 263 | for INSTANCE_ID in ${INSTANCES[@]};do 264 | test_ssh $INSTANCE_ID 265 | run_nvidia_checks $INSTANCE_ID 266 | sleep 1m 267 | test_dmesg_errors $INSTANCE_ID 268 | if [[ ! -z ${ERRORS} ]]; then 269 | echo "XID errors: ${ERRORS}" 270 | fi 271 | for code in ${CUDA_HW_ERROR_CODES[@]}; do 272 | if [[ "${ERRORS}" == *${code},* ]]; then 273 | echo "!!!Node $INSTANCE_ID reports CUDA XID ${code} errors terminating the instances!!!" 274 | terminate_instances 275 | INSTANCE_STATE='terminated' 276 | # Wait before creating new instance to avoid the same pool 277 | sleep 2m 278 | break 2 279 | fi 280 | done 281 | done 282 | fi 283 | fi 284 | create_instance_attempts=$((create_instance_attempts+1)) 285 | done 286 | if [ ${INSTANCE_STATE} != 'running' ] ; then 287 | echo "All attempts to create instance failed." 
288 | exit 1 289 | fi 290 | break 291 | done 292 | } 293 | 294 | ami_instance_preparation() { 295 | 296 | prepare_instance 'ami_instance' 1 297 | test_ssh ${INSTANCE_IDS} 298 | # Install software and prepare custom AMI 299 | prepare_ami "${PULL_REQUEST_REF}" "${PULL_REQUEST_ID}" "${TARGET_BRANCH}" "${TARGET_REPO}" "${PROVIDER}" "${LIBFABRIC_INSTALL_PREFIX}" "${AWS_OFI_NCCL_INSTALL_PREFIX}" "${NCCL_INSTALL_PREFIX}" 300 | # Create a custom AMI from the prepared instance 301 | create_ami ${INSTANCE_IDS} 302 | # Copy the AMI to the other region, required for region switch 303 | copy_ami ${CUSTOM_AMI} ${AWS_DEFAULT_REGION} 304 | } 305 | 306 | get_instance_ip() { 307 | 308 | instance_ip=$(aws ec2 describe-instances --instance-ids $1 \ 309 | --query "Reservations[*].Instances[*].PrivateIpAddress" \ 310 | --output=text) 311 | echo ${instance_ip} 312 | } 313 | 314 | get_public_dns() { 315 | 316 | public_dns=$(aws ec2 describe-instances --instance-ids $1 \ 317 | --query 'Reservations[0].Instances[0].PublicDnsName' --output text) 318 | echo ${public_dns} 319 | } 320 | 321 | test_ssh() { 322 | 323 | PublicDNS=$(get_public_dns $1) 324 | host_ready=1 325 | host_poll_count=0 326 | 327 | set +e 328 | while [ $host_ready -ne 0 ] && [ $host_poll_count -lt ${ssh_check_retries} ] ; do 329 | echo "Waiting for host instance to become ready" 330 | sleep 5 331 | ssh -T -o ConnectTimeout=30 -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o BatchMode=yes \ 332 | -i "~/${slave_keypair}" ${ssh_user}@${PublicDNS} hostname 333 | 334 | if [ $? -eq 0 ]; then 335 | host_ready=0 336 | fi 337 | host_poll_count=$((host_poll_count+1)) 338 | done 339 | echo "Host instance ssh check finished with status ${host_ready}" 340 | set -e 341 | } 342 | 343 | test_dmesg_errors() { 344 | 345 | ERRORS='' 346 | PublicDNS=$(get_public_dns $1) 347 | ERRORS=$(ssh -T -o ConnectTimeout=30 -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o BatchMode=yes \ 348 | -i "~/${slave_keypair}" ${ssh_user}@${PublicDNS} dmesg | grep -e "Xid" || true) 349 | echo "ERRORS: ${ERRORS}" 350 | } 351 | 352 | terminate_instances() { 353 | 354 | if [[ ! -z ${INSTANCE_IDS[@]} ]]; then 355 | echo "==> Terminating instances ${INSTANCE_IDS[@]}" 356 | aws ec2 terminate-instances --instance-ids ${INSTANCE_IDS[@]} 357 | aws ec2 wait instance-terminated --instance-ids ${INSTANCE_IDS[@]} 358 | fi 359 | } 360 | 361 | # Custom AMI preparation 362 | prepare_ami() { 363 | 364 | echo "==> Starting AMI preparation..." 
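# Stage the remote preparation script: export the build parameters first, then append nccl/common/prep_ami.sh and stream the result to the instance over ssh with "bash -s".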
365 | cat <<-EOF > ${tmp_script} 366 | export PULL_REQUEST_REF="$1" 367 | export PULL_REQUEST_ID="$2" 368 | export TARGET_BRANCH="$3" 369 | export TARGET_REPO="$4" 370 | export PROVIDER="$5" 371 | export LIBFABRIC_INSTALL_PREFIX="$6" 372 | export AWS_OFI_NCCL_INSTALL_PREFIX="$7" 373 | export NCCL_INSTALL_PREFIX="$8" 374 | EOF 375 | 376 | cat $WORKSPACE/libfabric-ci-scripts/nccl/common/prep_ami.sh >> ${tmp_script} 377 | ssh -T -o ConnectTimeout=30 -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o BatchMode=yes \ 378 | -i "~/${slave_keypair}" ${ssh_user}@${PublicDNS} "bash -s" < ${tmp_script} 379 | } 380 | 381 | test_ami_status() { 382 | ami_status="unavailable" 383 | check_attempts=0 384 | 385 | while [ ${ami_status} != "available" ] && [ ${check_attempts} -lt ${ami_check_retries} ] ; do 386 | sleep 1m 387 | ami_status=$(aws ec2 describe-images --image-ids $1 --region $2 \ 388 | --query "Images[*].State" --output text) 389 | check_attempts=$((check_attempts+1)) 390 | echo "$1 status: ${ami_status}" 391 | echo "AMI status check attempts: ${check_attempts}" 392 | done 393 | if [ ${ami_status} != "available" ]; then 394 | echo "There is a problem with AMI $1: it still has ${ami_status} status after ${ami_check_retries} minutes" 395 | exit 1 396 | fi 397 | } 398 | 399 | # Copy custom AMI to different region 400 | copy_ami() { 401 | 402 | if [ $2 == 'us-east-1' ]; then 403 | destination_region='us-west-2' 404 | else 405 | destination_region='us-east-1' 406 | fi 407 | COPIED_AMI=$(aws ec2 copy-image --source-image-id $1 --source-region $2 \ 408 | --region ${destination_region} --name "nccl-enabled-ami-$(get_uniq_num)" \ 409 | --output=text --query 'ImageId') 410 | echo "==> Wait for image ${COPIED_AMI} to become available" 411 | test_ami_status ${COPIED_AMI} ${destination_region} 412 | AMIS["${destination_region}"]=${COPIED_AMI} 413 | } 414 | 415 | # Create custom AMI 416 | create_ami() { 417 | 418 | echo "==> Create custom AMI" 419 | CUSTOM_AMI=$(aws ec2 create-image --instance-id $1 --name "nccl-enabled-ami-$(get_uniq_num)" \ 420 | --description "${WORKSPACE}_${BUILD_NUMBER}" --output=text --query 'ImageId') 421 | 422 | echo "==> Wait for image ${CUSTOM_AMI} to become available" 423 | test_ami_status ${CUSTOM_AMI} ${AWS_DEFAULT_REGION} 424 | AMIS["${AWS_DEFAULT_REGION}"]=${CUSTOM_AMI} 425 | } 426 | 427 | # Deregister custom AMIs 428 | deregister_ami() { 429 | 430 | if [[ -z ${AMIS[@]} ]]; then 431 | return 0 432 | fi 433 | 434 | echo "==> Deregistering AMIs" 435 | for region in ${!AMIS[@]}; do 436 | snapshot=$(aws ec2 describe-images --image-ids ${AMIS[${region}]} --region ${region} --query "Images[*].BlockDeviceMappings[*].Ebs.SnapshotId" --output text) 437 | aws ec2 deregister-image --image-id ${AMIS[${region}]} --region ${region} 438 | echo "==> Deleting snapshot" 439 | aws ec2 delete-snapshot --snapshot-id ${snapshot} --region ${region} 440 | done 441 | } 442 | 443 | test_instance_status() { 444 | 445 | echo "==> Waiting for instance $1 to become available" 446 | instance_status="unavailable" 447 | check_attempts=0 448 | while [[ ${instance_status} != "running" && ${instance_status} != "terminated" && ${instance_status} != "shutting-down" && ${check_attempts} -lt ${instance_check_retries} ]]; do 449 | sleep 1m 450 | instance_status=$(aws ec2 describe-instances --instance-ids $1 --query "Reservations[*].Instances[*].State.Name" --output text) 451 | check_attempts=$((check_attempts+1)) 452 | echo "$1 status: ${instance_status}" 453 | done 454 | 455 | if [ ${instance_status} != 
"running" ] && [ ${instance_status} != "terminated" ] && [ ${instance_status} != "shutting-down" ]; then 456 | echo "There is a problem with instance $1 it still has ${instance_status} status after ${check_attempts} minutes, terminating" 457 | terminate_instances 458 | instance_status='terminated' 459 | fi 460 | INSTANCE_STATE=${instance_status} 461 | } 462 | 463 | # Create placement groups for cluster to run NCCL test 464 | create_pg() { 465 | 466 | if [ ${ENABLE_PLACEMENT_GROUP} -eq 0 ]; then 467 | return 0 468 | fi 469 | echo "==> Creating placement group" 470 | # We should have placement group for each subnet 471 | # Once we tr to create instance in particular subnet/AZ 472 | # PG is tied to it and cannot be used in different AZs 473 | for subnet in ${subnet_ids[@]}; do 474 | PLACEMENT_GROUP="placement-group-$(get_uniq_num)" 475 | placement_group_id=$(aws ec2 create-placement-group \ 476 | --group-name ${PLACEMENT_GROUP} \ 477 | --strategy cluster \ 478 | --tag-specification "ResourceType=placement-group,Tags=[{Key=Workspace,Value="${WORKSPACE}"},{Key=Build_Number,Value="${BUILD_NUMBER}"}]" \ 479 | --output=text --query 'PlacementGroup.GroupId') 480 | if [ $? -eq 0 ]; then 481 | echo "Placement group: ${PLACEMENT_GROUP} created." 482 | fi 483 | PGS["${subnet}"]=${PLACEMENT_GROUP} 484 | done 485 | } 486 | 487 | delete_pg() { 488 | 489 | echo "==> Removing placement groups" 490 | for placement_group in ${PGS[@]}; do 491 | if [ -z ${placement_group} ]; then 492 | echo "Placement group: ${placement_group} does not exist." 493 | return 0 494 | fi 495 | echo "==> Removing placement group: ${placement_group}" 496 | aws ec2 delete-placement-group --group-name ${placement_group} 497 | done 498 | # clearing the PGs dict 499 | for key in ${!PGS[@]}; do 500 | unset PGS["${key}"] 501 | done 502 | } 503 | 504 | generate_key() { 505 | 506 | cat <<-"EOF" > ${tmp_script} 507 | #!/bin/bash 508 | echo "==> Generating key" 509 | ssh-keygen -f ~/.ssh/id_rsa -N "" > /dev/null 2>&1 510 | chmod 600 ~/.ssh/id_rsa 511 | EOF 512 | } 513 | 514 | install_nvidia_driver() { 515 | 516 | # Install nvidia driver if it is missing 517 | cat <<-EOF > ${tmp_script} 518 | #!/bin/bash 519 | NVIDIA_DRIVER_PATH="${NVIDIA_DRIVER_PATH}" 520 | EOF 521 | cat <<-"EOF" >> ${tmp_script} 522 | echo "==> Checking if nvidia module is loaded" 523 | /sbin/lsmod | grep nvidia > /dev/null 2>&1 524 | if [ $? -eq 0 ]; then 525 | echo "==> nvidia module is loaded" 526 | exit 0 527 | fi 528 | echo "==> nvidia module is missing, installing..." 
529 | cd $HOME 530 | curl -L -o ./nvidia_driver.run "${NVIDIA_DRIVER_PATH}" 531 | sudo sh ./nvidia_driver.run --no-drm --disable-nouveau --dkms --silent --no-cc-version-check --install-libglvnd 532 | echo "==> Verify that nvidia driver is functional after installation" 533 | set -e 534 | nvidia-smi -q | head 535 | echo "==> Check nvidia driver version after installation" 536 | cat /proc/driver/nvidia/version 537 | EOF 538 | ssh -T -o ConnectTimeout=30 -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o BatchMode=yes \ 539 | -i "~/${slave_keypair}" ${ssh_user}@$1 "bash -s" < ${tmp_script} 540 | } 541 | 542 | run_nvidia_checks() { 543 | 544 | cat <<-EOF > ${tmp_script} 545 | #!/bin/bash 546 | TEST_NODE_GPUS="${TEST_NODE_GPUS}" 547 | EOF 548 | cat <<-"EOF" >> ${tmp_script} 549 | 550 | echo "==> Running nvidia GPUs count check" 551 | gpus_count=$(sudo lspci | grep "3D controller: NVIDIA Corporation Device" | wc -l) 552 | if [[ "${gpus_count}" != "${TEST_NODE_GPUS}" ]]; then 553 | echo "==> Nvidia GPUs are missing: found ${gpus_count} on board, expected ${TEST_NODE_GPUS}" 554 | exit 1 555 | fi 556 | echo "==> Running basic nvidia devices check" 557 | nvidia-smi 558 | echo "==> Running bandwidthTest for each NVIDIA device" 559 | sudo lspci | grep NVIDIA | cut -d" " -f 1 > devices.txt 560 | readarray devices_arr < devices.txt 561 | cd /usr/local/cuda/samples/1_Utilities/bandwidthTest 562 | sudo make 563 | for device in "${devices_arr[@]}"; do 564 | ./bandwidthTest --device ${device} 565 | done 566 | EOF 567 | PDNS=$(get_public_dns $1) 568 | ssh -T -o ConnectTimeout=30 -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o BatchMode=yes \ 569 | -i "~/${slave_keypair}" ${ssh_user}@${PDNS} "bash -s" < ${tmp_script} 570 | } 571 | 572 | generate_unit_tests_script_single_node() { 573 | 574 | cat <<-EOF > ${tmp_script} 575 | #!/bin/bash 576 | PROVIDER="${PROVIDER}" 577 | custom_ld_library_path="${custom_ld_library_path}" 578 | EOF 579 | 580 | cat <<-"EOF" >> ${tmp_script} 581 | while true; do 582 | echo "==> Executing Unit Tests for provider: ${PROVIDER}" 583 | 584 | echo "==> Running nccl_connection unit test" 585 | set -xe 586 | timeout 5m /opt/amazon/openmpi/bin/mpirun -n 2 \ 587 | -x FI_PROVIDER="$PROVIDER" -x FI_EFA_ENABLE_SHM_TRANSFER=0 \ 588 | -x LD_LIBRARY_PATH="${custom_ld_library_path}" \ 589 | -x RDMAV_FORK_SAFE=1 --mca pml ^cm \ 590 | --mca btl tcp,self --mca btl_tcp_if_exclude lo,docker0 \ 591 | --bind-to none ~/aws-ofi-nccl/install/bin/nccl_connection 592 | 593 | echo "==> Running ring unit test" 594 | timeout 5m /opt/amazon/openmpi/bin/mpirun -n 3 \ 595 | -x FI_PROVIDER="$PROVIDER" -x FI_EFA_ENABLE_SHM_TRANSFER=0 \ 596 | -x LD_LIBRARY_PATH="${custom_ld_library_path}" \ 597 | -x RDMAV_FORK_SAFE=1 --mca pml ^cm \ 598 | --mca btl tcp,self --mca btl_tcp_if_exclude lo,docker0 \ 599 | --bind-to none ~/aws-ofi-nccl/install/bin/ring 600 | 601 | echo "==> Running nccl_message_transfer unit test" 602 | timeout 5m /opt/amazon/openmpi/bin/mpirun -n 2 \ 603 | -x FI_PROVIDER="$PROVIDER" -x FI_EFA_ENABLE_SHM_TRANSFER=0 \ 604 | -x LD_LIBRARY_PATH="${custom_ld_library_path}" \ 605 | -x RDMAV_FORK_SAFE=1 --mca pml ^cm \ 606 | --mca btl tcp,self --mca btl_tcp_if_exclude lo,docker0 \ 607 | --bind-to none ~/aws-ofi-nccl/install/bin/nccl_message_transfer 608 | set +x 609 | break 610 | done 611 | EOF 612 | } 613 | 614 | generate_unit_tests_script_multi_node() { 615 | 616 | cat <<-EOF > ${tmp_script} 617 | #!/bin/bash 618 | PROVIDER="${PROVIDER}" 619 | 
custom_ld_library_path="${custom_ld_library_path}" 620 | EOF 621 | 622 | cat <<-"EOF" >> ${tmp_script} 623 | while true; do 624 | echo "==> Executing Unit Tests for provider: ${PROVIDER}" 625 | 626 | echo "==> Running nccl_connection unit test" 627 | set -xe 628 | timeout 5m /opt/amazon/openmpi/bin/mpirun -n 2 -N 1 \ 629 | -x FI_PROVIDER="$PROVIDER" -x FI_EFA_ENABLE_SHM_TRANSFER=0 \ 630 | -x LD_LIBRARY_PATH="${custom_ld_library_path}" \ 631 | -x RDMAV_FORK_SAFE=1 --mca pml ^cm \ 632 | --mca btl tcp,self --mca btl_tcp_if_exclude lo,docker0 \ 633 | --bind-to none --tag-output --hostfile hosts ~/aws-ofi-nccl/install/bin/nccl_connection 634 | 635 | echo "==> Running ring unit test" 636 | timeout 5m /opt/amazon/openmpi/bin/mpirun -n 3 -N 1 \ 637 | -x FI_PROVIDER="$PROVIDER" -x FI_EFA_ENABLE_SHM_TRANSFER=0 \ 638 | -x LD_LIBRARY_PATH="${custom_ld_library_path}" \ 639 | -x RDMAV_FORK_SAFE=1 --mca pml ^cm \ 640 | --mca btl tcp,self --mca btl_tcp_if_exclude lo,docker0 \ 641 | --bind-to none --tag-output --hostfile hosts ~/aws-ofi-nccl/install/bin/ring 642 | 643 | echo "==> Running nccl_message_transfer unit test" 644 | timeout 5m /opt/amazon/openmpi/bin/mpirun -n 2 -N 1 \ 645 | -x FI_PROVIDER="$PROVIDER" -x FI_EFA_ENABLE_SHM_TRANSFER=0 \ 646 | -x LD_LIBRARY_PATH="${custom_ld_library_path}" \ 647 | -x RDMAV_FORK_SAFE=1 --mca pml ^cm \ 648 | --mca btl tcp,self --mca btl_tcp_if_exclude lo,docker0 \ 649 | --bind-to none --tag-output --hostfile hosts ~/aws-ofi-nccl/install/bin/nccl_message_transfer 650 | set +x 651 | break 652 | done 653 | EOF 654 | } 655 | 656 | generate_nccl_test_script() { 657 | 658 | cat <<-EOF > ${tmp_script} 659 | #!/bin/bash 660 | PROVIDER="${PROVIDER}" 661 | NUM_GPUS=$1 662 | custom_ld_library_path="${custom_ld_library_path}" 663 | EOF 664 | cat <<-"EOF" >> ${tmp_script} 665 | echo "Executing NCCL test..." 666 | echo "==> The provider for the test is: ${PROVIDER}" 667 | echo "==> The number of GPUs is: ${NUM_GPUS}" 668 | 669 | set -xe 670 | timeout 30m /opt/amazon/openmpi/bin/mpirun \ 671 | -x FI_PROVIDER="$PROVIDER" \ 672 | -x NCCL_ALGO=ring --hostfile $HOME/hosts \ 673 | -x FI_EFA_ENABLE_SHM_TRANSFER=0 \ 674 | -x LD_LIBRARY_PATH="${custom_ld_library_path}" \ 675 | -x FI_EFA_TX_MIN_CREDITS=64 \ 676 | -x RDMAV_FORK_SAFE=1 \ 677 | -x NCCL_DEBUG=INFO \ 678 | -n $NUM_GPUS -N 8 \ 679 | --mca btl tcp,self --mca btl_tcp_if_exclude lo,docker0 --mca pml ^cm \ 680 | --bind-to none $HOME/nccl-tests/build/all_reduce_perf -b 8 -e 1G -f 2 -g 1 -c 1 -n 100 681 | set +x 682 | EOF 683 | } 684 | 685 | # Check if EFA provider has been used during test execution 686 | check_allperf_efa_usage() { 687 | 688 | grep "Selected Provider is efa" $1 > /dev/null 689 | if [ $? 
-ne 0 ];then 690 | echo "EFA provider was not used during the test" 691 | exit 1 692 | fi 693 | } 694 | 695 | on_exit() { 696 | # Clean up instances, SGs, and PGs after the test 697 | for reg in ${aws_regions[@]}; do 698 | INSTANCE_IDS=($(aws --region ${reg} ec2 describe-instances --filters "[{\"Name\":\"instance-state-name\",\"Values\":[\"pending\",\"running\",\"stopped\"]},{\"Name\":\"tag:Workspace\",\"Values\":[\"${WORKSPACE}\"]},{\"Name\":\"tag:Build_Number\",\"Values\":[\"${BUILD_NUMBER}\"]}]" --query "Reservations[*].Instances[*].InstanceId" --output text)) 699 | INSTANCE_IDS_SIZE=${#INSTANCE_IDS[@]} 700 | SG_IDS=($(aws --region ${reg} ec2 describe-security-groups --filters "[{\"Name\":\"tag:Workspace\",\"Values\":[\"${WORKSPACE}\"]},{\"Name\":\"tag:Build_Number\",\"Values\":[\"${BUILD_NUMBER}\"]}]" --query "SecurityGroups[*].{Name:GroupId}" --output text)) 701 | SG_IDS_SIZE=${#SG_IDS[@]} 702 | if [ ${INSTANCE_IDS_SIZE} -ne 0 ]; then 703 | aws --region ${reg} ec2 terminate-instances --instance-ids ${INSTANCE_IDS[@]} 704 | aws --region ${reg} ec2 wait instance-terminated --instance-ids ${INSTANCE_IDS[@]} 705 | fi 706 | if [ ${SG_IDS_SIZE} -ne 0 ]; then 707 | for sg in ${SG_IDS[@]}; do 708 | aws --region ${reg} ec2 delete-security-group --group-id ${sg} 709 | done 710 | fi 711 | done 712 | deregister_ami 713 | delete_pg 714 | } 715 | -------------------------------------------------------------------------------- /nccl/common/prep_ami.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 4 | # Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 5 | # 6 | 7 | set -e 8 | 9 | echo "Starting host preparation to create custom AMI for NCCL testing" 10 | 11 | echo "==> PULL_REQUEST_REF: ${PULL_REQUEST_REF}" 12 | echo "==> PULL_REQUEST_ID: ${PULL_REQUEST_ID}" 13 | echo "==> TARGET_BRANCH: ${TARGET_BRANCH}" 14 | echo "==> TARGET_REPO: ${TARGET_REPO}" 15 | echo "==> PROVIDER: ${PROVIDER}" 16 | echo "==> LIBFABRIC_INSTALL_PREFIX: ${LIBFABRIC_INSTALL_PREFIX}" 17 | echo "==> AWS_OFI_NCCL_INSTALL_PREFIX: ${AWS_OFI_NCCL_INSTALL_PREFIX}" 18 | echo "==> NCCL_INSTALL_PREFIX: ${NCCL_INSTALL_PREFIX}" 19 | 20 | 21 | eval "PLATFORM_ID=`sed -n 's/^ID=//p' /etc/os-release`" 22 | eval "VERSION_ID=`sed -n 's/^VERSION_ID=//p' /etc/os-release`" 23 | 24 | echo "==> Platform: $PLATFORM_ID" 25 | echo "==> Version: $VERSION_ID" 26 | 27 | # Identify the aws-ofi-nccl plugin branch based on the provider 28 | if [[ ${TARGET_REPO} == 'ofiwg/libfabric' ]];then 29 | if [[ ${PROVIDER} == 'efa' ]];then 30 | plugin_branch='aws' 31 | else 32 | plugin_branch='master' 33 | fi 34 | fi 35 | 36 | # Locking NCCL version to 2.8.4-1 37 | NCCL_VERSION='v2.8.4-1' 38 | 39 | # Latest EFA installer location 40 | EFA_INSTALLER_LOCATION='https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz' 41 | 42 | # Identify latest CUDA on server 43 | latest_cuda=$(find /usr/local -maxdepth 1 -type d -iname "cuda*" | sort -V -r | head -1) 44 | echo "==> Latest CUDA: ${latest_cuda}" 45 | echo "==> Installing packages" 46 | 47 | generate_key() { 48 | 49 | echo "==> Generating key" 50 | ssh-keygen -f ~/.ssh/id_rsa -N "" > /dev/null 2>&1 51 | chmod 600 ~/.ssh/id_rsa 52 | cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys 53 | } 54 | 55 | generate_config() { 56 | 57 | ssh_config=$( 58 | cat <<-"EOF" 59 | Host * 60 | ForwardAgent yes 61 | Host * 62 | StrictHostKeyChecking no 63 | EOF 64 | ) 65 | 66 | echo "${ssh_config}" > ~/.ssh/config 67 | chmod 600 ~/.ssh/config 68 | } 69 | 70 | 
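# On Ubuntu, unattended upgrades may hold /var/lib/dpkg/lock-frontend for several minutes after boot; poll until apt/dpkg is usable again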
check_lock() { 71 | 72 | set +e 73 | echo "==> Checking if lock-frontend is in use" 74 | lock_check_retries=10 75 | no_lock=0 76 | while [ $lock_check_retries -ne 0 ] && [ $no_lock -ne 1 ]; do 77 | lock=$(sudo lsof /var/lib/dpkg/lock-frontend) 78 | if [ ! -z "${lock}" ]; then 79 | echo "lock-frontend is still in use, waiting for 2 minutes" 80 | lock_check_retries=$((lock_check_retries-1)) 81 | sleep 2m 82 | else 83 | echo "lock-frontend is released" 84 | no_lock=1 85 | fi 86 | done 87 | if [ ! -z "${lock}" ] ; then 88 | echo "All attempts to wait for the lock failed." 89 | exit 1 90 | fi 91 | set -e 92 | } 93 | 94 | install_efa_installer() { 95 | curl -o efa_installer.tar.gz ${EFA_INSTALLER_LOCATION} 96 | tar -xf efa_installer.tar.gz 97 | cd aws-efa-installer 98 | # add /opt/amazon/efa and /opt/amazon/openmpi to the PATH 99 | . /etc/profile.d/efa.sh 100 | sudo ./efa_installer.sh -y 101 | # check the version of the installer after installation 102 | echo "==> Efa installer version after installation" 103 | cat /opt/amazon/efa_installed_packages 104 | } 105 | 106 | install_libfabric() { 107 | 108 | cd ${HOME}/libfabric 109 | ./autogen.sh 110 | ./configure --prefix=${LIBFABRIC_INSTALL_PREFIX} \ 111 | --enable-debug \ 112 | --enable-mrail \ 113 | --enable-tcp \ 114 | --enable-rxm \ 115 | --disable-rxd \ 116 | --disable-verbs \ 117 | --enable-efa 118 | make -j 4 119 | make install 120 | 121 | export LD_LIBRARY_PATH=${LIBFABRIC_INSTALL_PREFIX}/lib/:${LD_LIBRARY_PATH} 122 | } 123 | 124 | prepare_libfabric_without_pr() { 125 | 126 | echo "==> Building libfabric" 127 | cd ${HOME} 128 | sudo rm -rf libfabric 129 | git clone https://github.com/ofiwg/libfabric -b 'main' 130 | } 131 | 132 | prepare_libfabric_with_pr() { 133 | 134 | echo "==> This PR belongs to the libfabric repo: ofiwg/libfabric" 135 | echo "==> Starting custom libfabric installation" 136 | # Pulls the libfabric repository and checks out the pull request commit 137 | cd ${HOME} 138 | sudo rm -rf libfabric 139 | git clone https://github.com/ofiwg/libfabric 140 | cd ${HOME}/libfabric 141 | git fetch origin +refs/pull/$PULL_REQUEST_ID/*:refs/remotes/origin/pr/$PULL_REQUEST_ID/* 142 | git checkout $PULL_REQUEST_REF -b PRBranch 143 | } 144 | 145 | check_efa_installation_libfabric(){ 146 | 147 | echo "==> Check if EFA installed correctly" 148 | ${HOME}/libfabric/install/bin/fi_info -p efa 149 | } 150 | 151 | install_nccl() { 152 | 153 | echo "==> Install NCCL" 154 | sudo rm -rf ${NCCL_INSTALL_PREFIX} 155 | git clone https://github.com/NVIDIA/nccl.git ${NCCL_INSTALL_PREFIX} 156 | cd ${NCCL_INSTALL_PREFIX} 157 | git checkout ${NCCL_VERSION} 158 | make -j src.build CUDA_HOME=${latest_cuda} 159 | } 160 | 161 | install_nccl_tests() { 162 | 163 | echo "==> Install NCCL Tests" 164 | cd $HOME 165 | sudo rm -rf nccl-tests 166 | git clone https://github.com/NVIDIA/nccl-tests.git 167 | cd nccl-tests 168 | make MPI=1 MPI_HOME=/opt/amazon/openmpi NCCL_HOME=$HOME/nccl/build CUDA_HOME=${latest_cuda} 169 | } 170 | 171 | install_aws_ofi_nccl_plugin() { 172 | 173 | cd $HOME/aws-ofi-nccl 174 | ./autogen.sh 175 | ./configure --prefix=${AWS_OFI_NCCL_INSTALL_PREFIX} \ 176 | --with-mpi=/opt/amazon/openmpi \ 177 | --with-libfabric=${LIBFABRIC_INSTALL_PREFIX} \ 178 | --with-nccl="${NCCL_INSTALL_PREFIX}/build" \ 179 | --with-cuda=${latest_cuda} 180 | make 181 | make install 182 | } 183 | 184 | prepare_aws_ofi_nccl_plugin_without_pr() { 185 | 186 | echo "==> Install aws-ofi-nccl plugin" 187 | echo "==> Configure from branch: ${plugin_branch} provider: ${PROVIDER}" 188 | 
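# Clone the plugin from the branch matched to the provider above ('aws' for efa, 'master' otherwise)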
cd $HOME 189 | sudo rm -rf aws-ofi-nccl 190 | git clone https://github.com/aws/aws-ofi-nccl.git -b ${plugin_branch} 191 | } 192 | 193 | prepare_aws_ofi_nccl_plugin_with_pr() { 194 | 195 | echo "==> This PR belongs to the plugin repo: aws/aws-ofi-nccl" 196 | echo "==> Install aws-ofi-nccl plugin" 197 | cd $HOME 198 | sudo rm -rf aws-ofi-nccl 199 | if [[ ${TARGET_BRANCH} == 'master' && ${PROVIDER} == 'tcp;ofi_rxm' ]]; then 200 | echo "==> Configure based on PR, branch: ${TARGET_BRANCH} for provider: ${PROVIDER}" 201 | git clone https://github.com/aws/aws-ofi-nccl.git -b 'master' 202 | cd aws-ofi-nccl 203 | git fetch origin +refs/pull/${PULL_REQUEST_ID}/*:refs/remotes/origin/pr/${PULL_REQUEST_ID}/* 204 | git checkout ${PULL_REQUEST_REF} -b PRBranch 205 | elif [[ ${TARGET_BRANCH} == 'aws' && ${PROVIDER} == 'efa' ]]; then 206 | echo "==> Configure based on PR, branch: ${TARGET_BRANCH} for provider: ${PROVIDER}" 207 | git clone https://github.com/aws/aws-ofi-nccl.git -b 'aws' 208 | cd aws-ofi-nccl 209 | git fetch origin +refs/pull/${PULL_REQUEST_ID}/*:refs/remotes/origin/pr/${PULL_REQUEST_ID}/* 210 | git checkout ${PULL_REQUEST_REF} -b PRBranch 211 | elif [[ ${PROVIDER} == 'efa' ]]; then 212 | echo "==> Configure from aws branch for ${PROVIDER} provider" 213 | git clone https://github.com/aws/aws-ofi-nccl.git -b 'aws' 214 | elif [[ ${PROVIDER} == 'tcp;ofi_rxm' ]]; then 215 | echo "==> Configure from master branch for ${PROVIDER} provider" 216 | git clone https://github.com/aws/aws-ofi-nccl.git -b 'master' 217 | fi 218 | } 219 | 220 | install_software() { 221 | 222 | generate_key 223 | generate_config 224 | install_efa_installer 225 | if [[ ${TARGET_REPO} == 'ofiwg/libfabric' ]];then 226 | prepare_libfabric_with_pr 227 | install_libfabric 228 | check_efa_installation_libfabric 229 | install_nccl 230 | prepare_aws_ofi_nccl_plugin_without_pr 231 | install_aws_ofi_nccl_plugin 232 | install_nccl_tests 233 | else 234 | prepare_libfabric_without_pr 235 | install_libfabric 236 | check_efa_installation_libfabric 237 | install_nccl 238 | prepare_aws_ofi_nccl_plugin_with_pr 239 | install_aws_ofi_nccl_plugin 240 | install_nccl_tests 241 | fi 242 | } 243 | 244 | case $PLATFORM_ID in 245 | amzn) 246 | sudo yum -y groupinstall 'Development Tools' 247 | install_software 248 | ;; 249 | ubuntu) 250 | # Wait until the /var/lib/dpkg/lock-frontend lock is released by unattended security upgrades 251 | sleep 30 252 | check_lock 253 | install_software 254 | ;; 255 | *) 256 | echo "ERROR: Unknown platform ${PLATFORM_ID}" 257 | exit 1 258 | esac 259 | -------------------------------------------------------------------------------- /nccl/tests/nccl-multi-node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 4 | # Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
5 | # 6 | 7 | set -e 8 | 9 | echo "INFO ==> Starting preparation for NCCL multi-node tests" 10 | 11 | source $WORKSPACE/libfabric-ci-scripts/nccl/common/nccl-common.sh 12 | 13 | # Number of nodes used for nccl tests; at least 3 nodes are required for the ring unit test 14 | NUM_NODES=3 15 | 16 | # Each node has 8 GPUs 17 | NUM_GPUS=$(( ${NUM_NODES} * 8 )) 18 | 19 | set_jenkins_variables 20 | 21 | trap 'on_exit' EXIT 22 | 23 | ENABLE_PLACEMENT_GROUP=0 24 | 25 | ami_instance_preparation 26 | 27 | # Create Nodes 28 | echo "==> Creating Nodes" 29 | 30 | prepare_instance 'test_instance' ${NUM_NODES} 31 | 32 | nodes_ips=() 33 | 34 | nodes_pub_dns=() 35 | 36 | for instance in ${INSTANCES[@]}; do 37 | nodes_pub_dns+=($(get_public_dns ${instance})) 38 | nodes_ips+=($(get_instance_ip ${instance})) 39 | done 40 | 41 | truncate -s 0 ${tmp_script} 42 | 43 | for ip in ${nodes_ips[@]}; do 44 | echo "${ip} slots=8" >> ${tmp_script} 45 | done 46 | 47 | for pub_dns in ${nodes_pub_dns[@]}; do 48 | scp -i "~/${slave_keypair}" -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no ${tmp_script} ${ssh_user}@${pub_dns}:/home/${ssh_user}/hosts 49 | done 50 | 51 | for pub_dns in ${nodes_pub_dns[@]}; do 52 | install_nvidia_driver ${pub_dns} 53 | done 54 | 55 | echo "==> Running unit tests" 56 | 57 | generate_unit_tests_script_multi_node 58 | 59 | ssh -T -o ConnectTimeout=30 -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o BatchMode=yes \ 60 | -i "~/${slave_keypair}" ${ssh_user}@${nodes_pub_dns[0]} "bash -s" < ${tmp_script} 61 | 62 | echo "==> Running NCCL test on ${NUM_NODES} nodes with ${NUM_GPUS} GPUs" 63 | 64 | generate_nccl_test_script ${NUM_GPUS} 65 | 66 | ssh -T -o ConnectTimeout=30 -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o BatchMode=yes \ 67 | -i "~/${slave_keypair}" ${ssh_user}@${nodes_pub_dns[0]} "bash -s" < ${tmp_script} >> ${tmp_out} 68 | 69 | # Show full test results 70 | cat ${tmp_out} 71 | 72 | if [[ "${PROVIDER}" == 'efa' ]]; then 73 | # check if EFA was used during the all_reduce_perf test 74 | check_allperf_efa_usage ${tmp_out} 75 | fi 76 | 77 | # Show only busbw 78 | echo "==> The test result busbw (GB/s): " `cat ${tmp_out} | grep ${test_b_size} | tail -n1 | awk -F " " '{print $11}' | sed 's/ //' | sed 's/ 5e-07//'` 79 | 80 | echo "==> All done" 81 | -------------------------------------------------------------------------------- /nccl/tests/nccl-single-node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 4 | # Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
5 | # 6 | 7 | set -e 8 | 9 | echo "INFO ==> Starting preparation for NCCL single-node test" 10 | 11 | source $WORKSPACE/libfabric-ci-scripts/nccl/common/nccl-common.sh 12 | 13 | # Number of nodes used for nccl tests 14 | NUM_NODES=1 15 | 16 | # Each node has 8 GPUs 17 | NUM_GPUS=$(( ${NUM_NODES} * 8 )) 18 | 19 | set_jenkins_variables 20 | 21 | trap 'on_exit' EXIT 22 | 23 | ENABLE_PLACEMENT_GROUP=0 24 | 25 | ami_instance_preparation 26 | 27 | prepare_instance 'test_instance' ${NUM_NODES} 28 | 29 | PublicDNSLeader=$(get_public_dns ${INSTANCE_IDS}) 30 | 31 | LeaderIp=$(get_instance_ip ${INSTANCE_IDS}) 32 | 33 | install_nvidia_driver ${PublicDNSLeader} 34 | 35 | hosts=$( 36 | cat <<-EOF 37 | ${LeaderIp} slots=8 38 | EOF 39 | ) 40 | 41 | echo "${hosts}" > ${tmp_script} 42 | 43 | scp -i "~/${slave_keypair}" -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no ${tmp_script} ${ssh_user}@${PublicDNSLeader}:/home/${ssh_user}/hosts 44 | 45 | echo "==> Running unit tests" 46 | generate_unit_tests_script_single_node 47 | 48 | ssh -T -o ConnectTimeout=30 -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o BatchMode=yes \ 49 | -i "~/${slave_keypair}" ${ssh_user}@${PublicDNSLeader} "bash -s" < ${tmp_script} 50 | 51 | echo "==> Running NCCL test with ${NUM_GPUS} GPUs" 52 | generate_nccl_test_script ${NUM_GPUS} 53 | 54 | ssh -T -o ConnectTimeout=30 -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o BatchMode=yes \ 55 | -i "~/${slave_keypair}" ${ssh_user}@${PublicDNSLeader} "bash -s" < ${tmp_script} >> ${tmp_out} 56 | 57 | # Show full test results 58 | cat ${tmp_out} 59 | 60 | if [[ "${PROVIDER}" == 'efa' ]]; then 61 | # check if EFA was used during the all_reduce_perf test 62 | check_allperf_efa_usage ${tmp_out} 63 | fi 64 | 65 | # Show only busbw 66 | echo "==> The test result busbw (GB/s): " `cat ${tmp_out} | grep ${test_b_size} | tail -n1 | awk -F " " '{print $11}' | sed 's/ //' | sed 's/ 5e-07//' ` 67 | 68 | echo "==> All done" 69 | -------------------------------------------------------------------------------- /run-nccl-tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source ~/.bash_profile 4 | source ~/mpi_common.sh 5 | source /etc/profile.d/efa.sh 6 | 7 | set -x 8 | set -o pipefail 9 | 10 | hosts=$@ 11 | hostfile=$(mktemp) 12 | out=$(mktemp) 13 | 14 | host_setup ${hostfile} ${hosts} 15 | 16 | echo "Running nccl-tests: all_reduce_perf" 17 | 18 | mpirun --prefix /opt/amazon/openmpi \ 19 | --hostfile $hostfile \ 20 | -x PATH -x LD_LIBRARY_PATH="/opt/amazon/openmpi/lib64:/opt/amazon/openmpi/lib:$LD_LIBRARY_PATH" \ 21 | -x FI_PROVIDER=efa \ 22 | -x MPIEXEC_TIMEOUT=1800 \ 23 | -x FI_EFA_USE_DEVICE_RDMA=1 \ 24 | -x RDMAV_FORK_SAFE=1 \ 25 | -x NCCL_DEBUG=INFO -x NCCL_ALGO=ring -x NCCL_PROTO=simple \ 26 | -n 16 -N 8 \ 27 | --mca btl tcp,self --mca btl_tcp_if_exclude lo,docker0 --mca pml ^cm \ 28 | --bind-to none $HOME/nccl-tests/build/all_reduce_perf -b 8 -e 1G -f 2 -g 1 -c 1 -n 100 2>&1 | tee $out 29 | 30 | if [ $? -ne 0 ]; then 31 | echo "nccl-tests: all_reduce_perf failed" 32 | exit 1 33 | fi 34 | 35 | # Verify EFA is selected. 36 | grep -q "Selected Provider is efa" $out 37 | if [ $? -ne 0 ]; then 38 | echo "EFA provider is not selected in nccl-tests." 39 | exit 1 40 | fi 41 | 42 | # Verify GPU Direct RDMA is used. 43 | grep -q "\[send\] via NET/AWS Libfabric/0/GDRDMA" $out 44 | if [ $? -ne 0 ]; then 45 | echo "GPU Direct RDMA is not used in nccl-tests." 
46 | exit 1 47 | fi 48 | echo "Test Passed" 49 | set +x 50 | -------------------------------------------------------------------------------- /single-node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -xe 4 | source $WORKSPACE/libfabric-ci-scripts/common.sh 5 | trap 'on_exit' EXIT 6 | slave_name=slave_$label 7 | slave_value=${!slave_name} 8 | ami=($slave_value) 9 | NODES=1 10 | # Placement group is not needed for single-node tests. 11 | export ENABLE_PLACEMENT_GROUP=0 12 | export USER_DATA_FILE=${USER_DATA_FILE:-${JENKINS_HOME}/user_data_script.sh} 13 | 14 | set +x 15 | create_instance || { echo "==> Unable to create instance"; exit 65; } 16 | set -x 17 | 18 | get_instance_ip 19 | 20 | execution_seq=$((${execution_seq}+1)) 21 | test_ssh ${INSTANCE_IPS} 22 | 23 | 24 | scp -o ConnectTimeout=30 -o StrictHostKeyChecking=no -i ~/${slave_keypair} \ 25 | $WORKSPACE/libfabric-ci-scripts/wget_check.sh \ 26 | ${ami[1]}@${INSTANCE_IPS}:~/ 27 | 28 | execution_seq=$((${execution_seq}+1)) 29 | 30 | # Add AMI-specific installation commands 31 | script_builder single-node 32 | 33 | # Appending fabtests to the existing installation script 34 | cat <<-"EOF" >> ${tmp_script} 35 | . ~/.bash_profile 36 | ssh-keygen -f ${HOME}/.ssh/id_rsa -N "" > /dev/null 37 | cat ${HOME}/.ssh/id_rsa.pub >> ${HOME}/.ssh/authorized_keys 38 | 39 | runfabtests_script="${HOME}/libfabric/fabtests/install/bin/runfabtests.sh" 40 | 41 | EXCLUDE=${HOME}/libfabric/fabtests/install/share/fabtests/test_configs/${PROVIDER}/${PROVIDER}.exclude 42 | if [ -f ${EXCLUDE} ]; then 43 | EXCLUDE="-R -f ${EXCLUDE}" 44 | else 45 | EXCLUDE="" 46 | fi 47 | 48 | # Provider-specific handling of the options passed to runfabtests.sh 49 | FABTESTS_OPTS="-E LD_LIBRARY_PATH=\"$LD_LIBRARY_PATH\" -vvv ${EXCLUDE}" 50 | FABTESTS_OPTS+=" -p ${HOME}/libfabric/fabtests/install/bin/" 51 | case "${PROVIDER}" in 52 | "efa") 53 | # EFA provider supports a custom address format based on the GID of the 54 | # device. Extract that from sysfs and pass it to the tests. Also have the 55 | # client communicate with QP0 of the server. This is only for older 56 | # versions of fabtests; newer versions can use the -b option to exchange 57 | # out of band. 58 | b_option_available="$($runfabtests_script -h 2>&1 | grep '\-b' || true)" 59 | # Check if '-P' option (Run provider specific fabtests) is available 60 | P_option_available="$($runfabtests_script -h 2>&1 | grep '\-P' || true)" 61 | FABTESTS_OPTS+=" -t all" 62 | if [ -n "$P_option_available" ]; then 63 | FABTESTS_OPTS+=" -P" 64 | fi 65 | if [ -n "$b_option_available" ]; then 66 | FABTESTS_OPTS+=" -b" 67 | else 68 | gid=$(ibv_devinfo -v | grep GID | awk '{print $3}') 69 | FABTESTS_OPTS+=" -C \"-P 0\" -s $gid -c $gid" 70 | fi 71 | ;; 72 | "shm") 73 | # The shm provider does not support the negative tests with bad addresses, 74 | # and there seems to be no easy way to add them to the exclude lists. 75 | # See https://github.com/ofiwg/libfabric/issues/5182 for context. 76 | FABTESTS_OPTS+=" -N" 77 | ;; 78 | esac 79 | 80 | bash -c "$runfabtests_script ${FABTESTS_OPTS} ${PROVIDER} 127.0.0.1 127.0.0.1" 81 | 82 | EOF 83 | 84 | # Test whether the node is ready for an SSH connection 85 | test_ssh ${INSTANCE_IPS} 86 | 87 | execution_seq=$((${execution_seq}+1)) 88 | # For single node, the ssh connection is established only once. 
The script 89 | # builds libfabric and also executes fabtests 90 | set +x 91 | ssh -o ConnectTimeout=30 -o StrictHostKeyChecking=no -T -i ~/${slave_keypair} ${ami[1]}@${INSTANCE_IPS} \ 92 | "bash -s" -- <${tmp_script} \ 93 | "$PULL_REQUEST_ID" "$PULL_REQUEST_REF" "$PROVIDER" "$ami_arch" "$libfabric_job_type" 2>&1 | tr \\r \\n | \ 94 | sed 's/\(.*\)/'${INSTANCE_IPS}' \1/' | tee ${output_dir}/temp_execute_runfabtests.txt 95 | EXIT_CODE=${PIPESTATUS[0]} 96 | set -x 97 | 98 | # Get build status 99 | exit_status "$EXIT_CODE" "${INSTANCE_IPS}" 100 | exit ${BUILD_CODE} 101 | -------------------------------------------------------------------------------- /test/setup.sh: -------------------------------------------------------------------------------- 1 | # Modify and source this file to test these scripts outside of Jenkins. 2 | export WORKSPACE=${HOME} 3 | export BUILD_NUMBER=test 4 | export PROVIDER=efa 5 | export AWS_DEFAULT_REGION=us-west-2 6 | # This will cause the git checkout to fail and stay on master; we will follow up 7 | # later to fix these scripts to take arbitrary branches, tags, etc. 8 | export PULL_REQUEST_ID=0 9 | export PULL_REQUEST_REF=0 10 | export label=alinux # rhel and ubuntu are also valid options 11 | export slave_rhel=ami-036affea69a1101c9\ ec2-user 12 | export slave_alinux=ami-0cb72367e98845d43\ ec2-user 13 | export slave_ubuntu=ami-005bdb005fb00e791\ ubuntu 14 | export slave_security_group=sg-xxxxxxxx 15 | export slave_keypair=keypair 16 | -------------------------------------------------------------------------------- /wget_check.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Loads the wget_check function, which downloads files using wget. 4 | # The following two flags are passed to wget: 5 | # 1) tries: set to 5, retry up to 5 times in case of a failure 6 | # 2) content-on-error: if an error occurs, download the error page 7 | # If an error occurs, wget saves the error page under the filename 8 | # provided for the download (generally a .tar name); if that file is 9 | # plain ASCII text, it is cat'ed to stdout so the error is visible in the log 10 | 11 | WGET_OPT="--tries=5 --content-on-error --no-verbose" 12 | 13 | function wget_check { 14 | url=$1 15 | file_name=$2 16 | bash_option=$- 17 | restore_e=0 18 | if [[ $bash_option =~ e ]]; then 19 | restore_e=1 20 | set +e 21 | fi 22 | # bash -c is used to avoid issues due to quotation within quotation 23 | bash -c "wget ${WGET_OPT} -O $file_name $url" 24 | if [ $? -ne 0 ]; then 25 | if [ -f "$file_name" ]; then 26 | # Output the file only if its type is ASCII text 27 | if file $file_name | grep -q "ASCII"; then 28 | cat $file_name 29 | fi 30 | fi 31 | exit 1 32 | fi 33 | if [ $restore_e -eq 1 ]; then 34 | set -e 35 | fi 36 | } 37 | --------------------------------------------------------------------------------