├── .github └── workflows │ └── codeql.yml ├── .gitignore ├── DEVEL.md ├── LICENSE ├── README.md ├── SECURITY.md ├── ThirdPartyNotice ├── azure-slurm-install ├── capture_logs.sh ├── conf │ └── install_logging.conf ├── imex_epilog.sh ├── imex_prolog.sh ├── install.py ├── install.sh ├── installlib.py ├── package.py ├── package.sh ├── rhel.sh ├── setup.py ├── slurmel8.repo ├── slurmel8insiders.repo ├── slurmel9.repo ├── slurmel9insiders.repo ├── start-services.sh ├── suse.sh ├── templates │ ├── cgroup.conf.template │ ├── enroot.conf.template │ ├── job_submit.lua │ ├── munge.override │ ├── slurm-limits.conf │ ├── slurm.conf.template │ ├── slurm_exporter.yml │ ├── slurmctld.override │ ├── slurmdbd.conf.template │ └── slurmrestd.override ├── test │ └── installlib_test.py └── ubuntu.sh ├── azure-slurm ├── conf │ └── logging.conf ├── install.sh ├── package.py ├── package.sh ├── sbin │ ├── get_acct_info.sh │ ├── init-config.sh │ ├── post-install.sh │ ├── prolog.sh │ ├── resume_fail_program.sh │ ├── resume_program.sh │ ├── return_to_idle.sh │ ├── return_to_idle_legacy.sh │ └── suspend_program.sh ├── setup.py ├── slurmcc │ ├── __init__.py │ ├── allocation.py │ ├── azslurmd.py │ ├── azslurmdwrapper.py │ ├── cli.py │ ├── cost.py │ ├── partition.py │ ├── topology.py │ └── util.py └── test │ └── slurmcc_test │ ├── __init__.py │ ├── allocation_test.py │ ├── azslurmd_test.py │ ├── cli_test.py │ ├── testutil.py │ ├── testutil_test.py │ ├── topology_test.py │ ├── topology_test_input │ ├── all_hostnames.txt │ ├── block_topology.txt │ ├── guid_hostnames.txt │ ├── guids.txt │ ├── hostnames.txt │ ├── nodes_clusterUUIDs.txt │ ├── nodes_clusterUUIDs_2.txt │ ├── nodes_guids.txt │ ├── partitions.txt │ ├── powered_down_hostnames.txt │ ├── slurm_illegal_block_topology.txt │ ├── slurm_illegal_block_topology_2.txt │ ├── slurm_topology.txt │ ├── topology.txt │ └── valid_hostnames.txt │ └── util_test.py ├── description.html ├── dev-requirements.txt ├── docker-package.sh ├── icon.png ├── images ├── nodearrayedit.png ├── nodearraytab.png └── schedulernodeedit.png ├── integration ├── README.md └── src │ └── integration.py ├── project.ini ├── specs ├── default │ ├── chef │ │ ├── roles │ │ │ ├── slurm_execute_role.rb │ │ │ ├── slurm_login_role.rb │ │ │ └── slurm_scheduler_role.rb │ │ └── site-cookbooks │ │ │ └── slurm │ │ │ ├── .gitignore │ │ │ ├── README.md │ │ │ ├── attributes │ │ │ └── default.rb │ │ │ ├── chefignore │ │ │ ├── libraries │ │ │ └── helpers.rb │ │ │ ├── metadata.rb │ │ │ └── recipes │ │ │ └── delayed_services.rb │ └── cluster-init │ │ ├── files │ │ ├── JobSubmitPlugin │ │ │ └── job_submit_cyclecloud.lua │ │ ├── README.txt │ │ └── install-non-scheduler.sh │ │ ├── scripts │ │ └── README.txt │ │ └── tests │ │ ├── README.txt │ │ └── test_uid.py ├── execute │ └── cluster-init │ │ └── scripts │ │ └── 00-install-execute.sh ├── login │ └── cluster-init │ │ └── scripts │ │ └── 00-install-login.sh └── scheduler │ └── cluster-init │ ├── scripts │ └── 00-install.sh │ └── tests │ └── test_slurm.py ├── templates ├── slurm-beegfs.txt ├── slurm-cs.txt ├── slurm-custom.txt └── slurm.txt └── util ├── Dockerfile └── build.sh /.github/workflows/codeql.yml: -------------------------------------------------------------------------------- 1 | # Basic python codeql workflow with minor customizations 2 | # 3 | # For most projects, this workflow file will not need changing; you simply need 4 | # to commit it to your repository. 
5 | # 6 | # You may wish to alter this file to override the set of languages analyzed, 7 | # or to provide custom queries or build logic. 8 | # 9 | # ******** NOTE ******** 10 | # We have attempted to detect the languages in your repository. Please check 11 | # the `language` matrix defined below to confirm you have the correct set of 12 | # supported CodeQL languages. 13 | # 14 | name: "CodeQL" 15 | 16 | on: 17 | push: 18 | branches: [ "master" ] 19 | pull_request: 20 | # The branches below must be a subset of the branches above 21 | branches: [ "master" ] 22 | schedule: 23 | - cron: '42 23 * * 4' 24 | 25 | jobs: 26 | analyze: 27 | name: Analyze 28 | runs-on: ubuntu-latest 29 | permissions: 30 | actions: read 31 | contents: read 32 | security-events: write 33 | 34 | strategy: 35 | fail-fast: false 36 | matrix: 37 | language: [ 'python' ] 38 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ] 39 | # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support 40 | 41 | steps: 42 | - name: Checkout repository 43 | uses: actions/checkout@v4 44 | 45 | # Initializes the CodeQL tools for scanning. 46 | - name: Initialize CodeQL 47 | uses: github/codeql-action/init@v3 48 | with: 49 | languages: ${{ matrix.language }} 50 | # If you wish to specify custom queries, you can do so here or in a config file. 51 | # By default, queries listed here will override any specified in a config file. 52 | # Prefix the list here with "+" to use these queries and those in the config file. 53 | 54 | # Details on CodeQL's query packs refer to : https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs 55 | # queries: security-extended,security-and-quality 56 | 57 | 58 | - name: BuildPackage 59 | run: | 60 | echo "Build azslurm package" 61 | python3 azure-slurm/package.py 62 | 63 | 64 | - name: Perform CodeQL Analysis 65 | uses: github/codeql-action/analyze@v3 66 | with: 67 | category: "/language:${{matrix.language}}" 68 | 69 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ## Ignore Visual Studio temporary files, build results, and 2 | ## files generated by popular Visual Studio add-ons. 
3 | ## 4 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore 5 | 6 | # User-specific files 7 | *.suo 8 | *.user 9 | *.userosscache 10 | *.sln.docstates 11 | 12 | # User-specific files (MonoDevelop/Xamarin Studio) 13 | *.userprefs 14 | *.eggs* 15 | # Build results 16 | [Dd]ebug/ 17 | [Dd]ebugPublic/ 18 | [Rr]elease/ 19 | [Rr]eleases/ 20 | x64/ 21 | x86/ 22 | bld/ 23 | build/ 24 | [Bb]in/ 25 | [Oo]bj/ 26 | [Ll]og/ 27 | 28 | # Visual Studio 2015/2017 cache/options directory 29 | .vs/ 30 | # Uncomment if you have tasks that create the project's static files in wwwroot 31 | #wwwroot/ 32 | 33 | # Visual Studio 2017 auto generated files 34 | Generated\ Files/ 35 | 36 | # MSTest test Results 37 | [Tt]est[Rr]esult*/ 38 | [Bb]uild[Ll]og.* 39 | 40 | # NUNIT 41 | *.VisualState.xml 42 | TestResult.xml 43 | 44 | # Build Results of an ATL Project 45 | [Dd]ebugPS/ 46 | [Rr]eleasePS/ 47 | dlldata.c 48 | 49 | # Benchmark Results 50 | BenchmarkDotNet.Artifacts/ 51 | 52 | # .NET Core 53 | project.lock.json 54 | project.fragment.lock.json 55 | artifacts/ 56 | **/Properties/launchSettings.json 57 | 58 | # StyleCop 59 | StyleCopReport.xml 60 | 61 | # Files built by Visual Studio 62 | *_i.c 63 | *_p.c 64 | *_i.h 65 | *.ilk 66 | *.meta 67 | *.obj 68 | *.iobj 69 | *.pch 70 | *.pdb 71 | *.ipdb 72 | *.pgc 73 | *.pgd 74 | *.rsp 75 | *.sbr 76 | *.tlb 77 | *.tli 78 | *.tlh 79 | *.tmp 80 | *.tmp_proj 81 | *.log 82 | *.vspscc 83 | *.vssscc 84 | .builds 85 | *.pidb 86 | *.svclog 87 | *.scc 88 | 89 | # Chutzpah Test files 90 | _Chutzpah* 91 | 92 | # Visual C++ cache files 93 | ipch/ 94 | *.aps 95 | *.ncb 96 | *.opendb 97 | *.opensdf 98 | *.sdf 99 | *.cachefile 100 | *.VC.db 101 | *.VC.VC.opendb 102 | 103 | # Visual Studio profiler 104 | *.psess 105 | *.vsp 106 | *.vspx 107 | *.sap 108 | 109 | # Visual Studio Trace Files 110 | *.e2e 111 | 112 | # TFS 2012 Local Workspace 113 | $tf/ 114 | 115 | # Guidance Automation Toolkit 116 | *.gpState 117 | 118 | # ReSharper is a .NET coding add-in 119 | _ReSharper*/ 120 | *.[Rr]e[Ss]harper 121 | *.DotSettings.user 122 | 123 | # JustCode is a .NET coding add-in 124 | .JustCode 125 | 126 | # TeamCity is a build add-in 127 | _TeamCity* 128 | 129 | # DotCover is a Code Coverage Tool 130 | *.dotCover 131 | 132 | # AxoCover is a Code Coverage Tool 133 | .axoCover/* 134 | !.axoCover/settings.json 135 | 136 | # Visual Studio code coverage results 137 | *.coverage 138 | *.coveragexml 139 | 140 | # NCrunch 141 | _NCrunch_* 142 | .*crunch*.local.xml 143 | nCrunchTemp_* 144 | 145 | # MightyMoose 146 | *.mm.* 147 | AutoTest.Net/ 148 | 149 | # Web workbench (sass) 150 | .sass-cache/ 151 | 152 | # Installshield output folder 153 | [Ee]xpress/ 154 | 155 | # DocProject is a documentation generator add-in 156 | DocProject/buildhelp/ 157 | DocProject/Help/*.HxT 158 | DocProject/Help/*.HxC 159 | DocProject/Help/*.hhc 160 | DocProject/Help/*.hhk 161 | DocProject/Help/*.hhp 162 | DocProject/Help/Html2 163 | DocProject/Help/html 164 | 165 | # Click-Once directory 166 | publish/ 167 | 168 | # Publish Web Output 169 | *.[Pp]ublish.xml 170 | *.azurePubxml 171 | # Note: Comment the next line if you want to checkin your web deploy settings, 172 | # but database connection strings (with potential passwords) will be unencrypted 173 | *.pubxml 174 | *.publishproj 175 | 176 | # Microsoft Azure Web App publish settings. 
Comment the next line if you want to 177 | # checkin your Azure Web App publish settings, but sensitive information contained 178 | # in these scripts will be unencrypted 179 | PublishScripts/ 180 | 181 | # NuGet Packages 182 | *.nupkg 183 | # The packages folder can be ignored because of Package Restore 184 | **/[Pp]ackages/* 185 | # except build/, which is used as an MSBuild target. 186 | !**/[Pp]ackages/build/ 187 | # Uncomment if necessary however generally it will be regenerated when needed 188 | #!**/[Pp]ackages/repositories.config 189 | # NuGet v3's project.json files produces more ignorable files 190 | *.nuget.props 191 | *.nuget.targets 192 | 193 | # Microsoft Azure Build Output 194 | csx/ 195 | *.build.csdef 196 | 197 | # Microsoft Azure Emulator 198 | ecf/ 199 | rcf/ 200 | 201 | # Windows Store app package directories and files 202 | AppPackages/ 203 | BundleArtifacts/ 204 | Package.StoreAssociation.xml 205 | _pkginfo.txt 206 | *.appx 207 | 208 | # Visual Studio cache files 209 | # files ending in .cache can be ignored 210 | *.[Cc]ache 211 | # but keep track of directories ending in .cache 212 | !*.[Cc]ache/ 213 | 214 | # Others 215 | ClientBin/ 216 | ~$* 217 | *~ 218 | *.dbmdl 219 | *.dbproj.schemaview 220 | *.jfm 221 | *.pfx 222 | *.publishsettings 223 | orleans.codegen.cs 224 | 225 | # Including strong name files can present a security risk 226 | # (https://github.com/github/gitignore/pull/2483#issue-259490424) 227 | #*.snk 228 | 229 | # Since there are multiple workflows, uncomment next line to ignore bower_components 230 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 231 | #bower_components/ 232 | 233 | # RIA/Silverlight projects 234 | Generated_Code/ 235 | 236 | # Backup & report files from converting an old project file 237 | # to a newer Visual Studio version. Backup files are not needed, 238 | # because we have git ;-) 239 | _UpgradeReport_Files/ 240 | Backup*/ 241 | UpgradeLog*.XML 242 | UpgradeLog*.htm 243 | ServiceFabricBackup/ 244 | *.rptproj.bak 245 | 246 | # SQL Server files 247 | *.mdf 248 | *.ldf 249 | *.ndf 250 | 251 | # Business Intelligence projects 252 | *.rdl.data 253 | *.bim.layout 254 | *.bim_*.settings 255 | *.rptproj.rsuser 256 | 257 | # Microsoft Fakes 258 | FakesAssemblies/ 259 | 260 | # GhostDoc plugin setting file 261 | *.GhostDoc.xml 262 | 263 | # Node.js Tools for Visual Studio 264 | .ntvs_analysis.dat 265 | node_modules/ 266 | 267 | # Visual Studio 6 build log 268 | *.plg 269 | 270 | # Visual Studio 6 workspace options file 271 | *.opt 272 | 273 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 
274 | *.vbw 275 | 276 | # Visual Studio LightSwitch build output 277 | **/*.HTMLClient/GeneratedArtifacts 278 | **/*.DesktopClient/GeneratedArtifacts 279 | **/*.DesktopClient/ModelManifest.xml 280 | **/*.Server/GeneratedArtifacts 281 | **/*.Server/ModelManifest.xml 282 | _Pvt_Extensions 283 | 284 | # Paket dependency manager 285 | .paket/paket.exe 286 | paket-files/ 287 | 288 | # FAKE - F# Make 289 | .fake/ 290 | 291 | # JetBrains Rider 292 | .idea/ 293 | *.sln.iml 294 | 295 | # CodeRush 296 | .cr/ 297 | 298 | # Python Tools for Visual Studio (PTVS) 299 | __pycache__/ 300 | *.pyc 301 | 302 | # Cake - Uncomment if you are using it 303 | # tools/** 304 | # !tools/packages.config 305 | 306 | # Tabs Studio 307 | *.tss 308 | 309 | # Telerik's JustMock configuration file 310 | *.jmconfig 311 | 312 | # BizTalk build output 313 | *.btp.cs 314 | *.btm.cs 315 | *.odx.cs 316 | *.xsd.cs 317 | 318 | # OpenCover UI analysis results 319 | OpenCover/ 320 | 321 | # Azure Stream Analytics local run output 322 | ASALocalRun/ 323 | 324 | # MSBuild Binary and Structured Log 325 | *.binlog 326 | 327 | # NVidia Nsight GPU debugger configuration file 328 | *.nvuser 329 | 330 | # MFractors (Xamarin productivity tool) working folder 331 | .mfractor/ 332 | 333 | # rpmbuild output 334 | *.rpm 335 | .venv/ 336 | .env 337 | slurm/install/slurm-pkg* 338 | blobs 339 | **/libs 340 | **/*.mypy_cache/ 341 | **/dist 342 | **/*egg-info 343 | **/creds.json 344 | azure-slurm-install/dist 345 | azure-slurm-install/docker/ 346 | .DS_Store 347 | .build.log 348 | azure-slurm-install/AzureCA.pem 349 | -------------------------------------------------------------------------------- /DEVEL.md: -------------------------------------------------------------------------------- 1 | # Quick Start 2 | 3 | ``` 4 | CODE_DIR=~/code # or wherever you wish to develop this 5 | cd $CODE_DIR 6 | git clone https://github.com/Azure/cyclecloud-scalelib.git 7 | # cd cyclecloud-scalelib 8 | # git checkout specific-branch 9 | 10 | cd $CODE_DIR 11 | git clone https://github.com/Azure/cyclecloud-slurm.git 12 | cd cyclecloud-slurm 13 | 14 | ./docker-package.sh ../cyclecloud-scalelib 15 | ``` 16 | 17 | ## New Slurm Versions 18 | 1. Add a record to slurm/install/slurm_supported_version.py:SUPPORTED_VERSIONS 19 | Currently it looks like 20 | ```python 21 | SUPPORTED_VERSIONS = { 22 | "22.05.8": { 23 | "rhel": [{"platform_version": "el8", "arch": "x86_64"}], 24 | "debian": [{"arch": "amd64"}], 25 | }, 26 | "23.02.0": { 27 | "rhel": [{"platform_version": "el8", "arch": "x86_64"}], 28 | "debian": [{"arch": "amd64"}], 29 | } 30 | } 31 | ``` 32 | 2. Build the RPMs and DEBs 33 | ```bash 34 | # this should be all you need, but new versions may require 35 | # updates. See the scripts below for more information, as they are what is run inside the 36 | # container. 37 | # ./specs/default/cluster-init/files/01-build-debs.sh 38 | # ./specs/default/cluster-init/files/00-build-slurm.sh 39 | ./util/docker-rpmbuild.sh 40 | ``` 41 | 42 | 3. Create a new -bins release 43 | Currently we have a release called 2023-03-13-bins in GitHub. 44 | 45 | See `https://github.com/Azure/cyclecloud-slurm/releases/tag/2023-03-13-bins` 46 | 47 | Simply create a new release and upload all of the files in slurm/install/slurm-pkgs/. 48 | 49 | 4. Update slurm/install/slurm_supported_version.py:CURRENT_DOWNLOAD_URL 50 | Point this variable at the latest slurm bins release. 51 | 52 | 5. Rerun docker-package.sh 53 | When you run docker-package.sh, even on a new repo, the files should now be downloaded. 
54 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. All rights reserved. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 
22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd). 40 | 41 | 42 | -------------------------------------------------------------------------------- /azure-slurm-install/capture_logs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Generic log and configuration capture script for slurm project. Can be used on Scheduler/login/Execute nodes. 3 | set -euo pipefail 4 | 5 | LOG_LOCATIONS=( 6 | "/etc/slurm" 7 | "/opt/azurehpc/slurm/logs" 8 | "/opt/cycle/jetpack/logs" 9 | "/opt/healthagent/healthagent.log" 10 | "/opt/healthagent/healthagent_install.log" 11 | "/var/log/slurmctld/" 12 | "/var/log/slurmd" 13 | "/var/log/syslog" 14 | "/var/log/waagent.log" 15 | "/var/log/cloud-init.log" 16 | "/var/log/azure-slurm-install.log" 17 | ) 18 | HOSTNAME=$(hostname) 19 | TIMESTAMP=$(date +"%Y%m%d_%H%M%S") 20 | ARCHIVE_NAME="${HOSTNAME}-${TIMESTAMP}.log.tar.gz" 21 | ARCHIVE_BASENAME="${HOSTNAME}-${TIMESTAMP}.log" 22 | ARCHIVE_PATH="$(pwd)/$ARCHIVE_NAME" 23 | OUTPUT_DIR="/tmp/logbundle/$ARCHIVE_BASENAME" 24 | 25 | echo "Creating output directory..." 26 | rm -rf "$OUTPUT_DIR" 27 | mkdir -p "$OUTPUT_DIR" 28 | 29 | # === Copy Logs === 30 | echo "Copying log files..." 31 | for path in "${LOG_LOCATIONS[@]}"; do 32 | if [ -e "$path" ]; then 33 | echo " - Copying $path" 34 | dest="$OUTPUT_DIR${path%/*}" 35 | mkdir -p "$dest" 36 | cp -r "$path" "$dest/" 37 | else 38 | echo " - Skipping missing path: $path" 39 | fi 40 | done 41 | 42 | echo "Creating tar archive..." 
43 | tar -czf "$ARCHIVE_PATH" -C "/tmp/logbundle" "$ARCHIVE_BASENAME" 44 | 45 | echo "Log archive created at: $ARCHIVE_PATH" 46 | rm -rf $OUTPUT_DIR -------------------------------------------------------------------------------- /azure-slurm-install/conf/install_logging.conf: -------------------------------------------------------------------------------- 1 | [loggers] 2 | keys=root 3 | 4 | [handlers] 5 | keys=consoleHandler, fileHandler 6 | 7 | [formatters] 8 | keys=simpleFormatter 9 | 10 | [logger_root] 11 | level=DEBUG 12 | handlers=consoleHandler, fileHandler 13 | 14 | [handler_fileHandler] 15 | class=logging.handlers.RotatingFileHandler 16 | level=DEBUG 17 | formatter=simpleFormatter 18 | args=("/var/log/azure-slurm-install.log", "a", 1024 * 1024 * 5, 5) 19 | 20 | [handler_consoleHandler] 21 | class=StreamHandler 22 | level=ERROR 23 | formatter=simpleFormatter 24 | args=(sys.stderr,) 25 | 26 | [formatter_simpleFormatter] 27 | format=%(asctime)s %(levelname)s: %(message)s 28 | 29 | [formatter_reproFormatter] 30 | format=%(message)s 31 | -------------------------------------------------------------------------------- /azure-slurm-install/imex_epilog.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | run_epilog(){ 4 | if ! systemctl list-units --full --all | grep -Fq "nvidia-imex.service"; then 5 | exit 0 6 | fi 7 | # Clean the config file in case the service gets started by accident 8 | # clean up connection 9 | > /etc/nvidia-imex/nodes_config.cfg 10 | NVIDIA_IMEX_STOP_TIMEOUT=15 11 | set +e 12 | timeout $NVIDIA_IMEX_STOP_TIMEOUT systemctl stop nvidia-imex 13 | pkill -9 nvidia-imex 14 | set -e 15 | } 16 | # Get VM size from Jetpack 17 | mkdir -p /var/log/slurm 18 | { 19 | set -x 20 | set +e 21 | VM_SIZE=$(/opt/cycle/jetpack/bin/jetpack config azure.metadata.compute.vmSize) 22 | IMEX_ENABLED=$(/opt/cycle/jetpack/bin/jetpack config slurm.imex.enabled null) 23 | 24 | # Main logic 25 | set -e 26 | if [[ "$VM_SIZE" == *"GB200"* || "$VM_SIZE" == *"GB300"* ]]; then 27 | if [[ "$IMEX_ENABLED" == "False" ]]; then 28 | exit 0 # No-op 29 | else 30 | run_epilog # Run epilog for GB200/GB300 by default 31 | fi 32 | elif [[ "$IMEX_ENABLED" == "True" ]]; then 33 | run_epilog # Run epilog for non-GB200/GB300 VM if explicitly enabled 34 | else 35 | exit 0 # No-op 36 | fi 37 | } > "/var/log/slurm/imex_epilog_$SLURM_JOB_ID.log" 2>&1 -------------------------------------------------------------------------------- /azure-slurm-install/imex_prolog.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Prolog script for NVIDIA IMEX 3 | run_prolog() { 4 | 5 | if ! systemctl list-units --full --all | grep -Fq "nvidia-imex.service"; then 6 | exit 0 7 | fi 8 | 9 | echo "SLURM_NODELIST: $SLURM_NODELIST" 10 | 11 | set -ex 12 | NODES=$SLURM_NODELIST 13 | echo $NODES 14 | NVIDIA_IMEX_START_TIMEOUT=60 15 | IMEX_CONN_WAIT_TIMEOUT=70 16 | NVIDIA_IMEX_STOP_TIMEOUT=15 17 | # clean up prev connection 18 | > /etc/nvidia-imex/nodes_config.cfg 19 | set +e 20 | timeout $NVIDIA_IMEX_STOP_TIMEOUT systemctl stop nvidia-imex 21 | pkill -9 nvidia-imex 22 | set -e 23 | 24 | # update peer list 25 | scontrol show hostnames "$SLURM_NODELIST" | while read host; do 26 | getent ahosts "$host" | awk '{ print $1 }' | head -n1 27 | done > /etc/nvidia-imex/nodes_config.cfg 28 | 29 | # 08/08: undo the rotate server port change. This means the server ports will be static and by default that should be 30 | # port 50000. 
This contradicts nvidia's documentation but BFL reportedly were hitting conflicts, and nvidia suggests using static ports to avoid 31 | # conflicts. We could comment out the file but this will ensure running nodes always use the same port. 32 | # rotate server port to prevent race condition 33 | #NEW_SERVER_PORT=$((${SLURM_JOB_ID}% 16384 + 33792)) 34 | NEW_SERVER_PORT=50000 35 | sed -i "s/SERVER_PORT.*/SERVER_PORT=${NEW_SERVER_PORT}/" /etc/nvidia-imex/config.cfg 36 | 37 | # enable imex-ctl on all nodes so you can query imex status with: nvidia-imex-ctl -a -q 38 | sed -i "s/IMEX_CMD_PORT.*/IMEX_CMD_PORT=50005/" /etc/nvidia-imex/config.cfg 39 | sed -i "s/IMEX_CMD_ENABLED.*/IMEX_CMD_ENABLED=1/" /etc/nvidia-imex/config.cfg 40 | 41 | # set timeouts for start 42 | sed -i "s/IMEX_CONN_WAIT_TIMEOUT.*/IMEX_CONN_WAIT_TIMEOUT=${IMEX_CONN_WAIT_TIMEOUT}/" /etc/nvidia-imex/config.cfg 43 | timeout $NVIDIA_IMEX_START_TIMEOUT systemctl start nvidia-imex 44 | } 45 | # Get VM size from Jetpack 46 | mkdir -p /var/log/slurm 47 | { 48 | set -x 49 | set +e 50 | VM_SIZE=$(/opt/cycle/jetpack/bin/jetpack config azure.metadata.compute.vmSize) 51 | IMEX_ENABLED=$(/opt/cycle/jetpack/bin/jetpack config slurm.imex.enabled null) 52 | echo "VM_SIZE: $VM_SIZE" 53 | echo "IMEX_ENABLED: $IMEX_ENABLED" 54 | # Main logic 55 | set -e 56 | if [[ "$VM_SIZE" == *"GB200"* || "$VM_SIZE" == *"GB300"* ]]; then 57 | if [[ "$IMEX_ENABLED" == "False" ]]; then 58 | exit 0 # No-op 59 | else 60 | run_prolog # Run prolog for GB200/GB300 series by default 61 | fi 62 | elif [[ "$IMEX_ENABLED" == "True" ]]; then 63 | run_prolog # Run prolog for non-GB200/GB300 VM if explicitly enabled 64 | else 65 | exit 0 # No-op 66 | fi 67 | } > "/var/log/slurm/imex_prolog_$SLURM_JOB_ID.log" 2>&1 68 | -------------------------------------------------------------------------------- /azure-slurm-install/install.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | # 5 | set -e 6 | cd $(dirname $0) 7 | 8 | if [ $(whoami) != root ]; then 9 | echo "Please run as root" 10 | exit 1 11 | fi 12 | 13 | if [ -e /etc/centos-release ]; then 14 | python3 install.py --platform rhel $@ 15 | else 16 | python3 install.py --platform ubuntu $@ 17 | fi 18 | -------------------------------------------------------------------------------- /azure-slurm-install/package.py: -------------------------------------------------------------------------------- 1 | import configparser 2 | import os 3 | import subprocess 4 | import sys 5 | import tarfile 6 | import requests 7 | from typing import Optional 8 | 9 | def execute() -> None: 10 | ENROOT_VERSION = "4.0.1" 11 | PYXIS_VERSION = "0.21.0" 12 | 13 | expected_cwd = os.path.abspath(os.path.dirname(__file__)) 14 | os.chdir(expected_cwd) 15 | 16 | if not os.path.exists("libs"): 17 | os.makedirs("libs") 18 | 19 | parser = configparser.ConfigParser() 20 | ini_path = os.path.abspath("../project.ini") 21 | 22 | with open(ini_path) as fr: 23 | parser.read_file(fr) 24 | 25 | version = parser.get("project", "version") 26 | 27 | if not version: 28 | raise RuntimeError("Missing [project] -> version in {}".format(ini_path)) 29 | 30 | if not os.path.exists("dist"): 31 | os.makedirs("dist") 32 | 33 | tf = tarfile.TarFile.gzopen( 34 | f"dist/azure-slurm-install-pkg-{version}.tar.gz", "w" 35 | ) 36 | 37 | def _download(url: str, dest: str) -> None: 38 | try: 39 | response = requests.get(url, stream=True, timeout=60) 40 | response.raise_for_status() 41 | with open(dest, "wb") as f: 42 | for chunk in response.iter_content(chunk_size=8192): 43 | f.write(chunk) 44 | except requests.RequestException as e: 45 | print(f"Error downloading {url}: {e}") 46 | 47 | artifacts_dir = "artifacts" 48 | os.makedirs(artifacts_dir, exist_ok=True) 49 | 50 | def _add(name: str, path: Optional[str] = None, mode: Optional[int] = None) -> None: 51 | path = path or name 52 | tarinfo = tarfile.TarInfo(f"azure-slurm-install/{name}") 53 | tarinfo.size = os.path.getsize(path) 54 | tarinfo.mtime = int(os.path.getmtime(path)) 55 | if mode: 56 | tarinfo.mode = mode 57 | 58 | with open(path, "rb") as fr: 59 | tf.addfile(tarinfo, fr) 60 | 61 | #Download EPEL 62 | for ver in ["8", "9"]: 63 | url = f"https://dl.fedoraproject.org/pub/epel/epel-release-latest-{ver}.noarch.rpm" 64 | dest = os.path.join(artifacts_dir, f"epel-release-latest-{ver}.noarch.rpm") 65 | _download(url, dest) 66 | _add(dest, dest) 67 | 68 | # Download Pyxis and Enroot packages 69 | for arch in ["x86_64", "aarch64"]: 70 | enroot_check_url = f"https://github.com/NVIDIA/enroot/releases/download/v{ENROOT_VERSION}/enroot-check_{ENROOT_VERSION}_{arch}.run" 71 | enroot_check_dest = os.path.join(artifacts_dir, f"enroot-check_{ENROOT_VERSION}_{arch}.run") 72 | _download(enroot_check_url, enroot_check_dest) 73 | _add(enroot_check_dest, enroot_check_dest) 74 | enroot_rpm_url = f"https://github.com/NVIDIA/enroot/releases/download/v{ENROOT_VERSION}/enroot-{ENROOT_VERSION}-1.el8.{arch}.rpm" 75 | enroot_rpm_dest = os.path.join(artifacts_dir, f"enroot-{ENROOT_VERSION}-1.el8.{arch}.rpm") 76 | _download(enroot_rpm_url, enroot_rpm_dest) 77 | _add(enroot_rpm_dest, enroot_rpm_dest) 78 | enroot_caps_url = f"https://github.com/NVIDIA/enroot/releases/download/v{ENROOT_VERSION}/enroot+caps-{ENROOT_VERSION}-1.el8.{arch}.rpm" 79 | enroot_caps_dest = os.path.join(artifacts_dir, f"enroot+caps-{ENROOT_VERSION}-1.el8.{arch}.rpm") 80 | _download(enroot_caps_url, enroot_caps_dest) 81 | _add(enroot_caps_dest, enroot_caps_dest) 82 | 83 | 
for arch in ["amd64", "arm64"]: 84 | enroot_deb_url = f"https://github.com/NVIDIA/enroot/releases/download/v{ENROOT_VERSION}/enroot_{ENROOT_VERSION}-1_{arch}.deb" 85 | enroot_deb_dest = os.path.join(artifacts_dir, f"enroot_{ENROOT_VERSION}-1_{arch}.deb") 86 | _download(enroot_deb_url, enroot_deb_dest) 87 | _add(enroot_deb_dest, enroot_deb_dest) 88 | enroot_caps_url = f"https://github.com/NVIDIA/enroot/releases/download/v{ENROOT_VERSION}/enroot+caps_{ENROOT_VERSION}-1_{arch}.deb" 89 | enroot_caps_dest = os.path.join(artifacts_dir, f"enroot+caps_{ENROOT_VERSION}-1_{arch}.deb") 90 | _download(enroot_caps_url, enroot_caps_dest) 91 | _add(enroot_caps_dest, enroot_caps_dest) 92 | 93 | pyxis_url = f"https://github.com/NVIDIA/pyxis/archive/refs/tags/v{PYXIS_VERSION}.tar.gz" 94 | pyxis_dest = os.path.join(artifacts_dir, f"pyxis-{PYXIS_VERSION}.tar.gz") 95 | _download(pyxis_url, pyxis_dest) 96 | _add(pyxis_dest, pyxis_dest) 97 | 98 | _add("install.sh", "install.sh", mode=os.stat("install.sh")[0]) 99 | _add("install_logging.conf", "conf/install_logging.conf") 100 | _add("installlib.py", "installlib.py") 101 | _add("install.py", "install.py") 102 | _add("slurmel8insiders.repo", "slurmel8insiders.repo") 103 | _add("slurmel9insiders.repo", "slurmel9insiders.repo") 104 | _add("slurmel8.repo", "slurmel8.repo") 105 | _add("slurmel9.repo", "slurmel9.repo") 106 | _add("ubuntu.sh", "ubuntu.sh", 600) 107 | _add("rhel.sh", "rhel.sh", 600) 108 | _add("imex_prolog.sh", "imex_prolog.sh", 600) 109 | _add("imex_epilog.sh", "imex_epilog.sh", 600) 110 | _add("AzureCA.pem", "AzureCA.pem") 111 | _add("suse.sh", "suse.sh", 600) 112 | _add("start-services.sh", "start-services.sh", 555) 113 | _add("capture_logs.sh", "capture_logs.sh", 755) 114 | 115 | for fil in os.listdir("templates"): 116 | if os.path.isfile(f"templates/{fil}"): 117 | _add(f"templates/{fil}", f"templates/{fil}") 118 | 119 | if __name__ == "__main__": 120 | execute() 121 | -------------------------------------------------------------------------------- /azure-slurm-install/package.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | cd $(dirname $0) 5 | python3.11 package.py $@ -------------------------------------------------------------------------------- /azure-slurm-install/rhel.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # 5 | set -e 6 | INSTALLED_FILE=/etc/azslurm-bins.installed 7 | SLURM_ROLE=$1 8 | SLURM_VERSION=$2 9 | OS_VERSION=$(cat /etc/os-release | grep VERSION_ID | cut -d= -f2 | cut -d\" -f2 | cut -d. -f1) 10 | OS_ID=$(cat /etc/os-release | grep ^ID= | cut -d= -f2 | cut -d\" -f2 | cut -d. -f1) 11 | ENROOT_VERSION="4.0.1" 12 | PYXIS_VERSION="0.21.0" 13 | PYXIS_DIR="/opt/pyxis" 14 | SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 15 | ARTIFACTS_DIR="$SCRIPT_DIR/artifacts" 16 | 17 | if [ "$OS_VERSION" -lt "8" ]; then 18 | echo "RHEL versions < 8 no longer supported" 19 | exit 1 20 | fi 21 | 22 | # Check if artifacts directory exists 23 | if [ ! -d "$ARTIFACTS_DIR" ]; then 24 | echo "Error: Artifacts directory not found: $ARTIFACTS_DIR" 25 | exit 1 26 | fi 27 | 28 | #Almalinux 8/9 and RHEL 8/9 both need epel-release to install libjwt for slurm packages 29 | enable_epel() { 30 | if ! 
rpm -qa | grep -q "^epel-release-"; then 31 | if [ "${OS_ID,,}" == "rhel" ]; then 32 | yum -y install $ARTIFACTS_DIR/epel-release-latest-${OS_VERSION}.noarch.rpm 33 | else 34 | yum -y install epel-release 35 | fi 36 | fi 37 | if [ "${OS_ID}" == "almalinux" ]; then 38 | if [ "$OS_VERSION" == "8" ]; then 39 | # Enable powertools repo for AlmaLinux 8 (needed for perl-Switch package) 40 | yum config-manager --set-enabled powertools 41 | else 42 | # Enable crb repo for AlmaLinux 9 (needed for perl-Switch package) 43 | yum config-manager --set-enabled crb 44 | fi 45 | fi 46 | 47 | } 48 | 49 | rpm_pkg_install() { 50 | local packages_to_install="" 51 | local pkg_names=$1 52 | local extra_flags=$2 53 | for pkg_name in $pkg_names; do 54 | base_pkg=$pkg_name 55 | if [[ "$pkg_name" == *.rpm ]]; then 56 | # Extract package name from .rpm filename 57 | base_pkg=$(basename "$pkg_name" | sed 's/-[0-9]*\.el.*$//') 58 | fi 59 | if ! rpm -qa | grep -q "^${base_pkg}-"; then 60 | packages_to_install="$packages_to_install $pkg_name" 61 | fi 62 | done 63 | if [ -n "$packages_to_install" ]; then 64 | echo "The following packages need to be installed: $packages_to_install" 65 | # Install all packages in one yum command 66 | yum install -y $packages_to_install $extra_flags 67 | echo "Successfully installed all required packages" 68 | else 69 | echo "All required packages are already installed" 70 | fi 71 | } 72 | 73 | dependency_packages="perl-Switch munge jq jansson-devel libjwt-devel binutils make wget gcc" 74 | slurm_packages="slurm slurm-libpmi slurm-devel slurm-pam_slurm slurm-perlapi slurm-torque slurm-openlava slurm-example-configs slurm-contribs" 75 | sched_packages="slurm-slurmctld slurm-slurmdbd slurm-slurmrestd" 76 | execute_packages="slurm-slurmd" 77 | 78 | INSIDERS=$(/opt/cycle/jetpack/bin/jetpack config slurm.insiders False) 79 | 80 | if [[ "$OS_VERSION" == "9" ]]; then 81 | if [[ "$INSIDERS" == "True" ]]; then 82 | cp slurmel9insiders.repo /etc/yum.repos.d/slurm.repo 83 | else 84 | cp slurmel9.repo /etc/yum.repos.d/slurm.repo 85 | fi 86 | elif [[ "$OS_VERSION" == "8" ]]; then 87 | if [[ "$INSIDERS" == "True" ]]; then 88 | cp slurmel8insiders.repo /etc/yum.repos.d/slurm.repo 89 | else 90 | cp slurmel8.repo /etc/yum.repos.d/slurm.repo 91 | fi 92 | else 93 | echo "Unsupported OS version: $OS_VERSION" 94 | exit 1 95 | fi 96 | 97 | # Collect all SLURM packages based on role 98 | all_slurm_packages="$slurm_packages" 99 | 100 | if [ "${SLURM_ROLE}" == "scheduler" ]; then 101 | all_slurm_packages="$all_slurm_packages $sched_packages" 102 | fi 103 | 104 | if [ "${SLURM_ROLE}" == "execute" ]; then 105 | all_slurm_packages="$all_slurm_packages $execute_packages" 106 | fi 107 | 108 | ## This package is pre-installed in all hpc images used by cyclecloud, but if customer wants to 109 | ## build an image from generic marketplace images then this package sets up the right gpg keys for PMC. 110 | if [ ! 
-e /etc/yum.repos.d/microsoft-prod.repo ]; then 111 | curl -sSL -O https://packages.microsoft.com/config/rhel/$OS_VERSION/packages-microsoft-prod.rpm 112 | rpm -i packages-microsoft-prod.rpm 113 | rm packages-microsoft-prod.rpm 114 | fi 115 | 116 | versioned_slurm_packages="" 117 | #add version suffix to all slurm packages 118 | for pkg in $all_slurm_packages; do 119 | versioned_slurm_packages="$versioned_slurm_packages ${pkg}-${SLURM_VERSION}*" 120 | done 121 | 122 | enable_epel 123 | rpm_pkg_install "$dependency_packages" 124 | rpm_pkg_install "$versioned_slurm_packages" "--disableexcludes slurm" 125 | 126 | # Install slurm_exporter container (will refactor this later) 127 | monitoring_enabled=$(/opt/cycle/jetpack/bin/jetpack config cyclecloud.monitoring.enabled False) 128 | if [ "${SLURM_ROLE}" == "scheduler" ] && [ "$monitoring_enabled" == "True" ]; then 129 | SLURM_EXPORTER_IMAGE_NAME="ghcr.io/slinkyproject/slurm-exporter:0.3.0" 130 | docker pull $SLURM_EXPORTER_IMAGE_NAME 131 | fi 132 | 133 | # Install enroot package 134 | if [[ "$OS_VERSION" == "8" ]]; then 135 | rpm -e --nodeps enroot enroot+caps 2>/dev/null || true 136 | arch=$(uname -m) 137 | run_file=${ARTIFACTS_DIR}/enroot-check_${ENROOT_VERSION}_$(uname -m).run 138 | chmod 755 $run_file 139 | $run_file --verify 140 | rpm_pkg_install "${ARTIFACTS_DIR}/enroot-${ENROOT_VERSION}-1.el8.${arch}.rpm ${ARTIFACTS_DIR}/enroot+caps-${ENROOT_VERSION}-1.el8.${arch}.rpm" 141 | fi 142 | 143 | # Install pyxis 144 | if [[ ! -f $PYXIS_DIR/spank_pyxis.so ]]; then 145 | tar -xzf ${ARTIFACTS_DIR}/pyxis-${PYXIS_VERSION}.tar.gz 146 | cd pyxis-${PYXIS_VERSION} 147 | make 148 | mkdir -p $PYXIS_DIR 149 | cp -fv spank_pyxis.so $PYXIS_DIR 150 | chmod +x $PYXIS_DIR/spank_pyxis.so 151 | fi 152 | 153 | touch $INSTALLED_FILE 154 | exit -------------------------------------------------------------------------------- /azure-slurm-install/setup.py: -------------------------------------------------------------------------------- 1 | # test: ignore 2 | import os 3 | from subprocess import check_call 4 | from typing import List 5 | 6 | from setuptools import find_packages, setup 7 | from setuptools.command.test import Command 8 | from setuptools.command.test import test as TestCommand # noqa: N812 9 | 10 | __version__ = "4.0.3" 11 | CWD = os.path.dirname(os.path.abspath(__file__)) 12 | 13 | 14 | class PyTest(TestCommand): 15 | def finalize_options(self) -> None: 16 | TestCommand.finalize_options(self) 17 | import os 18 | 19 | xml_out = os.path.join(".", "build", "test-results", "pytest.xml") 20 | if not os.path.exists(os.path.dirname(xml_out)): 21 | os.makedirs(os.path.dirname(xml_out)) 22 | # -s is needed so py.test doesn't mess with stdin/stdout 23 | self.test_args = ["-s", "test", "--junitxml=%s" % xml_out] 24 | # needed for older setuptools to actually run this as a test 25 | self.test_suite = True 26 | 27 | def run_tests(self) -> None: 28 | # import here, cause outside the eggs aren't loaded 29 | import sys 30 | import pytest 31 | 32 | # run the tests, then the format checks. 
33 | errno = pytest.main(self.test_args) 34 | if errno != 0: 35 | sys.exit(errno) 36 | 37 | check_call( 38 | ["black", "--check", "src", "test"], 39 | cwd=CWD, 40 | ) 41 | check_call( 42 | ["isort", "-c"], 43 | cwd=os.path.join(CWD, "src"), 44 | ) 45 | check_call( 46 | ["isort", "-c"], 47 | cwd=os.path.join(CWD, "test"), 48 | ) 49 | 50 | run_type_checking() 51 | 52 | sys.exit(errno) 53 | 54 | 55 | class Formatter(Command): 56 | user_options: List[str] = [] 57 | 58 | def initialize_options(self) -> None: 59 | pass 60 | 61 | def finalize_options(self) -> None: 62 | pass 63 | 64 | def run(self) -> None: 65 | check_call( 66 | ["black", "src", "test"], 67 | cwd=CWD, 68 | ) 69 | check_call( 70 | ["isort", "-y"], 71 | cwd=os.path.join(CWD, "src"), 72 | ) 73 | check_call( 74 | ["isort", "-y"], 75 | cwd=os.path.join(CWD, "test"), 76 | ) 77 | run_type_checking() 78 | 79 | 80 | def run_type_checking() -> None: 81 | check_call( 82 | [ 83 | "mypy", 84 | "--ignore-missing-imports", 85 | "--follow-imports=silent", 86 | "--show-column-numbers", 87 | "--disallow-untyped-defs", 88 | os.path.join(CWD, "test"), 89 | ] 90 | ) 91 | check_call( 92 | [ 93 | "mypy", 94 | "--ignore-missing-imports", 95 | "--follow-imports=silent", 96 | "--show-column-numbers", 97 | "--disallow-untyped-defs", 98 | os.path.join(CWD, "src"), 99 | ] 100 | ) 101 | 102 | check_call( 103 | ["flake8", "--ignore=E203,E231,F405,E501,W503", "src", "test", "setup.py"] 104 | ) 105 | 106 | 107 | class TypeChecking(Command): 108 | user_options: List[str] = [] 109 | 110 | def initialize_options(self) -> None: 111 | pass 112 | 113 | def finalize_options(self) -> None: 114 | pass 115 | 116 | def run(self) -> None: 117 | run_type_checking() 118 | 119 | 120 | setup( 121 | name="azure-slurm-install", 122 | version=__version__, 123 | packages=find_packages(where="src"), 124 | package_dir={"": "src"}, 125 | package_data={ 126 | "azure-slurm-install": [ 127 | "BUILD_NUMBER", 128 | "private-requirements.json", 129 | "../NOTICE", 130 | "../notices", 131 | ] 132 | }, 133 | install_requires=[ 134 | "typing_extensions==3.7.4.3", 135 | "zipp==3.19.1" 136 | ], 137 | tests_require=["pytest==3.2.3"], 138 | cmdclass={"test": PyTest, "format": Formatter, "types": TypeChecking}, 139 | url="http://www.cyclecomputing.com", 140 | maintainer="Cycle Computing", 141 | maintainer_email="support@cyclecomputing.com", 142 | ) 143 | -------------------------------------------------------------------------------- /azure-slurm-install/slurmel8.repo: -------------------------------------------------------------------------------- 1 | [slurm] 2 | name=Slurm Workload Manager 3 | baseurl=https://packages.microsoft.com/yumrepos/slurm-el8 4 | enabled=1 5 | gpgcheck=1 6 | gpgkey=https://packages.microsoft.com/keys/microsoft.asc 7 | priority=10 -------------------------------------------------------------------------------- /azure-slurm-install/slurmel8insiders.repo: -------------------------------------------------------------------------------- 1 | [slurm] 2 | name=Slurm Workload Manager 3 | baseurl=https://packages.microsoft.com/yumrepos/slurm-el8-insiders 4 | enabled=1 5 | gpgcheck=1 6 | gpgkey=https://packages.microsoft.com/keys/microsoft.asc 7 | priority=10 -------------------------------------------------------------------------------- /azure-slurm-install/slurmel9.repo: -------------------------------------------------------------------------------- 1 | [slurm] 2 | name=Slurm Workload Manager 3 | baseurl=https://packages.microsoft.com/yumrepos/slurm-el9 4 | enabled=1 5 | 
gpgcheck=1 6 | gpgkey=https://packages.microsoft.com/keys/microsoft.asc 7 | priority=10 -------------------------------------------------------------------------------- /azure-slurm-install/slurmel9insiders.repo: -------------------------------------------------------------------------------- 1 | [slurm] 2 | name=Slurm Workload Manager 3 | baseurl=https://packages.microsoft.com/yumrepos/slurm-el9-insiders 4 | enabled=1 5 | gpgcheck=1 6 | gpgkey=https://packages.microsoft.com/keys/microsoft.asc 7 | priority=10 -------------------------------------------------------------------------------- /azure-slurm-install/start-services.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | if [ "$1" == "" ]; then 5 | echo "Usage: $0 [scheduler|execute|login]" 6 | exit 1 7 | fi 8 | 9 | role=$1 10 | monitoring_enabled=$(/opt/cycle/jetpack/bin/jetpack config cyclecloud.monitoring.enabled False) 11 | 12 | reload_prom_config(){ 13 | # Find the Prometheus process and send SIGHUP to reload config or log a warning if not found 14 | if [[ "$monitoring_enabled" == "False" ]]; then 15 | echo "Monitoring is disabled, skipping Prometheus config reload" 16 | return 0 17 | fi 18 | PROM_PID=$(pgrep -f 'prometheus') 19 | if [ -n "$PROM_PID" ]; then 20 | echo "Sending SIGHUP to Prometheus (PID $PROM_PID) to reload configuration" 21 | kill -HUP $PROM_PID 22 | else 23 | echo "Prometheus process not found, unable to reload configuration" 24 | fi 25 | } 26 | # all nodes need to have munge running 27 | echo restarting munge... 28 | systemctl restart munge 29 | # wait up to 60 seconds for munge to start 30 | iters=60 31 | while [ $iters -ge 0 ]; do 32 | echo test | munge > /dev/null 2>&1 33 | if [ $? == 0 ]; then 34 | break 35 | fi 36 | sleep 1 37 | iters=$(( $iters - 1 )) 38 | done 39 | 40 | # login nodes explicitly should _not_ have slurmd running. 41 | if [ $role == "login" ]; then 42 | reload_prom_config 43 | exit 0 44 | fi 45 | 46 | # execute nodes just need slurmd 47 | if [ $role == "execute" ]; then 48 | systemctl start slurmd 49 | reload_prom_config 50 | exit 0 51 | fi 52 | 53 | # sanity check - make sure a valid role was actually passed in. 54 | # note they are defined in the slurm_*_role.rb 55 | if [ $role != "scheduler" ]; then 56 | echo unknown role! $role 1>&2 57 | exit 2 58 | fi 59 | 60 | # lastly - the scheduler 61 | 62 | systemctl show slurmdbd 2>&1 > /dev/null && systemctl start slurmdbd 63 | # there is no obvious way to check slurmdbd status _before_ starting slurmctld 64 | sleep 10 65 | systemctl start slurmctld 66 | attempts=3 67 | delay=5 68 | set +e 69 | for i in $( seq 1 $attempts ); do 70 | echo $i/$attempts sleeping $delay seconds before running scontrol ping 71 | sleep $delay 72 | scontrol ping 73 | if [ $? == 0 ]; then 74 | systemctl start slurmctld || exit 1 75 | break 76 | fi; 77 | done 78 | if [ $i == $attempts ] && [ $? != 0 ]; then 79 | echo FATAL: slurmctld failed to start! 1>&2 80 | echo Here are the last 100 lines of slurmctld.log 81 | tail -n 100 /var/log/slurmctld/slurmctld.log 1>&2 82 | exit 2 83 | fi 84 | 85 | run_slurm_exporter() { 86 | # Run Slurm Exporter in a container 87 | if [[ "$role" != "scheduler" ]]; then 88 | echo "Slurm Exporter can only be run on the scheduler node, skipping setup." 
89 | return 0 90 | fi 91 | 92 | primary_scheduler=$(/opt/cycle/jetpack/bin/jetpack config slurm.is_primary_scheduler True) 93 | if [[ "$primary_scheduler" != "True" ]]; then 94 | echo "This is not the primary scheduler, skipping slurm_exporter setup." 95 | return 0 96 | fi 97 | 98 | SLURM_EXPORTER_PORT=9200 99 | SLURM_EXPORTER_IMAGE_NAME="ghcr.io/slinkyproject/slurm-exporter:0.3.0" 100 | # Try to get the token, retry up to 3 times 101 | unset SLURM_JWT 102 | for attempt in 1 2 3; do 103 | export $(scontrol token username="slurmrestd" lifespan=infinite) 104 | if [ -n "$SLURM_JWT" ]; then 105 | break 106 | fi 107 | echo "Attempt $attempt: Failed to get SLURM_JWT token, retrying in 5 seconds..." 108 | scontrol reconfigure 109 | sleep 5 110 | done 111 | 112 | if [ -z "$SLURM_JWT" ]; then 113 | echo "Failed to get SLURM_JWT token after 3 attempts." 114 | echo "Check slurmctld status, slurm.conf JWT configuration, and logs for errors." 115 | /opt/cycle/jetpack/bin/jetpack log "Failed to get SLURM_JWT token after 3 attempts, disabling slurm_exporter setup." --level=warn --priority=medium 116 | return 0 117 | fi 118 | # Check if the container is already running, and if so, stop it 119 | if [ "$(docker ps -q -f ancestor=$SLURM_EXPORTER_IMAGE_NAME)" ]; then 120 | echo "Slurm Exporter is already running, stopping it..." 121 | docker stop $(docker ps -q -f ancestor=$SLURM_EXPORTER_IMAGE_NAME) 122 | fi 123 | 124 | # Run the Slurm Exporter container, expose the port so prometheus can scrape it. Redirect the host.docker.internal to the host gateway == localhost 125 | docker run -v /var:/var -e SLURM_JWT=${SLURM_JWT} -d --restart always -p ${SLURM_EXPORTER_PORT}:8080 --add-host=host.docker.internal:host-gateway $SLURM_EXPORTER_IMAGE_NAME -server http://host.docker.internal:6820 -cache-freq 10s 126 | 127 | # Check if the container is running 128 | if [ "$(docker ps -q -f ancestor=$SLURM_EXPORTER_IMAGE_NAME)" ]; then 129 | echo "Slurm Exporter is running" 130 | else 131 | echo "Slurm Exporter is not running" 132 | /opt/cycle/jetpack/bin/jetpack log "Slurm Exporter container failed to start" --level=warn --priority=medium 133 | return 0 # do not fail the slurm startup if exporter fails 134 | fi 135 | 136 | reload_prom_config 137 | 138 | sleep 20 139 | if curl -s http://localhost:${SLURM_EXPORTER_PORT}/metrics | grep -q "slurm_nodes_total"; then 140 | echo "Slurm Exporter metrics are available" 141 | else 142 | echo "Slurm Exporter metrics are not available" 143 | /opt/cycle/jetpack/bin/jetpack log "Slurm Exporter metrics are not available" --level=warn --priority=medium 144 | fi 145 | } 146 | 147 | # start slurmrestd 148 | sleep 10 149 | systemctl start slurmrestd 150 | systemctl status slurmrestd --no-pager 151 | if [ $? != 0 ]; then 152 | echo Warning: slurmrestd failed to start! 1>&2 153 | /opt/cycle/jetpack/bin/jetpack log "slurmrestd failed to start" --level=warn --priority=medium 154 | exit 0 155 | fi 156 | # start slurm_exporter if monitoring is enabled and slurmrestd is running 157 | if [[ "$monitoring_enabled" == "True" ]]; then 158 | run_slurm_exporter 159 | fi 160 | exit 0 161 | -------------------------------------------------------------------------------- /azure-slurm-install/suse.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | # 5 | 6 | INSTALLED_FILE=/etc/azslurm-bins.installed 7 | if [ -e $INSTALLED_FILE ]; then 8 | exit 0 9 | fi 10 | 11 | SLURM_ROLE=$1 12 | SLURM_VERSION=$(echo $2 | cut -d- -f1) 13 | 14 | 15 | which munge 2>/dev/null 16 | if [ $? != 0 ]; then 17 | zypper install -y munge || exit 1 18 | fi 19 | 20 | which python3 2>/dev/null 21 | if [ $? != 0 ]; then 22 | zypper install -y python3 || exit 1 23 | fi 24 | 25 | set -e 26 | 27 | if [ ${SLURM_ROLE} == "scheduler" ]; then 28 | zypper install -y slurm-${SLURM_VERSION} \ 29 | slurm-slurmdbd-${SLURM_VERSION} \ 30 | slurm-lua-${SLURM_VERSION} \ 31 | slurm-sql-${SLURM_VERSION} 32 | else 33 | zypper install -y slurm-${SLURM_VERSION} 34 | fi 35 | 36 | for fil in slurm.conf cgroup.conf slurmdbd.conf; do 37 | if [ -e /etc/slurm/$fil ]; then 38 | if [ ! -L /etc/slurm/$fil ]; then 39 | mv /etc/slurm/$fil /etc/slurm/$fil.suse_example 40 | fi 41 | fi 42 | done 43 | 44 | touch $INSTALLED_FILE -------------------------------------------------------------------------------- /azure-slurm-install/templates/cgroup.conf.template: -------------------------------------------------------------------------------- 1 | ConstrainCores=yes 2 | ConstrainRamSpace=yes 3 | #ConstrainSwapSpace=yes 4 | #TaskAffinity=no 5 | ConstrainDevices=yes 6 | -------------------------------------------------------------------------------- /azure-slurm-install/templates/enroot.conf.template: -------------------------------------------------------------------------------- 1 | ENROOT_RUNTIME_PATH {ENROOT_SCRATCH_DIR}/enroot-run/user-$(id -u) 2 | ENROOT_CACHE_PATH {ENROOT_SCRATCH_DIR}/enroot-cache/group-$(id -g) 3 | ENROOT_DATA_PATH {ENROOT_SCRATCH_DIR}/enroot-data/user-$(id -u) 4 | ENROOT_TEMP_PATH {ENROOT_SCRATCH_DIR}/enroot-temp 5 | ENROOT_SQUASH_OPTIONS -noI -noD -noF -noX -no-duplicates 6 | ENROOT_MOUNT_HOME y 7 | ENROOT_RESTRICT_DEV y 8 | ENROOT_ROOTFS_WRITABLE y 9 | MELLANOX_VISIBLE_DEVICES all -------------------------------------------------------------------------------- /azure-slurm-install/templates/job_submit.lua: -------------------------------------------------------------------------------- 1 | -- Copyright (c) Microsoft Corporation. All rights reserved. 2 | -- Licensed under the MIT License. 
3 | 4 | function slurm_job_submit(job_desc, part_list, submit_uid) 5 | if job_desc.argv ~= nil then 6 | for i = 0, job_desc.argc, 1 do 7 | if job_desc.argv[i] == "--switches" then 8 | slurm.log_info("--switches was set, ignoring."); 9 | return slurm.SUCCESS; 10 | end 11 | end 12 | end 13 | if job_desc.network ~= nil and job_desc.network ~= '' then 14 | if job_desc.network == "sn_single" then 15 | slurm.log_info("sn_single was set, ignoring."); 16 | return slurm.SUCCESS 17 | end 18 | end 19 | slurm.log_info("Setting reqswitch to 1."); 20 | job_desc.req_switch = 1; 21 | 22 | slurm.log_info("returning."); 23 | 24 | return slurm.SUCCESS 25 | end 26 | 27 | function slurm_job_modify(job_desc, job_rec, part_list, modify_uid) 28 | return slurm.SUCCESS 29 | end 30 | 31 | slurm.log_info("initialized job_submit_cyclecloud") 32 | return slurm.SUCCESS -------------------------------------------------------------------------------- /azure-slurm-install/templates/munge.override: -------------------------------------------------------------------------------- 1 | [Service] 2 | Restart=always -------------------------------------------------------------------------------- /azure-slurm-install/templates/slurm-limits.conf: -------------------------------------------------------------------------------- 1 | * soft memlock unlimited 2 | * hard memlock unlimited -------------------------------------------------------------------------------- /azure-slurm-install/templates/slurm.conf.template: -------------------------------------------------------------------------------- 1 | MpiDefault=none 2 | ProctrackType=proctrack/cgroup 3 | ReturnToService=2 4 | PropagateResourceLimits=ALL 5 | SlurmctldPidFile=/var/run/slurmctld.pid 6 | SlurmdPidFile=/var/run/slurmd.pid 7 | SlurmdSpoolDir=/var/spool/slurmd 8 | SlurmUser=slurm 9 | StateSaveLocation={state_save_location} 10 | SwitchType=switch/none 11 | TaskPlugin=task/affinity,task/cgroup 12 | SchedulerType=sched/backfill 13 | SelectType=select/cons_tres 14 | GresTypes=gpu 15 | SelectTypeParameters=CR_Core_Memory 16 | # We use a "safe" form of the CycleCloud ClusterName throughout slurm. 17 | # First we lowercase the cluster name, then replace anything 18 | # that is not letters, digits and '-' with a '-' 19 | # eg My Cluster == my-cluster 20 | ClusterName={cluster_name} 21 | JobAcctGatherType=jobacct_gather/none 22 | SlurmctldDebug=debug 23 | SlurmctldLogFile=/var/log/slurmctld/slurmctld.log 24 | SlurmctldParameters=idle_on_node_suspend 25 | SlurmdDebug=debug 26 | SlurmdLogFile=/var/log/slurmd/slurmd.log 27 | DisableRootJobs=No 28 | LaunchParameters={launch_parameters} 29 | # TopologyPlugin=topology/tree 30 | # If you use the TopologyPlugin you likely also want to use our 31 | # job submit plugin so that your jobs run on a single switch 32 | # or just add --switches 1 to your submission scripts 33 | # JobSubmitPlugins=lua 34 | PrivateData=cloud 35 | PrologSlurmctld=/opt/azurehpc/slurm/prolog.sh 36 | Prolog={prolog} 37 | Epilog={epilog} 38 | # This flag executes prolog scripts on job allocation rather than first task/step launch. 39 | # It does not change how a prolog script is run but runs it earlier. 40 | # This flag is also required for GB200/GB300 IMEX integration. 41 | # Recommended to keep this on. 
42 | PrologFlags=alloc 43 | TreeWidth=65533 44 | ReconfigFlags=KeepPowerSaveSettings 45 | ResumeTimeout=1800 46 | SuspendTimeout=600 47 | SuspendTime=300 48 | ResumeProgram=/opt/azurehpc/slurm/resume_program.sh 49 | ResumeFailProgram=/opt/azurehpc/slurm/resume_fail_program.sh 50 | SuspendProgram=/opt/azurehpc/slurm/suspend_program.sh 51 | SchedulerParameters=max_switch_wait=24:00:00 52 | # Only used with dynamic node partitions. 53 | MaxNodeCount={max_node_count} 54 | 55 | ## Node HealthChecks related configurations 56 | 57 | ## The interval in seconds between executions of HealthCheckProgram. Setting the value to zero disables execution. 58 | HealthCheckInterval={health_interval} 59 | ## Identify what node states should execute the HealthCheckProgram. Multiple state values may be specified with a comma separator. The default value is ANY to execute on nodes in any state. 60 | HealthCheckNodeState=ANY 61 | ## Fully qualified pathname of a script to execute as user root periodically on all compute nodes that are not in the NOT_RESPONDING state. 62 | HealthCheckProgram={health_program} 63 | 64 | 65 | # This has the partition definitions managed by azslurm partitions > /sched/azure.conf 66 | Include azure.conf 67 | # If slurm.accounting.enabled=true this will set up slurmdbd 68 | # otherwise it will just define accounting_storage/none as the plugin 69 | Include accounting.conf 70 | # SuspendExcNodes is persisted to /etc/slurm/keep_alive.conf 71 | # via azslurmd. There can be up to a 15 second delay. 72 | # Note this is a workaround until ReconfigFlags=KeepPowerSaveSettings is fixed in slurm. 73 | Include keep_alive.conf 74 | {auth_alt_type} 75 | {auth_alt_parameters} 76 | 77 | SlurmCtldHost={slurmctldhost} -------------------------------------------------------------------------------- /azure-slurm-install/templates/slurm_exporter.yml: -------------------------------------------------------------------------------- 1 | scrape_configs: 2 | - job_name: slurm_exporter 3 | static_configs: 4 | - targets: ["instance_name:9200"] 5 | relabel_configs: 6 | - source_labels: [__address__] 7 | target_label: instance 8 | regex: '([^:]+)(:[0-9]+)?' 9 | replacement: '${1}' -------------------------------------------------------------------------------- /azure-slurm-install/templates/slurmctld.override: -------------------------------------------------------------------------------- 1 | [Service] 2 | WorkingDirectory=/var/log/slurmctld -------------------------------------------------------------------------------- /azure-slurm-install/templates/slurmdbd.conf.template: -------------------------------------------------------------------------------- 1 | # 2 | # See the slurmdbd.conf man page for more information. 
3 | # 4 | # Archive info 5 | #ArchiveJobs=yes 6 | #ArchiveDir="/tmp" 7 | #ArchiveSteps=yes 8 | #ArchiveScript= 9 | #JobPurge=12 10 | #StepPurge=1 11 | # 12 | # Authentication info 13 | AuthType=auth/munge 14 | #AuthInfo=/var/run/munge/munge.socket.2 15 | {auth_alt_type} 16 | {auth_alt_parameters} 17 | # 18 | # slurmDBD info 19 | DbdAddr=localhost 20 | DbdHost={dbdhost} 21 | #DbdPort=7031 22 | SlurmUser=slurm 23 | #MessageTimeout=300 24 | DebugLevel=verbose 25 | #DefaultQOS=normal,standby 26 | LogFile=/var/log/slurmctld/slurmdbd.log 27 | PidFile=/var/run/slurmdbd.pid 28 | #PluginDir=/usr/lib/slurm 29 | #PrivateData=accounts,users,usage,jobs 30 | #TrackWCKey=yes 31 | # 32 | # Database info 33 | StorageType=accounting_storage/mysql 34 | StorageHost={accountdb} 35 | StorageLoc={storageloc} 36 | {storagepass} 37 | StorageUser={dbuser} 38 | {storage_parameters} 39 | -------------------------------------------------------------------------------- /azure-slurm-install/templates/slurmrestd.override: -------------------------------------------------------------------------------- 1 | [Service] 2 | AmbientCapabilities=CAP_SETGID 3 | CapabilityBoundingSet=CAP_SETGID -------------------------------------------------------------------------------- /azure-slurm-install/test/installlib_test.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | import installlib 3 | from installlib import CCNode 4 | import logging 5 | from typing import Dict 6 | import pytest 7 | 8 | 9 | base_software_configuration = { 10 | "slurm": {"nodename_as_hostname": True}, 11 | "cyclecloud": {"hosts": {"standalone_dns": {"enabled": False}}}, 12 | } 13 | 14 | 15 | @pytest.fixture 16 | def mock_clock(): 17 | yield installlib.use_mock_clock() 18 | 19 | 20 | def _node( 21 | name: str, 22 | hostname: str, 23 | private_ipv4="10.1.0.5", 24 | status="Allocation", 25 | node_prefix=None, 26 | software_configuration=None, 27 | ) -> CCNode: 28 | if not software_configuration: 29 | software_configuration = deepcopy(base_software_configuration) 30 | if node_prefix: 31 | software_configuration["slurm"]["node_prefix"] = node_prefix 32 | 33 | return CCNode( 34 | name=name, 35 | nodearray_name="execute", 36 | hostname=hostname, 37 | private_ipv4="10.1.0.5", 38 | status=status, 39 | software_configuration=software_configuration or {}, 40 | ) 41 | 42 | 43 | def test_is_valid_hostname(mock_clock) -> None: 44 | assert not installlib.is_valid_hostname({}, _node("node1", "random")) 45 | assert installlib.is_valid_hostname({}, _node("node1", "node1")) 46 | assert installlib.is_valid_hostname( 47 | {}, _node("node1", "prefix-node1", node_prefix="prefix-") 48 | ) 49 | 50 | assert not installlib.is_valid_hostname( 51 | {}, _node("NoDe1", "prefix-NoDe1", node_prefix="prefix-") 52 | ) 53 | assert installlib.is_valid_hostname( 54 | {}, _node("NoDe1", "prefix-node1", node_prefix="prefix-") 55 | ) 56 | 57 | # let's setup a noe that should have a hostname like ip-XXXXXXX 58 | soft_config = deepcopy(base_software_configuration) 59 | soft_config["cyclecloud"]["hosts"]["standalone_dns"]["enabled"] = True 60 | # first - fail if it has a non ip-* hostname. 
61 | assert not installlib.is_valid_hostname( 62 | {}, 63 | _node( 64 | "node1", 65 | "prefix-node1", 66 | node_prefix="prefix-", 67 | software_configuration=soft_config, 68 | ), 69 | ) 70 | assert installlib.is_valid_hostname( 71 | {}, 72 | _node( 73 | "node1", 74 | "ip-0A010005", 75 | node_prefix="prefix-", 76 | software_configuration=soft_config, 77 | ), 78 | ) 79 | 80 | # and lastly, make sure custom names work. 81 | assert installlib.is_valid_hostname( 82 | {"valid_hostnames": ["^justthisone$"]}, 83 | _node( 84 | "node1", 85 | "justthisone", 86 | node_prefix="prefix-", 87 | software_configuration=soft_config, 88 | ), 89 | ) 90 | 91 | assert not installlib.is_valid_hostname( 92 | {"valid_hostnames": ["^justthisone$"]}, 93 | _node( 94 | "node1", 95 | "butnotthisone", 96 | node_prefix="prefix-", 97 | software_configuration=soft_config, 98 | ), 99 | ) 100 | 101 | 102 | def test_is_standalone_dns(mock_clock) -> None: 103 | soft_config = deepcopy(base_software_configuration) 104 | 105 | assert not installlib.is_standalone_dns( 106 | _node(name="node1", hostname="node1", software_configuration=soft_config) 107 | ) 108 | soft_config["cyclecloud"]["hosts"]["standalone_dns"]["enabled"] = True 109 | assert installlib.is_standalone_dns( 110 | _node(name="node1", hostname="node1", software_configuration=soft_config) 111 | ) 112 | 113 | 114 | def test_get_ccnode(mock_clock) -> None: 115 | 116 | expected = CCNode( 117 | name="node1", 118 | nodearray_name="execute", 119 | hostname="prefix-node1", 120 | private_ipv4="10.1.0.5", 121 | status="Allocation", 122 | software_configuration=base_software_configuration, 123 | ) 124 | 125 | cluster_status = { 126 | "nodes": [ 127 | { 128 | "Name": "node1", 129 | "Template": "execute", 130 | "PrivateIp": "10.1.0.5", 131 | "Hostname": "prefix-node1", 132 | "Status": "Allocation", 133 | "Configuration": base_software_configuration, 134 | } 135 | ] 136 | } 137 | 138 | actual = installlib.get_ccnode( 139 | {}, 140 | "node1", 141 | lambda config: cluster_status, 142 | ) 143 | assert expected == actual 144 | 145 | try: 146 | installlib.get_ccnode( 147 | {}, 148 | "node2", 149 | lambda config: cluster_status, 150 | ) 151 | assert False 152 | except RuntimeError as e: 153 | assert "Node node2 not found in cluster status!" in str(e) 154 | 155 | 156 | def test_await_node_hostname_nodename_as_hostname(mock_clock) -> None: 157 | """ 158 | This is a bit of a lengthy test, but the point is to ensure that 159 | we continue to ask cluster status for the node until the hostname 160 | is correct. 161 | """ 162 | soft_config = deepcopy(base_software_configuration) 163 | soft_config["slurm"]["node_prefix"] = "prefix-" 164 | 165 | class IterativeClusterStatus: 166 | def __init__(self): 167 | self.iters = 0 168 | self.cluster_status = { 169 | "nodes": [ 170 | { 171 | "Name": "node1", 172 | "Template": "execute", 173 | "PrivateIp": "10.1.0.5", 174 | "Hostname": "random", 175 | "Status": "Allocation", 176 | "Configuration": soft_config, 177 | } 178 | ] 179 | } 180 | 181 | def __call__(self, config: Dict) -> Dict: 182 | """ 183 | This is the meat of the test - we assert that 184 | we keep retrying until the hostname is correct. 
185 | """ 186 | self.iters += 1 187 | if self.iters == 1: 188 | logging.warning("Iter 1") 189 | return self.cluster_status 190 | if self.iters == 2: 191 | logging.warning("Iter 2") 192 | assert self.cluster_status["nodes"][0]["Hostname"] == "random" 193 | self.cluster_status["nodes"][0]["Hostname"] = "prefix-node1" 194 | return self.cluster_status 195 | raise RuntimeError(f"Reached iter {self.iters}") 196 | 197 | actual = installlib.await_node_hostname( 198 | config={}, 199 | node_name="node1", 200 | timeout=300, 201 | cluster_status_func=IterativeClusterStatus(), 202 | ) 203 | 204 | expected = CCNode( 205 | name="node1", 206 | nodearray_name="execute", 207 | hostname="prefix-node1", 208 | private_ipv4="10.1.0.5", 209 | status="Allocation", 210 | software_configuration=soft_config, 211 | ) 212 | 213 | assert actual.to_dict() == expected.to_dict() -------------------------------------------------------------------------------- /azure-slurm-install/ubuntu.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # 5 | set -e 6 | INSTALLED_FILE=/etc/azslurm-bins.installed 7 | SLURM_ROLE=$1 8 | SLURM_VERSION=$2 9 | 10 | UBUNTU_VERSION=$(cat /etc/os-release | grep VERSION_ID | cut -d= -f2 | cut -d\" -f2) 11 | ENROOT_VERSION="4.0.1" 12 | PYXIS_VERSION="0.21.0" 13 | PYXIS_DIR="/opt/pyxis" 14 | 15 | SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 16 | ARTIFACTS_DIR="$SCRIPT_DIR/artifacts" 17 | 18 | dpkg_pkg_install() { 19 | local packages_to_install="" 20 | local packages_to_hold="" 21 | local pkg_names=$1 22 | 23 | for pkg_name in $pkg_names; do 24 | # Check if it's a local .deb file 25 | if [[ "$pkg_name" == *.deb ]]; then 26 | # Extract package name from .deb filename 27 | local base_pkg=$(basename "$pkg_name" | sed 's/_.*$//') 28 | local version_pattern=$(basename "$pkg_name" | sed 's/^[^_]*_\([^-]*\).*$/\1/') 29 | # Check if it's a versioned SLURM package 30 | elif [[ "$pkg_name" == *"=${SLURM_VERSION}"* ]]; then 31 | local base_pkg=$(echo "$pkg_name" | sed "s/=${SLURM_VERSION}.*//") 32 | local version_pattern="${SLURM_VERSION}" 33 | else 34 | # Regular package check 35 | if ! dpkg-query -W -f='${Status}' "${pkg_name}" 2>/dev/null | grep -q "install ok installed"; then 36 | packages_to_install="$packages_to_install $pkg_name" 37 | fi 38 | continue 39 | fi 40 | # For versioned packages, check if the installed version matches 41 | if ! 
dpkg -l | grep "^[hi]i ${base_pkg}" | grep -q "${version_pattern}"; then 42 | packages_to_install="$packages_to_install $pkg_name" 43 | packages_to_hold="$packages_to_hold $base_pkg" 44 | fi 45 | done 46 | 47 | if [ -n "$packages_to_install" ]; then 48 | echo "The following packages need to be installed: $packages_to_install" 49 | apt update 50 | # Install all packages in one command 51 | apt install -y --allow-downgrades --allow-change-held-packages $packages_to_install 52 | # Hold SLURM packages to prevent automatic updates 53 | if [ -n "$packages_to_hold" ]; then 54 | apt-mark hold $packages_to_hold 55 | fi 56 | echo "Successfully installed all required packages" 57 | else 58 | echo "All required packages are already installed" 59 | fi 60 | } 61 | 62 | dependency_packages="" 63 | 64 | # Handle python3-venv for Ubuntu > 19 65 | if [[ $UBUNTU_VERSION > "19" ]]; then 66 | dependency_packages="$dependency_packages python3-venv" 67 | fi 68 | 69 | dependency_packages="$dependency_packages munge libmysqlclient-dev libssl-dev jq libjansson-dev libjwt-dev binutils gcc make wget" 70 | 71 | arch=$(dpkg --print-architecture) 72 | if [[ $UBUNTU_VERSION =~ ^24\.* ]]; then 73 | REPO=slurm-ubuntu-noble 74 | elif [ $UBUNTU_VERSION == 22.04 ]; then 75 | REPO=slurm-ubuntu-jammy 76 | else 77 | REPO=slurm-ubuntu-focal 78 | fi 79 | 80 | REPO_GROUP="stable" 81 | INSIDERS=$(/opt/cycle/jetpack/bin/jetpack config slurm.insiders False) 82 | if [[ "$INSIDERS" == "True" ]]; then 83 | REPO_GROUP="insiders" 84 | fi 85 | 86 | if [[ $UBUNTU_VERSION =~ ^24\.* ]]; then 87 | # microsoft-prod no longer installs GPG key in /etc/apt/trusted.gpg.d 88 | # so we need to use signed-by instead to specify the key for Ubuntu 24.04 onwards 89 | echo "deb [arch=$arch signed-by=/usr/share/keyrings/microsoft-prod.gpg] https://packages.microsoft.com/repos/$REPO/ $REPO_GROUP main" > /etc/apt/sources.list.d/slurm.list 90 | else 91 | if [ "$arch" == "arm64" ]; then 92 | echo "Slurm is not supported on arm64 architecture for Ubuntu versions < 24.04" 93 | exit 1 94 | fi 95 | echo "deb [arch=$arch] https://packages.microsoft.com/repos/$REPO/ $REPO_GROUP main" > /etc/apt/sources.list.d/slurm.list 96 | fi 97 | 98 | echo "\ 99 | Package: slurm, slurm-* 100 | Pin: origin \"packages.microsoft.com\" 101 | Pin-Priority: 990 102 | 103 | Package: slurm, slurm-* 104 | Pin: origin *ubuntu.com* 105 | Pin-Priority: -1" > /etc/apt/preferences.d/slurm-repository-pin-990 106 | 107 | ## This package is pre-installed in all hpc images used by cyclecloud, but if customer wants to 108 | ## use generic ubuntu marketplace image then this package sets up the right gpg keys for PMC. 109 | if [ ! 
-e /etc/apt/sources.list.d/microsoft-prod.list ]; then 110 | curl -sSL -O https://packages.microsoft.com/config/ubuntu/$UBUNTU_VERSION/packages-microsoft-prod.deb 111 | dpkg -i packages-microsoft-prod.deb 112 | rm packages-microsoft-prod.deb 113 | fi 114 | 115 | slurm_packages="slurm-smd slurm-smd-client slurm-smd-dev slurm-smd-libnss-slurm slurm-smd-libpam-slurm-adopt slurm-smd-sview" 116 | sched_packages="slurm-smd-slurmctld slurm-smd-slurmdbd slurm-smd-slurmrestd" 117 | execute_packages="slurm-smd-slurmd" 118 | 119 | # Collect all SLURM packages based on role 120 | all_slurm_packages="$slurm_packages" 121 | 122 | if [ "${SLURM_ROLE}" == "scheduler" ]; then 123 | all_slurm_packages="$all_slurm_packages $sched_packages" 124 | fi 125 | 126 | if [ "${SLURM_ROLE}" == "execute" ]; then 127 | all_slurm_packages="$all_slurm_packages $execute_packages" 128 | fi 129 | 130 | # Combine dependency packages and versioned SLURM packages 131 | all_packages="$dependency_packages" 132 | 133 | # Add version suffix to all slurm packages 134 | for pkg in $all_slurm_packages; do 135 | all_packages="$all_packages ${pkg}=${SLURM_VERSION}*" 136 | done 137 | 138 | # Install all packages using the unified function 139 | dpkg_pkg_install "$all_packages" 140 | 141 | # Install slurm_exporter container (will refactor this later) 142 | monitoring_enabled=$(/opt/cycle/jetpack/bin/jetpack config cyclecloud.monitoring.enabled False) 143 | if [ "${SLURM_ROLE}" == "scheduler" ] && [ "$monitoring_enabled" == "True" ]; then 144 | SLURM_EXPORTER_IMAGE_NAME="ghcr.io/slinkyproject/slurm-exporter:0.3.0" 145 | docker pull $SLURM_EXPORTER_IMAGE_NAME 146 | fi 147 | 148 | # Check if artifacts directory exists 149 | if [ ! -d "$ARTIFACTS_DIR" ]; then 150 | echo "Error: Artifacts directory not found: $ARTIFACTS_DIR" 151 | exit 1 152 | fi 153 | 154 | #verify enroot package 155 | run_file=${ARTIFACTS_DIR}/enroot-check_${ENROOT_VERSION}_$(uname -m).run 156 | chmod 755 $run_file 157 | $run_file --verify 158 | 159 | # Install enroot package 160 | dpkg_pkg_install "${ARTIFACTS_DIR}/enroot_${ENROOT_VERSION}-1_${arch}.deb ${ARTIFACTS_DIR}/enroot+caps_${ENROOT_VERSION}-1_${arch}.deb" 161 | 162 | # Install pyxis 163 | if [[ ! 
-f $PYXIS_DIR/spank_pyxis.so ]]; then 164 | tar -xzf ${ARTIFACTS_DIR}/pyxis-${PYXIS_VERSION}.tar.gz 165 | cd pyxis-${PYXIS_VERSION} 166 | make 167 | mkdir -p $PYXIS_DIR 168 | cp -fv spank_pyxis.so $PYXIS_DIR 169 | chmod +x $PYXIS_DIR/spank_pyxis.so 170 | fi 171 | 172 | touch $INSTALLED_FILE 173 | exit 174 | -------------------------------------------------------------------------------- /azure-slurm/conf/logging.conf: -------------------------------------------------------------------------------- 1 | [loggers] 2 | keys=root, repro, slurm_driver, demand, cost, topology 3 | 4 | [handlers] 5 | keys=consoleHandler, fileHandler, reproFileHandler, qcmdHandler, demandHandler, suspendHandler, azslurmdHandler, resumeHandler, resume_failHandler, costFileHandler, topologyFileHandler, topologyConsoleHandler 6 | 7 | [formatters] 8 | keys=simpleFormatter, reproFormatter 9 | 10 | [logger_root] 11 | level=DEBUG 12 | handlers=consoleHandler, fileHandler, suspendHandler, azslurmdHandler, resumeHandler, resume_failHandler 13 | 14 | [logger_repro] 15 | qualname=repro 16 | # level=INFO 17 | # set to REPRO to enable 18 | level=DEBUG 19 | handlers=reproFileHandler 20 | 21 | [logger_slurm_driver] 22 | propagate=0 23 | qualname=slurm.driver 24 | # level=INFO 25 | # set to REPRO to enable 26 | level=DEBUG 27 | handlers=qcmdHandler 28 | 29 | [logger_demand] 30 | propagate=1 31 | qualname=demand 32 | # level=INFO 33 | # set to REPRO to enable 34 | level=DEBUG 35 | handlers=demandHandler 36 | 37 | [logger_cost] 38 | qualname=cost 39 | level=DEBUG 40 | handlers=costFileHandler 41 | 42 | [logger_topology] 43 | qualname=topology 44 | level=DEBUG 45 | handlers=topologyFileHandler, topologyConsoleHandler 46 | 47 | [handler_fileHandler] 48 | class=logging.handlers.RotatingFileHandler 49 | level=DEBUG 50 | formatter=simpleFormatter 51 | args=("/opt/azurehpc/slurm/logs/autoscale.log", "a", 1024 * 1024 * 5, 5) 52 | 53 | [handler_demandHandler] 54 | class=logging.handlers.RotatingFileHandler 55 | level=DEBUG 56 | formatter=simpleFormatter 57 | args=("/opt/azurehpc/slurm/logs/demand.log", "a", 1024 * 1024 * 5, 5) 58 | 59 | [handler_qcmdHandler] 60 | class=logging.handlers.RotatingFileHandler 61 | level=DEBUG 62 | formatter=simpleFormatter 63 | args=("/opt/azurehpc/slurm/logs/audit.log", "a", 1024 * 1024 * 5, 5) 64 | 65 | [handler_reproFileHandler] 66 | class=logging.handlers.RotatingFileHandler 67 | level=INFO 68 | formatter=reproFormatter 69 | args=("/opt/azurehpc/slurm/logs/autoscale_repro.log", "a", 1024 * 1024 * 5, 5) 70 | 71 | [handler_costFileHandler] 72 | class=logging.handlers.RotatingFileHandler 73 | level=DEBUG 74 | formatter=simpleFormatter 75 | args=("/opt/azurehpc/slurm/logs/cost.log", "a", 1024 * 1024 * 5, 5) 76 | 77 | [handler_topologyFileHandler] 78 | class=logging.handlers.RotatingFileHandler 79 | level=DEBUG 80 | formatter=simpleFormatter 81 | args=("/opt/azurehpc/slurm/logs/topology.log", "a", 1024 * 1024 * 5, 5) 82 | 83 | [handler_topologyConsoleHandler] 84 | class=StreamHandler 85 | level=ERROR 86 | formatter=simpleFormatter 87 | args=(sys.stderr,) 88 | 89 | [handler_consoleHandler] 90 | class=StreamHandler 91 | level=ERROR 92 | formatter=simpleFormatter 93 | args=(sys.stderr,) 94 | 95 | [handler_azslurmdHandler] 96 | name=azslurmd 97 | class=logging.handlers.RotatingFileHandler 98 | level=CRITICAL 99 | formatter=simpleFormatter 100 | args=("/opt/azurehpc/slurm/logs/azslurmd.log", "a", 1024 * 1024 * 50, 15) 101 | 102 | [handler_resumeHandler] 103 | name=resume 104 | 
class=logging.handlers.RotatingFileHandler 105 | level=CRITICAL 106 | formatter=simpleFormatter 107 | args=("/opt/azurehpc/slurm/logs/resume.log", "a", 1024 * 1024 * 5, 5) 108 | 109 | [handler_suspendHandler] 110 | class=logging.handlers.RotatingFileHandler 111 | level=CRITICAL 112 | formatter=simpleFormatter 113 | args=("/opt/azurehpc/slurm/logs/suspend.log", "a", 1024 * 1024 * 5, 5) 114 | 115 | [handler_resume_failHandler] 116 | class=logging.handlers.RotatingFileHandler 117 | level=CRITICAL 118 | formatter=simpleFormatter 119 | args=("/opt/azurehpc/slurm/logs/resume_fail.log", "a", 1024 * 1024 * 5, 5) 120 | 121 | [formatter_simpleFormatter] 122 | format=%(asctime)s %(context)s %(levelname)s: %(message)s 123 | 124 | [formatter_reproFormatter] 125 | format=%(message)s 126 | -------------------------------------------------------------------------------- /azure-slurm/install.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # 5 | set -e 6 | 7 | find_python3() { 8 | export PATH=$(echo $PATH | sed -e 's/\/opt\/cycle\/jetpack\/system\/embedded\/bin://g' | sed -e 's/:\/opt\/cycle\/jetpack\/system\/embedded\/bin//g') 9 | if [ ! -z $AZSLURM_PYTHON_PATH ]; then 10 | echo $AZSLURM_PYTHON_PATH 11 | return 0 12 | fi 13 | for version in $( seq 11 20 ); do 14 | which python3.$version 15 | if [ $? == 0 ]; then 16 | return 0 17 | fi 18 | done 19 | echo Could not find python3 version 3.11 >&2 20 | return 1 21 | } 22 | 23 | setup_venv() { 24 | 25 | set -e 26 | 27 | $PYTHON_PATH -c "import sys; sys.exit(0)" || (echo "$PYTHON_PATH is not a valid python3 executable. Please install python3.11 or higher." && exit 1) 28 | $PYTHON_PATH -m pip --version > /dev/null || $PYTHON_PATH -m ensurepip 29 | $PYTHON_PATH -m venv $VENV 30 | 31 | set +e 32 | source $VENV/bin/activate 33 | set -e 34 | 35 | # ensure wheel is installed 36 | python3 -m pip install wheel 37 | python3 -m pip install parallel-ssh 38 | # upgrade venv with packages from intallation 39 | python3 -m pip install --upgrade --no-deps packages/* 40 | 41 | # Create azslurm executable 42 | # NOTE: dynamically generated due to the SCALELIB_LOG_USER and SCALELIB_LOG_GROUP 43 | cat > $VENV/bin/azslurm < $VENV/bin/azslurmd < /etc/profile.d/azslurm_autocomplete.sh< /dev/null 2>&1 || export PATH=$PATH:/root/bin 83 | eval "\$(/opt/azurehpc/slurm/venv/bin/register-python-argcomplete azslurm)" || echo "Warning: Autocomplete is disabled" 1>&2 84 | EOF 85 | fi 86 | 87 | azslurm -h 2>&1 > /dev/null || exit 1 88 | } 89 | 90 | setup_install_dir() { 91 | mkdir -p $INSTALL_DIR/logs 92 | cp logging.conf $INSTALL_DIR/ 93 | cp sbin/*.sh $INSTALL_DIR/ 94 | chown slurm:slurm $INSTALL_DIR/*.sh 95 | chmod +x $INSTALL_DIR/*.sh 96 | } 97 | 98 | init_azslurm_config() { 99 | which jetpack || (echo "Jetpack is not installed. Please run this from a CycleCloud node, or pass in --no-jetpack if you intend to install this outside of CycleCloud provisioned nodes." 
&& exit 1) 100 | 101 | $INSTALL_DIR/init-config.sh \ 102 | --url "$(jetpack config cyclecloud.config.web_server)" \ 103 | --cluster-name "$(jetpack config cyclecloud.cluster.name)" \ 104 | --username $(jetpack config cyclecloud.config.username) \ 105 | --password $(jetpack config cyclecloud.config.password) \ 106 | --accounting-subscription-id $(jetpack config azure.metadata.compute.subscriptionId) 107 | } 108 | 109 | setup_azslurmd() { 110 | cat > /etc/systemd/system/azslurmd.service < /dev/null || echo slurm) 148 | export SCALELIB_LOG_GROUP=$(jetpack config slurm.group.name 2>/dev/null || echo slurm) 149 | # Set this globally before running main. 150 | export PYTHON_PATH=$(find_python3) 151 | export PATH=$PATH:/root/bin 152 | 153 | while (( "$#" )); do 154 | case "$1" in 155 | --no-jetpack) 156 | NO_JETPACK=1 157 | shift 158 | ;; 159 | --help) 160 | echo "Usage: $0 [--no-jetpack]" 161 | exit 0 162 | ;; 163 | -*|--*=) 164 | echo "Unknown option $1" >&2 165 | exit 1 166 | ;; 167 | *) 168 | echo "Unknown option $1" >&2 169 | exit 1 170 | ;; 171 | esac 172 | done 173 | } 174 | 175 | main() { 176 | # create the venv and make sure azslurm is in the path 177 | setup_venv 178 | # setup the install dir - logs and logging.conf, some permissions. 179 | setup_install_dir 180 | # setup the azslurmd but do not start it. 181 | setup_azslurmd 182 | # If there is no jetpack, we have to stop here. 183 | # The user has to run $INSTALL_DIR/init-config.sh with the appropriate arguments, and then $INSTALL_DIR/post-install.sh 184 | if [ $NO_JETPACK == 1 ]; then 185 | no_jetpack 186 | exit 0 187 | fi 188 | 189 | # we have jetpack, so let's automate init-config.sh and post-install.sh 190 | init_azslurm_config 191 | echo Running $INSTALL_DIR/post-install.sh 192 | $INSTALL_DIR/post-install.sh $SCALELIB_LOG_GROUP $SCALELIB_LOG_USER 193 | } 194 | 195 | require_root() { 196 | if [ $(whoami) != root ]; then 197 | echo "Please run as root" 198 | exit 1 199 | fi 200 | } 201 | 202 | 203 | parse_args_set_variables() { 204 | export SCHEDULER=slurm 205 | export VENV=/opt/azurehpc/slurm/venv 206 | export INSTALL_DIR=$(dirname $VENV) 207 | export NO_JETPACK=0 208 | # if jetpack doesn't exist or this is not defined, it will silently use slurm as default 209 | export SCALELIB_LOG_USER=$(jetpack config slurm.user.name 2> /dev/null || echo slurm) 210 | export SCALELIB_LOG_GROUP=$(jetpack config slurm.group.name 2>/dev/null || echo slurm) 211 | # Set this globally before running main. 212 | export PYTHON_PATH=$(find_python3) 213 | export PATH=$PATH:/root/bin 214 | 215 | while (( "$#" )); do 216 | case "$1" in 217 | --no-jetpack) 218 | NO_JETPACK=1 219 | shift 220 | ;; 221 | --help) 222 | echo "Usage: $0 [--no-jetpack]" 223 | exit 0 224 | ;; 225 | -*|--*=) 226 | echo "Unknown option $1" >&2 227 | exit 1 228 | ;; 229 | *) 230 | echo "Unknown option $1" >&2 231 | exit 1 232 | ;; 233 | esac 234 | done 235 | } 236 | 237 | require_root 238 | parse_args_set_variables $@ 239 | main 240 | echo Installation complete. 
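Note on the install flow above: when install.sh is run with --no-jetpack (i.e. on a node that was not provisioned by CycleCloud), it stops after creating the venv, the install directory and the azslurmd unit, and the configuration and post-install steps must be run by hand, as the script's no-jetpack branch indicates. The following is a minimal sketch of that manual flow, assuming the azure-slurm package has already been unpacked on the node and using placeholder values for the CycleCloud URL, cluster name and credentials:

    # run from the unpacked azure-slurm package directory, as root
    ./install.sh --no-jetpack
    # generate /opt/azurehpc/slurm/autoscale.json (all values below are placeholders)
    /opt/azurehpc/slurm/init-config.sh \
        --url https://cyclecloud.example.com \
        --cluster-name my-cluster \
        --username my-cc-user \
        --password 'my-cc-password'
    # run azslurm scale and restart azslurmd; the group/user arguments default to slurm when omitted
    /opt/azurehpc/slurm/post-install.sh slurm slurm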
-------------------------------------------------------------------------------- /azure-slurm/package.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import configparser 3 | import glob 4 | import pip 5 | import os 6 | import shutil 7 | import sys 8 | import tarfile 9 | import tempfile 10 | from argparse import Namespace 11 | from subprocess import check_call 12 | from typing import Dict, List, Optional 13 | 14 | SCALELIB_VERSION = "1.0.7" 15 | CYCLECLOUD_API_VERSION = "8.7.1" 16 | 17 | 18 | def build_sdist() -> str: 19 | check_call([sys.executable, "setup.py", "sdist"]) 20 | # sometimes this is azure-slurm, sometimes it is azure_slurm, depenends on the build system version. 21 | sdists = glob.glob("dist/azure*slurm-*.tar.gz") 22 | assert len(sdists) == 1, f"Found %d sdist packages, expected 1 - see {os.path.abspath('dist/azure-slurm-*.tar.gz')}" % len(sdists) 23 | path = sdists[0] 24 | fname = os.path.basename(path) 25 | dest = os.path.join("libs", fname) 26 | if os.path.exists(dest): 27 | os.remove(dest) 28 | shutil.move(path, dest) 29 | return fname 30 | 31 | 32 | def get_cycle_libs(args: Namespace) -> List[str]: 33 | ret = [build_sdist()] 34 | 35 | scalelib_file = "cyclecloud-scalelib-{}.tar.gz".format(SCALELIB_VERSION) 36 | cyclecloud_api_file = "cyclecloud_api-{}-py2.py3-none-any.whl".format( 37 | CYCLECLOUD_API_VERSION 38 | ) 39 | # swagger_file = "swagger-client-1.0.0.tar.gz" 40 | 41 | scalelib_url = "https://github.com/Azure/cyclecloud-scalelib/archive/refs/tags/{}.tar.gz".format( 42 | SCALELIB_VERSION 43 | ) 44 | 45 | cyclecloud_api_url = f"https://github.com/Azure/cyclecloud-scalelib/releases/download/{SCALELIB_VERSION}/cyclecloud_api-{CYCLECLOUD_API_VERSION}-py2.py3-none-any.whl" 46 | to_download = { 47 | scalelib_file: (args.scalelib, scalelib_url), 48 | cyclecloud_api_file: (args.cyclecloud_api, cyclecloud_api_url), 49 | # swagger_file: (args.swagger, None) 50 | } 51 | 52 | for lib_file in to_download: 53 | arg_override, url = to_download[lib_file] 54 | if arg_override: 55 | if not os.path.exists(arg_override): 56 | print(arg_override, "does not exist", file=sys.stderr) 57 | sys.exit(1) 58 | fname = os.path.basename(arg_override) 59 | orig = os.path.abspath(arg_override) 60 | dest = os.path.abspath(os.path.join("libs", fname)) 61 | if orig != dest: 62 | shutil.copyfile(orig, dest) 63 | ret.append(fname) 64 | else: 65 | dest = os.path.join("libs", lib_file) 66 | check_call(["curl", "-L", "-k", "-s", "-f", "-o", dest, url]) 67 | ret.append(lib_file) 68 | print("Downloaded", lib_file, "to") 69 | 70 | return ret 71 | 72 | 73 | def execute() -> None: 74 | expected_cwd = os.path.abspath(os.path.dirname(__file__)) 75 | os.chdir(expected_cwd) 76 | 77 | print("Running from", expected_cwd) 78 | 79 | if not os.path.exists("libs"): 80 | os.makedirs("libs") 81 | 82 | argument_parser = argparse.ArgumentParser( 83 | "Builds Azure Slurm project with all dependencies.\n" 84 | + "If you don't specify local copies of scalelib or cyclecloud-api they will be downloaded from github." 
85 | ) 86 | argument_parser.add_argument("--scalelib", default=None) 87 | # argument_parser.add_argument("--swagger", default=None) 88 | argument_parser.add_argument("--cyclecloud-api", default=None) 89 | args = argument_parser.parse_args() 90 | 91 | cycle_libs = get_cycle_libs(args) 92 | 93 | parser = configparser.ConfigParser() 94 | ini_path = os.path.abspath("../project.ini") 95 | 96 | with open(ini_path) as fr: 97 | parser.read_file(fr) 98 | 99 | version = parser.get("project", "version") 100 | if not version: 101 | raise RuntimeError("Missing [project] -> version in {}".format(ini_path)) 102 | 103 | if not os.path.exists("dist"): 104 | os.makedirs("dist") 105 | 106 | tf = tarfile.TarFile.gzopen( 107 | "dist/azure-slurm-pkg-{}.tar.gz".format(version), "w" 108 | ) 109 | 110 | build_dir = tempfile.mkdtemp("azure-slurm") 111 | 112 | def _add(name: str, path: Optional[str] = None, mode: Optional[int] = None) -> None: 113 | path = path or name 114 | tarinfo = tarfile.TarInfo("azure-slurm/" + name) 115 | tarinfo.size = os.path.getsize(path) 116 | tarinfo.mtime = int(os.path.getmtime(path)) 117 | if mode: 118 | tarinfo.mode = mode 119 | 120 | with open(path, "rb") as fr: 121 | tf.addfile(tarinfo, fr) 122 | 123 | packages = [] 124 | for dep in cycle_libs: 125 | dep_path = os.path.abspath(os.path.join("libs", dep)) 126 | _add("packages/" + dep, dep_path) 127 | packages.append(dep_path) 128 | mypip = shutil.which("pip3") 129 | print("my pip", mypip) 130 | check_call([mypip, "download"] + packages, cwd=build_dir) 131 | 132 | print("Using build dir", build_dir) 133 | by_package: Dict[str, List[str]] = {} 134 | for fil in os.listdir(build_dir): 135 | toks = fil.split("-", 1) 136 | package = toks[0] 137 | if package == "cyclecloud": 138 | package = "{}-{}".format(toks[0], toks[1]) 139 | if package not in by_package: 140 | by_package[package] = [] 141 | by_package[package].append(fil) 142 | 143 | for package, fils in by_package.items(): 144 | 145 | if len(fils) > 1: 146 | print("WARNING: Ignoring duplicate package found:", package, fils) 147 | assert False 148 | 149 | for fil in os.listdir(build_dir): 150 | if "pyyaml" in fil.lower(): 151 | print(f"WARNING: Ignoring unnecessary PyYaml {fil}, also it is platform (ubuntu/rhel) specific.") 152 | continue 153 | path = os.path.join(build_dir, fil) 154 | _add("packages/" + fil, path) 155 | 156 | _add("install.sh", "install.sh", mode=os.stat("install.sh")[0]) 157 | _add("sbin/init-config.sh", "sbin/init-config.sh", mode=os.stat("sbin/init-config.sh")[0]) 158 | _add("sbin/post-install.sh", "sbin/post-install.sh", mode=os.stat("sbin/post-install.sh")[0]) 159 | _add("sbin/resume_fail_program.sh", "sbin/resume_fail_program.sh") 160 | _add("sbin/prolog.sh", "sbin/prolog.sh") 161 | _add("sbin/resume_program.sh", "sbin/resume_program.sh") 162 | _add("sbin/return_to_idle.sh", "sbin/return_to_idle.sh") 163 | _add("sbin/return_to_idle_legacyfin.sh", "sbin/return_to_idle_legacy.sh") 164 | _add("sbin/suspend_program.sh", "sbin/suspend_program.sh") 165 | _add("sbin/get_acct_info.sh", "sbin/get_acct_info.sh") 166 | _add("logging.conf", "conf/logging.conf") 167 | 168 | 169 | 170 | if __name__ == "__main__": 171 | execute() 172 | -------------------------------------------------------------------------------- /azure-slurm/package.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | cd $(dirname $0)/ 4 | 5 | 6 | if [ ! 
-e libs ]; then 7 | mkdir libs 8 | fi 9 | 10 | LOCAL_SCALELIB=$1 11 | 12 | rm -f dist/* 13 | 14 | if [ "$LOCAL_SCALELIB" == "" ]; then 15 | # we are using released versions of scalelib 16 | python3.11 package.py 17 | else 18 | pushd $LOCAL_SCALELIB 19 | rm -f dist/*.gz 20 | # python3 setup.py swagger 21 | python3.11 setup.py sdist 22 | popd 23 | # swagger=`ls $LOCAL_SCALELIB/dist/swagger*.gz` 24 | scalelib=`ls $LOCAL_SCALELIB/dist/cyclecloud-scalelib*.gz` 25 | # python3 package.py --scalelib $scalelib --swagger $swagger 26 | python3.11 package.py --scalelib $scalelib 27 | fi 28 | -------------------------------------------------------------------------------- /azure-slurm/sbin/get_acct_info.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | source /opt/azurehpc/slurm/venv/bin/activate 3 | 4 | azslurm accounting_info --node-name $1 5 | -------------------------------------------------------------------------------- /azure-slurm/sbin/init-config.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # 5 | set -e 6 | CLUSTER_NAME= 7 | USERNAME= 8 | PASSWORD= 9 | ACCOUNTING_SUBSCRIPTION_ID=unused 10 | URL= 11 | INSTALL_DIR=/opt/azurehpc/slurm 12 | 13 | usage() { 14 | echo "Usage: $0 [--url ] [--cluster-name ] [--username ] [--password ] [--accounting-subscription-id ] [--accounting-tag-value ]" 15 | echo " --url The URL of the CycleCloud instance" 16 | echo " --cluster-name The name of the cluster" 17 | echo " --username The username for CycleCloud" 18 | echo " --password The password for CycleCloud" 19 | echo " --accounting-subscription-id The subscription ID for accounting (optional)" 20 | } 21 | 22 | 23 | while (( "$#" )); do 24 | case "$1" in 25 | --url) 26 | URL=$2 27 | shift 2 28 | ;; 29 | --cluster-name) 30 | CLUSTER_NAME=$2 31 | shift 2 32 | ;; 33 | --username) 34 | USERNAME=$2 35 | shift 2 36 | ;; 37 | --password) 38 | PASSWORD=$2 39 | shift 2 40 | ;; 41 | --accounting-subscription-id) 42 | ACCOUNTING_SUBSCRIPTION_ID=$2 43 | shift 2 44 | ;; 45 | --help) 46 | usage 47 | exit 0 48 | ;; 49 | -*|--*=) 50 | echo "Unknown option $1" >&2 51 | exit 1 52 | ;; 53 | *) 54 | echo "Unknown option $1" >&2 55 | exit 1 56 | ;; 57 | esac 58 | done 59 | 60 | if [ -z "$CLUSTER_NAME" ] || [ -z "$USERNAME" ] || [ -z "$PASSWORD" ] || [ -z "$URL" ]; then 61 | echo "Error: Missing required arguments." 62 | echo "Please provide --url, --cluster-name, --username, and --password." 63 | usage 64 | exit 1 65 | fi 66 | 67 | escaped_cluster_name=$(python3 -c "import re; print(re.sub('[^a-zA-Z0-9-]', '-', '$CLUSTER_NAME').lower())") 68 | 69 | config_dir=/sched/$escaped_cluster_name 70 | azslurm initconfig --username $USERNAME \ 71 | --password $PASSWORD \ 72 | --url $URL \ 73 | --cluster-name $CLUSTER_NAME\ 74 | --config-dir $config_dir \ 75 | --accounting-subscription-id $ACCOUNTING_SUBSCRIPTION_ID \ 76 | --default-resource '{"select": {}, "name": "slurm_gpus", "value": "node.gpu_count"}' \ 77 | --cost-cache-root $INSTALL_DIR/.cache \ 78 | > $INSTALL_DIR/autoscale.json -------------------------------------------------------------------------------- /azure-slurm/sbin/post-install.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | # 5 | set -e 6 | cd $(dirname $0) 7 | azslurm scale --no-restart 8 | SLURM_GROUP=${1:-slurm} 9 | SLURM_USER=${2:-slurm} 10 | chown -R $SLURM_GROUP:$SLURM_USER logs/ 11 | systemctl restart azslurmd 12 | -------------------------------------------------------------------------------- /azure-slurm/sbin/prolog.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | set +e 4 | 5 | log=/var/log/slurmctld/prolog_slurmctld.log 6 | script=/opt/azurehpc/slurm/get_acct_info.sh 7 | 8 | if [ -e /bin/scontrol ]; then 9 | scontrol=/bin/scontrol 10 | elif [ -e /usr/bin/scontrol ]; then 11 | scontrol=/usr/bin/scontrol 12 | fi 13 | 14 | if [ -e /bin/jq ]; then 15 | JQ=/bin/jq 16 | elif [ -e /usr/bin/jq ]; then 17 | JQ=/usr/bin/jq 18 | fi 19 | 20 | job=$SLURM_JOBID 21 | 22 | nodename=$($scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) 23 | 24 | ret=0 25 | count=0 26 | 27 | while [ $ret -eq 0 ] && [ $count -lt 5 ] 28 | do 29 | sleep 2 30 | output=$($script $nodename 2>>$log) 31 | ret=$(echo $output | $JQ '. | length') 32 | echo "DEBUG: json: " $output "ret: " $ret "job: " $SLURM_JOBID "nodename: " $nodename >> $log 33 | count=$((count+1)) 34 | done 35 | 36 | if [ $ret -eq 0 ]; then 37 | echo "ERROR: Could not process get node info for admincomment" >> $log 38 | else 39 | $scontrol update job=$job admincomment="$output" 40 | fi 41 | 42 | exit 0 -------------------------------------------------------------------------------- /azure-slurm/sbin/resume_fail_program.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | node_list=$(echo $@ | sed "s/ /,/g") 3 | source /opt/azurehpc/slurm/venv/bin/activate 4 | azslurm suspend --node-list $node_list 5 | exit $? -------------------------------------------------------------------------------- /azure-slurm/sbin/resume_program.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | node_list=$(echo $@ | sed "s/ /,/g") 3 | source /opt/azurehpc/slurm/venv/bin/activate 4 | azslurm resume --node-list $node_list 5 | exit $? -------------------------------------------------------------------------------- /azure-slurm/sbin/return_to_idle.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source /opt/azurehpc/slurm/venv/bin/activate 4 | azslurm return_to_idle -------------------------------------------------------------------------------- /azure-slurm/sbin/return_to_idle_legacy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | down_and_off=$(sinfo -O nodelist:500,statelong -h | grep down~ | cut -d" " -f1) 4 | 5 | if [ "$down_and_off" != "" ]; then 6 | echo $(date): Setting the following down~ nodes to idle~: $down_and_off 7 | scontrol update nodename=$down_and_off state=idle 8 | if [ $? != 0 ]; then 9 | echo $(date): Updating nodes failed! Command was "scontrol update nodename=$down_and_off state=idle" 10 | exit 1 11 | fi 12 | fi 13 | 14 | drained_and_off=$(sinfo -O nodelist:500,statelong -h | grep drained~ | cut -d" " -f1) 15 | 16 | if [ "$drained_and_off" != "" ]; then 17 | echo $(date): Setting the following drained~ nodes to idle~: $drained_and_off 18 | scontrol update nodename=$drained_and_off state=idle 19 | if [ $? != 0 ]; then 20 | echo $(date): Updating nodes failed! 
Command was "scontrol update nodename=$drained_and_off state=idle" 21 | exit 1 22 | fi 23 | fi 24 | -------------------------------------------------------------------------------- /azure-slurm/sbin/suspend_program.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | node_list=$(echo $@ | sed "s/ /,/g") 3 | source /opt/azurehpc/slurm/venv/bin/activate 4 | azslurm suspend --node-list $node_list 5 | exit $? -------------------------------------------------------------------------------- /azure-slurm/setup.py: -------------------------------------------------------------------------------- 1 | # test: ignore 2 | import os 3 | from subprocess import check_call 4 | from typing import List 5 | 6 | from setuptools import find_packages, setup 7 | from setuptools.command.test import Command 8 | from setuptools.command.test import test as TestCommand # noqa: N812 9 | 10 | __version__ = "4.0.3" 11 | CWD = os.path.dirname(os.path.abspath(__file__)) 12 | 13 | 14 | class PyTest(TestCommand): 15 | def finalize_options(self) -> None: 16 | TestCommand.finalize_options(self) 17 | import os 18 | 19 | xml_out = os.path.join(".", "build", "test-results", "pytest.xml") 20 | if not os.path.exists(os.path.dirname(xml_out)): 21 | os.makedirs(os.path.dirname(xml_out)) 22 | # -s is needed so py.test doesn't mess with stdin/stdout 23 | self.test_args = ["-s", "test", "--junitxml=%s" % xml_out] 24 | # needed for older setuptools to actually run this as a test 25 | self.test_suite = True 26 | 27 | def run_tests(self) -> None: 28 | # import here, cause outside the eggs aren't loaded 29 | import sys 30 | import pytest 31 | 32 | # run the tests, then the format checks. 33 | errno = pytest.main(self.test_args) 34 | if errno != 0: 35 | sys.exit(errno) 36 | 37 | check_call( 38 | ["black", "--check", "src", "test"], 39 | cwd=CWD, 40 | ) 41 | check_call( 42 | ["isort", "-c"], 43 | cwd=os.path.join(CWD, "src"), 44 | ) 45 | check_call( 46 | ["isort", "-c"], 47 | cwd=os.path.join(CWD, "test"), 48 | ) 49 | 50 | run_type_checking() 51 | 52 | sys.exit(errno) 53 | 54 | 55 | class Formatter(Command): 56 | user_options: List[str] = [] 57 | 58 | def initialize_options(self) -> None: 59 | pass 60 | 61 | def finalize_options(self) -> None: 62 | pass 63 | 64 | def run(self) -> None: 65 | check_call( 66 | ["black", "src", "test"], cwd=CWD, 67 | ) 68 | check_call( 69 | ["isort", "-y"], 70 | cwd=os.path.join(CWD, "src"), 71 | ) 72 | check_call( 73 | ["isort", "-y"], 74 | cwd=os.path.join(CWD, "test"), 75 | ) 76 | run_type_checking() 77 | 78 | 79 | def run_type_checking() -> None: 80 | check_call( 81 | [ 82 | "mypy", 83 | "--ignore-missing-imports", 84 | "--follow-imports=silent", 85 | "--show-column-numbers", 86 | "--disallow-untyped-defs", 87 | os.path.join(CWD, "test"), 88 | ] 89 | ) 90 | check_call( 91 | [ 92 | "mypy", 93 | "--ignore-missing-imports", 94 | "--follow-imports=silent", 95 | "--show-column-numbers", 96 | "--disallow-untyped-defs", 97 | os.path.join(CWD, "src"), 98 | ] 99 | ) 100 | 101 | check_call(["flake8", "--ignore=E203,E231,F405,E501,W503", "src", "test", "setup.py"]) 102 | 103 | 104 | class TypeChecking(Command): 105 | user_options: List[str] = [] 106 | 107 | def initialize_options(self) -> None: 108 | pass 109 | 110 | def finalize_options(self) -> None: 111 | pass 112 | 113 | def run(self) -> None: 114 | run_type_checking() 115 | 116 | 117 | setup( 118 | name="azure-slurm", 119 | version=__version__, 120 | packages=find_packages(), 121 | #package_dir={"": 
"slurmcc"}, 122 | package_data={ 123 | "azure-slurm": [ 124 | "BUILD_NUMBER", 125 | "private-requirements.json", 126 | "../NOTICE", 127 | "../notices", 128 | ] 129 | }, 130 | install_requires=["typing_extensions==3.7.4.3", "zipp==3.6", "tabulate", "python-daemon"], 131 | tests_require=["pytest==3.2.3"], 132 | cmdclass={"test": PyTest, "format": Formatter, "types": TypeChecking}, 133 | url="http://www.cyclecomputing.com", 134 | maintainer="Cycle Computing", 135 | maintainer_email="support@cyclecomputing.com", 136 | ) 137 | -------------------------------------------------------------------------------- /azure-slurm/slurmcc/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | # 4 | import os 5 | import random 6 | from typing import Any, Callable, Optional 7 | 8 | try: 9 | from requests.exceptions import ConnectionError 10 | except ImportError: 11 | # this is only used during live testing with scalelib, so this should never happen 12 | import logging 13 | logging.exception("Failed to import ConnectionError from requests.exceptions") 14 | ConnectionError = RuntimeError 15 | 16 | 17 | class AzureSlurmError(RuntimeError): 18 | pass 19 | 20 | 21 | def custom_chaos_mode(action: Callable) -> Callable: 22 | def wrapped(func: Callable) -> Any: 23 | return chaos_mode(func, action) 24 | 25 | return wrapped 26 | 27 | 28 | def chaos_mode(func: Callable, action: Optional[Callable] = None) -> Callable: 29 | def default_action() -> Any: 30 | raise random.choice( 31 | [RuntimeError, ValueError, ConnectionError] 32 | )("Random failure") 33 | 34 | action = action or default_action 35 | 36 | def wrapped(*args: Any, **kwargs: Any) -> Any: 37 | if is_chaos_mode(): 38 | return action or default_action() 39 | 40 | return func(*args, **kwargs) 41 | 42 | return wrapped 43 | 44 | 45 | def is_chaos_mode() -> bool: 46 | return random.random() < float(os.getenv("AZURE_SLURM_CHAOS_MODE", 0)) 47 | -------------------------------------------------------------------------------- /azure-slurm/slurmcc/azslurmd.py: -------------------------------------------------------------------------------- 1 | from slurmcc import allocation, cli, util 2 | import abc 3 | import os 4 | import time 5 | from hpc.autoscale.node.node import Node 6 | from hpc.autoscale.util import load_config 7 | from hpc.autoscale import hpclogging as logging 8 | 9 | 10 | class NodeSource(abc.ABC): 11 | @abc.abstractmethod 12 | def get_nodes(self) -> list[Node]: ... 
13 | 14 | 15 | class NodeManagerSource(NodeSource): 16 | def get_nodes(self) -> list[Node]: 17 | node_mgr = cli.new_node_manager() 18 | return node_mgr.get_nodes() 19 | 20 | 21 | class AzslurmDaemon: 22 | 23 | def __init__(self, node_source: NodeSource) -> None: 24 | self.node_source = node_source 25 | self.sync_nodes = allocation.SyncNodes() 26 | keep_alive_conf = os.path.realpath("/etc/slurm/keep_alive.conf") 27 | self.slurm_nodes = allocation.SlurmNodes(allocation.SuspendExcNodesSerializer(keep_alive_conf)) 28 | 29 | def run_once(self) -> None: 30 | start = time.time() 31 | logging.debug("begin azslurmd") 32 | self.slurm_nodes.refresh() 33 | self.converge_nodes() 34 | end = time.time() 35 | duration = end - start 36 | logging.info("Completed azslurmd in %.1fs" % duration) 37 | 38 | def converge_nodes(self) -> None: 39 | cc_nodes = self.node_source.get_nodes() 40 | # follow the symlink 41 | self.sync_nodes.sync_nodes(self.slurm_nodes, cc_nodes) 42 | 43 | 44 | @cli.init_power_saving_log 45 | def azslurmd(sleep_time: int = 15) -> None: 46 | """Run the main loop of the azslurm daemon. Writes log to azslurmd.log under /opt/azurehpc/slurm/logs""" 47 | logging.info("azslurmd is running with PID=%s", os.getpid()) 48 | azslurm_daemon = AzslurmDaemon(NodeManagerSource()) 49 | while True: 50 | try: 51 | azslurm_daemon.run_once() 52 | time.sleep(sleep_time) 53 | except InterruptedError: 54 | logging.warning("azslurmd received an interrupt") 55 | return 56 | except Exception: 57 | logging.exception("azslurmd hit an exception - sleeping") 58 | time.sleep(sleep_time) 59 | 60 | 61 | def run(config_path: str) -> None: 62 | config = load_config(config_path) 63 | logging.set_context("[azslurmd]") 64 | logging.initialize_logging(config) 65 | sleep_time = (config.get("azslurmd") or {}).get("sleep_time") or 15 66 | azslurmd(max(1, sleep_time)) 67 | -------------------------------------------------------------------------------- /azure-slurm/slurmcc/azslurmdwrapper.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import daemon 3 | import daemon.pidfile 4 | import logging 5 | import os 6 | import sys 7 | from slurmcc import azslurmd 8 | 9 | PID_FILE = os.environ.get("AZSLURM_PID_FILE", "/opt/azurehpc/slurm/azslurm.pid") 10 | 11 | 12 | def main() -> None: 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("--config", "-c", default="/opt/azurehpc/slurm/autoscale.json", help="Path to the configuration file") 15 | parser.add_argument("--foreground", "-f", action="store_true", default=False, help="Run in the foreground") 16 | args = parser.parse_args() 17 | 18 | if args.foreground: 19 | logging.basicConfig( 20 | level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" 21 | ) 22 | logging.info("Starting azslurmd in the foreground") 23 | return azslurmd.run(args.config) 24 | 25 | with daemon.DaemonContext(stdout=sys.stdout, stderr=sys.stderr, pidfile=daemon.pidfile.PIDLockFile(PID_FILE)): 26 | azslurmd.run(args.config) 27 | 28 | 29 | if __name__ == "__main__": 30 | main() 31 | -------------------------------------------------------------------------------- /azure-slurm/test/slurmcc_test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/cyclecloud-slurm/5e2514620b752ea6e194fd7b55f2d6d204e20e53/azure-slurm/test/slurmcc_test/__init__.py --------------------------------------------------------------------------------
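A note on running the daemon wrapper above: azslurmdwrapper daemonizes the azslurmd loop via python-daemon and writes its PID to /opt/azurehpc/slurm/azslurm.pid (overridable through AZSLURM_PID_FILE); in normal operation the azslurmd service set up by install.sh manages it. For debugging it can also be launched directly in the foreground. A minimal sketch, assuming the venv created by install.sh at /opt/azurehpc/slurm/venv has the slurmcc package installed:

    # run the azslurmd loop in the foreground, as root, logging to the console
    /opt/azurehpc/slurm/venv/bin/python -m slurmcc.azslurmdwrapper \
        --foreground --config /opt/azurehpc/slurm/autoscale.json
    # normal operation goes through the service installed by install.sh
    systemctl status azslurmd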
/azure-slurm/test/slurmcc_test/allocation_test.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List, Tuple 2 | 3 | from hpc.autoscale import util as hpcutil 4 | from hpc.autoscale.ccbindings.mock import MockClusterBinding 5 | from hpc.autoscale.node.node import Node 6 | from slurmcc import allocation 7 | from slurmcc import partition 8 | from slurmcc import util as slutil 9 | from slurmcc.partition import fetch_partitions 10 | 11 | from . import testutil 12 | 13 | import logging 14 | import pytest 15 | import sys 16 | logging.basicConfig(level=logging.DEBUG, format="%(message)s", stream=sys.stderr) 17 | 18 | 19 | class MockSyncNodes(allocation.SyncNodes): 20 | def __init__(self, ) -> None: 21 | super().__init__() 22 | 23 | def sync_nodes( 24 | self, slurm_nodes: allocation.SlurmNodes, cyclecloud_nodes: List[Node] 25 | ) -> Tuple[Dict, List[Node]]: 26 | return super().sync_nodes(slurm_nodes, cyclecloud_nodes) 27 | 28 | 29 | def setup(): 30 | slutil.TEST_MODE = True 31 | 32 | 33 | def teardown(): 34 | slutil.TEST_MODE = False 35 | 36 | 37 | def test_basic_resume() -> None: 38 | node_mgr = testutil.make_test_node_manager() 39 | bindings: MockClusterBinding = node_mgr.cluster_bindings # type: ignore 40 | partitions = fetch_partitions(node_mgr) 41 | native_cli = testutil.make_native_cli() 42 | node_list = allocation.SlurmNodes(["hpc-1", "hpc-2", "htc-1"]) 43 | 44 | bootup_result = allocation.resume(testutil.CONFIG, node_mgr, node_list, partitions) 45 | assert bootup_result 46 | assert bootup_result.nodes 47 | assert len(bootup_result.nodes) == 3 48 | assert list(node_list) == [n.name for n in bootup_result.nodes] 49 | assert 3 == len(bindings.get_nodes().nodes) 50 | by_name = hpcutil.partition_single(bindings.get_nodes().nodes, lambda n: n["Name"]) 51 | assert by_name["hpc-1"]["PlacementGroupId"] 52 | assert by_name["hpc-2"]["PlacementGroupId"] 53 | assert not by_name["htc-1"]["PlacementGroupId"] 54 | 55 | def get_latest_nodes() -> List[Node]: 56 | new_node_mgr = testutil.refresh_test_node_manager(node_mgr) 57 | return new_node_mgr.get_nodes() 58 | 59 | assert 3 == len(get_latest_nodes()) 60 | 61 | sync_nodes = MockSyncNodes() 62 | states, ready = sync_nodes.sync_nodes(node_list, get_latest_nodes()) 63 | assert len(ready) == 0 64 | 65 | bindings.assign_ip(["hpc-2", "htc-1"]) 66 | states, ready = sync_nodes.sync_nodes(node_list, get_latest_nodes()) 67 | assert len(ready) == 0 68 | assert native_cli.slurm_nodes["hpc-2"]["NodeAddr"] == "hpc-2" 69 | assert native_cli.slurm_nodes["htc-1"]["NodeAddr"] == "10.1.0.3" 70 | 71 | bindings.update_state("Ready", ["hpc-2"]) 72 | states, ready = sync_nodes.sync_nodes(node_list, get_latest_nodes()) 73 | assert ["hpc-2"] == [n.name for n in ready] 74 | 75 | bindings.update_state("Ready", ["hpc-1", "htc-1"]) 76 | states, ready = sync_nodes.sync_nodes(node_list, get_latest_nodes()) 77 | # hpc-1 should not be ready - it still has no ip address 78 | assert ["hpc-2", "htc-1"] == [n.name for n in ready] 79 | 80 | bindings.assign_ip(["hpc-1"]) 81 | states, ready = sync_nodes.sync_nodes(node_list, get_latest_nodes()) 82 | # hpc-1 should not be ready - it still has no ip address 83 | assert list(node_list) == [n.name for n in ready] 84 | 85 | 86 | def test_mixed_resume_names() -> None: 87 | node_mgr = testutil.make_test_node_manager() 88 | node_list = allocation.SlurmNodes(["hpc-4", "hpc-20"]) 89 | partitions = fetch_partitions(node_mgr) 90 | 91 | bootup_result = allocation.resume(testutil.CONFIG, 
node_mgr, node_list, partitions) 92 | assert bootup_result 93 | assert bootup_result.nodes 94 | assert len(bootup_result.nodes) == 2 95 | assert list(node_list) == [n.name for n in bootup_result.nodes] 96 | 97 | 98 | @pytest.fixture(autouse=True) 99 | def mock_is_slurmctld_up() -> None: 100 | implementation = slutil.is_slurmctld_up 101 | slutil.is_slurmctld_up = lambda: True 102 | yield 103 | slutil.is_slurmctld_up = implementation 104 | 105 | 106 | def test_resume_dynamic_by_feature() -> None: 107 | 108 | 109 | node_mgr = testutil.make_test_node_manager() 110 | bindings: MockClusterBinding = node_mgr.cluster_bindings # type: ignore 111 | native_cli = testutil.make_native_cli() 112 | native_cli.create_nodes(["mydynamic-1"], features=["dyn"]) 113 | 114 | partitions = partition.fetch_partitions(node_mgr, include_dynamic=True) 115 | assert 3 == len(partitions) 116 | 117 | result = allocation.resume(testutil.CONFIG, node_mgr, ["mydynamic-1"], partitions) 118 | assert result 119 | assert result.nodes 120 | assert len(result.nodes) == 1 121 | assert result.nodes[0].name == "mydynamic-1" 122 | 123 | assert 1 == len(bindings.get_nodes().nodes) 124 | 125 | bindings.add_bucket( 126 | nodearray_name="dynamic", 127 | vm_size="Standard_F4", 128 | max_count=100, 129 | available_count=100, 130 | ) 131 | 132 | node_mgr = testutil.refresh_test_node_manager(node_mgr) 133 | assert 4 == len(partition.fetch_partitions(node_mgr, include_dynamic=True)) 134 | 135 | native_cli.create_nodes(["f2-1"], features=["dyn", "Standard_F2"]) 136 | native_cli.create_nodes(["f4-1"], features=["dyn", "Standard_F4"]) 137 | 138 | result = allocation.resume(testutil.CONFIG, node_mgr, ["f2-1", "f4-1"], fetch_partitions(node_mgr)) 139 | assert result 140 | assert result.nodes 141 | assert 2 == len(result.nodes) 142 | 143 | 144 | def test_failure_mode() -> None: 145 | node_mgr = testutil.make_test_node_manager() 146 | bindings: MockClusterBinding = node_mgr.cluster_bindings # type: ignore 147 | node_list = allocation.SlurmNodes(["hpc-1", "hpc-2", "htc-1"]) 148 | partitions = fetch_partitions(node_mgr) 149 | native_cli = testutil.make_native_cli() 150 | 151 | bootup_result = allocation.resume(testutil.CONFIG, node_mgr, node_list, partitions) 152 | assert bootup_result 153 | assert bootup_result.nodes 154 | assert len(bootup_result.nodes) == 3 155 | assert list(node_list) == [n.name for n in bootup_result.nodes] 156 | 157 | def get_latest_nodes() -> List[Node]: 158 | new_node_mgr = testutil.refresh_test_node_manager(node_mgr) 159 | return new_node_mgr.get_nodes() 160 | 161 | sync_nodes = MockSyncNodes() 162 | states, ready = sync_nodes.sync_nodes(node_list, get_latest_nodes()) 163 | assert len(ready) == 0 164 | 165 | bindings.assign_ip(node_list) 166 | states, ready = sync_nodes.sync_nodes(node_list, get_latest_nodes()) 167 | 168 | assert native_cli.slurm_nodes["htc-1"]["NodeAddr"] == "10.1.0.4" 169 | 170 | # make sure new IPs are assigned - cyclecloud often does this 171 | bindings.assign_ip(["htc-1"]) 172 | states, ready = sync_nodes.sync_nodes(node_list, get_latest_nodes()) 173 | assert native_cli.slurm_nodes["htc-1"]["NodeAddr"] == "10.1.0.5" 174 | 175 | # make sure we unassign the IP for a failed node 176 | bindings.update_state("Failed", ["htc-1"]) 177 | states, ready = sync_nodes.sync_nodes(node_list, get_latest_nodes()) 178 | assert native_cli.slurm_nodes["htc-1"]["NodeAddr"] == "htc-1", states 179 | 180 | 181 | def test_keep_alive() -> None: 182 | native_cli = testutil.make_native_cli() 183 | # add the node outside of 
azslurmd 184 | native_cli.scontrol(["update", "suspendexcnodes+=htc-1"]) 185 | sn = allocation.SlurmNodes([], slutil.get_current_suspend_exc_nodes()) 186 | 187 | # node is in suspend_exc and remains there, even if we try to remove it 188 | assert sn.is_suspend_exc("htc-1") 189 | sn.unsuspend_exc_node("htc-1") 190 | assert sn.is_suspend_exc("htc-1") 191 | 192 | # now _we_ have suspended this node (i.e. KeepAlive=true was seen) 193 | sn.suspend_exc_node("htc-1") 194 | assert sn.is_suspend_exc("htc-1") 195 | # so now we can remove it - and it is NOT suspended 196 | sn.unsuspend_exc_node("htc-1") 197 | assert not sn.is_suspend_exc("htc-1") 198 | -------------------------------------------------------------------------------- /azure-slurm/test/slurmcc_test/azslurmd_test.py: -------------------------------------------------------------------------------- 1 | from slurmcc import azslurmd 2 | from slurmcc import util as slutil 3 | from slurmcc_test import testutil 4 | from slurmcc.azslurmd import NodeSource 5 | from hpc.autoscale.node.node import Node 6 | from hpc.autoscale.node.nodemanager import NodeManager 7 | from typing import Optional 8 | import pytest 9 | 10 | 11 | class MockNodeSource(NodeSource): 12 | def __init__(self, bindings: testutil.MockClusterBinding) -> None: 13 | self.node_mgr = testutil.make_test_node_manager_with_bindings(bindings) 14 | 15 | def get_nodes(self) -> list[Node]: 16 | return self.node_mgr.get_nodes() 17 | 18 | 19 | class MockNode: 20 | def __init__( 21 | self, 22 | name: str, 23 | status: str, 24 | private_ip: Optional[str] = None, 25 | software_configuration: Optional[dict] = None, 26 | ) -> None: 27 | self.name = name 28 | self.status = status 29 | self.state = status 30 | self.target_state = "Ready" 31 | self.private_ip = private_ip 32 | self.software_configuration = software_configuration or {} 33 | 34 | 35 | @pytest.fixture() 36 | def bindings(setup_slurm: None) -> testutil.MockClusterBinding: 37 | return testutil.make_test_node_manager().cluster_bindings 38 | 39 | 40 | @pytest.fixture() 41 | def setup_slurm() -> None: 42 | testutil.make_native_cli(False) 43 | slutil.SLURM_CLI.create_nodes(["hpc-1", "hpc-2"], features=["CLOUD"]) 44 | assert slutil.SLURM_CLI.slurm_nodes["hpc-1"]["State"] == "idle" 45 | return None 46 | 47 | 48 | def test_noop(bindings: testutil.MockClusterBinding) -> None: 49 | # return an empty list of nodes - nothing to do 50 | azslurmd.AzslurmDaemon(MockNodeSource(bindings)).run_once() 51 | 52 | 53 | def test_one_ready_node(bindings: testutil.MockClusterBinding) -> None: 54 | bindings.add_node( 55 | name="hpc-1", nodearray="hpc", state="Ready", target_state="Started" 56 | ) 57 | 58 | azslurmd.AzslurmDaemon(MockNodeSource(bindings)).run_once() 59 | 60 | 61 | def test_one_not_ready_node_no_ip(bindings: testutil.MockClusterBinding) -> None: 62 | bindings.add_node( 63 | name="hpc-1", nodearray="hpc", state="Allocation", target_state="Started" 64 | ) 65 | azslurmd.AzslurmDaemon(MockNodeSource(bindings)).run_once() 66 | 67 | 68 | def test_one_not_ready_node_with_ip(bindings: testutil.MockClusterBinding) -> None: 69 | bindings.add_node( 70 | name="hpc-1", nodearray="hpc", state="Allocation", target_state="Started" 71 | ) 72 | 73 | bindings.assign_ip(["hpc-1"]) 74 | azslurmd.AzslurmDaemon(MockNodeSource(bindings)).run_once() 75 | 76 | 77 | def test_one_ready_node_with_ip(bindings: testutil.MockClusterBinding) -> None: 78 | bindings.add_node( 79 | name="hpc-1", nodearray="hpc", state="Ready", target_state="Started" 80 | ) 81 | 
bindings.assign_ip(["hpc-1"]) 82 | azslurmd.AzslurmDaemon(MockNodeSource(bindings)).run_once() 83 | 84 | 85 | def test_one_ready_node_with_ip_internal_dns( 86 | bindings: testutil.MockClusterBinding, 87 | ) -> None: 88 | from slurmcc import allocation 89 | allocation.BEGIN_TEST = True 90 | node = bindings.add_node( 91 | name="hpc-1", nodearray="hpc", state="Ready", target_state="Started" 92 | ) 93 | bindings.assign_ip(["hpc-1"]) 94 | node.software_configuration["slurm"]["use_nodename_as_hostname"] = False 95 | assert not node.software_configuration["slurm"]["use_nodename_as_hostname"] 96 | azslurmd.AzslurmDaemon(MockNodeSource(bindings)).run_once() 97 | assert node.private_ip, "node should have a private IP address" 98 | assert ( 99 | slutil.SLURM_CLI.slurm_nodes["hpc-1"]["NodeAddr"] == node.private_ip 100 | ), f"NodeAddr should be the private IP address - got {slutil.SLURM_CLI.slurm_nodes['hpc-1']['NodeAddr']}" 101 | 102 | 103 | 104 | def test_one_failed_node_with_ip(bindings: testutil.MockClusterBinding) -> None: 105 | hpc1 = bindings.add_node( 106 | name="hpc-1", nodearray="hpc", state="Allocation", target_state="Started" 107 | ) 108 | bindings.assign_ip(["hpc-1"]) 109 | hpc1.software_configuration["slurm"]["use_nodename_as_hostname"] = False 110 | 111 | azslurmd.AzslurmDaemon(MockNodeSource(bindings)).run_once() 112 | assert slutil.SLURM_CLI.slurm_nodes["hpc-1"]["State"] == "idle" 113 | assert slutil.SLURM_CLI.slurm_nodes["hpc-1"]["NodeAddr"] == hpc1.private_ip, "NodeAddr should be the private IP address" 114 | 115 | bindings.update_state("Failed", ["hpc-1"]) 116 | azslurmd.AzslurmDaemon(MockNodeSource(bindings)).run_once() 117 | 118 | assert slutil.SLURM_CLI.slurm_nodes["hpc-1"]["State"] == "down", "Should be marked down because it failed" 119 | assert slutil.SLURM_CLI.slurm_nodes["hpc-1"]["Reason"] == "cyclecloud_node_failure" 120 | 121 | bindings.update_state("Ready", ["hpc-1"]) 122 | azslurmd.AzslurmDaemon(MockNodeSource(bindings)).run_once() 123 | assert slutil.SLURM_CLI.slurm_nodes["hpc-1"]["State"] == "idle", "node should be recovered %s" % slutil.SLURM_CLI.slurm_nodes["hpc-1"] 124 | assert slutil.SLURM_CLI.slurm_nodes["hpc-1"]["Reason"] == "cyclecloud_node_recovery" 125 | 126 | 127 | def test_node_goes_missing(bindings: testutil.MockClusterBinding) -> None: 128 | bindings.add_node( 129 | name="hpc-1", nodearray="hpc", state="Ready", target_state="Started" 130 | ) 131 | bindings.assign_ip(["hpc-1"]) 132 | azslurmd.AzslurmDaemon(MockNodeSource(bindings)).run_once() 133 | 134 | # remove the node from the cluster 135 | bindings.shutdown_nodes(names=["hpc-1"]) 136 | azslurmd.AzslurmDaemon(MockNodeSource(bindings)).run_once() 137 | 138 | assert slutil.SLURM_CLI.slurm_nodes["hpc-1"]["State"] == "down" 139 | assert slutil.SLURM_CLI.slurm_nodes["hpc-1"]["Reason"] == "cyclecloud_node_failure" 140 | 141 | 142 | def test_node_goes_missing_with_ip(bindings: testutil.MockClusterBinding) -> None: 143 | node = bindings.add_node( 144 | name="hpc-1", nodearray="hpc", state="Ready", target_state="Started" 145 | ) 146 | bindings.assign_ip(["hpc-1"]) 147 | node.software_configuration["slurm"]["use_nodename_as_hostname"] = False 148 | azslurmd.AzslurmDaemon(MockNodeSource(bindings)).run_once() 149 | assert slutil.SLURM_CLI.slurm_nodes["hpc-1"]["NodeAddr"] == node.private_ip, "NodeAddr should be the private IP address" 150 | # remove the node from cyclecloud 151 | bindings.shutdown_nodes(names=["hpc-1"]) 152 | azslurmd.AzslurmDaemon(MockNodeSource(bindings)).run_once() 153 | 154 | assert 
slutil.SLURM_CLI.slurm_nodes["hpc-1"]["State"] == "down" 155 | assert slutil.SLURM_CLI.slurm_nodes["hpc-1"]["Reason"] == "cyclecloud_node_failure" 156 | assert slutil.SLURM_CLI.slurm_nodes["hpc-1"]["NodeAddr"] == "hpc-1" 157 | 158 | 159 | def test_zombie_node(bindings: testutil.MockClusterBinding) -> None: 160 | bindings.add_node( 161 | name="hpc-1", nodearray="hpc", state="Ready", target_state="Started" 162 | ) 163 | azslurmd.AzslurmDaemon(MockNodeSource(bindings)).run_once() 164 | 165 | # change the node state in slurm to powered / powering_down 166 | slutil.SLURM_CLI.scontrol(["update", "NodeName=hpc-1", "State=powered_down"]) 167 | azslurmd.AzslurmDaemon(MockNodeSource(bindings)).run_once() 168 | 169 | assert slutil.SLURM_CLI.slurm_nodes["hpc-1"]["State"] == "down" 170 | assert slutil.SLURM_CLI.slurm_nodes["hpc-1"]["Reason"] == "cyclecloud_zombie_node" 171 | assert slutil.SLURM_CLI.slurm_nodes["hpc-1"]["NodeAddr"] == "hpc-1" -------------------------------------------------------------------------------- /azure-slurm/test/slurmcc_test/cli_test.py: -------------------------------------------------------------------------------- 1 | from io import StringIO 2 | from typing import List, Optional 3 | 4 | from hpc.autoscale.hpctypes import Memory 5 | from hpc.autoscale.node.bucket import NodeBucket, NodeDefinition 6 | from hpc.autoscale.results import ShutdownResult 7 | 8 | from slurmcc import cli, util 9 | from slurmcc.partition import Partition 10 | 11 | from slurmcc_test import testutil 12 | 13 | 14 | class SimpleMockLimits: 15 | def __init__(self, max_count: int) -> None: 16 | self.max_count = max_count 17 | 18 | 19 | def setup() -> None: 20 | util.TEST_MODE = True 21 | util.SLURM_CLI = testutil.MockNativeSlurmCLI() 22 | 23 | def teardown() -> None: 24 | util.TEST_MODE = False 25 | util.SLURM_CLI = testutil.MockNativeSlurmCLI() 26 | 27 | 28 | def make_partition( 29 | name: str, 30 | is_default: bool, 31 | is_hpc: bool, 32 | use_pcpu: bool = True, 33 | slurm_memory: str = "", 34 | dampen_memory: Optional[float] = None, 35 | dynamic_config: str = "", 36 | ) -> Partition: 37 | resources = {"slurm_memory": Memory.value_of(slurm_memory)} if slurm_memory else {} 38 | node_def = NodeDefinition( 39 | name, 40 | f"b-id-{name}", 41 | "Standard_F4", 42 | "southcentralus", 43 | False, 44 | "subnet", 45 | 4, 46 | 0, 47 | Memory.value_of("16g"), 48 | f"pg-{name}" if is_hpc else None, 49 | resources, 50 | {}, 51 | ) 52 | 53 | limits = SimpleMockLimits(100) 54 | 55 | bucket = NodeBucket(node_def, limits, 100, []) 56 | return Partition( 57 | name, 58 | name, 59 | f"pre-", 60 | "Standard_F4", 61 | is_default, 62 | is_hpc, 63 | 100, 64 | [bucket], 65 | 100, 66 | use_pcpu, 67 | dynamic_config, 68 | {}, 69 | ["Standard_f4"], 70 | dampen_memory, 71 | ) 72 | 73 | 74 | def test_partitions() -> None: 75 | 76 | partitions = [ 77 | make_partition("htc", False, False), 78 | make_partition("hpc", True, True), 79 | make_partition("dynamic", False, False, dynamic_config="-Z Feature=dyn"), 80 | ] 81 | 82 | # Define neither slurm_memory nor dampen_memory, autoscale=true 83 | # Expect full 16g to be applied. 
84 | writer = StringIO() 85 | 86 | cli._partitions(partitions, writer, autoscale=True) 87 | actual = "\n".join( 88 | [x for x in writer.getvalue().splitlines() if not x.startswith("#")] 89 | ) 90 | with open("/tmp/partitions.txt", "w") as f: 91 | f.write(actual) 92 | assert ( 93 | actual 94 | == """PartitionName=htc Nodes=pre-[1-100] Default=NO DefMemPerCPU=3840 MaxTime=INFINITE State=UP 95 | Nodename=pre-[1-100] Feature=cloud STATE=CLOUD CPUs=4 ThreadsPerCore=1 RealMemory=15360 96 | PartitionName=hpc Nodes=pre-[1-100] Default=YES DefMemPerCPU=3840 MaxTime=INFINITE State=UP 97 | Nodename=pre-[1-100] Feature=cloud STATE=CLOUD CPUs=4 ThreadsPerCore=1 RealMemory=15360 98 | Nodeset=dynamicns Feature=dyn 99 | PartitionName=dynamic Nodes=dynamicns""" 100 | ) 101 | 102 | # Define neither slurm_memory nor dampen_memory, autoscale=true 103 | # Expect default of 16g - 1gb to be applied. 104 | # Exoect state=FUTURE instead of CLOUD 105 | writer = StringIO() 106 | cli._partitions(partitions, writer, autoscale=False) 107 | actual = "\n".join( 108 | [x for x in writer.getvalue().splitlines() if not x.startswith("#")] 109 | ) 110 | assert ( 111 | actual 112 | == """PartitionName=htc Nodes=pre-[1-100] Default=NO DefMemPerCPU=3840 MaxTime=INFINITE State=UP 113 | Nodename=pre-[1-100] Feature=cloud STATE=FUTURE CPUs=4 ThreadsPerCore=1 RealMemory=15360 114 | PartitionName=hpc Nodes=pre-[1-100] Default=YES DefMemPerCPU=3840 MaxTime=INFINITE State=UP 115 | Nodename=pre-[1-100] Feature=cloud STATE=FUTURE CPUs=4 ThreadsPerCore=1 RealMemory=15360 116 | Nodeset=dynamicns Feature=dyn 117 | PartitionName=dynamic Nodes=dynamicns""" 118 | ) 119 | 120 | # Define only slurm_memory resource, autoscale=true 121 | # Expect slurm_memory (15g, 14g) will be applied. 122 | partitions = [ 123 | make_partition("htc", False, False, slurm_memory="15g"), 124 | make_partition("hpc", True, True, slurm_memory="14g"), 125 | make_partition( 126 | "dynamic", False, False, dynamic_config="-Z Feature=dyn", slurm_memory="13g" 127 | ), 128 | ] 129 | # No slurm.dampen_memory or slurm_memory resource, autoscale=FALSE 130 | writer = StringIO() 131 | cli._partitions(partitions, writer, autoscale=True) 132 | actual = "\n".join( 133 | [x for x in writer.getvalue().splitlines() if not x.startswith("#")] 134 | ) 135 | assert ( 136 | actual 137 | == """PartitionName=htc Nodes=pre-[1-100] Default=NO DefMemPerCPU=3840 MaxTime=INFINITE State=UP 138 | Nodename=pre-[1-100] Feature=cloud STATE=CLOUD CPUs=4 ThreadsPerCore=1 RealMemory=15360 139 | PartitionName=hpc Nodes=pre-[1-100] Default=YES DefMemPerCPU=3584 MaxTime=INFINITE State=UP 140 | Nodename=pre-[1-100] Feature=cloud STATE=CLOUD CPUs=4 ThreadsPerCore=1 RealMemory=14336 141 | Nodeset=dynamicns Feature=dyn 142 | PartitionName=dynamic Nodes=dynamicns""" 143 | ) 144 | 145 | # Define both slurm_memory resource and slurm.dampen_memory, autoscale=true 146 | # Expect dampen_memory (25%, 50%) will be applied 147 | partitions = [ 148 | make_partition("htc", False, False, slurm_memory="15g", dampen_memory=0.25), 149 | make_partition("hpc", True, True, slurm_memory="14g", dampen_memory=0.5), 150 | make_partition( 151 | "dynamic", 152 | False, 153 | False, 154 | dynamic_config="-Z Feature=dyn", 155 | slurm_memory="13g", 156 | dampen_memory=0.75, 157 | ), 158 | ] 159 | 160 | writer = StringIO() 161 | cli._partitions(partitions, writer, autoscale=True) 162 | actual = "\n".join( 163 | [x for x in writer.getvalue().splitlines() if not x.startswith("#")] 164 | ) 165 | assert ( 166 | actual 167 | == 
"""PartitionName=htc Nodes=pre-[1-100] Default=NO DefMemPerCPU=3072 MaxTime=INFINITE State=UP 168 | Nodename=pre-[1-100] Feature=cloud STATE=CLOUD CPUs=4 ThreadsPerCore=1 RealMemory=12288 169 | PartitionName=hpc Nodes=pre-[1-100] Default=YES DefMemPerCPU=2048 MaxTime=INFINITE State=UP 170 | Nodename=pre-[1-100] Feature=cloud STATE=CLOUD CPUs=4 ThreadsPerCore=1 RealMemory=8192 171 | Nodeset=dynamicns Feature=dyn 172 | PartitionName=dynamic Nodes=dynamicns""" 173 | ) 174 | 175 | # Define both slurm_memory resource and slurm.dampen_memory, autoscale=true 176 | # Expect dampen_memory (use 1G as 1% is too small) will be applied 177 | partitions = [ 178 | make_partition("htc", False, False, slurm_memory="15g", dampen_memory=0.001), 179 | make_partition("hpc", True, True, slurm_memory="14g", dampen_memory=0.001), 180 | make_partition( 181 | "dynamic", 182 | False, 183 | False, 184 | dynamic_config="-Z Feature=dyn", 185 | slurm_memory="13g", 186 | dampen_memory=0.75, 187 | ), 188 | ] 189 | 190 | writer = StringIO() 191 | cli._partitions(partitions, writer, autoscale=True) 192 | actual = "\n".join( 193 | [x for x in writer.getvalue().splitlines() if not x.startswith("#")] 194 | ) 195 | assert ( 196 | actual 197 | == """PartitionName=htc Nodes=pre-[1-100] Default=NO DefMemPerCPU=3840 MaxTime=INFINITE State=UP 198 | Nodename=pre-[1-100] Feature=cloud STATE=CLOUD CPUs=4 ThreadsPerCore=1 RealMemory=15360 199 | PartitionName=hpc Nodes=pre-[1-100] Default=YES DefMemPerCPU=3840 MaxTime=INFINITE State=UP 200 | Nodename=pre-[1-100] Feature=cloud STATE=CLOUD CPUs=4 ThreadsPerCore=1 RealMemory=15360 201 | Nodeset=dynamicns Feature=dyn 202 | PartitionName=dynamic Nodes=dynamicns""" 203 | ) 204 | 205 | 206 | def test_return_to_idle() -> None: 207 | class MockNode: 208 | def __init__(self, name: str, keep_alive: bool) -> None: 209 | self.name = name 210 | self.keep_alive = keep_alive 211 | class MockNodeMgr: 212 | def __init__(self) -> None: 213 | self._expect = [] 214 | 215 | def expect(self, *names: str): 216 | self._expect.append(set(names)) 217 | 218 | def shutdown_nodes(self, nodes: List[MockNode]) -> ShutdownResult: 219 | actual = set() 220 | for node in nodes: 221 | assert not node.keep_alive 222 | actual.add(node.name) 223 | assert actual == self._expect[-1] 224 | self._expect.pop() 225 | 226 | class MockScontrol: 227 | def __init__(self) -> None: 228 | self._expect = [] 229 | 230 | def expect(self, *args: str, **kwargs) -> None: 231 | self._expect.append((list(args), kwargs.get("return_value", ""))) 232 | 233 | def __call__(self, args: List[str]) -> str: 234 | if args[0] == "show" and args[1] == "hostlist": 235 | return args[2] 236 | assert self._expect, f"Unexpected call to scontrol: {args}" 237 | expected_args, return_value = self._expect.pop() 238 | assert args == expected_args 239 | return return_value 240 | 241 | 242 | ccnodes_by_name = {"htc-1": MockNode("htc-1", True), "htc-2": MockNode("htc-2", False)} 243 | config = {} 244 | node_mgr = MockNodeMgr() 245 | scontrol_func = MockScontrol() 246 | snodes = [ 247 | {"NodeName": "htc-1", "State": "POWERING_DOWN+DOWN+CLOUD"}, 248 | {"NodeName": "htc-2", "State": "POWERED_DOWN+DOWN+CLOUD"}, 249 | {"NodeName": "unmanaged", "State": "POWERED_DOWN+DOWN"}, 250 | ] 251 | 252 | def run_test(): 253 | cli.SlurmCLI._return_to_idle(config=config, 254 | snodes=snodes, 255 | ccnodes_by_name=ccnodes_by_name, 256 | node_mgr=node_mgr, 257 | scontrol_func=scontrol_func, 258 | ) 259 | node_mgr.expect("htc-2") 260 | scontrol_func.expect("update", "nodename=htc-2", 
"state=idle") 261 | run_test() 262 | ccnodes_by_name.pop("htc-2") 263 | snodes[1]["State"] = "POWERED_DOWN+IDLE+CLOUD" 264 | run_test() 265 | ccnodes_by_name["htc-1"].keep_alive = False 266 | run_test() 267 | config["slurm"] = {"return_to_idle": True} 268 | node_mgr.expect("htc-2") 269 | run_test() -------------------------------------------------------------------------------- /azure-slurm/test/slurmcc_test/testutil.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List 2 | 3 | from hpc.autoscale import util as hpcutil 4 | from hpc.autoscale.ccbindings.mock import MockClusterBinding 5 | from hpc.autoscale.clock import use_mock_clock 6 | from hpc.autoscale.node import nodemanager 7 | from hpc.autoscale.node.nodemanager import NodeManager 8 | from slurmcc import partition 9 | from slurmcc.cli import SlurmDriver 10 | from slurmcc.util import NativeSlurmCLI, set_slurm_cli, SrunOutput 11 | 12 | import logging 13 | 14 | use_mock_clock() 15 | 16 | 17 | CONFIG: Dict = {} 18 | 19 | 20 | def _show_hostnames(expr: str) -> List[str]: 21 | """ 22 | Purely used to mimic scontrol 23 | """ 24 | assert isinstance(expr, str) 25 | ret = [] 26 | if "," in expr: 27 | for sub_expr in expr.split(","): 28 | ret.extend(_show_hostnames(sub_expr)) 29 | return ret 30 | 31 | if "[" in expr: 32 | left, right = expr.rindex("["), expr.rindex("]") 33 | range_expr = expr[left + 1 : right].strip() 34 | if "-" in range_expr: 35 | start, stop = range_expr.split("-") 36 | for i in range(int(start), int(stop) + 1): 37 | new_expr = expr[:left] + str(i) + expr[right + 1 :] 38 | ret.extend(_show_hostnames(new_expr)) 39 | return ret 40 | else: 41 | return [expr] 42 | 43 | 44 | def _show_hostlist(node_list: List[str]) -> str: 45 | if len(node_list) == 1: 46 | return node_list[0] 47 | by_prefix = hpcutil.partition(node_list, lambda n: n.split("-")[0]) 48 | ret = [] 49 | for prefix, names in by_prefix.items(): 50 | if len(names) == 1: 51 | ret.append(names[0]) 52 | continue 53 | nums = [] 54 | for name in names: 55 | try: 56 | nums.append(int(name.split("-")[-1])) 57 | except ValueError: 58 | raise RuntimeError(f"Bad name - {name} from list {node_list}") 59 | nums = sorted(nums) 60 | min_num = nums[0] 61 | last_num = min_num 62 | for n, num in enumerate(nums[1:]): 63 | 64 | if num > last_num + 1 or n == len(nums) - 2: 65 | if n == len(nums) - 2: 66 | last_num = num 67 | ret.append(f"{prefix}-[{min_num}-{last_num}]") 68 | last_num = min_num = num 69 | else: 70 | last_num = num 71 | if node_list: 72 | assert ret, node_list 73 | return ",".join(ret) 74 | 75 | 76 | class MockNativeSlurmCLI(NativeSlurmCLI): 77 | def __init__(self) -> None: 78 | self.slurm_nodes: Dict[str, Dict] = {} 79 | self.suspend_exc_nodes: List[str] = [] 80 | 81 | def scontrol(self, args: List[str], retry: bool = True) -> str: 82 | logging.info("MOCK scontrol %s", args) 83 | clear_caches() 84 | if args[0:2] == ["show", "hostnames"]: 85 | assert len(args) == 3 86 | assert isinstance(args[-1], str) 87 | ret = _show_hostnames(args[-1]) 88 | assert ret 89 | assert isinstance(ret[0], str), ret[0] 90 | return "\n".join(ret) 91 | 92 | if args[0:2] == ["show", "hostlist"]: 93 | assert len(args) == 3 94 | assert args[-1] 95 | return _show_hostlist(args[-1].split(",")) 96 | 97 | if args[0:2] == ["show", "nodes"]: 98 | if len(args) == 3: 99 | return self.show_nodes(args[2].split(",")) 100 | return self.show_nodes([]) 101 | 102 | if args[0:2] == ["show", "config"]: 103 | return "\n".join(["", 
f"SuspendExcNodes={','.join(self.suspend_exc_nodes)}", "Ignore=me"]) 104 | 105 | if args[0] == "update": 106 | entity, value = args[1].split("=") 107 | if entity == "NodeName": 108 | slurm_node = self.slurm_nodes[value] 109 | for expr in args[2:]: 110 | key, value = expr.split("=") 111 | if key not in slurm_node: 112 | raise KeyError(f"Unknown key {key} in {args}") 113 | logging.info("MOCK update %s: %s=%s", slurm_node, key, value) 114 | slurm_node[key] = value 115 | elif entity.lower().startswith("suspendexcnodes"): 116 | if "+" in entity: 117 | self.suspend_exc_nodes.append(value) 118 | elif "-" in entity: 119 | self.suspend_exc_nodes.remove(value) 120 | else: 121 | self.suspend_exc_nodes.clear() 122 | self.suspend_exc_nodes.extend(value.split(",")) 123 | else: 124 | raise RuntimeError(f"Unknown args {args}") 125 | return "" 126 | raise RuntimeError(f"Unexpected command - {args}") 127 | 128 | def srun(self, hostlist: List[str], user_command: str, timeout: int, shell: bool, partition: str, gpus: int) -> SrunOutput: 129 | raise RuntimeError("Not implemented") 130 | 131 | def show_nodes(self, node_names: List[str]) -> str: 132 | ret = [] 133 | node_names = node_names or list(self.slurm_nodes.keys()) 134 | for node_name in node_names: 135 | assert ( 136 | node_name in self.slurm_nodes 137 | ), f"Unknown slurm node_name {node_name}. Try calling .create_nodes first. Existing nodes are {self.slurm_nodes.keys()}" 138 | snode = self.slurm_nodes[node_name] 139 | ret.append(" ".join(f"{key}={value}" for key, value in snode.items())) 140 | 141 | return "\n".join(ret) 142 | 143 | def create_nodes(self, node_names: List[str], features: List[str] = [], partitions: List[str] = []) -> None: 144 | for node_name in node_names: 145 | self.slurm_nodes[node_name] = { 146 | "NodeName": node_name, 147 | "NodeAddr": node_name, 148 | "NodeHostName": node_name, 149 | "State": "idle", 150 | "AvailableFeatures": ",".join(features), 151 | "Reason": "", 152 | "Partitions": "dynamic" if "dyn" in features else node_name.split("-")[0] 153 | } 154 | clear_caches() 155 | 156 | def srun(self, args: List[str], retry: bool = True) -> str: 157 | raise RuntimeError("srun not implemented") 158 | 159 | 160 | set_slurm_cli(MockNativeSlurmCLI()) 161 | 162 | 163 | def make_native_cli(create_default_nodes: bool = True) -> MockNativeSlurmCLI: 164 | ret = MockNativeSlurmCLI() 165 | set_slurm_cli(ret) 166 | if create_default_nodes: 167 | ret.create_nodes(_show_hostnames("hpc-[1-100]")) 168 | ret.create_nodes(_show_hostnames("htc-[1-100]")) 169 | return ret 170 | 171 | 172 | def refresh_test_node_manager(old_node_mgr: NodeManager) -> NodeManager: 173 | config = dict(CONFIG) 174 | 175 | config["_mock_bindings"] = old_node_mgr.cluster_bindings 176 | 177 | driver = SlurmDriver() 178 | config = driver.preprocess_config(config) 179 | 180 | node_mgr = nodemanager.new_node_manager(config) 181 | driver.preprocess_node_mgr(config, node_mgr) 182 | assert config["nodearrays"]["hpc"] 183 | return node_mgr 184 | 185 | 186 | def make_test_node_manager_with_bindings(bindings: MockClusterBinding) -> NodeManager: 187 | config = dict(CONFIG) 188 | config["_mock_bindings"] = bindings 189 | 190 | driver = SlurmDriver() 191 | config = driver.preprocess_config(config) 192 | 193 | node_mgr = nodemanager.new_node_manager(config) 194 | driver.preprocess_node_mgr(config, node_mgr) 195 | return node_mgr 196 | 197 | 198 | def make_test_node_manager(cluster_name: str = "c1") -> NodeManager: 199 | bindings = MockClusterBinding(cluster_name) 200 | config = 
dict(CONFIG) 201 | config["_mock_bindings"] = bindings 202 | 203 | bindings.add_nodearray( 204 | name="hpc", 205 | resources={}, 206 | software_configuration=dict( 207 | slurm=dict(is_default=True, hpc=True, use_nodename_as_hostname=True) 208 | ), 209 | ) 210 | # uses simple nodeaddr=ipaddress 211 | bindings.add_nodearray( 212 | name="htc", 213 | resources={}, 214 | software_configuration=dict( 215 | slurm=dict(is_default=False, hpc=False, use_nodename_as_hostname=False) 216 | ), 217 | ) 218 | bindings.add_nodearray( 219 | name="dynamic", 220 | resources={}, 221 | software_configuration=dict(slurm=dict(is_default=False, hpc=False, dynamic_config="-Z Feature=dyn")), 222 | ) 223 | bindings.add_bucket( 224 | nodearray_name="hpc", vm_size="Standard_F4", max_count=100, available_count=100, placement_groups=["Standard_F4_pg0"] 225 | ) 226 | 227 | bindings.add_bucket( 228 | nodearray_name="htc", vm_size="Standard_F2", max_count=100, available_count=100 229 | ) 230 | bindings.add_bucket( 231 | nodearray_name="dynamic", 232 | vm_size="Standard_F2", 233 | max_count=100, 234 | available_count=100, 235 | ) 236 | 237 | driver = SlurmDriver() 238 | config = driver.preprocess_config(config) 239 | 240 | node_mgr = nodemanager.new_node_manager(config) 241 | driver.preprocess_node_mgr(config, node_mgr) 242 | assert config["nodearrays"]["hpc"] 243 | return node_mgr 244 | 245 | 246 | def clear_caches() -> None: 247 | partition.Partition._SLURM_NODES_CACHE = None -------------------------------------------------------------------------------- /azure-slurm/test/slurmcc_test/testutil_test.py: -------------------------------------------------------------------------------- 1 | from . import testutil 2 | 3 | 4 | def test_mock_scontrol() -> None: 5 | """ 6 | Test the mock scontrol maintains state correctly between update calls, as well as avoids typos. 
7 | """ 8 | cli = testutil.MockNativeSlurmCLI() 9 | cli.create_nodes(["hpc-1", "hpc-2"]) 10 | assert cli.slurm_nodes["hpc-1"]["NodeAddr"] == "hpc-1" == cli.slurm_nodes["hpc-1"]["NodeHostName"] 11 | assert cli.slurm_nodes["hpc-2"]["NodeAddr"] == "hpc-2" == cli.slurm_nodes["hpc-2"]["NodeHostName"] 12 | cli.scontrol(["update", "NodeName=hpc-1", "NodeAddr=1.2.3.4", "NodeHostName=1.2.3.4"]) 13 | assert cli.slurm_nodes["hpc-1"]["NodeAddr"] == "1.2.3.4" == cli.slurm_nodes["hpc-1"]["NodeHostName"] 14 | 15 | try: 16 | cli.scontrol(["update", "NodeName=hpc-1", "MadeUpThing=1"]) 17 | assert False, "Expected KeyError" 18 | except KeyError: 19 | pass 20 | -------------------------------------------------------------------------------- /azure-slurm/test/slurmcc_test/topology_test_input/all_hostnames.txt: -------------------------------------------------------------------------------- 1 | hpcbench-hpc-1 2 | hpcbench-hpc-2 3 | hpcbench-hpc-3 4 | hpcbench-hpc-4 5 | hpcbench-hpc-5 6 | hpcbench-hpc-6 7 | hpcbench-hpc-7 8 | hpcbench-hpc-8 9 | hpcbench-hpc-9 10 | hpcbench-hpc-10 11 | hpcbench-hpc-11 12 | hpcbench-hpc-12 13 | hpcbench-hpc-13 14 | hpcbench-hpc-14 15 | hpcbench-hpc-15 16 | hpcbench-hpc-16 17 | hpcbench-hpc-17 18 | hpcbench-hpc-18 19 | hpcbench-hpc-19 20 | hpcbench-hpc-20 21 | hpcbench-hpc-21 22 | hpcbench-hpc-22 23 | hpcbench-hpc-23 24 | hpcbench-hpc-24 25 | hpcbench-hpc-25 26 | hpcbench-hpc-26 27 | hpcbench-hpc-27 28 | hpcbench-hpc-28 29 | hpcbench-hpc-29 30 | hpcbench-hpc-30 31 | hpcbench-hpc-31 32 | hpcbench-hpc-32 33 | hpcbench-hpc-33 34 | hpcbench-hpc-34 35 | hpcbench-hpc-35 36 | hpcbench-hpc-36 37 | hpcbench-hpc-37 38 | hpcbench-hpc-38 39 | hpcbench-hpc-39 40 | hpcbench-hpc-40 41 | hpcbench-hpc-41 42 | hpcbench-hpc-42 43 | hpcbench-hpc-43 44 | hpcbench-hpc-44 45 | hpcbench-hpc-45 46 | hpcbench-hpc-46 47 | hpcbench-hpc-47 48 | hpcbench-hpc-48 49 | hpcbench-hpc-49 50 | hpcbench-hpc-50 51 | hpcbench-hpc-51 52 | hpcbench-hpc-52 53 | hpcbench-hpc-53 54 | hpcbench-hpc-54 55 | hpcbench-hpc-55 56 | hpcbench-hpc-56 57 | hpcbench-hpc-57 58 | hpcbench-hpc-58 59 | hpcbench-hpc-59 60 | hpcbench-hpc-60 61 | hpcbench-hpc-61 62 | hpcbench-hpc-62 63 | hpcbench-hpc-63 64 | hpcbench-hpc-64 65 | hpcbench-hpc-65 66 | hpcbench-hpc-66 67 | hpcbench-hpc-67 68 | hpcbench-hpc-68 69 | hpcbench-hpc-69 70 | hpcbench-hpc-70 71 | hpcbench-hpc-71 72 | hpcbench-hpc-72 73 | hpcbench-hpc-73 74 | hpcbench-hpc-74 75 | hpcbench-hpc-75 76 | hpcbench-hpc-76 77 | hpcbench-hpc-77 78 | hpcbench-hpc-78 79 | hpcbench-hpc-79 80 | -------------------------------------------------------------------------------- /azure-slurm/test/slurmcc_test/topology_test_input/block_topology.txt: -------------------------------------------------------------------------------- 1 | # Number of Nodes in Block 1: 18 2 | # ClusterUUID and CliqueID: 5e797fc6-0f46-421a-8724-0e102c0f723c 3 | BlockName=block_5e797fc6-0f46-421a-8724-0e102c0f723c Nodes=hpcbench-hpc-1,hpcbench-hpc-5,hpcbench-hpc-9,hpcbench-hpc-13,hpcbench-hpc-17,hpcbench-hpc-21,hpcbench-hpc-25,hpcbench-hpc-29,hpcbench-hpc-33,hpcbench-hpc-37,hpcbench-hpc-41,hpcbench-hpc-45,hpcbench-hpc-49,hpcbench-hpc-53,hpcbench-hpc-57,hpcbench-hpc-61,hpcbench-hpc-65,hpcbench-hpc-69 4 | # Number of Nodes in Block 2: 18 5 | # ClusterUUID and CliqueID: 5e797fc6-0f46-421a-8724-0e102c0f721e 6 | BlockName=block_5e797fc6-0f46-421a-8724-0e102c0f721e 
Nodes=hpcbench-hpc-2,hpcbench-hpc-6,hpcbench-hpc-10,hpcbench-hpc-14,hpcbench-hpc-18,hpcbench-hpc-22,hpcbench-hpc-26,hpcbench-hpc-30,hpcbench-hpc-34,hpcbench-hpc-38,hpcbench-hpc-42,hpcbench-hpc-46,hpcbench-hpc-50,hpcbench-hpc-54,hpcbench-hpc-58,hpcbench-hpc-62,hpcbench-hpc-66,hpcbench-hpc-70 7 | # Number of Nodes in Block 3: 18 8 | # ClusterUUID and CliqueID: 5e797fc6-0f46-421a-8724-0e102c0f724b 9 | BlockName=block_5e797fc6-0f46-421a-8724-0e102c0f724b Nodes=hpcbench-hpc-3,hpcbench-hpc-7,hpcbench-hpc-11,hpcbench-hpc-15,hpcbench-hpc-19,hpcbench-hpc-23,hpcbench-hpc-27,hpcbench-hpc-31,hpcbench-hpc-35,hpcbench-hpc-39,hpcbench-hpc-43,hpcbench-hpc-47,hpcbench-hpc-51,hpcbench-hpc-55,hpcbench-hpc-59,hpcbench-hpc-63,hpcbench-hpc-67,hpcbench-hpc-71 10 | # Number of Nodes in Block 4: 18 11 | # ClusterUUID and CliqueID: 5e797fc6-0f46-421a-8724-0e102c0f722a 12 | BlockName=block_5e797fc6-0f46-421a-8724-0e102c0f722a Nodes=hpcbench-hpc-4,hpcbench-hpc-8,hpcbench-hpc-12,hpcbench-hpc-16,hpcbench-hpc-20,hpcbench-hpc-24,hpcbench-hpc-28,hpcbench-hpc-32,hpcbench-hpc-36,hpcbench-hpc-40,hpcbench-hpc-44,hpcbench-hpc-48,hpcbench-hpc-52,hpcbench-hpc-56,hpcbench-hpc-60,hpcbench-hpc-64,hpcbench-hpc-68,hpcbench-hpc-72 13 | BlockSizes=1 14 | -------------------------------------------------------------------------------- /azure-slurm/test/slurmcc_test/topology_test_input/guid_hostnames.txt: -------------------------------------------------------------------------------- 1 | hpcbench-hpc-1 2 | hpcbench-hpc-35 3 | hpcbench-hpc-36 4 | hpcbench-hpc-37 5 | hpcbench-hpc-38 6 | hpcbench-hpc-39 7 | hpcbench-hpc-40 8 | hpcbench-hpc-41 9 | hpcbench-hpc-42 10 | hpcbench-hpc-43 11 | hpcbench-hpc-44 12 | hpcbench-hpc-45 13 | hpcbench-hpc-46 14 | hpcbench-hpc-47 15 | hpcbench-hpc-48 16 | hpcbench-hpc-49 -------------------------------------------------------------------------------- /azure-slurm/test/slurmcc_test/topology_test_input/guids.txt: -------------------------------------------------------------------------------- 1 | 0x155dfffd33ff33 2 | 0x155dfffd33ff34 3 | 0x155dfffd33ff35 4 | 0x155dfffd33ff36 5 | 0x155dfffd33ff37 6 | 0x155dfffd33ff38 7 | 0x155dfffd33ff39 8 | 0x155dfffd33ff3a 9 | 0x155dfffd33ff43 10 | 0x155dfffd33ff44 11 | 0x155dfffd33ff45 12 | 0x155dfffd33ff46 13 | 0x155dfffd33ff47 14 | 0x155dfffd33ff48 15 | 0x155dfffd33ff49 16 | 0x155dfffd33ff4a 17 | 0x155dfffd33ff7b 18 | 0x155dfffd33ff7c 19 | 0x155dfffd33ff7d 20 | 0x155dfffd33ff7e 21 | 0x155dfffd33ff7f 22 | 0x155dfffd33ff80 23 | 0x155dfffd33ff81 24 | 0x155dfffd33ff82 25 | 0x155dfffd33ff8b 26 | 0x155dfffd33ff8c 27 | 0x155dfffd33ff8d 28 | 0x155dfffd33ff8e 29 | 0x155dfffd33ff8f 30 | 0x155dfffd33ff90 31 | 0x155dfffd33ff91 32 | 0x155dfffd33ff92 33 | 0x155dfffd33ffa3 34 | 0x155dfffd33ffa4 35 | 0x155dfffd33ffa5 36 | 0x155dfffd33ffa6 37 | 0x155dfffd33ffa7 38 | 0x155dfffd33ffa8 39 | 0x155dfffd33ffa9 40 | 0x155dfffd33ffaa 41 | 0x155dfffd33ffbb 42 | 0x155dfffd33ffbc 43 | 0x155dfffd33ffbd 44 | 0x155dfffd33ffbe 45 | 0x155dfffd33ffbf 46 | 0x155dfffd33ffc0 47 | 0x155dfffd33ffc1 48 | 0x155dfffd33ffc2 49 | 0x155dfffd33ffc3 50 | 0x155dfffd33ffc4 51 | 0x155dfffd33ffc5 52 | 0x155dfffd33ffc6 53 | 0x155dfffd33ffc7 54 | 0x155dfffd33ffc8 55 | 0x155dfffd33ffc9 56 | 0x155dfffd33ffca 57 | 0x155dfffd33ffcb 58 | 0x155dfffd33ffcc 59 | 0x155dfffd33ffcd 60 | 0x155dfffd33ffce 61 | 0x155dfffd33ffcf 62 | 0x155dfffd33ffd0 63 | 0x155dfffd33ffd1 64 | 0x155dfffd33ffd2 65 | 0x155dfffd33ffd3 66 | 0x155dfffd33ffd4 67 | 0x155dfffd33ffd5 68 | 0x155dfffd33ffd6 69 | 0x155dfffd33ffd7 70 | 0x155dfffd33ffd8 71 | 
0x155dfffd33ffd9 72 | 0x155dfffd33ffda 73 | 0x155dfffd33ffe3 74 | 0x155dfffd33ffe4 75 | 0x155dfffd33ffe5 76 | 0x155dfffd33ffe6 77 | 0x155dfffd33ffe7 78 | 0x155dfffd33ffe8 79 | 0x155dfffd33ffe9 80 | 0x155dfffd33ffea 81 | 0x155dfffd33ffeb 82 | 0x155dfffd33ffec 83 | 0x155dfffd33ffed 84 | 0x155dfffd33ffee 85 | 0x155dfffd33ffef 86 | 0x155dfffd33fff0 87 | 0x155dfffd33fff1 88 | 0x155dfffd33fff2 89 | 0x155dfffd34001b 90 | 0x155dfffd34001c 91 | 0x155dfffd34001d 92 | 0x155dfffd34001e 93 | 0x155dfffd34001f 94 | 0x155dfffd340020 95 | 0x155dfffd340021 96 | 0x155dfffd340022 97 | 0x155dfffd340033 98 | 0x155dfffd340034 99 | 0x155dfffd340035 100 | 0x155dfffd340036 101 | 0x155dfffd340037 102 | 0x155dfffd340038 103 | 0x155dfffd340039 104 | 0x155dfffd34003a 105 | 0x155dfffd34004b 106 | 0x155dfffd34004c 107 | 0x155dfffd34004d 108 | 0x155dfffd34004e 109 | 0x155dfffd34004f 110 | 0x155dfffd340050 111 | 0x155dfffd340051 112 | 0x155dfffd340052 113 | 0x155dfffd340053 114 | 0x155dfffd340054 115 | 0x155dfffd340055 116 | 0x155dfffd340056 117 | 0x155dfffd340057 118 | 0x155dfffd340058 119 | 0x155dfffd340059 120 | 0x155dfffd34005a 121 | 0x155dfffd340063 122 | 0x155dfffd340064 123 | 0x155dfffd340065 124 | 0x155dfffd340066 125 | 0x155dfffd340067 126 | 0x155dfffd340068 127 | 0x155dfffd340069 128 | 0x155dfffd34006a 129 | -------------------------------------------------------------------------------- /azure-slurm/test/slurmcc_test/topology_test_input/hostnames.txt: -------------------------------------------------------------------------------- 1 | hpcbench-hpc-1 2 | hpcbench-hpc-35 3 | hpcbench-hpc-36 4 | hpcbench-hpc-37 5 | hpcbench-hpc-38 6 | hpcbench-hpc-39 7 | hpcbench-hpc-40 8 | hpcbench-hpc-41 9 | hpcbench-hpc-42 10 | hpcbench-hpc-43 11 | hpcbench-hpc-44 12 | hpcbench-hpc-45 13 | hpcbench-hpc-46 14 | hpcbench-hpc-47 15 | hpcbench-hpc-48 16 | hpcbench-hpc-49 17 | hpcbench-hpc-63 18 | hpcbench-hpc-64 19 | hpcbench-hpc-65 20 | hpcbench-hpc-66 21 | hpcbench-hpc-67 22 | hpcbench-hpc-68 23 | hpcbench-hpc-69 24 | -------------------------------------------------------------------------------- /azure-slurm/test/slurmcc_test/topology_test_input/nodes_clusterUUIDs.txt: -------------------------------------------------------------------------------- 1 | hpcbench-hpc-1: 5e797fc6-0f46-421a-8724-0e102c0f723c 2 | hpcbench-hpc-2: 5e797fc6-0f46-421a-8724-0e102c0f721e 3 | hpcbench-hpc-3: 5e797fc6-0f46-421a-8724-0e102c0f724b 4 | hpcbench-hpc-4: 5e797fc6-0f46-421a-8724-0e102c0f722a 5 | hpcbench-hpc-5: 5e797fc6-0f46-421a-8724-0e102c0f723c 6 | hpcbench-hpc-6: 5e797fc6-0f46-421a-8724-0e102c0f721e 7 | hpcbench-hpc-7: 5e797fc6-0f46-421a-8724-0e102c0f724b 8 | hpcbench-hpc-8: 5e797fc6-0f46-421a-8724-0e102c0f722a 9 | hpcbench-hpc-9: 5e797fc6-0f46-421a-8724-0e102c0f723c 10 | hpcbench-hpc-10: 5e797fc6-0f46-421a-8724-0e102c0f721e 11 | hpcbench-hpc-11: 5e797fc6-0f46-421a-8724-0e102c0f724b 12 | hpcbench-hpc-12: 5e797fc6-0f46-421a-8724-0e102c0f722a 13 | hpcbench-hpc-13: 5e797fc6-0f46-421a-8724-0e102c0f723c 14 | hpcbench-hpc-14: 5e797fc6-0f46-421a-8724-0e102c0f721e 15 | hpcbench-hpc-15: 5e797fc6-0f46-421a-8724-0e102c0f724b 16 | hpcbench-hpc-16: 5e797fc6-0f46-421a-8724-0e102c0f722a 17 | hpcbench-hpc-17: 5e797fc6-0f46-421a-8724-0e102c0f723c 18 | hpcbench-hpc-18: 5e797fc6-0f46-421a-8724-0e102c0f721e 19 | hpcbench-hpc-19: 5e797fc6-0f46-421a-8724-0e102c0f724b 20 | hpcbench-hpc-20: 5e797fc6-0f46-421a-8724-0e102c0f722a 21 | hpcbench-hpc-21: 5e797fc6-0f46-421a-8724-0e102c0f723c 22 | hpcbench-hpc-22: 5e797fc6-0f46-421a-8724-0e102c0f721e 23 | hpcbench-hpc-23: 
5e797fc6-0f46-421a-8724-0e102c0f724b 24 | hpcbench-hpc-24: 5e797fc6-0f46-421a-8724-0e102c0f722a 25 | hpcbench-hpc-25: 5e797fc6-0f46-421a-8724-0e102c0f723c 26 | hpcbench-hpc-26: 5e797fc6-0f46-421a-8724-0e102c0f721e 27 | hpcbench-hpc-27: 5e797fc6-0f46-421a-8724-0e102c0f724b 28 | hpcbench-hpc-28: 5e797fc6-0f46-421a-8724-0e102c0f722a 29 | hpcbench-hpc-29: 5e797fc6-0f46-421a-8724-0e102c0f723c 30 | hpcbench-hpc-30: 5e797fc6-0f46-421a-8724-0e102c0f721e 31 | hpcbench-hpc-31: 5e797fc6-0f46-421a-8724-0e102c0f724b 32 | hpcbench-hpc-32: 5e797fc6-0f46-421a-8724-0e102c0f722a 33 | hpcbench-hpc-33: 5e797fc6-0f46-421a-8724-0e102c0f723c 34 | hpcbench-hpc-34: 5e797fc6-0f46-421a-8724-0e102c0f721e 35 | hpcbench-hpc-35: 5e797fc6-0f46-421a-8724-0e102c0f724b 36 | hpcbench-hpc-36: 5e797fc6-0f46-421a-8724-0e102c0f722a 37 | hpcbench-hpc-37: 5e797fc6-0f46-421a-8724-0e102c0f723c 38 | hpcbench-hpc-38: 5e797fc6-0f46-421a-8724-0e102c0f721e 39 | hpcbench-hpc-39: 5e797fc6-0f46-421a-8724-0e102c0f724b 40 | hpcbench-hpc-40: 5e797fc6-0f46-421a-8724-0e102c0f722a 41 | hpcbench-hpc-41: 5e797fc6-0f46-421a-8724-0e102c0f723c 42 | hpcbench-hpc-42: 5e797fc6-0f46-421a-8724-0e102c0f721e 43 | hpcbench-hpc-43: 5e797fc6-0f46-421a-8724-0e102c0f724b 44 | hpcbench-hpc-44: 5e797fc6-0f46-421a-8724-0e102c0f722a 45 | hpcbench-hpc-45: 5e797fc6-0f46-421a-8724-0e102c0f723c 46 | hpcbench-hpc-46: 5e797fc6-0f46-421a-8724-0e102c0f721e 47 | hpcbench-hpc-47: 5e797fc6-0f46-421a-8724-0e102c0f724b 48 | hpcbench-hpc-48: 5e797fc6-0f46-421a-8724-0e102c0f722a 49 | hpcbench-hpc-49: 5e797fc6-0f46-421a-8724-0e102c0f723c 50 | hpcbench-hpc-50: 5e797fc6-0f46-421a-8724-0e102c0f721e 51 | hpcbench-hpc-51: 5e797fc6-0f46-421a-8724-0e102c0f724b 52 | hpcbench-hpc-52: 5e797fc6-0f46-421a-8724-0e102c0f722a 53 | hpcbench-hpc-53: 5e797fc6-0f46-421a-8724-0e102c0f723c 54 | hpcbench-hpc-54: 5e797fc6-0f46-421a-8724-0e102c0f721e 55 | hpcbench-hpc-55: 5e797fc6-0f46-421a-8724-0e102c0f724b 56 | hpcbench-hpc-56: 5e797fc6-0f46-421a-8724-0e102c0f722a 57 | hpcbench-hpc-57: 5e797fc6-0f46-421a-8724-0e102c0f723c 58 | hpcbench-hpc-58: 5e797fc6-0f46-421a-8724-0e102c0f721e 59 | hpcbench-hpc-59: 5e797fc6-0f46-421a-8724-0e102c0f724b 60 | hpcbench-hpc-60: 5e797fc6-0f46-421a-8724-0e102c0f722a 61 | hpcbench-hpc-61: 5e797fc6-0f46-421a-8724-0e102c0f723c 62 | hpcbench-hpc-62: 5e797fc6-0f46-421a-8724-0e102c0f721e 63 | hpcbench-hpc-63: 5e797fc6-0f46-421a-8724-0e102c0f724b 64 | hpcbench-hpc-64: 5e797fc6-0f46-421a-8724-0e102c0f722a 65 | hpcbench-hpc-65: 5e797fc6-0f46-421a-8724-0e102c0f723c 66 | hpcbench-hpc-66: 5e797fc6-0f46-421a-8724-0e102c0f721e 67 | hpcbench-hpc-67: 5e797fc6-0f46-421a-8724-0e102c0f724b 68 | hpcbench-hpc-68: 5e797fc6-0f46-421a-8724-0e102c0f722a 69 | hpcbench-hpc-69: 5e797fc6-0f46-421a-8724-0e102c0f723c 70 | hpcbench-hpc-70: 5e797fc6-0f46-421a-8724-0e102c0f721e 71 | hpcbench-hpc-71: 5e797fc6-0f46-421a-8724-0e102c0f724b 72 | hpcbench-hpc-72: 5e797fc6-0f46-421a-8724-0e102c0f722a 73 | -------------------------------------------------------------------------------- /azure-slurm/test/slurmcc_test/topology_test_input/nodes_clusterUUIDs_2.txt: -------------------------------------------------------------------------------- 1 | hpcbench-hpc-1: 5e797fc6-0f46-421a-8724-0e102c0f723c 2 | hpcbench-hpc-3: 5e797fc6-0f46-421a-8724-0e102c0f724b 3 | hpcbench-hpc-4: 5e797fc6-0f46-421a-8724-0e102c0f722a 4 | hpcbench-hpc-5: 5e797fc6-0f46-421a-8724-0e102c0f723c 5 | hpcbench-hpc-7: 5e797fc6-0f46-421a-8724-0e102c0f724b 6 | hpcbench-hpc-8: 5e797fc6-0f46-421a-8724-0e102c0f722a 7 | hpcbench-hpc-9: 
5e797fc6-0f46-421a-8724-0e102c0f723c 8 | hpcbench-hpc-11: 5e797fc6-0f46-421a-8724-0e102c0f724b 9 | hpcbench-hpc-12: 5e797fc6-0f46-421a-8724-0e102c0f722a 10 | hpcbench-hpc-13: 5e797fc6-0f46-421a-8724-0e102c0f723c 11 | hpcbench-hpc-15: 5e797fc6-0f46-421a-8724-0e102c0f724b 12 | hpcbench-hpc-16: 5e797fc6-0f46-421a-8724-0e102c0f722a 13 | hpcbench-hpc-17: 5e797fc6-0f46-421a-8724-0e102c0f723c 14 | hpcbench-hpc-18: 5e797fc6-0f46-421a-8724-0e102c0f721e 15 | hpcbench-hpc-19: 5e797fc6-0f46-421a-8724-0e102c0f724b 16 | hpcbench-hpc-20: 5e797fc6-0f46-421a-8724-0e102c0f722a 17 | hpcbench-hpc-21: 5e797fc6-0f46-421a-8724-0e102c0f723c 18 | hpcbench-hpc-22: 5e797fc6-0f46-421a-8724-0e102c0f721e 19 | hpcbench-hpc-23: 5e797fc6-0f46-421a-8724-0e102c0f724b 20 | hpcbench-hpc-24: 5e797fc6-0f46-421a-8724-0e102c0f722a 21 | hpcbench-hpc-25: 5e797fc6-0f46-421a-8724-0e102c0f723c 22 | hpcbench-hpc-26: 5e797fc6-0f46-421a-8724-0e102c0f721e 23 | hpcbench-hpc-27: 5e797fc6-0f46-421a-8724-0e102c0f724b 24 | hpcbench-hpc-28: 5e797fc6-0f46-421a-8724-0e102c0f722a 25 | hpcbench-hpc-29: 5e797fc6-0f46-421a-8724-0e102c0f723c 26 | hpcbench-hpc-30: 5e797fc6-0f46-421a-8724-0e102c0f721e 27 | hpcbench-hpc-31: 5e797fc6-0f46-421a-8724-0e102c0f724b 28 | hpcbench-hpc-32: 5e797fc6-0f46-421a-8724-0e102c0f722a 29 | hpcbench-hpc-33: 5e797fc6-0f46-421a-8724-0e102c0f723c 30 | hpcbench-hpc-34: 5e797fc6-0f46-421a-8724-0e102c0f721e 31 | hpcbench-hpc-35: 5e797fc6-0f46-421a-8724-0e102c0f724b 32 | hpcbench-hpc-36: 5e797fc6-0f46-421a-8724-0e102c0f722a 33 | hpcbench-hpc-37: 5e797fc6-0f46-421a-8724-0e102c0f723c 34 | hpcbench-hpc-38: 5e797fc6-0f46-421a-8724-0e102c0f721e 35 | hpcbench-hpc-39: 5e797fc6-0f46-421a-8724-0e102c0f724b 36 | hpcbench-hpc-40: 5e797fc6-0f46-421a-8724-0e102c0f722a 37 | hpcbench-hpc-41: 5e797fc6-0f46-421a-8724-0e102c0f723c 38 | hpcbench-hpc-42: 5e797fc6-0f46-421a-8724-0e102c0f721e 39 | hpcbench-hpc-43: 5e797fc6-0f46-421a-8724-0e102c0f724b 40 | hpcbench-hpc-44: 5e797fc6-0f46-421a-8724-0e102c0f722a 41 | hpcbench-hpc-45: 5e797fc6-0f46-421a-8724-0e102c0f723c 42 | hpcbench-hpc-46: 5e797fc6-0f46-421a-8724-0e102c0f721e 43 | hpcbench-hpc-47: 5e797fc6-0f46-421a-8724-0e102c0f724b 44 | hpcbench-hpc-48: 5e797fc6-0f46-421a-8724-0e102c0f722a 45 | hpcbench-hpc-49: 5e797fc6-0f46-421a-8724-0e102c0f723c 46 | hpcbench-hpc-50: 5e797fc6-0f46-421a-8724-0e102c0f721e 47 | hpcbench-hpc-51: 5e797fc6-0f46-421a-8724-0e102c0f724b 48 | hpcbench-hpc-52: 5e797fc6-0f46-421a-8724-0e102c0f722a 49 | hpcbench-hpc-53: 5e797fc6-0f46-421a-8724-0e102c0f723c 50 | hpcbench-hpc-54: 5e797fc6-0f46-421a-8724-0e102c0f721e 51 | hpcbench-hpc-55: 5e797fc6-0f46-421a-8724-0e102c0f724b 52 | hpcbench-hpc-56: 5e797fc6-0f46-421a-8724-0e102c0f722a 53 | hpcbench-hpc-57: 5e797fc6-0f46-421a-8724-0e102c0f723c 54 | hpcbench-hpc-58: 5e797fc6-0f46-421a-8724-0e102c0f721e 55 | hpcbench-hpc-59: 5e797fc6-0f46-421a-8724-0e102c0f724b 56 | hpcbench-hpc-60: 5e797fc6-0f46-421a-8724-0e102c0f722a 57 | hpcbench-hpc-61: 5e797fc6-0f46-421a-8724-0e102c0f723c 58 | hpcbench-hpc-62: 5e797fc6-0f46-421a-8724-0e102c0f721e 59 | hpcbench-hpc-63: 5e797fc6-0f46-421a-8724-0e102c0f724b 60 | hpcbench-hpc-64: 5e797fc6-0f46-421a-8724-0e102c0f722a 61 | hpcbench-hpc-65: 5e797fc6-0f46-421a-8724-0e102c0f723c 62 | hpcbench-hpc-66: 5e797fc6-0f46-421a-8724-0e102c0f721e 63 | hpcbench-hpc-67: 5e797fc6-0f46-421a-8724-0e102c0f724b 64 | hpcbench-hpc-68: 5e797fc6-0f46-421a-8724-0e102c0f722a 65 | hpcbench-hpc-69: 5e797fc6-0f46-421a-8724-0e102c0f723c 66 | hpcbench-hpc-70: 5e797fc6-0f46-421a-8724-0e102c0f721e 67 | hpcbench-hpc-71: 
5e797fc6-0f46-421a-8724-0e102c0f724b 68 | hpcbench-hpc-72: 5e797fc6-0f46-421a-8724-0e102c0f722a 69 | -------------------------------------------------------------------------------- /azure-slurm/test/slurmcc_test/topology_test_input/nodes_guids.txt: -------------------------------------------------------------------------------- 1 | hpcbench-hpc-1: 0x155dfffd33ff33 2 | hpcbench-hpc-1: 0x155dfffd33ff34 3 | hpcbench-hpc-1: 0x155dfffd33ff35 4 | hpcbench-hpc-1: 0x155dfffd33ff36 5 | hpcbench-hpc-1: 0x155dfffd33ff37 6 | hpcbench-hpc-1: 0x155dfffd33ff38 7 | hpcbench-hpc-1: 0x155dfffd33ff39 8 | hpcbench-hpc-1: 0x155dfffd33ff3a 9 | hpcbench-hpc-35: 0x155dfffd33ff43 10 | hpcbench-hpc-35: 0x155dfffd33ff44 11 | hpcbench-hpc-35: 0x155dfffd33ff45 12 | hpcbench-hpc-35: 0x155dfffd33ff46 13 | hpcbench-hpc-35: 0x155dfffd33ff47 14 | hpcbench-hpc-35: 0x155dfffd33ff48 15 | hpcbench-hpc-35: 0x155dfffd33ff49 16 | hpcbench-hpc-35: 0x155dfffd33ff4a 17 | hpcbench-hpc-36: 0x155dfffd33ff7b 18 | hpcbench-hpc-36: 0x155dfffd33ff7c 19 | hpcbench-hpc-36: 0x155dfffd33ff7d 20 | hpcbench-hpc-36: 0x155dfffd33ff7e 21 | hpcbench-hpc-36: 0x155dfffd33ff7f 22 | hpcbench-hpc-36: 0x155dfffd33ff80 23 | hpcbench-hpc-36: 0x155dfffd33ff81 24 | hpcbench-hpc-36: 0x155dfffd33ff82 25 | hpcbench-hpc-37: 0x155dfffd33ff8b 26 | hpcbench-hpc-37: 0x155dfffd33ff8c 27 | hpcbench-hpc-37: 0x155dfffd33ff8d 28 | hpcbench-hpc-37: 0x155dfffd33ff8e 29 | hpcbench-hpc-37: 0x155dfffd33ff8f 30 | hpcbench-hpc-37: 0x155dfffd33ff90 31 | hpcbench-hpc-37: 0x155dfffd33ff91 32 | hpcbench-hpc-37: 0x155dfffd33ff92 33 | hpcbench-hpc-38: 0x155dfffd33ffa3 34 | hpcbench-hpc-38: 0x155dfffd33ffa4 35 | hpcbench-hpc-38: 0x155dfffd33ffa5 36 | hpcbench-hpc-38: 0x155dfffd33ffa6 37 | hpcbench-hpc-38: 0x155dfffd33ffa7 38 | hpcbench-hpc-38: 0x155dfffd33ffa8 39 | hpcbench-hpc-38: 0x155dfffd33ffa9 40 | hpcbench-hpc-38: 0x155dfffd33ffaa 41 | hpcbench-hpc-39: 0x155dfffd33ffbb 42 | hpcbench-hpc-39: 0x155dfffd33ffbc 43 | hpcbench-hpc-39: 0x155dfffd33ffbd 44 | hpcbench-hpc-39: 0x155dfffd33ffbe 45 | hpcbench-hpc-39: 0x155dfffd33ffbf 46 | hpcbench-hpc-39: 0x155dfffd33ffc0 47 | hpcbench-hpc-39: 0x155dfffd33ffc1 48 | hpcbench-hpc-39: 0x155dfffd33ffc2 49 | hpcbench-hpc-40: 0x155dfffd33ffc3 50 | hpcbench-hpc-40: 0x155dfffd33ffc4 51 | hpcbench-hpc-40: 0x155dfffd33ffc5 52 | hpcbench-hpc-40: 0x155dfffd33ffc6 53 | hpcbench-hpc-40: 0x155dfffd33ffc7 54 | hpcbench-hpc-40: 0x155dfffd33ffc8 55 | hpcbench-hpc-40: 0x155dfffd33ffc9 56 | hpcbench-hpc-40: 0x155dfffd33ffca 57 | hpcbench-hpc-41: 0x155dfffd33ffcb 58 | hpcbench-hpc-41: 0x155dfffd33ffcc 59 | hpcbench-hpc-41: 0x155dfffd33ffcd 60 | hpcbench-hpc-41: 0x155dfffd33ffce 61 | hpcbench-hpc-41: 0x155dfffd33ffcf 62 | hpcbench-hpc-41: 0x155dfffd33ffd0 63 | hpcbench-hpc-41: 0x155dfffd33ffd1 64 | hpcbench-hpc-41: 0x155dfffd33ffd2 65 | hpcbench-hpc-42: 0x155dfffd33ffd3 66 | hpcbench-hpc-42: 0x155dfffd33ffd4 67 | hpcbench-hpc-42: 0x155dfffd33ffd5 68 | hpcbench-hpc-42: 0x155dfffd33ffd6 69 | hpcbench-hpc-42: 0x155dfffd33ffd7 70 | hpcbench-hpc-42: 0x155dfffd33ffd8 71 | hpcbench-hpc-42: 0x155dfffd33ffd9 72 | hpcbench-hpc-42: 0x155dfffd33ffda 73 | hpcbench-hpc-43: 0x155dfffd33ffe3 74 | hpcbench-hpc-43: 0x155dfffd33ffe4 75 | hpcbench-hpc-43: 0x155dfffd33ffe5 76 | hpcbench-hpc-43: 0x155dfffd33ffe6 77 | hpcbench-hpc-43: 0x155dfffd33ffe7 78 | hpcbench-hpc-43: 0x155dfffd33ffe8 79 | hpcbench-hpc-43: 0x155dfffd33ffe9 80 | hpcbench-hpc-43: 0x155dfffd33ffea 81 | hpcbench-hpc-44: 0x155dfffd33ffeb 82 | hpcbench-hpc-44: 0x155dfffd33ffec 83 | hpcbench-hpc-44: 0x155dfffd33ffed 84 
| hpcbench-hpc-44: 0x155dfffd33ffee 85 | hpcbench-hpc-44: 0x155dfffd33ffef 86 | hpcbench-hpc-44: 0x155dfffd33fff0 87 | hpcbench-hpc-44: 0x155dfffd33fff1 88 | hpcbench-hpc-44: 0x155dfffd33fff2 89 | hpcbench-hpc-45: 0x155dfffd34001b 90 | hpcbench-hpc-45: 0x155dfffd34001c 91 | hpcbench-hpc-45: 0x155dfffd34001d 92 | hpcbench-hpc-45: 0x155dfffd34001e 93 | hpcbench-hpc-45: 0x155dfffd34001f 94 | hpcbench-hpc-45: 0x155dfffd340020 95 | hpcbench-hpc-45: 0x155dfffd340021 96 | hpcbench-hpc-45: 0x155dfffd340022 97 | hpcbench-hpc-46: 0x155dfffd340033 98 | hpcbench-hpc-46: 0x155dfffd340034 99 | hpcbench-hpc-46: 0x155dfffd340035 100 | hpcbench-hpc-46: 0x155dfffd340036 101 | hpcbench-hpc-46: 0x155dfffd340037 102 | hpcbench-hpc-46: 0x155dfffd340038 103 | hpcbench-hpc-46: 0x155dfffd340039 104 | hpcbench-hpc-46: 0x155dfffd34003a 105 | hpcbench-hpc-47: 0x155dfffd34004b 106 | hpcbench-hpc-47: 0x155dfffd34004c 107 | hpcbench-hpc-47: 0x155dfffd34004d 108 | hpcbench-hpc-47: 0x155dfffd34004e 109 | hpcbench-hpc-47: 0x155dfffd34004f 110 | hpcbench-hpc-47: 0x155dfffd340050 111 | hpcbench-hpc-47: 0x155dfffd340051 112 | hpcbench-hpc-47: 0x155dfffd340052 113 | hpcbench-hpc-48: 0x155dfffd340053 114 | hpcbench-hpc-48: 0x155dfffd340054 115 | hpcbench-hpc-48: 0x155dfffd340055 116 | hpcbench-hpc-48: 0x155dfffd340056 117 | hpcbench-hpc-48: 0x155dfffd340057 118 | hpcbench-hpc-48: 0x155dfffd340058 119 | hpcbench-hpc-48: 0x155dfffd340059 120 | hpcbench-hpc-48: 0x155dfffd34005a 121 | hpcbench-hpc-49: 0x155dfffd340063 122 | hpcbench-hpc-49: 0x155dfffd340064 123 | hpcbench-hpc-49: 0x155dfffd340065 124 | hpcbench-hpc-49: 0x155dfffd340066 125 | hpcbench-hpc-49: 0x155dfffd340067 126 | hpcbench-hpc-49: 0x155dfffd340068 127 | hpcbench-hpc-49: 0x155dfffd340069 128 | hpcbench-hpc-49: 0x155dfffd34006a 129 | -------------------------------------------------------------------------------- /azure-slurm/test/slurmcc_test/topology_test_input/partitions.txt: -------------------------------------------------------------------------------- 1 | PARTITION 2 | dynamic 3 | gpu 4 | hpc 5 | htc 6 | -------------------------------------------------------------------------------- /azure-slurm/test/slurmcc_test/topology_test_input/powered_down_hostnames.txt: -------------------------------------------------------------------------------- 1 | hpcbench-hpc-2 2 | hpcbench-hpc-3 3 | hpcbench-hpc-4 4 | hpcbench-hpc-5 5 | hpcbench-hpc-6 6 | hpcbench-hpc-7 7 | hpcbench-hpc-8 8 | hpcbench-hpc-9 9 | hpcbench-hpc-10 10 | hpcbench-hpc-11 11 | hpcbench-hpc-12 12 | hpcbench-hpc-13 13 | hpcbench-hpc-14 14 | hpcbench-hpc-15 15 | hpcbench-hpc-16 16 | hpcbench-hpc-17 17 | hpcbench-hpc-18 18 | hpcbench-hpc-19 19 | hpcbench-hpc-20 20 | hpcbench-hpc-21 21 | hpcbench-hpc-22 22 | hpcbench-hpc-23 23 | hpcbench-hpc-24 24 | hpcbench-hpc-25 25 | hpcbench-hpc-26 26 | hpcbench-hpc-27 27 | hpcbench-hpc-28 28 | hpcbench-hpc-29 29 | hpcbench-hpc-30 30 | hpcbench-hpc-31 31 | hpcbench-hpc-32 32 | hpcbench-hpc-33 33 | hpcbench-hpc-34 34 | hpcbench-hpc-50 35 | hpcbench-hpc-51 36 | hpcbench-hpc-52 37 | hpcbench-hpc-53 38 | hpcbench-hpc-54 39 | hpcbench-hpc-55 40 | hpcbench-hpc-56 41 | hpcbench-hpc-57 42 | hpcbench-hpc-58 43 | hpcbench-hpc-59 44 | hpcbench-hpc-60 45 | hpcbench-hpc-61 46 | hpcbench-hpc-62 47 | hpcbench-hpc-63 48 | hpcbench-hpc-64 49 | hpcbench-hpc-65 50 | hpcbench-hpc-66 51 | hpcbench-hpc-67 52 | hpcbench-hpc-68 53 | hpcbench-hpc-69 54 | hpcbench-hpc-70 55 | hpcbench-hpc-71 56 | hpcbench-hpc-72 57 | hpcbench-hpc-73 58 | hpcbench-hpc-74 59 | hpcbench-hpc-75 60 | hpcbench-hpc-76 
61 | hpcbench-hpc-77 62 | hpcbench-hpc-78 63 | hpcbench-hpc-79 64 | -------------------------------------------------------------------------------- /azure-slurm/test/slurmcc_test/topology_test_input/slurm_illegal_block_topology.txt: -------------------------------------------------------------------------------- 1 | # Number of Nodes in block1: 18 2 | # ClusterUUID and CliqueID: 5e797fc6-0f46-421a-8724-0e102c0f723c 3 | # Warning: Block 1 has less than 20 nodes, commenting out 4 | #BlockName=block1 Nodes=hpcbench-hpc-1,hpcbench-hpc-5,hpcbench-hpc-9,hpcbench-hpc-13,hpcbench-hpc-17,hpcbench-hpc-21,hpcbench-hpc-25,hpcbench-hpc-29,hpcbench-hpc-33,hpcbench-hpc-37,hpcbench-hpc-41,hpcbench-hpc-45,hpcbench-hpc-49,hpcbench-hpc-53,hpcbench-hpc-57,hpcbench-hpc-61,hpcbench-hpc-65,hpcbench-hpc-69 5 | # Number of Nodes in block2: 18 6 | # ClusterUUID and CliqueID: 5e797fc6-0f46-421a-8724-0e102c0f721e 7 | # Warning: Block 2 has less than 20 nodes, commenting out 8 | #BlockName=block2 Nodes=hpcbench-hpc-2,hpcbench-hpc-6,hpcbench-hpc-10,hpcbench-hpc-14,hpcbench-hpc-18,hpcbench-hpc-22,hpcbench-hpc-26,hpcbench-hpc-30,hpcbench-hpc-34,hpcbench-hpc-38,hpcbench-hpc-42,hpcbench-hpc-46,hpcbench-hpc-50,hpcbench-hpc-54,hpcbench-hpc-58,hpcbench-hpc-62,hpcbench-hpc-66,hpcbench-hpc-70 9 | # Number of Nodes in block3: 18 10 | # ClusterUUID and CliqueID: 5e797fc6-0f46-421a-8724-0e102c0f724b 11 | # Warning: Block 3 has less than 20 nodes, commenting out 12 | #BlockName=block3 Nodes=hpcbench-hpc-3,hpcbench-hpc-7,hpcbench-hpc-11,hpcbench-hpc-15,hpcbench-hpc-19,hpcbench-hpc-23,hpcbench-hpc-27,hpcbench-hpc-31,hpcbench-hpc-35,hpcbench-hpc-39,hpcbench-hpc-43,hpcbench-hpc-47,hpcbench-hpc-51,hpcbench-hpc-55,hpcbench-hpc-59,hpcbench-hpc-63,hpcbench-hpc-67,hpcbench-hpc-71 13 | # Number of Nodes in block4: 18 14 | # ClusterUUID and CliqueID: 5e797fc6-0f46-421a-8724-0e102c0f722a 15 | # Warning: Block 4 has less than 20 nodes, commenting out 16 | #BlockName=block4 Nodes=hpcbench-hpc-4,hpcbench-hpc-8,hpcbench-hpc-12,hpcbench-hpc-16,hpcbench-hpc-20,hpcbench-hpc-24,hpcbench-hpc-28,hpcbench-hpc-32,hpcbench-hpc-36,hpcbench-hpc-40,hpcbench-hpc-44,hpcbench-hpc-48,hpcbench-hpc-52,hpcbench-hpc-56,hpcbench-hpc-60,hpcbench-hpc-64,hpcbench-hpc-68,hpcbench-hpc-72 17 | BlockSizes=20 18 | -------------------------------------------------------------------------------- /azure-slurm/test/slurmcc_test/topology_test_input/slurm_illegal_block_topology_2.txt: -------------------------------------------------------------------------------- 1 | # Number of Nodes in block1: 18 2 | # ClusterUUID and CliqueID: 5e797fc6-0f46-421a-8724-0e102c0f723c 3 | BlockName=block1 Nodes=hpcbench-hpc-1,hpcbench-hpc-5,hpcbench-hpc-9,hpcbench-hpc-13,hpcbench-hpc-17,hpcbench-hpc-21,hpcbench-hpc-25,hpcbench-hpc-29,hpcbench-hpc-33,hpcbench-hpc-37,hpcbench-hpc-41,hpcbench-hpc-45,hpcbench-hpc-49,hpcbench-hpc-53,hpcbench-hpc-57,hpcbench-hpc-61,hpcbench-hpc-65,hpcbench-hpc-69 4 | # Number of Nodes in block2: 18 5 | # ClusterUUID and CliqueID: 5e797fc6-0f46-421a-8724-0e102c0f724b 6 | BlockName=block2 Nodes=hpcbench-hpc-3,hpcbench-hpc-7,hpcbench-hpc-11,hpcbench-hpc-15,hpcbench-hpc-19,hpcbench-hpc-23,hpcbench-hpc-27,hpcbench-hpc-31,hpcbench-hpc-35,hpcbench-hpc-39,hpcbench-hpc-43,hpcbench-hpc-47,hpcbench-hpc-51,hpcbench-hpc-55,hpcbench-hpc-59,hpcbench-hpc-63,hpcbench-hpc-67,hpcbench-hpc-71 7 | # Number of Nodes in block3: 18 8 | # ClusterUUID and CliqueID: 5e797fc6-0f46-421a-8724-0e102c0f722a 9 | BlockName=block3 
Nodes=hpcbench-hpc-4,hpcbench-hpc-8,hpcbench-hpc-12,hpcbench-hpc-16,hpcbench-hpc-20,hpcbench-hpc-24,hpcbench-hpc-28,hpcbench-hpc-32,hpcbench-hpc-36,hpcbench-hpc-40,hpcbench-hpc-44,hpcbench-hpc-48,hpcbench-hpc-52,hpcbench-hpc-56,hpcbench-hpc-60,hpcbench-hpc-64,hpcbench-hpc-68,hpcbench-hpc-72 10 | # Number of Nodes in block4: 14 11 | # ClusterUUID and CliqueID: 5e797fc6-0f46-421a-8724-0e102c0f721e 12 | # Warning: Block 4 has less than 18 nodes, commenting out 13 | #BlockName=block4 Nodes=hpcbench-hpc-18,hpcbench-hpc-22,hpcbench-hpc-26,hpcbench-hpc-30,hpcbench-hpc-34,hpcbench-hpc-38,hpcbench-hpc-42,hpcbench-hpc-46,hpcbench-hpc-50,hpcbench-hpc-54,hpcbench-hpc-58,hpcbench-hpc-62,hpcbench-hpc-66,hpcbench-hpc-70 14 | BlockSizes=18 15 | -------------------------------------------------------------------------------- /azure-slurm/test/slurmcc_test/topology_test_input/slurm_topology.txt: -------------------------------------------------------------------------------- 1 | # Number of Nodes in sw00: 3 2 | SwitchName=sw00 Nodes=hpcbench-hpc-36,hpcbench-hpc-39,hpcbench-hpc-42 3 | # Number of Nodes in sw01: 6 4 | SwitchName=sw01 Nodes=hpcbench-hpc-1,hpcbench-hpc-35,hpcbench-hpc-38,hpcbench-hpc-41,hpcbench-hpc-44,hpcbench-hpc-49 5 | # Number of Nodes in sw02: 3 6 | SwitchName=sw02 Nodes=hpcbench-hpc-37,hpcbench-hpc-45,hpcbench-hpc-46 7 | # Number of Nodes in sw03: 2 8 | SwitchName=sw03 Nodes=hpcbench-hpc-40,hpcbench-hpc-43 9 | # Number of Nodes in sw04: 2 10 | SwitchName=sw04 Nodes=hpcbench-hpc-47,hpcbench-hpc-48 11 | SwitchName=sw05 Switches=sw00,sw01,sw02,sw03,sw04 12 | -------------------------------------------------------------------------------- /azure-slurm/test/slurmcc_test/topology_test_input/topology.txt: -------------------------------------------------------------------------------- 1 | SwitchName=ibsw1 Nodes=0x155dfffd33ff7d,0x155dfffd33ffbd,0x155dfffd33ffd5 2 | SwitchName=ibsw2 Nodes=0x155dfffd33ff80,0x155dfffd33ffc0,0x155dfffd33ffd8 3 | SwitchName=ibsw3 Nodes=0x155dfffd33ff39,0x155dfffd33ff49,0x155dfffd33ffa9,0x155dfffd33ffd1,0x155dfffd33fff1,0x155dfffd340069 4 | SwitchName=ibsw4 Nodes=0x155dfffd33ff37,0x155dfffd33ff47,0x155dfffd33ffa7,0x155dfffd33ffcf,0x155dfffd33ffef,0x155dfffd340067 5 | SwitchName=ibsw5 Nodes=0x155dfffd33ff3a,0x155dfffd33ff4a,0x155dfffd33ffaa,0x155dfffd33ffd2,0x155dfffd33fff2,0x155dfffd34006a 6 | SwitchName=ibsw6 Nodes=0x155dfffd33ff35,0x155dfffd33ff45,0x155dfffd33ffa5,0x155dfffd33ffcd,0x155dfffd33ffed,0x155dfffd340065 7 | SwitchName=ibsw7 Nodes=0x155dfffd33ff33,0x155dfffd33ff43,0x155dfffd33ffa3,0x155dfffd33ffcb,0x155dfffd33ffeb,0x155dfffd340063 8 | SwitchName=ibsw8 Nodes=0x155dfffd33ff8d,0x155dfffd34001d,0x155dfffd340035 9 | SwitchName=ibsw9 Nodes=0x155dfffd33ff34,0x155dfffd33ff44,0x155dfffd33ffa4,0x155dfffd33ffcc,0x155dfffd33ffec,0x155dfffd340064 10 | SwitchName=ibsw10 Nodes=0x155dfffd33ff38,0x155dfffd33ff48,0x155dfffd33ffa8,0x155dfffd33ffd0,0x155dfffd33fff0,0x155dfffd340068 11 | SwitchName=ibsw11 Nodes=0x155dfffd33ff8b,0x155dfffd34001b,0x155dfffd340033 12 | SwitchName=ibsw12 Nodes=0x155dfffd33ff36,0x155dfffd33ff46,0x155dfffd33ffa6,0x155dfffd33ffce,0x155dfffd33ffee,0x155dfffd340066 13 | SwitchName=ibsw13 Nodes=0x155dfffd33ff92,0x155dfffd340022,0x155dfffd34003a 14 | SwitchName=ibsw14 Switches=ibsw3,ibsw16,ibsw17,ibsw28,ibsw34,ibsw47 15 | SwitchName=ibsw15 Nodes=0x155dfffd33ffc6,0x155dfffd33ffe6 16 | SwitchName=ibsw16 Nodes=0x155dfffd33ffc9,0x155dfffd33ffe9 17 | SwitchName=ibsw17 Nodes=0x155dfffd33ff91,0x155dfffd340021,0x155dfffd340039 18 | SwitchName=ibsw18 
Nodes=0x155dfffd33ff90,0x155dfffd340020,0x155dfffd340038 19 | SwitchName=ibsw19 Nodes=0x155dfffd33ff8f,0x155dfffd34001f,0x155dfffd340037 20 | SwitchName=ibsw20 Nodes=0x155dfffd33ff8c,0x155dfffd34001c,0x155dfffd340034 21 | SwitchName=ibsw21 Nodes=0x155dfffd33ff8e,0x155dfffd34001e,0x155dfffd340036 22 | SwitchName=ibsw22 Nodes=0x155dfffd33ff7f,0x155dfffd33ffbf,0x155dfffd33ffd7 23 | SwitchName=ibsw23 Nodes=0x155dfffd33ffc8,0x155dfffd33ffe8 24 | SwitchName=ibsw24 Nodes=0x155dfffd33ff82,0x155dfffd33ffc2,0x155dfffd33ffda 25 | SwitchName=ibsw25 Nodes=0x155dfffd33ff7c,0x155dfffd33ffbc,0x155dfffd33ffd4 26 | SwitchName=ibsw26 Switches=ibsw2,ibsw10,ibsw18,ibsw23,ibsw40,ibsw47 27 | SwitchName=ibsw27 Switches=ibsw1,ibsw6,ibsw8,ibsw38,ibsw41,ibsw47 28 | SwitchName=ibsw28 Nodes=0x155dfffd33ff81,0x155dfffd33ffc1,0x155dfffd33ffd9 29 | SwitchName=ibsw29 Nodes=0x155dfffd33ffc4,0x155dfffd33ffe4 30 | SwitchName=ibsw30 Nodes=0x155dfffd33ff7b,0x155dfffd33ffbb,0x155dfffd33ffd3 31 | SwitchName=ibsw31 Nodes=0x155dfffd33ffca,0x155dfffd33ffea 32 | SwitchName=ibsw32 Nodes=0x155dfffd33ff7e,0x155dfffd33ffbe,0x155dfffd33ffd6 33 | SwitchName=ibsw33 Nodes=0x155dfffd34004e,0x155dfffd340056 34 | SwitchName=ibsw34 Nodes=0x155dfffd340051,0x155dfffd340059 35 | SwitchName=ibsw35 Nodes=0x155dfffd34004f,0x155dfffd340057 36 | SwitchName=ibsw36 Nodes=0x155dfffd34004c,0x155dfffd340054 37 | SwitchName=ibsw37 Nodes=0x155dfffd340052,0x155dfffd34005a 38 | SwitchName=ibsw38 Nodes=0x155dfffd34004d,0x155dfffd340055 39 | SwitchName=ibsw39 Nodes=0x155dfffd34004b,0x155dfffd340053 40 | SwitchName=ibsw40 Nodes=0x155dfffd340050,0x155dfffd340058 41 | SwitchName=ibsw41 Nodes=0x155dfffd33ffc5,0x155dfffd33ffe5 42 | SwitchName=ibsw42 Nodes=0x155dfffd33ffc7,0x155dfffd33ffe7 43 | SwitchName=ibsw43 Nodes=0x155dfffd33ffc3,0x155dfffd33ffe3 44 | SwitchName=ibsw44 Switches=ibsw9,ibsw20,ibsw25,ibsw29,ibsw36,ibsw47 45 | SwitchName=ibsw45 Switches=ibsw7,ibsw11,ibsw30,ibsw39,ibsw43,ibsw47 46 | SwitchName=ibsw46 Switches=ibsw12,ibsw15,ibsw21,ibsw32,ibsw33,ibsw47 47 | SwitchName=ibsw47 Switches=ibsw14,ibsw26,ibsw27,ibsw44,ibsw45,ibsw46,ibsw48,ibsw49 48 | SwitchName=ibsw48 Switches=ibsw5,ibsw13,ibsw24,ibsw31,ibsw37,ibsw47 49 | SwitchName=ibsw49 Switches=ibsw4,ibsw19,ibsw22,ibsw35,ibsw42,ibsw47 -------------------------------------------------------------------------------- /azure-slurm/test/slurmcc_test/topology_test_input/valid_hostnames.txt: -------------------------------------------------------------------------------- 1 | hpcbench-hpc-1 2 | hpcbench-hpc-35 3 | hpcbench-hpc-36 4 | hpcbench-hpc-37 5 | hpcbench-hpc-38 6 | hpcbench-hpc-39 7 | hpcbench-hpc-40 8 | hpcbench-hpc-41 9 | hpcbench-hpc-42 10 | hpcbench-hpc-43 11 | hpcbench-hpc-44 12 | hpcbench-hpc-45 13 | hpcbench-hpc-46 14 | hpcbench-hpc-47 15 | hpcbench-hpc-48 16 | hpcbench-hpc-49 17 | -------------------------------------------------------------------------------- /azure-slurm/test/slurmcc_test/util_test.py: -------------------------------------------------------------------------------- 1 | from slurmcc.util import to_hostlist, get_sort_key_func, run, _show_nodes, set_slurm_cli, _from_hostlist, _to_hostlist 2 | from slurmcc_test.testutil import MockNativeSlurmCLI 3 | from typing import List 4 | import subprocess 5 | 6 | def scontrol_func(args: List[str], retry: bool = True) -> str: 7 | if len(args) < 2: 8 | raise RuntimeError() 9 | if args[0:2] == ["show", "hostlist"]: 10 | return fake_to_hostlist(args[2]) 11 | 12 | 13 | def fake_to_hostlist(expr: str) -> str: 14 | """ 15 | this function will take a 
list of nodes like name-[1-5],other and convert it into a comma separated list of nodes 16 | like name-1,name-2,name-3,name-4,name-5,other 17 | """ 18 | if not expr: 19 | return "" 20 | nodes = expr.split(",") 21 | ret = [] 22 | for node in nodes: 23 | if "[" in node: 24 | prefix = node[0: node.index("[")] 25 | first, last = node.replace("]", "").split("[")[1].split("-")[0:2] 26 | ret.extend([f"{prefix}{i}" for i in range(int(first), int(last) + 1)]) 27 | else: 28 | ret.append(node) 29 | return ",".join(ret) 30 | 31 | 32 | def test_get_sort_key_func() -> None: 33 | assert ["name-1", "dyn"] == sorted(["name-1","dyn"], key=get_sort_key_func(is_hpc=False)) 34 | assert ["dyna", "dynb"] == sorted(["dyna","dynb"], key=get_sort_key_func(is_hpc=False)) 35 | 36 | 37 | def test_to_hostlist() -> None: 38 | assert "name-1,dyn" == to_hostlist(["name-1","dyn"], scontrol_func) 39 | assert "name-1,dyn" == to_hostlist(["dyn","name-1"], scontrol_func) 40 | 41 | def test_run_function() -> None: 42 | out = run(['ls', '-l'], shell=True) 43 | assert out.returncode == 0 44 | 45 | out = run(['cat', '/proc/loadavg'], shell=False) 46 | assert out.returncode == 0 47 | 48 | # test case for permissions errors: check the returncode reported by the raised error 49 | try: 50 | out = run(['touch', '/root/.test']) 51 | except subprocess.CalledProcessError as e: 52 | assert e.returncode != 0 53 | 54 | 55 | def test_show_nodes() -> None: 56 | # no differences based on splitting 57 | cli = MockNativeSlurmCLI() 58 | node_list = ["htc-1", "htc-2", "htc-3", "htc-4"] 59 | set_slurm_cli(cli) 60 | cli.create_nodes(node_list, ["cloud"], ["htc"]) 61 | complete = _show_nodes(node_list, 4) 62 | split = _show_nodes(node_list, 2) 63 | assert split == complete 64 | 65 | 66 | def test_from_hostlist() -> None: 67 | # becomes so large we actually can't express htc-[min-max] 68 | # so the final result is actually different 69 | cli = MockNativeSlurmCLI() 70 | node_list = ["htc-1", "htc-2", "htc-3", "htc-4"] 71 | set_slurm_cli(cli) 72 | def simple_scontrol(args, ignore): 73 | assert args[0] == "show" 74 | assert args[1] == "hostnames" 75 | if args[2] == "htc-1,htc-2": 76 | return "htc-[1-2]" 77 | if args[2] == "htc-3,htc-4": 78 | return "htc-[3-4]" 79 | if args[2] == "htc-1,htc-2,htc-3,htc-4": 80 | return "htc-[1-4]" 81 | raise RuntimeError(args) 82 | 83 | 84 | cli.create_nodes(node_list, ["cloud"], ["htc"]) 85 | cli.scontrol = simple_scontrol 86 | complete = _from_hostlist(",".join(node_list), 4) 87 | split = _from_hostlist(",".join(node_list), 2) 88 | assert split != complete 89 | assert complete == ["htc-[1-4]"] 90 | assert split == ["htc-[1-2]", "htc-[3-4]"] 91 | 92 | 93 | def test_to_hostlist_splitting() -> None: 94 | # no changes based on splitting 95 | # confirmed this caused a failure with over 2k nodes 96 | # dropping to 500 fixed the issue 97 | cli = MockNativeSlurmCLI() 98 | node_list = ["htc-1", "htc-2", "htc-3", "htc-4"] 99 | set_slurm_cli(cli) 100 | cli.scontrol = scontrol_func # already implemented fake to hostlist 101 | complete = _to_hostlist(node_list, max_nodes_in_list=4) 102 | split = _to_hostlist(node_list, max_nodes_in_list=2) 103 | assert split == complete 104 | -------------------------------------------------------------------------------- /description.html: -------------------------------------------------------------------------------- 1 |
Slurm icon

Slurm is a highly configurable open source workload manager. See the Slurm project site for an overview.

Follow the instructions in the README for details on extending and configuring the Project for your environment.

-------------------------------------------------------------------------------- /dev-requirements.txt: -------------------------------------------------------------------------------- 1 | mypy==0.710 2 | autoflake==1.3.1 3 | black==19.10b0 4 | pytest 5 | tabulate 6 | flake8 7 | hypothesis 8 | isort==4.3.21 9 | typing_extensions==3.6.6 10 | -------------------------------------------------------------------------------- /docker-package.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | echo see .build.log for more information 3 | log_file=".build.log" 4 | check_dirty_changes() { 5 | if [ -n "$(git status --porcelain)" ]; then 6 | echo "Error: There are uncommitted changes in the current branch. Please commit or stash them before running this script." 7 | exit 1 8 | fi 9 | } 10 | 11 | # Call the function to check for dirty changes 12 | check_dirty_changes 13 | 14 | print_branch_and_last_commit() { 15 | branch=$(git rev-parse --abbrev-ref HEAD) 16 | last_commit=$(git log -1 --pretty=format:"%h - %s (%ci)") 17 | 18 | printf "%-20s: %s\n" "Current branch" "$branch" 19 | printf "%-20s: %s\n" "Last commit" "$last_commit" 20 | } 21 | 22 | delete_existing_blobs() { 23 | printf "%-20s: %s\n" "Deleting existing Blob Files" "" 24 | while IFS= read -r file; do 25 | if [ -f "blobs/$file" ]; then 26 | printf "%-20s: %s\n" "" "$file" 27 | rm -f "blobs/$file" || exit 1 28 | fi 29 | done < <(awk -F' *= *' '/^\[blobs\]/ {found=1} found && /^Files/ {gsub(/, */, "\n", $2); print $2; exit}' project.ini) 30 | } 31 | 32 | check_blobs_files_exist() { 33 | local version="$1" 34 | local missing_files=0 35 | 36 | printf "%-20s: %s\n" "Blob Files" "" 37 | while IFS= read -r file; do 38 | printf "%-20s: %s\n" "" "$file" 39 | if [ ! -f "blobs/$file" ]; then 40 | echo "Error: File blobs/$file does not exist." 41 | missing_files=1 42 | fi 43 | done < <(awk -F' *= *' '/^\[blobs\]/ {found=1} found && /^Files/ {gsub(/, */, "\n", $2); print $2; exit}' project.ini) 44 | 45 | if [ $missing_files -eq 1 ]; then 46 | echo "One or more required files are missing in the blobs directory." 47 | exit 1 48 | 49 | fi 50 | } 51 | 52 | get_version_from_project_ini() { 53 | version=$(awk -F' *= *' '/^\[project\]/ {found=1} found && /^version/ {print $2; exit}' project.ini) 54 | printf "%-20s: %s\n" "Project Version" "$version" 55 | check_blobs_files_exist "$version" 56 | } 57 | 58 | local_azslurm=/source/ 59 | if [ "$1" != "" ]; then 60 | scalelib=$(realpath $1) 61 | local_scalelib=/source/cyclecloud-scalelib 62 | extra_args="-v ${scalelib}:${local_scalelib}" 63 | fi 64 | 65 | if command -v docker; then 66 | runtime=docker 67 | runtime_args= 68 | elif command -v podman; then 69 | runtime=podman 70 | runtime_args="--privileged" 71 | else 72 | echo "docker or podman binary not found. Install docker or podman to build RPMs with this script" 73 | exit 1 74 | fi 75 | 76 | { 77 | delete_existing_blobs 78 | # allows caching 79 | $runtime build -t azslurm_build:latest -f util/Dockerfile .
80 | $runtime run -v $(pwd):${local_azslurm} $runtime_args $extra_args -ti azslurm_build:latest /bin/bash ${local_azslurm}/util/build.sh $local_scalelib 81 | } &> $log_file 82 | 83 | # Call the function to print the branch and the last commit 84 | print_branch_and_last_commit 85 | get_version_from_project_ini 86 | -------------------------------------------------------------------------------- /icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/cyclecloud-slurm/5e2514620b752ea6e194fd7b55f2d6d204e20e53/icon.png -------------------------------------------------------------------------------- /images/nodearrayedit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/cyclecloud-slurm/5e2514620b752ea6e194fd7b55f2d6d204e20e53/images/nodearrayedit.png -------------------------------------------------------------------------------- /images/nodearraytab.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/cyclecloud-slurm/5e2514620b752ea6e194fd7b55f2d6d204e20e53/images/nodearraytab.png -------------------------------------------------------------------------------- /images/schedulernodeedit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/cyclecloud-slurm/5e2514620b752ea6e194fd7b55f2d6d204e20e53/images/schedulernodeedit.png -------------------------------------------------------------------------------- /integration/README.md: -------------------------------------------------------------------------------- 1 | # Running the integration tests 2 | 3 | ## Prerequisites 4 | Make sure `cyclecloud` is in your path and is working. 5 | 6 | Highly recommended that you have `$CS_HOME` defined as well. 7 | 8 | Lastly, I recommend you clear out `$CS_HOME/work/staging/projects/slurm` so that the latest artifacts are 9 | downloaded from GitHub. 10 | 11 | ## Create a parameters json file 12 | We need some common parameters for your clusters. Save these as `integration/params.json`, filling in the values 13 | marked with `{}` 14 | 15 | ```json 16 | { 17 | "UsePublicNetwork" : false, 18 | "Region" : "{myregion}", 19 | "NumberLoginNodes" : 1, 20 | "Credentials" : "{mycred, probably cloud}", 21 | "ExecuteNodesPublic" : false, 22 | "SubnetId" : "{my-persistent-rg}/{myvnet}/default", 23 | "ReturnProxy" : false 24 | } 25 | ``` 26 | 27 | ## Create an NFS cluster 28 | You need to create an NFS cluster, as most of our integration tests require this. You 29 | can set this up separately, but there is a handy command to do this automatically. Note that you 30 | need CS_HOME defined. 31 | 32 | ```bash 33 | python3 src/integration.py setup_nfs -p params.json 34 | ``` 35 | 36 | Otherwise, make sure that `/sched` and `/shared` are exported and restart `nfs-mountd`. 37 | 38 | ```bash 39 | echo '/mnt/exports/sched *(rw,sync,no_root_squash)' >> /etc/exports 40 | echo '/mnt/exports/shared *(rw,sync,no_root_squash)' >> /etc/exports 41 | systemctl restart nfs-mountd 42 | ``` 43 | 44 | 45 | ### Import the clusters 46 | 47 | ```bash 48 | $ python3 src/integration.py import -p params.json 49 | # Note pass in -n {nfs instance ip address} if you are using your own NFS instance 50 | ``` 51 | 52 |
53 | --help 54 | 55 | ```bash 56 | $ python3 src/integration.py import --help 57 | usage: integration.py import [-h] [--skip-stage-resources] --properties PROPERTIES --nfs-address NFS_ADDRESS 58 | 59 | optional arguments: 60 | -h, --help show this help message and exit 61 | --skip-stage-resources 62 | --properties PROPERTIES, -p PROPERTIES 63 | --nfs-address NFS_ADDRESS, -n NFS_ADDRESS 64 | ``` 65 | 66 | Only use `--skip-stage-resources` when you are running these before a GitHub release is available. 67 |
68 | 69 | 70 | 71 | ### Start the clusters 72 | There is a command for starting _all_ of the tests, or you can start them manually with `cyclecloud start_cluster` 73 | 74 | ```bash 75 | $ python3 src/integration.py start 76 | ``` 77 | 78 | To start just a single cluster 79 | ```bash 80 | cyclecloud start_cluster {cluster_name} --test 81 | ``` 82 | 83 |
84 | --help 85 | 86 | ```bash 87 | $ python3 src/integration.py start --help 88 | usage: integration.py start [-h] [--skip-tests] 89 | 90 | optional arguments: 91 | -h, --help show this help message and exit 92 | --skip-tests 93 | 94 | $ python3 src/integration.py start 95 | ``` 96 |
97 | 98 | ### Shutdown and delete the clusters 99 | Note that unless you pass in `--include-nfs`, the `integration-nfs` cluster will not be shutdown/deleted. 100 | ```bash 101 | $ python3 src/integration.py shutdown [--include-nfs] 102 | $ python3 src/integration.py delete [--include-nfs] 103 | ``` -------------------------------------------------------------------------------- /project.ini: -------------------------------------------------------------------------------- 1 | [project] 2 | name = slurm 3 | label = Slurm 4 | version = 4.0.3 5 | type = scheduler 6 | 7 | [blobs] 8 | Files = azure-slurm-pkg-4.0.3.tar.gz, azure-slurm-install-pkg-4.0.3.tar.gz 9 | 10 | [spec scheduler] 11 | run_list = role[slurm_scheduler_role] 12 | 13 | [spec execute] 14 | run_list = role[slurm_execute_role] 15 | 16 | [spec login] 17 | run_list = role[slurm_login_role] 18 | 19 | [config slurm.version] 20 | Required = True 21 | Label = Slurm Version 22 | Description = Version of Slurm to install on the cluster 23 | ParameterType = StringList 24 | Config.Plugin = pico.form.Dropdown 25 | Config.FreeForm = true 26 | Config.Entries := {[Value="25.05.2"]} 27 | DefaultValue = 25.05.2 28 | 29 | [config slurm.shutdown_policy] 30 | Label = ShutdownPolicy 31 | description = By default, autostop will Delete stopped VMS for lowest cost. Optionally, Stop/Deallocate the VMs for faster restart instead. 32 | DefaultValue = Terminate 33 | config.plugin = pico.control.AutoCompleteDropdown 34 | [[[[list Config.Entries]]]] 35 | Name = Terminate 36 | Label = Terminate 37 | [[[[list Config.Entries]]]] 38 | Name = Deallocate 39 | Label = Deallocate 40 | 41 | -------------------------------------------------------------------------------- /specs/default/chef/roles/slurm_execute_role.rb: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | name "slurm_execute_role" 4 | description "Slurm Execute Role" 5 | run_list("recipe[cyclecloud]", 6 | "recipe[cshared::client]", 7 | "recipe[cuser]", 8 | "recipe[slurm::delayed_services]") 9 | default_attributes "slurm" => { "role" => "execute" } -------------------------------------------------------------------------------- /specs/default/chef/roles/slurm_login_role.rb: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | name "slurm_login_role" 4 | description "Slurm Login Role" 5 | run_list("recipe[cyclecloud]", 6 | "recipe[cshared::client]", 7 | "recipe[cuser]", 8 | "recipe[slurm::delayed_services]") 9 | 10 | default_attributes "slurm" => { "role" => "login" } -------------------------------------------------------------------------------- /specs/default/chef/roles/slurm_scheduler_role.rb: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 
3 | name "slurm_scheduler_role" 4 | description "Slurm Scheduler Role" 5 | run_list("role[scheduler]", 6 | "recipe[cyclecloud]", 7 | "recipe[cshared::directories]", 8 | "recipe[cuser]", 9 | "recipe[cshared::server]", 10 | "recipe[slurm::delayed_services]") 11 | default_attributes "cyclecloud" => { "discoverable" => true }, "slurm" => { "role" => "scheduler" } 12 | -------------------------------------------------------------------------------- /specs/default/chef/site-cookbooks/slurm/.gitignore: -------------------------------------------------------------------------------- 1 | .vagrant 2 | *~ 3 | *# 4 | .#* 5 | \#*# 6 | .*.sw[a-z] 7 | *.un~ 8 | 9 | # Bundler 10 | Gemfile.lock 11 | gems.locked 12 | bin/* 13 | .bundle/* 14 | 15 | # test kitchen 16 | .kitchen/ 17 | .kitchen.local.yml 18 | 19 | # Chef 20 | Berksfile.lock 21 | .zero-knife.rb 22 | Policyfile.lock.json 23 | -------------------------------------------------------------------------------- /specs/default/chef/site-cookbooks/slurm/README.md: -------------------------------------------------------------------------------- 1 | # slurm 2 | 3 | TODO: Enter the cookbook description here. 4 | 5 | -------------------------------------------------------------------------------- /specs/default/chef/site-cookbooks/slurm/attributes/default.rb: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | default[:slurm][:autoscale_version] = "4.0.3" 4 | default[:slurm][:version] = "23.11.9-1" 5 | default[:slurm][:user][:name] = 'slurm' 6 | default[:slurm][:cyclecloud_api] = "cyclecloud_api-8.4.1-py2.py3-none-any.whl" 7 | default[:slurm][:autoscale_dir] = "/opt/azurehpc/slurm" 8 | default[:slurm][:autoscale_pkg] = "azure-slurm-pkg-#{default[:slurm][:autoscale_version]}.tar.gz" 9 | default[:slurm][:install_pkg] = "azure-slurm-install-pkg-#{default[:slurm][:autoscale_version]}.tar.gz" 10 | default[:slurm][:install] = true 11 | default[:slurm][:use_nodename_as_hostname] = false 12 | default[:cyclecloud][:hosts][:simple_vpc_dns][:enabled] = false 13 | default[:cyclecloud][:hosts][:standalone_dns][:enabled] = false 14 | default[:slurm][:additional][:config] = "" 15 | default[:slurm][:ensure_waagent_monitor_hostname] = true 16 | 17 | # WORKAROUND: This should not need to be set here, but unexpectedly the default is sometimes being set 18 | # back to /home. 19 | default[:cuser][:base_home_dir] = "/shared/home" 20 | 21 | myplatform=node[:platform_family] 22 | case myplatform 23 | when 'ubuntu', 'debian' 24 | default[:slurm][:arch] = "amd64" 25 | default[:slurm][:user][:uid] = 64030 26 | default[:slurm][:user][:gid] = 64030 27 | when 'centos', 'rhel', 'redhat', 'almalinux', 'suse' 28 | if node[:platform_version] < "8"; 29 | default[:slurm][:arch] = "el7.x86_64" 30 | else 31 | default[:slurm][:arch] = "el8.x86_64" 32 | end 33 | default[:slurm][:user][:uid] = 11100 34 | default[:slurm][:user][:gid] = 11100 35 | end 36 | default[:munge][:user][:name] = 'munge' 37 | default[:munge][:user][:uid] = 11101 38 | default[:munge][:user][:gid] = 11101 39 | # Time between a suspend call and when that node can be used again - i.e. 
10 minutes to shutdown 40 | default[:slurm][:suspend_timeout] = 600 41 | # Boot timeout 42 | default[:slurm][:resume_timeout] = 1800 43 | 44 | default[:slurm][:accounting][:enabled] = false 45 | default[:slurm][:accounting][:url] = 'localhost' 46 | 47 | default[:slurm][:ha_enabled] = false 48 | -------------------------------------------------------------------------------- /specs/default/chef/site-cookbooks/slurm/chefignore: -------------------------------------------------------------------------------- 1 | # Put files/directories that should be ignored in this file when uploading 2 | # to a chef-server or supermarket. 3 | # Lines that start with '# ' are comments. 4 | 5 | # OS generated files # 6 | ###################### 7 | .DS_Store 8 | Icon? 9 | nohup.out 10 | ehthumbs.db 11 | Thumbs.db 12 | 13 | # SASS # 14 | ######## 15 | .sass-cache 16 | 17 | # EDITORS # 18 | ########### 19 | \#* 20 | .#* 21 | *~ 22 | *.sw[a-z] 23 | *.bak 24 | REVISION 25 | TAGS* 26 | tmtags 27 | *_flymake.* 28 | *_flymake 29 | *.tmproj 30 | .project 31 | .settings 32 | mkmf.log 33 | 34 | ## COMPILED ## 35 | ############## 36 | a.out 37 | *.o 38 | *.pyc 39 | *.so 40 | *.com 41 | *.class 42 | *.dll 43 | *.exe 44 | */rdoc/ 45 | 46 | # Testing # 47 | ########### 48 | .watchr 49 | .rspec 50 | spec/* 51 | spec/fixtures/* 52 | test/* 53 | features/* 54 | examples/* 55 | Guardfile 56 | Procfile 57 | .kitchen* 58 | .rubocop.yml 59 | spec/* 60 | Rakefile 61 | .travis.yml 62 | .foodcritic 63 | .codeclimate.yml 64 | 65 | # SCM # 66 | ####### 67 | .git 68 | */.git 69 | .gitignore 70 | .gitmodules 71 | .gitconfig 72 | .gitattributes 73 | .svn 74 | */.bzr/* 75 | */.hg/* 76 | */.svn/* 77 | 78 | # Berkshelf # 79 | ############# 80 | Berksfile 81 | Berksfile.lock 82 | cookbooks/* 83 | tmp 84 | 85 | # Policyfile # 86 | ############## 87 | Policyfile.rb 88 | Policyfile.lock.json 89 | 90 | # Cookbooks # 91 | ############# 92 | CONTRIBUTING* 93 | CHANGELOG* 94 | TESTING* 95 | MAINTAINERS.toml 96 | 97 | # Strainer # 98 | ############ 99 | Colanderfile 100 | Strainerfile 101 | .colander 102 | .strainer 103 | 104 | # Vagrant # 105 | ########### 106 | .vagrant 107 | Vagrantfile 108 | -------------------------------------------------------------------------------- /specs/default/chef/site-cookbooks/slurm/libraries/helpers.rb: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | module Slurm 4 | class Helpers 5 | 6 | def self.wait_for_master(sleep_time=10, max_retries=6, &block) 7 | results = block.call 8 | retries = 0 9 | while results.length < 2 and retries < max_retries 10 | sleep sleep_time 11 | retries += 1 12 | results = block.call 13 | Chef::Log.info "Found primary slurmctld node." 
14 | end 15 | if retries >= max_retries 16 | raise Exception, "Timed out waiting for primary slurmctld" 17 | end 18 | 19 | results 20 | end 21 | 22 | end 23 | end -------------------------------------------------------------------------------- /specs/default/chef/site-cookbooks/slurm/metadata.rb: -------------------------------------------------------------------------------- 1 | name 'slurm' 2 | maintainer 'Microsoft' 3 | maintainer_email 'support@cyclecomputing.com' 4 | license 'All Rights Reserved' 5 | description 'Installs/Configures slurm' 6 | long_description 'Installs/Configures slurm' 7 | version '4.0.3' 8 | chef_version '>= 12.1' if respond_to?(:chef_version) 9 | 10 | %w{ cuser cshared }.each {|c| depends c} 11 | 12 | # The `issues_url` points to the location where issues for this cookbook are 13 | # tracked. A `View Issues` link will be displayed on this cookbook's page when 14 | # uploaded to a Supermarket. 15 | # 16 | # issues_url 'https://github.com//slurm/issues' 17 | 18 | # The `source_url` points to the development repository for this cookbook. A 19 | # `View Source` link will be displayed on this cookbook's page when uploaded to 20 | # a Supermarket. 21 | # 22 | # source_url 'https://github.com//slurm' 23 | 24 | -------------------------------------------------------------------------------- /specs/default/chef/site-cookbooks/slurm/recipes/delayed_services.rb: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | # Recipe:: delayed_services 5 | 6 | # This recipe is used to delay the start of slurmctld and slurmd services until 7 | # cluster init has finished 8 | 9 | defer_block 'Delayed start of services' do 10 | cmd = "#{node[:cyclecloud][:bootstrap]}/azure-slurm-install/start-services.sh #{node[:slurm][:role]} >> #{node[:cyclecloud][:bootstrap]}/azure-slurm-install/start-services.log 2>&1" 11 | Chef::Log.info "Executing #{cmd}" 12 | execute "delayed_start_of_services" do 13 | command cmd 14 | end 15 | 16 | end -------------------------------------------------------------------------------- /specs/default/cluster-init/files/JobSubmitPlugin/job_submit_cyclecloud.lua: -------------------------------------------------------------------------------- 1 | -- Copyright (c) Microsoft Corporation. All rights reserved. 2 | -- Licensed under the MIT License. 
3 | 4 | function slurm_job_submit(job_desc, part_list, submit_uid) 5 | if job_desc.argv ~= nil then 6 | for i = 0, job_desc.argc, 1 do 7 | if job_desc.argv[i] == "--switches" then 8 | slurm.log_info("--switches was set, ignoring."); 9 | return slurm.SUCCESS; 10 | end 11 | end 12 | end 13 | if job_desc.network ~= nil and job_desc.network ~= '' then 14 | if job_desc.network == "sn_single" then 15 | slurm.log_info("sn_single was set, ignoring."); 16 | return slurm.SUCCESS 17 | end 18 | end 19 | slurm.log_info("Setting reqswitch to 1."); 20 | job_desc.req_switch = 1; 21 | 22 | slurm.log_info("returning."); 23 | 24 | return slurm.SUCCESS 25 | end 26 | 27 | function slurm_job_modify(job_desc, job_rec, part_list, modify_uid) 28 | return slurm.SUCCESS 29 | end 30 | 31 | slurm.log_info("initialized job_submit_cyclecloud") 32 | return slurm.SUCCESS 33 | -------------------------------------------------------------------------------- /specs/default/cluster-init/files/README.txt: -------------------------------------------------------------------------------- 1 | 2 | Files in this directory are automatically synced to any node using this spec. Content here 3 | can be anything from software packages to config files. Scripts can be used to install 4 | software packages or move files into the appropriate location on the node. 5 | -------------------------------------------------------------------------------- /specs/default/cluster-init/files/install-non-scheduler.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | mode=$1 5 | echo $mode | grep -Eqw "login|execute" || (echo "Usage: $0 [login|execute]" && exit 1) 6 | 7 | do_install=$(jetpack config slurm.do_install True) 8 | install_pkg=$(jetpack config slurm.install_pkg azure-slurm-install-pkg-4.0.3.tar.gz) 9 | slurm_project_name=$(jetpack config slurm.project_name slurm) 10 | 11 | 12 | cd $CYCLECLOUD_HOME/system/bootstrap 13 | if [ $do_install == "True" ]; then 14 | 15 | jetpack download --project $slurm_project_name $install_pkg 16 | tar xzf $install_pkg 17 | cd azure-slurm-install 18 | python3 install.py --mode $mode --bootstrap-config /opt/cycle/jetpack/config/node.json 19 | fi 20 | 21 | echo "installation complete. Run start-services scheduler|execute|login to start the slurm services." 22 | -------------------------------------------------------------------------------- /specs/default/cluster-init/scripts/README.txt: -------------------------------------------------------------------------------- 1 | 2 | Files in this directory are executed on the host in alphabetical order. 3 | It is recommended that files are named start with digits to ensure they 4 | are executed in the correct order, example: 5 | - 000_run_me_first.sh 6 | - 001_run_me_second.sh 7 | 8 | Allowable file extensions on Linux: .sh 9 | Allowable file extensions on Windows: .bat, .cmd, .exe 10 | -------------------------------------------------------------------------------- /specs/default/cluster-init/tests/README.txt: -------------------------------------------------------------------------------- 1 | 2 | Files in this directory contains tests that will be run at cluster start 3 | when in testing mode. Please see the official documentation for more information 4 | on cluster testing. 
5 | -------------------------------------------------------------------------------- /specs/default/cluster-init/tests/test_uid.py: -------------------------------------------------------------------------------- 1 | #!/opt/cycle/jetpack/system/embedded/bin/python -m pytest 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | import subprocess 5 | import jetpack.config 6 | 7 | 8 | def test_slurm_uid(): 9 | suid = jetpack.config.get('slurm.user.uid') 10 | suser = jetpack.config.get('slurm.user.name', 'slurm') 11 | muid = jetpack.config.get('munge.user.uid') 12 | muser = jetpack.config.get('munge.user.name', 'munge') 13 | # Check that slurm uid and username match what is in data store 14 | assert subprocess.check_output(["id", "-u", suser]).decode().strip() == str(suid) 15 | 16 | assert subprocess.check_output(["id", "-u", muser]).decode().strip() == str(muid) 17 | -------------------------------------------------------------------------------- /specs/execute/cluster-init/scripts/00-install-execute.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | $SHELL /mnt/cluster-init/slurm/default/files/install-non-scheduler.sh execute -------------------------------------------------------------------------------- /specs/login/cluster-init/scripts/00-install-login.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | $SHELL /mnt/cluster-init/slurm/default/files/install-non-scheduler.sh login -------------------------------------------------------------------------------- /specs/scheduler/cluster-init/scripts/00-install.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | do_install=$(jetpack config slurm.do_install True) 5 | install_pkg=$(jetpack config slurm.install_pkg azure-slurm-install-pkg-4.0.3.tar.gz) 6 | autoscale_pkg=$(jetpack config slurm.autoscale_pkg azure-slurm-pkg-4.0.3.tar.gz) 7 | slurm_project_name=$(jetpack config slurm.project_name slurm) 8 | 9 | find_python3() { 10 | export PATH=$(echo $PATH | sed -e 's/\/opt\/cycle\/jetpack\/system\/embedded\/bin://g' | sed -e 's/:\/opt\/cycle\/jetpack\/system\/embedded\/bin//g') 11 | if [ ! -z $AZSLURM_PYTHON_PATH]; then 12 | echo $AZSLURM_PYTHON_PATH 13 | return 0 14 | fi 15 | for version in $( seq 11 20 ); do 16 | which python3.$version 17 | if [ $? == 0 ]; then 18 | return 0 19 | fi 20 | done 21 | echo Could not find python3 version 3.11 >&2 22 | return 1 23 | } 24 | 25 | install_python3() { 26 | PYTHON_BIN=find_python3 27 | if [ -z "$PYTHON_BIN" ]; then 28 | return 0 29 | fi 30 | # NOTE: based off of healthagent 00-install.sh, but we have different needs - we don't need the devel/systemd paths. 31 | # most likely if healthagent is already installed, this won't be an issue. 32 | if [ -f /etc/os-release ]; then 33 | . /etc/os-release 34 | OS=$ID 35 | VERSION_ID=$VERSION_ID 36 | else 37 | echo "Cannot detect the operating system." 38 | exit 1 39 | fi 40 | 41 | if [ "$OS" == "almalinux" ]; then 42 | echo "Detected AlmaLinux. Installing Python 3.12..." >&2 43 | yum install -y python3.12 44 | PYTHON_BIN="/usr/bin/python3.12" 45 | 46 | elif [ "$OS" == "ubuntu" ] && [ "$VERSION_ID" == "22.04" ]; then 47 | echo "Detected Ubuntu 22.04. Installing Python 3.11..." >&2 48 | apt update 49 | # We need python dev headers and systemd dev headers for same reaosn mentioned above. 
50 | apt install -y python3.11 python3.11-venv 51 | PYTHON_BIN="/usr/bin/python3.11" 52 | 53 | elif [ "$OS" == "ubuntu" ] && [[ $VERSION =~ ^24\.* ]]; then 54 | echo "Detected Ubuntu 24. Installing Python 3.12..." >&2 55 | apt update 56 | apt install -y python3.12 python3.12-venv 57 | PYTHON_BIN="/usr/bin/python3.12" 58 | else 59 | echo "Unsupported operating system: $OS $VERSION_ID" >&2 60 | exit 1 61 | fi 62 | export PYTHON_BIN 63 | } 64 | 65 | cd $CYCLECLOUD_HOME/system/bootstrap 66 | 67 | install_python3 68 | 69 | if [ $do_install == "True" ]; then 70 | rm -rf azure-slurm-install 71 | jetpack download --project $slurm_project_name $install_pkg 72 | tar xzf $install_pkg 73 | cd azure-slurm-install 74 | $PYTHON_BIN install.py --mode scheduler --bootstrap-config $CYCLECLOUD_HOME/config/node.json 75 | cd .. 76 | fi 77 | 78 | rm -rf azure-slurm 79 | jetpack download --project $slurm_project_name $autoscale_pkg 80 | tar xzf $autoscale_pkg 81 | cd azure-slurm 82 | AZSLURM_PYTHON_PATH=$PYTHON_BIN ./install.sh 83 | 84 | echo "installation complete. Run start-services scheduler|execute|login to start the slurm services." 85 | -------------------------------------------------------------------------------- /specs/scheduler/cluster-init/tests/test_slurm.py: -------------------------------------------------------------------------------- 1 | #!/opt/cycle/jetpack/system/embedded/bin/python -m pytest 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # 5 | import grp 6 | import json 7 | import os 8 | import pwd 9 | import random 10 | import subprocess 11 | from typing import List, Tuple 12 | import time 13 | import uuid 14 | 15 | WHOAMI = subprocess.check_output(["whoami"]).decode().strip() 16 | FAIL_FAST = int(os.getenv("FAIL_FAST", "1")) == 1 17 | 18 | 19 | def test_simple_autoscale(): 20 | if not is_autoscale(): 21 | return 22 | 23 | script_path = "/tmp/hello_world.sh" 24 | job_name = str(uuid.uuid4()) 25 | with open(script_path, "w") as fw: 26 | fw.write( 27 | """#!/bin/bash 28 | # 29 | #SBATCH --job-name={job_name} 30 | #SBATCH --output=test_hello_world.{job_name}.txt 31 | # 32 | #SBATCH --ntasks=1 33 | srun hostname""".format( 34 | job_name=job_name 35 | ) 36 | ) 37 | 38 | check_output("chown", "cyclecloud:cyclecloud", script_path) 39 | check_output("sudo", "-u", "cyclecloud", "sbatch", script_path) 40 | wait_for_job(job_name) 41 | wait_for_scale_down() 42 | 43 | 44 | def test_manual_scale(): 45 | if is_autoscale(): 46 | return 47 | nodes = _get_future_nodes() 48 | check_output("azslurm", "resume", "--nodes", nodes[-1]) 49 | wait_for_scale_up() 50 | check_output("azslurm", "suspend", "--nodes", nodes[-1]) 51 | 52 | 53 | def test_get_acct_info(): 54 | """ 55 | Low level cost reporting test, just to ensure our get_acct_info script is working. 
56 | """ 57 | offline_node = _get_powered_down_nodes()[0] 58 | check_output("scontrol", "update", "NodeName=%s" % offline_node, "State=power_up") 59 | time.sleep(15) 60 | response = json.loads( 61 | check_output("/opt/azurehpc/slurm/get_acct_info.sh", offline_node) 62 | ) 63 | assert 1 == len(response) 64 | info = response[0] 65 | assert info.pop("name") == offline_node 66 | assert info.pop("location") 67 | assert info.pop("vm_size") 68 | assert info.pop("spot") is not None 69 | assert info.pop("nodearray") 70 | assert info.pop("cpus") 71 | assert info.pop("pcpu_count") 72 | assert info.pop("vcpu_count") 73 | assert info.pop("gpu_count") is not None 74 | assert info.pop("memgb") 75 | assert not info, "Unexpected keys: %s" % info.keys() 76 | 77 | 78 | def test_multi_node_start() -> None: 79 | """ 80 | Slurm has changed how it passes in the node_list to the suspend/resume scripts. 81 | This test ensures that we can handle both the old and new formats. 82 | """ 83 | if not is_autoscale(): 84 | return 85 | 86 | nodes = _get_powered_down_nodes() 87 | random.shuffle(nodes) 88 | nodes_to_start = [] 89 | for line in nodes[0:10]: 90 | node_name = line.split()[0] 91 | nodes_to_start.append(node_name) 92 | 93 | nodes_before = json.loads( 94 | check_output("azslurm", "nodes", "--output-format", "json") 95 | ) 96 | assert not nodes_before, "Expected 0 nodes already started!" 97 | hostlist = check_output("scontrol", "show", "hostlist", ",".join(nodes_to_start)) 98 | check_output("scontrol", "update", "NodeName=" + hostlist, "State=power_up") 99 | time.sleep(10) 100 | nodes_after = json.loads( 101 | check_output("azslurm", "nodes", "--output-format", "json") 102 | ) 103 | 104 | assert len(nodes_after) == 10 105 | assert set([n["name"] for n in nodes_after]) == set(nodes_to_start) 106 | 107 | check_output("scontrol", "update", "NodeName=" + hostlist, "State=power_down_force") 108 | time.sleep(10) 109 | nodes_final = json.loads( 110 | check_output("azslurm", "nodes", "--output-format", "json") 111 | ) 112 | 113 | assert 0 == len(nodes_final) 114 | 115 | 116 | def wait_for_ip(node: str) -> bool: 117 | 118 | for _ in range(60): 119 | records = json.loads( 120 | check_output("azslurm", "nodes", "--output-format", "json") 121 | ) 122 | for record in records: 123 | if record.get("name") == node and record.get("private_ip"): 124 | return True 125 | time.sleep(5) 126 | return False 127 | 128 | 129 | def test_resume_suspend_repeat() -> None: 130 | """ 131 | Ensures that we can resume and suspend a node multiple times even if 132 | it is still in the process of shutting down / starting. 
133 | """ 134 | node = _get_powered_down_nodes()[0] 135 | check_output("azslurm", "resume", "--node-list", node, "--no-wait") 136 | assert wait_for_ip(node) 137 | check_output("azslurm", "suspend", "--node-list", node) 138 | 139 | check_output("azslurm", "resume", "--node-list", node, "--no-wait") 140 | assert wait_for_ip(node) 141 | 142 | check_output("azslurm", "suspend", "--node-list", node) 143 | 144 | 145 | def test_create_dyn_node() -> None: 146 | cluster_name = check_output("jetpack", "config", "cyclecloud.cluster.name") 147 | cluster_name = cluster_name.replace("_", "-").replace(".", "-") 148 | node = f"{cluster_name}-dyn-node-test" 149 | if is_autoscale(): 150 | check_output( 151 | "scontrol", "create", f"nodename={node}", "state=CLOUD", "Feature=dyn" 152 | ) 153 | check_output("scontrol", "update", f"nodename={node}", "state=power_up") 154 | else: 155 | check_output( 156 | "scontrol", "create", f"nodename={node}", "state=FUTURE", "Feature=dyn" 157 | ) 158 | check_output("azslurm", "resume", "--node-list", node, "--no-wait") 159 | wait_for_scale_up() 160 | wait_for_scale_down() 161 | 162 | 163 | def test_azslurm_cost() -> None: 164 | """ 165 | Ensures that the azslurm cost command works. 166 | """ 167 | check_output("azslurm", "cost", "-o", "/tmp") 168 | assert os.path.exists("/tmp/jobs.csv") 169 | assert os.path.exists("/tmp/partition.csv") 170 | assert os.path.exists("/tmp/partition_hourly.csv") 171 | 172 | 173 | def test_azslurm_scale() -> None: 174 | def stat(path: str) -> Tuple[int, int, int]: 175 | st = os.stat(path) 176 | return st.st_mode, st.st_uid, st.st_gid 177 | 178 | azure_conf = os.path.realpath("/etc/slurm/azure.conf") 179 | gres_conf = os.path.realpath("/etc/slurm/gres.conf") 180 | original = stat(azure_conf), stat(gres_conf) 181 | try: 182 | subprocess.call(["sudo", "chown", f"cyclecloud:cyclecloud", azure_conf]) 183 | subprocess.call(["sudo", "chmod", "400", azure_conf]) 184 | subprocess.call(["sudo", "chown", f"cyclecloud:cyclecloud", gres_conf]) 185 | subprocess.call(["sudo", "chmod", "400", gres_conf]) 186 | before_scale = stat(azure_conf), stat(gres_conf) 187 | assert before_scale != original 188 | subprocess.call(["sudo", "azslurm", "scale"]) 189 | after_scale = stat(azure_conf), stat(gres_conf) 190 | 191 | # assert we have actually maintained perms and ownership 192 | # See issue #193 for more information 193 | assert after_scale == before_scale 194 | 195 | finally: 196 | # restore back to original permissions 197 | azconf_owner = pwd.getpwuid(original[0][1]).pw_name 198 | azconf_grp = grp.getgrgid(original[0][2]).gr_name 199 | gres_owner = pwd.getpwuid(original[1][1]).pw_name 200 | gres_grp = grp.getgrgid(original[1][2]).gr_name 201 | subprocess.call(["sudo", "chown", f"{azconf_owner}:{azconf_grp}", azure_conf]) 202 | subprocess.call(["sudo", "chown", f"{gres_owner}:{gres_grp}", gres_conf]) 203 | subprocess.call(["sudo", "chmod", oct(original[0][0])[-4:], azure_conf]) 204 | subprocess.call(["sudo", "chmod", oct(original[1][0])[-4:], gres_conf]) 205 | 206 | 207 | def _get_powered_down_nodes() -> List[str]: 208 | ret = [] 209 | lines = check_output( 210 | "scontrol", "show", "nodes", "--future" 211 | ).splitlines() 212 | for line in lines: 213 | line = line.strip() 214 | if line.startswith("NodeName"): 215 | name = line.split()[0].split("=")[1].strip() 216 | ret.append(name) 217 | return ret 218 | 219 | def _get_future_nodes() -> List[str]: 220 | return check_output( 221 | "scontrol", "-N", "-h", "-t", "powered_down", "--format=%N" 222 | ).splitlines() 223 | 
224 | 225 | def teardown() -> None: 226 | subprocess.call(["scancel", "-u", WHOAMI]) 227 | lines = check_output( 228 | "sinfo", "-N", "-h", "-Onodelist:100,StateComplete:100" 229 | ).splitlines() 230 | nodes = [] 231 | for line in lines: 232 | name, states = line.strip().split() 233 | if "powered_down" in states or "powering_down" in states: 234 | continue 235 | nodes.append(name) 236 | 237 | if nodes: 238 | hostlist = check_output("scontrol", "show", "hostlist", ",".join(nodes)) 239 | check_output( 240 | "scontrol", "update", "NodeName=" + hostlist, "State=power_down_force" 241 | ) 242 | if not FAIL_FAST: 243 | time.sleep(75) 244 | 245 | cc_nodes = json.loads(check_output("azslurm", "nodes", "--output-format", "json")) 246 | if cc_nodes: 247 | check_output( 248 | "azslurm", 249 | "suspend", 250 | "--node-list", 251 | ",".join([n["name"] for n in cc_nodes]), 252 | ) 253 | if not FAIL_FAST: 254 | time.sleep(10) 255 | 256 | 257 | def check_output(*args, **kwargs): 258 | print("Running:", " ".join(args)) 259 | return subprocess.check_output(list(args), **kwargs).decode().strip() 260 | 261 | 262 | def is_autoscale() -> bool: 263 | with open("/etc/slurm/azure.conf") as fr: 264 | if "FUTURE" not in fr.read().upper(): 265 | # an autoscale cluster, ignore 266 | return True 267 | return False 268 | 269 | 270 | def wait_for_job(job_name): 271 | deadline = time.time() + 20 * 60 272 | while time.time() < deadline: 273 | time.sleep(1) 274 | stdout = check_output("squeue", "--format", "%j", "-h") 275 | if job_name not in stdout: 276 | return 277 | raise AssertionError("Timed out waiting for job %s to finish" % job_name) 278 | 279 | 280 | def wait_for_scale_up(): 281 | deadline = time.time() + 20 * 60 282 | while time.time() < deadline: 283 | time.sleep(1) 284 | stdout = check_output("sinfo", "--format", "%T", "-h") 285 | if "idle" in stdout: 286 | return 287 | raise AssertionError("Timed out waiting for scale up") 288 | 289 | 290 | def wait_for_scale_down(): 291 | deadline = time.time() + 20 * 60 292 | while time.time() < deadline: 293 | time.sleep(1) 294 | stdout = check_output("sinfo", "--format", "%T", "-h") 295 | if "~idle" != stdout: 296 | return 297 | raise AssertionError("Timed out waiting for scale down") -------------------------------------------------------------------------------- /templates/slurm-beegfs.txt: -------------------------------------------------------------------------------- 1 | 2 | ################################ 3 | ## Cluster Configuration File ## 4 | ################################ 5 | 6 | [cluster Slurm] 7 | FormLayout = selectionpanel 8 | Category = Schedulers 9 | 10 | Autoscale = $Autoscale 11 | 12 | [[node defaults]] 13 | UsePublicNetwork = $UsePublicNetwork 14 | Credentials = $Credentials 15 | SubnetId = $SubnetId 16 | Region = $Region 17 | KeyPairLocation = ~/.ssh/cyclecloud.pem 18 | 19 | [[[cluster-init cyclecloud/slurm:default]]] 20 | Optional = true 21 | 22 | [[[cluster-init cyclecloud/beegfs:default]]] 23 | [[[cluster-init cyclecloud/beegfs:client]]] 24 | 25 | [[[configuration]]] 26 | slurm.version = $configuration_slurm_version 27 | beegfs.client.manager_ipaddress = $BeeGFSManagerIpaddress 28 | beegfs.client.cluster_name = $BeeGFSClusterName 29 | beegfs.client.mount_point = $BeeGFSMountPt 30 | 31 | [[node scheduler]] 32 | MachineType = $SchedulerMachineType 33 | ImageName = $SchedulerImageName 34 | IsReturnProxy = $ReturnProxy 35 | AdditionalClusterInitSpecs = $SchedulerClusterInitSpecs 36 | 37 | [[[configuration]]] 38 | slurm.role = scheduler 39 | 40 | 
[[[cluster-init cyclecloud/slurm:scheduler]]] 41 | 42 | [[[network-interface eth0]]] 43 | AssociatePublicIpAddress = $UsePublicNetwork 44 | 45 | [[nodearray hpc]] 46 | MachineType = $HPCMachineType 47 | ImageName = $HPCImageName 48 | MaxCoreCount = $MaxHPCExecuteCoreCount 49 | Azure.MaxScalesetSize = $HPCMaxScalesetSize 50 | AdditionalClusterInitSpecs = $HPCClusterInitSpecs 51 | 52 | [[[configuration]]] 53 | slurm.role = execute 54 | slurm.autoscale = true 55 | slurm.default_partition = true 56 | slurm.hpc = true 57 | 58 | [[[cluster-init cyclecloud/slurm:execute]]] 59 | 60 | [[[network-interface eth0]]] 61 | AssociatePublicIpAddress = $ExecuteNodesPublic 62 | 63 | 64 | [[nodearray htc]] 65 | MachineType = $HTCMachineType 66 | ImageName = $HTCImageName 67 | MaxCoreCount = $MaxHTCExecuteCoreCount 68 | 69 | Interruptible = $HTCUseLowPrio 70 | AdditionalClusterInitSpecs = $HTCClusterInitSpecs 71 | 72 | 73 | [[[configuration]]] 74 | slurm.role = execute 75 | slurm.autoscale = true 76 | slurm.hpc = false 77 | 78 | [[[cluster-init cyclecloud/slurm:execute]]] 79 | 80 | [[[network-interface eth0]]] 81 | AssociatePublicIpAddress = $ExecuteNodesPublic 82 | 83 | 84 | [parameters About] 85 | Order = 1 86 | 87 | [[parameters About Slurm]] 88 | 89 | [[[parameter slurm]]] 90 | HideLabel = true 91 | Config.Plugin = pico.widget.HtmlTemplateWidget 92 | Config.Template := "
\"Slurm

Slurm is a highly configurable open source workload manager. See the Slurm project site for an overview.

Follow the instructions in the README for details on extending and configuring the Project for your environment.

" 93 | 94 | [parameters Required Settings] 95 | Order = 10 96 | 97 | [[parameters Virtual Machines ]] 98 | Description = "The cluster, in this case, has two roles: the scheduler node with shared filer and the execute hosts. Configure which VM types to use based on the requirements of your application." 99 | Order = 20 100 | 101 | [[[parameter Region]]] 102 | Label = Region 103 | Description = Deployment Location 104 | ParameterType = Cloud.Region 105 | DefaultValue = westus2 106 | 107 | [[[parameter SchedulerMachineType]]] 108 | Label = Scheduler VM Type 109 | Description = The VM type for scheduler node 110 | ParameterType = Cloud.MachineType 111 | DefaultValue = Standard_E4_v4 112 | 113 | [[[parameter HPCMachineType]]] 114 | Label = HPC VM Type 115 | Description = The VM type for HPC execute nodes 116 | ParameterType = Cloud.MachineType 117 | DefaultValue = Standard_F2s_v2 118 | 119 | [[[parameter HTCMachineType]]] 120 | Label = HTC VM Type 121 | Description = The VM type for HTC execute nodes 122 | ParameterType = Cloud.MachineType 123 | DefaultValue = Standard_F2s_v2 124 | 125 | 126 | [[parameters Auto-Scaling]] 127 | Description = "The cluster can autoscale to the workload, adding execute hosts as jobs are queued. To enable this check the box below and choose the initial and maximum core counts for the cluster" 128 | Order = 30 129 | 130 | [[[parameter Autoscale]]] 131 | Label = Autoscale 132 | DefaultValue = true 133 | Widget.Plugin = pico.form.BooleanCheckBox 134 | Widget.Label = Start and stop execute instances automatically 135 | 136 | [[[parameter MaxHPCExecuteCoreCount]]] 137 | Label = Max HPC Cores 138 | Description = The total number of HPC execute cores to start 139 | DefaultValue = 100 140 | Config.Plugin = pico.form.NumberTextBox 141 | Config.MinValue = 1 142 | Config.IntegerOnly = true 143 | 144 | [[[parameter MaxHTCExecuteCoreCount]]] 145 | Label = Max HTC Cores 146 | Description = The total number of HTC execute cores to start 147 | DefaultValue = 100 148 | Config.Plugin = pico.form.NumberTextBox 149 | Config.MinValue = 1 150 | Config.IntegerOnly = true 151 | 152 | [[[parameter HPCMaxScalesetSize]]] 153 | Label = Max VMs per Scaleset 154 | Description = The maximum number of VMs created per VM Scaleset e.g. switch in Slurm. 155 | DefaultValue = 100 156 | Config.Plugin = pico.form.NumberTextBox 157 | Config.MinValue = 1 158 | Config.IntegerOnly = true 159 | 160 | [[[parameter HTCUseLowPrio]]] 161 | Label = Low Priority HTC 162 | DefaultValue = false 163 | Widget.Plugin = pico.form.BooleanCheckBox 164 | Widget.Label = Use low priority instances for HTC execute hosts 165 | 166 | [[parameters BeeGFS Settings]] 167 | Order = 25 168 | Description = "Use a BeeGFS cluster as a NAS. Settings for defining the BeeGFS cluster" 169 | 170 | [[[parameter BeeGFSManagerIpaddress]]] 171 | Label = BeeGFS Manager IP 172 | Description = IP address of the BeeGFS cluster to connect to. 173 | 174 | [[[parameter BeeGFSClusterName]]] 175 | Label = BeeGFS Cluster 176 | Description = Name of the BeeGFS cluster to connect to. The BeeGFS cluster should be orchestrated by the same CycleCloud Server 177 | Config.Plugin = pico.form.QueryDropdown 178 | Config.Query = select ClusterName as Name from Cloud.Node where Cluster().IsTemplate =!= True && ClusterInitSpecs["beegfs:manager"] isnt undefined 179 | Config.SetDefault = false 180 | 181 | [[[parameter BeeGFSMountPt]]] 182 | Label = BeeGFS MountPt 183 | Description = The mount point to mount the BeeGFS file server on. 
184 | DefaultValue = /mnt/beegfs 185 | Required = True 186 | 187 | [[parameters Networking]] 188 | Order = 40 189 | 190 | [[[parameter SubnetId]]] 191 | Label = Subnet ID 192 | Description = Subnet Resource Path (ResourceGroup/VirtualNetwork/Subnet) 193 | ParameterType = Azure.Subnet 194 | Required = True 195 | 196 | 197 | [parameters Advanced Settings] 198 | Order = 20 199 | 200 | [[parameters Azure Settings]] 201 | Order = 10 202 | 203 | [[[parameter Credentials]]] 204 | Description = The credentials for the cloud provider 205 | ParameterType = Cloud.Credentials 206 | 207 | [[parameters Slurm Settings ]] 208 | Description = "Section for configuring Slurm" 209 | Order = 5 210 | 211 | [[[parameter configuration_slurm_version]]] 212 | Required = True 213 | Label = Slurm Version 214 | Description = Version of Slurm to install on the cluster 215 | ParameterType = StringList 216 | Config.Plugin = pico.form.Dropdown 217 | Config.FreeForm = true 218 | Config.Entries := {[Value="23.02.8-1"], [Value="23.11.9-1"]} 219 | DefaultValue = 23.11.9-1 220 | 221 | [[parameters Software]] 222 | Description = "Specify the scheduling software, and base OS installed on all nodes, and optionally the cluster-init and chef versions from your Locker." 223 | Order = 10 224 | 225 | [[[parameter SchedulerImageName]]] 226 | Label = Scheduler OS 227 | ParameterType = Cloud.Image 228 | Config.OS = linux 229 | DefaultValue = cycle.image.ubuntu20 230 | Config.Filter := Package in {"cycle.image.ubuntu20"} 231 | 232 | [[[parameter HPCImageName]]] 233 | Label = HPC OS 234 | ParameterType = Cloud.Image 235 | Config.OS = linux 236 | DefaultValue = cycle.image.ubuntu20 237 | Config.Filter := Package in {"cycle.image.ubuntu20"} 238 | 239 | [[[parameter HTCImageName]]] 240 | Label = HTC OS 241 | ParameterType = Cloud.Image 242 | Config.OS = linux 243 | DefaultValue = cycle.image.ubuntu20 244 | Config.Filter := Package in {"cycle.image.ubuntu20"} 245 | 246 | [[[parameter SchedulerClusterInitSpecs]]] 247 | Label = Scheduler Cluster-Init 248 | DefaultValue = =undefined 249 | Description = Cluster init specs to apply to the scheduler node 250 | ParameterType = Cloud.ClusterInitSpecs 251 | 252 | [[[parameter HTCClusterInitSpecs]]] 253 | Label = HTC Cluster-Init 254 | DefaultValue = =undefined 255 | Description = Cluster init specs to apply to HTC execute nodes 256 | ParameterType = Cloud.ClusterInitSpecs 257 | 258 | [[[parameter HPCClusterInitSpecs]]] 259 | Label = HPC Cluster-Init 260 | DefaultValue = =undefined 261 | Description = Cluster init specs to apply to HPC execute nodes 262 | ParameterType = Cloud.ClusterInitSpecs 263 | 264 | 265 | [[parameters Advanced Networking]] 266 | Description = Advanced networking settings 267 | 268 | [[[parameter ReturnProxy]]] 269 | Label = Return Proxy 270 | DefaultValue = true 271 | ParameterType = Boolean 272 | Config.Label = Use SSH tunnel to connect to CycleCloud (required if direct access is blocked) 273 | 274 | [[[parameter UsePublicNetwork]]] 275 | Label = Public Head Node 276 | DefaultValue = true 277 | ParameterType = Boolean 278 | Config.Label = Access scheduler node from the Internet 279 | 280 | [[[parameter ExecuteNodesPublic]]] 281 | Label = Public Execute 282 | DefaultValue = false 283 | ParameterType = Boolean 284 | Config.Label = Access execute nodes from the Internet 285 | Conditions.Excluded := UsePublicNetwork isnt true 286 | -------------------------------------------------------------------------------- /util/Dockerfile: 
-------------------------------------------------------------------------------- 1 | FROM python:3.11 2 | RUN pip3.11 install --upgrade setuptools 3 | RUN pip3.11 install --upgrade wheel 4 | RUN pip3.11 install requests -------------------------------------------------------------------------------- /util/build.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | set -x 4 | 5 | if [ "$1" == "-h" ] || [ "$1" == "--help" ] || [ "$1" == "-help" ]; then 6 | echo "Usage: $0 [path/to/scalelib repo]" 7 | echo "If no path to scalelib is passed in, one will be downloaded from GitHub based on" 8 | echo "the version specified in package.py:SCALELIB_VERSION" 9 | exit 1 10 | fi 11 | 12 | LOCAL_SCALELIB=$1 13 | 14 | if [ "$LOCAL_SCALELIB" != "" ]; then 15 | LOCAL_SCALELIB=$(realpath $LOCAL_SCALELIB) 16 | fi 17 | 18 | cwd=$(dirname "$(readlink -f "$0")") 19 | SOURCE=$(dirname $cwd) 20 | 21 | if [ ! -e $SOURCE/blobs ]; then 22 | mkdir $SOURCE/blobs 23 | fi 24 | 25 | wget -k -O $SOURCE/azure-slurm-install/AzureCA.pem https://github.com/Azure/cyclecloud-slurm/releases/download/2.7.3/AzureCA.pem 26 | # ls slurm/install/slurm-pkgs/*.rpm > /dev/null || (echo you need to run docker-rpmbuild.sh first; exit 1) 27 | # ls slurm/install/slurm-pkgs/*.deb > /dev/null || (echo you need to run docker-rpmbuild.sh first; exit 1) 28 | 29 | 30 | cd $SOURCE/azure-slurm-install 31 | rm -f dist/* 32 | ./package.sh 33 | mv dist/* ../blobs/ 34 | 35 | cd $SOURCE/azure-slurm 36 | rm -f dist/* 37 | ./package.sh $LOCAL_SCALELIB 38 | mv dist/* ../blobs/ 39 | --------------------------------------------------------------------------------