├── .ansible-lint ├── .gitignore ├── LICENSE ├── README.md ├── ansible.cfg ├── galaxy.yml ├── meta └── runtime.yml ├── plugins ├── README.md ├── action │ ├── raw_reboot.py │ └── raw_upgrade.py ├── filter │ └── rshim_filter.py └── modules │ ├── bf2_facts.py │ └── bf2_facts_test.py └── roles ├── bf2_boot ├── README.md ├── tasks │ └── main.yml └── templates │ └── bf.cfg.j2 ├── bf2_mode ├── README.md ├── defaults │ └── main.yml └── tasks │ ├── main.yml │ ├── ownership.yml │ └── security.yml ├── bf_bmc ├── README.md ├── defaults │ └── main.yml └── tasks │ ├── chassis_power_off.yaml │ ├── chassis_power_on.yaml │ ├── main.yml │ └── powercycle.yml ├── dpu_nvconfig ├── README.md └── tasks │ ├── main.yml │ ├── nvset.yml │ ├── run_mlxconfig.yml │ ├── set_embedded_cpu_model.yml │ ├── set_gpu_owner.yml │ ├── set_link_type.yml │ └── set_nic_mode.yml ├── force_reboot_armos ├── README.md └── tasks │ └── main.yml ├── install_cuda ├── README.md ├── files │ ├── 7fa2af80.gpg │ ├── A024F6F0E6D6A281.gpg │ ├── A4B469963BF863CC.gpg │ ├── F60F4B3D7FA2AF80.gpg │ ├── cuda-repository-pin-600 │ └── libnvidia-container.pub.pem ├── tasks │ ├── add_mirror_repo.yml │ ├── apt_common.yml │ ├── deb_network.yml │ ├── libnvidia_container.yml │ └── main.yml └── templates │ └── cuda-repo.list.j2 ├── install_doca ├── README.md ├── defaults │ └── main.yml └── tasks │ └── main.yml ├── load_bfb ├── README.md ├── defaults │ └── main.yml ├── tasks │ └── main.yml └── templates │ ├── bf.cfg.j2 │ └── bf_ubuntu.cfg.j2 ├── manage_bf2_fw ├── README.md ├── defaults │ └── main.yml └── tasks │ └── main.yml ├── manage_bf2_nic_speed ├── README.md ├── tasks │ └── main.yml └── templates │ └── 83-net-speed.rules.j2 ├── manage_bf_bmc_fw ├── README.md └── tasks │ └── main.yml ├── manage_rshim_owner ├── README.md └── tasks │ ├── change_owner.yaml │ └── main.yaml └── prepare_cuda_repo ├── README.md └── tasks ├── check_vars.yml ├── get_installer.yml └── main.yml /.ansible-lint: 
-------------------------------------------------------------------------------- 1 | exclude_paths: 2 | - ./collections/ 3 | - ./.venv/ 4 | - ./.cache 5 | - ./.git 6 | 7 | # https://github.com/ansible-community/ansible-lint/blob/master/src/ansiblelint/constants.py 8 | skip_list: 9 | - '204' # Lines should be no longer than 160 10 | - '301' # Commands should not change things if nothing needs 11 | - '302' # Using command rather than an argument to e.g. 12 | - '305' # Use shell only when shell functionality is required       13 | - '503' # Tasks that run when changed should likely be handlers 14 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | nvidia-dpu_ops-*.tar.gz 2 | **/__pycache__ 3 | .cache 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2020 NVIDIA Corporation 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR 18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Ansible Collection - nvidia.dpu_ops 2 | 3 | The following is a collection of roles that can be used to administer NVIDIA DPU cards. 4 | It contains the following functions: 5 | 6 | * `bf_bmc` - Run arbitrary ipmitool commands on the BMC of a DPU 7 | * `bf2_boot` - Modify the boot order of a DPU 8 | * `bf2_mode` - Modify the security and ownership modes of a DPU 9 | * `force_reboot_armos` - Force reboot the DPU over rshim 10 | * `install_doca` - Install DOCA utilities 11 | * `load_bfb` - Load BFB and bf.cfg over rshim 12 | * `manage_bf_bmc_fw` - Upgrade the firmware of the BMC of the DPU 13 | * `manage_bf2_fw` - Upgrade the firmware of the DPU 14 | * `manage_bf2_nic_speed` - Change settings on the nic speed for a DPU 15 | * `manage_rshim_owner` - Change rshim ownership between a DPU and its host 16 | * `prepare_cuda_repo` - Prepare local repository of CUDA installer 17 | * `install_cuda` - Install CUDA on x86 or DPU 18 | * `dpu_nvconfig` - Set nvconfig parameters of DPU 19 | -------------------------------------------------------------------------------- /ansible.cfg: -------------------------------------------------------------------------------- 1 | [defaults] 2 | roles_path = roles 3 | library = plugins/modules/ 4 | action_plugins = plugins/action/ 5 | filter_plugins = plugins/filter/ 6 | -------------------------------------------------------------------------------- /galaxy.yml: -------------------------------------------------------------------------------- 1 | 
############################################################################### 2 | # 3 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 4 | # SPDX-License-Identifier: MIT 5 | # 6 | # Permission is hereby granted, free of charge, to any person obtaining a 7 | # copy of this software and associated documentation files (the "Software"), 8 | # to deal in the Software without restriction, including without limitation 9 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 | # and/or sell copies of the Software, and to permit persons to whom the 11 | # Software is furnished to do so, subject to the following conditions: 12 | # 13 | # The above copyright notice and this permission notice shall be included in 14 | # all copies or substantial portions of the Software. 15 | # 16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 22 | # DEALINGS IN THE SOFTWARE. 
23 | # 24 | ############################################################################### 25 | 26 | namespace: nvidia 27 | 28 | name: dpu_ops 29 | 30 | version: 1.0.1 31 | 32 | readme: README.md 33 | 34 | authors: 35 | - John Trenholm 36 | - Michael Basnight 37 | - Yurii Shestakov 38 | 39 | description: DPU Ops 40 | 41 | license: 42 | - MIT 43 | 44 | # license_file: LICENSE 45 | 46 | tags: [] 47 | 48 | dependencies: {} 49 | 50 | repository: https://github.com/NVIDIA/ansible-collection-dpu-ops 51 | 52 | documentation: https://github.com/NVIDIA/ansible-collection-dpu-ops 53 | 54 | homepage: https://github.com/NVIDIA/ansible-collection-dpu-ops 55 | 56 | issues: https://github.com/NVIDIA/ansible-collection-dpu-ops/issues 57 | 58 | build_ignore: 59 | - '*.tar.gz' 60 | - plugins/modules/bf2_facts_test.py 61 | - poetry.lock 62 | - pyproject.toml 63 | -------------------------------------------------------------------------------- /meta/runtime.yml: -------------------------------------------------------------------------------- 1 | requires_ansible: ">=2.9,<2.12.0" 2 | -------------------------------------------------------------------------------- /plugins/README.md: -------------------------------------------------------------------------------- 1 | # Collections Plugins Directory 2 | 3 | This directory can be used to ship various plugins inside an Ansible collection. Each plugin is placed in a folder that 4 | is named after the type of plugin it is in. It can also include the `module_utils` and `modules` directory that 5 | would contain module utils and modules respectively. 
6 | 7 | Here is an example directory of the majority of plugins currently supported by Ansible: 8 | 9 | ``` 10 | └── plugins 11 | ├── action 12 | ├── become 13 | ├── cache 14 | ├── callback 15 | ├── cliconf 16 | ├── connection 17 | ├── filter 18 | ├── httpapi 19 | ├── inventory 20 | ├── lookup 21 | ├── module_utils 22 | ├── modules 23 | ├── netconf 24 | ├── shell 25 | ├── strategy 26 | ├── terminal 27 | ├── test 28 | └── vars 29 | ``` 30 | 31 | A full list of plugin types can be found at [Working With Plugins](https://docs.ansible.com/ansible/2.10/plugins/plugins.html). 32 | -------------------------------------------------------------------------------- /plugins/action/raw_reboot.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | ############################################################################### 4 | # 5 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 6 | # SPDX-License-Identifier: MIT 7 | # 8 | # Permission is hereby granted, free of charge, to any person obtaining a 9 | # copy of this software and associated documentation files (the "Software"), 10 | # to deal in the Software without restriction, including without limitation 11 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 12 | # and/or sell copies of the Software, and to permit persons to whom the 13 | # Software is furnished to do so, subject to the following conditions: 14 | # 15 | # The above copyright notice and this permission notice shall be included in 16 | # all copies or substantial portions of the Software. 17 | # 18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
# IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
#
###############################################################################

import time

from datetime import datetime, timedelta

from ansible.errors import AnsibleConnectionFailure
from ansible.plugins.action import ActionBase


DOCUMENTATION = r'''
---
module: raw_reboot

short_description: Module issuing a raw style reboot and wait to come online

version_added: "1.1.0"

description: Module for raw reboots

options:
    reboot_timeout:
        description: Maximum number of seconds to wait for a reboot


'''


EXAMPLES = r'''
- name: raw reboot
  raw_reboot:
    reboot_timeout: 1200
'''


class TimeoutException(Exception):
    """Raised when the host is not reachable again within ``reboot_timeout``."""


class ActionModule(ActionBase):
    """Reboot the target with low-level commands and wait for it to return."""

    # Seconds to wait after issuing the reboot before the first probe.
    INITIAL_DELAY = 30
    # Seconds to wait between two probes.
    POLL_INTERVAL = 60
    # The kernel exposes a random id here that changes on every boot; it is
    # used to tell "still up, has not gone down yet" from "came back up".
    BOOT_ID_COMMAND = "cat /proc/sys/kernel/random/boot_id"

    def _read_boot_id(self):
        """Return the target's current boot id, or None if it cannot be read."""
        out = self._low_level_execute_command(self.BOOT_ID_COMMAND, sudoable=True)
        if out.get('rc') != 0:
            return None
        return (out.get('stdout') or '').strip() or None

    def _host_is_back(self, boot_id_before):
        """Probe the target once.

        Returns True when the probe command succeeds and, if a boot id was
        recorded before the reboot, the current boot id differs from it.
        Connection errors are left to the caller.
        """
        probe = self._low_level_execute_command("/usr/bin/whoami", sudoable=True)
        if probe.get('rc') != 0:
            return False
        if boot_id_before is None:
            # Nothing to compare against: fall back to "the probe worked".
            return True
        boot_id_now = self._read_boot_id()
        return boot_id_now is not None and boot_id_now != boot_id_before

    def run(self, tmp=None, task_vars=None):
        """Issue ``/sbin/reboot`` and poll until the host answers again.

        Task args:
            reboot_timeout: maximum number of seconds to wait (default 600).

        Returns:
            The result dict with ``failed=False`` and ``rebooted=True``.

        Raises:
            TimeoutException: the host did not come back before the timeout.
        """
        # ActionBase.run expects (tmp, task_vars); forwarding the keyword dict
        # positionally handed it over as ``tmp`` and dropped task_vars.
        result = super(ActionModule, self).run(tmp, task_vars)
        result['failed'] = True
        result['rebooted'] = False

        reboot_timeout = int(self._task.args.get('reboot_timeout', 600))

        try:
            boot_id_before = self._read_boot_id()
        except Exception:
            # Best effort only; without it we fall back to the plain probe.
            boot_id_before = None

        end_time = datetime.utcnow() + timedelta(seconds=reboot_timeout)

        # Now reboot and then wait
        try:
            self._low_level_execute_command("/sbin/reboot", sudoable=True)
        except AnsibleConnectionFailure:
            # The connection may be torn down by the reboot we just requested.
            pass
        # Sleep just in case the reboot takes a few seconds
        time.sleep(self.INITIAL_DELAY)

        while datetime.utcnow() < end_time:
            try:
                if self._host_is_back(boot_id_before):
                    result['failed'] = False
                    result['rebooted'] = True
                    return result
            except AnsibleConnectionFailure:
                # a connection failure is fine here, we are waiting for it to
                # reboot anyway; reset it and move on
                try:
                    self._connection.reset()
                except AnsibleConnectionFailure:
                    pass
            except Exception:
                # Any other probe error is treated as "not back yet", matching
                # the previous behaviour of retrying on every exception.
                pass

            # Never sleep past the deadline.
            remaining = (end_time - datetime.utcnow()).total_seconds()
            if remaining <= 0:
                break
            time.sleep(min(self.POLL_INTERVAL, remaining))

        raise TimeoutException("Timed out waiting for the host to reboot timeout seconds {timeout}".format(timeout=reboot_timeout))
#
###############################################################################

import time

from datetime import datetime, timedelta

from ansible.errors import AnsibleConnectionFailure
from ansible.module_utils.six.moves import shlex_quote
from ansible.plugins.action import ActionBase


DOCUMENTATION = r'''
---
module: raw_upgrade

short_description: Module issuing a raw style upgrade of firmwares

version_added: "1.1.0"

description: Module for raw upgrades

options:
    retries:
        description: Maximum number of retries
    delay:
        description: Number of seconds to wait between retries


'''


EXAMPLES = r'''
- name: raw upgrade
  raw_upgrade:
    retries: 100
    delay: 60
'''


class FailedActivationException(Exception):
    """Raised when the activation cannot be requested or is reported as failed."""


class UnfinishedActivationException(Exception):
    """Raised when the activation reached neither Active nor Failed in time."""


ACTIVATE_LINE = "busctl set-property xyz.openbmc_project.Software.BMC.Updater /xyz/openbmc_project/software/{} xyz.openbmc_project.Software.Activation RequestedActivation s xyz.openbmc_project.Software.Activation.RequestedActivations.Active"

VERIFY_LINE = "busctl get-property xyz.openbmc_project.Software.BMC.Updater /xyz/openbmc_project/software/{} xyz.openbmc_project.Software.Activation Activation"


class ActionModule(ActionBase):
    """Activate the single image found in /tmp/images and wait for the result."""

    def run(self, tmp=None, task_vars=None):
        """Request activation of the uploaded image and poll its state.

        Task args:
            retries: maximum number of state checks (default 100).
            delay: seconds to wait between two checks (default 60).

        Returns:
            The result dict with ``active=True``.

        Raises:
            FailedActivationException: /tmp/images does not hold exactly one
                entry, the activation request was rejected, or the reported
                state is ``Failed``.
            UnfinishedActivationException: the state never became ``Active``
                within ``retries`` checks.
        """
        # ActionBase.run expects (tmp, task_vars); forwarding the keyword dict
        # positionally handed it over as ``tmp`` and dropped task_vars.
        result = super(ActionModule, self).run(tmp, task_vars)

        retries = int(self._task.args.get('retries', 100))
        delay = int(self._task.args.get('delay', 60))

        listing = self._low_level_execute_command("ls --color=none -t /tmp/images/")
        image_lines = [entry.strip() for entry in listing.get('stdout_lines', []) if entry.strip()]
        if not image_lines:
            # Previously this surfaced as an IndexError on image_lines[0].
            raise FailedActivationException("No file is present in /tmp/images")
        if len(image_lines) > 1:
            raise FailedActivationException("More than one file is present in /tmp/images")
        # The name ends up inside a shell command line, so quote it.
        image_name = shlex_quote(image_lines[0])

        activation = self._low_level_execute_command(ACTIVATE_LINE.format(image_name))
        if activation.get('rc') != 0:
            # No point polling for a state change that was never requested.
            raise FailedActivationException(
                "Requesting activation of firmware has failed: {}".format(activation.get('stderr', '')))

        failed = False
        active = False
        for attempt in range(retries):
            verify_out = self._low_level_execute_command(VERIFY_LINE.format(image_name))['stdout']

            if "Activation.Activations.Active" in verify_out:
                active = True
                break
            if "Activation.Activations.Failed" in verify_out:
                failed = True
                break
            # Only wait when another check is going to follow.
            if attempt + 1 < retries:
                time.sleep(delay)

        if failed:
            raise FailedActivationException("Activation of firmware has failed")
        if not active:
            raise UnfinishedActivationException("Activation of firmware timed out and stayed in Activating state")

        result['active'] = active
        return result
# IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
#
###############################################################################
r'''
`get_rshim` filter implementation
'''


def get_rshim(bf2_devices, rshim):
    """Return the entries of ``bf2_devices`` whose ``rshim`` value equals ``rshim``.

    Args:
        bf2_devices: iterable of device dicts; ``None`` or an empty list is
            treated as "no devices".
        rshim: the rshim value to select.

    Returns:
        A list with the matching device dicts, in their original order.
        Entries without an ``rshim`` key never match.
    """
    if not bf2_devices:
        return []
    return [dev for dev in bf2_devices
            if 'rshim' in dev and dev['rshim'] == rshim]


class FilterModule(object):
    """Ansible filter `get_rshim`"""

    def filters(self):
        """Return the mapping of filter names to their implementations."""
        return {'get_rshim': get_rshim}
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
#
###############################################################################

from __future__ import (absolute_import, division, print_function)
__metaclass__ = type
import re
import sys


DOCUMENTATION = r'''
---
module: bf2_facts

short_description: Module for generating bf2 facts

version_added: "1.1.0"

description: Module for generating bf2 facts

'''


EXAMPLES = r'''
- name: gather bf2 facts
  bf2_facts:
'''


RETURN = r'''
ansible_facts:
  description: Facts to add to ansible_facts.
  returned: always
  type: dict
  contains:
    bf2_devices:
      description: One entry per BlueField device reported by C(mst status -v).
      returned: always
      type: list
      elements: dict
'''


UNDEFINED = 'UNDEFINED'


import shlex
import subprocess
import traceback
from ansible.module_utils.basic import AnsibleModule

# singleton, cache of mlxconfig, key is pci/mst dev, val is dict()
nvconfig_cache = dict()
lspci_cache = dict()

# Seconds a single external command may run before it is killed.
COMMAND_TIMEOUT = 15

# Matches the two-letter-tagged lines of `lspci -vv` output: a "[XX]" tag,
# a label up to the colon, and the value after it.
_VPD_LINE_RE = re.compile(r'^\s+\[(\w\w)\]\s[^:]+:\s(.*?)\s*$')

# Substrings that mark a version line in `mlxfwmanager` output.
_VERSION_PHRASES = ('FW', 'PXE', 'UEFI', 'UNKNOWN_ROM')


class CommandError(Exception):
    """
    helper class for handling stderr failures
    """
    def __init__(self, stderr):
        # subprocess hands back bytes; keep text so str() works.
        if isinstance(stderr, bytes):
            stderr = stderr.decode('utf-8', errors='replace')
        if stderr is None:
            stderr = ''
        super(CommandError, self).__init__(stderr)
        self.stderr = stderr

    def __str__(self):
        return self.stderr


def execute(cmd):
    """
    Executes a command and returns its stdout as text.

    ``cmd`` is either an argv list or a string that is split with shlex.
    Raises CommandError (carrying stderr) when the command exits non-zero.
    When the command exceeds COMMAND_TIMEOUT it is killed and whatever it
    printed so far is returned without checking the exit status.
    """
    if isinstance(cmd, str):
        cmd = shlex.split(cmd)
    proc = subprocess.Popen(cmd,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)
    try:
        stdout, stderr = proc.communicate(input=None, timeout=COMMAND_TIMEOUT)
    except subprocess.TimeoutExpired:
        proc.kill()
        stdout, stderr = proc.communicate()
        return stdout.decode('utf-8', errors='replace')
    if proc.returncode != 0:
        raise CommandError(stderr)
    return stdout.decode('utf-8', errors='replace')


def get_lines(cmd):
    """Run ``cmd`` and return its stdout split into lines (trailing blanks stripped)."""
    return execute(cmd).rstrip().split('\n')


def get_first_result(results, key):
    """Return the first item of ``results`` that contains ``key``, or None."""
    for r in results:
        if key in r:
            return r
    return None


def has_query_privhost():
    """Return True when the `mlxprivhost -h` help text mentions a query command."""
    try:
        lines = get_lines('mlxprivhost -h')
    except (FileNotFoundError, CommandError):
        # Tool missing or unusable: the restriction level cannot be queried.
        return False
    return get_first_result(lines, 'query') is not None


def get_rshim_output(rshim_path):
    """
    Read ``<rshim_path>/misc`` at display level 1.

    Returns a tuple ``(dev_name, lines)`` where ``dev_name`` is the second
    token of the DEV_NAME line and ``lines`` is the misc output plus an
    extra ``RSHIM_SLOT <rshim_path>`` line.
    Raises CommandError when no usable DEV_NAME line is present.
    """
    quoted = shlex.quote(rshim_path)
    # File IO in the Popen call is unhappy w/ the special rshim files, so this call command is used
    subprocess.call("echo 'DISPLAY_LEVEL 1' > {}/misc".format(quoted), shell=True)
    lines = get_lines("cat {}/misc".format(quoted))
    # add in the rshim slot for later use
    lines.append("RSHIM_SLOT {}".format(rshim_path))
    dev_name_line = get_first_result(lines, 'DEV_NAME')
    # shlex.split(None) would read stdin (or raise on newer Pythons), so the
    # missing-line case has to be handled explicitly.
    tokens = shlex.split(dev_name_line) if dev_name_line else []
    if len(tokens) < 2:
        raise CommandError("DEV_NAME not found in {}/misc".format(rshim_path))
    return tokens[1], lines


def get_mst_and_pci():
    """Return a list of ``(mst, pci)`` tuples, one per BlueField card."""
    # get all the lines with BlueField2 since those are the cards
    # Note that the -v flag will have 2 devices per card
    # the second device will be in the form of device.1
    # We discard the device.1's to not have duplicate devices
    # FIXME BlueField (1), BlueField3 ?
    devices = []
    for line in get_lines('mst status -v'):
        if 'BlueField' not in line:
            continue
        fields = line.split()
        if len(fields) < 3:
            # not a device row; unpacking it later would fail
            continue
        # grab only the mst device and the pcie device name
        mst, pci = fields[1], fields[2]
        # discard the devices with a period in the name
        if '.' in mst:
            continue
        devices.append((mst, pci))
    return devices


def _parse_mlxconfig(lines):
    """
    Input: lines of `mlxconfig -d .. q` output
    Output: dict mapping the first column to the second column of every
    line that starts with whitespace and has at least two columns
    """
    ret = dict()
    for line in lines:
        ary = re.split(r'\s+', line)
        # a leading empty field means the line was indented
        if len(ary) >= 3 and ary[0] == '':
            ret[ary[1]] = ary[2]
    return ret


def get_mlxconfig(mst):
    """Return the parsed `mlxconfig -d <mst> q` output, cached per device."""
    if mst in nvconfig_cache:
        return nvconfig_cache[mst]
    lines = get_lines("mlxconfig -d {} q".format(mst))
    ret = _parse_mlxconfig(lines)
    # needed for PRIS and ROY adapters:
    if 'PCI_DOWNSTREAM_PORT_OWNER' in ret:
        k = 'PCI_DOWNSTREAM_PORT_OWNER[4]'
        lines = get_lines("mlxconfig -d {} q {}".format(mst, k))
        r2 = _parse_mlxconfig(lines)
        if k in r2:
            ret[k] = r2[k]
    nvconfig_cache[mst] = ret
    return ret


def get_mode(mst):
    """Return 'embedded', 'separated' or UNDEFINED based on INTERNAL_CPU_MODEL."""
    nvcfg = get_mlxconfig(mst)
    # TODO what about NIC_MODE vs SNIC_MODE vs SEPARATED_MODE ?
    v = nvcfg.get('INTERNAL_CPU_MODEL', None)
    if v is None:
        return UNDEFINED
    return 'embedded' if v == 'EMBEDDED_CPU(1)' else 'separated'


def get_vpd(pci):
    """Return a dict of the ``[XX] label: value`` lines of `lspci -vvs <pci>`, cached."""
    if pci in lspci_cache:
        return lspci_cache[pci]
    lines = get_lines("lspci -vvs {}".format(pci))
    ret = dict()
    for line in lines:
        m = _VPD_LINE_RE.search(line)
        if m is None:
            continue
        ret[m.group(1)] = m.group(2)
    lspci_cache[pci] = ret
    return ret


def get_serial_number(pci):
    """Return the ``SN`` VPD field of ``pci`` or UNDEFINED."""
    vpd = get_vpd(pci)
    return vpd.get('SN', UNDEFINED)


def get_part_number(pci):
    """Return the ``PN`` VPD field of ``pci`` or UNDEFINED."""
    vpd = get_vpd(pci)
    return vpd.get('PN', UNDEFINED)


def get_rshims_from_fs():
    """Return the ``/dev/rshim*`` paths found by `find`, or an empty list."""
    # the case of no rshims should return an empty list, not a list of 1 empty item
    rshims = get_lines('find /dev -maxdepth 1 -name "rshim*"')
    if len(rshims) == 1 and not rshims[0]:
        return []
    return rshims


def get_rshim_from_pci(rshim_outs, pci):
    """
    Return the rshim misc lines belonging to ``pci``.

    Returns None when ``rshim_outs`` is empty and an empty list when no key
    of ``rshim_outs`` contains the bus:device part of ``pci``.
    """
    if not rshim_outs:
        return None
    # Split on the dot of the pci as the key in the rshim_outs
    # has a different dot version (62:00.0 vs 62:00.2)
    rshim_key = pci.split('.')[0]
    # There may not be rshim's on the host for a given card, so not finding
    # a result just means it is not found
    key = get_first_result(rshim_outs.keys(), rshim_key)
    if key is None:
        return []
    return rshim_outs.get(key)


def _second_token(lines, key):
    """Return the second token of the first line containing ``key``, or UNDEFINED."""
    line = get_first_result(lines, key)
    tokens = shlex.split(line) if line else []
    return tokens[1] if len(tokens) > 1 else UNDEFINED


def get_mac_from_rshim_output(rshim_out):
    """Return the value of the PEER_MAC line or UNDEFINED when it is absent."""
    return _second_token(rshim_out, 'PEER_MAC')


def get_rshim_slot_from_rshim_output(rshim_out):
    """Return the value of the RSHIM_SLOT line or UNDEFINED when it is absent."""
    return _second_token(rshim_out, 'RSHIM_SLOT')


def get_restriction_level(mst):
    """Return the lower-cased value of the ``level`` line of `mlxprivhost -d <mst> q`."""
    lines = get_lines("mlxprivhost -d {} q".format(mst))
    line = get_first_result(lines, 'level')
    if line is None or ':' not in line:
        return UNDEFINED
    return line.split(":")[1].strip().lower()


def get_versions(mst):
    """Return a dict of component name -> version parsed from `mlxfwmanager -d <mst>`."""
    lines = get_lines("mlxfwmanager -d {}".format(mst))
    versions = {}
    for line in lines:
        if not any(phrase in line for phrase in _VERSION_PHRASES):
            continue
        fields = line.split()
        if len(fields) < 2:
            continue
        # Some of the UEFI Virtio have 3 words before the version so this
        # takes that into consideration
        if fields[1] == 'Virtio':
            if len(fields) >= 4:
                versions[' '.join(fields[:3])] = fields[3]
        else:
            versions[fields[0]] = fields[1]
    return versions


def run_module():
    """Collect the facts for every BlueField device and hand them to Ansible."""
    ansible_facts = {'bf2_devices': []}
    warnings = []

    module = AnsibleModule(
        argument_spec={},
        supports_check_mode=True
    )
    try:
        try:
            execute('mst start')
        except FileNotFoundError:
            # if mst is not installed on the machine, popen will throw this exception,
            # so it can be handled gracefully
            warnings.append("could not find the mst command, ensure that mlnx-ofed-all is installed")
            module.exit_json(ansible_facts=ansible_facts, warnings=warnings)
            return

        # validate if mlxprivhost can be used for query mode. some versions do not have the query flag
        can_query_privhost = has_query_privhost()

        rshims = get_rshims_from_fs()
        # rshim output will contain a key to the pcie device name with info inside it
        rshim_outs = {}

        # get all the rshim's on a single machine
        for rshim_path in rshims:
            try:
                full_dev_name, lines = get_rshim_output(rshim_path)
            except CommandError as err:
                # one unreadable rshim should not hide the facts of the others
                warnings.append("skipping {}: {}".format(rshim_path, err))
                continue
            rshim_outs[full_dev_name] = lines

        for mst, pci in get_mst_and_pci():
            rshim_out = get_rshim_from_pci(rshim_outs, pci)
            permission = get_restriction_level(mst) if can_query_privhost else UNDEFINED
            if permission == 'privileged':
                # many items only work in privileged mode
                ownership = get_mode(mst)
                versions = get_versions(mst)
            else:
                ownership = UNDEFINED
                versions = UNDEFINED

            ansible_facts['bf2_devices'].append({
                'mst': mst,
                'pci': pci,
                'ownership': ownership,
                'permission': permission,
                'serial_number': get_serial_number(pci),
                'part_number': get_part_number(pci),
                # Sort this out once the mac is not all 00's
                # 'mac': get_mac_from_rshim_output(rshim_out) if rshim_out else UNDEFINED,
                'rshim': get_rshim_slot_from_rshim_output(rshim_out) if rshim_out else UNDEFINED,
                'versions': versions,
                'nvconfig': nvconfig_cache.get(mst, {})
            })

        # pass the collected list; an empty string here showed up as an empty warning
        module.exit_json(ansible_facts=ansible_facts, warnings=warnings)
    except Exception as e:
        # ``exception`` takes the traceback text; the error itself goes into msg
        module.fail_json(msg='An unhandled error occured: {}'.format(e),
                         exception=traceback.format_exc())


def main():
    run_module()


if __name__ == '__main__':
    main()
Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 6 | # SPDX-License-Identifier: MIT 7 | # 8 | # Permission is hereby granted, free of charge, to any person obtaining a 9 | # copy of this software and associated documentation files (the "Software"), 10 | # to deal in the Software without restriction, including without limitation 11 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 12 | # and/or sell copies of the Software, and to permit persons to whom the 13 | # Software is furnished to do so, subject to the following conditions: 14 | # 15 | # The above copyright notice and this permission notice shall be included in 16 | # all copies or substantial portions of the Software. 17 | # 18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 21 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 23 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 24 | # DEALINGS IN THE SOFTWARE. 
25 | # 26 | ############################################################################### 27 | 28 | import bf2_facts 29 | import shlex 30 | import unittest 31 | from unittest.mock import patch 32 | 33 | 34 | def generate_rshim_output(mock_execute, pci, rshim, mac='00:00:00:00:00:00'): 35 | example = """DISPLAY_LEVEL 1 (0:basic, 1:advanced, 2:log) 36 | BOOT_MODE 1 (0:rshim, 1:emmc, 2:emmc-boot-swap) 37 | BOOT_TIMEOUT 100 (seconds) 38 | DROP_MODE 0 (0:normal, 1:drop) 39 | SW_RESET 0 (1: reset) 40 | DEV_NAME pcie-0000:{}.2 41 | DEV_INFO BlueField-2(Rev 1) 42 | BOOT_RESET_SKIP 0 (1: skip) 43 | PEER_MAC {} (rw) 44 | PXE_ID 0x00000000 (rw) 45 | VLAN_ID 0 0 (rw) 46 | """.format(pci, mac) 47 | mock_execute.return_value = example 48 | return bf2_facts.get_rshim_output(rshim) 49 | 50 | 51 | class Test(unittest.TestCase): 52 | @patch('bf2_facts.execute') 53 | def test_has_query_privhost_new_version(self, mock_execute): 54 | example = """usage: mlxprivhost [-h] [-v] --device DEVICE [--disable_rshim] [--disable_tracer] [--disable_counter_rd] [--disable_port_owner] {r,restrict,p,privilege,q,query} 55 | 56 | restrict or privilege host 57 | Note: New configurations takes effect immediately. 58 | Note: privileged host - host has all supported privileges. 59 | restricted host - host is not allowed to modify global 60 | per port/parameters or access other hosts parametersis. 61 | 62 | optional arguments: 63 | -h, --help show this help message and exit 64 | -v, --version show program's version number and exit 65 | 66 | Options: 67 | --device DEVICE, -d DEVICE 68 | Device to work with. 
69 | --disable_rshim When TRUE, the host does not have an RSHIM function 70 | to access the embedded CPU registers 71 | --disable_tracer When TRUE, the host will not be allowed to own the Tracer 72 | --disable_counter_rd When TRUE, the host will not be allowed to read Physical port counters 73 | --disable_port_owner When TRUE, the host will not be allowed to be Port Owner 74 | 75 | Commands: 76 | {r,restrict,p,privilege,q,query} 77 | restrict: Set host 1 (ARM) privileged, host 0 (x86_64) restricted. 78 | privilege: Set host 1 (ARM) privileged, host 0 (x86_64) privileged 79 | (back to default). 80 | query: Query current host configuration. 81 | """ 82 | mock_execute.return_value = example 83 | self.assertTrue(bf2_facts.has_query_privhost()) 84 | 85 | @patch('bf2_facts.execute') 86 | def test_has_query_privhost_old_version(self, mock_execute): 87 | example = """usage: mlxprivhost [-h] [-v] --device DEVICE [--disable_rshim] [--disable_tracer] [--disable_counter_rd] [--disable_port_owner] {r,restrict,p,privilege} 88 | 89 | restrict or privilege host 90 | Note: New configurations takes effect immediately. 91 | Note: privileged host - host has all supported privileges. 92 | restricted host - host is not allowed to modify global 93 | per port/parameters or access other hosts parametersis. 94 | 95 | optional arguments: 96 | -h, --help show this help message and exit 97 | -v, --version show program's version number and exit 98 | 99 | Options: 100 | --device DEVICE, -d DEVICE 101 | Device to work with. 
102 | --disable_rshim When TRUE, the host does not have an RSHIM function 103 | to access the embedded CPU registers 104 | --disable_tracer When TRUE, the host will not be allowed to own the Tracer 105 | --disable_counter_rd When TRUE, the host will not be allowed to read Physical port counters 106 | --disable_port_owner When TRUE, the host will not be allowed to be Port Owner 107 | 108 | Commands: 109 | {r,restrict,p,privilege} 110 | restrict: Set host 1 (ARM) privileged, host 0 (x86_64) restricted. 111 | privilege: Set host 1 (ARM) privileged, host 0 (x86_64) privileged 112 | (back to default). 113 | """ 114 | mock_execute.return_value = example 115 | self.assertFalse(bf2_facts.has_query_privhost()) 116 | 117 | @patch('bf2_facts.execute') 118 | @patch('subprocess.call') 119 | def test_get_rshim_output(self, call, mock_execute): 120 | example = """DISPLAY_LEVEL 1 (0:basic, 1:advanced, 2:log) 121 | BOOT_MODE 1 (0:rshim, 1:emmc, 2:emmc-boot-swap) 122 | BOOT_TIMEOUT 100 (seconds) 123 | DROP_MODE 0 (0:normal, 1:drop) 124 | SW_RESET 0 (1: reset) 125 | DEV_NAME pcie-0000:e2:00.2 126 | DEV_INFO BlueField-2(Rev 1) 127 | BOOT_RESET_SKIP 0 (1: skip) 128 | PEER_MAC 00:00:00:00:00:00 (rw) 129 | PXE_ID 0x00000000 (rw) 130 | VLAN_ID 0 0 (rw) 131 | """ 132 | mock_execute.return_value = example 133 | actual_rshim_slot = '/dev/rshim100' 134 | key, val = generate_rshim_output(mock_execute, 'e2:00', actual_rshim_slot) 135 | self.assertEqual(key, 'pcie-0000:e2:00.2') 136 | self.assertEqual(len(val), 12) 137 | rshim_slot = shlex.split([l for l in val if 'RSHIM_SLOT' in l][0])[1] 138 | self.assertEqual(rshim_slot, actual_rshim_slot) 139 | 140 | @patch('bf2_facts.execute') 141 | def test_get_mst_and_pci(self, mock_execute): 142 | example = """MST modules: 143 | ------------ 144 | MST PCI module is not loaded 145 | MST PCI configuration module loaded 146 | PCI devices: 147 | ------------ 148 | DEVICE_TYPE MST PCI RDMA NET NUMA 149 | BlueField2(rev:1) /dev/mst/mt41686_pciconf0.1 e2:00.1 
mlx5_1 net-ens7f1 1 150 | 151 | BlueField2(rev:1) /dev/mst/mt41686_pciconf0 e2:00.0 mlx5_0 net-ens7f0 1 152 | 153 | """ 154 | mock_execute.return_value = example 155 | mst_and_pci = bf2_facts.get_mst_and_pci() 156 | self.assertEqual(len(mst_and_pci), 1) 157 | self.assertEqual(mst_and_pci[0][0], '/dev/mst/mt41686_pciconf0') 158 | self.assertEqual(mst_and_pci[0][1], 'e2:00.0') 159 | 160 | @patch('bf2_facts.execute') 161 | def test_get_mode(self, mock_execute): 162 | example = """ 163 | Device #1: 164 | ---------- 165 | 166 | Device type: BlueField2 167 | Name: MBF2M516A-EEEO_Ax 168 | Description: BlueField-2 E-Series SmartNIC 100GbE/EDR VPI Dual-Port QSFP56; PCIe Gen4 x16; Crypto Enabled; 16GB on-board DDR; 1GbE OOB management; FHHL 169 | Device: /dev/mst/mt41686_pciconf0 170 | 171 | Configurations: Next Boot 172 | MEMIC_BAR_SIZE 0 173 | MEMIC_SIZE_LIMIT _256KB(1) 174 | HOST_CHAINING_MODE DISABLED(0) 175 | HOST_CHAINING_CACHE_DISABLE False(0) 176 | HOST_CHAINING_DESCRIPTORS Array[0..7] 177 | HOST_CHAINING_TOTAL_BUFFER_SIZE Array[0..7] 178 | INTERNAL_CPU_MODEL EMBEDDED_CPU(1) 179 | _INTERNAL_CPU_MODEL SEPARATED_HOST(0) 180 | FLEX_PARSER_PROFILE_ENABLE 0 181 | PROG_PARSE_GRAPH False(0) 182 | FLEX_IPV4_OVER_VXLAN_PORT 0 183 | ROCE_NEXT_PROTOCOL 254 184 | ESWITCH_HAIRPIN_DESCRIPTORS Array[0..7] 185 | ESWITCH_HAIRPIN_TOT_BUFFER_SIZE Array[0..7] 186 | PF_BAR2_SIZE 0 187 | NON_PREFETCHABLE_PF_BAR False(0) 188 | VF_VPD_ENABLE False(0) 189 | PER_PF_NUM_SF False(0) 190 | LINK_TYPE_P1 ETH(2) 191 | LINK_TYPE_P2 ETH(2) 192 | """ 193 | mock_execute.return_value = example 194 | mode = bf2_facts.get_mode('/dev/mst/mt41686_pciconf0') 195 | self.assertEqual(mode, 'embedded') 196 | 197 | example = """ 198 | Device #1: 199 | ---------- 200 | 201 | Device type: BlueField2 202 | Name: MBF2M516A-EEEO_Ax 203 | Description: BlueField-2 E-Series SmartNIC 100GbE/EDR VPI Dual-Port QSFP56; PCIe Gen4 x16; Crypto Enabled; 16GB on-board DDR; 1GbE OOB management; FHHL 204 | Device: 
/dev/mst/mt41686_pciconf0.1 205 | 206 | Configurations: Next Boot 207 | MEMIC_BAR_SIZE 0 208 | INTERNAL_CPU_MODEL SEPARATED_HOST(0) 209 | """ 210 | mock_execute.return_value = example 211 | mode = bf2_facts.get_mode('/dev/mst/mt41686_pciconf0.1') 212 | self.assertEqual(mode, 'separated') 213 | 214 | @patch('bf2_facts.execute') 215 | def test_get_part_and_serial_number(self, mock_execute): 216 | example = """e2:00.0 Ethernet controller: Mellanox Technologies MT42822 BlueField-2 integrated ConnectX-6 Dx network controller (rev 01) 217 | Subsystem: Mellanox Technologies MT42822 BlueField-2 integrated ConnectX-6 Dx network controller 218 | Physical Slot: 7-1 219 | Control: I/O- Mem+ BusMaster+ SpecCycle- MemWINV- VGASnoop- ParErr+ Stepping- SERR+ FastB2B- DisINTx+ 220 | Status: Cap+ 66MHz- UDF- FastB2B- ParErr- DEVSEL=fast >TAbort- SERR- [disabled] 227 | Capabilities: [48] Vital Product Data 228 | Product Name: BlueField-2 DPU 100GbE/EDR/HDR100 VPI Dual-Port QSFP56, Crypto Enabled, 16GB on-board DDR, 1GbE OOB management, Tall Bracket 229 | 230 | Read-only fields: 231 | [PN] Part number: MBF2M516A-EEEOT 232 | [EC] Engineering changes: A4 233 | [V2] Vendor specific: MBF2M516A-EEEOT 234 | [SN] Serial number: MT2050X00614 235 | [V3] Vendor specific: 9c20a1608d3feb118000043f72ff4c16 236 | [VA] Vendor specific: MLX:MN=MLNX:CSKU=V2:UUID=V3:PCI=V0:MODL=BF2M516A 237 | [V0] Vendor specific: PCIeGen4 x16 238 | [RV] Reserved: checksum good, 1 byte(s) reserved 239 | End 240 | """ 241 | mock_execute.return_value = example 242 | bf2_facts.lspci_cache = dict() # need to clean it up 243 | serial_number = bf2_facts.get_serial_number('e2:00.0') 244 | self.assertEqual('MT2050X00614', serial_number) 245 | part_number = bf2_facts.get_part_number('e2:00.0') 246 | self.assertEqual('MBF2M516A-EEEOT', part_number) 247 | 248 | 249 | @patch('bf2_facts.execute') 250 | def test_no_vpd(self, mock_execute): 251 | example = """e2:00.0 Ethernet controller: Mellanox Technologies MT42822 BlueField-2 
integrated ConnectX-6 Dx network controller (rev 01) 252 | Subsystem: Mellanox Technologies MT42822 BlueField-2 integrated ConnectX-6 Dx network controller 253 | Physical Slot: 7-1 254 | Control: I/O- Mem+ BusMaster+ SpecCycle- MemWINV- VGASnoop- ParErr+ Stepping- SERR+ FastB2B- DisINTx+ 255 | Status: Cap+ 66MHz- UDF- FastB2B- ParErr- DEVSEL=fast >TAbort- SERR- [disabled] 262 | Capabilities: [48] Vital Product Data 263 | End 264 | """ 265 | mock_execute.return_value = example 266 | bf2_facts.lspci_cache = dict() # need to clean it up 267 | serial_number = bf2_facts.get_serial_number('e2:00.0') 268 | self.assertEqual('UNDEFINED', serial_number) 269 | part_number = bf2_facts.get_part_number('e2:00.0') 270 | self.assertEqual('UNDEFINED', part_number) 271 | @patch('bf2_facts.execute') 272 | def test_get_rshims_from_fs(self, mock_execute): 273 | example = """/dev/rshim0 274 | /dev/rshim1 275 | /dev/rshim100 276 | """ 277 | mock_execute.return_value = example 278 | rshims = bf2_facts.get_rshims_from_fs() 279 | self.assertEqual(3, len(rshims)) 280 | 281 | @patch('bf2_facts.execute') 282 | @patch('subprocess.call') 283 | def test_get_rshim_from_pci(self, call, mock_execute): 284 | rshim_outs = {} 285 | pci_1 = 'aa:00' 286 | pci_2 = 'bb:00' 287 | for k,v in [(pci_1, '/dev/rshim1'), (pci_2, '/dev/rshim2')]: 288 | name, lines = generate_rshim_output(mock_execute, k, v) 289 | rshim_outs[name] = lines 290 | rshim_out = bf2_facts.get_rshim_from_pci(rshim_outs, pci_1) 291 | pci = [l for l in rshim_out if 'DEV_NAME' in l][0] 292 | self.assertTrue(pci_1 in pci) 293 | # empty case 294 | self.assertIsNone(bf2_facts.get_rshim_from_pci([], pci_1)) 295 | 296 | @patch('bf2_facts.execute') 297 | @patch('subprocess.call') 298 | def test_get_mac_from_rshim_output(self, call, mock_execute): 299 | # first get some rshim_out data populated 300 | rshim_outs = {} 301 | pci = 'aa:00' 302 | mac = '01:01:01:01:01:01' 303 | name, lines = generate_rshim_output(mock_execute, pci, '/dev/rshim0', 
mac=mac) 304 | rshim_outs[name] = lines 305 | rshim_out = bf2_facts.get_rshim_from_pci(rshim_outs, pci) 306 | 307 | out_mac = bf2_facts.get_mac_from_rshim_output(rshim_out) 308 | self.assertEqual(mac, out_mac) 309 | 310 | @patch('bf2_facts.execute') 311 | @patch('subprocess.call') 312 | def test_get_rshim_slot_from_rshim_output(self, call, mock_execute): 313 | # first get some rshim_out data populated 314 | rshim_outs = {} 315 | pci = 'aa:00' 316 | rshim_slot = '/dev/rshim100' 317 | name, lines = generate_rshim_output(mock_execute, pci, rshim_slot) 318 | rshim_outs[name] = lines 319 | rshim_out = bf2_facts.get_rshim_from_pci(rshim_outs, pci) 320 | 321 | out_rshim_slot = bf2_facts.get_rshim_slot_from_rshim_output(rshim_out) 322 | self.assertEqual(rshim_slot, out_rshim_slot) 323 | 324 | @patch('bf2_facts.execute') 325 | def test_get_restriction_level(self, mock_execute): 326 | example = """Current device configurations: 327 | ------------------------------ 328 | level : PRIVILEGED 329 | 330 | Port functions status: 331 | ----------------------- 332 | disable_rshim : FALSE 333 | disable_tracer : FALSE 334 | disable_port_owner : FALSE 335 | disable_counter_rd : FALSE 336 | 337 | """ 338 | mock_execute.return_value = example 339 | level = bf2_facts.get_restriction_level('/dev/mst/mt41686_pciconf0') 340 | self.assertEqual(level, 'privileged') 341 | 342 | @patch('bf2_facts.execute') 343 | def test_get_versions(self, mock_execute): 344 | example = """Querying Mellanox devices firmware ... 
345 | 346 | Device #1: 347 | ---------- 348 | 349 | Device Type: BlueField2 350 | Part Number: MBF2M516A-EEEO_Ax 351 | Description: BlueField-2 E-Series SmartNIC 100GbE/EDR VPI Dual-Port QSFP56; PCIe Gen4 x16; Crypto Enabled; 16GB on-board DDR; 1GbE OOB management; FHHL 352 | PSID: MT_0000000559 353 | PCI Device Name: /dev/mst/mt41686_pciconf0 354 | Base MAC: 043f72a45a9c 355 | Versions: Current Available 356 | FW 24.29.2008 N/A 357 | PXE 3.6.0205 N/A 358 | UEFI 14.22.0019 N/A 359 | UNKNOWN_ROM 22.1.0011 N/A 360 | UEFI Virtio x 1.2.3.4 361 | 362 | Status: No matching image found 363 | 364 | """ 365 | mock_execute.return_value = example 366 | versions = bf2_facts.get_versions('/dev/mst/mt41686_pciconf0') 367 | self.assertEqual(versions['FW'], '24.29.2008') 368 | self.assertEqual(versions['PXE'], '3.6.0205') 369 | self.assertEqual(versions['UEFI'], '14.22.0019') 370 | self.assertEqual(versions['UNKNOWN_ROM'], '22.1.0011') 371 | self.assertEqual(versions['UEFI Virtio x'], '1.2.3.4') 372 | 373 | if __name__ == '__main__': 374 | unittest.main() 375 | -------------------------------------------------------------------------------- /roles/bf2_boot/README.md: -------------------------------------------------------------------------------- 1 | # DPU (BF2) Boot 2 | ## Parameters 3 | 4 | Ansible variable(s) to be defined: 5 | 6 | * `pxe_boot_dev` - name of device to boot DPU from in `/etc/bf.cfg` of the installer. 7 | Allowed values are: 8 | * `NET-OOB-IPV4` 9 | * `NET-NIC_P1-IPV4` 10 | 11 | ## Usage example 12 | 13 |
bf2-boot-order.yml 14 |

15 | - hosts: bf2oob
16 |   become: true
17 |   user: "{{ remote_install_user }}"
18 |   vars:
19 |     pxe_boot_dev: "{{ bf2.pxe_boot_dev }}"
20 |   roles:
21 |     - nvidia.dpu_ops.bf2_boot
22 | 
23 |
24 | -------------------------------------------------------------------------------- /roles/bf2_boot/tasks/main.yml: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # 3 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 4 | # SPDX-License-Identifier: MIT 5 | # 6 | # Permission is hereby granted, free of charge, to any person obtaining a 7 | # copy of this software and associated documentation files (the "Software"), 8 | # to deal in the Software without restriction, including without limitation 9 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 | # and/or sell copies of the Software, and to permit persons to whom the 11 | # Software is furnished to do so, subject to the following conditions: 12 | # 13 | # The above copyright notice and this permission notice shall be included in 14 | # all copies or substantial portions of the Software. 15 | # 16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 22 | # DEALINGS IN THE SOFTWARE. 
23 | # 24 | ############################################################################### 25 | --- 26 | 27 | - name: Create bf.cfg 28 | template: 29 | src: bf.cfg.j2 30 | dest: /etc/bf.cfg 31 | owner: root 32 | group: root 33 | mode: "0644" 34 | 35 | - name: run bfcfg 36 | command: /usr/bin/bfcfg 37 | -------------------------------------------------------------------------------- /roles/bf2_boot/templates/bf.cfg.j2: -------------------------------------------------------------------------------- 1 | BOOT0={{ pxe_boot_dev }} 2 | BOOT1=DISK 3 | -------------------------------------------------------------------------------- /roles/bf2_mode/README.md: -------------------------------------------------------------------------------- 1 | # DPU (BF2) Mode 2 | 3 | The `bf2_mode` role is used to: 4 | 1. set restricted mode and block the host from accessing the DPU or grant the access 5 | 2. change the "ownership", i.e. switch the DPU NIC mode between "separated host" and "smartnic" 6 | 7 | For more information about the DPU modes of operation, see the 8 | [NVIDIA Mellanox BlueField DPU SW Modes of Operation](https://docs.nvidia.com/networking/display/BlueFieldSWv35111601/Modes+of+Operation#ModesofOperation-SeparatedHost) page. 9 | 10 | ## Parameters 11 | 12 | Ansible variable(s) to be defined: 13 | 14 | * `new_bf_mode` - is used to set restricted mode and block the host from accessing the DPU 15 | Allowed values are: 16 | * `privileged` 17 | * `restricted` 18 | 19 | * `new_bf_ownership` - the DPU may be placed in either separated or embedded ownership mode. 20 | Allowed values are: 21 | * `SEPARATED_HOST` 22 | * `EMBEDDED_CPU` 23 | 24 | ## Playbook examples 25 | 26 |
bf2_mode.yml 27 |

28 | ---
29 | - hosts: bf2oob
30 |   user: "{{ remote_install_user }}"
31 |   become: true
32 |   pre_tasks:
33 |     - name: Check for required variables
34 |       fail:
35 |         msg: "Invalid security mode, new_bf_mode should either be restricted or privileged"
36 |       when: new_bf_mode not in bf2.security_modes
37 |   vars:
38 |     bmc_host: "{{ hostvars[non_bf2_host]['bmc_ip'] }}"
39 |     bmc_user: "{{ hostvars[non_bf2_host]['bmc_user'] }}"
40 |     bmc_password: "{{ hostvars[non_bf2_host]['bmc_password'] }}"
41 |     run_on: "{{ groups['foreman'][0] }}"
42 |   roles:
43 |     - nvidia.dpu_ops.bf2_mode
44 |   post_tasks:
45 |     - name: wait for machine to be back online
46 |       wait_for:
47 |         host: "{{ non_bf2_host }}"
48 |         port: 22
49 |         timeout: 900
50 |         delay: 60
51 |       delegate_to: "{{ groups['foreman'][0] }}"
52 | 
53 |
54 | -------------------------------------------------------------------------------- /roles/bf2_mode/defaults/main.yml: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # 3 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 4 | # SPDX-License-Identifier: MIT 5 | # 6 | # Permission is hereby granted, free of charge, to any person obtaining a 7 | # copy of this software and associated documentation files (the "Software"), 8 | # to deal in the Software without restriction, including without limitation 9 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 | # and/or sell copies of the Software, and to permit persons to whom the 11 | # Software is furnished to do so, subject to the following conditions: 12 | # 13 | # The above copyright notice and this permission notice shall be included in 14 | # all copies or substantial portions of the Software. 15 | # 16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 22 | # DEALINGS IN THE SOFTWARE. 
23 | # 24 | ############################################################################### 25 | mode_options: "--disable_rshim --disable_tracer --disable_counter_rd --disable_port_owner" 26 | should_reboot: False 27 | 28 | embedded_port: 29 | - enp3s0f0s0 30 | - enp3s0f1s0 31 | separated_port: 32 | - p0 33 | - p1 34 | -------------------------------------------------------------------------------- /roles/bf2_mode/tasks/main.yml: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # 3 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 4 | # SPDX-License-Identifier: MIT 5 | # 6 | # Permission is hereby granted, free of charge, to any person obtaining a 7 | # copy of this software and associated documentation files (the "Software"), 8 | # to deal in the Software without restriction, including without limitation 9 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 | # and/or sell copies of the Software, and to permit persons to whom the 11 | # Software is furnished to do so, subject to the following conditions: 12 | # 13 | # The above copyright notice and this permission notice shall be included in 14 | # all copies or substantial portions of the Software. 15 | # 16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 22 | # DEALINGS IN THE SOFTWARE. 
23 | # 24 | ############################################################################### 25 | - name: get facts 26 | bf2_facts: 27 | when: bf2_devices is not defined # bf2[0] since the dpu card only has 1 entry 28 | 29 | - name: Change security mode 30 | include_tasks: security.yml 31 | when: new_bf_mode is defined 32 | 33 | - name: Change ownership mode 34 | include_tasks: ownership.yml 35 | when: new_bf_ownership is defined 36 | 37 | - name: reboot host 38 | block: 39 | - name: turn x86 host off 40 | vars: 41 | bmc_action: "chassis power off" 42 | include_role: 43 | name: bf_bmc 44 | 45 | - name: Sleep for 1 minute to ensure power off 46 | pause: 47 | seconds: 60 48 | 49 | - name: turn x86 host on 50 | vars: 51 | bmc_action: "chassis power on" 52 | include_role: 53 | name: bf_bmc 54 | when: should_reboot 55 | -------------------------------------------------------------------------------- /roles/bf2_mode/tasks/ownership.yml: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # 3 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 4 | # SPDX-License-Identifier: MIT 5 | # 6 | # Permission is hereby granted, free of charge, to any person obtaining a 7 | # copy of this software and associated documentation files (the "Software"), 8 | # to deal in the Software without restriction, including without limitation 9 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 | # and/or sell copies of the Software, and to permit persons to whom the 11 | # Software is furnished to do so, subject to the following conditions: 12 | # 13 | # The above copyright notice and this permission notice shall be included in 14 | # all copies or substantial portions of the Software. 
15 | # 16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 22 | # DEALINGS IN THE SOFTWARE. 23 | # 24 | ############################################################################### 25 | - name: set current_bf_ownership 26 | set_fact: 27 | current_bf_ownership: "{{ 'separated_host' if bf2_devices[0].ownership == 'separated' else 'embedded_cpu' }}" 28 | new_ownership_val: "{{ 0 if (new_bf_ownership | lower) == 'separated_host' else 1 }}" # new_bf_ownership is compared case-insensitively throughout this file: the README documents SEPARATED_HOST / EMBEDDED_CPU 29 | 30 | - name: set interface names on new ownership 31 | set_fact: 32 | new_interface: "{{ separated_port if (new_bf_ownership | lower) == 'separated_host' else embedded_port }}" 33 | old_interface: "{{ separated_port if current_bf_ownership == 'separated_host' else embedded_port }}" 34 | 35 | - name: var 36 | debug: 37 | msg: "Current: {{ current_bf_ownership }} New: {{ new_bf_ownership }}" 38 | 39 | - name: See if netplan exists 40 | stat: 41 | path: /etc/netplan/50-cloud-init.yaml 42 | register: netplan_file 43 | 44 | - name: Fail if netplan does not exist 45 | fail: 46 | msg: /etc/netplan/50-cloud-init.yaml did not exist 47 | when: not netplan_file.stat.exists 48 | 49 | - name: Change bf2 ownership 50 | command: "{{ item }}" 51 | with_items: 52 | - "mlxconfig -y -d {{ bf2_devices[0].mst }} s INTERNAL_CPU_MODEL={{ new_ownership_val }}" 53 | - "mlxconfig -y -d {{ bf2_devices[0].mst }}.1 s INTERNAL_CPU_MODEL={{ new_ownership_val }}" 54 | when: 55 | - current_bf_ownership != (new_bf_ownership | lower) 56 | register: set_output_ownership 57 | 58 | - name: Set ovs config for separated 59 | lineinfile: 60 | path: /etc/mellanox/mlnx-ovs.conf 61
| regexp: CREATE_OVS_BRIDGES=.*$ 62 | line: CREATE_OVS_BRIDGES="no" 63 | when: 64 | - current_bf_ownership != (new_bf_ownership | lower) and (new_bf_ownership | lower) == 'separated_host' 65 | 66 | - name: cleanup ovs bridges 67 | command: "/usr/bin/ovs-vsctl --if-exists del-br {{ item }}" 68 | with_items: 69 | - "ovsbr1" 70 | - "ovsbr2" 71 | when: 72 | - current_bf_ownership != (new_bf_ownership | lower) and (new_bf_ownership | lower) == 'separated_host' 73 | 74 | - name: Update netplans - p0 75 | ansible.builtin.replace: 76 | path: "{{ item }}" 77 | regexp: "{{ old_interface[0] }}:" 78 | replace: "{{ new_interface[0] }}:" 79 | with_items: 80 | - /etc/netplan/50-cloud-init.yaml 81 | - /etc/netplan/60-mlnx.yaml 82 | 83 | - name: Update netplans - p1 84 | ansible.builtin.replace: 85 | path: "{{ item }}" 86 | regexp: "{{ old_interface[1] }}:" 87 | replace: "{{ new_interface[1] }}:" 88 | with_items: 89 | - /etc/netplan/50-cloud-init.yaml 90 | - /etc/netplan/60-mlnx.yaml 91 | 92 | - name: Set ovs config for embedded 93 | lineinfile: 94 | path: /etc/mellanox/mlnx-ovs.conf 95 | regexp: CREATE_OVS_BRIDGES=.*$ 96 | line: CREATE_OVS_BRIDGES="yes" 97 | when: 98 | - current_bf_ownership != (new_bf_ownership | lower) and (new_bf_ownership | lower) == 'embedded_cpu' 99 | 100 | - name: update netplan 101 | command: netplan generate 102 | 103 | - name: apply netplan 104 | command: netplan apply 105 | 106 | - name: sync files to disk 107 | command: sync 108 | 109 | - name: Set reboot flag 110 | set_fact: 111 | should_reboot: True 112 | when: set_output_ownership.changed 113 | -------------------------------------------------------------------------------- /roles/bf2_mode/tasks/security.yml: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # 3 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4 | # SPDX-License-Identifier: MIT 5 | # 6 | # Permission is hereby granted, free of charge, to any person obtaining a 7 | # copy of this software and associated documentation files (the "Software"), 8 | # to deal in the Software without restriction, including without limitation 9 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 | # and/or sell copies of the Software, and to permit persons to whom the 11 | # Software is furnished to do so, subject to the following conditions: 12 | # 13 | # The above copyright notice and this permission notice shall be included in 14 | # all copies or substantial portions of the Software. 15 | # 16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 22 | # DEALINGS IN THE SOFTWARE. 
23 | # 24 | ############################################################################### 25 | - name: set current_bf_mode 26 | set_fact: 27 | current_bf_mode: "{{ bf2_devices[0].permission }}" 28 | 29 | - name: var 30 | debug: 31 | msg: "Current: {{ current_bf_mode }} New: {{ new_bf_mode }}" 32 | 33 | - name: Set mode to restricted 34 | command: "mlxprivhost -d {{ bf2_devices[0].mst }} r {{ mode_options }}" 35 | when: 36 | - new_bf_mode == 'restricted' 37 | - current_bf_mode != new_bf_mode 38 | register: mode_change_restricted 39 | 40 | - name: Set mode to privileged 41 | command: "mlxprivhost -d {{ bf2_devices[0].mst }} p" 42 | when: 43 | - new_bf_mode == 'privileged' 44 | - current_bf_mode != new_bf_mode 45 | register: mode_change_priv 46 | 47 | - name: Set reboot flag 48 | set_fact: 49 | should_reboot: True 50 | when: mode_change_priv.changed or mode_change_restricted.changed 51 | -------------------------------------------------------------------------------- /roles/bf_bmc/README.md: -------------------------------------------------------------------------------- 1 | # BF BMC 2 | This role is used to manage the power state of the x86 host or the DPU using the IPMI protocol 3 | 4 | ## Parameters 5 | 6 | The `main.yml` tasks file requires the following parameter to be specified: 7 | 8 | * `bmc_action` -- IPMI command to execute on BMC 9 | 10 | ## Standalone tasks 11 | 12 | * `chassis_power_off.yaml` 13 | * `chassis_power_on.yaml` 14 | * `powercycle.yml` 15 | 16 | ## Playbook example 17 | 18 |
bf2_mode.yml 19 |

20 | ---
21 | - hosts: "bmc"
22 |   user: "{{ remote_install_user }}"
23 |   become: true
24 |   gather_facts: False
25 |   vars:
26 |     bmc_action: "chassis power cycle"
27 |     bmc_host: "{{ inventory_hostname }}"
28 |     bmc_user: "{{ ansible_user }}"
29 |     bmc_password: "{{ ansible_password }}"
30 |     run_on: "{{ groups['foreman'][0] }}"
31 |   roles:
32 |     - nvidia.dpu_ops.bf_bmc
33 | 
34 |
35 | 36 |
powercycle.yml 37 |

38 | ---
39 | - hosts: bf2oob
40 |   user: "{{ remote_install_user }}"
41 |   become: true
42 |   gather_facts: true
43 | 
44 |   vars:
45 |     bmc_host: "{{ hostvars[non_bf2_host]['bmc_ip'] }}"
46 |     bmc_user: "{{ hostvars[non_bf2_host]['bmc_user'] }}"
47 |     bmc_password: "{{ hostvars[non_bf2_host]['bmc_password'] }}"
48 |     run_on: "{{ groups['foreman'][0] }}"
49 |   tasks:
50 |     - name: power-cycle x86 host
51 |       include_role:
52 |         name: nvidia.dpu_ops.bf_bmc
53 |         tasks_from: powercycle.yml
54 | 
55 |
56 | -------------------------------------------------------------------------------- /roles/bf_bmc/defaults/main.yml: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # 3 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 4 | # SPDX-License-Identifier: MIT 5 | # 6 | # Permission is hereby granted, free of charge, to any person obtaining a 7 | # copy of this software and associated documentation files (the "Software"), 8 | # to deal in the Software without restriction, including without limitation 9 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 | # and/or sell copies of the Software, and to permit persons to whom the 11 | # Software is furnished to do so, subject to the following conditions: 12 | # 13 | # The above copyright notice and this permission notice shall be included in 14 | # all copies or substantial portions of the Software. 15 | # 16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 22 | # DEALINGS IN THE SOFTWARE. 
23 | # 24 | ############################################################################### 25 | 26 | run_on: localhost 27 | powercycle_delay: 60 28 | -------------------------------------------------------------------------------- /roles/bf_bmc/tasks/chassis_power_off.yaml: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # 3 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 4 | # SPDX-License-Identifier: MIT 5 | # 6 | # Permission is hereby granted, free of charge, to any person obtaining a 7 | # copy of this software and associated documentation files (the "Software"), 8 | # to deal in the Software without restriction, including without limitation 9 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 | # and/or sell copies of the Software, and to permit persons to whom the 11 | # Software is furnished to do so, subject to the following conditions: 12 | # 13 | # The above copyright notice and this permission notice shall be included in 14 | # all copies or substantial portions of the Software. 15 | # 16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 22 | # DEALINGS IN THE SOFTWARE. 
23 | # 24 | ############################################################################### 25 | --- 26 | - name: ipmitool command 27 | command: 28 | ipmitool -I lanplus -H {{ bmc_host }} -U {{ bmc_user }} -P {{ bmc_password }} chassis power off 29 | register: bmc_output 30 | delegate_to: "{{ run_on }}" 31 | 32 | - name: ipmitool output 33 | debug: 34 | var: bmc_output.stdout_lines 35 | 36 | -------------------------------------------------------------------------------- /roles/bf_bmc/tasks/chassis_power_on.yaml: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # 3 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 4 | # SPDX-License-Identifier: MIT 5 | # 6 | # Permission is hereby granted, free of charge, to any person obtaining a 7 | # copy of this software and associated documentation files (the "Software"), 8 | # to deal in the Software without restriction, including without limitation 9 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 | # and/or sell copies of the Software, and to permit persons to whom the 11 | # Software is furnished to do so, subject to the following conditions: 12 | # 13 | # The above copyright notice and this permission notice shall be included in 14 | # all copies or substantial portions of the Software. 15 | # 16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 22 | # DEALINGS IN THE SOFTWARE. 
23 | # 24 | ############################################################################### 25 | --- 26 | - name: ipmitool command 27 | command: 28 | ipmitool -I lanplus -H {{ bmc_host }} -U {{ bmc_user }} -P {{ bmc_password }} chassis power on 29 | register: bmc_output 30 | delegate_to: "{{ run_on }}" 31 | 32 | - name: ipmitool output 33 | debug: 34 | var: bmc_output.stdout_lines 35 | 36 | -------------------------------------------------------------------------------- /roles/bf_bmc/tasks/main.yml: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # 3 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 4 | # SPDX-License-Identifier: MIT 5 | # 6 | # Permission is hereby granted, free of charge, to any person obtaining a 7 | # copy of this software and associated documentation files (the "Software"), 8 | # to deal in the Software without restriction, including without limitation 9 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 | # and/or sell copies of the Software, and to permit persons to whom the 11 | # Software is furnished to do so, subject to the following conditions: 12 | # 13 | # The above copyright notice and this permission notice shall be included in 14 | # all copies or substantial portions of the Software. 15 | # 16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 22 | # DEALINGS IN THE SOFTWARE. 
23 | # 24 | ############################################################################### 25 | --- 26 | - name: ipmitool command 27 | command: 28 | ipmitool -I lanplus -H {{ bmc_host }} -U {{ bmc_user }} -P {{ bmc_password }} {{ bmc_action }} 29 | register: bmc_output 30 | delegate_to: "{{ run_on }}" 31 | 32 | - name: ipmitool output 33 | debug: 34 | var: bmc_output.stdout_lines 35 | -------------------------------------------------------------------------------- /roles/bf_bmc/tasks/powercycle.yml: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # 3 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 4 | # SPDX-License-Identifier: MIT 5 | # 6 | # Permission is hereby granted, free of charge, to any person obtaining a 7 | # copy of this software and associated documentation files (the "Software"), 8 | # to deal in the Software without restriction, including without limitation 9 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 | # and/or sell copies of the Software, and to permit persons to whom the 11 | # Software is furnished to do so, subject to the following conditions: 12 | # 13 | # The above copyright notice and this permission notice shall be included in 14 | # all copies or substantial portions of the Software. 15 | # 16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 22 | # DEALINGS IN THE SOFTWARE. 
23 | # 24 | ############################################################################### 25 | --- 26 | - name: turn x86 host off 27 | include_tasks: chassis_power_off.yaml 28 | - name: Sleep for 1 minute to ensure power off 29 | pause: 30 | seconds: "{{ powercycle_delay }}" 31 | - name: turn x86 host on 32 | include_tasks: chassis_power_on.yaml 33 | - name: wait for machine to be back online 34 | wait_for: 35 | host: "{{ non_bf2_host }}" 36 | port: 22 37 | timeout: 900 38 | delay: "{{ powercycle_delay }}" 39 | delegate_to: "{{ groups['foreman'][0] }}" 40 | -------------------------------------------------------------------------------- /roles/dpu_nvconfig/README.md: -------------------------------------------------------------------------------- 1 | # Set nvconfig parameters of DPU 2 | 3 | The `dpu_nvconfig` role is used to: 4 | * set link type (ETH or IB) 5 | * set NIC mode 6 | * set GPU owner 7 | 8 | ## Parameters 9 | 10 | 1. Set link type (ETH or IB) 11 | * `link_type_p1` 12 | * `link_type_p2` 13 | 14 | 2. Set NIC mode: 15 | * `dpu_nic_mode` allowed values: `ConnectX` or `SmartNIC` 16 | 17 | 3. Set GPU owner for ROY adapter (DPU+GPU) 18 | * `gpu_owner` allowed values: `ARM` or `X86` 19 | 20 | ## Playbook examples 21 | 22 |
set_vpi_mode.yaml 23 |

 24 | ---
 25 | - hosts: bf2oob
 26 |   user: "{{ remote_install_user }}"
 27 |   become: true
 28 |   gather_facts: true
 29 |   pre_tasks:
 30 |     - name: Check for required variables
 31 |       ansible.builtin.fail:
 32 |         msg: "Neither link_type_p1 nor link_type_p2 variables defined. Allowed values: ETH or IB"
 33 |       when:  # list entries are AND-ed: abort only when both ports are unset
 34 |         - link_type_p1 is not defined
 35 |         - link_type_p2 is not defined
 36 | 
 37 |   vars:
 38 |     bmc_host: "{{ hostvars[non_bf2_host]['bmc_ip'] }}"
 39 |     bmc_user: "{{ hostvars[non_bf2_host]['bmc_user'] }}"
 40 |     bmc_password: "{{ hostvars[non_bf2_host]['bmc_password'] }}"
 41 |     run_on: "{{ groups['foreman'][0] }}"  # host the bf_bmc role delegates ipmitool to
 42 |     link_type_p1: ETH
 43 |     link_type_p2: IB
 44 | 
 45 |   roles:
 46 |     - nvidia.dpu_ops.dpu_nvconfig
 47 | 
 48 |   post_tasks:
 49 |     - name: reboot x86 host block
 50 |       block:
 51 |         - name: notify about reboot
 52 |           ansible.builtin.debug:
 53 |             msg: "!!! Reboot of {{ non_bf2_host }} is scheduled (playbook handler)!!!"
 54 |         - name: power-cycle x86 host
 55 |           ansible.builtin.include_role:
 56 |             name: nvidia.dpu_ops.bf_bmc
 57 |             tasks_from: powercycle.yml
 58 |       when: should_reboot is defined  # set by dpu_nvconfig once 'mlxconfig set' has run
 59 | 
60 |
61 | 62 |
set_nic_mode-cx.yaml 63 |

 64 | ---
 65 | - hosts: bf2oob
 66 |   user: "{{ remote_install_user }}"
 67 |   become: true
 68 |   gather_facts: true
 69 | 
 70 |   vars:
 71 |     bmc_host: "{{ hostvars[non_bf2_host]['bmc_ip'] }}"
 72 |     bmc_user: "{{ hostvars[non_bf2_host]['bmc_user'] }}"
 73 |     bmc_password: "{{ hostvars[non_bf2_host]['bmc_password'] }}"
 74 |     run_on: "{{ groups['foreman'][0] }}"  # host the bf_bmc role delegates ipmitool to
 75 | 
 76 |   roles:
 77 |     - {name: nvidia.dpu_ops.dpu_nvconfig,
 78 |        dpu_nic_mode: ConnectX}
 79 | 
 80 |   post_tasks:
 81 |     - name: power-cycle x86 host
 82 |       include_role:
 83 |         name: nvidia.dpu_ops.bf_bmc
 84 |         tasks_from: powercycle.yml
 85 |       when: should_reboot is defined  # task-level condition; set by dpu_nvconfig once 'mlxconfig set' has run
 86 | 
87 |
88 | 89 |
set_nic_mode-snic.yaml 90 |

 91 | ---
 92 | - hosts: bf2oob
 93 |   user: "{{ remote_install_user }}"
 94 |   become: true
 95 |   gather_facts: true
 96 | 
 97 |   vars:
 98 |     bmc_host: "{{ hostvars[non_bf2_host]['bmc_ip'] }}"
 99 |     bmc_user: "{{ hostvars[non_bf2_host]['bmc_user'] }}"
100 |     bmc_password: "{{ hostvars[non_bf2_host]['bmc_password'] }}"
101 |     run_on: "{{ groups['foreman'][0] }}"  # host the bf_bmc role delegates ipmitool to
102 | 
103 |   roles:
104 |     - {name: nvidia.dpu_ops.dpu_nvconfig,
105 |        dpu_nic_mode: SmartNIC}
106 | 
107 |   post_tasks:
108 |     - name: power-cycle x86 host
109 |       include_role:
110 |         name: nvidia.dpu_ops.bf_bmc
111 |         tasks_from: powercycle.yml
112 |       when: should_reboot is defined  # task-level condition; set by dpu_nvconfig once 'mlxconfig set' has run
113 | 
114 |
115 | 116 |
set_gpu_mode.yaml 117 |

118 | ---
119 | - hosts: bf2oob
120 |   user: "{{ remote_install_user }}"
121 |   become: true
122 |   gather_facts: true
123 | 
124 |   vars:
125 |     bmc_host: "{{ hostvars[non_bf2_host]['bmc_ip'] }}"
126 |     bmc_user: "{{ hostvars[non_bf2_host]['bmc_user'] }}"
127 |     bmc_password: "{{ hostvars[non_bf2_host]['bmc_password'] }}"
128 |     run_on: "{{ groups['foreman'][0] }}"  # host the bf_bmc role delegates ipmitool to
129 | 
130 |   roles:
131 |     - {name: nvidia.dpu_ops.dpu_nvconfig,
132 |        gpu_owner: ARM}
133 | 
134 |   post_tasks:
135 |     - name: power-cycle x86 host
136 |       include_role:
137 |         name: nvidia.dpu_ops.bf_bmc
138 |         tasks_from: powercycle.yml
139 |       when: should_reboot is defined  # task-level condition; set by dpu_nvconfig once 'mlxconfig set' has run
140 | 
141 |
142 | -------------------------------------------------------------------------------- /roles/dpu_nvconfig/tasks/main.yml: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # 3 | # Copyright 2022 NVIDIA Corporation 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | # this software and associated documentation files (the "Software"), to deal in 7 | # the Software without restriction, including without limitation the rights to 8 | # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | # the Software, and to permit persons to whom the Software is furnished to do so, 10 | # subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | # FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 18 | # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
21 | # 22 | ############################################################################### 23 | - name: get facts 24 | bf2_facts: 25 | when: bf2_devices is not defined 26 | # bf2[0] since the dpu card only has 1 entry 27 | 28 | - name: Change port link type 29 | include_tasks: set_link_type.yml 30 | 31 | - name: "Change NIC mode ({{ dpu_nic_mode }})" 32 | include_tasks: set_nic_mode.yml 33 | when: dpu_nic_mode is defined 34 | 35 | - name: "Change GPU owner ({{ gpu_owner }})" 36 | include_tasks: set_gpu_owner.yml 37 | when: gpu_owner is defined 38 | 39 | - name: Run 'mlxconfig set' 40 | include_tasks: run_mlxconfig.yml 41 | when: set_mlxconfig | length > 0 42 | 43 | -------------------------------------------------------------------------------- /roles/dpu_nvconfig/tasks/nvset.yml: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # 3 | # Copyright 2022 NVIDIA Corporation 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | # this software and associated documentation files (the "Software"), to deal in 7 | # the Software without restriction, including without limitation the rights to 8 | # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | # the Software, and to permit persons to whom the Software is furnished to do so, 10 | # subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | # FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR 18 | # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | # 22 | ############################################################################### 23 | --- 24 | - name: "{{ item.key }} is defined" 25 | block: 26 | - name: set fact 27 | set_fact: 28 | num_val: "{{ nv[item.key] | regex_replace('^.*\\((\\d+)\\)', '\\1') }}" 29 | new_val: "{{ item.value | string }}" 30 | # - debug: 31 | # msg: "key {{ item.key }} is defined: {{ nv[item.key] }} | {{ num_val }}" 32 | - name: set fact 33 | set_fact: 34 | set_mlxconfig: "{{ set_mlxconfig }} {{ item.key }}={{ item.value }}" 35 | when: num_val != new_val 36 | when: nv[item.key] is defined 37 | 38 | - name: "{{ item.key }} is not defined" 39 | block: 40 | # - debug: 41 | # msg: "key {{ item.key }} is NOT defined, set {{ item.value }}" 42 | - name: set fact 43 | set_fact: 44 | set_mlxconfig: "{{ set_mlxconfig }} {{ item.key }}={{ item.value }}" 45 | when: not nv[item.key] is defined 46 | #- name: 47 | # debug: 48 | # msg: key/val {{ arg }} | {{ set_mlxconfig }} 49 | 50 | -------------------------------------------------------------------------------- /roles/dpu_nvconfig/tasks/run_mlxconfig.yml: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # 3 | # Copyright 2022 NVIDIA Corporation 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | # this software and associated documentation files (the "Software"), to deal in 7 | # the Software without restriction, including without limitation the rights to 8 | # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | # the Software, and to permit persons to whom the Software is 
furnished to do so, 10 | # subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | # FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 18 | # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | # 22 | ############################################################################### 23 | --- 24 | - name: mlxconfig set {{ set_mlxconfig }} 25 | ansible.builtin.shell: mlxconfig -d {{ bf2_devices[0].mst }} -y s {{ set_mlxconfig }} 26 | register: mlxconfig_set_link_type 27 | - name: set should_reboot 28 | set_fact: 29 | should_reboot: true 30 | -------------------------------------------------------------------------------- /roles/dpu_nvconfig/tasks/set_embedded_cpu_model.yml: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # 3 | # Copyright 2022 NVIDIA Corporation 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | # this software and associated documentation files (the "Software"), to deal in 7 | # the Software without restriction, including without limitation the rights to 8 | # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | # the Software, and to permit persons to whom the Software is furnished to do so, 10 | # subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # 
copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | # FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 18 | # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | # 22 | ############################################################################### 23 | --- 24 | - include: nvset.yml arg={{ item }} 25 | with_dict: 26 | - {INTERNAL_CPU_MODEL: 1} 27 | - name: run 'mlxconfig set' 28 | include_tasks: run_mlxconfig.yml 29 | when: set_mlxconfig | length > 0 30 | - name: unset fact 31 | set_fact: 32 | set_mlxconfig: "" 33 | -------------------------------------------------------------------------------- /roles/dpu_nvconfig/tasks/set_gpu_owner.yml: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # 3 | # Copyright 2022 NVIDIA Corporation 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | # this software and associated documentation files (the "Software"), to deal in 7 | # the Software without restriction, including without limitation the rights to 8 | # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | # the Software, and to permit persons to whom the Software is furnished to do so, 10 | # subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 
14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | # FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 18 | # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | # 22 | ############################################################################### 23 | # Configurations: Next Boot New 24 | # PCI_DOWNSTREAM_PORT_OWNER[4] DEVICE_DEFAULT(0) EMBEDDED_CPU(15) 25 | --- 26 | - name: nvconfig 27 | set_fact: 28 | nv: '{{ bf2_devices[0].nvconfig }}' 29 | 30 | - name: Show GPU owner 31 | debug: 32 | msg: "Current GPU owner is {{ nv['PCI_DOWNSTREAM_PORT_OWNER[4]'] }}" 33 | 34 | - name: "Set GPU owned by ARM CPU" 35 | include_tasks: nvset.yml 36 | with_dict: 37 | - {"PCI_DOWNSTREAM_PORT_OWNER[4]": "15"} 38 | when: gpu_owner == "ARM" or gpu_owner == "arm" 39 | 40 | - name: "Set GPU owned by x86 CPU" 41 | include_tasks: nvset.yml 42 | with_dict: 43 | - {"PCI_DOWNSTREAM_PORT_OWNER[4]": "0"} 44 | when: gpu_owner == "X86" or gpu_owner == "x86" 45 | -------------------------------------------------------------------------------- /roles/dpu_nvconfig/tasks/set_link_type.yml: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # 3 | # Copyright 2022 NVIDIA Corporation 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | # this software and associated documentation files (the "Software"), to deal in 7 | # the Software without restriction, including without limitation the rights to 8 | # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | # the Software, and to 
permit persons to whom the Software is furnished to do so, 10 | # subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | # FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 18 | # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | # 22 | ############################################################################### 23 | --- 24 | - name: init 'set_mlxconfig' 25 | set_fact: 26 | set_mlxconfig: "" 27 | when: not (set_mlxconfig is defined) 28 | - block: 29 | - name: debug 30 | debug: 31 | msg: | 32 | Port 1: {{ bf2_devices[0].nvconfig.LINK_TYPE_P1 }} 33 | New value: {{ link_type_p1 }} 34 | - name: Set P1=IB? 35 | set_fact: 36 | set_mlxconfig: "{{ set_mlxconfig }} LINK_TYPE_P1=1" 37 | when: (bf2_devices[0].nvconfig.LINK_TYPE_P1 == 'ETH(2)' and link_type_p1 == 'IB') 38 | - name: Set P1=ETH? 39 | set_fact: 40 | set_mlxconfig: "{{ set_mlxconfig }} LINK_TYPE_P1=2" 41 | when: (bf2_devices[0].nvconfig.LINK_TYPE_P1 == 'IB(1)' and link_type_p1 == 'ETH') 42 | when: link_type_p1 is defined 43 | 44 | - block: 45 | - name: debug 46 | debug: 47 | msg: | 48 | Port 2: {{ bf2_devices[0].nvconfig.LINK_TYPE_P2 }} 49 | New value: {{ link_type_p2 }} 50 | - name: Set P2=IB? 51 | set_fact: 52 | set_mlxconfig: "{{ set_mlxconfig }} LINK_TYPE_P2=1" 53 | when: (bf2_devices[0].nvconfig.LINK_TYPE_P2 == 'ETH(2)' and link_type_p2 == 'IB') 54 | - name: Set P2=ETH? 
55 | set_fact: 56 | set_mlxconfig: "{{ set_mlxconfig }} LINK_TYPE_P2=2" 57 | when: (bf2_devices[0].nvconfig.LINK_TYPE_P2 == 'IB(1)' and link_type_p2 == 'ETH') 58 | when: link_type_p2 is defined 59 | -------------------------------------------------------------------------------- /roles/dpu_nvconfig/tasks/set_nic_mode.yml: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # 3 | # Copyright 2022 NVIDIA Corporation 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | # this software and associated documentation files (the "Software"), to deal in 7 | # the Software without restriction, including without limitation the rights to 8 | # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | # the Software, and to permit persons to whom the Software is furnished to do so, 10 | # subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | # FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 18 | # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | # 22 | ############################################################################### 23 | ## Ref: BLUEFIELD AS CONNECTX, Internal Architecture Spec 24 | # 5.2 Transition from SNIC mode to NIC mode 25 | # Transition from SNIC mode (default) to NIC mode should be available from x86, w/o accessing Arm cores. 26 | # 1. 
Install DPU ConnectX FW. 27 | # 2. NV configurations should be applied 28 | # a. INTERNAL_CPU_PAGE_SUPPLIER = EXT_HOST_PF 29 | # b. INTERNAL_CPU_ESWITCH_MANAGER = EXT_HOST_PF 30 | # c. INTERNAL_CPU_IB_VPORT0 = EXT_HOST_PF 31 | # d. INTERNAL_CPU_OFFLOAD_ENGINE = DISABLED 32 | # 3. Perform FW reset: mlxfwreset -d reset 33 | # a. Note, default reset flow (legacy/sub-1-sec) to be performed depends on system and device configurations. 34 | # b. It should be verified that if supported (mlxfwreset -d query), 35 | # both flows (legacy and sub-1-sec) should work and result in DPU device to be configured to DPU NIC mode. 36 | # 37 | # 5.2.1 Rshim host driver and Rshim PF aspect 38 | # 1. Rshim host driver isn’t must to complete transition and operate in DPU NIC mode 39 | # 2. The one who’d like to restrict Rshim PF should configure INTERNAL_CPU_RSHIM = DISABLED 40 | # a. Note: power cycle should be performed to apply such configuration 41 | # 42 | # 5.3 Transition from NIC mode back to SNIC mode (default) should include 43 | # ... 44 | # 3. NV configurations to be applied 45 | # a. INTERNAL_CPU_PAGE_SUPPLIER = ECPF 46 | # b. INTERNAL_CPU_ESWITCH_MANAGER = ECPF 47 | # c. INTERNAL_CPU_IB_VPORT0 = ECPF 48 | # d. INTERNAL_CPU_OFFLOAD_ENGINE = ENABLED 49 | # e. If INTERNAL_CPU_RSHIM = DISABLED, need to configure INTERNAL_CPU_RSHIM = ENABLED 50 | # i. Note, power cycle should be performed if INTERNAL_CPU_RSHIM is configured 51 | #--- 52 | # $ mlxconfig -d /dev/mst/mt41686_pciconf0 i 53 | # ... 
54 | # INTERNAL CPU CONF: 55 | # INTERNAL_CPU_ESWITCH_MANAGER= 56 | # Defines the owner of Eth Embedded Switch responsibilities 57 | # 0x0: ECPF 58 | # 0x1: EXT_HOST_PF 59 | # Valid for INTERNAL_CPU_MODEL = EMBEDDED_CPU 60 | --- 61 | - name: nvconfig 62 | set_fact: 63 | nv: '{{ bf2_devices[0].nvconfig }}' 64 | 65 | - name: info 66 | debug: 67 | msg: | 68 | New NIC mode: {{ dpu_nic_mode }} 69 | 70 | # if INTERNAL_CPU_MODEL SEPARATED_HOST(0) 71 | # "-E- The Device doesn't support INTERNAL_CPU_PAGE_SUPPLIER parameter" 72 | # so, we need to set INTERNAL_CPU_MODEL=EMBEDDED_CPU(1) before switching to ConnectX mode 73 | # 74 | - name: set nic_mode=ConnectX 75 | block: 76 | - include_tasks: set_embedded_cpu_model.yml 77 | when: nv.INTERNAL_CPU_MODEL == "SEPARATED_HOST(0)" 78 | - include_tasks: nvset.yml 79 | with_dict: 80 | - {INTERNAL_CPU_PAGE_SUPPLIER: 1} 81 | - {INTERNAL_CPU_ESWITCH_MANAGER: 1} 82 | - {INTERNAL_CPU_IB_VPORT0: 1} 83 | - {INTERNAL_CPU_OFFLOAD_ENGINE: 1} 84 | when: dpu_nic_mode == "ConnectX" or dpu_nic_mode == "CX" 85 | 86 | - name: set nic_mode=SmartNIC 87 | block: 88 | - include_tasks: set_embedded_cpu_model.yml 89 | when: nv.INTERNAL_CPU_MODEL == "SEPARATED_HOST(0)" 90 | - include_tasks: nvset.yml 91 | with_dict: 92 | - {INTERNAL_CPU_PAGE_SUPPLIER: 0} 93 | - {INTERNAL_CPU_ESWITCH_MANAGER: 0} 94 | - {INTERNAL_CPU_IB_VPORT0: 0} 95 | - {INTERNAL_CPU_OFFLOAD_ENGINE: 0} 96 | when: dpu_nic_mode == "SmartNIC" or dpu_nic_mode == "SNIC" 97 | -------------------------------------------------------------------------------- /roles/force_reboot_armos/README.md: -------------------------------------------------------------------------------- 1 | # Force reboot of ARM OS of DPU 2 | The `force_reboot_armos` role is used to reboot "ARM OS" of DPU 3 | from x86 host side 4 | 5 | ## Parameters 6 | 7 | ## Playbook examples 8 | 9 |
force_reboot_armos.yaml 10 |

11 | ---
12 | - hosts: all
13 |   user: "{{ remote_install_user }}"
14 |   gather_facts: false
15 |   become: true
16 |   pre_tasks:
17 |     - name: set is_bmc
18 |       ansible.builtin.set_fact:  # the role branches on is_bmc (x86-host path vs. BMC path)
19 |         is_bmc: "{{ inventory_hostname.startswith('bmc') }}"
20 |   roles:
21 |     - nvidia.dpu_ops.force_reboot_armos
22 | 
23 |
24 | 25 | -------------------------------------------------------------------------------- /roles/force_reboot_armos/tasks/main.yml: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # 3 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 4 | # SPDX-License-Identifier: MIT 5 | # 6 | # Permission is hereby granted, free of charge, to any person obtaining a 7 | # copy of this software and associated documentation files (the "Software"), 8 | # to deal in the Software without restriction, including without limitation 9 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 | # and/or sell copies of the Software, and to permit persons to whom the 11 | # Software is furnished to do so, subject to the following conditions: 12 | # 13 | # The above copyright notice and this permission notice shall be included in 14 | # all copies or substantial portions of the Software. 15 | # 16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 22 | # DEALINGS IN THE SOFTWARE. 
23 | # 24 | ############################################################################### 25 | - name: reboot armos from x86 host 26 | block: 27 | - name: get facts 28 | bf2_facts: 29 | when: bf2_devices is not defined 30 | register: bf2_output 31 | 32 | - name: Verify rshim is active on host 33 | assert: 34 | that: "{{ bf2_devices | nvidia.dpu_ops.get_rshim(rshim.dev) | length > 0 }}" 35 | fail_msg: > 36 | rshim is not active on the host, which could mean that either 37 | the BMC on the BF2 card owns the rshim or it is just disabled on the host. 38 | Please verify the state of your host and BF2 card before proceeding. 39 | 40 | - name: reboot bf2 41 | shell: echo "SW_RESET 1" > {{ rshim.dev }}/misc 42 | become: true 43 | when: is_bmc is false 44 | 45 | - name: reboot armos from the BMC 46 | block: 47 | - name: get rshim from BMC 48 | raw: 'find /dev -maxdepth 1 -name "rshim*"' 49 | register: rshim_output_bmc 50 | 51 | - name: Verify rshim is active on BMC 52 | assert: 53 | that: rshim_output_bmc.stdout_lines|length > 0 54 | fail_msg: > 55 | rshim is not active on the BMC, which could mean that either the x86 host 56 | owns the rshim or it is just disabled on the BMC. 57 | Please verify the state of your host and BF2 card before proceeding. 58 | 59 | - name: reboot bf2 on bmc 60 | raw: echo "SW_RESET 1" > /dev/rshim0/misc 61 | 62 | when: is_bmc is true 63 | -------------------------------------------------------------------------------- /roles/install_cuda/README.md: -------------------------------------------------------------------------------- 1 | # Install CUDA on x86 or DPU 2 | 3 | The `install_cuda` role is used to install Nvidia CUDA SDK 4 | on x86 host or on DPU. 5 | 6 | ## Parameters 7 | 8 | * `cuda_release` -- release of CUDA like `11.5.1`, `11.6.2` 9 | * `cuda_arch` -- CPU architecture to install. 
Allowed values are: 10 | * `amd64` 11 | * `arm64` 12 | * `mode` -- choose "DEB (network)" `deb_network` or "DEB (local)" `deb_local` 13 | 14 | 15 | ## Playbook examples 16 | 17 |
setup_cuda_network.yml 18 |

19 | # Usage:
20 | #   ansible-playbook setup_cuda_network.yml -v -e cuda_release=11.6.2 -e cuda_arch=arm64
21 | ---
22 | - hosts: "{{ groups['foreman'][0] }}"  # first host of the "foreman" inventory group
23 |   user: "{{ remote_install_user }}"
24 |   become: true
25 |   roles:
26 |     - nvidia.dpu_ops.prepare_cuda_repo
27 | 
28 | - hosts: bf2oob
29 |   user: "{{ remote_install_user }}"
30 |   become: true
31 |   roles:
32 |     - name: nvidia.dpu_ops.install_cuda
33 |       mode: deb_network  # "DEB (network)" install mode
34 |       when: cuda_arch == "arm64"  # role runs on this group only for an arm64 request
35 | 
36 | - hosts: x86host
37 |   user: "{{ remote_install_user }}"
38 |   become: true
39 |   roles:
40 |     - name: nvidia.dpu_ops.install_cuda
41 |       mode: deb_network  # "DEB (network)" install mode
42 |       when: cuda_arch == "amd64"  # role runs on this group only for an amd64 request
43 | 
44 |
45 | 46 |
setup_cuda_local.yml 47 |

48 | # Usage:
49 | #   ansible-playbook setup_cuda_local.yml -v -e cuda_release=11.6.2 -e cuda_arch=arm64
50 | ---
51 | - hosts: "{{ groups['foreman'][0] }}"  # first host of the "foreman" inventory group
52 |   user: "{{ remote_install_user }}"
53 |   become: true
54 |   roles:
55 |     - nvidia.dpu_ops.prepare_cuda_repo
56 | 
57 | - hosts: bf2oob
58 |   user: "{{ remote_install_user }}"
59 |   become: true
60 |   roles:
61 |     - name: nvidia.dpu_ops.install_cuda
62 |       mode: deb_local  # "DEB (local)" install mode
63 |       when: cuda_arch == "arm64"  # role runs on this group only for an arm64 request
64 | 
65 | - hosts: x86host
66 |   user: "{{ remote_install_user }}"
67 |   become: true
68 |   roles:
69 |     - name: nvidia.dpu_ops.install_cuda
70 |       mode: deb_local  # "DEB (local)" install mode
71 |       when: cuda_arch == "amd64"  # role runs on this group only for an amd64 request
72 | 
73 |
74 | -------------------------------------------------------------------------------- /roles/install_cuda/files/7fa2af80.gpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/ansible-collection-dpu-ops/ed16f85bea103b0229c3c17b9584a9f7cd133707/roles/install_cuda/files/7fa2af80.gpg -------------------------------------------------------------------------------- /roles/install_cuda/files/A024F6F0E6D6A281.gpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/ansible-collection-dpu-ops/ed16f85bea103b0229c3c17b9584a9f7cd133707/roles/install_cuda/files/A024F6F0E6D6A281.gpg -------------------------------------------------------------------------------- /roles/install_cuda/files/A4B469963BF863CC.gpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/ansible-collection-dpu-ops/ed16f85bea103b0229c3c17b9584a9f7cd133707/roles/install_cuda/files/A4B469963BF863CC.gpg -------------------------------------------------------------------------------- /roles/install_cuda/files/F60F4B3D7FA2AF80.gpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/ansible-collection-dpu-ops/ed16f85bea103b0229c3c17b9584a9f7cd133707/roles/install_cuda/files/F60F4B3D7FA2AF80.gpg -------------------------------------------------------------------------------- /roles/install_cuda/files/cuda-repository-pin-600: -------------------------------------------------------------------------------- 1 | # https://help.ubuntu.com/community/PinningHowto 2 | Package: nsight-compute 3 | Pin: origin *ubuntu.com* 4 | Pin-Priority: -1 5 | 6 | Package: nsight-systems 7 | Pin: origin *ubuntu.com* 8 | Pin-Priority: -1 9 | 10 | Package: * 11 | Pin: release l=NVIDIA CUDA 12 | Pin-Priority: 600 13 | 14 | 
-------------------------------------------------------------------------------- /roles/install_cuda/files/libnvidia-container.pub.pem: -------------------------------------------------------------------------------- 1 | -----BEGIN PGP PUBLIC KEY BLOCK----- 2 | 3 | mQINBFnNWDEBEACiX68rxIWvqH3h2GykO25oK9BAqV8fDtb6lXEbw3eKx4g87BRz 4 | M3DQBA0S0IfkQ72ovJ33H50+gVTXuu+Zme5muWk72m3pApccZVDLqdzYlpWPruNb 5 | MC+IlWr70yo8Jw8Zr1ihbWjFvMbDJTkgqPt2djNq3xxvdiKoZlgnpLRKIpSu9iBQ 6 | lNoZLHxTQKFH4219L77prRogv2QV1ckBL5lDVOERJuHo4jHE8mm9/NZ6v3m2HGuu 7 | AEZ7T9nWlPGiAIP8Pww4ZRTJcBANcI2EFKPLdfP61HTH6w0kVMkoAaGlemadTDl3 8 | ZcLpUpTFLc+ko/2uQ1qVPx9QYyoMrorS3kUmlXrhsA7FvcB09aIcb+JX6SVkcbO5 9 | A5+baCa3owwUtFBXMHM5hqpLv4P3/GsuW6283YwLZCf53dJY4lJZePqzPGsvs/wS 10 | vhnZrFvb61i/Aqm0hjhVh7h6VNxUiE8geMcjxy29LtzajoyS0EPVxes4xZu0VbS7 11 | 8LQyCNHSpS7TFmtVUQmbXqDN7cpiyr9+yutr0lZOMc7NYQt0nP/3RtYkWEob6wXa 12 | rVImHas1OYzlZymdO1uAnqkediS61E2vSD1OEq37/375FB/Q3AYXuNkQzDjYoJJz 13 | 9wsv7Xp0bdPzQ/daLdIFNQXo5MmVIirsWM07JvbZaJhDOiJxGn0MPf11/QARAQAB 14 | tEBOVklESUEgQ09SUE9SQVRJT04gKE9wZW4gU291cmNlIFByb2plY3RzKSA8Y3Vk 15 | YXRvb2xzQG52aWRpYS5jb20+iQI4BBMBCgAiBQJZzVgxAhsPBgsJCggHAwUVCgkI 16 | CwUWAgMBAAIeAQIXgAAKCRDdyuBE95bssAh6EACgUCww2sr8sOztEHKhvdCsonXu 17 | THYbel3YlWmVDPbh4dA31xoRXlvSJptJzPi/zlTc9fkVSFGbEZbFRR4JjnwYTMLD 18 | ElMh5YRMYAoPVYhWGKIO4earu32GhFuPjfr6h+0xNaQeDPIbr7bPe/AEhLSdJMzI 19 | OuAifr7UaC65A6YlxfeaSqyt0HthYujoQ12cWxP998C5jkc0IN2tyLs/OD7HLHht 20 | +lafqDSylykx63cw7jvsV/15rqZwVwjhkcxZyrKET32MTjXF3cxn7+TGpKS8B1k4 21 | a/EI7uXnncfSoma0dAT9bZM9JZbXQmSzCPDHHuVtnQ/3uh8VyenpigTFnrb20LCy 22 | 6WzJd3O9lAZXLhvwF/By3a07WLzRtTZNaUpt37Anb0js2syr3lohbmK9i3xvuqZN 23 | zhGPbqu9IV+vFgSGyTHRJUSBlHKDGiCdOOHc20MLPW1yRCXbx0F4eS9TWchYyJkJ 24 | NNczD5DnEl/gsvL4NCRxa+oUyUhhJ1HpJ6YNmTsy6nAAKIC+6248o164GiavaR3z 25 | 03RfaQayGHAUrBKi+PJBY7efgsZeYT8f+hyYrIC04MO8poBKS/GvSUL2QtVtj59N 26 | q+95gIptW2mZM8KRpt2huLH+QQ8SKr1vAECbpKJOwseqKmVyxX02iaSE8ifLE+tX 27 | 
FE8YgS3CZjWwy5PD0LkBDQRdgpCQAQgAx1oxX9tFlv3CIva0CJ0dsZyNF7mgHPgN 28 | szccUYLu0chyWYvwiVU/OlCzivytNX56wgeBgIVV1QzeBuTkrJSgzJ+dSgfrmyg5 29 | RwIDhvH+Dcut0++6+di1LyH9gXQcYPrN3pf4yR8nlRbm6K0Vsp0Z4+br18QelURe 30 | rfAkRordag26aB+MzVLvloHHu3Z6/v321uTGMdFd8CVCjovec5+EdcIAam3U/MmZ 31 | e2mr2M/x6F3st30cE7umq9Bb6UCqc6L8bQcoloxR3rwFzL1u9wUBUzQlaMNmxbe0 32 | BfezkmSQeC8JN4Fku+DtHEpS9uP5JEYNEEQ66K4mJDTMr0whBv1fKQARAQABiQNb 33 | BBgBCgAmAhsCFiEEyVsyG2HojBgJxPdZ3crgRPeW7LAFAl7oD1gFCQNGskgBKcBd 34 | IAQZAQoABgUCXYKQkAAKCRBu2RyjrBFgzZ/WB/9TuD2qzaBO7HlPDWRUTpFlvFgy 35 | Dc3XyfTAC/ISeYbIcPcq5kmVHgpsMdbN9Vvmot5GuT7VWzhHc9sJCmHgL330glBt 36 | NtSRflKzlBYnbiSWxLFYZtu2BtNOk8Ylbw8qw1E6W/iFBrqAwgeZvs2VOcPU3203 37 | Mqfi1JbS+YHC/bgs6cNq0zs/WJraYxiuleclKYExxLt9tRd0058n58GAph+Ki7mR 38 | InO6kxuKpsQannSn1Ku/DiaQcSF2L2TMSo0N9zwvYEZR+hgsKVqyRKT+DkZhusHJ 39 | HYGv96YHSTwo016ZhwYS9t0MLXY9/PgJysuO41Ya4Ii43D3UK1wOHTmyHZHTCRDd 40 | yuBE95bssDpwD/4jV9Pin3vAKa4hhn5GD4e478FNKRD58Q7qF3AhVTBNPIl1m4EF 41 | X7sqI6cXUDG4BjpS70ZRWF2x51ZTiq7DLTV/gGw2okfVjoWjzQY0ebrLd4IoNs80 42 | lIHmXxa+JdwB6WupCUzKCKLcPsX/yPAmswPNGAuIMAv+PWhUUSMVtzOZldnlogGM 43 | hbJ9UD2txFGGh9WoYc2vgX9KAaKryXcC6QMabv7JJU24HEJJDgbJEvtFM5PS8QMF 44 | bXIZsYgICWpQXVChBbduXo9sD2TUDWYAniNaaw4LKxPRG+Ix4HAqkh1oNOLojO30 45 | DO3r1/62FKE5/ykg3iSMTDR0iOES/leXCCIO9fRJT8+eucxyOQoY5ti7tjt1wm3H 46 | nTB+Rz3E/E2qeLs2PN82aseccm1G06pmsMCUiWtmSV6HjdO2XufYprrGLSu0RrT3 47 | sz5WHGUOY2iO40xHhSiXg3TcLZRpv30DQzxoUrx9Ff//rXLFznh+MksuvVD2roUR 48 | BGz/en31FxAcBoex9nNraeOekbFen5b7Xrq9wnzM5xZvJN2QYB3vS0khz/ZgFyy5 49 | 444ALa9gwb29FZCfA4m59S2QoB8uPQGM+8gnusE6J8y4fvI59ugafidIkt86dZ3m 50 | FsEME5XNmBGdNEo2flRVFfpG1IWds2Ba3IsdbYd9nzmbBW7/n0InVRDrIg== 51 | =9QWY 52 | -----END PGP PUBLIC KEY BLOCK----- 53 | -------------------------------------------------------------------------------- /roles/install_cuda/tasks/add_mirror_repo.yml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA 
CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Setup CUDA '(deb) local' repo 4 | --- 5 | - name: Get md5sum.txt from local mirror 6 | uri: 7 | url: "{{ foreman.foreman_mirror }}/cuda/{{ cuda_release }}-{{ cuda_arch }}/md5sum.txt" 8 | return_content: true 9 | status_code: [200, 404] # uri argument: accept 404 so the explicit check below can report it 10 | register: md5 11 | - name: Check metadata status 12 | fail: 13 | msg: "No md5sum.txt found for CUDA {{ cuda_release }}-{{ cuda_arch }} in local mirror" 14 | when: 15 | - md5.status == 404 16 | 17 | - name: set dist_ver fact 18 | set_fact: 19 | dist_ver: "{{ ansible_distribution | lower }}{{ ansible_distribution_version | regex_replace('\\.', '') }}" 20 | # ubuntu2004, ubuntu1804, ... 21 | # cuda-repo-ubuntu2004-11-5-local_11.5.2-495.29.05-1_amd64.deb 22 | - name: get repo package pattern 23 | set_fact: 24 | pkg_pattern: 'cuda-repo-{{ dist_ver }}-.*_{{ cuda_arch }}.deb' 25 | - name: get repo package 26 | set_fact: 27 | pkg_name: "{{ md5.content.splitlines() | map('regex_search', pkg_pattern) |select('string') |list }}" 28 | failed_when: pkg_name |count != 1 29 | - name: get repo local fn 30 | set_fact: 31 | pkg_nn: "{{ pkg_name[0] | split('_') | first }}" 32 | 33 | - name: copy cuda.list 34 | copy: 35 | content: | 36 | deb {{ foreman.foreman_mirror }}/cuda/{{ cuda_release }}-{{ cuda_arch }}/var/{{ pkg_nn }} ./ 37 | dest: /etc/apt/sources.list.d/cuda-repo.list 38 | owner: root 39 | group: root 40 | mode: '0644' 41 | -------------------------------------------------------------------------------- /roles/install_cuda/tasks/apt_common.yml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2 | # 3 | # https://developer.nvidia.com/cuda-downloads?target_os=Linux&target_arch=arm64-sbsa&Compilation=Native&Distribution=Ubuntu&target_version=20.04&target_type=deb_network 4 | --- 5 | - name: c_arch for aarch64 6 | set_fact: 7 | c_arch: sbsa 8 | c_ver: "{{ ansible_distribution_version | regex_replace('\\.', '') }}" 9 | when: ansible_architecture == 'aarch64' 10 | 11 | - name: c_arch for x86_64 12 | set_fact: 13 | c_arch: x86_64 14 | c_ver: "{{ ansible_distribution_version | regex_replace('\\.', '') }}" 15 | when: ansible_architecture == 'x86_64' 16 | 17 | # A024F6F0E6D6A281: Mellanox Technologies (Mellanox Technologies - Signing Key v3) 18 | # F60F4B3D7FA2AF80: cudatools manage-doca.yml 16 |

17 | ---
18 | - hosts: x86host
19 |   user: "{{ remote_install_user }}"
20 |   become: true
21 |   vars:
22 |     doca:  # read by the role default doca_url (doca.version and doca.package)
23 |       version: 1.2.1  # version path component of the download URL
24 |       package: doca-host-repo-ubuntu2004_1.2.1-0.1.5.1.2.006.5.5.2.1.7.0_amd64.deb  # .deb file name at the end of the URL
25 |   roles:
26 |     - nvidia.dpu_ops.install_doca
27 | 
28 | 29 | -------------------------------------------------------------------------------- /roles/install_doca/defaults/main.yml: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # 3 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 4 | # SPDX-License-Identifier: MIT 5 | # 6 | # Permission is hereby granted, free of charge, to any person obtaining a 7 | # copy of this software and associated documentation files (the "Software"), 8 | # to deal in the Software without restriction, including without limitation 9 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 | # and/or sell copies of the Software, and to permit persons to whom the 11 | # Software is furnished to do so, subject to the following conditions: 12 | # 13 | # The above copyright notice and this permission notice shall be included in 14 | # all copies or substantial portions of the Software. 15 | # 16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 22 | # DEALINGS IN THE SOFTWARE. 
23 | # 24 | ############################################################################### 25 | 26 | distro_version_url: "{{ ansible_distribution |lower }}{{ ansible_distribution_version }}" 27 | doca_url: "https://linux.mellanox.com/public/repo/doca/{{ doca.version }}/{{ distro_version_url }}/amd64/{{ doca.package }}" 28 | -------------------------------------------------------------------------------- /roles/install_doca/tasks/main.yml: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # 3 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 4 | # SPDX-License-Identifier: MIT 5 | # 6 | # Permission is hereby granted, free of charge, to any person obtaining a 7 | # copy of this software and associated documentation files (the "Software"), 8 | # to deal in the Software without restriction, including without limitation 9 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 | # and/or sell copies of the Software, and to permit persons to whom the 11 | # Software is furnished to do so, subject to the following conditions: 12 | # 13 | # The above copyright notice and this permission notice shall be included in 14 | # all copies or substantial portions of the Software. 15 | # 16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 22 | # DEALINGS IN THE SOFTWARE. 
23 | # 24 | ############################################################################### 25 | --- 26 | - name: Install mellanox gpg key 27 | apt_key: 28 | url: https://www.mellanox.com/downloads/ofed/RPM-GPG-KEY-Mellanox 29 | state: present 30 | 31 | - name: Install all in one doca package 32 | apt: 33 | deb: "{{ doca_url }}" 34 | 35 | - name: Remove old doca packages 36 | apt: 37 | update_cache: true 38 | state: absent 39 | name: 40 | - doca-sdk 41 | - doca-runtime 42 | - doca-tools 43 | when: downgrade is defined and downgrade 44 | 45 | - name: Install doca packages 46 | apt: 47 | update_cache: true 48 | name: 49 | - doca-sdk 50 | - doca-runtime 51 | - doca-tools 52 | -------------------------------------------------------------------------------- /roles/load_bfb/README.md: -------------------------------------------------------------------------------- 1 | # Load BFB image into DPU over RSHIM 2 | 3 | ## Parameters 4 | 5 | * `local_bfb` -- location of BFB image on local FS 6 | * `bfb_url` -- URL of BFB image to be downloaded on x86 host 7 | * `boot_mac` -- MAC-address of network interface to boot from 8 | 9 | ## Playbook examples 10 | 11 |
load-bfb.yaml 12 |

13 | ---
14 | - hosts: bf2:bf2oob:bmc
15 |   user: "{{ remote_install_user }}"
16 |   gather_facts: no # if using a bmc host, this will fail because ansible is not present
17 |   become: true
18 |   vars:
19 |     bfb_url: "{{ foreman.foreman_mirror }}/{{ product_version }}/{{ bfb.file }}"  # its dirname is also the base URL the role's BMC path downloads from
20 |     cloudinit_hostname: "{{ inventory_hostname | regex_replace('bmc-','') }}"  # inventory name with "bmc-" removed
21 |     cloudinit_ntp_host: "{{ subnet_dns_primary }}"
22 |     cloudinit_dns_host: "{{ subnet_dns_primary }}"
23 |     tmfifo_ip: "{{ hostvars[inventory_hostname].tmfifo_ip | default('192.168.100.2') }}/28"  # per-host value if set, else 192.168.100.2; /28 is always appended
24 |     tmfifo_mac: "{{ hostvars[inventory_hostname].tmfifo_mac | default('00:1a:ca:ff:ff:01') }}"
25 |     cloudinit_mtu: "{{ network_mtu }}"
26 |     ovs_mtu: "{{ network_mtu|int + 50 }}"
27 |     cloudinit_domain: "{{ domain }}"
28 |     bfcfg_template: "roles/load_bfb/templates/bf2_ndo.cfg.j2"  # overrides the role default (bf_ubuntu.cfg.j2); NOTE(review): bf2_ndo.cfg.j2 is not among the templates shipped with the role - confirm the path
29 |     ansible_fqdn: "{{ inventory_hostname }}" # this hack is because facts are not gathered and the non_bf2_host uses it
30 |   pre_tasks:
31 |     - name: set is_bmc  # load_bfb chooses its x86-host or BMC branch from this flag
32 |       set_fact:
33 |         is_bmc: "{{ inventory_hostname.startswith('bmc') }}"
34 |     - name: bmc operations  # only for inventory names starting with "bmc" (see `when` below)
35 |       block:
36 |         - name: set hosts
37 |           set_fact:
38 |             x86_host: "{{ foreman_url }}"  # for BMC targets the role's delegated tasks run on this host
39 |             dpu_host: "{{ inventory_hostname | regex_replace('bmc-','') }}"
40 |         - name: set bmc facts
41 |           set_fact:
42 |             boot_mac: "{{ hostvars[dpu_host]['oob_mac'] if bf2.oob_provision else hostvars[dpu_host]['primary_mac'] }}"  # OOB MAC when bf2.oob_provision is truthy, else the primary MAC
43 |             local_bfb: "/var/www/{{ product_version }}/{{ bfb.file }}" # directly manipulate the foreman filesystem
44 |       when: inventory_hostname.startswith('bmc')
45 |     - name: x86 host operations  # only for inventory names that do not start with "bmc"
46 |       block:
47 |         - name: set hosts
48 |           set_fact:
49 |             x86_host: "{{ non_bf2_host }}"
50 |             dpu_host: "{{ inventory_hostname | regex_replace('oob-','') }}"  # inventory name with "oob-" removed
51 |         - name: set non bmc facts
52 |           set_fact:
53 |             boot_mac: "{{ hostvars[dpu_host]['oob_mac'] if bf2.oob_provision else hostvars[dpu_host]['primary_mac'] }}"  # OOB MAC when bf2.oob_provision is truthy, else the primary MAC
54 |             local_bfb: "{{ bf2.download_local_path }}/{{ bfb.file }}"  # where the downloaded image ends up on the x86 host
55 |         - name: Create bfb temp dir
56 |           file:
57 |             state: directory
58 |             path: "{{ bf2.download_local_path }}"
59 |             owner: root
60 |             group: root
61 |             mode: "0755"  # a directory needs the execute (search) bit to be traversable; 0644 lacks it
62 |           delegate_to: "{{ x86_host }}"
63 |         - name: Download bfb from web server
64 |           get_url:
65 |             url: "{{ bfb_url }}"
66 |             dest: "{{ bf2.download_local_path }}"  # directory destination; the file keeps its remote name
67 |             validate_certs: "{{ foreman.validate_certs }}"
68 |           delegate_to: "{{ x86_host }}"
69 |       when: not inventory_hostname.startswith('bmc')
70 |   roles:
71 |     - nvidia.dpu_ops.load_bfb
72 | 
73 |
74 | -------------------------------------------------------------------------------- /roles/load_bfb/defaults/main.yml: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # 3 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 4 | # SPDX-License-Identifier: MIT 5 | # 6 | # Permission is hereby granted, free of charge, to any person obtaining a 7 | # copy of this software and associated documentation files (the "Software"), 8 | # to deal in the Software without restriction, including without limitation 9 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 | # and/or sell copies of the Software, and to permit persons to whom the 11 | # Software is furnished to do so, subject to the following conditions: 12 | # 13 | # The above copyright notice and this permission notice shall be included in 14 | # all copies or substantial portions of the Software. 15 | # 16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 22 | # DEALINGS IN THE SOFTWARE. 
23 | # 24 | ############################################################################### 25 | 26 | bfcfg_template: bf_ubuntu.cfg.j2 27 | -------------------------------------------------------------------------------- /roles/load_bfb/tasks/main.yml: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # 3 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 4 | # SPDX-License-Identifier: MIT 5 | # 6 | # Permission is hereby granted, free of charge, to any person obtaining a 7 | # copy of this software and associated documentation files (the "Software"), 8 | # to deal in the Software without restriction, including without limitation 9 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 | # and/or sell copies of the Software, and to permit persons to whom the 11 | # Software is furnished to do so, subject to the following conditions: 12 | # 13 | # The above copyright notice and this permission notice shall be included in 14 | # all copies or substantial portions of the Software. 15 | # 16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 22 | # DEALINGS IN THE SOFTWARE. 
23 | # 24 | ############################################################################### 25 | - name: Load BFB from x86 host 26 | block: 27 | - name: get facts 28 | bf2_facts: 29 | when: bf2_devices is not defined 30 | delegate_to: "{{ x86_host }}" 31 | 32 | - name: Verify rshim is active on host 33 | assert: 34 | that: "{{ bf2_devices | nvidia.dpu_ops.get_rshim(rshim.dev) | length > 0 }}" 35 | fail_msg: > 36 | rshim is not active on the host, which could mean that either 37 | the BMC on the BF2 card owns the rshim or it is just disabled on the host. 38 | Please verify the state of your host and BF2 card before proceeding. 39 | delegate_to: "{{ x86_host }}" 40 | 41 | - name: Generate bf.cfg 42 | template: 43 | src: "{{ bfcfg_template }}" 44 | dest: /tmp/bf.cfg 45 | owner: root 46 | group: root 47 | mode: "0600" # the rendered config can carry a password hash and is not executed; keep it root-only 48 | delegate_to: "{{ x86_host }}" 49 | 50 | - name: Load bfb into bf2 51 | shell: "bfb-install --bfb {{ local_bfb }} --config /tmp/bf.cfg --rshim {{ rshim.dev }}" 52 | delegate_to: "{{ x86_host }}" 53 | 54 | - name: Delete temporary bf.cfg 55 | file: 56 | path: /tmp/bf.cfg 57 | state: absent 58 | delegate_to: "{{ x86_host }}" 59 | when: is_bmc is false 60 | 61 | - name: Load BFB from BMC rshim 62 | block: 63 | - name: get rshim from BMC 64 | raw: 'find /dev -maxdepth 1 -name "rshim*"' 65 | register: rshim_output_bmc 66 | 67 | - name: Verify rshim is active on BMC 68 | assert: 69 | that: rshim_output_bmc.stdout_lines|length > 0 70 | fail_msg: > 71 | rshim is not active on the BMC, which could mean that either the x86 host 72 | owns the rshim or it is just disabled on the BMC. 73 | Please verify the state of your host and BF2 card before proceeding.
74 | 75 | - name: delete directory to ensure it is empty 76 | file: 77 | state: absent 78 | path: "{{ local_bfb | dirname }}/{{ boot_mac }}" 79 | owner: root 80 | group: root 81 | mode: "0755" 82 | delegate_to: "{{ x86_host }}" 83 | 84 | - name: Create directory to append the files 85 | file: 86 | state: directory 87 | path: "{{ local_bfb | dirname }}/{{ boot_mac }}" 88 | owner: root 89 | group: root 90 | mode: "0755" 91 | delegate_to: "{{ x86_host }}" 92 | 93 | - name: Copy bf.cfg for appending 94 | copy: 95 | src: "{{ local_bfb }}" 96 | dest: "{{ local_bfb | dirname }}/{{ boot_mac }}" 97 | owner: root 98 | group: root 99 | mode: "0755" 100 | remote_src: true 101 | delegate_to: "{{ x86_host }}" 102 | 103 | - name: Generate bf.cfg 104 | template: 105 | src: "{{ bfcfg_template }}" 106 | dest: "{{ local_bfb | dirname }}/{{ boot_mac }}/bf.cfg" 107 | owner: root 108 | group: root 109 | mode: "0755" 110 | delegate_to: "{{ x86_host }}" 111 | 112 | - name: Assemble combined file 113 | assemble: 114 | src: "{{ local_bfb | dirname }}/{{ boot_mac }}" 115 | dest: "{{ local_bfb | dirname }}/{{ boot_mac }}/bfb-and-config.bfb" 116 | owner: root 117 | group: root 118 | mode: "0755" 119 | delegate_to: "{{ x86_host }}" 120 | 121 | - name: Load bfb into bf2 122 | raw: "wget --no-check-certificate {{ bfb_url | dirname }}/{{ boot_mac }}/bfb-and-config.bfb -O /dev/rshim0/boot" 123 | retries: 10 124 | delay: 1 125 | register: result 126 | until: result.rc == 0 127 | when: is_bmc is true 128 | -------------------------------------------------------------------------------- /roles/load_bfb/templates/bf.cfg.j2: -------------------------------------------------------------------------------- 1 | {% block preamble %}{% endblock %} 2 | bfb_modify_os() 3 | { 4 | {% block script_begin %}{%endblock %} 5 | 6 | # Glean the BOOTNIC from the mac passed in for configuring the bf.cfg 7 | # The devices are not renamed to p{0,1} until first boot 8 | HOST_MAC={{ boot_mac }} 9 | for p in /sys/class/net/* 
10 | do 11 | n=${p##*/} 12 | # echo $d = $n 13 | mac=$(cat $p/address) 14 | if [ "$mac" != "$HOST_MAC" ] ; then 15 | continue 16 | fi 17 | dev=$(readlink $p/device) 18 | case "$dev" in 19 | *MLNXBF17*) 20 | DEVICE=OOB 21 | PROVISION_IFC=oob_net0 22 | ;; 23 | *03:00*) 24 | port=${dev##*03:00.} 25 | DEVICE="NIC_P${port}" 26 | PROVISION_IFC="eth${port}" 27 | ;; 28 | *) 29 | echo "$dev unknown" >&2 30 | ;; 31 | esac 32 | break 33 | done 34 | if [ -z "$DEVICE" ] ; then 35 | DEVICE=OOB 36 | PROVISION_IFC=oob_net0 37 | fi 38 | echo "DEVICE=$DEVICE ; PROVISION_IFC=$PROVISION_IFC" 39 | BOOTNIC=NET-${DEVICE}-IPV4 40 | 41 | # Note: This section uses EOF (no backslash), so the shell will substitute $VARS 42 | cat << EOF > /mnt/etc/bf.cfg 43 | BOOT0=${BOOTNIC} 44 | BOOT1=DISK 45 | EOF 46 | 47 | # Note: this section uses \EOF (quoted heredoc), so the shell expands nothing in it and values like password hashes are written verbatim instead of being mangled and rendered useless. 48 | # Because of that, there is no variable substitution from the finish template at BFB install. Please ensure 49 | # there are no variables in the blocks of this user-data file that are expected to be read at BFB install time. 
50 | cat << \EOF > /mnt/var/lib/cloud/seed/nocloud-net/user-data 51 | #cloud-config 52 | {% block cloudinit %} 53 | {% endblock %} 54 | runcmd: 55 | - [ grub-install ] 56 | - [ /usr/bin/bfcfg ] 57 | {% block cloudinit_extra_commands %}{% endblock %} 58 | EOF 59 | 60 | # mst start 61 | # DEV=$(/bin/ls -1 /dev/mst/mt*pciconf0) 62 | DEV=03:00.0 63 | OWNERSHIP_STATUS=$(mlxconfig -d $DEV q INTERNAL_CPU_MODEL |awk '/INTERNAL_CPU_MODEL/ {print $2}') 64 | 65 | if [[ $OWNERSHIP_STATUS == 'EMBEDDED_CPU(1)' ]]; then 66 | cat > /mnt/var/lib/cloud/seed/nocloud-net/network-config.orig << 'EOF' 67 | {% block embedded_network %}{% endblock %} 68 | EOF 69 | else 70 | cat > /mnt/var/lib/cloud/seed/nocloud-net/network-config.orig << 'EOF' 71 | {% block separated_network %}{% endblock %} 72 | EOF 73 | fi 74 | 75 | [ -s /mnt/var/lib/cloud/seed/nocloud-net/network-config.orig ] && \ 76 | mv /mnt/var/lib/cloud/seed/nocloud-net/network-config.orig /mnt/var/lib/cloud/seed/nocloud-net/network-config 77 | 78 | # Set mtu for ovs ports 79 | cat << \EOF >> /mnt/etc/mellanox/mlnx-ovs.conf 80 | {% block ovs_config %}{% endblock %} 81 | EOF 82 | 83 | {% block script_end %}{% endblock %} 84 | 85 | } 86 | {% block postamble %}{% endblock %} 87 | 88 | # Do not remove these trailing spaces or cat bfb will not work 89 | 90 | 91 | 92 | 93 | -------------------------------------------------------------------------------- /roles/load_bfb/templates/bf_ubuntu.cfg.j2: -------------------------------------------------------------------------------- 1 | {% extends "bf.cfg.j2" %} 2 | 3 | {% block preamble%} 4 | {% endblock %} 5 | 6 | {% block script_begin %} 7 | set +x 8 | {% endblock %} 9 | 10 | {% block cloudinit %} 11 | debug: 12 | verbose: true 13 | timezone: "Etc/UTC" 14 | hostname: {{ ansible_hostname }} 15 | manage_etc_hosts: true 16 | users: 17 | - name: ubuntu 18 | shell: /bin/bash 19 | sudo: ALL=(ALL) NOPASSWD:ALL 20 | lock_passwd: false 21 | passwd: {{ hashed_user_password }} 22 | groups: [adm, 
audio, cdrom, dialout, dip, floppy, lxd, netdev, plugdev, sudo, video] 23 | {% endblock %} 24 | 25 | {% block cloudinit_extra_commands %} 26 | # - [ systemctl, enable, rshim ] 27 | {% endblock %} 28 | 29 | {% block embedded_network %} 30 | {% endblock %} 31 | 32 | {% block separated_network %} 33 | {% endblock %} 34 | 35 | {% block ovs_config %} 36 | {% endblock %} 37 | 38 | {% block script_end %} 39 | {% endblock %} 40 | 41 | {% block postamble %} 42 | {% endblock %} 43 | -------------------------------------------------------------------------------- /roles/manage_bf2_fw/README.md: -------------------------------------------------------------------------------- 1 | # Update DPU NIC firmware 2 | 3 | The `manage_bf2_fw` role updates the NIC firmware of the DPU and power-cycles the x86 host 4 | 5 | ## Playbook examples 6 | 7 |
manage-bf2-fw.yaml 8 |

 9 | ---
10 | - hosts: bf2oob
11 |   user: "{{ remote_install_user }}"
12 |   become: true
13 |   vars:
14 |     bmc_host: "{{ hostvars[non_bf2_host]['bmc_ip'] }}"  # BMC address, user and password are read from the hostvars of non_bf2_host
15 |     bmc_user: "{{ hostvars[non_bf2_host]['bmc_user'] }}"
16 |     bmc_password: "{{ hostvars[non_bf2_host]['bmc_password'] }}"
17 |     run_on: "{{ groups['foreman'][0] }}"  # NOTE(review): presumably the host from which the bf_bmc role issues BMC commands - confirm
18 |   roles:
19 |     - nvidia.dpu_ops.manage_bf2_fw  # runs the firmware updater and uses bf_bmc for chassis power off/on
20 |   post_tasks:
21 |     - name: wait for machine to be back online
22 |       wait_for:
23 |         host: "{{ non_bf2_host }}"
24 |         port: 22  # SSH
25 |         timeout: 900  # give up after 15 minutes
26 |         delay: 60  # wait one minute before the first check
27 |       delegate_to: "{{ groups['foreman'][0] }}"  # the check runs from the first foreman host
29 |
30 | -------------------------------------------------------------------------------- /roles/manage_bf2_fw/defaults/main.yml: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # 3 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 4 | # SPDX-License-Identifier: MIT 5 | # 6 | # Permission is hereby granted, free of charge, to any person obtaining a 7 | # copy of this software and associated documentation files (the "Software"), 8 | # to deal in the Software without restriction, including without limitation 9 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 | # and/or sell copies of the Software, and to permit persons to whom the 11 | # Software is furnished to do so, subject to the following conditions: 12 | # 13 | # The above copyright notice and this permission notice shall be included in 14 | # all copies or substantial portions of the Software. 15 | # 16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 22 | # DEALINGS IN THE SOFTWARE. 
23 | # 24 | ############################################################################### 25 | 26 | force: False 27 | -------------------------------------------------------------------------------- /roles/manage_bf2_fw/tasks/main.yml: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # 3 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 4 | # SPDX-License-Identifier: MIT 5 | # 6 | # Permission is hereby granted, free of charge, to any person obtaining a 7 | # copy of this software and associated documentation files (the "Software"), 8 | # to deal in the Software without restriction, including without limitation 9 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 | # and/or sell copies of the Software, and to permit persons to whom the 11 | # Software is furnished to do so, subject to the following conditions: 12 | # 13 | # The above copyright notice and this permission notice shall be included in 14 | # all copies or substantial portions of the Software. 15 | # 16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 22 | # DEALINGS IN THE SOFTWARE. 
23 | #
24 | ###############################################################################
25 | # Update the firmware with mlnx_fw_updater.pl, then power-cycle the x86 host
26 | # through the bf_bmc role when the updater reports that firmware was flashed.
27 | #
28 | # Variables:
29 | #   force -- when true, the updater is run with --force-fw-update
30 | #
31 | # Registers:
32 | #   fw_update -- result of the updater run; rc 0 marks the task as changed,
33 | #                rc 1 marks it as failed
34 |
35 | # A single task is used on purpose: two mutually exclusive tasks registering
36 | # the same variable do not work, because Ansible also registers a (skipped)
37 | # result for the task that does not run and that result replaces the real one.
38 | - name: FW update
39 |   shell: "/opt/mellanox/mlnx-fw-updater/mlnx_fw_updater.pl{{ ' --force-fw-update' if (force | bool) else '' }}"
40 |   register: fw_update
41 |   changed_when:
42 |     - fw_update.rc == 0
43 |   failed_when:
44 |     - fw_update.rc == 1
45 |
46 | # Power-cycle only when firmware was actually flashed. The "changed" test is
47 | # used instead of comparing the boolean against a list of integers, which is
48 | # true for both True and False and would power-cycle the host on every run.
49 | - name: reboot host
50 |   when: fw_update is changed
51 |   block:
52 |     - name: turn x86 host off
53 |       vars:
54 |         bmc_action: "chassis power off"
55 |       include_role:
56 |         name: bf_bmc
57 |
58 |     - name: Sleep for 1 minute to ensure power off
59 |       pause:
60 |         seconds: 60
61 |
62 |     - name: turn x86 host on
63 |       vars:
64 |         bmc_action: "chassis power on"
65 |       include_role:
66 |         name: bf_bmc
67 |
--------------------------------------------------------------------------------
/roles/manage_bf2_nic_speed/README.md:
--------------------------------------------------------------------------------
1 | # Set ethernet link speed of DPU ports
2 |
3 | ## Parameters
4 |
5 | * `p0_nic_speed_options` -- speed for port #1
6 | * `p1_nic_speed_options` -- speed for port #2
7 |
8 | ## Playbook examples
9 |
10 |
manage-bf2-nic-speed.yaml 11 |

12 | ---
13 | - hosts: bf2oob
14 |   user: "{{ remote_install_user }}"
15 |   become: true
16 |   vars:
17 |     p0_nic_speed_options: "{{ bf2.p0_nic_speed_options }}"
18 |     p1_nic_speed_options: "{{ bf2.p1_nic_speed_options }}"
19 |   roles:
20 |     - nvidia.dpu_ops.manage_bf2_nic_speed
21 | 
22 |
23 | 24 | 25 | -------------------------------------------------------------------------------- /roles/manage_bf2_nic_speed/tasks/main.yml: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # 3 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 4 | # SPDX-License-Identifier: MIT 5 | # 6 | # Permission is hereby granted, free of charge, to any person obtaining a 7 | # copy of this software and associated documentation files (the "Software"), 8 | # to deal in the Software without restriction, including without limitation 9 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 | # and/or sell copies of the Software, and to permit persons to whom the 11 | # Software is furnished to do so, subject to the following conditions: 12 | # 13 | # The above copyright notice and this permission notice shall be included in 14 | # all copies or substantial portions of the Software. 15 | # 16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 22 | # DEALINGS IN THE SOFTWARE. 
23 | #
24 | ###############################################################################
25 | # Apply the requested ethtool settings to ports p0 and p1 right away, then
26 | # install a udev rule that runs the same ethtool commands whenever the
27 | # interfaces are added.
28 | #
29 | # Variables:
30 | #   p0_nic_speed_options -- option string passed to "ethtool -s p0"
31 | #   p1_nic_speed_options -- option string passed to "ethtool -s p1"
32 |
33 | # One ethtool invocation per port; each loop item is the complete command line.
34 | - name: set nic speed via ethtool
35 |   loop:
36 |     - "/sbin/ethtool -s p0 {{ p0_nic_speed_options }}"
37 |     - "/sbin/ethtool -s p1 {{ p1_nic_speed_options }}"
38 |   command: "{{ item }}"
39 |
40 | # Render 83-net-speed.rules.j2 into the udev rules directory.
41 | - name: Add nic speed configuration to udev rules
42 |   template:
43 |     src: 83-net-speed.rules.j2
44 |     dest: /etc/udev/rules.d/83-net-speed.rules
45 |     mode: "0644"
46 |     owner: root
47 |     group: root
48 |
--------------------------------------------------------------------------------
/roles/manage_bf2_nic_speed/templates/83-net-speed.rules.j2:
--------------------------------------------------------------------------------
1 | SUBSYSTEM=="net", ACTION=="add", NAME=="p0", RUN+="/sbin/ethtool -s p0 {{ p0_nic_speed_options }}"
2 | SUBSYSTEM=="net", ACTION=="add", NAME=="p1", RUN+="/sbin/ethtool -s p1 {{ p1_nic_speed_options }}"
3 |
--------------------------------------------------------------------------------
/roles/manage_bf_bmc_fw/README.md:
--------------------------------------------------------------------------------
1 | # Update BMC firmware of DPU
2 |
3 | ## Parameters
4 |
5 | * `bmc_url` -- URL of BMC firmware image
6 |
7 | ## Playbook examples
8 |
9 |
manage-bf-bmc-fw.yaml 10 |

11 | ---
12 | - hosts: bmc
13 |   user: "{{ remote_install_user }}"
14 |   gather_facts: no # fact gathering fails on BMC hosts because Python is not available there
15 |   become: true
16 |   vars:
17 |     bmc_url: "{{ foreman.foreman_mirror }}/{{ bmc.file }}"
18 |   roles:
19 |     - nvidia.dpu_ops.manage_bf_bmc_fw
20 | 
21 |
22 | -------------------------------------------------------------------------------- /roles/manage_bf_bmc_fw/tasks/main.yml: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # 3 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 4 | # SPDX-License-Identifier: MIT 5 | # 6 | # Permission is hereby granted, free of charge, to any person obtaining a 7 | # copy of this software and associated documentation files (the "Software"), 8 | # to deal in the Software without restriction, including without limitation 9 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 | # and/or sell copies of the Software, and to permit persons to whom the 11 | # Software is furnished to do so, subject to the following conditions: 12 | # 13 | # The above copyright notice and this permission notice shall be included in 14 | # all copies or substantial portions of the Software. 15 | # 16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 22 | # DEALINGS IN THE SOFTWARE. 
23 | #
24 | ###############################################################################
25 | # Download the BMC firmware image into /tmp/images on the target, then upgrade
26 | # and reboot it with the raw_upgrade and raw_reboot actions.
27 | #
28 | # Variables:
29 | #   bmc_url  -- URL of the BMC firmware image
30 | #   bmc.file -- optional file name the image is stored under in /tmp/images;
31 | #               when it is not set, the last path component of bmc_url is used
32 |
33 | # NOTE(review): --no-check-certificate turns off TLS certificate verification
34 | # for the firmware download; confirm this is acceptable for the mirror in use.
35 | - name: Copy image tarball
36 |   raw: "wget --no-check-certificate {{ bmc_url }} -O /tmp/images/{{ (bmc | default({})).file | default(bmc_url | basename) }}"
37 |
38 | - name: Give BMC time to split the tar up
39 |   pause:
40 |     seconds: 10
41 |
42 | # NOTE(review): retries/delay are assumed to be arguments of raw_upgrade (as
43 | # reboot_timeout is for raw_reboot below) -- confirm against the action plugin.
44 | - name: raw upgrade bmc
45 |   raw_upgrade:
46 |     retries: 100
47 |     delay: 60
48 |
49 | - name: raw reboot bmc
50 |   raw_reboot:
51 |     reboot_timeout: 3600
52 |
--------------------------------------------------------------------------------
/roles/manage_rshim_owner/README.md:
--------------------------------------------------------------------------------
1 | # Set RSHIM ownership
2 |
3 | The `manage_rshim_owner` role sets the owner of the DPU RSHIM interface.
4 | The owner can be either the BMC or the x86 host.
5 |
6 | ## Parameters
7 |
8 | * `bf_target` -- who is the owner of RSHIM: `bmc` or `x86`
9 |
10 | ## Playbook example
11 |
12 |
bf2_mode.yml 13 |

14 | - hosts: "bmc"
15 |   user: "{{ remote_install_user }}"
16 |   become: true
17 |   gather_facts: False
18 |   vars:
19 |     bf_target: "bmc"  # internal variable for the non_bf2_host regex
20 |     ansible_fqdn: "{{ inventory_hostname }}" # workaround: facts are not gathered, but non_bf2_host relies on ansible_fqdn
21 |     x86_host: "{{ non_bf2_host }}"
22 |     bmc_host: "{{ inventory_hostname }}"
23 |   roles:
24 |     - nvidia.dpu_ops.manage_rshim_owner
25 | 
26 |
27 | -------------------------------------------------------------------------------- /roles/manage_rshim_owner/tasks/change_owner.yaml: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # 3 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 4 | # SPDX-License-Identifier: MIT 5 | # 6 | # Permission is hereby granted, free of charge, to any person obtaining a 7 | # copy of this software and associated documentation files (the "Software"), 8 | # to deal in the Software without restriction, including without limitation 9 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 | # and/or sell copies of the Software, and to permit persons to whom the 11 | # Software is furnished to do so, subject to the following conditions: 12 | # 13 | # The above copyright notice and this permission notice shall be included in 14 | # all copies or substantial portions of the Software. 15 | # 16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 22 | # DEALINGS IN THE SOFTWARE. 
23 | #
24 | ###############################################################################
25 | # Hand the RSHIM interface over from its current owner to the other side.
26 | # At most one of the two blocks below runs, selected by current_rshim_owner.
27 | #
28 | # Variables read: current_rshim_owner, bmc_host, x86_host
29 | #
30 | # NOTE(review): only current_rshim_owner is consulted here, so every run
31 | # switches the owner; bf_target is not referenced in this file -- confirm that
32 | # this toggle is the intended behaviour.
33 |
34 | # BMC -> x86: stop and mask rshim on the BMC, wait, restart rshim on the host.
35 | - name: Rshim move from bmc to x86
36 |   when: current_rshim_owner == 'bmc'
37 |   block:
38 |     - name: Stop rshim on {{ bmc_host }}
39 |       delegate_to: "{{ bmc_host }}"
40 |       become: true
41 |       raw: systemctl stop rshim; systemctl mask rshim
42 |
43 |     - name: Wait 5 seconds
44 |       pause:
45 |         seconds: 5
46 |
47 |     - name: Restart rshim on {{ x86_host }}
48 |       delegate_to: "{{ x86_host }}"
49 |       become: true
50 |       raw: systemctl restart rshim
51 |
52 | # x86 -> BMC: stop rshim on the host, unmask/enable/start it on the BMC, then
53 | # start rshim on the host again; a 5 second pause separates the steps.
54 | - name: Rshim move from x86 to bmc
55 |   when: current_rshim_owner == 'x86'
56 |   block:
57 |     - name: Stop rshim on {{ x86_host }}
58 |       delegate_to: "{{ x86_host }}"
59 |       become: true
60 |       raw: systemctl stop rshim
61 |
62 |     - name: Wait 5 seconds
63 |       pause:
64 |         seconds: 5
65 |
66 |     - name: Start rshim on {{ bmc_host }}
67 |       delegate_to: "{{ bmc_host }}"
68 |       become: true
69 |       raw: systemctl unmask rshim; systemctl enable rshim; systemctl start rshim
70 |
71 |     - name: Wait 5 seconds
72 |       pause:
73 |         seconds: 5
74 |
75 |     - name: Start rshim on {{ x86_host }}
76 |       delegate_to: "{{ x86_host }}"
77 |       become: true
78 |       raw: systemctl start rshim
79 |
--------------------------------------------------------------------------------
/roles/manage_rshim_owner/tasks/main.yaml:
--------------------------------------------------------------------------------
1 | ###############################################################################
2 | #
3 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4 | # SPDX-License-Identifier: MIT 5 | # 6 | # Permission is hereby granted, free of charge, to any person obtaining a 7 | # copy of this software and associated documentation files (the "Software"), 8 | # to deal in the Software without restriction, including without limitation 9 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 | # and/or sell copies of the Software, and to permit persons to whom the 11 | # Software is furnished to do so, subject to the following conditions: 12 | # 13 | # The above copyright notice and this permission notice shall be included in 14 | # all copies or substantial portions of the Software. 15 | # 16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 22 | # DEALINGS IN THE SOFTWARE. 
23 | #
24 | ###############################################################################
25 | # Detect which side currently owns RSHIM and run change_owner.yaml.
26 | #
27 | # Variables read: bmc_host
28 | # Facts set:      current_rshim_owner ('bmc' or 'x86')
29 |
30 | # Lists /dev entries named rshim* on the BMC; the output drives the fact below.
31 | - name: get rshim owner on BMC
32 |   raw: 'find /dev -maxdepth 1 -name "rshim*"'
33 |   register: rshim_output_bmc
34 |   delegate_to: "{{ bmc_host }}"
35 |
36 | # bmc does not have python, ansible facts are not possible so we must "help"
37 | # Any output line from the find above means the BMC owns RSHIM, otherwise x86.
38 | - name: Verify rshim is active on BMC
39 |   set_fact:
40 |     current_rshim_owner: "{{ 'bmc' if rshim_output_bmc.stdout_lines|length > 0 else 'x86' }}"
41 |
42 | - name: change ownership
43 |   include_tasks: change_owner.yaml
44 |
--------------------------------------------------------------------------------
/roles/prepare_cuda_repo/README.md:
--------------------------------------------------------------------------------
1 | # Prepare CUDA local repository
2 |
3 | The `prepare_cuda_repo` role is used to download CUDA "DEB local" installer,
4 | unpack it on the "control plane" host like Foreman and set up corresponding data.
5 |
6 | This role is used in conjunction with the `install_cuda` role.
7 |
8 | ## Parameters
9 |
10 | * `cuda_release` -- release of CUDA like `11.5.1`, `11.6.2`
11 | * `cuda_arch` -- CPU architecture to install. Allowed values are:
12 |   * `amd64`
13 |   * `arm64`
14 | * `cuda_dist` -- target OS distribution; it is lower-cased and stripped of `-` and `.`
15 |   to build the repository package name (for example `ubuntu2004`)
16 |
17 |
--------------------------------------------------------------------------------
/roles/prepare_cuda_repo/tasks/check_vars.yml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2 | ---
3 | # Validate the role inputs (cuda_release, cuda_arch, cuda_dist), fetch the
4 | # md5sum.txt metadata of the requested CUDA release and derive from it the
5 | # name of the local repository package.
6 | #
7 | # Facts set: dist_ver, pkg_pattern, pkg_name (list expected to hold one name)
8 | # Registers: md5 (result of the metadata request, including its content)
9 | - name: cuda_release is defined?
10 |   fail:
11 |     msg: "Value of cuda_release isn't defined"
12 |   when:
13 |     - not cuda_release is defined
14 | - name: cuda_arch is defined?
15 |   fail:
16 |     msg: "Value of cuda_arch isn't defined"
17 |   when:
18 |     - not cuda_arch is defined
19 | - name: cuda_arch is correct?
20 |   fail:
21 |     msg: "Value of cuda_arch ({{ cuda_arch }}) isn't correct"
22 |   when:
23 |     - not (cuda_arch == "arm64" or cuda_arch == "amd64")
24 | - name: cuda_dist is defined?
25 |   fail:
26 |     msg: "Value of cuda_dist isn't defined"
27 |   when:
28 |     - not cuda_dist is defined
29 | - name: set facts
30 |   set_fact:
31 |     # ubuntu2004, ubuntu1804, ...
32 |     # We can't guess OS distro by `ansible_distribution` from the Foreman host
33 |     # it won't match OS distro running on DPU or x86 host in general
34 |     # dist_ver: "{{ ansible_distribution | lower }}{{ ansible_distribution_version | regex_replace('\\.', '') }}"
35 |     dist_ver: "{{ cuda_dist | lower | regex_replace('[-\\.]', '') }}"
36 |
37 | # The request itself never fails the play (failed_when: false); the status is
38 | # evaluated by the next task so that a readable message can be printed.
39 | - name: Get CUDA release metadata
40 |   uri:
41 |     url: "https://developer.download.nvidia.com/compute/cuda/{{ cuda_release }}/docs/sidebar/md5sum.txt"
42 |     return_content: true
43 |   failed_when: false
44 |   register: md5
45 |
46 | # Anything other than HTTP 200 means the metadata is unusable. Checking only
47 | # for 404 would let other failures (connection errors, 403, 5xx) through and
48 | # the play would break later on the missing md5.content instead.
49 | - name: Check metadata status
50 |   fail:
51 |     msg: "No metadata found for CUDA release {{ cuda_release }} (HTTP status {{ md5.status | default('unknown') }})"
52 |   when:
53 |     - (md5.status | default(-1)) != 200
54 |
55 | # cuda-repo-ubuntu2004-11-5-local_11.5.2-495.29.05-1_amd64.deb
56 | - name: get repo package pattern
57 |   set_fact:
58 |     pkg_pattern: 'cuda-repo-{{ dist_ver }}-.*_{{ cuda_arch }}.deb'
59 | # Exactly one line of the metadata must match the pattern, otherwise fail.
60 | - name: get repo package
61 |   set_fact:
62 |     pkg_name: "{{ md5.content.splitlines() | map('regex_search', pkg_pattern) |select('string') |list }}"
63 |   failed_when: pkg_name |count != 1
64 |
--------------------------------------------------------------------------------
/roles/prepare_cuda_repo/tasks/get_installer.yml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2 | ---
3 | # Download the CUDA local installer package named in pkg_name[0] into
4 | # /var/www/cuda and unpack it with dpkg into a per-release/per-arch directory.
5 | #
6 | # Variables read: pkg_name, cuda_release, cuda_arch, md5
7 | # Facts set:      dst_dir, pkg_fn, pkg_nn, md5sum_fn
8 | # Registers:      pkg, var_st, unpack
9 | - name: get repo local fn
10 |   set_fact:
11 |     dst_dir: "/var/www/cuda/{{ cuda_release }}-{{ cuda_arch }}"
12 |     pkg_fn: "/var/www/cuda/{{ pkg_name[0] }}"
13 |     pkg_nn: "{{ pkg_name[0] | split('_') | first }}"
14 | # Kept as a separate task because it is built from the dst_dir fact set above.
15 | - name: set md5sum_fn fact
16 |   set_fact:
17 |     md5sum_fn: "{{ dst_dir }}/md5sum.txt"
18 |
19 | # - name: check local copy of md5sum.txt
20 | #   stat:
21 | #     path: "{{ md5sum_fn }}"
22 | #   register: md5_st
23 |
24 | - name: prepare local
25 |   block:
26 |     # prepare destination folders
27 |     - name: mkdir
28 |       loop:
29 |         - /var/www/cuda
30 |         - "{{ dst_dir }}"
31 |       file:
32 |         state: directory
33 |         path: "{{ item }}"
34 |         owner: root
35 |         group: root
36 |         mode: '0755'
37 |     # store the metadata fetched earlier next to the unpacked repository
38 |     - name: copy store md5sum.txt
39 |       copy:
40 |         dest: "{{ md5sum_fn }}"
41 |         content: "{{ md5.content }}"
42 |         owner: root
43 |         group: root
44 |         mode: '0644'
45 |   # when: not md5_st.stat.exists
46 |
47 | - name: download CUDA local installer
48 |   register: pkg
49 |   get_url:
50 |     dest: "{{ pkg_fn }}"
51 |     url: "https://developer.download.nvidia.com/compute/cuda/{{ cuda_release }}/local_installers/{{ pkg_name[0] }}"
52 |   # when: not (pkg_st.stat.islnk is defined)
53 |
54 | - name: check unpacked dir
55 |   register: var_st
56 |   stat:
57 |     path: "{{ dst_dir }}/var/{{ pkg_nn }}"
58 | # Unpack after a fresh download or when the unpacked directory is missing;
59 | # "creates" additionally skips the command when that directory already exists.
60 | - name: Unpack local installer
61 |   when: (pkg is changed) or (not var_st.stat.exists)
62 |   register: unpack
63 |   args:
64 |     chdir: "{{ dst_dir }}"
65 |     creates: "{{ dst_dir }}/var/{{ pkg_nn }}"
66 |     executable: /bin/bash
67 |   shell: |
68 |     dpkg --unpack --force-architecture --instdir={{ dst_dir }} {{ pkg_fn }}
69 |
70 | # Print the registered result of the unpack task.
71 | - name: debug
72 |   debug:
73 |     msg: "{{ unpack }}"
74 |
--------------------------------------------------------------------------------
/roles/prepare_cuda_repo/tasks/main.yml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2 | ---
3 | # Role entry point: run check_vars.yml first, then get_installer.yml.
4 | - name: Check vars
5 |   include_tasks:
6 |     file: check_vars.yml
7 |
8 | - name: Get local installer
9 |   include_tasks:
10 |     file: get_installer.yml
11 |
--------------------------------------------------------------------------------