├── .github
    ├── PULL_REQUEST_TEMPLATE.md
    └── workflows
    │   ├── build-vm-images.yml
    │   ├── openstack.yml
    │   └── usernames.yml
├── README.md
├── TOS.md
├── access
    └── conda-forge-users.json
├── architecture.png
├── docs
    ├── README.md
    ├── cirun.md
    ├── images.md
    ├── network.md
    └── openstack.md
└── vm-images
    ├── .cirun.yml
    ├── README.md
    ├── build-image.sh
    ├── cpu-image.yaml
    ├── elements
        ├── cuda
        │   ├── element-deps
        │   ├── package-installs.yaml
        │   └── post-install.d
        │   │   └── 05-cuda-install
        └── misc
        │   ├── element-deps
        │   ├── package-installs.yaml
        │   └── post-install.d
        │       └── 01-install-misc
    ├── gpu-image.yaml
    └── requirements.txt


/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | To obtain access to the CI server, you must complete the form below:
2 | 
3 | - [ ] I have read the [Terms of Service](https://github.com/Quansight/open-gpu-server/blob/main/TOS.md) and [Privacy Policy](https://quansight.com/privacy-policy/) and accept them.
4 | - [ ] I have included my GitHub username and unique identifier to the relevant `access/*.json` file.
5 | 
6 | <!-- You can obtain your Github identifier via https://api.github.com/users/__username__ -->
7 | 


--------------------------------------------------------------------------------
/.github/workflows/build-vm-images.yml:
--------------------------------------------------------------------------------
 1 | name: Build VM Images
 2 | 
 3 | on: workflow_dispatch
 4 | 
 5 | env:
 6 |   OUTPUT_IMAGE_PATH: /tmp/ubuntu-nvidia
 7 |   ELEMENTS_PATH: /home/runnerx/actions-runner/_work/nvidia-openstack-image/nvidia-openstack-image/elements/
 8 |   #  Workaround for upload-cloud-storage to work
 9 |   OPENSSL_CONF: /dev/null
10 | 
11 | 
12 | jobs:
13 |   build:
14 |     runs-on: "cirun-runner--${{ github.run_id }}"
15 | 
16 |     steps:
17 |       - uses: actions/checkout@v3
18 |       - id: 'auth'
19 |         name: 'Authenticate to Google Cloud'
20 |         uses: 'google-github-actions/auth@v0'
21 |         with:
22 |           credentials_json: '${{ secrets.GOOGLE_CREDENTIALS }}'
23 | 
24 |       - name: Setup Node.js environment
25 |         uses: actions/setup-node@v3.5.1
26 |         with:
27 |           node-version: 16
28 | 
29 |       - name: Check NVIDIA SMI
30 |         run: nvidia-smi || true
31 | 
32 |       - name: Setup Miniconda
33 |         uses: conda-incubator/setup-miniconda@v2.2.0
34 |         with:
35 |           python-version: "3.10"
36 |           miniconda-version: "latest"
37 |         env:
38 |           CONDA: "/home/runnerx/miniconda3"
39 |       - name: Set output image path
40 |         run: |
41 |           cat .env >> $GITHUB_ENV
42 | 
43 |       - name: Create output image name
44 |         id: output-image
45 |         run: |
46 |           timestamp=$(date +%s)
47 |           output_image_name=nvidia-openstack-image-$timestamp
48 |           echo "OUTPUT_IMAGE=/tmp/output-image/$output_image_name" >> $GITHUB_ENV
49 |           echo "OUTPUT_IMAGE_NAME=$output_image_name" >> $GITHUB_OUTPUT
50 | 
51 |       - name: Add hostname to /etc/hosts
52 |         run: echo $(hostname -I | cut -d\  -f1) $(hostname) | sudo tee -a /etc/hosts
53 | 
54 |       - name: Install dependencies
55 |         shell: bash -el {0}
56 |         run: |
57 |           sudo apt update
58 |           sudo apt upgrade -y
59 |           sudo apt install qemu-utils python3-pip -y
60 |           pip install -r requirements.txt
61 | 
62 |       - name: Run disk image builder
63 |         shell: bash -el {0}
64 |         run: |
65 |           echo "OUTPUT_IMAGE: $OUTPUT_IMAGE_NAME"
66 |           bash vm-images/build-image.sh
67 | 
68 |       - id: 'upload-file'
69 |         name: Upload built disk image to GCS
70 |         uses: 'google-github-actions/upload-cloud-storage@v1'
71 |         with:
72 |           path: '/tmp/output-image/${{ steps.output-image.outputs.OUTPUT_IMAGE_NAME}}.qcow2'
73 |           destination: 'cirun/images/'
74 | 


--------------------------------------------------------------------------------
/.github/workflows/openstack.yml:
--------------------------------------------------------------------------------
 1 | name: Upload Image to OpenStack
 2 | 
 3 | on:
 4 |   workflow_dispatch:
 5 |     inputs:
 6 |       image_name:
 7 |         description: 'Image name'
 8 |         type: string
 9 |         required: true
10 | 
11 |       input_image_name:
12 |         description: 'Input Image Name in GCS'
13 |         type: string
14 |         required: true
15 | 
16 | env:
17 |   OS_AUTH_URL: https://ci.quansight.dev/in-api/identity
18 |   OS_IDENTITY_API_VERSION: 3
19 |   OS_PASSWORD: ${{ secrets.OS_PASSWORD }}
20 |   OS_PROJECT_DOMAIN_NAME: Default
21 |   OS_PROJECT_NAME: ${{ secrets.OS_USERNAME }}
22 |   OS_REGION_NAME: RegionOne
23 |   OS_TENANT_NAME: ${{ secrets.OS_USERNAME }}
24 |   OS_USERNAME: ${{ secrets.OS_USERNAME }}
25 |   OS_USER_DOMAIN_NAME: Default
26 |   INPUT_IMAGE: /tmp/input-image
27 | 
28 | jobs:
29 |   build:
30 |     runs-on: ubuntu-latest
31 |     defaults:
32 |       run:
33 |         shell: bash -el {0}
34 |     steps:
35 |       - uses: actions/checkout@v3
36 | 
37 |       - name: Setup Miniconda
38 |         uses: conda-incubator/setup-miniconda@v2.2.0
39 |         with:
40 |           python-version: "3.10"
41 |           miniconda-version: "latest"
42 | 
43 |       - name: Install openstack client
44 |         run: |
45 |           pip install python-openstackclient==6.0.0
46 | 
47 |       - name: Check Openstack client
48 |         run: |
49 |           openstack help
50 | 
51 |       - name: Save GCP Creds
52 |         run: |
53 |           echo '${{ secrets.GOOGLE_CREDENTIALS }}' > /tmp/gcp.json
54 | 
55 |       - uses: actions/checkout@v3
56 |       - id: 'auth'
57 |         name: 'Authenticate to Google Cloud'
58 |         uses: 'google-github-actions/auth@v0'
59 |         with:
60 |           credentials_json: ${{ secrets.GOOGLE_CREDENTIALS }}
61 | 
62 |       - name: Download image from GCS
63 |         run: |
64 |           gcloud auth activate-service-account --key-file /tmp/gcp.json --project aktech-labs
65 |           mkdir $INPUT_IMAGE
66 |           gsutil cp gs://cirun/images/${{ inputs.input_image_name }} $INPUT_IMAGE/
67 |           ls $INPUT_IMAGE
68 | 
69 |       - name: Upload image
70 |         run: |
71 |           openstack image create ${{ inputs.image_name }} \
72 |             --public --disk-format qcow2 \
73 |             --container-format bare \
74 |             --file $INPUT_IMAGE/${{ inputs.input_image_name }}
75 | 


--------------------------------------------------------------------------------
/.github/workflows/usernames.yml:
--------------------------------------------------------------------------------
 1 | name: Access control
 2 | 
 3 | on:
 4 |   push: 
 5 |     branches: [main]
 6 |   pull_request:
 7 |     branches: [main]
 8 | 
 9 | jobs:
10 |   build:
11 |     name: Validate usernames
12 |     runs-on: ubuntu-latest
13 |     steps:
14 |       - uses: actions/checkout@v4
15 |       - uses: actions/setup-python@v4
16 |         with:
17 |           python-version: 3.x
18 |       - name: Install requests
19 |         run: pip install requests
20 |       - name: Validate
21 |         shell: python
22 |         run: |
23 |           import json
24 |           import requests
25 |           import sys
26 |           from pathlib import Path
27 | 
28 |           def check_login_id(login, ident):
29 |             r = requests.get(f"https://api.github.com/users/{login}", headers={
30 |               "Accept": "application/vnd.github.v3+json",
31 |               "Authorization": f"token ${{ secrets.GITHUB_TOKEN }}",
32 |             })
33 |             r.raise_for_status()
34 |             data = r.json()
35 |             if data["id"] != ident:
36 |               raise ValueError(
37 |                 f"Supplied identified {ident} for user {login} "
38 |                 f"doesn't match Github API: {data['id']}"
39 |               )
40 |           exceptions = []
41 |           for path in Path("access").glob("*.json"):
42 |             print("Processing", path)
43 |             access_data = json.loads(path.read_text())
44 |             for user in access_data["users"]:
45 |               login = user.get("github")
46 |               if not login:
47 |                 raise ValueError(f"Entry {user} is missing `github` key.")
48 |               ident = user.get("id")
49 |               if not ident:
50 |                 raise ValueError(f"Entry {user} is missing `id` key.")
51 |               try:
52 |                 check_login_id(login, ident)
53 |               except ValueError as exc:
54 |                 print("!!!", exc.__class__.__name__, "->", exc)
55 |                 exceptions.append(exc)
56 |           if exceptions:
57 |             sys.exit(1)
58 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # open-gpu-server
 2 | 
 3 | This repository provides information about the OpenStack instance Quansight and MetroStar are providing to conda-forge and other communities.
 4 | 
 5 | ## Access
 6 | 
 7 | The main intent of this service is to provide GPU CI to those conda-forge feedstocks that require it. To do so:
 8 | 
 9 | - **Feedstocks** must have access to the service. Refer to the [`conda-forge/admin-requests`](https://github.com/conda-forge/admin-requests) README.
10 | - **Maintainers** you must read and agree to the [Terms of Service](./TOS.md). Your username must be listed in [`access/conda-forge-users.json`](./access/conda-forge-users.json). Please open a PR to add yourself to the list.
11 | 
12 | ## Incidents
13 | 
14 | If you suspect the server is not operating as expected, please check:
15 | 
16 | - [Status page](https://ci-status.quansight.dev/)
17 | - [Ongoing incidents](https://github.com/Quansight/open-gpu-server/issues?q=is%3Aopen+is%3Aissue+label%3Aincident%3Adegraded-performance%2Cincident%3Ainvestigating%2Cincident%3Amajor-outage+sort%3Aupdated-desc)
18 | 
19 | If you think there should be an open incident report but there's none, please open a new issue and tag [@Quansight/open-gpu-server](https://github.com/orgs/Quansight/teams/open-gpu-server) so the team can take a look. Thanks!
20 | 
21 | ## Base configuration
22 | 
23 | - Model: AMD EPYC 7352 24-Core Processor
24 | - Architecture: `x86_64`, 32-bit, 64-bit
25 | - 48 Cores
26 | - ~500 GB Memory
27 | - 6x NVIDIA Tesla V100
28 | 
29 | ## Available runners
30 | 
31 | The server can spin up VMs with the following configurations:
32 | 
33 | ### GPU runners
34 | 
35 | | Name          | vCPUs | RAM  | Disk  | GPUs                 |
36 | | ------------  | ----- | ---- | ----- | -------------------- |
37 | | `gpu_tiny`    | 4     | 2GB  | 20GB  | 1x NVIDIA Tesla V100 |
38 | | `gpu_medium`  | 4     | 8GB  | 50GB  | 1x NVIDIA Tesla V100 |
39 | | `gpu_large`   | 4     | 12GB | 60GB  | 1x NVIDIA Tesla V100 |
40 | | `gpu_xlarge`  | 8     | 16GB | 60GB  | 1x NVIDIA Tesla V100 |
41 | | `gpu_2xlarge` | 8     | 32GB | 80GB  | 1x NVIDIA Tesla V100 |
42 | | `gpu_4xlarge` | 8     | 64GB | 100GB | 1x NVIDIA Tesla V100 |
43 | 
44 | These runners use the `ubuntu-2204-nvidia-20230914` image.
45 | 
46 | ### CPU runners
47 | 
48 | | Name         | vCPUs | RAM  | Disk   |
49 | | ------------ | ----- | ---- | ------ |
50 | | `ci_medium`  | 4     | 8GB  | 60GB   |
51 | | `ci_large`   | 4     | 12GB | 60GB   |
52 | | `ci_xlarge`  | 4     | 32GB | 60GB   |
53 | | `ci_2xlarge` | 8     | 32GB | 80GB   |
54 | | `ci_4xlarge` | 8     | 64GB | 100GB  |
55 | 
56 | These runners use the `ubuntu-2204-20231018` image.
57 | 
58 | ### Software
59 | 
60 | These runners run ISOs derived from Ubuntu 22.04. Images are built with the instructions provided in the [`images`](./images) folder.
61 | 
62 | ## Limitations
63 | 
64 | * Concurrency depends on available resources. Only 4 GPUs can be exposed to the VMs at a time; expect queues. This is not per repository, but a server-wide limitation. See [docs/openstack.md](/docs/openstack.md) for details.
65 | * We have not yet implemented a time limit per job. Please be mindful of this and try to keep your jobs as short as possible. This might change in the future.
66 | 
67 | ## Support
68 | 
69 | This service is provided as is, with no guarantees of uptime or support. If you have any questions, please open an issue in this repository and we'll try our best.
70 | 


--------------------------------------------------------------------------------
/TOS.md:
--------------------------------------------------------------------------------
 1 | # Service Agreement
 2 | 
 3 | Last Modified: 2023-11-07
 4 | 
 5 | This Service Agreement (this "Agreement") is a binding contract between you ("User," "you," or "your") and Quansight, LLC ("Provider," "we," or "us"). This Agreement governs your access to and use of the Services.
 6 | 
 7 | THIS AGREEMENT TAKES EFFECT WHEN YOU BEGIN ACCESSING OR USING THE SERVICES (the "Effective Date"). BY ACCESSING OR USING THE SERVICES YOU (A) ACKNOWLEDGE THAT YOU HAVE READ AND UNDERSTAND THIS AGREEMENT; (B) REPRESENT AND WARRANT THAT YOU HAVE THE RIGHT, POWER, AND AUTHORITY TO ENTER INTO THIS AGREEMENT AND, IF ENTERING INTO THIS AGREEMENT FOR AN ORGANIZATION, THAT YOU HAVE THE LEGAL AUTHORITY TO BIND THAT ORGANIZATION; AND (C) ACCEPT THIS AGREEMENT AND AGREE THAT YOU ARE LEGALLY BOUND BY ITS TERMS.
 8 | 
 9 | IF YOU DO NOT AGREE TO THESE TERMS, PLEASE DO NOT PROCEED TO ACCESS THE SERVICES. IF YOU DO NOT ACCEPT THESE TERMS, YOU MAY NOT ACCESS OR USE THE SERVICES. 
10 | 
11 | 1. Definitions. 
12 |     1. "Authorized User" means User and, if applicable, User's employees, consultants, contractors, members, and agents (i) who are authorized by User to access and use the Services under the rights granted to User pursuant to this Agreement.
13 |     2. "Services" means the services provided by Provider under this Agreement that are detailed on Provider's website available at https://github.com/Quansight/open-gpu-server.
14 |     3. "User Data" means, other than Aggregated Statistics, information, data, and other content, in any form or medium, that is submitted, posted, or otherwise transmitted by or on behalf of User or any other Authorized User through the Services.
15 |     4. "Provider IP" means the Services, any documentation, and all intellectual property provided to User or any other Authorized User in connection with the foregoing. For the avoidance of doubt, Provider IP includes Aggregated Statistics, but does not include User Data.
16 |     5. "Third-Party Products" means any products, content, services, information, websites, or other materials that are owned by third parties and are incorporated into or accessible through the Services.
17 | 2. Access and Use.
18 |     1. Provision of Access. Subject to the terms and conditions of this Agreement, Provider hereby grants you a revocable, non-exclusive, non-transferable, non-sublicensable, limited right to access and use the Services during the Term solely for use by Authorized Users in accordance with the terms and conditions herein. 
19 |     2. Use Restrictions. You shall not, and shall not permit any Authorized Users to, use the Services, or any software component of the Services, for any purposes beyond the scope of the access granted in this Agreement. You shall not at any time, directly or indirectly, and shall not permit any Authorized Users to: (i)modify the Services, any software component of the Services, or Documentation, in whole or in part; (i) rent, lease, lend, sell, license, sublicense, assign, distribute, publish, transfer, or otherwise make available the Services or Documentation except as expressly permitted under this Agreement; (ii) reverse engineer, disassemble, decompile, decode, adapt, or otherwise attempt to derive or gain access to any software component of the Services, in whole or in part; () remove any proprietary notices from the Services or its documentation; or (v) use the Services or Documentation in any manner or for any purpose that infringes, misappropriates, or otherwise violates any intellectual property right or other right of any person, or that violates any applicable law, regulation, or rule.
20 |     3. Aggregated Statistics. Notwithstanding anything to the contrary in this Agreement, Provider may monitor User's use of the Services and collect and compile data and information related to User's use of the Services to be used by Provider in an aggregated and anonymized manner, including to compile statistical and performance information related to the provision and operation of the Services ("Aggregated Statistics"). As between Provider and User, all right, title, and interest in Aggregated Statistics, and all intellectual property rights therein, belong to and are retained solely by Provider. You acknowledge that Provider may compile Aggregated Statistics based on User Data input into the Services. You agree that Provider may (i) make Aggregated Statistics publicly available in compliance with applicable law, and (ii) use Aggregated Statistics to the extent and in the manner permitted under applicable law. 
21 |     4. Reservation of Rights. Provider reserves all rights not expressly granted to User in this Agreement. Except for the limited rights and licenses expressly granted under this Agreement, nothing in this Agreement grants, by implication, waiver, estoppel, or otherwise, to User or any third party, any intellectual property rights or other right, title, or interest in or to the Provider IP.
22 |     5. Suspension. Notwithstanding anything to the contrary in this Agreement, Provider may temporarily or indefinitely suspend User's and any other Authorized User's access to any portion or all of the Services for any reason or no reason, including but not limited to: (i) Provider reasonably determines that (A) there is a threat or attack on any of the Provider IP; (B) User's or any other Authorized User's use of the Provider IP disrupts or poses a security risk to the Provider IP or to any other User or vendor of Provider; (C) User or any other Authorized User is using the Provider IP for fraudulent or illegal activities; (D) subject to applicable law, User has ceased to continue its business in the ordinary course, made an assignment for the benefit of creditors or similar disposition of its assets, or become the subject of any bankruptcy, reorganization, liquidation, dissolution, or similar proceeding; or (E) Provider's provision of the Services to User or any other Authorized User is prohibited by applicable law; or (ii) any vendor of Provider has suspended or terminated Provider's access to or use of any third-party services or products required to enable User to access the Services, (a "Service Suspension"). Provider will have no liability for any damage, liabilities, losses (including any loss of or profits), or any other consequences that User or any other Authorized User may incur as a result of a Service Suspension.
23 | 3. User Responsibilities.
24 |     1. Acceptable Use Policy. The Services may not be used for unlawful, fraudulent, offensive, or obscene activity, as further described and set forth in Provider's acceptable use policy ("AUP") located at https://github.com/Quansight/open-gpu-server, as may be amended from time to time, which is incorporated herein by reference. You will comply with all terms and conditions of this Agreement, all applicable laws, rules, and regulations, and all guidelines, standards, and requirements that may be posted on https://github.com/Quansight/open-gpu-server from time to time, including the AUP.
25 |     2. Account Use. You are responsible and liable for all uses of the Services and Documentation resulting from access provided by you, directly or indirectly, whether such access or use is permitted by or in violation of this Agreement. Without limiting the generality of the foregoing, you are responsible for all acts and omissions of Authorized Users, and any act or omission by an Authorized User that would constitute a breach of this Agreement if taken by you will be deemed a breach of this Agreement by you. You shall use reasonable efforts to make all Authorized Users aware of this Agreement's provisions as applicable to such Authorized User's use of the Services and shall cause Authorized Users to comply with such provisions. 
26 |     3. User Data. You hereby grant to Provider a non-exclusive, royalty-free, worldwide license to reproduce, distribute, and otherwise use and display the User Data and perform all acts with respect to the User Data as may be necessary for Provider to provide the Services to you, and a non-exclusive, perpetual, irrevocable, royalty-free, worldwide license to reproduce, distribute, modify, and otherwise use and display User Data incorporated within the Aggregated Statistics. You will ensure that User Data and any Authorized User's use of User Data will not violate any policy or terms referenced in or incorporated into this Agreement or any applicable law. You are solely responsible for the development, content, operation, maintenance, and use of User Data.
27 |     4. Third-Party Products. The Services may permit access to Third-Party Products. For purposes of this Agreement, such Third-Party Products are subject to their own terms and conditions presented to you for acceptance within the Services by website link or otherwise. If you do not agree to abide by the applicable terms for any such Third-Party Products, then you should not install, access, or use such Third-Party Products.
28 | 4. No Support.  This Agreement does not entitle User to any support, maintenance, upgrades, or modifications for or to the Services.
29 | 5. No Fees. The Parties agree that no fees will be payable under this Agreement in exchange for access to the Services granted under this Agreement. User acknowledges and agrees that this fee arrangement is made in consideration of the mutual covenants set forth in this Agreement, including, without limitation, the disclaimers, exclusions, and limitations of liability set forth herein.
30 | 6. Privacy Policy. Provider complies with its privacy policy, available at https://quansight.com/privacy-policy/ ("Privacy Policy"), in providing the Services. The Privacy Policy is subject to change as described therein. By accessing, using, and providing information to or through the Services, you acknowledge that you have reviewed and accepted our Privacy Policy, and you consent to all actions taken by us with respect to your information in compliance with the then-current version of our Privacy Policy.
31 | 7. Intellectual Property Ownership; Feedback. As between you and us, (a) we own all right, title, and interest, including all intellectual property rights, in and to the Services and (b) you own all right, title, and interest, including all intellectual property rights, in and to User Data. If you or any of your employees, contractors, or agents sends or transmits any communications or materials to us by mail, email, telephone, or otherwise, suggesting or recommending changes to the Services, including without limitation, new features or functionality relating thereto, or any comments, questions, suggestions, or the like ("Feedback"), we are free to use such Feedback irrespective of any other obligation or limitation between you and us governing such Feedback. All Feedback is and will be treated as non-confidential. You hereby assign to us on your behalf, and shall cause your employees, contractors, and agents to assign, all right, title, and interest in, and we are free to use, without any attribution or compensation to you or any third party, any ideas, know-how, concepts, techniques, or other intellectual property rights contained in the Feedback, for any purpose whatsoever, although we are not required to use any Feedback.
32 | 8. Limited Warranty and Warranty Disclaimer. 
33 |    1. User Warranty. You warrant that you own all right, title, and interest, including all intellectual property rights, in and to User Data and that both the User Data and your use of the Services are in compliance with the AUP.
34 |    2. No Provider Warranty. THE SERVICES ARE PROVIDED "AS IS" AND PROVIDER SPECIFICALLY DISCLAIMS ALL WARRANTIES, WHETHER EXPRESS, IMPLIED, STATUTORY, OR OTHERWISE. PROVIDER SPECIFICALLY DISCLAIMS ALL IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE, AND NON-INFRINGEMENT, AND ALL WARRANTIES ARISING FROM COURSE OF DEALING, USAGE, OR TRADE PRACTICE. PROVIDER MAKES NO WARRANTY OF ANY KIND THAT THE SERVICES, OR ANY PRODUCTS OR RESULTS OF THE USE THEREOF, WILL MEET YOUR OR ANY OTHER PERSON'S OR ENTITY'S REQUIREMENTS, OPERATE WITHOUT INTERRUPTION, ACHIEVE ANY INTENDED RESULT, BE COMPATIBLE OR WORK WITH ANY OF YOUR OR ANY THIRD PARTY'S SOFTWARE, SYSTEM, OR OTHER SERVICES, OR BE SECURE, ACCURATE, COMPLETE, FREE OF HARMFUL CODE, OR ERROR-FREE, OR THAT ANY ERRORS OR DEFECTS CAN OR WILL BE CORRECTED. 
35 | 9. Indemnification. User shall indemnify, hold harmless, and, at Provider's option, defend Provider and its officers, directors, employees, agents, affiliates, successors, and assigns from and against any and all Losses arising from or relating to any Third-Party Claim (i) that the User Data, or any use of the User Data in accordance with this Agreement, infringes or misappropriates such third party's intellectual property rights; or (ii) based on User's or any Authorized User's negligence or willful misconduct or use of the Services in a manner not authorized by this Agreement; provided that User may not settle any Third-Party Claim against Provider unless Provider consents to such settlement, and further provided that Provider will have the right, at its option, to defend itself against any such Third-Party Claim or to participate in the defense thereof by counsel of its own choice.
36 | 10. Limitations of Liability. IN NO EVENT WILL PROVIDER BE LIABLE UNDER OR IN CONNECTION WITH THIS AGREEMENT UNDER ANY LEGAL OR EQUITABLE THEORY, INCLUDING BREACH OF CONTRACT, TORT (INCLUDING NEGLIGENCE), STRICT LIABILITY, OR OTHERWISE, FOR ANY: (a) CONSEQUENTIAL, INCIDENTAL, INDIRECT, EXEMPLARY, SPECIAL, ENHANCED, OR PUNITIVE DAMAGES; (b) INCREASED COSTS, DIMINUTION IN VALUE OR LOST BUSINESS, PRODUCTION, REVENUES, OR PROFITS; (c) LOSS OF GOODWILL OR REPUTATION; (d) USE, INABILITY TO USE, LOSS, INTERRUPTION, DELAY OR RECOVERY OF ANY DATA, OR BREACH OF DATA OR SYSTEM SECURITY; OR (e) COST OF REPLACEMENT GOODS OR SERVICES, IN EACH CASE REGARDLESS OF WHETHER PROVIDER WAS ADVISED OF THE POSSIBILITY OF SUCH LOSSES OR DAMAGES OR SUCH LOSSES OR DAMAGES WERE OTHERWISE FORESEEABLE. IN NO EVENT WILL PROVIDER'S AGGREGATE LIABILITY ARISING OUT OF OR RELATED TO THIS AGREEMENT UNDER ANY LEGAL OR EQUITABLE THEORY, INCLUDING BREACH OF CONTRACT, TORT (INCLUDING NEGLIGENCE), STRICT LIABILITY, OR OTHERWISE EXCEED THE TOTAL AMOUNTS PAID TO PROVIDER UNDER THIS AGREEMENT IN THE 1 YEAR PERIOD PRECEDING THE EVENT GIVING RISE TO THE CLAIM OR $50, WHICHEVER IS LESS.
37 | 11. Term and Termination.
38 |     1. Term. The term of this Agreement begins on the Effective Date and continues until terminated. 
39 |     2. Termination. In addition to any other express termination right set forth in this Agreement:
40 |         1. Provider may terminate this Agreement for any reason or no reason with or without notice. 
41 |         2. Either party may terminate this Agreement, effective on written notice to the other party, if the other party materially breaches this Agreement, and such breach: (A) is incapable of cure; or (B) being capable of cure, remains uncured five (5) days after the non-breaching party provides the breaching party with written notice of such breach.
42 |     3. Effect of Termination. Upon termination of this Agreement, User shall immediately discontinue use of the Provider IP.
43 |     4. Survival. This Section 12(d), Sections 5, 6, 10, 11, 14, 15, 16, and 17, and any right, obligation, or required performance of the parties in this Agreement which, by its express terms or nature and context is intended to survive termination or expiration of this Agreement, will survive any such termination or expiration.
44 | 12. Modifications. You acknowledge and agree that we have the right, in our sole discretion, to modify this Agreement from time to time, and that modified terms become effective on posting. You will be notified of modifications through posts on https://github.com/Quansight/open-gpu-server. You are responsible for reviewing and becoming familiar with any such modifications. Your continued use of the Services after the effective date of the modifications will be deemed acceptance of the modified terms. 
45 | 13. Export Regulation. The Services utilize software and technology that may be subject to US export control laws, including the US Export Administration Act and its associated regulations. You shall not, directly or indirectly, export, re-export, or release the Services or the software or technology included in the Services to, or make the Services or the software or technology included in the Services accessible from, any jurisdiction or country to which export, re-export, or release is prohibited by law, regulation, or rule. You shall comply with all applicable federal laws, regulations, and rules, and complete all required undertakings (including obtaining any necessary export license or other governmental approval), prior to exporting, re-exporting, releasing, or otherwise making the Services or the software or technology included in the Services available outside the US.
46 | 14. Governing Law and Jurisdiction. This agreement is governed by and construed in accordance with the internal laws of the State of Texas without giving effect to any choice or conflict of law provision or rule that would require or permit the application of the laws of any jurisdiction other than those of the State of Texas. Any legal suit, action, or proceeding arising out of or related to this agreement or the rights granted hereunder will be instituted in the federal courts of the United States or the courts of the State of Texas in each case located in the city of Austin and County of Travis, and each party irrevocably submits to the jurisdiction of such courts in any such suit, action, or proceeding. 
47 | 15. Miscellaneous. This Agreement constitutes the entire agreement and understanding between the parties hereto with respect to the subject matter hereof and supersedes all prior and contemporaneous understandings, agreements, representations, and warranties, both written and oral, with respect to such subject matter. Any notices to us must be sent to our corporate headquarters address available at https://quansight.com and must be delivered either in person, by certified or registered mail, return receipt requested and postage prepaid, or by recognized overnight courier service, and are deemed given upon receipt by us. Notwithstanding the foregoing, you hereby consent to receiving electronic communications from us. These electronic communications may include notices about transactional information and other information concerning or related to the Services. You agree that any notices, agreements, disclosures, or other communications that we send to you electronically will satisfy any legal communication requirements, including that such communications be in writing. The invalidity, illegality, or unenforceability of any provision herein does not affect any other provision herein or the validity, legality, or enforceability of such provision in any other jurisdiction. Any failure to act by us with respect to a breach of this Agreement by you or others does not constitute a waiver and will not limit our rights with respect to such breach or any subsequent breaches. This Agreement is personal to you and may not be assigned or transferred by you for any reason whatsoever without our prior written consent and any action or conduct in violation of the foregoing will be void and without effect. We expressly reserve the right to assign this Agreement and to delegate any of its obligations hereunder.
48 | 
49 | ## Acceptable Use Policy
50 | 
51 | These content standards apply to any and all User Contributed Data and use of the Services. User Contributed Data must in its entirety comply with all applicable federal, state, local, and international laws and regulations. Without limiting the foregoing, User Contributed Data must not:
52 | 
53 | - Contain any material that is defamatory, obscene, indecent, abusive, offensive, harassing, violent, hateful, inflammatory, or otherwise objectionable.
54 | - Promote sexually explicit or pornographic material, violence, or discrimination based on race, sex, religion, nationality, disability, sexual orientation, or age.
55 | - Infringe any patent, trademark, trade secret, copyright, or other intellectual property or other rights of any other person.
56 | - Violate the legal rights (including the rights of publicity and privacy) of others or contain any material that could give rise to any civil or criminal liability under applicable laws or regulations or that otherwise may be in conflict with these Terms of Use and our Privacy Policy (https://quansight.com/privacy-policy/).
57 | - Be likely to deceive any person.
58 | - Promote any illegal activity, or advocate, promote, or assist any unlawful act.
59 | - Cause annoyance, inconvenience, or needless anxiety or be likely to upset, embarrass, alarm, or annoy any other person.
60 | - Impersonate any person, or misrepresent Your identity or affiliation with any person or organization.
61 | - Involve commercial activities or sales, such as contests, sweepstakes, and other sales promotions, barter, or advertising.
62 | - Give the impression that they emanate from or are endorsed by Us or any other person or entity, if this is not the case.
63 | 


--------------------------------------------------------------------------------
/access/conda-forge-users.json:
--------------------------------------------------------------------------------
  1 | {
  2 |   "users": [
  3 |     {
  4 |       "github": "jaimergp",
  5 |       "id": 2559438
  6 |     },
  7 |     {
  8 |       "github": "aktech",
  9 |       "id": 5647941
 10 |     },
 11 |     {
 12 |       "github": "isuruf",
 13 |       "id": 5234427
 14 |     },
 15 |     {
 16 |       "github": "carterbox",
 17 |       "id": 9604511
 18 |     },
 19 |     {
 20 |       "github": "h-vetinari",
 21 |       "id": 33685575
 22 |     },
 23 |     {
 24 |       "github": "hmaarrfk",
 25 |       "id": 90008
 26 |     },
 27 |     {
 28 |       "github": "jeongseok-meta",
 29 |       "id": 142548112
 30 |     },
 31 |     {
 32 |       "github": "minrk",
 33 |       "id": 151929
 34 |     },
 35 |     {
 36 |       "github": "RaulPPelaez",
 37 |       "id": 13015792
 38 |     },
 39 |     {
 40 |       "github": "njzjz",
 41 |       "id": 9496702
 42 |     },
 43 |     {
 44 |       "github": "weiji14",
 45 |       "id": 23487320
 46 |     },
 47 |     {
 48 |       "github": "xhochy",
 49 |       "id": 70274
 50 |     },
 51 |     {
 52 |       "github": "ngam",
 53 |       "id": 67342040
 54 |     },
 55 |     {
 56 |       "github": "iamthebot",
 57 |       "id": 8432571
 58 |     },
 59 |     {
 60 |       "github": "seanlaw",
 61 |       "id": 7473521
 62 |     },
 63 |     {
 64 |       "github": "joehiggi1758",
 65 |       "id": 46200959
 66 |     },
 67 |     {
 68 |       "github": "rongou",
 69 |       "id": 497101
 70 |     },
 71 |     {
 72 |       "github": "Tobias-Fischer",
 73 |       "id": 5497832
 74 |     },
 75 |     {
 76 |       "github": "mgorny",
 77 |       "id": 110765
 78 |     },
 79 |     {
 80 |       "github": "beckermr",
 81 |       "id": 5296416
 82 |     },
 83 |     {
 84 |       "github": "mattip",
 85 |       "id": 823911
 86 |     },
 87 |     {
 88 |       "github": "vicentebolea",
 89 |       "id": 939798
 90 |     },
 91 |     {
 92 |       "github": "apmorton",
 93 |       "id": 63636
 94 |     },
 95 |     {
 96 |       "github": "cbourjau",
 97 |       "id": 3288058
 98 |     }
 99 |   ]
100 | }
101 | 


--------------------------------------------------------------------------------
/architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quansight/open-gpu-server/68ead7e2fcdac69ec8462992f2b09646fcca606e/architecture.png


--------------------------------------------------------------------------------
/docs/README.md:
--------------------------------------------------------------------------------
1 | # Setup overview
2 | 
3 | ![arch](../architecture.png)
4 | 
5 | * The server runs OpenStack, which is able to spin up VMs on demand. See [`openstack.md`](./openstack.md) for more information.
6 | * The VMs run images built with [diskimage-builder](https://docs.openstack.org/diskimage-builder/latest/). See [`images.md`](./images.md) for more information.
7 | * Cirun knows how to connect to OpenStack to spin up VMs and expose them to Github Actions as self-hosted runners. See [`cirun.md`](./cirun.md) for more information.
8 | * The server is behind a VPN, which prevents public access to the server. See [`network.md`](./network.md) for more information.
9 | 


--------------------------------------------------------------------------------
/docs/cirun.md:
--------------------------------------------------------------------------------
 1 | # Cirun
 2 | 
 3 | The CI is powered by [cirun.io](https://cirun.io). Cirun is a service to spinup GitHub Actions Runners on a cloud provider (OpenStack in this case).
 4 | 
 5 | ## Setup
 6 | 
 7 | Roughly:
 8 | 
 9 | 1. Organizations need to install the Cirun app for Github, and enable it in the target repositories.
10 | 2. Add the cloud provider details in Cirun's configuration dashboard.
11 | 3. Add the runner configuration in the `.cirun.yml` file in the repository.
12 | 4. Modify the GitHub Actions workflow to use the self-hosted runners created by Cirun.
13 | 
14 | More details can be found in Cirun's documentation.
15 | 
16 | ## How does it work
17 | 
18 | Cirun application is installed on a GitHub repository, which authorises it to listen to webhook events. Whenever there is a
19 | GitHub Actions workflow job, cirun receives a webhook event, which contains the label for the requested runners. Cirun then
20 | reads the cirun cofniguration file to find our the full runner configuration such as cloud, image, instance type, etc.
21 | 
22 | Cirun then makes a request to the given cloud provider (OpenStack in this case) to create a runner. The request full provisioning
23 | configuration to spinup the runner and connect it to the github repository and run the requested job. OpenStack when recieved the API
24 | request creates a VM for the job and deletes the VM when it recieves another request from Cirun when the job is completed.
25 | 


--------------------------------------------------------------------------------
/docs/images.md:
--------------------------------------------------------------------------------
1 | # Images
2 | 
3 | OpenStack needs VM image(s) to spinup the VMs.
4 | 
5 | We use a tool called [Diskimage-builder](https://docs.openstack.org/diskimage-builder/latest/) for building images for our OpenStack installation.
6 | 
7 | The scripts for the same are available at [aktech/nvidia-openstack-image](https://github.com/aktech/nvidia-openstack-image).
8 | 


--------------------------------------------------------------------------------
/docs/network.md:
--------------------------------------------------------------------------------
 1 | # Network
 2 | 
 3 | The GPU Server sits behind a VPN, which prevents public access to the server.
 4 | 
 5 | However, Cirun needs access to the OpenStack API to be able to create and destroy runners (VMs). To
 6 | tackle this problem we have created a proxy server on GCP whose IP address is allow-listed in the
 7 | VPN, so that it can access the GPU server from outside the VPN.
 8 | 
 9 | The proxy server points to [ci.quansight.dev](https://ci.quansight.dev).
10 | Apart from providing a gateway to the GPU server it also proxies request to the server,
11 | which is basically being able to access the OpenStack API.
12 | 
13 | The OpenStack web interface is also accesible at [ci.quansight.dev](ci.quansight.dev/).
14 | 


--------------------------------------------------------------------------------
/docs/openstack.md:
--------------------------------------------------------------------------------
  1 | # OpenStack setup
  2 | 
  3 | The GPU Server runs [OpenStack](https://docs.openstack.org/kolla/latest/). Its [services](#openstack-services) run via Docker containers, which provides a way to spin up isolated VMs which can be used for CI, testing,development, or sandbox environments.
  4 | 
  5 | ## Deploying OpenStack
  6 | 
  7 | The deployment of OpenStack uses [Kolla](https://docs.openstack.org/kolla/latest/), which provides provide production-ready containers and deployment tools for deploying OpenStack Cloud.
  8 | 
  9 | The OpenStack setup is deployed via [Kolla Ansible](https://docs.openstack.org/kolla-ansible/latest/), which allows deploying
 10 | OpenStack with a single command via ansible. All the configuration is done via a single global configuration file.
 11 | More information on deployment can be found on the deployment repository: https://github.com/aktech/gpu-server-openstack-config (TODO: Move repo to quansight.)
 12 | 
 13 | ## Configuring access to GPUs
 14 | 
 15 | ### Passthrough
 16 | 
 17 | The GPUs need to be passed to OpenStack VMs via GPU passthrough, we enabled it in two places:
 18 | 
 19 | - By checking the **SVM Mode** in the BIOS setting.
 20 | - GRUB, by editing `/etc/default/grub`:
 21 | 
 22 | ```
 23 | GRUB_CMDLINE_LINUX_DEFAULT="amd_iommu=on iommu=pt vfio-pci.ids=10de:1db4 vfio_iommu_type1.allow_unsafe_interrupts=1 modprobe.blacklist=nvidiafb,nouveau"
 24 | ```
 25 | 
 26 | This also prevents NVIDIA drivers to directly access the GPUs from the GPU server, as they need to be passed to the VMs
 27 | created by OpenStack instead and hence detected by the NVIDIA drivers in those VMs.
 28 | 
 29 | > Note these `vfio-pci.ids` match the device IDs in the output below.
 30 | 
 31 | ### IOMMU groups
 32 | 
 33 | There are 6 x GPUs attached to the GPU server. They are in distributed in 4 IOMMU groups, as shown below:
 34 | 
 35 | ```bash
 36 | function lsiommu() {
 37 |     for d in $(find /sys/kernel/iommu_groups/ -type l | sort -n -k5 -t/); do
 38 |         n=${d#*/iommu_groups/*}
 39 |         n=${n%%/*}
 40 |         printf 'IOMMU Group %s ' "$n"
 41 |         lspci -nns "${d##*/}"
 42 |     done;
 43 | }
 44 | lsiommu | grep NVIDIA
 45 | # prints:
 46 | # IOMMU Group 19 27:00.0 3D controller [0302]: NVIDIA Corporation GV100GL [Tesla V100 PCIe 16GB] [10de:1db4] (rev a1)
 47 | # IOMMU Group 19 28:00.0 3D controller [0302]: NVIDIA Corporation GV100GL [Tesla V100 PCIe 16GB] [10de:1db4] (rev a1)
 48 | # IOMMU Group 32 44:00.0 3D controller [0302]: NVIDIA Corporation GV100GL [Tesla V100 PCIe 16GB] [10de:1db4] (rev a1)
 49 | # IOMMU Group 75 a3:00.0 3D controller [0302]: NVIDIA Corporation GV100GL [Tesla V100 PCIe 16GB] [10de:1db4] (rev a1)
 50 | # IOMMU Group 87 c3:00.0 3D controller [0302]: NVIDIA Corporation GV100GL [Tesla V100 PCIe 16GB] [10de:1db4] (rev a1)
 51 | # IOMMU Group 87 c4:00.0 3D controller [0302]: NVIDIA Corporation GV100GL [Tesla V100 PCIe 16GB] [10de:1db4] (rev a1)
 52 | ```
 53 | 
 54 | <details>
 55 | 
 56 | <summary>What is an IOMMU group?</summary>
 57 | 
 58 | The **Input-Output Memory Management Unit (IOMMU)** is a component in a memory controller that translates device virtual addresses
 59 | (can be also called I/O addresses or device addresses) to physical addresses. The concept of IOMMU is similar to Memory Management Unit (MMU).
 60 | 
 61 | The difference between IOMMU and MMU is that IOMMU translates device virtual addresses to physical addresses while MMU translates
 62 | CPU virtual addresses to physical addresses.
 63 | 
 64 | An **IOMMU group** is the smallest set of physical devices that can be passed to a virtual machine.
 65 | 
 66 | </details>
 67 | 
 68 | Devices in the same IOMMU group can not to attached to different VMs. For example, GPUs with address `27:00.0` and `28:00.0` are
 69 | in same IOMMU group **19**, which means they cannot be attached to two separate VMs. For this reason, we can spin up upto 4 VMs with at least one GPU each.
 70 | 
 71 | The way we access these specific GPUs is by configuring openstack to only load the following GPU addresses for VM creations (when GPU is requested), below is the sample configuration from nova service from openstack for selecting specific GPUs:
 72 | 
 73 | ```conf
 74 | # nova.conf
 75 | [pci]
 76 | device_spec = [{"address": "a3:00.0"}, {"address": "c3:00.0"}, {"address": "27:00.0"}, {"address": "44:00.0"}]
 77 | ```
 78 | 
 79 | ## Flavors
 80 | 
 81 | Flavors define the compute, memory, and storage capacity and attached PCI devices (GPU in this case) for creation of a VM.
 82 | We have defined a bunch of flavors for different use cases like GPU/CPU, Low/High Memory/cores runners. They can be
 83 | created via `openstack flavor` command. Example:
 84 | 
 85 | ```bash
 86 | openstack flavor create --public gpu_4xlarge --id gpu_4xlarge --ram 65536 --disk 60 --vcpus 8
 87 | openstack flavor set gpu_4xlarge --property  "pci_passthrough:alias"="tesla-v100:1"
 88 | ```
 89 | 
 90 | See [README.md](/README.md) for currently available flavors.
 91 | 
 92 | ## Resource quotas
 93 | 
 94 | ### Query current quotas
 95 | 
 96 | The resource quota(s) for OpenStack can be seen via the following command:
 97 | 
 98 | ```bash
 99 | openstack quota show --default
100 | ```
101 | 
102 | ### Update quotas
103 | 
104 | The resource quota(s) can be updated via `openstack quota set`. The example below is to update max instance count that can be spun up:
105 | 
106 | ```
107 | openstack quota set --class --instances 15 default
108 | ```
109 | 
110 | ## OpenStack Services
111 | 
112 | Here is a list of core OpenStack services with brief summaries.
113 | 
114 | - Cinder: Block Storage service for providing volumes to Nova virtual machines
115 | - Nova: responsible for provisioning of compute instances
116 | - Horizon: Web based user interface to OpenStack services
117 | - Keystone: Identity and authentication for all OpenStack services.
118 | - Glance: Compute image repository
119 | - Neutron: responsible for provisioning the virtual or physical networks that compute instances connect to on boot.
120 | - Placement: responsible for tracking inventory of resources available in a cloud and assisting in choosing which
121 | 
122 | More about these can be read on OpenStack documentation.
123 | 


--------------------------------------------------------------------------------
/vm-images/.cirun.yml:
--------------------------------------------------------------------------------
 1 | # Self-Hosted Github Action Runners on Openstack via Cirun.io
 2 | # Reference: https://docs.cirun.io/reference/yaml
 3 | runners:
 4 |   - name: openstack-runner
 5 |     # Cloud Provider: Openstack
 6 |     cloud: openstack
 7 |     # Instance type refers to flavors in openstack
 8 |     instance_type: gpu_large
 9 |     # The machine image to use for creating VM
10 |     machine_image: ubuntu-2204-nvidia-docker-20221229
11 |     region: RegionOne
12 |     labels:
13 |       - cirun-runner
14 | 


--------------------------------------------------------------------------------
/vm-images/README.md:
--------------------------------------------------------------------------------
 1 | # VM Images for OpenStack
 2 | 
 3 | Creates Ubuntu VM Images with following things installed:
 4 | 
 5 | - Nvidia drivers (GPU images only)
 6 | - Docker
 7 | 
 8 | The created image is uploaded to a GCS bucket, which is then retrieved by a self-hosted GHA runner running on our OpenStack instance.
 9 | 
10 | ## GHA workflows
11 | 
12 | These should automate the creation of the images:
13 | 
14 | - `.github/workflows/build-vm-images.yml` - Github Action to build the image
15 | - `.github/workflows/openstack.yml` - Github Action to upload the VM image to OpenStack
16 | 
17 | ## Building manually
18 | 
19 | 1. Install `diskimage-builder` manually via `pip install -r requirements.txt`. A virtual environment is advised.
20 | 2. Run `scripts/build-image.sh` to build the image. This will create a `.qcow2` file. 
21 |     - Export `$IMAGE_YAML` to choose a different image to build (e.g. `cpu-image.yaml`, `gpu-image.yaml`).
22 |     - Export `$OUTPUT_IMAGE` to change the qcow2 output filename.
23 | 
24 | ## Add Image to OpenStack
25 | 
26 | ```bash
27 | openstack image create ubuntu-2204-nvidia-docker \
28 |   --public --disk-format qcow2 \
29 |   --container-format bare \
30 |   --file <created-image>.qcow2
31 | ```
32 | 


--------------------------------------------------------------------------------
/vm-images/build-image.sh:
--------------------------------------------------------------------------------
 1 | set -ex
 2 | 
 3 | DIB_RELEASE=jammy
 4 | DIB_CLOUD_IMAGES=https://cloud-images.ubuntu.com/jammy/20230914/
 5 | DIB_MODPROBE_BLACKLIST=”nouveau”
 6 | DIB_CLOUD_INIT_DATASOURCES="OpenStack"
 7 | DIB_DHCP_TIMEOUT=30
 8 | DIB_NO_TMPFS=1
 9 | 
10 | if [[ -z "${LOG_TO_FILE}" ]]; then 
11 |   LOG_TO_FILE="--logfile dib.log"
12 | fi
13 | if [[ -z "${IMAGE_YAML}" ]]; then 
14 |   IMAGE_YAML="cpu-image.yaml"
15 | fi
16 | if [[ -z "${OUTPUT_IMAGE}" ]]; then 
17 |   output_fn=$(basename -- "${IMAGE_YAML}")
18 |   OUTPUT_IMAGE="${output_fn%.*}-$(date +%Y%m%d%H%M).qcow2"
19 | fi
20 | 
21 | echo "Starting Disk Image builder"
22 | disk-image-create \
23 |   ${IMAGE_YAML:-cpu-image.yaml}
24 |   --no-tmpfs \
25 |   -o "${OUTPUT_IMAGE}"
26 | echo "Starting Disk Image Finished"
27 | 


--------------------------------------------------------------------------------
/vm-images/cpu-image.yaml:
--------------------------------------------------------------------------------
1 | - elements:
2 |   - vm
3 |   - dhcp-all-interfaces
4 |   - block-device-gpt
5 |   - ubuntu
6 |   - misc
7 | 


--------------------------------------------------------------------------------
/vm-images/elements/cuda/element-deps:
--------------------------------------------------------------------------------
1 | package-installs
2 | 


--------------------------------------------------------------------------------
/vm-images/elements/cuda/package-installs.yaml:
--------------------------------------------------------------------------------
1 | build-essential:
2 | 


--------------------------------------------------------------------------------
/vm-images/elements/cuda/post-install.d/05-cuda-install:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | if [ ${DIB_DEBUG_TRACE:-0} -gt 0 ]; then
 4 |     set -x
 5 | fi
 6 | set -o errexit
 7 | set -o nounset
 8 | set -o pipefail
 9 | 
10 | 
11 | echo "#############################################################"
12 | echo "# CUDA Installation                                         #"
13 | echo "#############################################################"
14 | 
15 | ## comparing chroot kernel with host kernel
16 | #CHROOT_KERNEL=$(rpm -q --queryformat "%{installtime} %{version}-%{release}.%{arch}\n" kernel | sort -nr | sed -n 1p | cut -d' ' -f2)
17 | #HOST_KERNEL=$(uname -r)
18 | #if [ "$CHROOT_KERNEL" != "$HOST_KERNEL" ]; then
19 | #	"ERROR: kernel mismatch!"
20 | #	exit 1
21 | #fi
22 | 
23 | echo "Add hostname to /etc/hosts"
24 | echo $(hostname -I | cut -d\  -f1) $(hostname) | sudo tee -a /etc/hosts
25 | 
26 | echo "Installing linux headers"
27 | sudo apt-get install linux-headers-"$(uname -r)" -y
28 | 
29 | echo "Installing Nvidia Drivers"
30 | echo 'debconf debconf/frontend select Noninteractive' | sudo debconf-set-selections
31 | DEBIAN_FRONTEND=noninteractive sudo apt-get install -y -q
32 | DEBIAN_FRONTEND=noninteractive sudo apt update
33 | DEBIAN_FRONTEND=noninteractive sudo apt upgrade -y
34 | DEBIAN_FRONTEND=noninteractive sudo apt install nvidia-driver-525 -y
35 | echo "Put nvidia drivers on hold: DO NOT UPGRADE basically"
36 | dpkg-query -W --showformat='${Package} ${Status}\n' | grep -v deinstall | awk '{ print $1 }' | grep -E 'nvidia.*-[0-9]+$' | xargs -r -L 1 sudo apt-mark hold
37 | 
38 | echo "#############################################################"
39 | echo "# Install Nvidia Container Toolkit                          #"
40 | echo "#############################################################"
41 | 
42 | curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \
43 |   && curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
44 |     sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
45 |     sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
46 | 
47 | DEBIAN_FRONTEND=noninteractive sudo apt-get update
48 | DEBIAN_FRONTEND=noninteractive sudo apt-get install -y nvidia-container-toolkit
49 | echo "Done"
50 | 


--------------------------------------------------------------------------------
/vm-images/elements/misc/element-deps:
--------------------------------------------------------------------------------
1 | package-installs
2 | 


--------------------------------------------------------------------------------
/vm-images/elements/misc/package-installs.yaml:
--------------------------------------------------------------------------------
1 | build-essential:
2 | 


--------------------------------------------------------------------------------
/vm-images/elements/misc/post-install.d/01-install-misc:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | if [ ${DIB_DEBUG_TRACE:-0} -gt 0 ]; then
 4 |     set -x
 5 | fi
 6 | set -o errexit
 7 | set -o nounset
 8 | set -o pipefail
 9 | 
10 | echo "#############################################################"
11 | echo "# Docker Installation                                       #"
12 | echo "#############################################################"
13 | 
14 | echo "Installing Docker"
15 | echo 'debconf debconf/frontend select Noninteractive' | sudo debconf-set-selections
16 | DEBIAN_FRONTEND=noninteractive sudo apt-get install -y -q
17 | DEBIAN_FRONTEND=noninteractive sudo apt update
18 | DEBIAN_FRONTEND=noninteractive sudo apt upgrade -y
19 | DEBIAN_FRONTEND=noninteractive sudo apt-get install ca-certificates curl gnupg lsb-release -y
20 | sudo mkdir -p /etc/apt/keyrings
21 | curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /etc/apt/keyrings/docker.gpg
22 | echo \
23 |   "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu \
24 |   $(lsb_release -cs) stable" | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
25 | DEBIAN_FRONTEND=noninteractive sudo apt update
26 | sudo apt-get install docker-ce docker-ce-cli containerd.io docker-compose-plugin -y
27 | echo "Done"
28 | 
29 | echo "#############################################################"
30 | echo "# Docker Installation Complete                              #"
31 | echo "#############################################################"
32 | 
33 | echo "#############################################################"
34 | echo "# Docker MTU PATCH                                          #"
35 | echo "#############################################################"
36 | 
37 | python3 <<EOF
38 | import json
39 | from pathlib import Path
40 | mtu_data = {"mtu": 1200}
41 | daemon_json = Path("/etc/docker/daemon.json")
42 | if daemon_json.is_file():
43 |     data = json.loads(daemon_json.read_text())
44 |     data.update(mtu_data)
45 |     daemon_json.write_text(json.dumps(data))
46 | else:
47 |     daemon_json.write_text(json.dumps(mtu_data))
48 | EOF
49 | 
50 | echo "#############################################################"
51 | echo "# MISC DONE                                                 #"
52 | echo "#############################################################"
53 | 


--------------------------------------------------------------------------------
/vm-images/gpu-image.yaml:
--------------------------------------------------------------------------------
1 | - elements:
2 |   - vm
3 |   - dhcp-all-interfaces
4 |   - block-device-gpt
5 |   - ubuntu
6 |   - cuda
7 |   - misc
8 | 


--------------------------------------------------------------------------------
/vm-images/requirements.txt:
--------------------------------------------------------------------------------
1 | diskimage-builder==3.25.0
2 | 


--------------------------------------------------------------------------------