├── .github
    ├── configurations
    │   ├── torch-base.yml
    │   └── torch-nccl.yml
    └── workflows
    │   ├── bloom.yml
    │   ├── build.yml
    │   ├── cuda-ssh.yml
    │   ├── cw-mega-sam.yml
    │   ├── gpt-neox-determined.yml
    │   ├── gpt-neox-mpi.yml
    │   ├── hf-llm-inference.yml
    │   ├── megatron.yml
    │   ├── read-configuration.yml
    │   ├── sd-finetuner.yml
    │   ├── sd-inference.yml
    │   ├── sglang.yml
    │   ├── tensorizer.yml
    │   ├── torch-base.yml
    │   ├── torch-extras.yml
    │   ├── torch-nccl.yml
    │   ├── torch-nightly.yml
    │   ├── torch.yml
    │   └── vllm-tensorizer.yml
├── .gitignore
├── LICENSE
├── bloom
    ├── Dockerfile
    └── environment.yaml
├── catalog.yaml
├── cuda-ssh
    └── Dockerfile
├── cw-mega-sam
    ├── Dockerfile
    ├── cuda124.patch
    └── requirements.txt
├── docs
    └── README.md
├── gpt-neox-determined
    └── Dockerfile
├── gpt-neox-mpi
    └── Dockerfile
├── hf-llm-inference
    └── Dockerfile
├── megatron
    ├── Dockerfile
    └── requirements.txt
├── mkdocs.yml
├── sd-finetuner
    └── Dockerfile
├── sd-inference
    └── Dockerfile
├── sglang
    ├── Dockerfile
    ├── build.bash
    └── install.bash
├── tensorizer
    └── Dockerfile
├── torch-extras
    ├── Dockerfile
    ├── compiler_wrapper.f95
    ├── effective_cpu_count.sh
    ├── install_cudnn.sh
    └── scale.sh
├── torch
    ├── Dockerfile
    ├── compiler_wrapper.f95
    ├── effective_cpu_count.sh
    ├── install_cudnn.sh
    ├── scale.sh
    └── torchaudio-cu125-pr3811.patch
└── vllm-tensorizer
    ├── Dockerfile
    └── freeze.sh


/.github/configurations/torch-base.yml:
--------------------------------------------------------------------------------
1 | cuda: ['12.9.0', '12.8.1', '12.6.3']  # CUDA versions; each selects an nvidia/cuda image tag in the torch-base workflow
2 | os: ['ubuntu22.04']  # OS flavor suffix of the nvidia/cuda image tag
3 | abi: [1]  # forwarded as BUILD_CXX11_ABI by the torch-base workflow
4 | include:  # library versions, listed once next to the matrix axes above
5 |   - torch: '2.7.0'
6 |     vision: '0.22.0'
7 |     audio: '2.7.0'
8 | 


--------------------------------------------------------------------------------
/.github/configurations/torch-nccl.yml:
--------------------------------------------------------------------------------
 1 | cuda: ['12.9.0', '12.8.1', '12.6.3']  # CUDA versions; part of the nccl-tests image tag in the torch-nccl workflow
 2 | os: ['ubuntu22.04']  # OS flavor, also part of the nccl-tests image tag
 3 | abi: [1]  # forwarded as BUILD_CXX11_ABI by the torch-nccl workflow
 4 | include:  # library versions, listed once next to the matrix axes above
 5 |   - torch: '2.7.0'
 6 |     vision: '0.22.0'
 7 |     audio: '2.7.0'
 8 |     nccl: '2.27.3-1'
 9 |     nccl-tests-hash: 'd82e3c0'  # quoted so a hex-looking hash is always read as a string
10 | 


--------------------------------------------------------------------------------
/.github/workflows/bloom.yml:
--------------------------------------------------------------------------------
 1 | on:  # runs on manual dispatch, or on pushes touching the paths below
 2 |   workflow_dispatch:
 3 |   push:
 4 |     paths:
 5 |       - 'bloom/**'
 6 |       - '.github/workflows/bloom.yml'
 7 |       - '.github/workflows/build.yml'
 8 | 
 9 | 
10 | jobs:
11 |   build:  # delegates the image build to the shared reusable workflow
12 |     uses: ./.github/workflows/build.yml
13 |     secrets: inherit
14 |     with:
15 |       image-name: 'bloom'
16 |       folder: 'bloom'
17 |       build-args: ''
18 | 


--------------------------------------------------------------------------------
/.github/workflows/build.yml:
--------------------------------------------------------------------------------
  1 | name: Build
  2 | 
  3 | on:
  4 |   workflow_call:  # reusable workflow: the per-image workflows invoke it via `uses:`
  5 |     inputs:
  6 |       folder:  # used as the Docker build context
  7 |         required: true
  8 |         type: string
  9 |       image-name:  # appended to the registry path to form the image reference
 10 |         required: true
 11 |         type: string
 12 |       build-args:  # passed through to docker/build-push-action's build-args input
 13 |         required: false
 14 |         type: string
 15 |       tag-suffix:  # when non-empty, appended to the image tag after a '-'
 16 |         required: false
 17 |         type: string
 18 |       cache-key:
 19 |         required: false
 20 |         description: "Optional sub-key to append to the image name for build layer caching"
 21 |         type: string
 22 |       platforms:
 23 |         required: false
 24 |         description: "Platforms for which to build (default: linux/amd64,linux/arm64)"
 25 |         type: string
 26 |         default: linux/amd64,linux/arm64
 27 |     outputs:  # each value re-exports the matching output of the `build` job
 28 |       outcome:
 29 |         description: "The outcome of the build"
 30 |         value: ${{ jobs.build.outputs.outcome }}
 31 |       tags:
 32 |         description: "The resulting image tags"
 33 |         value: ${{ jobs.build.outputs.tags }}
 34 |       version:
 35 |         description: "The resulting image version"
 36 |         value: ${{ jobs.build.outputs.version }}
 37 | 
 38 | jobs:
 39 |   build:  # builds the image with remote BuildKit builders, pushes it, then comments on the PR if one exists
 40 |     name: Build Images
 41 |     runs-on: [ cw ]
 42 |     container: 'ghcr.io/coreweave/github-actions-images/github-base-runner:v1.9.0'
 43 |     timeout-minutes: 960  # 16 hours
 44 |     defaults:
 45 |       run:
 46 |         shell: bash
 47 |     outputs:  # re-exported by the workflow_call outputs of this workflow
 48 |       outcome: ${{ steps.docker-build.outcome }}
 49 |       tags: ${{ steps.meta.outputs.tags }}
 50 |       version: ${{ steps.meta.outputs.version }}
 51 |     steps:
 52 |       - uses: actions/checkout@v4
 53 |       - name: Fetch BuildKit Client Certs  # outputs are read below as TLS_CACERT / TLS_CERT / TLS_KEY
 54 |         uses: dopplerhq/secrets-fetch-action@v1.2.0
 55 |         id: client-certs
 56 |         with:
 57 |           doppler-token: ${{ secrets.ORG_BUILDKIT_CLIENT_TOKEN }}
 58 |           doppler-project: ${{ secrets.BUILDKIT_CONSUMER_DOPPLER_PROJECT }}
 59 |           doppler-config: prod
 60 |           inject-env-vars: false
 61 |       - name: Set up Docker Buildx  # remote driver: an amd64 endpoint plus an appended arm64 endpoint
 62 |         uses: docker/setup-buildx-action@v3.7.1
 63 |         with:
 64 |           driver: remote
 65 |           endpoint: ${{ secrets.BUILDKIT_CONSUMER_AMD64_ENDPOINT }}
 66 |           platforms: linux/amd64
 67 |           append: |
 68 |             - endpoint: ${{ secrets.BUILDKIT_CONSUMER_ARM64_ENDPOINT }}
 69 |               platforms: linux/arm64
 70 |         env:  # the same client certificate material is supplied for both builder nodes
 71 |           BUILDER_NODE_0_AUTH_TLS_CACERT: ${{ steps.client-certs.outputs.TLS_CACERT }}
 72 |           BUILDER_NODE_0_AUTH_TLS_CERT: ${{ steps.client-certs.outputs.TLS_CERT }}
 73 |           BUILDER_NODE_0_AUTH_TLS_KEY: ${{ steps.client-certs.outputs.TLS_KEY }}
 74 |           BUILDER_NODE_1_AUTH_TLS_CACERT: ${{ steps.client-certs.outputs.TLS_CACERT }}
 75 |           BUILDER_NODE_1_AUTH_TLS_CERT: ${{ steps.client-certs.outputs.TLS_CERT }}
 76 |           BUILDER_NODE_1_AUTH_TLS_KEY: ${{ steps.client-certs.outputs.TLS_KEY }}
 77 |       - name: Get base registry  # REGISTRY = ghcr.io/<repository, lowercased via ${VAR,,}>
 78 |         run: |
 79 |           echo "REGISTRY=ghcr.io/${GITHUB_REPOSITORY,,}" >> $GITHUB_ENV
 80 |       - name: Set tag prefix  # only for refs other than main: tags get a "<ref name>-" prefix
 81 |         if: github.ref_name != 'main'
 82 |         run: |
 83 |           echo "TAG_PREFIX=${{ github.ref_name }}-" >> $GITHUB_ENV
 84 |       - name: Set tag suffix  # only when a tag-suffix input was given
 85 |         if: inputs.tag-suffix != ''
 86 |         run: |
 87 |           echo "TAG_SUFFIX=-${{ inputs.tag-suffix }}" >> $GITHUB_ENV
 88 |       - name: Set cache key  # only when a cache-key input was given; otherwise the image name is used below
 89 |         if: inputs.cache-key != ''
 90 |         run: |
 91 |           echo "CACHE_KEY=${{ inputs.image-name }}-${{ inputs.cache-key }}" >> $GITHUB_ENV
 92 |       - name: Extract metadata (tags, labels) for Docker
 93 |         id: meta
 94 |         uses: docker/metadata-action@v5.5.1
 95 |         with:
 96 |           images: ${{ env.REGISTRY }}/${{ inputs.image-name }}
 97 |           tags: |
 98 |             type=sha,prefix=${{ env.TAG_PREFIX }},suffix=${{ env.TAG_SUFFIX }},format=short
 99 |       - name: Initialize registry credentials file  # writes ~/.docker/config.json with mode 400
100 |         env:
101 |           USER: ${{ github.actor }}
102 |           PASS: ${{ secrets.GITHUB_TOKEN }}
103 |         run: |
104 |           jq -n '.auths."ghcr.io" = { username: env.USER, password: env.PASS }' \
105 |           | install -m400 /dev/stdin ~/.docker/config.json
106 |       - name: Build and push Docker image
107 |         id: docker-build
108 |         uses: docker/build-push-action@v6.9.0
109 |         with:
110 |           context: ${{ inputs.folder }}
111 |           build-args: |-
112 |             ${{ inputs.build-args }}
113 |           push: true
114 |           tags: ${{ steps.meta.outputs.tags }}
115 |           labels: ${{ steps.meta.outputs.labels }}
116 |           cache-from: type=registry,ref=${{ env.REGISTRY }}/buildcache:${{ env.CACHE_KEY || inputs.image-name }}
117 |           cache-to: type=registry,ref=${{ env.REGISTRY }}/buildcache:${{ env.CACHE_KEY || inputs.image-name }},mode=max
118 |           platforms: ${{ inputs.platforms }}
119 |       - name: Clear registry credentials  # runs even after a failed build; also verifies the file is gone
120 |         if: always()
121 |         run: |
122 |           rm -f ~/.docker/config.json && [ ! -e ~/.docker/config.json ]
123 |       - uses: 8BitJonny/gh-get-current-pr@2.1.3
124 |         id: PR
125 |         with:
126 |           filterOutClosed: true
127 |       - name: Comment  # posted only when the previous step found a PR number
128 |         if: steps.PR.outputs.number
129 |         uses: peter-evans/create-or-update-comment@v2.1.0
130 |         with:
131 |           issue-number: ${{ steps.PR.outputs.number }}
132 |           body: >
133 |             @${{ github.triggering_actor }} Build complete, ${{ steps.docker-build.outcome }}:
134 |             ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
135 | 
136 |             Image: `${{ fromJSON(steps.docker-build.outputs.metadata)['image.name'] }}`
137 | 


--------------------------------------------------------------------------------
/.github/workflows/cuda-ssh.yml:
--------------------------------------------------------------------------------
 1 | on:  # runs on manual dispatch, or on pushes touching the paths below
 2 |   workflow_dispatch:
 3 |   push:
 4 |     paths:
 5 |       - 'cuda-ssh/**'
 6 |       - '.github/workflows/cuda-ssh.yml'
 7 |       - '.github/workflows/build.yml'
 8 | 
 9 | 
10 | jobs:
11 |   build:  # one call of the shared build workflow per matrix tag
12 |     strategy:
13 |       matrix:
14 |         tag:  # torch image tags; each is used both as the BASE_IMAGE tag and in the tag suffix
15 |           - 'ceeb8c2-base-cuda11.8.0-torch2.0.1-vision0.15.2-audio2.0.2'
16 |           - 'ceeb8c2-nccl-cuda11.8.0-nccl2.16.2-1-torch2.0.1-vision0.15.2-audio2.0.2'
17 | 
18 |     uses: ./.github/workflows/build.yml
19 |     secrets: inherit
20 |     with:
21 |       image-name: 'cuda-ssh'
22 |       folder: 'cuda-ssh'
23 |       tag-suffix: torch-${{ matrix.tag }}
24 |       build-args: |
25 |         BASE_IMAGE=ghcr.io/coreweave/ml-containers/torch:${{ matrix.tag }}
26 | 


--------------------------------------------------------------------------------
/.github/workflows/cw-mega-sam.yml:
--------------------------------------------------------------------------------
 1 | on:  # runs on manual dispatch (with inputs), or on pushes touching the paths below
 2 |   workflow_dispatch:
 3 |     inputs:
 4 |       base-image:
 5 |         description: 'Base image to use'
 6 |         required: true
 7 |       commit:
 8 |         description: 'Commit of Mega-sam to include'
 9 |         required: true
10 |   push:
11 |     paths:
12 |       - 'cw-mega-sam/**'
13 |       - '.github/workflows/cw-mega-sam.yml'
14 |       - '.github/workflows/build.yml'
15 | 
16 | jobs:
17 |   build:  # delegates to the shared build workflow
18 |     uses: ./.github/workflows/build.yml
19 |     secrets: inherit
20 |     with:  # the `||` fallbacks below apply whenever the dispatch inputs are empty
21 |       image-name: 'cw-mega-sam'
22 |       folder: 'cw-mega-sam'
23 |       build-args: |
24 |         BASE_IMAGE=${{ inputs.base-image || 'ghcr.io/coreweave/ml-containers/torch-extras:es-actions-68fbfd1-nccl-cuda12.4.1-ubuntu22.04-nccl2.25.1-1-torch2.6.0-vision0.21.0-audio2.6.0-abi0'}}
25 |         COMMIT=${{ inputs.commit || 'main'}}
26 | 


--------------------------------------------------------------------------------
/.github/workflows/gpt-neox-determined.yml:
--------------------------------------------------------------------------------
 1 | on:  # runs on manual dispatch, or on pushes touching the paths below
 2 |   workflow_dispatch:
 3 |   push:
 4 |     paths:
 5 |       - 'gpt-neox-determined/**'
 6 |       - '.github/workflows/gpt-neox-determined.yml'
 7 |       - '.github/workflows/build.yml'
 8 | 
 9 | 
10 | jobs:
11 |   build:  # delegates the image build to the shared reusable workflow
12 |     uses: ./.github/workflows/build.yml
13 |     secrets: inherit
14 |     with:
15 |       image-name: 'gpt-neox-determined'
16 |       folder: 'gpt-neox-determined'
17 |       build-args: ''
18 | 


--------------------------------------------------------------------------------
/.github/workflows/gpt-neox-mpi.yml:
--------------------------------------------------------------------------------
 1 | on:  # runs on manual dispatch, or on pushes touching the paths below
 2 |   workflow_dispatch:
 3 |   push:
 4 |     paths:
 5 |       - 'gpt-neox-mpi/**'
 6 |       - '.github/workflows/gpt-neox-mpi.yml'
 7 |       - '.github/workflows/build.yml'
 8 | 
 9 | 
10 | jobs:
11 |   build:  # delegates the image build to the shared reusable workflow
12 |     uses: ./.github/workflows/build.yml
13 |     secrets: inherit
14 |     with:
15 |       image-name: 'gpt-neox-mpi'
16 |       folder: 'gpt-neox-mpi'
17 |       build-args: ''
18 | 


--------------------------------------------------------------------------------
/.github/workflows/hf-llm-inference.yml:
--------------------------------------------------------------------------------
 1 | on:  # runs on manual dispatch (with a commit input), or on pushes touching the paths below
 2 |   workflow_dispatch:
 3 |     inputs:
 4 |       commit:
 5 |         description: "Commit to build"
 6 |         required: true
 7 |   push:
 8 |     paths:
 9 |       - 'hf-llm-inference/**'
10 |       - '.github/workflows/hf-llm-inference.yml'
11 |       - '.github/workflows/build.yml'
12 | 
13 | 
14 | jobs:
15 |   build:  # delegates to the shared build workflow
16 |     uses: ./.github/workflows/build.yml
17 |     secrets: inherit
18 |     with:  # the build-args line expands to "COMMIT=<commit>" only when a commit input is present
19 |       image-name: 'hf-llm-inference'
20 |       folder: 'hf-llm-inference'
21 |       build-args: |
22 |         ${{ inputs.commit && 'COMMIT=' }}${{ inputs.commit }}
23 | 


--------------------------------------------------------------------------------
/.github/workflows/megatron.yml:
--------------------------------------------------------------------------------
 1 | on:  # runs on manual dispatch (with inputs), or on pushes touching the paths below
 2 |   workflow_dispatch:
 3 |     inputs:
 4 |       base-image:
 5 |         description: "Base image to use"
 6 |         required: true
 7 |       commit:
 8 |         description: "Commit of Megatron to include"
 9 |         required: true
10 |   push:
11 |     paths:
12 |       - 'megatron/**'
13 |       - '.github/workflows/megatron.yml'
14 |       - '.github/workflows/build.yml'
15 | 
16 | 
17 | jobs:
18 |   build:  # delegates to the shared build workflow
19 |     uses: ./.github/workflows/build.yml
20 |     secrets: inherit
21 |     with:  # the `||` fallbacks below apply whenever the dispatch inputs are empty
22 |       image-name: 'megatron'
23 |       folder: 'megatron'
24 |       build-args: |
25 |         BASE_IMAGE=${{ inputs.base-image || 'ghcr.io/coreweave/ml-containers/torch-extras:bfe03aa-nccl-cuda12.4.1-ubuntu22.04-nccl2.21.5-1-torch2.4.0-vision0.19.0-audio2.4.0'}}
26 |         COMMIT=${{ inputs.commit || 'main'}}


--------------------------------------------------------------------------------
/.github/workflows/read-configuration.yml:
--------------------------------------------------------------------------------
 1 | name: read-configuration
 2 | 
 3 | on:
 4 |   workflow_call:  # reusable workflow: reads a file with yq and returns its contents as JSON
 5 |     inputs:
 6 |       path:  # file handed to yq
 7 |         required: true
 8 |         type: string
 9 |       filter:  # optional yq expression; when empty the whole file is returned
10 |         required: false
11 |         type: string
12 |     outputs:
13 |       config:
14 |         description: "The retrieved configuration, as JSON"
15 |         value: ${{ jobs.read-file.outputs.config }}
16 | 
17 | jobs:
18 |   read-file:
19 |     name: Read Configuration File
20 |     runs-on: [ cw ]
21 |     container: 'ghcr.io/coreweave/github-actions-images/github-base-runner:v1.4.0'
22 |     defaults:
23 |       run:
24 |         shell: bash
25 |     permissions: {}  # empty mapping: no token permissions are requested for this job
26 |     outputs:
27 |       config: ${{ steps.read.outputs.contents }}
28 |     steps:
29 |       - uses: actions/checkout@v4
30 |       - name: Read configuration  # writes the JSON to the step output and a pretty-printed copy to the step summary
31 |         id: read
32 |         env:  # inputs reach the script as environment variables, not inline expressions
33 |           FILE_PATH: ${{ inputs.path }}
34 |           FILTER: ${{ inputs.filter }}
35 |         run: |
36 |           set -x;
37 |           if [ -n "$FILTER" ]; then
38 |             CONTENTS="$(yq e "$FILE_PATH" --expression "$FILTER" -oj -I0)";
39 |           else
40 |             CONTENTS="$(yq e "$FILE_PATH" -oj -I0)";
41 |           fi;
42 |           echo "contents=$CONTENTS" >> "$GITHUB_OUTPUT";
43 |           
44 |           {
45 |             echo '## Configuration';
46 |             echo '```json';
47 |             echo "$CONTENTS" | jq .;
48 |             echo '```';
49 |           } >> "$GITHUB_STEP_SUMMARY";
50 | 


--------------------------------------------------------------------------------
/.github/workflows/sd-finetuner.yml:
--------------------------------------------------------------------------------
 1 | on:  # runs on manual dispatch (with a commit input), or on pushes touching the paths below
 2 |   workflow_dispatch:
 3 |     inputs:
 4 |       commit:
 5 |         description: 'Commit to build'
 6 |         required: true
 7 |         default: 'master'
 8 |   push:
 9 |     paths:
10 |       - "sd-finetuner/**"
11 |       - ".github/workflows/sd-finetuner.yml"
12 |       - ".github/workflows/build.yml"
13 | 
14 | 
15 | jobs:
16 |   build:  # delegates to the shared build workflow
17 |     uses: ./.github/workflows/build.yml
18 |     secrets: inherit
19 |     with:  # build-args takes KEY=VALUE entries, not CLI flags; COMMIT is passed only when a commit input exists
20 |       image-name: sd-finetuner
21 |       folder: sd-finetuner
22 |       build-args: "${{ inputs.commit && 'COMMIT=' }}${{ inputs.commit }}"
23 | 


--------------------------------------------------------------------------------
/.github/workflows/sd-inference.yml:
--------------------------------------------------------------------------------
 1 | on:  # runs on manual dispatch (with a commit input), or on pushes touching the paths below
 2 |   workflow_dispatch:
 3 |     inputs:
 4 |       commit:
 5 |         description: 'Commit to build'
 6 |         required: true
 7 |         default: 'master'
 8 |   push:
 9 |     paths:
10 |       - "sd-inference/**"
11 |       - ".github/workflows/sd-inference.yml"
12 |       - ".github/workflows/build.yml"
13 | 
14 | 
15 | jobs:
16 |   build:  # delegates to the shared build workflow
17 |     uses: ./.github/workflows/build.yml
18 |     secrets: inherit
19 |     with:  # COMMIT is passed only when a commit input exists, so push builds no longer send an empty COMMIT=
20 |       image-name: sd-inference
21 |       folder: sd-inference
22 |       build-args: |
23 |         ${{ inputs.commit && 'COMMIT=' }}${{ inputs.commit }}
24 | 


--------------------------------------------------------------------------------
/.github/workflows/sglang.yml:
--------------------------------------------------------------------------------
 1 | on:  # runs on manual dispatch (with inputs), or on pushes touching the paths below
 2 |   workflow_dispatch:
 3 |     inputs:
 4 |       tag:
 5 |         description: 'Tag for the build'
 6 |         required: true
 7 |       base-image:
 8 |         description: 'Base image from which to build'
 9 |         required: true
10 |       builder-image:
11 |         description: 'Image to use to compile wheels, if different from the base image'
12 |         required: false
13 |   push:
14 |     paths:
15 |       - "sglang/**"
16 |       - ".github/workflows/sglang.yml"
17 |       - ".github/workflows/build.yml"
18 | 
19 | 
20 | jobs:
21 |   build:  # delegates to the shared build workflow
22 |     uses: ./.github/workflows/build.yml
23 |     secrets: inherit
24 |     # The `||` fallbacks apply when the dispatch inputs are empty. The last build-args line
25 |     # forwards builder-image only when it is set. NOTE(review): assumes the sglang Dockerfile
26 |     # declares `ARG BUILDER_IMAGE` - confirm against sglang/Dockerfile.
27 |     with:
28 |       image-name: sglang
29 |       folder: sglang
30 |       tag-suffix: ${{ inputs.tag || '386fabe-nccl-cuda12.8.0-ubuntu22.04-nccl2.25.1-1-torch2.6.0-vision0.21.0-audio2.6.0-abi1' }}
31 |       build-args: |
32 |         BASE_IMAGE=${{ inputs.base-image || 'ghcr.io/coreweave/ml-containers/torch-extras:es-actions-386fabe-nccl-cuda12.8.0-ubuntu22.04-nccl2.25.1-1-torch2.6.0-vision0.21.0-audio2.6.0-abi1'}}
33 |         ${{ inputs.builder-image && 'BUILDER_IMAGE=' }}${{ inputs.builder-image }}
31 | 


--------------------------------------------------------------------------------
/.github/workflows/tensorizer.yml:
--------------------------------------------------------------------------------
 1 | on:  # runs on manual dispatch (with a commit input), or on pushes touching the paths below
 2 |   workflow_dispatch:
 3 |     inputs:
 4 |       commit:
 5 |         description: 'Commit to build'
 6 |         required: true
 7 |         default: 'master'
 8 |   push:
 9 |     paths:
10 |       - "tensorizer/**"
11 |       - ".github/workflows/tensorizer.yml"
12 |       - ".github/workflows/build.yml"
13 | 
14 | 
15 | jobs:
16 |   build:  # delegates to the shared build workflow
17 |     uses: ./.github/workflows/build.yml
18 |     secrets: inherit
19 |     with:  # build-args takes KEY=VALUE entries, not CLI flags; COMMIT is passed only when a commit input exists
20 |       image-name: tensorizer
21 |       folder: tensorizer
22 |       build-args: "${{ inputs.commit && 'COMMIT=' }}${{ inputs.commit }}"
23 | 


--------------------------------------------------------------------------------
/.github/workflows/torch-base.yml:
--------------------------------------------------------------------------------
 1 | name: torch-base
 2 | 
 3 | on:  # runs on manual dispatch (with optional overrides), or on pushes touching the paths below
 4 |   workflow_dispatch:
 5 |     inputs:
 6 |       image-name:
 7 |         required: false
 8 |         description: "Custom name under which to publish the resulting container"
 9 |         type: string
10 |       image-tag-suffix:
11 |         required: false
12 |         description: "Custom tag suffix listing library versions under which to publish the resulting container"
13 |         type: string
14 |   push:
15 |     paths:
16 |       - "torch/**"
17 |       - ".github/configurations/torch-base.yml"
18 |       - ".github/workflows/torch-base.yml"
19 |       - ".github/workflows/torch.yml"
20 |       - ".github/workflows/build.yml"
21 | 
22 | 
23 | jobs:
24 |   get-config:  # loads the matrix definition from the configuration file as JSON
25 |     name: Get torch:base Config
26 |     uses: ./.github/workflows/read-configuration.yml
27 |     with:
28 |       path: ./.github/configurations/torch-base.yml
29 |   build:  # one call of the torch workflow per matrix combination
30 |     name: Build torch:base
31 |     needs: get-config
32 |     strategy:  # the whole matrix comes from the JSON produced by get-config
33 |       matrix: ${{ fromJSON(needs.get-config.outputs.config) }}
34 |     uses: ./.github/workflows/torch.yml
35 |     secrets: inherit
36 |     # tag = "base-cuda<cuda>-<os>-" followed by the image-tag-suffix input or, when that is
37 |     # empty, "torch<torch>-vision<vision>-audio<audio>-abi<abi>" built from the matrix values.
38 |     with:
39 |       image-name: ${{ inputs.image-name }}
40 |       tag: ${{ format('{0}-{1}', format('base-cuda{0}-{1}', matrix.cuda, matrix.os), inputs.image-tag-suffix || format('torch{0}-vision{1}-audio{2}-abi{3}', matrix.torch, matrix.vision, matrix.audio, matrix.abi)) }}
41 |       builder-base-image: nvidia/cuda:${{ matrix.cuda }}-devel-${{ matrix.os }}
42 |       base-image: nvidia/cuda:${{ matrix.cuda }}-base-${{ matrix.os }}
43 |       torch-version: ${{ matrix.torch }}
44 |       torchvision-version: ${{ matrix.vision }}
45 |       torchaudio-version: ${{ matrix.audio }}
46 |       additional-build-args: BUILD_CXX11_ABI=${{ matrix.abi }}
47 |       cache-key: base-cuda${{ matrix.cuda }}-${{ matrix.os }}
48 |       build-extras: true
47 | 


--------------------------------------------------------------------------------
/.github/workflows/torch-extras.yml:
--------------------------------------------------------------------------------
  1 | name: torch-extras
  2 | 
  3 | on:
  4 |   workflow_call:  # entry point used when another workflow calls this one
  5 |     inputs:
  6 |       tag:
  7 |         required: true
  8 |         type: string
  9 |       base-image:
 10 |         required: true
 11 |         type: string
 12 |       image-name:
 13 |         required: false
 14 |         type: string
 15 |       skip-bases-check:
 16 |         required: false
 17 |         type: boolean
 18 |         default: true
 19 |       cache-key:
 20 |         required: false
 21 |         type: string
 22 | 
 23 |   workflow_dispatch:  # manual entry point; same inputs except cache-key
 24 |     inputs:
 25 |       tag:
 26 |         required: false
 27 |         description: 'Tag suffix to identify the build'
 28 |         type: string
 29 |       base-image:
 30 |         required: false
 31 |         description: 'Base image for the build'
 32 |         type: string
 33 |       image-name:
 34 |         required: false
 35 |         description: 'Custom name under which to publish the resulting container'
 36 |         type: string
 37 |       skip-bases-check:
 38 |         required: false
 39 |         description: 'Build from one specific image rather than the most recent releases from the main branch'
 40 |         type: boolean
 41 |         default: true
 42 | 
 43 |   push:  # pushes touching the paths below
 44 |     paths:
 45 |       - 'torch-extras/**'
 46 |       - '.github/workflows/torch-extras.yml'
 47 |       - '.github/workflows/build.yml'
 48 | 
 49 | 
 50 | jobs:
 51 |   get-required-bases:  # picks which existing torch images torch-extras should be rebuilt on top of
 52 |     name: Get Latest Required Base Images
 53 |     if: inputs.skip-bases-check != true
 54 |     runs-on: [ cw ]
 55 |     container: 'ghcr.io/coreweave/github-actions-images/github-base-runner:v1.4.0'
 56 |     defaults:
 57 |       run:
 58 |         shell: bash
 59 |     permissions:
 60 |       packages: read
 61 |     outputs:  # JSON array of {tag, image} objects, consumed as the build-self matrix
 62 |       bases-list: ${{ steps.choose-bases.outputs.list }}
 63 |     steps:
 64 |       - uses: actions/checkout@v4
 65 |         with:
 66 |           fetch-depth: 0
 67 |       - name: Check if torch-extras needs to be rebuilt from previous bases
 68 |         id: check-changed
 69 |         run: |
 70 |           if [ "$EVENT_NAME" = 'push' ]; then \
 71 |             if [ "$FORCE_PUSH" = '1' ] || \
 72 |               [ "$BEFORE_HASH" = '0000000000000000000000000000000000000000' ] && [ -n "$FIRST_COMMIT" ]; then \
 73 |               export BEFORE_HASH="$FIRST_COMMIT~";
 74 |             fi && \
 75 |             CHANGED_FILES="$(git diff --name-only "$BEFORE_HASH" "$AFTER_HASH")" && \
 76 |             { \
 77 |               echo "$CHANGED_FILES" \
 78 |               | grep -P '^(\./)?(torch/|\.github/workflows/torch(-base)?\.yml|\.github/workflows/build\.yml)' > /dev/null \
 79 |                 && echo "BASE_PROVIDED=true" >> "$GITHUB_OUTPUT" \
 80 |                 || echo "BASE_PROVIDED=false" >> "$GITHUB_OUTPUT"; \
 81 |             } && { \
 82 |               echo "$CHANGED_FILES" \
 83 |               | grep -P '^(\./)?(torch/|\.github/workflows/torch(-nccl)?\.yml|\.github/workflows/build\.yml)' > /dev/null \
 84 |                 && echo "NCCL_PROVIDED=true" >> "$GITHUB_OUTPUT" \
 85 |                 || echo "NCCL_PROVIDED=false" >> "$GITHUB_OUTPUT"; \
 86 |             }; \
 87 |           else \
 88 |             echo "BASE_PROVIDED=false" >> "$GITHUB_OUTPUT" && \
 89 |             echo "NCCL_PROVIDED=false" >> "$GITHUB_OUTPUT";
 90 |           fi
 91 |         env:  # push-event details; on other events the script reports both *_PROVIDED outputs as false
 92 |           EVENT_NAME: ${{ github.event_name }}
 93 |           BEFORE_HASH: ${{ github.event.before }}
 94 |           AFTER_HASH: ${{ github.event.after }}
 95 |           FIRST_COMMIT: ${{ github.event.commits[0].id }}
 96 |           FORCE_PUSH: ${{ github.event.forced && '1' || '' }}
 97 |       - name: Get latest torch container releases
 98 |         if: steps.check-changed.outputs.BASE_PROVIDED != 'true' || steps.check-changed.outputs.NCCL_PROVIDED != 'true'
 99 |         id: get-latest
100 |         run: |
101 |           RELEASES="$( \
102 |             /bin/curl -f -s --oauth2-bearer "$(echo "$BEARER_TOKEN" | base64 -w 0)" \
103 |               'https://ghcr.io/v2/coreweave/ml-containers%2Ftorch/tags/list?n=100000' \
104 |             | jq -r '.["tags"][]' \
105 |             | grep -E '^[0-9a-f]{7}-(base|nccl)-' \
106 |           )" && \
107 |           BASE_RELEASES="$(echo "$RELEASES" | grep -E '^[0-9a-f]{7}-base-')" && \
108 |           NCCL_RELEASES="$(echo "$RELEASES" | grep -E '^[0-9a-f]{7}-nccl-')" && \
109 |           LATEST_BASE_COMMIT="$(echo "$BASE_RELEASES" | tail -1 | cut -c1-7)" && \
110 |           LATEST_NCCL_COMMIT="$(echo "$NCCL_RELEASES" | tail -1 | cut -c1-7)" && \
111 |           LATEST_BASE_IMAGES="$(echo "$BASE_RELEASES" | grep -F "${LATEST_BASE_COMMIT}-")" && \
112 |           LATEST_NCCL_IMAGES="$(echo "$NCCL_RELEASES" | grep -F "${LATEST_NCCL_COMMIT}-")" && \
113 |           echo "LATEST_BASE_IMAGES=$(echo $LATEST_BASE_IMAGES)" >> "$GITHUB_OUTPUT" && \
114 |           echo "LATEST_NCCL_IMAGES=$(echo $LATEST_NCCL_IMAGES)" >> "$GITHUB_OUTPUT"
115 |         env:  # NOTE(review): the script treats the last listed tag as the latest release - confirm the registry's ordering
116 |           BEARER_TOKEN: ${{ secrets.GITHUB_TOKEN }}
117 |       - name: Choose which torch containers to use as a build base
118 |         if: steps.check-changed.outputs.BASE_PROVIDED != 'true' || steps.check-changed.outputs.NCCL_PROVIDED != 'true'
119 |         id: choose-bases
120 |         run: |
121 |           TAG_TO_JSON() {
122 |             TAG_PATTERN='^[0-9a-f]{7}-(.*)';
123 |             JSON_REPLACE='{"tag":"\1","image":"ghcr.io/coreweave/ml-containers/torch:\0"}';
124 |             sed -E -e "s@${TAG_PATTERN}@${JSON_REPLACE}@g";
125 |           } && \
126 |           SPLIT_TO_LINES() { xargs -n 1; } && \
127 |           JOIN_LINES() { tr '[:space:]' ',' | sed -e 's/,$//'; } && \
128 |           echo '## Pre-existing `ghcr.io/coreweave/ml-containers/torch` images to build from' >> "$GITHUB_STEP_SUMMARY" && \
129 |           echo "list=[$( \
130 |             ( \
131 |               if [ "$BASE_PROVIDED" = 'false' ]; then \
132 |                 echo "$LATEST_BASE_IMAGES" | xargs -n 1 echo '-' >> "$GITHUB_STEP_SUMMARY" && \
133 |                 echo "$LATEST_BASE_IMAGES"; \
134 |               fi && \
135 |               if [ "$NCCL_PROVIDED" = 'false' ]; then \
136 |                 echo "$LATEST_NCCL_IMAGES" | xargs -n 1 echo '-' >> "$GITHUB_STEP_SUMMARY" && \
137 |                 echo "$LATEST_NCCL_IMAGES"; \
138 |               fi; \
139 |             ) | SPLIT_TO_LINES | TAG_TO_JSON | JOIN_LINES \
140 |           )]" >> "$GITHUB_OUTPUT";
141 |         env:  # outputs of the two previous steps, passed to the script as environment variables
142 |           BASE_PROVIDED: ${{ steps.check-changed.outputs.BASE_PROVIDED }}
143 |           NCCL_PROVIDED: ${{ steps.check-changed.outputs.NCCL_PROVIDED }}
144 |           LATEST_BASE_IMAGES: ${{ steps.get-latest.outputs.LATEST_BASE_IMAGES }}
145 |           LATEST_NCCL_IMAGES: ${{ steps.get-latest.outputs.LATEST_NCCL_IMAGES }}
146 | 
147 |   build-call:  # builds from the single base image given in the inputs; runs only when skip-bases-check is truthy
148 |     name: Build torch-extras via Workflow Call
149 |     if: inputs.skip-bases-check
150 |     uses: ./.github/workflows/build.yml
151 |     secrets: inherit
152 |     with:  # image name falls back to 'torch-extras' when the input is empty
153 |       image-name: ${{ inputs.image-name || 'torch-extras' }}
154 |       folder: torch-extras
155 |       tag-suffix: ${{ inputs.tag }}
156 |       cache-key: ${{ inputs.cache-key }}
157 |       build-args: |
158 |         BASE_IMAGE=${{ inputs.base-image }}
159 | 
160 |   build-self:  # one build per base image chosen by get-required-bases; skipped when that list is empty
161 |     name: Build torch-extras via Event Trigger
162 |     needs: get-required-bases
163 |     if: needs.get-required-bases.outputs.bases-list && needs.get-required-bases.outputs.bases-list != '[]'
164 |     strategy:
165 |       matrix:  # each entry is an object with `tag` and `image` fields
166 |         bases: ${{ fromJSON(needs.get-required-bases.outputs.bases-list) }}
167 |     uses: ./.github/workflows/build.yml
168 |     secrets: inherit
169 |     with:  # image name falls back to 'torch-extras' when the input is empty
170 |       image-name: ${{ inputs.image-name || 'torch-extras' }}
171 |       folder: torch-extras
172 |       tag-suffix: ${{ matrix.bases.tag }}
173 |       build-args: |
174 |         BASE_IMAGE=${{ matrix.bases.image }}
175 | 


--------------------------------------------------------------------------------
/.github/workflows/torch-nccl.yml:
--------------------------------------------------------------------------------
 1 | name: torch-nccl
 2 | 
 3 | on:  # callable from other workflows, manually dispatchable, and run on pushes touching the paths below
 4 |   workflow_call:
 5 |     inputs:
 6 |       image-name:
 7 |         required: false
 8 |         type: string
 9 |       image-tag-suffix:
10 |         required: false
11 |         type: string
12 |   workflow_dispatch:
13 |     inputs:
14 |       image-name:
15 |         required: false
16 |         description: "Custom name under which to publish the resulting container"
17 |         type: string
18 |       image-tag-suffix:
19 |         required: false
20 |         description: "Custom tag suffix listing library versions under which to publish the resulting container"
21 |         type: string
22 |   push:
23 |     paths:
24 |       - "torch/**"
25 |       - ".github/configurations/torch-nccl.yml"
26 |       - ".github/workflows/torch-nccl.yml"
27 |       - ".github/workflows/torch.yml"
28 |       - ".github/workflows/build.yml"
29 | 
30 | 
31 | jobs:
32 |   get-config:  # loads the matrix definition from the configuration file as JSON
33 |     name: Get torch:nccl Config
34 |     uses: ./.github/workflows/read-configuration.yml
35 |     with:
36 |       path: ./.github/configurations/torch-nccl.yml
37 |   build:  # one call of the torch workflow per matrix combination
38 |     name: Build torch:nccl
39 |     needs: get-config
40 |     strategy:  # the whole matrix comes from the JSON produced by get-config
41 |       matrix: ${{ fromJSON(needs.get-config.outputs.config) }}
42 |     uses: ./.github/workflows/torch.yml
43 |     secrets: inherit
44 |     # tag = "nccl-cuda<cuda>-<os>-nccl<nccl>-" followed by the image-tag-suffix input or, when
45 |     # that is empty, "torch<torch>-vision<vision>-audio<audio>-abi<abi>" from the matrix values.
46 |     # builder-base-image and base-image are both set to the same nccl-tests devel image.
47 |     with:
48 |       image-name: ${{ inputs.image-name }}
49 |       tag: ${{ format('{0}-{1}', format('nccl-cuda{0}-{1}-nccl{2}', matrix.cuda, matrix.os, matrix.nccl), inputs.image-tag-suffix || format('torch{0}-vision{1}-audio{2}-abi{3}', matrix.torch, matrix.vision, matrix.audio, matrix.abi)) }}
50 |       builder-base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.cuda }}-devel-${{ matrix.os }}-nccl${{ matrix.nccl }}-${{ matrix.nccl-tests-hash }}
51 |       base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.cuda }}-devel-${{ matrix.os }}-nccl${{ matrix.nccl }}-${{ matrix.nccl-tests-hash }}
52 |       torch-version: ${{ matrix.torch }}
53 |       torchvision-version: ${{ matrix.vision }}
54 |       torchaudio-version: ${{ matrix.audio }}
55 |       additional-build-args: BUILD_CXX11_ABI=${{ matrix.abi }}
56 |       cache-key: nccl-cuda${{ matrix.cuda }}-${{ matrix.os }}
57 |       build-extras: true
55 | 


--------------------------------------------------------------------------------
/.github/workflows/torch-nightly.yml:
--------------------------------------------------------------------------------
  1 | name: torch-nightly
  2 | 
  3 | on:
  4 |   workflow_dispatch:
  5 |   schedule:
  6 |     # At 05:00 UTC (midnight EST)
  7 |     - cron: "0 5 * * *"
  8 |   push:
  9 |     paths:
 10 |       - "torch/**"
 11 |       - ".github/configurations/torch-base.yml"
 12 |       - ".github/configurations/torch-nccl.yml"
 13 |       - ".github/workflows/torch-nightly.yml"
 14 |       - ".github/workflows/torch.yml"
 15 |       - ".github/workflows/build.yml"
 16 | 
 17 | 
 18 | jobs:
 19 |   get-nightly-info:  # Resolves the latest upstream commits, a version string, and a date stamp for the nightly builds.
 20 |     name:
 21 |       Get Nightly Info
 22 |     runs-on: [ cw ]
 23 |     container: 'ghcr.io/coreweave/github-actions-images/github-base-runner:v1.9.0'
 24 |     defaults:
 25 |       run:
 26 |         shell: bash
 27 |     outputs:  # Re-exported from the get-hash and get-date steps below.
 28 |       pytorch-commit: ${{ steps.get-hash.outputs.pytorch-commit }}
 29 |       triton-commit: ${{ steps.get-hash.outputs.triton-commit }}
 30 |       torchvision-commit: ${{ steps.get-hash.outputs.torchvision-commit }}
 31 |       torchaudio-commit: ${{ steps.get-hash.outputs.torchaudio-commit }}
 32 |       version-string: ${{ steps.get-hash.outputs.version-string }}
 33 |       date: ${{ steps.get-date.outputs.date }}
 34 |     steps:
 35 |       - name: Get latest commit hashes
 36 |         id: get-hash
 37 |         run: |
 38 |           set -e;
 39 |           # Render commit $2 of repository $1 as a Markdown link showing the first 7 characters of the hash.
 40 |           FORMAT_COMMIT_LINK() {
 41 |             printf '[`%.7s`](https://github.com/%s/tree/%s)\n' "$2" "$1" "$2";
 42 |           };
 43 |           # printf-style append to the step summary file.
 44 |           LOG() {
 45 |             printf -- "$@" >> "$GITHUB_STEP_SUMMARY";
 46 |           };
 47 |           # Shallow-clone github.com/$1 into directory $2, log its HEAD commit, and print the hash on stdout.
 48 |           CLONE() {
 49 |             local COMMIT;  # Declared on its own: `local VAR="$(cmd)"` returns the status of `local`, hiding a failed cmd.
 50 |             git clone --filter=blob:none --no-checkout --depth=1 \
 51 |               "https://github.com/$1" "$2" > /dev/null 2> /dev/null && \
 52 |             COMMIT="$(git -C "$2" rev-parse HEAD)" && \
 53 |             LOG 'Latest `%s` commit: %s\n' \
 54 |               "$1" "$(FORMAT_COMMIT_LINK "$1" "$COMMIT")" && \
 55 |             echo "$COMMIT";
 56 |           };
 57 |           # Print the contents of version.txt at HEAD of the clone in directory $1.
 58 |           GET_VERSION() {
 59 |             git -C "$1" show HEAD:version.txt 2> /dev/null;
 60 |           };
 61 |           # PyTorch: commit, version, and the Triton commit read from TRITON_COMMIT_FILE in the same checkout.
 62 |           PYTORCH_COMMIT="$(CLONE pytorch/pytorch pytorch-git)";
 63 |           PYTORCH_VERSION="$(GET_VERSION pytorch-git)";
 64 |           TRITON_COMMIT_FILE=".ci/docker/ci_commit_pins/triton.txt";
 65 |           TRITON_COMMIT="$(git -C pytorch-git show "HEAD:$TRITON_COMMIT_FILE" 2> /dev/null)";
 66 |           rm -rf pytorch-git;
 67 | 
 68 |           LOG 'Corresponding `openai/triton` commit: %s\n' \
 69 |             "$(FORMAT_COMMIT_LINK openai/triton "$TRITON_COMMIT")";
 70 | 
 71 |           TORCHVISION_COMMIT="$(CLONE pytorch/vision torchvision-git)";
 72 |           TORCHVISION_VERSION="$(GET_VERSION torchvision-git)";
 73 |           rm -rf torchvision-git;
 74 | 
 75 |           TORCHAUDIO_COMMIT="$(CLONE pytorch/audio torchaudio-git)";
 76 |           TORCHAUDIO_VERSION="$(GET_VERSION torchaudio-git)";
 77 |           rm -rf torchaudio-git;
 78 |           # Publish the step outputs consumed by the job-level `outputs` mapping.
 79 |           echo "pytorch-commit=$PYTORCH_COMMIT" >> "$GITHUB_OUTPUT";
 80 |           echo "triton-commit=$TRITON_COMMIT" >> "$GITHUB_OUTPUT";
 81 |           echo "torchvision-commit=$TORCHVISION_COMMIT" >> "$GITHUB_OUTPUT";
 82 |           echo "torchaudio-commit=$TORCHAUDIO_COMMIT" >> "$GITHUB_OUTPUT";
 83 | 
 84 |           printf -- 'version-string=torch%s-vision%s-audio%s\n' \
 85 |             "$PYTORCH_VERSION" "$TORCHVISION_VERSION" "$TORCHAUDIO_VERSION" \
 86 |             >> "$GITHUB_OUTPUT";
 87 |       - name: Get date
 88 |         id: get-date  # Emits the current UTC time formatted as '+%y%m%d%H'.
 89 |         run: echo "date=$(date -u '+%y%m%d%H')" >> "$GITHUB_OUTPUT";
 90 | 
 91 |   get-base-config:  # Reads the torch-base configuration through the reusable read-configuration workflow.
 92 |     name: Get torch:base Config
 93 |     uses: ./.github/workflows/read-configuration.yml
 94 |     with:
 95 |       path: ./.github/configurations/torch-base.yml
 96 |       filter: 'del(.include) | .exclude |= . + [{"abi": "0"}]'  # Drops the whole `include` list and appends an abi "0" entry to `exclude` (looks like jq syntax; confirm in read-configuration.yml).
 97 |   get-nccl-config:  # Same as above, for the torch-nccl configuration.
 98 |     name: Get torch:nccl Config
 99 |     uses: ./.github/workflows/read-configuration.yml
100 |     with:
101 |       path: ./.github/configurations/torch-nccl.yml
102 |       filter: 'del( .include[] | ( .torch, .vision, .audio ) ) | .exclude |= . + [{"abi": "0"}]'  # Removes only the torch/vision/audio keys from each `include` entry, then appends the same abi "0" exclusion.
103 | 
104 |   build-base:  # Calls torch.yml once per get-base-config matrix entry, using nvidia/cuda images as bases.
105 |     name: Build Nightly torch:base
106 |     needs:
107 |       - get-nightly-info
108 |       - get-base-config
109 |     strategy:
110 |       fail-fast: false  # One failing matrix entry does not cancel the others.
111 |       matrix: ${{ fromJSON(needs.get-base-config.outputs.config) }}
112 |     uses: ./.github/workflows/torch.yml
113 |     secrets: inherit
114 |     with:
115 |       image-name: nightly-torch
116 |       tag: ${{ format('base-{0}-cuda{1}-{2}-{3}', needs.get-nightly-info.outputs.date, matrix.cuda, matrix.os, needs.get-nightly-info.outputs.version-string) }}  # 'base-<date>-cuda<cuda>-<os>-<version-string>'.
117 |       builder-base-image: nvidia/cuda:${{ matrix.cuda }}-devel-${{ matrix.os }}
118 |       base-image: nvidia/cuda:${{ matrix.cuda }}-base-${{ matrix.os }}
119 |       torch-version: ${{ needs.get-nightly-info.outputs.pytorch-commit }}  # Commit hashes from get-nightly-info are passed as the versions.
120 |       torchvision-version: ${{ needs.get-nightly-info.outputs.torchvision-commit }}
121 |       torchaudio-version: ${{ needs.get-nightly-info.outputs.torchaudio-commit }}
122 |       additional-build-args: BUILD_TRITON_VERSION=${{ needs.get-nightly-info.outputs.triton-commit }}  # Triton commit read from the PyTorch checkout by get-nightly-info.
123 |       cache-key: base-cuda${{ matrix.cuda }}-${{ matrix.os }}
124 |       build-extras: true
125 |   build-nccl:  # Same as build-base, but driven by get-nccl-config and using nccl-tests images for both builder and final base.
126 |     name: Build Nightly torch:nccl
127 |     needs:
128 |       - get-nightly-info
129 |       - get-nccl-config
130 |     strategy:
131 |       fail-fast: false
132 |       matrix: ${{ fromJSON(needs.get-nccl-config.outputs.config) }}
133 |     uses: ./.github/workflows/torch.yml
134 |     secrets: inherit
135 |     with:
136 |       image-name: nightly-torch
137 |       tag: ${{ format('nccl-{0}-cuda{1}-{2}-nccl{3}-{4}', needs.get-nightly-info.outputs.date, matrix.cuda, matrix.os, matrix.nccl, needs.get-nightly-info.outputs.version-string ) }}  # 'nccl-<date>-cuda<cuda>-<os>-nccl<nccl>-<version-string>'.
138 |       builder-base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.cuda }}-devel-${{ matrix.os }}-nccl${{ matrix.nccl }}-${{ matrix.nccl-tests-hash }}
139 |       base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.cuda }}-devel-${{ matrix.os }}-nccl${{ matrix.nccl }}-${{ matrix.nccl-tests-hash }}
140 |       torch-version: ${{ needs.get-nightly-info.outputs.pytorch-commit }}
141 |       torchvision-version: ${{ needs.get-nightly-info.outputs.torchvision-commit }}
142 |       torchaudio-version: ${{ needs.get-nightly-info.outputs.torchaudio-commit }}
143 |       additional-build-args: BUILD_TRITON_VERSION=${{ needs.get-nightly-info.outputs.triton-commit }}
144 |       cache-key: nccl-cuda${{ matrix.cuda }}-${{ matrix.os }}
145 |       build-extras: true
146 | 


--------------------------------------------------------------------------------
/.github/workflows/torch.yml:
--------------------------------------------------------------------------------
  1 | on:
  2 |   workflow_call:
  3 |     inputs:
  4 |       tag:
  5 |         required: true
  6 |         type: string
  7 |       builder-base-image:
  8 |         required: true
  9 |         type: string
 10 |       base-image:
 11 |         required: true
 12 |         type: string
 13 |       torch-version:
 14 |         required: true
 15 |         type: string
 16 |       torchvision-version:
 17 |         required: true
 18 |         type: string
 19 |       torchaudio-version:
 20 |         required: true
 21 |         type: string
 22 |       additional-build-args:
 23 |         required: false
 24 |         type: string
 25 |       image-name:
 26 |         required: false
 27 |         type: string
 28 |       build-extras:
 29 |         required: false
 30 |         type: boolean
 31 |         default: false
 32 |       cache-key:
 33 |         required: false
 34 |         type: string
 35 | 
 36 |   workflow_dispatch:
 37 |     inputs:
 38 |       tag:
 39 |         required: true
 40 |         description: "Tag suffix to identify the build"
 41 |         type: string
 42 |       builder-base-image:
 43 |         required: true
 44 |         description: "Base image used during the compilation step"
 45 |         type: string
 46 |       base-image:
 47 |         required: true
 48 |         description: "Base image for the final image"
 49 |         type: string
 50 |       torch-version:
 51 |         required: true
 52 |         description: "Tagged version number from pytorch/pytorch to build"
 53 |         type: string
 54 |       torchvision-version:
 55 |         required: true
 56 |         description: "Tagged version number from pytorch/vision to build"
 57 |         type: string
 58 |       torchaudio-version:
 59 |         required: true
 60 |         description: "Tagged version number from pytorch/audio to build"
 61 |         type: string
 62 |       additional-build-args:
 63 |         required: false
 64 |         description: "Further --build-arg parameters for the build"
 65 |         type: string
 66 |       image-name:
 67 |         required: false
 68 |         description: "Custom name under which to publish the resulting container"
 69 |         type: string
 70 |       build-extras:
 71 |         required: false
 72 |         description: "Whether to build and push a torch-extras container as well"
 73 |         type: boolean
 74 |         default: false
 75 | 
 76 | jobs:  # Builds the torch image, then optionally a torch-extras image on top of it.
 77 |   build:  # Delegates to the reusable build.yml workflow with the torch folder and assembled build arguments.
 78 |     name: Build torch
 79 |     uses: ./.github/workflows/build.yml
 80 |     secrets: inherit
 81 |     with:
 82 |       image-name: ${{ inputs.image-name || 'torch' }}  # Falls back to 'torch' when no image-name input is given.
 83 |       folder: torch
 84 |       tag-suffix: ${{ inputs.tag }}
 85 |       cache-key: ${{ inputs.cache-key }}  # NOTE(review): cache-key is declared under workflow_call inputs only, not workflow_dispatch.
 86 |       build-args: |
 87 |         BUILD_CCACHE_SIZE=5Gi
 88 |         BUILDER_BASE_IMAGE=${{ inputs.builder-base-image }}
 89 |         FINAL_BASE_IMAGE=${{ inputs.base-image }}
 90 |         BUILD_TORCH_VERSION=${{ inputs.torch-version }}
 91 |         BUILD_TORCH_VISION_VERSION=${{ inputs.torchvision-version }}
 92 |         BUILD_TORCH_AUDIO_VERSION=${{ inputs.torchaudio-version }}
 93 |         ${{ inputs.additional-build-args }}
 94 |   build-extras:  # Runs only after `build`, and only when the build-extras input is true.
 95 |     name: Build torch-extras
 96 |     if: inputs.build-extras
 97 |     needs: build
 98 |     uses: ./.github/workflows/torch-extras.yml
 99 |     secrets: inherit
100 |     with:
101 |       tag: ${{ inputs.tag }}
102 |       base-image: ${{ needs.build.outputs.tags }}  # The `tags` output of the build job is used as the extras base image.
103 |       image-name: ${{ inputs.image-name && format('{0}-extras', inputs.image-name) || '' }}  # '<image-name>-extras' for a custom name, otherwise an empty string.
104 |       cache-key: ${{ inputs.cache-key }}
105 | 


--------------------------------------------------------------------------------
/.github/workflows/vllm-tensorizer.yml:
--------------------------------------------------------------------------------
 1 | on:
 2 |   workflow_dispatch:
 3 |     inputs:
 4 |       commit:
 5 |         description: 'Commit to build'
 6 |         required: true
 7 |   push:
 8 |     paths:
 9 |       - "vllm-tensorizer/**"
10 |       - ".github/workflows/vllm-tensorizer.yml"
11 |       - ".github/workflows/build.yml"
12 | 
13 | 
14 | jobs:
15 |   build:  # Delegates to the reusable build.yml workflow for the vllm-tensorizer folder.
16 |     uses: ./.github/workflows/build.yml
17 |     secrets: inherit
18 |     with:
19 |       image-name: vllm-tensorizer
20 |       folder: vllm-tensorizer
21 |       tag-suffix: ${{ inputs.commit || '19307ba71ddeb7e1cc6aec3c1baa8b50d59c1beb'}}  # Uses the `commit` input, or the pinned hash when it is empty; the same fallback is repeated in build-args and must be kept in sync.
22 |       build-args: |
23 |         COMMIT_HASH=${{ inputs.commit || '19307ba71ddeb7e1cc6aec3c1baa8b50d59c1beb'}}


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # General
  2 | .DS_Store
  3 | .AppleDouble
  4 | .LSOverride
  5 | 
  6 | # Icon must end with two \r
  7 | Icon
  8 | 
  9 | # Thumbnails
 10 | ._*
 11 | 
 12 | # Files that might appear in the root of a volume
 13 | .DocumentRevisions-V100
 14 | .fseventsd
 15 | .Spotlight-V100
 16 | .TemporaryItems
 17 | .Trashes
 18 | .VolumeIcon.icns
 19 | .com.apple.timemachine.donotpresent
 20 | 
 21 | # Directories potentially created on remote AFP share
 22 | .AppleDB
 23 | .AppleDesktop
 24 | Network Trash Folder
 25 | Temporary Items
 26 | .apdisk
 27 | 
 28 | *~
 29 | 
 30 | # temporary files which can be created if a process still has a handle open of a deleted file
 31 | .fuse_hidden*
 32 | 
 33 | # KDE directory preferences
 34 | .directory
 35 | 
 36 | # Linux trash folder which might appear on any partition or disk
 37 | .Trash-*
 38 | 
 39 | # .nfs files are created when an open file is removed but is still being accessed
 40 | .nfs*
 41 | 
 42 | # Swap (remove the `!*.svg` line if you don't need vector files; gitignore has no inline comments, so the note lives here)
 43 | [._]*.s[a-v][a-z]
 44 | !*.svg
 45 | [._]*.sw[a-p]
 46 | [._]s[a-rt-v][a-z]
 47 | [._]ss[a-gi-z]
 48 | [._]sw[a-p]
 49 | 
 50 | # Session
 51 | Session.vim
 52 | Sessionx.vim
 53 | 
 54 | # Temporary
 55 | .netrwhist
 56 | *~
 57 | # Auto-generated tag files
 58 | tags
 59 | # Persistent undo
 60 | [._]*.un~
 61 | 
 62 | # -*- mode: gitignore; -*-
 63 | *~
 64 | \#*\#
 65 | /.emacs.desktop
 66 | /.emacs.desktop.lock
 67 | *.elc
 68 | auto-save-list
 69 | tramp
 70 | .\#*
 71 | 
 72 | # Org-mode
 73 | .org-id-locations
 74 | *_archive
 75 | 
 76 | # flymake-mode
 77 | *_flymake.*
 78 | 
 79 | # eshell files
 80 | /eshell/history
 81 | /eshell/lastdir
 82 | 
 83 | # elpa packages
 84 | /elpa/
 85 | 
 86 | # reftex files
 87 | *.rel
 88 | 
 89 | # AUCTeX auto folder
 90 | /auto/
 91 | 
 92 | # cask packages
 93 | .cask/
 94 | dist/
 95 | 
 96 | # Flycheck
 97 | flycheck_*.el
 98 | 
 99 | # server auth directory
100 | /server/
101 | 
102 | # projectiles files
103 | .projectile
104 | 
105 | # directory configuration
106 | .dir-locals.el
107 | 
108 | # network security
109 | /network-security.data
110 | 
111 | # -*- mode: gitignore; -*-
112 | *~
113 | \#*\#
114 | /.emacs.desktop
115 | /.emacs.desktop.lock
116 | *.elc
117 | auto-save-list
118 | tramp
119 | .\#*
120 | 
121 | # Org-mode
122 | .org-id-locations
123 | *_archive
124 | 
125 | # flymake-mode
126 | *_flymake.*
127 | 
128 | # eshell files
129 | /eshell/history
130 | /eshell/lastdir
131 | 
132 | # elpa packages
133 | /elpa/
134 | 
135 | # reftex files
136 | *.rel
137 | 
138 | # AUCTeX auto folder
139 | /auto/
140 | 
141 | # cask packages
142 | .cask/
143 | dist/
144 | 
145 | # Flycheck
146 | flycheck_*.el
147 | 
148 | # server auth directory
149 | /server/
150 | 
151 | # projectiles files
152 | .projectile
153 | 
154 | # directory configuration
155 | .dir-locals.el
156 | 
157 | # network security
158 | /network-security.data
159 | 
160 | # local environment files
161 | .env
162 | .env*
163 | .environment
164 | .environment*
165 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright 2022 CoreWeave
2 | 
3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4 | 
5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6 | 
7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


--------------------------------------------------------------------------------
/bloom/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM coreweave/nccl-tests:2022-11-06_19-21-22.11_EST
 2 | 
 3 | # setup python and conda
 4 | 
 5 | RUN DEBIAN_FRONTEND=noninteractive apt-get -qq update && \
 6 |         DEBIAN_FRONTEND=noninteractive apt-get -qq install -y --no-install-recommends \
 7 |         python3 python3-dev python3-pip git libssl-dev pkg-config
 8 | 
 9 | RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
10 |     bash Miniconda3-latest-Linux-x86_64.sh -b -p /opt/conda && \
11 |     rm Miniconda3-latest-Linux-x86_64.sh
12 | 
13 | ENV PATH=/opt/conda/bin:$PATH
14 | 
15 | # Create the conda environment from the copied file, using its absolute path so the step does not depend on the base image's WORKDIR. NOTE(review): this comment used to say the step installs torch, but environment.yaml lists no torch package; confirm.
16 | COPY ./environment.yaml /opt/nccl-tests/environment.yaml
17 | RUN conda env create -f /opt/nccl-tests/environment.yaml
18 | 
19 | SHELL ["conda", "run", "-n", "tr11-176B-ml", "/bin/bash", "-c"]
20 | 
21 | # setup rust and then tokenizers
22 | RUN conda install -y -c conda-forge rust
23 | 
24 | RUN git clone https://github.com/huggingface/tokenizers && \
25 |     cd tokenizers && \
26 |     git checkout bigscience_fork && \
27 |     pip install setuptools_rust && \
28 |     pip install -e bindings/python
29 | 
30 | # install apex
31 | RUN git clone https://github.com/NVIDIA/apex && \
32 |     cd apex && \
33 |     pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./ --upgrade
34 | 
35 | # install deepspeed from here:
36 | # https://github.com/microsoft/DeepSpeed/tree/olruwase/elastic-ckpt-refresh
37 | RUN pip install git+https://github.com/microsoft/DeepSpeed.git@olruwase/elastic-ckpt-refresh --upgrade
38 | 
39 | # clone bigscience repo
40 | RUN git clone https://github.com/bigscience-workshop/bigscience.git
41 | 
42 | # https://github.com/bigscience-workshop/Megatron-DeepSpeed/tree/olruwase/ds_ckpt_reshape
43 | RUN git clone --single-branch --branch olruwase/ds_ckpt_reshape https://github.com/bigscience-workshop/Megatron-DeepSpeed.git
44 | 


--------------------------------------------------------------------------------
/bloom/environment.yaml:
--------------------------------------------------------------------------------
  1 | name: tr11-176B-ml
  2 | channels:
  3 |   - conda-forge
  4 |   - defaults
  5 | dependencies:
  6 |   - _libgcc_mutex=0.1=conda_forge
  7 |   - _openmp_mutex
  8 |   - ca-certificates=2022.6.15=ha878542_0
  9 |   - certifi=2022.6.15=py38h578d9bd_0
 10 |   - ld_impl_linux-64
 11 |   - libffi
 12 |   - libgcc-ng
 13 |   - libgomp
 14 |   - libstdcxx-ng>=10.3.0
 15 |   - nccl=2.10.3.1=hdc17891_0
 16 |   - ncurses=6.3=h7f8727e_2
 17 |   - openssl=1.1.1q=h7f8727e_0
 18 |   - pip=21.2.4=py38h06a4308_0
 19 |   - python=3.8.12=h12debd9_0
 20 |   - python_abi=3.8=2_cp38
 21 |   - readline=8.1.2=h7f8727e_1
 22 |   - setuptools=58.0.4=py38h06a4308_0
 23 |   - sqlite=3.37.2=hc218d9a_0
 24 |   - tk=8.6.11=h1ccaba5_0
 25 |   - wheel=0.37.1=pyhd3eb1b0_0
 26 |   - xz=5.2.5=h7b6447c_0
 27 |   - zlib=1.2.11=h7f8727e_4
 28 |   - pip:
 29 |     - absl-py==1.0.0
 30 |     - aiohttp==3.8.1
 31 |     - aiosignal==1.2.0
 32 |     - appdirs==1.4.4
 33 |     - astunparse==1.6.3
 34 |     - async-timeout==4.0.2
 35 |     - attrs==21.4.0
 36 |     - best-download==0.0.9
 37 |     - black==21.4b0
 38 |     - cachetools==5.0.0
 39 |     - chardet==4.0.0
 40 |     - charset-normalizer==2.0.12
 41 |     - click==8.0.4
 42 |     - colorama==0.4.4
 43 |     - cython==0.29.28
 44 |     - dataproperty==0.55.0
 45 |     - datasets==1.15.1
 46 |     - dill==0.3.4
 47 |     - dynet38==2.1
 48 |     - filelock==3.6.0
 49 |     - flatbuffers==2.0
 50 |     - frozenlist==1.3.0
 51 |     - fsspec==2022.2.0
 52 |     - gast==0.5.3
 53 |     - google-auth==2.6.0
 54 |     - google-auth-oauthlib==0.4.6
 55 |     - google-pasta==0.2.0
 56 |     - grpcio==1.44.0
 57 |     - h5py==3.6.0
 58 |     - hjson==3.0.2
 59 |     - huggingface-hub==0.9.0
 60 |     - idna==3.3
 61 |     - importlib-metadata==4.11.2
 62 |     - iniconfig==1.1.1
 63 |     - isort==5.10.1
 64 |     - jieba==0.42.1
 65 |     - joblib==1.1.0
 66 |     - jsonlines==2.0.0
 67 |     - keras==2.8.0
 68 |     - keras-preprocessing==1.1.2
 69 |     - libclang==14.0.1
 70 |     - lm-dataformat==0.0.20
 71 |     - lm-eval==0.2.0
 72 |     - markdown==3.3.6
 73 |     - mbstrdecoder==1.1.0
 74 |     - mock==4.0.3
 75 |     - msgfy==0.2.0
 76 |     - multidict==6.0.2
 77 |     - multiprocess==0.70.12.2
 78 |     - mypy-extensions==0.4.3
 79 |     - nagisa==0.2.7
 80 |     - ninja==1.10.2.3
 81 |     - nltk==3.7
 82 |     - numexpr==2.7.2
 83 |     - numpy==1.22.3
 84 |     - oauthlib==3.2.0
 85 |     - openai==0.6.4
 86 |     - opt-einsum==3.3.0
 87 |     - packaging==21.3
 88 |     - pandas==1.4.1
 89 |     - parameterized==0.8.1
 90 |     - pathspec==0.9.0
 91 |     - pathvalidate==2.5.0
 92 |     - pillow==9.0.1
 93 |     - pluggy==0.13.1
 94 |     - portalocker==2.4.0
 95 |     - protobuf==3.19.4
 96 |     - psutil==5.9.0
 97 |     - py==1.11.0
 98 |     - py-cpuinfo==8.0.0
 99 |     - py-spy==0.3.11
100 |     - pyarrow==7.0.0
101 |     - pyasn1==0.4.8
102 |     - pyasn1-modules==0.2.8
103 |     - pybind11==2.6.2
104 |     - pycountry==20.7.3
105 |     - pydantic==1.9.1
106 |     - pyparsing==3.0.7
107 |     - pytablewriter==0.58.0
108 |     - pytest==6.2.3
109 |     - pytest-instafail==0.4.2
110 |     - python-dateutil==2.8.2
111 |     - pytz==2021.3
112 |     - pyyaml==6.0
113 |     - regex==2022.3.2
114 |     - rehash==1.0.0
115 |     - requests==2.27.1
116 |     - requests-oauthlib==1.3.1
117 |     - responses==0.18.0
118 |     - rouge-score==0.0.4
119 |     - rsa==4.8
120 |     - sacrebleu==1.5.0
121 |     - sacremoses==0.0.47
122 |     - scikit-learn==1.0.2
123 |     - scipy==1.8.0
124 |     - semantic-version==2.9.0
125 |     - sentencepiece==0.1.96
126 |     - setuptools-rust==1.1.2
127 |     - six==1.16.0
128 |     - sqlitedict==1.6.0
129 |     - tabledata==1.3.0
130 |     - tcolorpy==0.1.2
131 |     - tensorboard==2.8.0
132 |     - tensorboard-data-server==0.6.1
133 |     - tensorboard-plugin-wit==1.8.1
134 |     - tensorflow==2.8.0
135 |     - tensorflow-io-gcs-filesystem==0.25.0
136 |     - termcolor==1.1.0
137 |     - tf-estimator-nightly==2.8.0.dev2021122109
138 |     - tf-slim==1.1.0
139 |     - threadpoolctl==3.1.0
140 |     - toml==0.10.2
141 |     - tomli==2.0.1
142 |     - tqdm==4.63.0
143 |     - tqdm-multiprocess==0.0.11
144 |     - typepy==1.3.0
145 |     - typing-extensions==4.1.1
146 |     - ujson==5.2.0
147 |     - urllib3==1.26.8
148 |     - werkzeug==2.0.3
149 |     - wrapt==1.14.0
150 |     - xxhash==3.0.0
151 |     - yarl==1.7.2
152 |     - zipp==3.7.0
153 |     - zstandard==0.15.2


--------------------------------------------------------------------------------
/catalog.yaml:
--------------------------------------------------------------------------------
 1 | ---
 2 | apiVersion: backstage.io/v1alpha1
 3 | kind: Component
 4 | metadata:
 5 |   name: ml-containers
 6 |   annotations:
 7 |     backstage.io/techdocs-ref: dir:.
 8 |   description: Optimized images for training/inference on CoreWeave infrastructure
 9 |   tags:
10 |     - ml
11 |   # links:
12 |   #   - title: Deployment Manifests
13 |   #     url: https://github.com/coreweave/awesome-turtles/tree/main/deploy
14 |   #     icon: github
15 |   customer_impact: true
16 |   stateless: false
17 | spec:
18 |   type: service
19 |   lifecycle: production
20 |   owner: group:cw/team_ml


--------------------------------------------------------------------------------
/cuda-ssh/Dockerfile:
--------------------------------------------------------------------------------
 1 | ARG BASE_IMAGE="ghcr.io/coreweave/ml-containers/torch:ceeb8c2-base-cuda11.8.0-torch2.0.1-vision0.15.2-audio2.0.2"
 2 | 
 3 | FROM ${BASE_IMAGE}
 4 | 
 5 | RUN apt-get -qq update && \
 6 |     DEBIAN_FRONTEND=noninteractive \
 7 |       apt-get -qq install --no-install-recommends -y \
 8 |       # Critical packages:
 9 |       ssh ca-certificates tini bash \
10 |       # Helpful packages:
11 |       libncurses5 curl wget sudo htop git rsync locales \
12 |       tmux unzip nano vim apt-utils iputils-ping && \
13 |     apt-get clean && \
14 |     # SSH passes the client's LANG and LC_* environment variables by default.
15 |     # However, the only pre-installed locales on most container images are
16 |     # C, C.UTF-8, and POSIX. This adds the en_US.UTF-8 locale as well,
17 |     # and leaves locale-gen available to install others.
18 |     locale-gen en_US.UTF-8 && \
19 |     # Wipe the server-side SSH keys on the container image level
20 |     # to prevent leaking the private host keys, which could
21 |     # potentially allow impersonation of the SSH server by an attacker.
22 |     rm /etc/ssh/ssh_host_*
23 | 
24 | # Since there are no host keys, the SSH server
25 | # MUST be configured at runtime by running:
26 | #   dpkg-reconfigure openssh-server
27 | # (Or by adding custom host key files to /etc/ssh/) before launching it with:
28 | #   service ssh start
29 | # Or (blocking):
30 | #   service ssh start -D
31 | 
32 | RUN \
33 |     # Configure the privilege separation directory for sshd
34 |     # See here for details: https://github.com/openssh/openssh-portable/blob/master/README.privsep
35 |     install -d --mode=0755 --owner=0 --group=0 /var/run/sshd && \
36 |     # Configure an empty authorized keys file with correct permissions
37 |     install -d --mode=0700 --owner=0 --group=0 /root/.ssh && \
38 |     install --mode=600 --owner=0 --group=0 /dev/null /root/.ssh/authorized_keys && \
39 |     # Allow only public key authentication (NOTE(review): "without-password" is the deprecated spelling of "prohibit-password" in sshd_config; consider updating)
40 |     install --mode=600 --owner=0 --group=0 /dev/null /etc/ssh/sshd_config.d/10-key-auth.conf && \
41 |     echo "PasswordAuthentication no" >> /etc/ssh/sshd_config.d/10-key-auth.conf && \
42 |     echo "PermitRootLogin without-password" >> /etc/ssh/sshd_config.d/10-key-auth.conf && \
43 |     # Prevent the user from being kicked off after login
44 |     # See here for details: https://stackoverflow.com/questions/21391142
45 |     sed -i -E -e \
46 |       's:session(\s*)required(\s*)pam_loginuid\.so:session\1optional\2pam_loginuid.so:g' \
47 |       /etc/pam.d/sshd && \
48 |     # Fix sudo bug: https://github.com/sudo-project/sudo/issues/42
49 |     echo 'Set disable_coredump false' >> /etc/sudo.conf
50 | 
51 | RUN chsh -s /bin/bash root
52 | 
53 | EXPOSE 22
54 | 


--------------------------------------------------------------------------------
/cw-mega-sam/Dockerfile:
--------------------------------------------------------------------------------
 1 | ARG BASE_IMAGE
 2 | FROM $BASE_IMAGE
 3 | 
 4 | RUN mkdir -p /work
 5 | COPY ./cuda124.patch /work/cuda124.patch
 6 | COPY ./requirements.txt /work/requirements.txt
 7 | 
 8 | RUN pip install -r /work/requirements.txt
 9 | RUN rm /work/requirements.txt
10 | # NOTE(review): `git checkout` below does not re-sync the submodules fetched by `--recursive`; confirm they match ${COMMIT}.
11 | ARG COMMIT
12 | RUN cd /work && git clone --recursive https://github.com/mega-sam/mega-sam && \
13 |     cd mega-sam && \
14 |     git checkout ${COMMIT}
15 | RUN cd /work/mega-sam && ls -la
16 | RUN cd /work/mega-sam && patch -p2 < /work/cuda124.patch
17 | 
18 | # Start containers in /work. This was `ENTRYPOINT /work`, which tried to execute the
19 | # directory itself, so the container failed to start and the CMD message was never shown.
20 | WORKDIR /work
21 | CMD echo "Hello! You should only need to run python setup.py install from the /work/mega-sam/base directory."
22 | 
23 | 
24 | 


--------------------------------------------------------------------------------
/cw-mega-sam/cuda124.patch:
--------------------------------------------------------------------------------
  1 | diff -crB '--exclude=.git' ref/mega-sam/base/src/altcorr_kernel.cu mega-sam/base/src/altcorr_kernel.cu
  2 | *** ref/mega-sam/base/src/altcorr_kernel.cu	Mon Mar 10 18:15:59 2025
  3 | --- mega-sam/base/src/altcorr_kernel.cu	Mon Mar 10 17:10:59 2025
  4 | ***************
  5 | *** 304,310 ****
  6 |     const dim3 threads(BLOCK_H, BLOCK_W);
  7 |   
  8 |   
  9 | !   AT_DISPATCH_FLOATING_TYPES_AND_HALF(fmap1.type(), "altcorr_forward_kernel", ([&] {
 10 |       altcorr_forward_kernel<scalar_t><<<blocks, threads>>>(
 11 |           fmap1.packed_accessor32<scalar_t,4,torch::RestrictPtrTraits>(),
 12 |           fmap2.packed_accessor32<scalar_t,4,torch::RestrictPtrTraits>(),
 13 | --- 304,310 ----
 14 |     const dim3 threads(BLOCK_H, BLOCK_W);
 15 |   
 16 |   
 17 | !   AT_DISPATCH_FLOATING_TYPES_AND_HALF(fmap1.type().scalarType(), "altcorr_forward_kernel", ([&] {
 18 |       altcorr_forward_kernel<scalar_t><<<blocks, threads>>>(
 19 |           fmap1.packed_accessor32<scalar_t,4,torch::RestrictPtrTraits>(),
 20 |           fmap2.packed_accessor32<scalar_t,4,torch::RestrictPtrTraits>(),
 21 | ***************
 22 | *** 351,354 ****
 23 |       radius);
 24 |   
 25 |     return {fmap1_grad, fmap2_grad, coords_grad};
 26 | ! }
 27 | \ No newline at end of file
 28 | --- 351,354 ----
 29 |       radius);
 30 |   
 31 |     return {fmap1_grad, fmap2_grad, coords_grad};
 32 | ! }
 33 | diff -crB '--exclude=.git' ref/mega-sam/base/src/correlation_kernels.cu mega-sam/base/src/correlation_kernels.cu
 34 | *** ref/mega-sam/base/src/correlation_kernels.cu	Mon Mar 10 18:15:59 2025
 35 | --- mega-sam/base/src/correlation_kernels.cu	Mon Mar 10 17:16:42 2025
 36 | ***************
 37 | *** 141,147 ****
 38 |     torch::Tensor corr = torch::zeros(
 39 |       {batch_size, 2*radius+1, 2*radius+1, ht, wd}, opts);
 40 |   
 41 | !   AT_DISPATCH_FLOATING_TYPES_AND_HALF(volume.type(), "sampler_forward_kernel", ([&] {
 42 |       corr_index_forward_kernel<scalar_t><<<blocks, threads>>>(
 43 |         volume.packed_accessor32<scalar_t,5,torch::RestrictPtrTraits>(),
 44 |         coords.packed_accessor32<float,4,torch::RestrictPtrTraits>(),
 45 | --- 141,147 ----
 46 |     torch::Tensor corr = torch::zeros(
 47 |       {batch_size, 2*radius+1, 2*radius+1, ht, wd}, opts);
 48 |   
 49 | !   AT_DISPATCH_FLOATING_TYPES_AND_HALF(volume.type().scalarType(), "sampler_forward_kernel", ([&] {
 50 |       corr_index_forward_kernel<scalar_t><<<blocks, threads>>>(
 51 |         volume.packed_accessor32<scalar_t,5,torch::RestrictPtrTraits>(),
 52 |         coords.packed_accessor32<float,4,torch::RestrictPtrTraits>(),
 53 | ***************
 54 | *** 172,178 ****
 55 |     const dim3 threads(BLOCK, BLOCK);
 56 |   
 57 |   
 58 | !   AT_DISPATCH_FLOATING_TYPES_AND_HALF(volume.type(), "sampler_backward_kernel", ([&] {
 59 |       corr_index_backward_kernel<scalar_t><<<blocks, threads>>>(
 60 |         coords.packed_accessor32<float,4,torch::RestrictPtrTraits>(),
 61 |         corr_grad.packed_accessor32<scalar_t,5,torch::RestrictPtrTraits>(),
 62 | --- 172,178 ----
 63 |     const dim3 threads(BLOCK, BLOCK);
 64 |   
 65 |   
 66 | !   AT_DISPATCH_FLOATING_TYPES_AND_HALF(volume.type().scalarType(), "sampler_backward_kernel", ([&] {
 67 |       corr_index_backward_kernel<scalar_t><<<blocks, threads>>>(
 68 |         coords.packed_accessor32<float,4,torch::RestrictPtrTraits>(),
 69 |         corr_grad.packed_accessor32<scalar_t,5,torch::RestrictPtrTraits>(),
 70 | ***************
 71 | *** 181,184 ****
 72 |      }));
 73 |   
 74 |     return {volume_grad};
 75 | ! }
 76 | \ No newline at end of file
 77 | --- 181,184 ----
 78 |      }));
 79 |   
 80 |     return {volume_grad};
 81 | ! }
 82 | diff -crB '--exclude=.git' ref/mega-sam/base/thirdparty/lietorch/lietorch/src/lietorch_cpu.cpp mega-sam/base/thirdparty/lietorch/lietorch/src/lietorch_cpu.cpp
 83 | *** ref/mega-sam/base/thirdparty/lietorch/lietorch/src/lietorch_cpu.cpp	Mon Mar 10 18:16:06 2025
 84 | --- mega-sam/base/thirdparty/lietorch/lietorch/src/lietorch_cpu.cpp	Mon Mar 10 17:37:48 2025
 85 | ***************
 86 | *** 357,363 ****
 87 |       int batch_size = a.size(0);
 88 |       torch::Tensor X;
 89 |   
 90 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, a.type(), "exp_forward_kernel", ([&] {
 91 |           X = torch::zeros({batch_size, group_t::N}, a.options());
 92 |           exp_forward_kernel<group_t, scalar_t>(
 93 |               a.data_ptr<scalar_t>(), 
 94 | --- 357,363 ----
 95 |       int batch_size = a.size(0);
 96 |       torch::Tensor X;
 97 |   
 98 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, a.type().scalarType(), "exp_forward_kernel", ([&] {
 99 |           X = torch::zeros({batch_size, group_t::N}, a.options());
100 |           exp_forward_kernel<group_t, scalar_t>(
101 |               a.data_ptr<scalar_t>(), 
102 | ***************
103 | *** 372,378 ****
104 |       int batch_size = a.size(0);
105 |       torch::Tensor da = torch::zeros(a.sizes(), grad.options());
106 |   
107 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, a.type(), "exp_backward_kernel", ([&] {
108 |           exp_backward_kernel<group_t, scalar_t>(
109 |               grad.data_ptr<scalar_t>(), 
110 |               a.data_ptr<scalar_t>(), 
111 | --- 372,378 ----
112 |       int batch_size = a.size(0);
113 |       torch::Tensor da = torch::zeros(a.sizes(), grad.options());
114 |   
115 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, a.type().scalarType(), "exp_backward_kernel", ([&] {
116 |           exp_backward_kernel<group_t, scalar_t>(
117 |               grad.data_ptr<scalar_t>(), 
118 |               a.data_ptr<scalar_t>(), 
119 | ***************
120 | *** 387,393 ****
121 |       int batch_size = X.size(0);
122 |       torch::Tensor a;
123 |   
124 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type(), "log_forward_kernel", ([&] {
125 |           a = torch::zeros({batch_size, group_t::K}, X.options());
126 |           log_forward_kernel<group_t, scalar_t>(
127 |               X.data_ptr<scalar_t>(), 
128 | --- 387,393 ----
129 |       int batch_size = X.size(0);
130 |       torch::Tensor a;
131 |   
132 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type().scalarType(), "log_forward_kernel", ([&] {
133 |           a = torch::zeros({batch_size, group_t::K}, X.options());
134 |           log_forward_kernel<group_t, scalar_t>(
135 |               X.data_ptr<scalar_t>(), 
136 | ***************
137 | *** 402,408 ****
138 |       int batch_size = X.size(0);
139 |       torch::Tensor dX = torch::zeros(X.sizes(), grad.options());
140 |   
141 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type(), "log_backward_kernel", ([&] {
142 |           log_backward_kernel<group_t, scalar_t>(
143 |               grad.data_ptr<scalar_t>(), 
144 |               X.data_ptr<scalar_t>(), 
145 | --- 402,408 ----
146 |       int batch_size = X.size(0);
147 |       torch::Tensor dX = torch::zeros(X.sizes(), grad.options());
148 |   
149 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type().scalarType(), "log_backward_kernel", ([&] {
150 |           log_backward_kernel<group_t, scalar_t>(
151 |               grad.data_ptr<scalar_t>(), 
152 |               X.data_ptr<scalar_t>(), 
153 | ***************
154 | *** 417,423 ****
155 |       int batch_size = X.size(0);
156 |       torch::Tensor Y = torch::zeros_like(X);
157 |   
158 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type(), "inv_forward_kernel", ([&] {
159 |           inv_forward_kernel<group_t, scalar_t>(
160 |               X.data_ptr<scalar_t>(), 
161 |               Y.data_ptr<scalar_t>(), 
162 | --- 417,423 ----
163 |       int batch_size = X.size(0);
164 |       torch::Tensor Y = torch::zeros_like(X);
165 |   
166 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type().scalarType(), "inv_forward_kernel", ([&] {
167 |           inv_forward_kernel<group_t, scalar_t>(
168 |               X.data_ptr<scalar_t>(), 
169 |               Y.data_ptr<scalar_t>(), 
170 | ***************
171 | *** 431,437 ****
172 |       int batch_size = X.size(0);
173 |       torch::Tensor dX = torch::zeros(X.sizes(), grad.options());
174 |   
175 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type(), "inv_backward_kernel", ([&] {
176 |           inv_backward_kernel<group_t, scalar_t>(
177 |               grad.data_ptr<scalar_t>(), 
178 |               X.data_ptr<scalar_t>(), 
179 | --- 431,437 ----
180 |       int batch_size = X.size(0);
181 |       torch::Tensor dX = torch::zeros(X.sizes(), grad.options());
182 |   
183 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type().scalarType(), "inv_backward_kernel", ([&] {
184 |           inv_backward_kernel<group_t, scalar_t>(
185 |               grad.data_ptr<scalar_t>(), 
186 |               X.data_ptr<scalar_t>(), 
187 | ***************
188 | *** 447,453 ****
189 |       int batch_size = X.size(0);
190 |       torch::Tensor Z = torch::zeros_like(X);
191 |   
192 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type(), "mul_forward_kernel", ([&] {
193 |           mul_forward_kernel<group_t, scalar_t>(
194 |               X.data_ptr<scalar_t>(), 
195 |               Y.data_ptr<scalar_t>(), 
196 | --- 447,453 ----
197 |       int batch_size = X.size(0);
198 |       torch::Tensor Z = torch::zeros_like(X);
199 |   
200 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type().scalarType(), "mul_forward_kernel", ([&] {
201 |           mul_forward_kernel<group_t, scalar_t>(
202 |               X.data_ptr<scalar_t>(), 
203 |               Y.data_ptr<scalar_t>(), 
204 | ***************
205 | *** 463,469 ****
206 |       torch::Tensor dX = torch::zeros(X.sizes(), grad.options());
207 |       torch::Tensor dY = torch::zeros(Y.sizes(), grad.options());
208 |   
209 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type(), "mul_backward_kernel", ([&] {
210 |           mul_backward_kernel<group_t, scalar_t>(
211 |               grad.data_ptr<scalar_t>(), 
212 |               X.data_ptr<scalar_t>(), 
213 | --- 463,469 ----
214 |       torch::Tensor dX = torch::zeros(X.sizes(), grad.options());
215 |       torch::Tensor dY = torch::zeros(Y.sizes(), grad.options());
216 |   
217 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type().scalarType(), "mul_backward_kernel", ([&] {
218 |           mul_backward_kernel<group_t, scalar_t>(
219 |               grad.data_ptr<scalar_t>(), 
220 |               X.data_ptr<scalar_t>(), 
221 | ***************
222 | *** 480,486 ****
223 |       int batch_size = X.size(0);
224 |       torch::Tensor b = torch::zeros(a.sizes(), a.options());
225 |   
226 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type(), "adj_forward_kernel", ([&] {
227 |           adj_forward_kernel<group_t, scalar_t>(
228 |               X.data_ptr<scalar_t>(), 
229 |               a.data_ptr<scalar_t>(), 
230 | --- 480,486 ----
231 |       int batch_size = X.size(0);
232 |       torch::Tensor b = torch::zeros(a.sizes(), a.options());
233 |   
234 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type().scalarType(), "adj_forward_kernel", ([&] {
235 |           adj_forward_kernel<group_t, scalar_t>(
236 |               X.data_ptr<scalar_t>(), 
237 |               a.data_ptr<scalar_t>(), 
238 | ***************
239 | *** 496,502 ****
240 |       torch::Tensor dX = torch::zeros(X.sizes(), grad.options());
241 |       torch::Tensor da = torch::zeros(a.sizes(), grad.options());
242 |   
243 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type(), "adj_backward_kernel", ([&] {
244 |           adj_backward_kernel<group_t, scalar_t>(
245 |               grad.data_ptr<scalar_t>(), 
246 |               X.data_ptr<scalar_t>(), 
247 | --- 496,502 ----
248 |       torch::Tensor dX = torch::zeros(X.sizes(), grad.options());
249 |       torch::Tensor da = torch::zeros(a.sizes(), grad.options());
250 |   
251 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type().scalarType(), "adj_backward_kernel", ([&] {
252 |           adj_backward_kernel<group_t, scalar_t>(
253 |               grad.data_ptr<scalar_t>(), 
254 |               X.data_ptr<scalar_t>(), 
255 | ***************
256 | *** 514,520 ****
257 |       int batch_size = X.size(0);
258 |       torch::Tensor b = torch::zeros(a.sizes(), a.options());
259 |   
260 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type(), "adjT_forward_kernel", ([&] {
261 |           adjT_forward_kernel<group_t, scalar_t>(
262 |               X.data_ptr<scalar_t>(), 
263 |               a.data_ptr<scalar_t>(), 
264 | --- 514,520 ----
265 |       int batch_size = X.size(0);
266 |       torch::Tensor b = torch::zeros(a.sizes(), a.options());
267 |   
268 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type().scalarType(), "adjT_forward_kernel", ([&] {
269 |           adjT_forward_kernel<group_t, scalar_t>(
270 |               X.data_ptr<scalar_t>(), 
271 |               a.data_ptr<scalar_t>(), 
272 | ***************
273 | *** 530,536 ****
274 |       torch::Tensor dX = torch::zeros(X.sizes(), grad.options());
275 |       torch::Tensor da = torch::zeros(a.sizes(), grad.options());
276 |   
277 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type(), "adjT_backward_kernel", ([&] {
278 |           adjT_backward_kernel<group_t, scalar_t>(
279 |               grad.data_ptr<scalar_t>(), 
280 |               X.data_ptr<scalar_t>(), 
281 | --- 530,536 ----
282 |       torch::Tensor dX = torch::zeros(X.sizes(), grad.options());
283 |       torch::Tensor da = torch::zeros(a.sizes(), grad.options());
284 |   
285 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type().scalarType(), "adjT_backward_kernel", ([&] {
286 |           adjT_backward_kernel<group_t, scalar_t>(
287 |               grad.data_ptr<scalar_t>(), 
288 |               X.data_ptr<scalar_t>(), 
289 | ***************
290 | *** 548,554 ****
291 |       int batch_size = X.size(0);
292 |       torch::Tensor q = torch::zeros(p.sizes(), p.options());
293 |   
294 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type(), "act_forward_kernel", ([&] {
295 |           act_forward_kernel<group_t, scalar_t>(
296 |               X.data_ptr<scalar_t>(), 
297 |               p.data_ptr<scalar_t>(), 
298 | --- 548,554 ----
299 |       int batch_size = X.size(0);
300 |       torch::Tensor q = torch::zeros(p.sizes(), p.options());
301 |   
302 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type().scalarType(), "act_forward_kernel", ([&] {
303 |           act_forward_kernel<group_t, scalar_t>(
304 |               X.data_ptr<scalar_t>(), 
305 |               p.data_ptr<scalar_t>(), 
306 | ***************
307 | *** 564,570 ****
308 |       torch::Tensor dX = torch::zeros(X.sizes(), grad.options());
309 |       torch::Tensor dp = torch::zeros(p.sizes(), grad.options());
310 |   
311 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type(), "act_backward_kernel", ([&] {
312 |           act_backward_kernel<group_t, scalar_t>(
313 |               grad.data_ptr<scalar_t>(), 
314 |               X.data_ptr<scalar_t>(), 
315 | --- 564,570 ----
316 |       torch::Tensor dX = torch::zeros(X.sizes(), grad.options());
317 |       torch::Tensor dp = torch::zeros(p.sizes(), grad.options());
318 |   
319 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type().scalarType(), "act_backward_kernel", ([&] {
320 |           act_backward_kernel<group_t, scalar_t>(
321 |               grad.data_ptr<scalar_t>(), 
322 |               X.data_ptr<scalar_t>(), 
323 | ***************
324 | *** 582,588 ****
325 |       int batch_size = X.size(0);
326 |       torch::Tensor q = torch::zeros(p.sizes(), p.options());
327 |   
328 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type(), "act4_forward_kernel", ([&] {
329 |           act4_forward_kernel<group_t, scalar_t>(
330 |               X.data_ptr<scalar_t>(), 
331 |               p.data_ptr<scalar_t>(), 
332 | --- 582,588 ----
333 |       int batch_size = X.size(0);
334 |       torch::Tensor q = torch::zeros(p.sizes(), p.options());
335 |   
336 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type().scalarType(), "act4_forward_kernel", ([&] {
337 |           act4_forward_kernel<group_t, scalar_t>(
338 |               X.data_ptr<scalar_t>(), 
339 |               p.data_ptr<scalar_t>(), 
340 | ***************
341 | *** 598,604 ****
342 |       torch::Tensor dX = torch::zeros(X.sizes(), grad.options());
343 |       torch::Tensor dp = torch::zeros(p.sizes(), grad.options());
344 |   
345 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type(), "act4_backward_kernel", ([&] {
346 |           act4_backward_kernel<group_t, scalar_t>(
347 |               grad.data_ptr<scalar_t>(), 
348 |               X.data_ptr<scalar_t>(), 
349 | --- 598,604 ----
350 |       torch::Tensor dX = torch::zeros(X.sizes(), grad.options());
351 |       torch::Tensor dp = torch::zeros(p.sizes(), grad.options());
352 |   
353 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type().scalarType(), "act4_backward_kernel", ([&] {
354 |           act4_backward_kernel<group_t, scalar_t>(
355 |               grad.data_ptr<scalar_t>(), 
356 |               X.data_ptr<scalar_t>(), 
357 | ***************
358 | *** 616,622 ****
359 |       int batch_size = X.size(0);
360 |       torch::Tensor T4x4 = torch::zeros({X.size(0), 4, 4}, X.options());
361 |   
362 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type(), "as_matrix_forward_kernel", ([&] {
363 |           as_matrix_forward_kernel<group_t, scalar_t>(
364 |               X.data_ptr<scalar_t>(), 
365 |               T4x4.data_ptr<scalar_t>(), 
366 | --- 616,622 ----
367 |       int batch_size = X.size(0);
368 |       torch::Tensor T4x4 = torch::zeros({X.size(0), 4, 4}, X.options());
369 |   
370 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type().scalarType(), "as_matrix_forward_kernel", ([&] {
371 |           as_matrix_forward_kernel<group_t, scalar_t>(
372 |               X.data_ptr<scalar_t>(), 
373 |               T4x4.data_ptr<scalar_t>(), 
374 | ***************
375 | *** 631,637 ****
376 |       int batch_size = X.size(0);
377 |       torch::Tensor P;
378 |       
379 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type(), "orthogonal_projector_kernel", ([&] {
380 |           P = torch::zeros({X.size(0), group_t::N, group_t::N}, X.options());
381 |           orthogonal_projector_kernel<group_t, scalar_t>(X.data_ptr<scalar_t>(), P.data_ptr<scalar_t>(), batch_size);
382 |       }));
383 | --- 631,637 ----
384 |       int batch_size = X.size(0);
385 |       torch::Tensor P;
386 |       
387 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type().scalarType(), "orthogonal_projector_kernel", ([&] {
388 |           P = torch::zeros({X.size(0), group_t::N, group_t::N}, X.options());
389 |           orthogonal_projector_kernel<group_t, scalar_t>(X.data_ptr<scalar_t>(), P.data_ptr<scalar_t>(), batch_size);
390 |       }));
391 | ***************
392 | *** 645,651 ****
393 |       int batch_size = X.size(0);
394 |       torch::Tensor b = torch::zeros(a.sizes(), a.options());
395 |   
396 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type(), "jleft_forward_kernel", ([&] {
397 |           jleft_forward_kernel<group_t, scalar_t>(
398 |               X.data_ptr<scalar_t>(), 
399 |               a.data_ptr<scalar_t>(), 
400 | --- 645,651 ----
401 |       int batch_size = X.size(0);
402 |       torch::Tensor b = torch::zeros(a.sizes(), a.options());
403 |   
404 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type().scalarType(), "jleft_forward_kernel", ([&] {
405 |           jleft_forward_kernel<group_t, scalar_t>(
406 |               X.data_ptr<scalar_t>(), 
407 |               a.data_ptr<scalar_t>(), 
408 | ***************
409 | *** 654,657 ****
410 |       }));
411 |   
412 |       return b;
413 | ! }
414 | \ No newline at end of file
415 | --- 654,657 ----
416 |       }));
417 |   
418 |       return b;
419 | ! }
420 | diff -crB '--exclude=.git' ref/mega-sam/base/thirdparty/lietorch/lietorch/src/lietorch_gpu.cu mega-sam/base/thirdparty/lietorch/lietorch/src/lietorch_gpu.cu
421 | *** ref/mega-sam/base/thirdparty/lietorch/lietorch/src/lietorch_gpu.cu	Mon Mar 10 18:16:06 2025
422 | --- mega-sam/base/thirdparty/lietorch/lietorch/src/lietorch_gpu.cu	Mon Mar 10 17:29:53 2025
423 | ***************
424 | *** 299,305 ****
425 |       int batch_size = a.size(0);
426 |       torch::Tensor X;
427 |   
428 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, a.type(), "exp_forward_kernel", ([&] {
429 |           X = torch::zeros({batch_size, group_t::N}, a.options());
430 |           exp_forward_kernel<group_t, scalar_t><<<NUM_BLOCKS(batch_size), NUM_THREADS>>>(
431 |               a.data_ptr<scalar_t>(), 
432 | --- 299,305 ----
433 |       int batch_size = a.size(0);
434 |       torch::Tensor X;
435 |   
436 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, a.type().scalarType(), "exp_forward_kernel", ([&] {
437 |           X = torch::zeros({batch_size, group_t::N}, a.options());
438 |           exp_forward_kernel<group_t, scalar_t><<<NUM_BLOCKS(batch_size), NUM_THREADS>>>(
439 |               a.data_ptr<scalar_t>(), 
440 | ***************
441 | *** 314,320 ****
442 |       int batch_size = a.size(0);
443 |       torch::Tensor da = torch::zeros(a.sizes(), grad.options());
444 |   
445 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, a.type(), "exp_backward_kernel", ([&] {
446 |           exp_backward_kernel<group_t, scalar_t><<<NUM_BLOCKS(batch_size), NUM_THREADS>>>(
447 |               grad.data_ptr<scalar_t>(), 
448 |               a.data_ptr<scalar_t>(), 
449 | --- 314,320 ----
450 |       int batch_size = a.size(0);
451 |       torch::Tensor da = torch::zeros(a.sizes(), grad.options());
452 |   
453 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, a.type().scalarType(), "exp_backward_kernel", ([&] {
454 |           exp_backward_kernel<group_t, scalar_t><<<NUM_BLOCKS(batch_size), NUM_THREADS>>>(
455 |               grad.data_ptr<scalar_t>(), 
456 |               a.data_ptr<scalar_t>(), 
457 | ***************
458 | *** 329,335 ****
459 |       int batch_size = X.size(0);
460 |       torch::Tensor a;
461 |   
462 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type(), "log_forward_kernel", ([&] {
463 |           a = torch::zeros({batch_size, group_t::K}, X.options());
464 |           log_forward_kernel<group_t, scalar_t><<<NUM_BLOCKS(batch_size), NUM_THREADS>>>(
465 |               X.data_ptr<scalar_t>(), 
466 | --- 329,335 ----
467 |       int batch_size = X.size(0);
468 |       torch::Tensor a;
469 |   
470 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type().scalarType(), "log_forward_kernel", ([&] {
471 |           a = torch::zeros({batch_size, group_t::K}, X.options());
472 |           log_forward_kernel<group_t, scalar_t><<<NUM_BLOCKS(batch_size), NUM_THREADS>>>(
473 |               X.data_ptr<scalar_t>(), 
474 | ***************
475 | *** 344,350 ****
476 |       int batch_size = X.size(0);
477 |       torch::Tensor dX = torch::zeros(X.sizes(), grad.options());
478 |   
479 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type(), "log_backward_kernel", ([&] {
480 |           log_backward_kernel<group_t, scalar_t><<<NUM_BLOCKS(batch_size), NUM_THREADS>>>(
481 |               grad.data_ptr<scalar_t>(), 
482 |               X.data_ptr<scalar_t>(), 
483 | --- 344,350 ----
484 |       int batch_size = X.size(0);
485 |       torch::Tensor dX = torch::zeros(X.sizes(), grad.options());
486 |   
487 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type().scalarType(), "log_backward_kernel", ([&] {
488 |           log_backward_kernel<group_t, scalar_t><<<NUM_BLOCKS(batch_size), NUM_THREADS>>>(
489 |               grad.data_ptr<scalar_t>(), 
490 |               X.data_ptr<scalar_t>(), 
491 | ***************
492 | *** 359,365 ****
493 |       int batch_size = X.size(0);
494 |       torch::Tensor Y = torch::zeros_like(X);
495 |   
496 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type(), "inv_forward_kernel", ([&] {
497 |           inv_forward_kernel<group_t, scalar_t><<<NUM_BLOCKS(batch_size), NUM_THREADS>>>(
498 |               X.data_ptr<scalar_t>(), 
499 |               Y.data_ptr<scalar_t>(), 
500 | --- 359,365 ----
501 |       int batch_size = X.size(0);
502 |       torch::Tensor Y = torch::zeros_like(X);
503 |   
504 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type().scalarType(), "inv_forward_kernel", ([&] {
505 |           inv_forward_kernel<group_t, scalar_t><<<NUM_BLOCKS(batch_size), NUM_THREADS>>>(
506 |               X.data_ptr<scalar_t>(), 
507 |               Y.data_ptr<scalar_t>(), 
508 | ***************
509 | *** 373,379 ****
510 |       int batch_size = X.size(0);
511 |       torch::Tensor dX = torch::zeros(X.sizes(), grad.options());
512 |   
513 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type(), "inv_backward_kernel", ([&] {
514 |           inv_backward_kernel<group_t, scalar_t><<<NUM_BLOCKS(batch_size), NUM_THREADS>>>(
515 |               grad.data_ptr<scalar_t>(), 
516 |               X.data_ptr<scalar_t>(), 
517 | --- 373,379 ----
518 |       int batch_size = X.size(0);
519 |       torch::Tensor dX = torch::zeros(X.sizes(), grad.options());
520 |   
521 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type().scalarType(), "inv_backward_kernel", ([&] {
522 |           inv_backward_kernel<group_t, scalar_t><<<NUM_BLOCKS(batch_size), NUM_THREADS>>>(
523 |               grad.data_ptr<scalar_t>(), 
524 |               X.data_ptr<scalar_t>(), 
525 | ***************
526 | *** 389,395 ****
527 |       int batch_size = X.size(0);
528 |       torch::Tensor Z = torch::zeros_like(X);
529 |   
530 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type(), "mul_forward_kernel", ([&] {
531 |           mul_forward_kernel<group_t, scalar_t><<<NUM_BLOCKS(batch_size), NUM_THREADS>>>(
532 |               X.data_ptr<scalar_t>(), 
533 |               Y.data_ptr<scalar_t>(), 
534 | --- 389,395 ----
535 |       int batch_size = X.size(0);
536 |       torch::Tensor Z = torch::zeros_like(X);
537 |   
538 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type().scalarType(), "mul_forward_kernel", ([&] {
539 |           mul_forward_kernel<group_t, scalar_t><<<NUM_BLOCKS(batch_size), NUM_THREADS>>>(
540 |               X.data_ptr<scalar_t>(), 
541 |               Y.data_ptr<scalar_t>(), 
542 | ***************
543 | *** 405,411 ****
544 |       torch::Tensor dX = torch::zeros(X.sizes(), grad.options());
545 |       torch::Tensor dY = torch::zeros(Y.sizes(), grad.options());
546 |   
547 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type(), "mul_backward_kernel", ([&] {
548 |           mul_backward_kernel<group_t, scalar_t><<<NUM_BLOCKS(batch_size), NUM_THREADS>>>(
549 |               grad.data_ptr<scalar_t>(), 
550 |               X.data_ptr<scalar_t>(), 
551 | --- 405,411 ----
552 |       torch::Tensor dX = torch::zeros(X.sizes(), grad.options());
553 |       torch::Tensor dY = torch::zeros(Y.sizes(), grad.options());
554 |   
555 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type().scalarType(), "mul_backward_kernel", ([&] {
556 |           mul_backward_kernel<group_t, scalar_t><<<NUM_BLOCKS(batch_size), NUM_THREADS>>>(
557 |               grad.data_ptr<scalar_t>(), 
558 |               X.data_ptr<scalar_t>(), 
559 | ***************
560 | *** 422,428 ****
561 |       int batch_size = X.size(0);
562 |       torch::Tensor b = torch::zeros(a.sizes(), a.options());
563 |   
564 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type(), "adj_forward_kernel", ([&] {
565 |           adj_forward_kernel<group_t, scalar_t><<<NUM_BLOCKS(batch_size), NUM_THREADS>>>(
566 |               X.data_ptr<scalar_t>(), 
567 |               a.data_ptr<scalar_t>(), 
568 | --- 422,428 ----
569 |       int batch_size = X.size(0);
570 |       torch::Tensor b = torch::zeros(a.sizes(), a.options());
571 |   
572 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type().scalarType(), "adj_forward_kernel", ([&] {
573 |           adj_forward_kernel<group_t, scalar_t><<<NUM_BLOCKS(batch_size), NUM_THREADS>>>(
574 |               X.data_ptr<scalar_t>(), 
575 |               a.data_ptr<scalar_t>(), 
576 | ***************
577 | *** 438,444 ****
578 |       torch::Tensor dX = torch::zeros(X.sizes(), grad.options());
579 |       torch::Tensor da = torch::zeros(a.sizes(), grad.options());
580 |   
581 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type(), "adj_backward_kernel", ([&] {
582 |           adj_backward_kernel<group_t, scalar_t><<<NUM_BLOCKS(batch_size), NUM_THREADS>>>(
583 |               grad.data_ptr<scalar_t>(), 
584 |               X.data_ptr<scalar_t>(), 
585 | --- 438,444 ----
586 |       torch::Tensor dX = torch::zeros(X.sizes(), grad.options());
587 |       torch::Tensor da = torch::zeros(a.sizes(), grad.options());
588 |   
589 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type().scalarType(), "adj_backward_kernel", ([&] {
590 |           adj_backward_kernel<group_t, scalar_t><<<NUM_BLOCKS(batch_size), NUM_THREADS>>>(
591 |               grad.data_ptr<scalar_t>(), 
592 |               X.data_ptr<scalar_t>(), 
593 | ***************
594 | *** 456,462 ****
595 |       int batch_size = X.size(0);
596 |       torch::Tensor b = torch::zeros(a.sizes(), a.options());
597 |   
598 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type(), "adjT_forward_kernel", ([&] {
599 |           adjT_forward_kernel<group_t, scalar_t><<<NUM_BLOCKS(batch_size), NUM_THREADS>>>(
600 |               X.data_ptr<scalar_t>(), 
601 |               a.data_ptr<scalar_t>(), 
602 | --- 456,462 ----
603 |       int batch_size = X.size(0);
604 |       torch::Tensor b = torch::zeros(a.sizes(), a.options());
605 |   
606 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type().scalarType(), "adjT_forward_kernel", ([&] {
607 |           adjT_forward_kernel<group_t, scalar_t><<<NUM_BLOCKS(batch_size), NUM_THREADS>>>(
608 |               X.data_ptr<scalar_t>(), 
609 |               a.data_ptr<scalar_t>(), 
610 | ***************
611 | *** 472,478 ****
612 |       torch::Tensor dX = torch::zeros(X.sizes(), grad.options());
613 |       torch::Tensor da = torch::zeros(a.sizes(), grad.options());
614 |   
615 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type(), "adjT_backward_kernel", ([&] {
616 |           adjT_backward_kernel<group_t, scalar_t><<<NUM_BLOCKS(batch_size), NUM_THREADS>>>(
617 |               grad.data_ptr<scalar_t>(), 
618 |               X.data_ptr<scalar_t>(), 
619 | --- 472,478 ----
620 |       torch::Tensor dX = torch::zeros(X.sizes(), grad.options());
621 |       torch::Tensor da = torch::zeros(a.sizes(), grad.options());
622 |   
623 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type().scalarType(), "adjT_backward_kernel", ([&] {
624 |           adjT_backward_kernel<group_t, scalar_t><<<NUM_BLOCKS(batch_size), NUM_THREADS>>>(
625 |               grad.data_ptr<scalar_t>(), 
626 |               X.data_ptr<scalar_t>(), 
627 | ***************
628 | *** 491,497 ****
629 |       int batch_size = X.size(0);
630 |       torch::Tensor q = torch::zeros(p.sizes(), p.options());
631 |   
632 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type(), "act_forward_kernel", ([&] {
633 |           act_forward_kernel<group_t, scalar_t><<<NUM_BLOCKS(batch_size), NUM_THREADS>>>(
634 |               X.data_ptr<scalar_t>(), 
635 |               p.data_ptr<scalar_t>(), 
636 | --- 491,497 ----
637 |       int batch_size = X.size(0);
638 |       torch::Tensor q = torch::zeros(p.sizes(), p.options());
639 |   
640 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type().scalarType(), "act_forward_kernel", ([&] {
641 |           act_forward_kernel<group_t, scalar_t><<<NUM_BLOCKS(batch_size), NUM_THREADS>>>(
642 |               X.data_ptr<scalar_t>(), 
643 |               p.data_ptr<scalar_t>(), 
644 | ***************
645 | *** 507,513 ****
646 |       torch::Tensor dX = torch::zeros(X.sizes(), grad.options());
647 |       torch::Tensor dp = torch::zeros(p.sizes(), grad.options());
648 |   
649 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type(), "act_backward_kernel", ([&] {
650 |           act_backward_kernel<group_t, scalar_t><<<NUM_BLOCKS(batch_size), NUM_THREADS>>>(
651 |               grad.data_ptr<scalar_t>(), 
652 |               X.data_ptr<scalar_t>(), 
653 | --- 507,513 ----
654 |       torch::Tensor dX = torch::zeros(X.sizes(), grad.options());
655 |       torch::Tensor dp = torch::zeros(p.sizes(), grad.options());
656 |   
657 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type().scalarType(), "act_backward_kernel", ([&] {
658 |           act_backward_kernel<group_t, scalar_t><<<NUM_BLOCKS(batch_size), NUM_THREADS>>>(
659 |               grad.data_ptr<scalar_t>(), 
660 |               X.data_ptr<scalar_t>(), 
661 | ***************
662 | *** 524,530 ****
663 |       int batch_size = X.size(0);
664 |       torch::Tensor q = torch::zeros(p.sizes(), p.options());
665 |   
666 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type(), "act4_forward_kernel", ([&] {
667 |           act4_forward_kernel<group_t, scalar_t><<<NUM_BLOCKS(batch_size), NUM_THREADS>>>(
668 |               X.data_ptr<scalar_t>(), 
669 |               p.data_ptr<scalar_t>(), 
670 | --- 524,530 ----
671 |       int batch_size = X.size(0);
672 |       torch::Tensor q = torch::zeros(p.sizes(), p.options());
673 |   
674 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type().scalarType(), "act4_forward_kernel", ([&] {
675 |           act4_forward_kernel<group_t, scalar_t><<<NUM_BLOCKS(batch_size), NUM_THREADS>>>(
676 |               X.data_ptr<scalar_t>(), 
677 |               p.data_ptr<scalar_t>(), 
678 | ***************
679 | *** 540,546 ****
680 |       torch::Tensor dX = torch::zeros(X.sizes(), grad.options());
681 |       torch::Tensor dp = torch::zeros(p.sizes(), grad.options());
682 |   
683 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type(), "act4_backward_kernel", ([&] {
684 |           act4_backward_kernel<group_t, scalar_t><<<NUM_BLOCKS(batch_size), NUM_THREADS>>>(
685 |               grad.data_ptr<scalar_t>(), 
686 |               X.data_ptr<scalar_t>(), 
687 | --- 540,546 ----
688 |       torch::Tensor dX = torch::zeros(X.sizes(), grad.options());
689 |       torch::Tensor dp = torch::zeros(p.sizes(), grad.options());
690 |   
691 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type().scalarType(), "act4_backward_kernel", ([&] {
692 |           act4_backward_kernel<group_t, scalar_t><<<NUM_BLOCKS(batch_size), NUM_THREADS>>>(
693 |               grad.data_ptr<scalar_t>(), 
694 |               X.data_ptr<scalar_t>(), 
695 | ***************
696 | *** 558,564 ****
697 |       int batch_size = X.size(0);
698 |       torch::Tensor T4x4 = torch::zeros({X.size(0), 4, 4}, X.options());
699 |   
700 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type(), "as_matrix_forward_kernel", ([&] {
701 |           as_matrix_forward_kernel<group_t, scalar_t><<<NUM_BLOCKS(batch_size), NUM_THREADS>>>(
702 |               X.data_ptr<scalar_t>(), 
703 |               T4x4.data_ptr<scalar_t>(), 
704 | --- 558,564 ----
705 |       int batch_size = X.size(0);
706 |       torch::Tensor T4x4 = torch::zeros({X.size(0), 4, 4}, X.options());
707 |   
708 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type().scalarType(), "as_matrix_forward_kernel", ([&] {
709 |           as_matrix_forward_kernel<group_t, scalar_t><<<NUM_BLOCKS(batch_size), NUM_THREADS>>>(
710 |               X.data_ptr<scalar_t>(), 
711 |               T4x4.data_ptr<scalar_t>(), 
712 | ***************
713 | *** 573,579 ****
714 |       int batch_size = X.size(0);
715 |       torch::Tensor P;
716 |   
717 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type(), "orthogonal_projector_kernel", ([&] {
718 |           P = torch::zeros({X.size(0), group_t::N, group_t::N}, X.options());
719 |           orthogonal_projector_kernel<group_t, scalar_t><<<NUM_BLOCKS(batch_size), NUM_THREADS>>>(
720 |               X.data_ptr<scalar_t>(), 
721 | --- 573,579 ----
722 |       int batch_size = X.size(0);
723 |       torch::Tensor P;
724 |   
725 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type().scalarType(), "orthogonal_projector_kernel", ([&] {
726 |           P = torch::zeros({X.size(0), group_t::N, group_t::N}, X.options());
727 |           orthogonal_projector_kernel<group_t, scalar_t><<<NUM_BLOCKS(batch_size), NUM_THREADS>>>(
728 |               X.data_ptr<scalar_t>(), 
729 | ***************
730 | *** 589,595 ****
731 |       int batch_size = X.size(0);
732 |       torch::Tensor b = torch::zeros(a.sizes(), a.options());
733 |   
734 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type(), "jleft_forward_kernel", ([&] {
735 |           jleft_forward_kernel<group_t, scalar_t><<<NUM_BLOCKS(batch_size), NUM_THREADS>>>(
736 |               X.data_ptr<scalar_t>(), 
737 |               a.data_ptr<scalar_t>(), 
738 | --- 589,595 ----
739 |       int batch_size = X.size(0);
740 |       torch::Tensor b = torch::zeros(a.sizes(), a.options());
741 |   
742 | !     DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type().scalarType(), "jleft_forward_kernel", ([&] {
743 |           jleft_forward_kernel<group_t, scalar_t><<<NUM_BLOCKS(batch_size), NUM_THREADS>>>(
744 |               X.data_ptr<scalar_t>(), 
745 |               a.data_ptr<scalar_t>(), 
746 | 


--------------------------------------------------------------------------------
/cw-mega-sam/requirements.txt:
--------------------------------------------------------------------------------
 1 | annotated-types
 2 | archspec
 3 | boltons
 4 | Brotli
 5 | certifi
 6 | cffi
 7 | charset-normalizer
 8 | cryptography
 9 | distro
10 | filelock==3.17.0
11 | frozendict
12 | fsspec==2025.3.0
13 | idna
14 | Jinja2==3.1.6
15 | jsonpatch
16 | jsonpointer==2.1
17 | markdown-it-py
18 | MarkupSafe==3.0.2
19 | mdurl
20 | mpmath==1.3.0
21 | networkx==3.4.2
22 | numpy==2.2.3
23 | nvidia-cublas-cu12==12.4.5.8
24 | nvidia-cuda-cupti-cu12==12.4.127
25 | nvidia-cuda-nvrtc-cu12==12.4.127
26 | nvidia-cuda-nvrtc-cu12==12.4.127
27 | nvidia-cuda-runtime-cu12==12.4.127
28 | nvidia-cudnn-cu12
29 | nvidia-cufft-cu12==11.2.1.3
30 | nvidia-curand-cu12==10.3.5.147
31 | nvidia-cusolver-cu12==11.6.1.9
32 | nvidia-cusparse-cu12==12.3.1.170
33 | nvidia-cusparselt-cu12==0.6.2
34 | nvidia-nccl-cu12==2.20.5
35 | nvidia-nvjitlink-cu12==12.4.127
36 | nvidia-nvtx-cu12==12.4.127
37 | packaging
38 | pillow==11.1.0
39 | platformdirs
40 | pluggy
41 | pycosat
42 | pycparser
43 | pydantic
44 | pydantic_core
45 | Pygments
46 | PySocks
47 | requests
48 | rich
49 | ruamel.yaml
50 | ruamel.yaml.clib
51 | setuptools==75.8.0
52 | sympy==1.13.1
53 | torch==2.6.0
54 | torchaudio==2.6.0
55 | torchvision==0.21.0
56 | tqdm
57 | triton==3.2.0
58 | truststore
59 | typing_extensions
60 | urllib3
61 | wheel==0.45.1
62 | zstandard
63 | 


--------------------------------------------------------------------------------
/docs/README.md:
--------------------------------------------------------------------------------
  1 | # ml-containers
  2 | 
  3 | Repository for building ML images at CoreWeave
  4 | 
  5 | 
  6 | ## Index
  7 | 
  8 | See the [list of all published images](https://github.com/orgs/coreweave/packages?repo_name=ml-containers).
  9 | 
 10 | Special PyTorch Images:
 11 | 
 12 | - [PyTorch Base Images](#pytorch-base-images)
 13 | - [PyTorch Extras](#pytorch-extras)
 14 | - [PyTorch Nightly](#pytorch-nightly)
 15 | 
 16 | ### PyTorch Base Images
 17 | 
 18 | - [`ghcr.io/coreweave/ml-containers/torch`](https://github.com/coreweave/ml-containers/pkgs/container/ml-containers%2Ftorch)
 19 | 
 20 | CoreWeave provides custom builds of
 21 | [PyTorch](https://github.com/pytorch/pytorch),
 22 | [`torchvision`](https://github.com/pytorch/vision)
 23 | and [`torchaudio`](https://github.com/pytorch/audio)
 24 | tuned for our platform in a single container image, [`ml-containers/torch`](https://github.com/coreweave/ml-containers/pkgs/container/ml-containers%2Ftorch).
 25 | 
 26 | Versions compiled against CUDA 11.8.0, 12.0.1, 12.1.1, and 12.2.2 are available in this repository, with two variants:
 27 | 
 28 | 1. `base`: Tagged as `ml-containers/torch:a1b2c3d-base-...`.
 29 |    1. Built from [`nvidia/cuda:...-base-ubuntu22.04`](https://hub.docker.com/r/nvidia/cuda/tags?name=base-ubuntu22.04) as a base.
 30 |    2. Only includes essentials (CUDA, `torch`, `torchvision`, `torchaudio`),
 31 |       so it has a small image size, making it fast to launch.
 32 | 2. `nccl`: Tagged as `ml-containers/torch:a1b2c3d-nccl-...`.
 33 |    1. Built from [`ghcr.io/coreweave/nccl-tests`](https://github.com/coreweave/nccl-tests/pkgs/container/nccl-tests) as a base.
 34 |    2. Ultimately inherits from [`nvidia/cuda:...-cudnn8-devel-ubuntu22.04`](https://hub.docker.com/r/nvidia/cuda/tags?name=cudnn8-devel-ubuntu22.04).
 35 |    3. Larger, but includes development libraries and build tools such as `nvcc` necessary for compiling other PyTorch extensions.
 36 |    4. These PyTorch builds are built on component libraries optimized for the CoreWeave cloud&mdash;see
 37 |       [`coreweave/nccl-tests`](https://github.com/coreweave/nccl-tests/blob/master/README.md).
 38 | 
 39 | > [!NOTE]
 40 | > Most `torch` images have both a variant built on Ubuntu 22.04 and a variant built on Ubuntu 20.04.
 41 | > - CUDA 11.8.0 is an exception, and is only available on Ubuntu 20.04.
 42 | > - Ubuntu 22.04 images use Python 3.10.
 43 | > - Ubuntu 20.04 images use Python 3.8.
 44 | > - The base distribution is indicated in the container image tag.
 45 | 
 46 | ### PyTorch Extras
 47 | 
 48 | - [`ghcr.io/coreweave/ml-containers/torch-extras`](https://github.com/coreweave/ml-containers/pkgs/container/ml-containers%2Ftorch-extras)
 49 | 
 50 | [`ml-containers/torch-extras`](https://github.com/coreweave/ml-containers/pkgs/container/ml-containers%2Ftorch-extras)
 51 | extends the [`ml-containers/torch`](https://github.com/coreweave/ml-containers/pkgs/container/ml-containers%2Ftorch)
 52 | images with a set of common PyTorch extensions:
 53 | 
 54 | 1. [DeepSpeed](https://github.com/microsoft/DeepSpeed)
 55 | 2. [FlashAttention](https://github.com/Dao-AILab/flash-attention)
 56 | 3. [NVIDIA Apex](https://github.com/NVIDIA/apex)
 57 | 
 58 | Each one is compiled specially against the custom PyTorch builds in [`ml-containers/torch`](https://github.com/coreweave/ml-containers/pkgs/container/ml-containers%2Ftorch).
 59 | 
 60 | Both `base` and `nccl` editions are available for
 61 | [`ml-containers/torch-extras`](https://github.com/coreweave/ml-containers/pkgs/container/ml-containers%2Ftorch-extras)
 62 | matching those for
 63 | [`ml-containers/torch`](https://github.com/coreweave/ml-containers/pkgs/container/ml-containers%2Ftorch).
 64 | The `base` edition retains a small size, as a multi-stage build is used to avoid including
 65 | CUDA development libraries in it, despite those libraries being required to build
 66 | the extensions themselves.
 67 | 
 68 | ### PyTorch Nightly
 69 | 
 70 | - [`ghcr.io/coreweave/ml-containers/nightly-torch`](https://github.com/coreweave/ml-containers/pkgs/container/ml-containers%2Fnightly-torch)
 71 | - [`ghcr.io/coreweave/ml-containers/nightly-torch-extras`](https://github.com/coreweave/ml-containers/pkgs/container/ml-containers%2Fnightly-torch-extras)
 72 | 
 73 | [`ml-containers/nightly-torch`](https://github.com/coreweave/ml-containers/pkgs/container/ml-containers%2Fnightly-torch)
 74 | is an experimental, nightly release channel of the
 75 | [PyTorch Base Images](#pytorch-base-images) in the style of PyTorch's
 76 | own nightly preview builds, featuring the latest development versions of
 77 | `torch`, `torchvision`, and `torchaudio` pulled daily from GitHub
 78 | and compiled from source.
 79 | 
 80 | [`ml-containers/nightly-torch-extras`](https://github.com/coreweave/ml-containers/pkgs/container/ml-containers%2Fnightly-torch-extras)
 81 | is a version of [PyTorch Extras](#pytorch-extras) built on top of the
 82 | [`ml-containers/nightly-torch`](https://github.com/coreweave/ml-containers/pkgs/container/ml-containers%2Fnightly-torch)
 83 | container images.
 84 | These are not nightly versions of the extensions themselves, but rather match
 85 | the extension versions in the regular [PyTorch Extras](#pytorch-extras) containers.
 86 | 
 87 | > ⚠ The *PyTorch Nightly* containers are based on unstable, experimental preview
 88 | > builds of PyTorch, and should be expected to contain bugs and other issues.
 89 | > For more stable containers, use the [PyTorch Base Images](#pytorch-base-images)
 90 | > and [PyTorch Extras](#pytorch-extras) containers.
 91 | 
 92 | 
 93 | ## Organization
 94 | This repository contains multiple container image Dockerfiles; each is expected
 95 | to be in its own folder, along with any other files needed for the build.
 96 | 
 97 | 
 98 | ## CI Builds (Actions)
 99 | The current CI builds are set up to run when changes to files in the respective
100 | folders are detected so that only the changed container images are built. The
101 | actions are set up with an action per image utilizing a reusable base action
102 | [build.yml](.github/workflows/build.yml). The reusable action accepts several inputs:
103 | 
104 | - `folder` - the folder containing the dockerfile for the image
105 | - `image-name` - the name to use for the image
106 | - `build-args` - arguments to pass to the docker build
107 | 
108 | Images built from the same source can share a single action, since the main reason
109 | for having multiple actions is to build only the images that changed. A
110 | [build matrix](https://docs.github.com/en/actions/using-jobs/using-a-matrix-for-your-jobs)
111 | can be helpful for these cases.
112 | 


--------------------------------------------------------------------------------
/gpt-neox-determined/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM coreweave/nccl-tests:2022-09-28_16-34-19.392_EDT
 2 | 
 3 | ENV DET_PYTHON_EXECUTABLE="/usr/bin/python3.8"
 4 | ENV DET_SKIP_PIP_INSTALL="SKIP"
 5 | 
 6 | # Make dpkg resolve config-file prompts automatically, then install build packages
 7 | RUN echo 'Dpkg::Options { "--force-confdef"; "--force-confnew"; };' > /etc/apt/apt.conf.d/local
 8 | RUN apt-get -qq update && \
 9 |     apt-get -qq install -y --no-install-recommends software-properties-common && \
10 |     add-apt-repository ppa:deadsnakes/ppa -y && \
11 |     add-apt-repository universe && \
12 |     apt-get -qq update && \
13 |     DEBIAN_FRONTEND=noninteractive apt-get install -y curl tzdata build-essential daemontools && \
14 |     apt-get install -y --no-install-recommends \
15 |        python3.8 \
16 |        python3.8-distutils \
17 |        python3.8-dev \
18 |        python3.8-venv \
19 |        git && \
20 |     apt-get clean
21 | 
22 | # python3.8 -m ensurepip --default-pip && \
23 | RUN curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py
24 | RUN python3.8 get-pip.py
25 | RUN python3.8 -m pip install --no-cache-dir --upgrade pip
26 | 
27 | ARG PYTORCH_VERSION=1.12.1
28 | ARG TORCHVISION_VERSION=0.13.1
29 | ARG TORCHAUDIO_VERSION=0.12.1
30 | ARG TORCH_CUDA=116
31 | ARG TORCH_INDEX=whl
32 | 
33 | RUN python3.8 -m pip install --no-cache-dir torch==${PYTORCH_VERSION}+cu${TORCH_CUDA} \
34 |         torchvision==${TORCHVISION_VERSION}+cu${TORCH_CUDA} \
35 |         torchaudio==${TORCHAUDIO_VERSION}+cu${TORCH_CUDA} \
36 |         --extra-index-url https://download.pytorch.org/${TORCH_INDEX}/cu${TORCH_CUDA}
37 | 
38 | RUN python3.8 -m pip install --no-cache-dir packaging
39 | 
40 | RUN mkdir -p /tmp/build && \
41 |         cd /tmp/build && \
42 |         git clone https://github.com/NVIDIA/apex && \
43 |         cd apex && \
44 |         python3.8 -m pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./ && \
45 |         cd /tmp && \
46 |         rm -r /tmp/build
47 | 
48 | #### Python packages
49 | RUN python3.8 -m pip install --no-cache-dir determined==0.19.2
50 | 
51 | #### Clone GPT-Neox for requirements
52 | RUN git clone https://github.com/EleutherAI/gpt-neox.git && cd gpt-neox && \
53 |         python3.8 -m pip install --no-cache-dir -r requirements/requirements.txt && \
54 |         python3.8 -m pip install --no-cache-dir -r requirements/requirements-onebitadam.txt && \
55 |         python3.8 -m pip install --no-cache-dir -r requirements/requirements-sparseattention.txt
56 | 
57 | RUN python3.8 -m pip install --no-cache-dir pybind11
58 | RUN python3.8 -m pip install --no-cache-dir protobuf==3.19.4
59 | RUN update-alternatives --install /usr/bin/python3 python /usr/bin/python3.8 2
60 | RUN echo 2 | update-alternatives --config python
61 | 
62 | 
63 | 
64 | 


--------------------------------------------------------------------------------
/gpt-neox-mpi/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM ghcr.io/coreweave/nccl-tests:11.7.1-devel-ubuntu20.04-nccl2.14.3-1-a0cb1a6
 2 | 
 3 | ENV DEBIAN_FRONTEND=noninteractive
 4 | 
 5 | #### System package (uses default Python 3 version in Ubuntu 20.04)
 6 | RUN apt-get update -y && \
 7 |     apt-get install -y \
 8 |         git python3 python3-dev libpython3-dev python3-pip pdsh && \
 9 |     update-alternatives --install /usr/bin/python python /usr/bin/python3 1 && \
10 |     update-alternatives --install /usr/bin/pip pip /usr/bin/pip3 1 && \
11 |     pip install --upgrade pip && \
12 |     pip install gpustat
13 | 
14 | #### Python packages
15 | RUN pip install torch==1.13.1+cu117 -f https://download.pytorch.org/whl/torch_stable.html && \
16 |     pip install 'packaging>=14.0' && pip cache purge
17 | 
18 | ## Install APEX
19 | ARG APEX_COMMIT=537424d24d55e3a166c930828e4780549edc6151
20 | RUN pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" \
21 |         --global-option="--cuda_ext" git+https://github.com/NVIDIA/apex.git@${APEX_COMMIT}
22 | 
23 | # Get the gpt-neox source code
24 | WORKDIR /
25 | RUN git clone https://github.com/EleutherAI/gpt-neox.git
26 | 
27 | # Use the-eye.eu instead of the dead mystic.the-eye.eu mirror for dataset links
28 | RUN sed -i 's/mystic.the-eye/the-eye/g' /gpt-neox/tools/corpora.py
29 | 
30 | RUN pip install -r /gpt-neox/requirements/requirements.txt && \
31 |     pip install -r /gpt-neox/requirements/requirements-onebitadam.txt && \
32 |     pip install -r /gpt-neox/requirements/requirements-sparseattention.txt && \
33 |     pip install 'protobuf==3.20.*' && \
34 |     pip install git+https://github.com/EleutherAI/best-download.git && \
35 |     pip cache purge
36 | 
37 | RUN python /gpt-neox/megatron/fused_kernels/setup.py install
38 | 
39 | # Clear staging
40 | RUN mkdir -p /tmp && chmod 0777 /tmp
41 | 
42 | WORKDIR /gpt-neox
43 | 


--------------------------------------------------------------------------------
/hf-llm-inference/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM ghcr.io/coreweave/ml-containers/torch:afecfe9-base-cuda12.0.1-torch2.0.0-vision0.15.1
 2 | ENV DEBIAN_FRONTEND=noninteractive
 3 | 
 4 | RUN apt-get -qq update && \
 5 |     apt-get -qq install --no-install-recommends -y git curl && \
 6 |     apt-get clean
 7 | 
 8 | RUN mkdir /app
 9 | WORKDIR /app
10 | 
11 | ARG COMMIT=cfd8b249a6bac47e0b3dab6fa2be781965a69025
12 | RUN git clone --filter=blob:none https://github.com/coreweave/kubernetes-cloud && \
13 |     cd kubernetes-cloud && \
14 |     git checkout ${COMMIT} && \
15 |     cd .. && \
16 |     cp kubernetes-cloud/online-inference/hf-llm/service/* .  && \
17 |     cp kubernetes-cloud/online-inference/hf-llm/serializer/serialize.py . && \
18 |     rm -rf kubernetes-cloud
19 | 
20 | RUN pip3 install --no-cache-dir --upgrade pip && \
21 |     pip3 install --no-cache-dir -r requirements.txt
22 | 


--------------------------------------------------------------------------------
/megatron/Dockerfile:
--------------------------------------------------------------------------------
 1 | ARG BASE_IMAGE
 2 | FROM $BASE_IMAGE
 3 | 
 4 | COPY requirements.txt /tmp/requirements.txt
 5 | 
 6 | RUN python3 -m pip install -U --no-cache-dir \
 7 |     -r /tmp/requirements.txt \
 8 |     && rm /tmp/requirements.txt
 9 | 
10 | ARG COMMIT
11 | RUN git clone https://github.com/NVIDIA/megatron-lm && \
12 |     cd megatron-lm && \
13 |     git checkout ${COMMIT} && \
14 |     rm -rf .git
15 | 


--------------------------------------------------------------------------------
/megatron/requirements.txt:
--------------------------------------------------------------------------------
 1 | numpy==1.23.4
 2 | pybind11==2.13.6
 3 | pyyaml==6.0.2
 4 | regex==2024.9.11
 5 | tensorboard==2.18.0
 6 | tensorboard-data-server==0.7.2
 7 | transformers==4.45.2
 8 | triton==3.0.0
 9 | wandb==0.18.3
10 | sentencepiece==0.2.0


--------------------------------------------------------------------------------
/mkdocs.yml:
--------------------------------------------------------------------------------
 1 | site_name: ml-containers
 2 | plugins:
 3 |   - techdocs-core
 4 | markdown_extensions:  # mapping form: extension name -> its options
 5 |   pymdownx.extra:
 6 |     pymdownx.superfences:  # options for superfences, passed through pymdownx.extra
 7 |       custom_fences:
 8 |         - name: mermaid  # custom fence registered under the name `mermaid`
 9 |           class: mermaid
10 |           format: !!python/name:pymdownx.superfences.fence_code_format


--------------------------------------------------------------------------------
/sd-finetuner/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM gooseai/torch-base:6cfdc11
 2 | 
 3 | RUN apt-get install -y cuda-nvcc-11-3 cuda-nvml-dev-11-3 libcurand-dev-11-3 \
 4 |                        libcublas-dev-11-3 libcusparse-dev-11-3 \
 5 |                        libcusolver-dev-11-3 cuda-nvprof-11-3 \
 6 |                        ninja-build git && \
 7 |     apt-get clean
 8 | 
 9 | RUN mkdir /app
10 | WORKDIR /app
11 | 
12 | ARG COMMIT=master
13 | RUN git clone https://github.com/coreweave/kubernetes-cloud.git && \
14 |     cd kubernetes-cloud && \
15 |     git checkout ${COMMIT} && \
16 |     cd ..
17 | RUN cp kubernetes-cloud/sd-finetuner-workflow/sd-finetuner/* .
18 | RUN pip3 install --no-cache-dir -r requirements.txt
19 | 
20 | CMD [ "/usr/bin/python3", "finetuner.py" ]
21 | 


--------------------------------------------------------------------------------
/sd-inference/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM ghcr.io/coreweave/ml-containers/torch:afecfe9-base-cuda12.0.1-torch2.0.0-vision0.15.1
 2 | ENV DEBIAN_FRONTEND=noninteractive
 3 | 
 4 | RUN apt-get update && apt-get upgrade -y --with-new-pkgs && \
 5 |     apt-get install -y python3 python3-pip git curl && \
 6 |     apt-get clean
 7 | 
 8 | RUN mkdir /app
 9 | WORKDIR /app
10 | 
11 | ARG COMMIT=master
12 | RUN git clone https://github.com/coreweave/kubernetes-cloud && \
13 |     cd kubernetes-cloud && \
14 |     git checkout ${COMMIT} && \
15 |     cd .. && \
16 |     cp kubernetes-cloud/online-inference/stable-diffusion/service/* .  && \
17 |     cp kubernetes-cloud/online-inference/stable-diffusion/serializer/serialize.py . && \
18 |     rm -rf kubernetes-cloud
19 | 
20 | RUN pip3 install --no-cache-dir --upgrade pip && \
21 |     pip3 install --no-cache-dir -r requirements.txt
22 | 


--------------------------------------------------------------------------------
/sglang/Dockerfile:
--------------------------------------------------------------------------------
 1 | # syntax=docker/dockerfile:1.2
 2 | ARG BASE_IMAGE
 3 | ARG BUILDER_IMAGE="${BASE_IMAGE}"
 4 | 
 5 | FROM ${BUILDER_IMAGE} AS builder
 6 | 
 7 | ARG BUILD_TORCH_CUDA_ARCH_LIST='8.0 8.6 8.9 9.0 10.0+PTX'
 8 | 
 9 | ARG FLASHINFER_COMMIT='c04755e21f4d6fb7813c703f2b00a7ef012be9b8'
10 | ARG CUTLASS_COMMIT='b78588d1630aa6643bf021613717bafb705df4ef'
11 | ARG VLLM_COMMIT='5095e966069b9e65b7c4c63427e06cebacaad0a0'
12 | ARG SGLANG_COMMIT='4b6f62e2bc52a528551e9a21e7b0a4945c6115bb'
13 | ARG DECORD_COMMIT='d2e56190286ae394032a8141885f76d5372bd44b'
14 | # Building Triton is not currently enabled,
15 | # but this is the commit that would be used if it were
16 | ARG TRITON_COMMIT='1e0e51c4aeb3e1beea000da5d0e494f8b9ac40dd'
17 | 
18 | WORKDIR /build
19 | COPY build.bash /build/
20 | RUN mkdir /wheels && \
21 |     bash build.bash -a "${BUILD_TORCH_CUDA_ARCH_LIST}" && \
22 |     rm -rf /build/*
23 | COPY install.bash /wheels/
24 | 
25 | FROM ${BASE_IMAGE}
26 | RUN --mount=type=bind,from=builder,source=/wheels,target=/wheels \
27 |     cd /wheels && \
28 |     bash install.bash
29 | 


--------------------------------------------------------------------------------
/sglang/build.bash:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | set -xeo pipefail
  3 | export DEBIAN_FRONTEND=noninteractive
  4 | 
  5 | TORCH_CUDA_ARCH_LIST=''
  6 | FILTER_ARCHES=''
  7 | BUILD_TRITON=''
  8 | 
  9 | while getopts 'a:ft' OPT; do
 10 |   case "${OPT}" in
 11 |     a) TORCH_CUDA_ARCH_LIST="${OPTARG}" ;;
 12 |     f) FILTER_ARCHES='1' ;;
 13 |     t) BUILD_TRITON='1' ;;
 14 |     *) exit 92 ;;
 15 |   esac
 16 | done
 17 | 
 18 | export NVCC_APPEND_FLAGS='-gencode=arch=compute_100,code=[sm_100,compute_100] -gencode=arch=compute_100a,code=sm_100a --diag-suppress 174'
 19 | export TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST:-9.0 10.0+PTX}"
 20 | 
 21 | mkdir -p /wheels/logs
 22 | 
 23 | _BUILD() { python3 -m build -w -n -v -o /wheels "${1:-.}"; }
 24 | _LOG() { tee -a "/wheels/logs/${1:?}"; }
 25 | _CONSTRAINTS="$(python3 -m pip list | sed -En 's@^(torch(vision|audio)?)\s+(\S+)$@\1==\3@p')"
 26 | _PIP_INSTALL() {
 27 |   python3 -m pip install --no-cache-dir \
 28 |   --constraint=/dev/stdin <<< "${_CONSTRAINTS}" \
 29 |   "$@"
 30 | }
 31 | 
 32 | _PIP_INSTALL -U pip setuptools wheel build pybind11 ninja cmake
 33 | 
 34 | # triton (not compatible with torch 2.6)
 35 | if [ "${BUILD_TRITON}" = 1 ]; then (
 36 |   : "${TRITON_COMMIT:?}"
 37 |   echo 'Building triton-lang/triton'
 38 |   git clone --recursive --filter=blob:none https://github.com/triton-lang/triton
 39 |   cd triton
 40 |   git checkout "${TRITON_COMMIT}"
 41 |   _BUILD python |& _LOG triton.log
 42 | ); fi
 43 | 
 44 | # flashinfer
 45 | : "${FLASHINFER_COMMIT:?}"
 46 | : "${CUTLASS_COMMIT:?}"
 47 | (
 48 | echo 'Building flashinfer-ai/flashinfer'
 49 | git clone --recursive --filter=blob:none https://github.com/flashinfer-ai/flashinfer
 50 | cd flashinfer
 51 | git checkout "${FLASHINFER_COMMIT}"
 52 | sed -i 's/name = "flashinfer-python"/name = "flashinfer"/' pyproject.toml
 53 | git -C 3rdparty/cutlass checkout "${CUTLASS_COMMIT}"
 54 | _PIP_INSTALL -U optree
 55 | NVCC_APPEND_FLAGS="${NVCC_APPEND_FLAGS:+$NVCC_APPEND_FLAGS } --diag-suppress 20281,174" \
 56 |   FLASHINFER_ENABLE_AOT=1 _BUILD . |& _LOG flashinfer.log
 57 | )
 58 | 
 59 | # Setup cutlass repo for vLLM to use
 60 | git clone --recursive --filter=blob:none https://github.com/NVIDIA/cutlass
 61 | git -C cutlass checkout "${CUTLASS_COMMIT}"
 62 | 
 63 | # vLLM
 64 | : "${VLLM_COMMIT:?}"
 65 | (
 66 | echo 'Building vllm-project/vllm'
 67 | export VLLM_CUTLASS_SRC_DIR="${PWD}/cutlass"
 68 | test -d "${VLLM_CUTLASS_SRC_DIR}"
 69 | git clone --recursive --filter=blob:none https://github.com/vllm-project/vllm
 70 | cd vllm
 71 | git checkout "${VLLM_COMMIT}"
 72 | # For lsmod
 73 | apt-get -qq update && apt-get -qq install --no-install-recommends -y kmod
 74 | python3 use_existing_torch.py
 75 | _PIP_INSTALL -r requirements-build.txt
 76 | USE_CUDNN=1 USE_CUSPARSELT=1 _BUILD . |& _LOG vllm.log
 77 | )
 78 | 
 79 | # sglang
 80 | : "${SGLANG_COMMIT:?}"
 81 | (
 82 | echo 'Building sglang'
 83 | git clone --recursive --filter=blob:none https://github.com/sgl-project/sglang
 84 | cd sglang
 85 | git checkout "${SGLANG_COMMIT}"
 86 | (
 87 | cd sgl-kernel
 88 | git -C 3rdparty/cutlass checkout "${CUTLASS_COMMIT}"
 89 | git -C 3rdparty/flashinfer/3rdparty/cutlass checkout "${CUTLASS_COMMIT}"
 90 | 
 91 | ARCH_TRIPLE="$(gcc -print-multiarch)"
 92 | LIB_DIR="/usr/lib/${ARCH_TRIPLE:?}"
 93 | test -d "${LIB_DIR:?}"
 94 | PYTHON_API_VER="$(
 95 |   python3 --version | sed -En 's@Python ([0-9])\.([0-9]+)\..*@cp\1\2@p'
 96 | )"
 97 | ARCH_FILTER=()
 98 | if [ "${FILTER_ARCHES}" = 1 ]; then
 99 |   ARCH_FILTER=(-e 's@"-gencode=arch=compute_[78][0-9],code=sm_[78][0-9]",@#\0@')
100 | fi
101 | 
102 | sed -Ei \
103 |   "${ARCH_FILTER[@]}" \
104 |   -e 's@/usr/lib/x86_64-linux-gnu@'"${LIB_DIR}"'@' \
105 |   -e 's@(\s+)(\w.+manylinux2014_x86_64.+)@\1pass  # \2@' \
106 |   -e 's@\{"py_limited_api": "cp39"}@{"py_limited_api": "'"${PYTHON_API_VER:-cp310}"'"}@' \
107 |   setup.py
108 | SGL_KERNEL_ENABLE_BF16=1 SGL_KERNEL_ENABLE_FP8=1 SGL_KERNEL_ENABLE_SM90A=1 \
109 |   _BUILD . |& _LOG sglang.log
110 | )
111 | _BUILD python |& _LOG sglang.log
112 | )
113 | 
114 | # decord and xgrammar aren't available on PyPI for ARM64
115 | 
116 | if [ ! "$(uname -m)" = 'x86_64' ]; then
117 |   # xgrammar (for sglang); each step is its own statement so `set -e` aborts on failure
118 |   (
119 |   git clone --recursive --filter=blob:none -b v0.1.11 https://github.com/mlc-ai/xgrammar
120 |   cd xgrammar
121 |   (
122 |   mkdir build; cd build
123 |   cmake -S.. -B. -DCMAKE_BUILD_TYPE=Release -GNinja |& _LOG xgrammar.log
124 |   cmake --build . |& _LOG xgrammar.log
125 |   )
126 |   _BUILD python |& _LOG xgrammar.log
127 |   )
128 | 
129 |   # decord (for sglang); each step is its own statement so `set -e` aborts on failure
130 |   : "${DECORD_COMMIT:?}"
131 |   (
132 |   apt-get -qq update
133 |   apt-get -q install --no-install-recommends -y \
134 |     build-essential python3-dev python3-setuptools make cmake ffmpeg \
135 |     libavcodec-dev libavfilter-dev libavformat-dev libavutil-dev
136 |   git clone --recursive --filter=blob:none https://github.com/dmlc/decord
137 |   cd decord
138 |   git checkout "${DECORD_COMMIT}"
139 |   (
140 |   mkdir build; cd build
141 |   cmake -S.. -B. -DUSE_CUDA=0 -DCMAKE_BUILD_TYPE=Release -GNinja |& _LOG decord.log
142 |   cmake --build . |& _LOG decord.log
143 |   cp libdecord.so /wheels/libdecord.so
144 |   )
145 |   cd python
146 |   _BUILD . |& _LOG decord.log
147 |   )
148 | fi
149 | 
150 | apt-get clean
151 | 


--------------------------------------------------------------------------------
/sglang/install.bash:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | set -xeo pipefail
 3 | export DEBIAN_FRONTEND=noninteractive
 4 | 
 5 | _CONSTRAINTS="$(
 6 |   python3 -m pip list | sed -En 's@^(torch(vision|audio)?)\s+(\S+)$@\1==\3@p'
 7 | )"
 8 | _PIP_INSTALL() {
 9 |   python3 -m pip install --no-cache-dir \
10 |   --constraint=/dev/stdin <<< "${_CONSTRAINTS}" \
11 |   "$@"
12 | }
13 | 
14 | _PIP_INSTALL /wheels/*.whl
15 | if [ -x /wheels/libdecord.so ]; then  # separate statements below so `set -e` stops on any failure
16 |   apt-get -qq update
17 |   apt-get -q install --no-install-recommends -y libavfilter7 libavformat58
18 |   apt-get clean
19 |   cp /wheels/libdecord.so /usr/local/lib/; ldconfig
20 | fi
21 | 
22 | SGLANG_EXTRA_PIP_DEPENDENCIES=()
23 | if [ "$(uname -m)" = 'x86_64' ]; then
24 |   SGLANG_EXTRA_PIP_DEPENDENCIES=('decord' 'xgrammar>=0.1.10')
25 | fi
26 | _PIP_INSTALL \
27 |   'aiohttp' 'fastapi' \
28 |   'hf_transfer' 'huggingface_hub' 'interegular' 'modelscope' \
29 |   'orjson' 'packaging' 'pillow' 'prometheus-client>=0.20.0' \
30 |   'psutil' 'pydantic' 'python-multipart' 'pyzmq>=25.1.2' \
31 |   'torchao>=0.7.0' 'uvicorn' 'uvloop' \
32 |   'cuda-python' 'outlines>=0.0.44,<0.1.0' \
33 |   "${SGLANG_EXTRA_PIP_DEPENDENCIES[@]}"
34 | 


--------------------------------------------------------------------------------
/tensorizer/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM ghcr.io/coreweave/ml-containers/torch:es-22.04-3ce72cc-base-cuda12.2.2-torch2.1.2-vision0.16.2-audio2.1.2
 2 | ARG COMMIT=main
 3 | 
 4 | RUN mkdir /app
 5 | WORKDIR /app
 6 | 
 7 | RUN git clone https://github.com/coreweave/tensorizer && \
 8 |     cd tensorizer && \
 9 |     git checkout ${COMMIT} && \
10 |     pip3 install .


--------------------------------------------------------------------------------
/torch-extras/Dockerfile:
--------------------------------------------------------------------------------
  1 | # syntax=docker/dockerfile:1.2
  2 | 
  3 | ARG BASE_IMAGE
  4 | ARG DEEPSPEED_VERSION="0.14.4"
  5 | ARG APEX_COMMIT="a1df80457ba67d60cbdb0d3ddfb08a2702c821a8"
  6 | ARG DEEPSPEED_KERNELS_COMMIT="e77acc40b104696d4e73229b787d1ef29a9685b1"
  7 | ARG DEEPSPEED_KERNELS_CUDA_ARCH_LIST="80;86;89;90"
  8 | ARG XFORMERS_VERSION="0.0.28.post1"
  9 | ARG BUILD_MAX_JOBS=""
 10 | 
 11 | FROM alpine/git:2.36.3 AS apex-downloader
 12 | WORKDIR /git
 13 | ARG APEX_COMMIT
 14 | RUN git clone --filter=blob:none --depth 1 --no-single-branch --no-checkout \
 15 |       https://github.com/NVIDIA/apex && \
 16 |     cd apex && \
 17 |     git checkout "${APEX_COMMIT}" && \
 18 |     git submodule update --init --recursive --jobs 8 \
 19 |       --depth 1 --filter=blob:none && \
 20 |     find -type d -name docs -prune -exec rm -r '{}' ';'
 21 | 
 22 | 
 23 | FROM alpine/git:2.36.3 AS ds-kernels-downloader
 24 | WORKDIR /git
 25 | ARG DEEPSPEED_KERNELS_COMMIT
 26 | RUN git clone --filter=blob:none --depth 1 --no-single-branch --no-checkout \
 27 |       https://github.com/microsoft/DeepSpeed-Kernels ds-kernels && \
 28 |     cd ds-kernels && \
 29 |     git checkout "${DEEPSPEED_KERNELS_COMMIT}" && \
 30 |     git submodule update --init --recursive --jobs 8 \
 31 |       --depth 1 --filter=blob:none
 32 | 
 33 | 
 34 | # Dependencies requiring NVCC are built ahead of time in a separate stage
 35 | # so that the ~2 GiB dev library installations don't have to be included
 36 | # in the final image.
 37 | FROM ${BASE_IMAGE} AS builder-base
 38 | RUN export \
 39 |       CUDA_MAJOR_VERSION=$(echo $CUDA_VERSION | cut -d. -f1) \
 40 |       CUDA_MINOR_VERSION=$(echo $CUDA_VERSION | cut -d. -f2) && \
 41 |     export \
 42 |       CUDA_PACKAGE_VERSION="${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}" && \
 43 |     apt-get install -y --no-install-recommends --no-upgrade \
 44 |       cuda-nvcc-${CUDA_PACKAGE_VERSION} \
 45 |       cuda-nvml-dev-${CUDA_PACKAGE_VERSION} \
 46 |       libcurand-dev-${CUDA_PACKAGE_VERSION} \
 47 |       libcublas-dev-${CUDA_PACKAGE_VERSION} \
 48 |       libcusparse-dev-${CUDA_PACKAGE_VERSION} \
 49 |       libcusolver-dev-${CUDA_PACKAGE_VERSION} \
 50 |       cuda-profiler-api-${CUDA_PACKAGE_VERSION} \
 51 |       cuda-nvtx-${CUDA_PACKAGE_VERSION} \
 52 |       cuda-nvrtc-dev-${CUDA_PACKAGE_VERSION} && \
 53 |     apt-get -qq update && \
 54 |     apt-get install -y --no-install-recommends \
 55 |       libaio-dev \
 56 |       ninja-build && \
 57 |     apt-get clean
 58 | 
 59 | # Install the cuDNN dev package for building Apex
 60 | # The cuDNN runtime is installed in the base torch image
 61 | COPY --chmod=755 install_cudnn.sh /tmp/install_cudnn.sh
 62 | RUN /tmp/install_cudnn.sh "${CUDA_VERSION}" dev && \
 63 |     rm /tmp/install_cudnn.sh
 64 | 
 65 | # Add Kitware's apt repository to get a newer version of CMake (-n skips the implicit index refresh)
 66 | RUN apt-get -qq update && apt-get -qq install -y \
 67 |       software-properties-common lsb-release && \
 68 |     { wget -qO - https://apt.kitware.com/keys/kitware-archive-latest.asc \
 69 |     | gpg --dearmor -o /etc/apt/trusted.gpg.d/kitware.gpg; } && \
 70 |     apt-add-repository -n "deb https://apt.kitware.com/ubuntu/ $(lsb_release -cs) main" && \
 71 |     apt-get -qq update && \
 72 |     apt-get -qq install -y 'cmake=3.31.6-*' 'cmake-data=3.31.6-*' && \
 73 |     apt-get clean && \
 74 |     python3 -m pip install --no-cache-dir 'cmake==3.31.6'
 75 | 
 76 | # Update compiler (GCC) and linker (LLD) versions
 77 | # gfortran-11 is just for compiler_wrapper.f95
 78 | RUN LLVM_VERSION='18' && \
 79 |     apt-get -qq update && apt-get -qq install --no-install-recommends -y \
 80 |       gcc-11 g++-11 gfortran-11 "lld-$LLVM_VERSION" && \
 81 |     apt-get clean && \
 82 |     update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11 && \
 83 |     update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-11 11 && \
 84 |     update-alternatives --install \
 85 |       /usr/bin/gfortran gfortran /usr/bin/gfortran-11 11 && \
 86 |     update-alternatives --install /usr/bin/ld ld "/usr/bin/ld.lld-$LLVM_VERSION" 1
 87 | 
 88 | RUN mkdir /wheels /build
 89 | WORKDIR /build
 90 | 
 91 | # DeepSpeed forces -march=native into the compiler options,
 92 | # making the result dependent on the processor architecture
 93 | # used on the builder machine.
 94 | # The compiler wrapper normalizes -march=native to -march=skylake
 95 | # along with a couple other transformations before invoking GCC.
 96 | COPY compiler_wrapper.f95 .
 97 | ARG AMD64_NATIVE_ARCH="skylake"
 98 | ARG ARM64_NATIVE_ARCH="armv8.5-a+nopredres"
 99 | RUN if [ "$(uname -m)" = "aarch64" ]; then \
100 |       NATIVE="WRAPPER_NATIVE=\"${ARM64_NATIVE_ARCH}\"" && \
101 |       AVX='WRAPPER_NO_AVX'; \
102 |     else \
103 |       NATIVE="WRAPPER_NATIVE=\"${AMD64_NATIVE_ARCH}\"" && \
104 |       AVX='WRAPPER_AVX="AVX256"'; \
105 |     fi && \
106 |     gfortran -ffree-line-length-512 -cpp -O3 "-D${NATIVE}" "-D${AVX}" ./compiler_wrapper.f95 -o ./compiler && rm ./compiler_wrapper.f95
107 | 
108 | COPY --chmod=755 effective_cpu_count.sh .
109 | COPY --chmod=755 scale.sh .
110 | 
111 | ARG BUILD_NVCC_APPEND_FLAGS="-gencode=arch=compute_90a,code=sm_90a"
112 | RUN FLAGS="$BUILD_NVCC_APPEND_FLAGS" && \
113 |     case "${NV_CUDA_LIB_VERSION}" in 12.[89].*) \
114 |       FLAGS="${FLAGS} -gencode=arch=compute_100,code=sm_100 -gencode=arch=compute_100a,code=sm_100a" ;; \
115 |     esac && \
116 |     echo "-Wno-deprecated-gpu-targets -diag-suppress 191,186,177${FLAGS:+ $FLAGS}" > /build/nvcc.conf
117 | ARG BUILD_MAX_JOBS
118 | 
119 | 
120 | FROM builder-base as deepspeed-builder
121 | 
122 | ARG DEEPSPEED_KERNELS_CUDA_ARCH_LIST
123 | RUN --mount=type=bind,from=ds-kernels-downloader,source=/git/ds-kernels,target=ds-kernels/,rw \
124 |     export NVCC_APPEND_FLAGS="$(cat /build/nvcc.conf)" && \
125 |     cd ds-kernels && \
126 |     export CUDA_ARCH_LIST="${DEEPSPEED_KERNELS_CUDA_ARCH_LIST}" && \
127 |     echo "CUDA_ARCH_LIST: ${CUDA_ARCH_LIST}" && \
128 |     python3 -m pip wheel -w /wheels \
129 |       --no-cache-dir --no-build-isolation --no-deps . && \
130 |     python3 -m pip install /wheels/*.whl
131 | 
132 | # DeepSpeed build flags
133 | # See: https://www.deepspeed.ai/tutorials/advanced-install
134 | ARG DS_BUILD_OPS="0"
135 | ARG DS_BUILD_CCL_COMM="0"
136 | ARG DS_BUILD_CPU_ADAM="1"
137 | ARG DS_BUILD_CPU_LION="1"
138 | # Requires CUTLASS
139 | ARG DS_BUILD_EVOFORMER_ATTN="0"
140 | ARG DS_BUILD_FUSED_ADAM="1"
141 | ARG DS_BUILD_FUSED_LION="1"
142 | ARG DS_BUILD_CPU_ADAGRAD="1"
143 | ARG DS_BUILD_FUSED_LAMB="1"
144 | ARG DS_BUILD_QUANTIZER="1"
145 | ARG DS_BUILD_RANDOM_LTD="1"
146 | # sparse_attn has issues with PyTorch >= 2.0.0 as of DeepSpeed 0.9.4
147 | ARG DS_BUILD_SPARSE_ATTN="0"
148 | ARG DS_BUILD_TRANSFORMER="1"
149 | ARG DS_BUILD_TRANSFORMER_INFERENCE="1"
150 | ARG DS_BUILD_STOCHASTIC_TRANSFORMER="1"
151 | ARG DS_BUILD_UTILS="1"
152 | ARG DS_BUILD_AIO="1"
153 | 
154 | ARG DEEPSPEED_VERSION
155 | 
156 | SHELL ["/bin/bash", "-o", "pipefail", "-c"]
157 | RUN export NVCC_APPEND_FLAGS="$(cat /build/nvcc.conf)" && \
158 |     python3 -m pip install -U --no-cache-dir \
159 |       setuptools wheel pip py-cpuinfo && \
160 |     if python3 -m pip show torch | grep 'Version: 2\.[1-9]' > /dev/null; then \
161 |       # DeepSpeed's AIO extension is incompatible with the C++17 requirement
162 |       # of PyTorch 2.1 and later 2.x releases (as of DeepSpeed 0.10.1).
163 |       # See: https://github.com/microsoft/DeepSpeed/pull/3976
164 |       export DS_BUILD_AIO='0'; \
165 |     fi && \
166 |     { \
167 |       # DeepSpeed doesn't handle blank environment variables
168 |       # in the same way as unset ones, so clear any blank ones.
169 |       for VAR in \
170 |         DS_BUILD_OPS \
171 |         DS_BUILD_CCL_COMM \
172 |         DS_BUILD_CPU_ADAM \
173 |         DS_BUILD_CPU_LION \
174 |         DS_BUILD_EVOFORMER_ATTN \
175 |         DS_BUILD_FUSED_ADAM \
176 |         DS_BUILD_FUSED_LION \
177 |         DS_BUILD_CPU_ADAGRAD \
178 |         DS_BUILD_FUSED_LAMB \
179 |         DS_BUILD_QUANTIZER \
180 |         DS_BUILD_RANDOM_LTD \
181 |         DS_BUILD_SPARSE_ATTN \
182 |         DS_BUILD_TRANSFORMER \
183 |         DS_BUILD_TRANSFORMER_INFERENCE \
184 |         DS_BUILD_STOCHASTIC_TRANSFORMER \
185 |         DS_BUILD_UTILS \
186 |         DS_BUILD_AIO; \
187 |       do if [[ -z ${!VAR} ]]; then unset ${VAR}; fi; done; \
188 |     } && \
189 |     CC=$(realpath -e ./compiler) \
190 |       MAX_JOBS="${BUILD_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 4 24)}" \
191 |       python3 -m pip wheel -w /wheels \
192 |       --no-cache-dir --no-build-isolation --no-deps -v \
193 |       deepspeed==${DEEPSPEED_VERSION} && \
194 |     rm ./*
195 | SHELL ["/bin/sh", "-c"]
196 | 
197 | WORKDIR /wheels
198 | 
199 | 
200 | FROM builder-base as apex-builder
201 | 
202 | RUN LIBNCCL2_VERSION=$(dpkg-query --showformat='${Version}' --show libnccl2) && \
203 |     apt-get -qq update && apt-get install -y --no-install-recommends \
204 |       libnccl-dev=$LIBNCCL2_VERSION && \
205 |     apt-get clean
206 | 
207 | # --distributed_adam, --distributed_lamb, and --group_norm aren't documented
208 | # in the Apex README, but are defined in its setup.py config.
209 | RUN --mount=type=bind,from=apex-downloader,source=/git/apex,target=apex/,rw \
210 |     export NVCC_APPEND_FLAGS="$(cat /build/nvcc.conf)" && \
211 |     python3 -m pip install -U --no-cache-dir \
212 |       packaging setuptools wheel pip && \
213 |     CUDA_MAJOR_VERSION=$(echo "${CUDA_VERSION}" | cut -d. -f1) && \
214 |     CHECK_VERSION() { \
215 |       dpkg-query --status "$1" 2>/dev/null \
216 |       | sed -ne 's/Version: //p' \
217 |       | grep .; \
218 |     } && \
219 |     LIBCUDNN_VER="$( \
220 |       CHECK_VERSION libcudnn8-dev || \
221 |       CHECK_VERSION "libcudnn9-dev-cuda-${CUDA_MAJOR_VERSION}" || \
222 |       :; \
223 |     )" && \
224 |     export CC=$(realpath -e ./compiler) && \
225 |     export MAX_JOBS="${BUILD_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 8 24)}" && \
226 |     printf -- '--config-settings="--build-option=%s" ' $( \
227 |       echo \
228 |         --cpp_ext \
229 |         --cuda_ext \
230 |         --distributed_adam \
231 |         --distributed_lamb \
232 |         --permutation_search \
233 |         --xentropy \
234 |         --focal_loss \
235 |         --group_norm \
236 |         --index_mul_2d \
237 |         --deprecated_fused_adam \
238 |         --deprecated_fused_lamb \
239 |         --fast_layer_norm \
240 |         --fmha \
241 |         --fast_multihead_attn \
242 |         --transducer \
243 |         --peer_memory \
244 |         --nccl_p2p \
245 |         --fast_bottleneck && \
246 |       if [ -n "$LIBCUDNN_VER" ]; then \
247 |         echo \
248 |           --bnp \
249 |           --cudnn_gbn \
250 |           --fused_conv_bias_relu; \
251 |       fi; \
252 |     ) > ./apex-extensions.conf && \
253 |     echo "Extensions: $(cat ./apex-extensions.conf)" && \
254 |     cd apex && \
255 |     xargs -a ../apex-extensions.conf python3 -m pip wheel -w /wheels -v --no-cache-dir --no-build-isolation --no-deps ./
256 | 
257 | WORKDIR /wheels
258 | 
259 | FROM builder-base as xformers-builder
260 | 
261 | ARG XFORMERS_VERSION
262 | 
263 | SHELL ["/bin/bash", "-o", "pipefail", "-c"]
264 | RUN export NVCC_APPEND_FLAGS="$(cat /build/nvcc.conf)" && \
265 |     python3 -m pip install -U --no-cache-dir \
266 |       setuptools wheel pip && \
267 |     CC=$(realpath -e ./compiler) \
268 |       MAX_JOBS=1 \
269 |       PYTHONUNBUFFERED=1 \
270 |       XFORMERS_DISABLE_FLASH_ATTN=1 \
271 |       python3 -m pip wheel -w /wheels -v \
272 |       --no-cache-dir --no-build-isolation --no-deps \
273 |       --no-binary=xformers \
274 |       xformers==${XFORMERS_VERSION} 2> \
275 |     >(grep -Ev --line-buffered 'ptxas info\s*:|bytes spill stores' >&2)
276 | 
277 | SHELL ["/bin/sh", "-c"]
278 | 
279 | WORKDIR /build
280 | 
281 | FROM ${BASE_IMAGE}
282 | 
283 | RUN apt-get -qq update && \
284 |     apt-get install -y --no-install-recommends libaio-dev && \
285 |     apt-get clean
286 | 
287 | 
288 | RUN --mount=type=bind,from=deepspeed-builder,source=/wheels,target=/tmp/wheels \
289 |     python3 -m pip install --no-cache-dir /tmp/wheels/*.whl
290 | RUN --mount=type=bind,from=apex-builder,source=/wheels,target=/tmp/wheels \
291 |     python3 -m pip install --no-cache-dir /tmp/wheels/*.whl
292 | RUN --mount=type=bind,from=xformers-builder,source=/wheels,target=/tmp/wheels \
293 |     python3 -m pip install --no-cache-dir /tmp/wheels/*.whl
294 | 


--------------------------------------------------------------------------------
/torch-extras/compiler_wrapper.f95:
--------------------------------------------------------------------------------
 1 | #ifndef WRAPPER_NATIVE
 2 | #define WRAPPER_NATIVE "skylake"
 3 | #endif
 4 | 
 5 | #ifndef WRAPPER_CC
 6 | #define WRAPPER_CC "gcc"
 7 | #endif
 8 | 
 9 | #ifndef WRAPPER_AVX
10 | #define WRAPPER_AVX "AVX256"
11 | #endif
12 | 
13 | PROGRAM compiler_wrapper
14 |     ! Wraps C compiler invocations: every argument is forwarded to the wrapped compiler,
15 |     ! replacing -D__AVX512__, -D__AVX256__, and -D__SCALAR__ preprocessor definitions
16 |     ! with -D__<WRAPPER_AVX>__ (or dropping them under WRAPPER_NO_AVX), and -march=native
17 |     ! with -march=<WRAPPER_NATIVE>, for better reproducibility and compatibility.
18 |     IMPLICIT NONE
19 |     INTEGER :: i, exitcode = 0, full_length = 0, truncated = 0
20 |     CHARACTER(len=:), ALLOCATABLE :: arg, command
21 |     ALLOCATE(CHARACTER(len=128) :: arg)
22 |     command = WRAPPER_CC
23 | 
24 |     DO i = 1, COMMAND_ARGUMENT_COUNT()
25 |         DO  ! Retry with a larger buffer until the whole argument fits
26 |             CALL GET_COMMAND_ARGUMENT(i, arg, full_length, truncated)
27 |             IF (truncated == 0) THEN
28 |                 EXIT
29 |             ELSE IF (truncated == -1) THEN
30 |                 DEALLOCATE(arg)
31 |                 ALLOCATE(CHARACTER(len=full_length) :: arg)
32 |             ELSE
33 |                 CALL EXIT(95)  ! The argument could not be retrieved at all
34 |             END IF
35 |         END DO
36 |         IF (arg == "-march=native") THEN
37 |             command = command // (" '-march=" // WRAPPER_NATIVE // "'")
38 |         ELSE IF ( &
39 |             arg == "-D__AVX512__" &
40 |             .OR. arg == "-D__AVX256__" &
41 |             .OR. arg == "-D__SCALAR__" &
42 |         ) THEN
43 | #ifndef WRAPPER_NO_AVX
44 |             command = command // (" '-D__" // WRAPPER_AVX // "__'")
45 | #endif
46 |         ELSE
47 |             command = command // shell_escaped(arg(1:full_length))  ! Exclude the blank padding of the buffer
48 |         END IF
49 |     END DO
50 |     CALL SYSTEM(command, exitcode)
51 |     IF (exitcode > 255) THEN  ! Fold large statuses into 1..255 so a failure never exits as 0
52 |         exitcode = MAX(IAND(exitcode, 255), 1)
53 |     END IF
54 |     CALL EXIT(exitcode)
55 | 
56 | 
57 |     CONTAINS
58 |         FUNCTION shell_escaped(str) RESULT(out)
59 |             ! Turns [str] into [ 'str'] and replaces all internal ['] characters
60 |             ! with ['"'"']. Every character of str is kept, including trailing blanks.
61 |             IMPLICIT NONE
62 |             CHARACTER(len=*), INTENT(IN) :: str
63 |             CHARACTER(len=:), ALLOCATABLE :: out
64 |             INTEGER :: old_i, out_i, old_len, out_len
65 | 
66 |             old_len = LEN(str)  ! Not LEN_TRIM: trailing blanks belong to the argument
67 |             ! Figure out the new length to allocate by scanning `str`.
68 |             ! This always needs to add at least [ '] at the beginning
69 |             ! and ['] at the end, so the length increases by at least 3.
70 |             out_len = old_len + 3
71 |             DO old_i = 1, old_len
72 |                 IF (str(old_i:old_i) == "'") THEN
73 |                     out_len = out_len + 4  ! One quote becomes five characters
74 |                 END IF
75 |             END DO
76 |             ALLOCATE(CHARACTER(len=out_len) :: out)
77 | 
78 |             ! Copy over the string, performing necessary escapes.
79 |             out(1:2) = " '"
80 |             out_i = 3
81 |             DO old_i = 1, old_len
82 |                 IF (str(old_i:old_i) == "'") THEN
83 |                     ! Escape internal single-quotes
84 |                     out(out_i:out_i + 4) = '''"''"'''
85 |                     out_i = out_i + 5
86 |                 ELSE
87 |                     ! No escaping needed
88 |                     out(out_i:out_i) = str(old_i:old_i)
89 |                     out_i = out_i + 1
90 |                 END IF
91 |             END DO
92 |             out(out_i:out_i) = "'"
93 |         END FUNCTION
94 | END PROGRAM
95 | 


--------------------------------------------------------------------------------
/torch-extras/effective_cpu_count.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | # Prints the number of CPUs this process can effectively use: the cgroup
 3 | # CPU quota when one can be read, otherwise the number of online processors.
 4 | 
 5 | # Prints the cgroup CPU quota in whole CPUs (rounded down, but never less
 6 | # than 1) and returns 0. Returns 1 when no usable quota is found.
 7 | # The body runs in a subshell, so none of its variables leak to the caller.
 8 | CPU_QUOTA() (
 9 |     CGROUP='/sys/fs/cgroup';
10 |     CGROUP_V1="$CGROUP/cpu,cpuacct";
11 |     CGROUP_V1_QUOTA="$CGROUP_V1/cpu.cfs_quota_us";
12 |     CGROUP_V1_PERIOD="$CGROUP_V1/cpu.cfs_period_us";
13 |     # NOTE(review): only the user.slice cpu.max file is consulted for cgroup v2;
14 |     # confirm whether "$CGROUP/cpu.max" should be checked as well.
15 |     CGROUP_V2="$CGROUP/user.slice/cpu.max";
16 |     if [ ! -d "$CGROUP" ]; then
17 |         return 1;
18 |     elif [ -f "$CGROUP_V1_QUOTA" ] && [ -f "$CGROUP_V1_PERIOD" ]; then
19 |         IFS='' read -r QUOTA 2> /dev/null < "$CGROUP_V1_QUOTA" || return 1;
20 |         IFS='' read -r PERIOD 2> /dev/null < "$CGROUP_V1_PERIOD" || return 1;
21 |     elif [ -f "$CGROUP_V2" ]; then
22 |         # cpu.max holds the quota and the period on a single line
23 |         IFS=' ' read -r QUOTA PERIOD 2> /dev/null < "$CGROUP_V2" || return 1;
24 |     else
25 |         return 1;
26 |     fi;
27 | 
28 |     # Non-numeric or non-positive values are rejected, so the caller falls back
29 |     if [ "$QUOTA" -gt 0 ] 2> /dev/null && [ "$PERIOD" -gt 0 ] 2> /dev/null; then
30 |         CPUS=$((QUOTA / PERIOD));
31 |         # A quota below one full CPU floors to 0, which callers would turn
32 |         # into a job count of 0, so report at least one CPU.
33 |         echo $((CPUS > 0 ? CPUS : 1));
34 |         return 0;
35 |     else
36 |         return 1;
37 |     fi;
38 | )
39 | 
40 | # Prefers the cgroup quota and falls back to the online processor count.
41 | EFFECTIVE_CPU_COUNT() {
42 |     CPU_QUOTA || getconf _NPROCESSORS_ONLN;
43 | }
44 | 
45 | EFFECTIVE_CPU_COUNT;
46 | 


--------------------------------------------------------------------------------
/torch-extras/install_cudnn.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | # Usage: install_cudnn.sh <CUDA_VERSION> <dev|runtime> -- installs cuDNN 9 via apt unless a cuDNN package is found
 3 | CUDA_VERSION="$1";
 4 | if [ -z "$CUDA_VERSION" ]; then
 5 |     exit 14;  # No CUDA version argument was given
 6 | fi;
 7 | # The second argument selects which package flavour is checked for and installed
 8 | INSTALL_DEV="$2";
 9 | if [ "$INSTALL_DEV" = "dev" ]; then
10 |     echo "Ensuring installation of cuDNN (dev)";
11 |     DEV_SUFFIX="-dev";
12 |     DEV_PREFIX="";
13 | elif [ "$INSTALL_DEV" = "runtime" ]; then
14 |     echo "Ensuring installation of cuDNN (runtime)";
15 |     DEV_SUFFIX="";
16 |     DEV_PREFIX="lib";
17 | else
18 |     exit 15;  # The second argument was neither "dev" nor "runtime"
19 | fi;
20 | # Prints the Version field that dpkg reports for package $1; fails when there is none
21 | CHECK_VERSION() {
22 |     dpkg-query --status "$1" 2>/dev/null \
23 |     | sed -ne 's/Version: //p' \
24 |     | grep .;
25 | }
26 | # Look for a cuDNN 8 package first, then cuDNN 9 for this CUDA major; the result stays empty when neither is found
27 | CUDA_MAJOR_VERSION=$(echo "$CUDA_VERSION" | cut -d. -f1);
28 | LIBCUDNN_VER="$(
29 |     CHECK_VERSION "libcudnn8${DEV_SUFFIX}" || \
30 |     CHECK_VERSION "libcudnn9${DEV_SUFFIX}-cuda-${CUDA_MAJOR_VERSION}" || \
31 |     :;
32 | )" || exit 16;
33 | # Install only when no cuDNN package was found above
34 | if [ -z "$LIBCUDNN_VER" ]; then
35 |     apt-get -qq update && \
36 |     apt-get -qq install --no-upgrade -y "${DEV_PREFIX}cudnn9-cuda-${CUDA_MAJOR_VERSION}" && \
37 |     apt-get clean && \
38 |     ldconfig;
39 | else
40 |     echo "Found cuDNN version ${LIBCUDNN_VER}"
41 | fi;
42 | 


--------------------------------------------------------------------------------
/torch-extras/scale.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | set -e;
 4 | 
 5 | VAL="$1";
 6 | DIVISOR="$2";
 7 | MAXIMUM="$3";
 8 | 
 9 | [ -n "$VAL" ];
10 | 
11 | if [ -n "$DIVISOR" ];
12 | then VAL="$((( $VAL + $DIVISOR - 1 ) / $DIVISOR))";
13 | fi;
14 | 
15 | if [ -n "$MAXIMUM" ];
16 | then VAL="$((VAL > MAXIMUM ? MAXIMUM : VAL))";
17 | fi;
18 | 
19 | echo "$VAL";
20 | 


--------------------------------------------------------------------------------
/torch/Dockerfile:
--------------------------------------------------------------------------------
  1 | # syntax=docker/dockerfile:1.7
  2 | ARG BUILDER_BASE_IMAGE="nvidia/cuda:12.9.0-devel-ubuntu22.04"
  3 | ARG FINAL_BASE_IMAGE="nvidia/cuda:12.9.0-base-ubuntu22.04"
  4 | 
  5 | ARG BUILD_TORCH_VERSION="2.7.0"
  6 | ARG BUILD_TORCH_VISION_VERSION="0.22.0"
  7 | ARG BUILD_TORCH_AUDIO_VERSION="2.7.0"
  8 | ARG BUILD_TRANSFORMERENGINE_VERSION="1.13"
  9 | ARG BUILD_FLASH_ATTN_VERSION="2.7.4.post1"
 10 | ARG BUILD_FLASH_ATTN_3_VERSION="2.7.2.post1"
 11 | ARG BUILD_TRITON_VERSION=""
 12 | ARG BUILD_TRITON="1"
 13 | ARG BUILD_TORCH_CUDA_ARCH_LIST="7.0 8.0 8.9 9.0 10.0+PTX"
 14 | ARG BUILD_TRANSFORMERENGINE_CUDA_ARCH_LIST="70;80;89;90;100"
 15 | 
 16 | ARG AOCL_BASE="/opt/aocl"
 17 | ARG AOCL_VER="4.2.0"
 18 | ARG AOCL_URL="https://download.amd.com/developer/eula/aocl/aocl-4-2/aocl-linux-aocc-4.2.0.tar.gz"
 19 | 
 20 | # region Downloads
 21 | 
 22 | # Clone PyTorch repositories independently from all other build steps
 23 | # for cache-friendliness and parallelization
 24 | FROM alpine/git:2.40.1 AS downloader-base
 25 | WORKDIR /git
 26 | RUN git config --global advice.detachedHead false
 27 | 
 28 | COPY <<-"EOT" /git/clone.sh
 29 | 	#!/bin/sh
 30 | 	REPO="https://github.com/$1";
 31 | 	DEST="$2";
 32 | 	REF="$3";
 33 | 
 34 | 	CLONE() { git clone -j8 --depth=1 --filter=blob:none "$@"; };
 35 | 
 36 | 	# Try cloning REF as a tag prefixed with "v", otherwise fall back
 37 | 	# to git checkout for commit hashes
 38 | 	CLONE --recurse-submodules --shallow-submodules --also-filter-submodules --no-tags \
 39 | 	  "$REPO" -b "v$REF" "$DEST" || { \
 40 | 	    CLONE --no-single-branch --no-checkout "$REPO" "$DEST" && \
 41 | 	    git -C "$DEST" checkout "$REF" && \
 42 | 	    git -C "$DEST" submodule update --init --filter=blob:none --depth=1 --recursive --jobs 8; \
 43 | 	};
 44 | EOT
 45 | 
 46 | RUN chmod 755 /git/clone.sh
 47 | 
 48 | 
 49 | FROM downloader-base AS pytorch-downloader
 50 | ARG BUILD_TORCH_VERSION
 51 | # Includes a patch for a foreach bug in PyTorch v2.5.1
 52 | RUN ./clone.sh pytorch/pytorch pytorch "${BUILD_TORCH_VERSION}" && \
 53 |     if [ "${BUILD_TORCH_VERSION}" = '2.5.1' ]; then \
 54 |       wget 'https://github.com/pytorch/pytorch/commit/1cdaf1d85f5e4b3f8952fd0737a1afeb16995d13.patch' -qO- \
 55 |       | git -C pytorch apply; \
 56 |     fi && \
 57 |     rm -rf pytorch/.git
 58 | 
 59 | FROM downloader-base AS torchvision-downloader
 60 | ARG BUILD_TORCH_VISION_VERSION
 61 | RUN ./clone.sh pytorch/vision vision "${BUILD_TORCH_VISION_VERSION}" && \
 62 |     rm -rf vision/.git
 63 | 
 64 | FROM downloader-base AS torchaudio-downloader
 65 | ARG BUILD_TORCH_AUDIO_VERSION
 66 | RUN ./clone.sh pytorch/audio audio "${BUILD_TORCH_AUDIO_VERSION}"
 67 | # The torchaudio build requires that this directory remain a full git repository,
 68 | # so no rm -rf audio/.git is done for this one.
 69 | 
 70 | # torchaudio is broken for CUDA 12.5+ without this patch (up to and including v2.5.0)
 71 | # See https://github.com/pytorch/audio/pull/3811
 72 | # Fixed as a side effect of https://github.com/pytorch/audio/pull/3843 in versions after v2.5.0
 73 | COPY torchaudio-cu125-pr3811.patch /git/patch
 74 | RUN if grep -qF '#include <float.h>' \
 75 |       'audio/src/libtorchaudio/cuctc/src/ctc_prefix_decoder_kernel_v2.cu'; \
 76 |       then :; else git -C audio apply -v --stat --apply /git/patch; \
 77 |     fi && \
 78 |     rm /git/patch
 79 | 
 80 | FROM downloader-base AS transformerengine-downloader
 81 | ARG BUILD_TRANSFORMERENGINE_VERSION
 82 | RUN ./clone.sh NVIDIA/TransformerEngine TransformerEngine "${BUILD_TRANSFORMERENGINE_VERSION}"
 83 | 
 84 | FROM downloader-base AS flash-attn-downloader
 85 | ARG BUILD_FLASH_ATTN_VERSION
 86 | RUN ./clone.sh Dao-AILab/flash-attention flash-attention "${BUILD_FLASH_ATTN_VERSION}"
 87 | 
 88 | FROM downloader-base AS flash-attn-3-downloader
 89 | ARG BUILD_FLASH_ATTN_3_VERSION
 90 | RUN if [ -n "$BUILD_FLASH_ATTN_3_VERSION" ]; then \
 91 |       ./clone.sh Dao-AILab/flash-attention flash-attention "${BUILD_FLASH_ATTN_3_VERSION}"; \
 92 |     else \
 93 |       mkdir flash-attention; \
 94 |     fi
 95 | 
 96 | FROM downloader-base AS triton-version
 97 | ENV TRITON_COMMIT_FILE='.ci/docker/ci_commit_pins/triton.txt'
 98 | COPY --link --from=pytorch-downloader "/git/pytorch/${TRITON_COMMIT_FILE}" /git/version.txt
 99 | ARG BUILD_TRITON_VERSION
100 | RUN if [ -n "${BUILD_TRITON_VERSION}" ]; then \
101 |       echo "${BUILD_TRITON_VERSION}" > /git/version.txt; \
102 |     fi
103 | 
104 | FROM downloader-base AS triton-downloader
105 | COPY --link --from=triton-version /git/version.txt /git/version.txt
106 | ARG BUILD_TRITON
107 | RUN if [ "${BUILD_TRITON}" = '1' ]; then \
108 |       ./clone.sh openai/triton triton "$(cat /git/version.txt)"; \
109 |     else \
110 |       mkdir triton; \
111 |     fi
112 | 
113 | FROM alpine/curl:8.7.1 AS aocl-downloader
114 | WORKDIR /tmp/install
115 | 
116 | RUN apk add --no-cache bash
117 | 
118 | ARG AOCL_BASE
119 | ARG AOCL_VER
120 | ARG AOCL_URL
121 | 
122 | RUN curl -sSfo- "${AOCL_URL}" | tar xzf - --strip-components 1 && \
123 |     INSTALL_LIB() { ./install.sh -l "$1" -t "${AOCL_BASE}" -i lp64; } && \
124 |     INSTALL_LIB blis && \
125 |     INSTALL_LIB libflame && \
126 |     INSTALL_LIB utils && \
127 |     . ./amd-libs.cfg && \
128 |     rm -r "${AOCL_ROOT}/include_ILP64" && \
129 |     rm -r "${AOCL_ROOT}/lib_ILP64" && \
130 |     ln -s "${AOCL_ROOT}/amd-libs.cfg" "${AOCL_BASE}/amd-libs.cfg" && \
131 |     ln -s "${AOCL_ROOT}/include" "${AOCL_BASE}/include" && \
132 |     ln -s "${AOCL_ROOT}/lib" "${AOCL_BASE}/lib" && \
133 |     echo "${AOCL_BASE}/lib" \
134 |     | install -m 0644 /dev/stdin "${AOCL_BASE}/aocl.conf" && \
135 |     rm -r ./*
136 | 
137 | # endregion Downloads
138 | 
139 | ## Build PyTorch on a builder image.
140 | FROM ${BUILDER_BASE_IMAGE} AS builder-base-shared
141 | ENV DEBIAN_FRONTEND=noninteractive
142 | 
143 | ARG BUILD_CCACHE_SIZE="1Gi"
144 | 
145 | # ninja-build, ccache, and lld are optional but improve the build
146 | RUN apt-get -qq update && apt-get -qq install -y \
147 |       libncurses5 python3 python3-pip git apt-utils ssh ca-certificates \
148 |       libomp5 libpng-dev libjpeg-dev pkg-config python3-distutils \
149 |       build-essential ninja-build && \
150 |     apt-get clean && \
151 |     /usr/bin/python3 -m pip install --no-cache-dir --upgrade pip && \
152 |     update-alternatives --install /usr/bin/python python /usr/bin/python3 1 && \
153 |     update-alternatives --install /usr/bin/pip pip /usr/bin/pip3 1 && \
154 |     ln -s libomp.so.5 "/usr/lib/$(gcc -print-multiarch)/libomp.so" && \
155 |     ldconfig
156 | 
157 | COPY --link --chmod=755 install_cudnn.sh /tmp/install_cudnn.sh
158 | 
159 | RUN export \
160 |       CUDA_MAJOR_VERSION=$(echo "$CUDA_VERSION" | cut -d. -f1) \
161 |       CUDA_MINOR_VERSION=$(echo "$CUDA_VERSION" | cut -d. -f2) && \
162 |     export \
163 |       CUDA_PACKAGE_VERSION="${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}" && \
164 |     apt-get -qq update && \
165 |     apt-get -qq install --no-upgrade -y \
166 |       cuda-nvtx-${CUDA_PACKAGE_VERSION} \
167 |       cuda-nvrtc-dev-${CUDA_PACKAGE_VERSION} && \
168 |     /tmp/install_cudnn.sh "${CUDA_VERSION}" dev && \
169 |     rm /tmp/install_cudnn.sh && \
170 |     apt-get clean
171 | 
172 | # Add Kitware's apt repository to get a newer version of CMake
173 | RUN apt-get -qq update && apt-get -qq install -y \
174 |       software-properties-common lsb-release && \
175 |     { wget -qO - https://apt.kitware.com/keys/kitware-archive-latest.asc \
176 |     | gpg --dearmor -o /etc/apt/trusted.gpg.d/kitware.gpg; } && \
177 |     apt-add-repository -n "deb https://apt.kitware.com/ubuntu/ $(lsb_release -cs) main" && \
178 |     apt-get -qq update && \
179 |     apt-get -qq install -y 'cmake=3.31.6-*' 'cmake-data=3.31.6-*' && \
180 |     apt-get clean && \
181 |     python3 -m pip install --no-cache-dir 'cmake==3.31.6'
182 | 
183 | RUN mkdir /tmp/ccache-install && \
184 |     cd /tmp/ccache-install && \
185 |     CCACHE_URL='https://github.com/ccache/ccache/releases/download/v4.8.2/ccache-4.8.2.tar.xz' && \
186 |     wget -qO - "$CCACHE_URL" | tar --strip-components 1 -xJf - && \
187 |     mkdir build && \
188 |     cd build && \
189 |     cmake -B. -S.. -DCMAKE_BUILD_TYPE=Release && \
190 |     cmake --build . --config Release && \
191 |     make install && \
192 |     cd ../.. && \
193 |     rm -rf /tmp/ccache-install && \
194 |     ccache -M "${BUILD_CCACHE_SIZE}" && \
195 |     ccache -F 0
196 | 
197 | # Build-time environment variables
198 | ENV CCACHE_DIR=/ccache \
199 |     CMAKE_C_COMPILER_LAUNCHER=ccache \
200 |     CMAKE_CXX_COMPILER_LAUNCHER=ccache \
201 |     CMAKE_CUDA_COMPILER_LAUNCHER=ccache
202 | 
203 | # Update compiler (GCC) and linker (LLD) versions
204 | RUN LLVM_VERSION='18' && \
205 |     CODENAME="$(lsb_release -cs)" && \
206 |     wget -qO - 'https://apt.llvm.org/llvm-snapshot.gpg.key' > /etc/apt/trusted.gpg.d/apt.llvm.org.asc && \
207 |     apt-add-repository -n "deb https://apt.llvm.org/$CODENAME/ llvm-toolchain-$CODENAME-$LLVM_VERSION main" && \
208 |     SETUP_TOOLCHAIN() { \
209 |         apt-add-repository -y ppa:ubuntu-toolchain-r/test 2>&1 \
210 |         | sed -e '/connection timed out/{p; Q1}' && \
211 |         apt-get -qq install --no-install-recommends -y \
212 |           gcc-11 g++-11 gfortran-11 \
213 |           "lld-$LLVM_VERSION" "libomp-$LLVM_VERSION-dev" && \
214 |         apt-get clean; \
215 |     } && \
216 |     { SETUP_TOOLCHAIN || { sleep "$(shuf -i10-20 -n1)" && SETUP_TOOLCHAIN; }; } && \
217 |     update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11 && \
218 |     update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-11 11 && \
219 |     update-alternatives --install /usr/bin/gfortran gfortran /usr/bin/gfortran-11 11 && \
220 |     if [ "$(uname -m)" != 'aarch64' ]; then \
221 |       update-alternatives --install /usr/bin/ld ld "/usr/bin/ld.lld-$LLVM_VERSION" 1; \
222 |     fi && \
223 |     ldconfig
224 | 
225 | 
226 | FROM builder-base-shared AS builder-base-arm64
227 | ARG BUILD_TORCH_CUDA_ARCH_LIST
228 | ENV TORCH_CUDA_ARCH_LIST="9.0${BUILD_TORCH_CUDA_ARCH_LIST#* 9.0}"
229 | # There is currently no CPU BLAS used for ARM builds
230 | 
231 | 
232 | FROM builder-base-shared AS builder-base-amd64
233 | ARG BUILD_TORCH_CUDA_ARCH_LIST
234 | ENV TORCH_CUDA_ARCH_LIST="${BUILD_TORCH_CUDA_ARCH_LIST}"
235 | # Install AOCL-BLAS and AOCL-LAPACK
236 | # See: https://www.amd.com/en/developer/aocl/dense.html
237 | ARG AOCL_BASE
238 | COPY --from=aocl-downloader "${AOCL_BASE}" "${AOCL_BASE}"
239 | 
240 | # `ldconfig` lets the dynamic linker access AOCL libraries
241 | RUN install -m 0644 -t /etc/ld.so.conf.d "${AOCL_BASE}/aocl.conf" && \
242 |     ldconfig
243 | 
244 | # These environment variables are only for the build stage,
245 | # and register paths to build-time AOCL resources.
246 | # This could alternatively be done by invoking `. "${AOCL_BASE}/amd-libs.cfg"`
247 | # in every RUN compilation step, but this will make sure it is never missed.
248 | #
249 | # PyTorch's logic to find LAPACK during CMake configuration
250 | # additionally requires its installed path to either be in:
251 | # - One of:
252 | #   - /usr/local/lib, or
253 | #   - /usr/lib, or
254 | #   - /usr/local/lib64, or
255 | #   - /usr/lib64, or
256 | #   - /usr/lib/aarch64-linux-gnu, or
257 | # - $LD_LIBRARY_PATH
258 | # While skipping $LIBRARY_PATH, and ld's normal configured paths,
259 | # so it is necessary to add $LD_LIBRARY_PATH here as well.
260 | # See: https://github.com/pytorch/pytorch/blob/v2.3.0/cmake/Modules/FindLAPACK.cmake#L56-L59
261 | ENV C_INCLUDE_PATH="${AOCL_BASE}/include${C_INCLUDE_PATH:+:$C_INCLUDE_PATH}" \
262 |     CPLUS_INCLUDE_PATH="${AOCL_BASE}/include${CPLUS_INCLUDE_PATH:+:$CPLUS_INCLUDE_PATH}" \
263 |     LD_LIBRARY_PATH="${AOCL_BASE}/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}" \
264 |     LIBRARY_PATH="${AOCL_BASE}/lib${LIBRARY_PATH:+:$LIBRARY_PATH}"
265 | 
266 | 
267 | FROM builder-base-${TARGETARCH} AS builder-base
268 | RUN mkdir /build /build/dist
269 | WORKDIR /build
270 | COPY --chmod=755 effective_cpu_count.sh .
271 | COPY --chmod=755 scale.sh .
272 | COPY compiler_wrapper.f95 .
273 | ARG AMD64_NATIVE_ARCH="skylake"
274 | ARG ARM64_NATIVE_ARCH="armv8.5-a+nopredres"
275 | RUN if [ "$(uname -m)" = "aarch64" ]; then \
276 |       NATIVE="WRAPPER_NATIVE=\"${ARM64_NATIVE_ARCH}\"" && \
277 |       AVX='WRAPPER_NO_AVX'; \
278 |     else \
279 |       NATIVE="WRAPPER_NATIVE=\"${AMD64_NATIVE_ARCH}\"" && \
280 |       AVX='WRAPPER_AVX="AVX256"'; \
281 |     fi && \
282 |     gfortran -ffree-line-length-512 -cpp -O3 "-D${NATIVE}" "-D${AVX}" ./compiler_wrapper.f95 -o ./compiler && rm ./compiler_wrapper.f95
283 | 
284 | COPY <<-"EOT" /build/version-string.sh
285 | 	#!/bin/sh
286 | 	set -e;
287 | 	VERSION="$1";
288 | 
289 | 	IS_HASH() {
290 | 	  echo "$1" | grep -qxiEe '[0-9a-f]{40}';
291 | 	};
292 | 
293 | 	if IS_HASH "$VERSION"; then
294 | 	  REAL_VERSION="$(cat ./version.txt)";
295 | 	  SHORT_HASH="$(echo "$VERSION" | cut -c1-7)";
296 | 	  echo "$REAL_VERSION+$SHORT_HASH";
297 | 	else
298 | 	  echo "$VERSION";
299 | 	fi;
300 | EOT
301 | RUN chmod 755 /build/version-string.sh
302 | 
303 | COPY <<-"EOT" /build/storage-info.sh
304 | 	#!/bin/sh
305 | 	set -e;
306 | 	TARGET="$(realpath "$1")";
307 | 
308 | 	STORAGE_INFO="$(df -h '--output=fstype,used,avail,pcent,target' "$TARGET")" || exit 0;
309 | 	printf 'Storage info for %s:\n%s\n' "$TARGET" "$STORAGE_INFO";
310 | EOT
311 | RUN chmod 755 /build/storage-info.sh
312 | 
313 | ## Build torch
314 | RUN --mount=type=bind,from=pytorch-downloader,source=/git/pytorch,target=pytorch/ \
315 |     pip3 install --no-cache-dir --upgrade numpy && \
316 |     cd pytorch && pip3 install --no-cache-dir -r requirements.txt
317 | 
318 | # Build tool & library paths, shared for all libraries to be built
319 | ENV CMAKE_PREFIX_PATH=/usr/bin/ \
320 |     LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/cuda/lib64:/usr/local/lib \
321 |     CUDA_BIN_PATH=/usr/local/cuda/bin \
322 |     CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda/ \
323 |     CUDNN_LIB_DIR=/usr/local/cuda/lib64
324 | 
325 | ARG BUILD_TRITON
326 | ARG BUILD_MAX_JOBS=""
327 | RUN --mount=type=bind,from=triton-downloader,source=/git/triton,target=triton/,rw \
328 |     --mount=type=cache,target=/ccache \
329 |     if [ "$BUILD_TRITON" = '1' ]; then \
330 |       pip3 install --no-cache-dir pybind11 && \
331 |       export MAX_JOBS="${BUILD_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 3 32)}" && \
332 |       cd triton/python && \
333 |       python3 -m pip wheel -w ../../dist/ --no-build-isolation --no-deps -vv . && \
334 |       pip3 install ../../dist/*.whl; \
335 |     fi
336 | 
337 | ARG BUILD_TORCH_VERSION
338 | ENV TORCH_VERSION=$BUILD_TORCH_VERSION
339 | # Filter out the 10.0 arch on CUDA versions != 12.8 and != 12.9
340 | ENV TORCH_CUDA_ARCH_LIST="${CUDA_VERSION##12.8.*}||${TORCH_CUDA_ARCH_LIST/ 10.0/}||${TORCH_CUDA_ARCH_LIST}"
341 | ENV TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST#12.9.?}"
342 | ENV TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST#||*||}"
343 | ENV TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST%||*}"
344 | ENV TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST#*||}"
345 | 
346 | RUN printf 'Arch: %s\nTORCH_CUDA_ARCH_LIST=%s\n' "$(uname -m)" "${TORCH_CUDA_ARCH_LIST}"
347 | 
348 | ARG BUILD_NVCC_APPEND_FLAGS="-gencode=arch=compute_90a,code=sm_90a"
349 | # Add sm_100a build if NV_CUDA_LIB_VERSION matches 12.[89].*
350 | RUN FLAGS="$BUILD_NVCC_APPEND_FLAGS" && \
351 |     case "${NV_CUDA_LIB_VERSION}" in 12.[89].*) \
352 |       FLAGS="${FLAGS} -gencode=arch=compute_100,code=sm_100 -gencode=arch=compute_100a,code=sm_100a" ;; \
353 |     esac && \
354 |     echo "-Wno-deprecated-gpu-targets -diag-suppress 191,186,177${FLAGS:+ $FLAGS}" > /build/nvcc.conf
355 | 
356 | # If the directory /opt/nccl-tests exists,
357 | # the base image is assumed to be nccl-tests,
358 | # so it uses the system's special NCCL and UCC installations for the build.
359 | #
360 | # Additionally, this RUN is executed with the downloaded PyTorch repository
361 | # mounted temporarily in "rw" mode, which allows ephemeral writes (as
362 | # OverlayFS would) that do not mutate the downloaded copy.
363 | # This means the downloaded data never needs to be duplicated in the cache in
364 | # a layer of this build step, and temporary build files are automatically
365 | # cleaned up at the end of the step once the directory is detached.
366 | #
367 | # This step is itself cacheable as long as the downloaded files (and ARCH_LIST)
368 | # remain the same.
369 | #
370 | # NB: This cannot specify BLAS=FLAME directly, because PyTorch (v2.3.0)'s code
371 | # to explicitly choose a BLAS implementation is missing that option
372 | # (See: https://github.com/pytorch/pytorch/blob/v2.3.0/cmake/Dependencies.cmake#L195-L266),
373 | # and using BLAS=blis makes it ignore the libflame LAPACK library, because
374 | # that triggers its FindBLIS logic rather than FindBLAS, and FindLAPACK depends
375 | # on a variable set only during FindBLAS (BLAS_INFO=FLAME)
376 | # (See: https://github.com/pytorch/pytorch/blob/v2.3.0/cmake/Modules/FindLAPACK.cmake#L176-L189).
377 | # Thus, we have to force it to use its generic FindBLAS logic,
378 | # and narrow it down from there by specifying WITH_BLAS=FLAME
379 | # (See: https://github.com/pytorch/pytorch/blob/v2.3.0/cmake/Modules/FindBLAS.cmake#L259-L271).
380 | # Without WITH_BLAS, it would detect the BLAS implementation as
381 | # BLAS_INFO=blis instead of BLAS_INFO=FLAME and wouldn't include LAPACK either.
382 | ARG BUILD_CXX11_ABI=""
383 | SHELL ["/bin/bash", "-eo", "pipefail", "-c"]
384 | RUN --mount=type=bind,from=pytorch-downloader,source=/git/pytorch,target=pytorch/,rw \
385 |     --mount=type=cache,target=/ccache \
386 |     export MAX_JOBS="${BUILD_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 3 32)}" && \
387 |     echo "MAX_JOBS: ${MAX_JOBS}" && \
388 |     export NVCC_APPEND_FLAGS="$(cat /build/nvcc.conf)" && \
389 |     echo "NVCC_APPEND_FLAGS: ${NVCC_APPEND_FLAGS}" && \
390 |     if [ -n "${BUILD_CXX11_ABI}" ]; then \
391 |       export _GLIBCXX_USE_CXX11_ABI="${BUILD_CXX11_ABI}"; \
392 |     fi && \
393 |     ./storage-info.sh . && \
394 |     cd pytorch && \
395 |     ../storage-info.sh . && \
396 |     mkdir build && \
397 |     ln -s /usr/bin/cc build/cc && \
398 |     ln -s /usr/bin/c++ build/c++ && \
399 |     if [ "$(uname -m)" = 'aarch64' ]; then \
400 |       export USE_PRIORITIZED_TEXT_FOR_LD=1; \
401 |     fi && \
402 |     { if [ -d /opt/nccl-tests ]; then \
403 |       export \
404 |         USE_DISTRIBUTED=1 \
405 |         USE_NCCL=1 USE_SYSTEM_NCCL=1 \
406 |         UCC_HOME=${HPCX_UCC_DIR} UCX_HOME=${HPCX_UCX_DIR} \
407 |         USE_NCCL_WITH_UCC=1 \
408 |         USE_UCC=1 USE_SYSTEM_UCC=1; fi; } && \
409 |     USE_CUDNN=1 \
410 |     BUILD_TORCH=ON \
411 |     BUILD_TEST=0 \
412 |     CUDA_HOST_COMPILER=cc \
413 |     USE_CUDA=1 \
414 |     USE_NNPACK=1 \
415 |     CC=cc \
416 |     CXX=c++ \
417 |     USE_BLAS=1 \
418 |     USE_LAPACK=1 \
419 |     WITH_BLAS=FLAME \
420 |     PYTORCH_BUILD_VERSION="$(../version-string.sh "$TORCH_VERSION")" \
421 |     PYTORCH_BUILD_NUMBER=0 \
422 |     TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \
423 |     python3 setup.py bdist_wheel --dist-dir ../dist 2>&1 \
424 |     | grep -Ev --line-buffered '^(ptxas /tmp/|copying .+/|creating build/)'
425 | SHELL ["/bin/sh", "-c"]
426 | RUN pip3 install --no-cache-dir --upgrade dist/torch*.whl
427 | 
428 | RUN python3 -m pip install -U --no-cache-dir \
429 |       packaging setuptools wheel pip
430 | 
431 | FROM builder-base AS torchvision-builder
432 | RUN rm ./dist/*
433 | 
434 | ## Build torchvision
435 | ARG BUILD_TORCH_VISION_VERSION
436 | ENV TORCH_VISION_VERSION=$BUILD_TORCH_VISION_VERSION
437 | RUN pip3 install --no-cache-dir --upgrade \
438 |     matplotlib numpy typing_extensions requests pillow
439 | 
440 | RUN --mount=type=bind,from=torchvision-downloader,source=/git/vision,target=vision/,rw \
441 |     --mount=type=cache,target=/ccache \
442 |     export MAX_JOBS="${BUILD_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 3 32)}" && \
443 |     echo "MAX_JOBS: ${MAX_JOBS}" && \
444 |     export NVCC_APPEND_FLAGS="$(cat /build/nvcc.conf)" && \
445 |     echo "NVCC_APPEND_FLAGS: ${NVCC_APPEND_FLAGS}" && \
446 |     cd vision && \
447 |     mkdir build && \
448 |     ln -s /usr/bin/cc build/cc && \
449 |     ln -s /usr/bin/c++ build/c++ && \
450 |     { if [ -d /opt/nccl-tests ]; then \
451 |       export \
452 |         USE_DISTRIBUTED=1 \
453 |         USE_NCCL=1 USE_SYSTEM_NCCL=1 \
454 |         UCC_HOME=${HPCX_UCC_DIR} UCX_HOME=${HPCX_UCX_DIR} \
455 |         USE_NCCL_WITH_UCC=1 \
456 |         USE_UCC=1 USE_SYSTEM_UCC=1; fi; } && \
457 |     USE_CUDNN=1 \
458 |     USE_OPENCV=1 \
459 |     BUILD_TORCH=ON \
460 |     BUILD_TEST=0 \
461 |     CUDA_HOST_COMPILER=cc \
462 |     USE_CUDA=1 \
463 |     FORCE_CUDA=1 \
464 |     USE_NNPACK=1 \
465 |     CC=cc \
466 |     CXX=c++ \
467 |     BUILD_VERSION="$(../version-string.sh "$TORCH_VISION_VERSION")" \
468 |     TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \
469 |     python3 setup.py bdist_wheel --dist-dir ../dist
470 | 
471 | FROM builder-base AS torchaudio-builder
472 | RUN rm ./dist/*
473 | 
474 | ## Build torchaudio
475 | ARG BUILD_TORCH_AUDIO_VERSION
476 | ENV TORCH_AUDIO_VERSION=$BUILD_TORCH_AUDIO_VERSION
477 | RUN pip3 install --no-cache-dir --upgrade \
478 |     matplotlib numpy typing_extensions requests pillow
479 | 
480 | RUN --mount=type=bind,from=torchaudio-downloader,source=/git/audio,target=audio/,rw \
481 |     --mount=type=cache,target=/ccache \
482 |     export MAX_JOBS="${BUILD_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 3 32)}" && \
483 |     echo "MAX_JOBS: ${MAX_JOBS}" && \
484 |     export NVCC_APPEND_FLAGS="$(cat /build/nvcc.conf)" && \
485 |     echo "NVCC_APPEND_FLAGS: ${NVCC_APPEND_FLAGS}" && \
486 |     cd audio && \
487 |     mkdir build && \
488 |     ln -s /usr/bin/cc build/cc && \
489 |     ln -s /usr/bin/c++ build/c++ && \
490 |     { if [ -d /opt/nccl-tests ]; then \
491 |       export \
492 |         USE_DISTRIBUTED=1 \
493 |         USE_NCCL=1 USE_SYSTEM_NCCL=1 \
494 |         UCC_HOME=${HPCX_UCC_DIR} UCX_HOME=${HPCX_UCX_DIR} \
495 |         USE_NCCL_WITH_UCC=1 \
496 |         USE_UCC=1 USE_SYSTEM_UCC=1; fi; } && \
497 |     USE_CUDNN=1 \
498 |     USE_OPENCV=1 \
499 |     BUILD_TORCH=ON \
500 |     BUILD_TEST=0 \
501 |     CUDA_HOST_COMPILER=cc \
502 |     USE_CUDA=1 \
503 |     FORCE_CUDA=1 \
504 |     USE_NNPACK=1 \
505 |     CC=cc \
506 |     CXX=c++ \
507 |     BUILD_VERSION="$(../version-string.sh "$TORCH_AUDIO_VERSION")" \
508 |     TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \
509 |     python3 setup.py bdist_wheel --dist-dir ../dist
510 | 
511 | FROM builder-base AS transformerengine-builder
512 | RUN rm ./dist/*
513 | 
514 | # Build TransformerEngine
515 | ARG BUILD_TRANSFORMERENGINE_CUDA_ARCH_LIST
516 | ENV NVTE_CUDA_ARCHS=$BUILD_TRANSFORMERENGINE_CUDA_ARCH_LIST
517 | 
518 | RUN --mount=type=bind,from=transformerengine-downloader,source=/git/TransformerEngine,target=TransformerEngine/,rw \
519 |     --mount=type=cache,target=/ccache \
520 |     export MAX_JOBS=$(($(./effective_cpu_count.sh) + 2)) && \
521 |     export MAX_JOBS="${BUILD_MAX_JOBS:-$MAX_JOBS}" && \
522 |     echo "MAX_JOBS: ${MAX_JOBS}" && \
523 |     export NVCC_APPEND_FLAGS="$(cat /build/nvcc.conf)" && \
524 |     echo "NVCC_APPEND_FLAGS: ${NVCC_APPEND_FLAGS}" && \
525 |     case "${CUDA_VERSION}" in 12.[0123456].*) \
526 |       export NVTE_CUDA_ARCHS="${NVTE_CUDA_ARCHS%;100*}" ;; \
527 |     esac && \
528 |     cd TransformerEngine && \
529 |     if python3 -c "import sys; sys.exit(sys.version_info.minor > 8)"; then \
530 |       sed -i "s/from functools import cache/from functools import lru_cache as cache/g" \
531 |         build_tools/utils.py; \
532 |     fi && \
533 |     python3 setup.py bdist_wheel --dist-dir /build/dist
534 | 
535 | FROM builder-base AS flash-attn-builder-base
536 | RUN rm ./dist/*
537 | ENV PYTHONUNBUFFERED=1
538 | ENV FLASH_ATTENTION_FORCE_BUILD=TRUE
539 | ARG BUILD_FLASH_ATTN_MAX_JOBS=""
540 | 
541 | COPY <<-"EOT" /build/fa-build.sh
542 | 	#!/bin/bash
543 | 	set -eo pipefail;
544 | 	if [ -n "$1" ]; then cd "$1"; fi;
545 | 	python3 setup.py bdist_wheel --dist-dir /build/dist \
546 | 	| grep -Ev --line-buffered '^ptxas (/tmp/|(info|warning)\s*:)|bytes spill stores'
547 | EOT
548 | RUN chmod 755 /build/fa-build.sh
549 | 
550 | FROM flash-attn-builder-base AS flash-attn-builder
551 | 
552 | # Build flash-attn
553 | RUN --mount=type=bind,from=flash-attn-downloader,source=/git/flash-attention,target=flash-attention/,rw \
554 |     --mount=type=cache,target=/ccache \
555 |     export CC=$(realpath -e ./compiler) \
556 |       MAX_JOBS="${BUILD_FLASH_ATTN_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 8 12)}" && \
557 |     echo "MAX_JOBS: ${MAX_JOBS}" && \
558 |     export NVCC_APPEND_FLAGS="$(cat /build/nvcc.conf)" && \
559 |     echo "NVCC_APPEND_FLAGS: ${NVCC_APPEND_FLAGS}" && \
560 |     cd flash-attention && \
561 |     for EXT_DIR in $(realpath -s -e \
562 |       . \
563 |       csrc/ft_attention \
564 |       csrc/fused_dense_lib \
565 |       csrc/fused_softmax \
566 |       csrc/layer_norm \
567 |       csrc/rotary \
568 |       csrc/xentropy); \
569 |     do /build/fa-build.sh "$EXT_DIR" || exit 1; done
570 | 
571 | FROM flash-attn-builder-base AS flash-attn-3-builder
572 | 
573 | # Artificially sequence this build stage after the previous one
574 | # to prevent parallelism, because these are both very resource-intensive
575 | RUN --mount=type=bind,from=flash-attn-builder,source=/build,target=/build :
576 | 
577 | # Build flash-attn v3
578 | RUN --mount=type=bind,from=flash-attn-3-downloader,source=/git/flash-attention,target=flash-attention/,rw \
579 |     --mount=type=cache,target=/ccache \
580 |     if [ ! -d flash-attention/hopper ]; then \
581 |       echo "Not compiling flash-attn v3" && exit 0; \
582 |     fi && \
583 |     export CC=$(realpath -e ./compiler) \
584 |       MAX_JOBS="${BUILD_FLASH_ATTN_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 10 6)}" && \
585 |     echo "MAX_JOBS: ${MAX_JOBS}" && \
586 |     export NVCC_APPEND_FLAGS="$(cat /build/nvcc.conf)" && \
587 |     echo "NVCC_APPEND_FLAGS: ${NVCC_APPEND_FLAGS}" && \
588 |     /build/fa-build.sh flash-attention/hopper
589 | 
590 | FROM builder-base AS builder
591 | COPY --link --from=torchaudio-builder /build/dist/ /build/dist/
592 | COPY --link --from=torchvision-builder /build/dist/ /build/dist/
593 | COPY --link --from=transformerengine-builder /build/dist/ /build/dist/
594 | COPY --link --from=flash-attn-builder /build/dist/ /build/dist/
595 | COPY --link --from=flash-attn-3-builder /build/dist/ /build/dist/
596 | 
597 | ## Build the final torch image.
598 | FROM ${FINAL_BASE_IMAGE} as final-arm64
599 | ARG BUILD_TORCH_CUDA_ARCH_LIST
600 | ENV TORCH_CUDA_ARCH_LIST="9.0${BUILD_TORCH_CUDA_ARCH_LIST#* 9.0}"
601 | 
602 | FROM ${FINAL_BASE_IMAGE} as final-amd64
603 | ARG BUILD_TORCH_CUDA_ARCH_LIST
604 | ENV TORCH_CUDA_ARCH_LIST="${BUILD_TORCH_CUDA_ARCH_LIST}"
605 | 
606 | FROM final-${TARGETARCH}
607 | ENV DEBIAN_FRONTEND=noninteractive
608 | 
609 | # Install core packages
610 | RUN apt-get -qq update && apt-get -qq install -y \
611 |       libncurses5 python3 python3-pip python3-distutils \
612 |       libomp5 libpng16-16 libjpeg-turbo8 libsodium23 \
613 |       curl git apt-utils ssh ca-certificates tmux nano vim-tiny sudo bash \
614 |       rsync htop wget unzip tini && \
615 |     apt-get clean && \
616 |     /usr/bin/python3 -m pip install --no-cache-dir --upgrade pip && \
617 |     update-alternatives --install /usr/bin/python python /usr/bin/python3 1 && \
618 |     update-alternatives --install /usr/bin/pip pip /usr/bin/pip3 1 && \
619 |     update-alternatives --install /usr/bin/vim vim /usr/bin/vim.tiny 1 && \
620 |     ln -s libomp.so.5 "/usr/lib/$(gcc -print-multiarch)/libomp.so" && \
621 |     ldconfig
622 | 
623 | RUN apt-get -qq update && apt-get -qq install -y --no-install-recommends \
624 |         software-properties-common lsb-release && \
625 |     SETUP_LIBSTDCXX() { \
626 |         apt-add-repository -y ppa:ubuntu-toolchain-r/test 2>&1 \
627 |         | sed -e '/connection timed out/{p; Q1}' && \
628 |         apt-get -qq install -y --no-install-recommends libstdc++6 && \
629 |         apt-get clean; \
630 |     } && \
631 |     { SETUP_LIBSTDCXX || { sleep "$(shuf -i10-20 -n1)" && SETUP_LIBSTDCXX; }; }
632 | 
633 | RUN LLVM_VERSION='18' && \
634 |     CODENAME="$(lsb_release -cs)" && \
635 |     wget -qO - 'https://apt.llvm.org/llvm-snapshot.gpg.key' > /etc/apt/trusted.gpg.d/apt.llvm.org.asc && \
636 |     apt-add-repository "deb https://apt.llvm.org/$CODENAME/ llvm-toolchain-$CODENAME-$LLVM_VERSION main" && \
637 |     apt-get -qq install -y --no-install-recommends "libomp5-$LLVM_VERSION" && \
638 |     apt-get clean
639 | 
640 | # Install AOCL-BLAS and AOCL-LAPACK
641 | # See: https://www.amd.com/en/developer/aocl/dense.html
642 | ARG AOCL_BASE
643 | COPY --from=aocl-downloader "${AOCL_BASE}" "${AOCL_BASE}"
644 | 
645 | # `ldconfig` lets the dynamic linker access AOCL libraries
646 | RUN install -m 0644 -t /etc/ld.so.conf.d "${AOCL_BASE}/aocl.conf" && \
647 |     ldconfig
648 | 
649 | ARG BUILD_TORCH_VERSION
650 | ARG BUILD_TORCH_VISION_VERSION
651 | ARG BUILD_TORCH_AUDIO_VERSION
652 | ENV TORCH_VERSION=$BUILD_TORCH_VERSION
653 | ENV TORCH_VISION_VERSION=$BUILD_TORCH_VISION_VERSION
654 | ENV TORCH_AUDIO_VERSION=$BUILD_TORCH_AUDIO_VERSION
655 | # Filter out the 10.0 arch on CUDA versions != 12.8 and != 12.9
656 | ENV TORCH_CUDA_ARCH_LIST="${CUDA_VERSION##12.8.*}||${TORCH_CUDA_ARCH_LIST/ 10.0/}||${TORCH_CUDA_ARCH_LIST}"
657 | ENV TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST#12.9.?}"
658 | ENV TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST#||*||}"
659 | ENV TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST%||*}"
660 | ENV TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST#*||}"
661 | 
662 | COPY --link --chmod=755 install_cudnn.sh /tmp/install_cudnn.sh
663 | # - libnvjitlink-X-Y only exists for CUDA versions >= 12-0.
664 | # - Don't mess with libnccl2 when using nccl-tests as a base,
665 | #   checked via the existence of the directory "/opt/nccl-tests".
666 | RUN export \
667 |       CUDA_MAJOR_VERSION=$(echo $CUDA_VERSION | cut -d. -f1) \
668 |       CUDA_MINOR_VERSION=$(echo $CUDA_VERSION | cut -d. -f2) && \
669 |     export \
670 |       CUDA_PACKAGE_VERSION="${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}" && \
671 |     apt-get -qq update && \
672 |     apt-get -qq install --no-upgrade -y \
673 |       libcurand-${CUDA_PACKAGE_VERSION} \
674 |       libcufft-${CUDA_PACKAGE_VERSION} \
675 |       libcublas-${CUDA_PACKAGE_VERSION} \
676 |       cuda-nvrtc-${CUDA_PACKAGE_VERSION} \
677 |       libcusparse-${CUDA_PACKAGE_VERSION} \
678 |       libcusolver-${CUDA_PACKAGE_VERSION} \
679 |       libcufile-${CUDA_PACKAGE_VERSION} \
680 |       cuda-cupti-${CUDA_PACKAGE_VERSION} \
681 |       libnvjpeg-${CUDA_PACKAGE_VERSION} \
682 |       libnvtoolsext1 && \
683 |     { if [ $CUDA_MAJOR_VERSION -ge 12 ]; then \
684 |       apt-get -qq install --no-upgrade -y libnvjitlink-${CUDA_PACKAGE_VERSION}; fi; } && \
685 |     { if [ ! -d /opt/nccl-tests ]; then \
686 |       export NCCL_PACKAGE_VERSION="2.*+cuda${CUDA_MAJOR_VERSION}.${CUDA_MINOR_VERSION}" && \
687 |       apt-get -qq install --no-upgrade -y "libnccl2=$NCCL_PACKAGE_VERSION"; fi; } && \
688 |     /tmp/install_cudnn.sh "$CUDA_VERSION" runtime && \
689 |     rm /tmp/install_cudnn.sh && \
690 |     apt-get clean && \
691 |     ldconfig
692 | 
693 | 
694 | WORKDIR /usr/src/app
695 | 
696 | # Install custom PyTorch wheels.
697 | RUN --mount=type=bind,from=builder,source=/build/dist,target=. \
698 |     pip3 install --no-cache-dir -U numpy packaging && \
699 |     pip3 install --no-cache-dir -U ./*.whl
700 | 
701 | # Make a symlink to flash-attn v3 where TransformerEngine expects it,
702 | # and modify the installation record so that pip uninstall knows how to
703 | # fully remove it.
704 | #
705 | # flash-attn v3 is optional: its builder stage produces no wheel when the
706 | # source tree has no "hopper" directory. In that case the distribution is
707 | # absent here, and this step exits successfully without doing anything.
708 | RUN <<-"EOT" python3
709 | 	#!/bin/env python3
710 | 	from base64 import urlsafe_b64encode as b64
711 | 	from hashlib import sha256
712 | 	from importlib import metadata
713 | 	from pathlib import Path
714 | 	from py_compile import compile
715 | 
716 | 	try:
717 | 	    dist = metadata.distribution("flashattn-hopper")
718 | 	except metadata.PackageNotFoundError:
719 | 	    # flash-attn v3 was not built, so there is nothing to link.
720 | 	    print("flashattn-hopper is not installed; skipping symlink creation")
721 | 	    raise SystemExit(0)
722 | 
723 | 	p = dist.locate_file("flash_attn_interface.py")
724 | 	print("flash_attn_interface:", p)
725 | 	root = p.parent
726 | 
727 | 	if not p.exists():
728 | 	    raise SystemExit("flash_attn_interface not found")
729 | 	if not p.is_file():
730 | 	    raise SystemExit("flash_attn_interface path is not a file")
731 | 
732 | 	# Package directory that will hold the symlink
733 | 	d = root / "flashattn_hopper"
734 | 	if d.exists():
735 | 	    raise SystemExit(f'"{d}" already exists')
736 | 
737 | 	d.mkdir(mode=0o755, parents=False, exist_ok=False)
738 | 	new = d / p.name
739 | 	new.symlink_to(p)
740 | 	print(f"Created new symlink at {new}")
741 | 
742 | 	# doraise=True surfaces a compilation failure as PyCompileError
743 | 	# instead of returning None and failing later in Path(None).
744 | 	compiled = Path(compile(new, doraise=True))
745 | 
746 | 
747 | 	def record_entry(path: Path) -> str:
748 | 	    # One RECORD line: "<path relative to root>,sha256=<digest>,<size>"
749 | 	    content = path.read_bytes()
750 | 	    digest = b64(sha256(content).digest()).rstrip(b"=").decode()
751 | 	    package_path = path.relative_to(root).as_posix()
752 | 	    return f"{package_path},sha256={digest},{len(content):d}\r\n"
753 | 
754 | 
755 | 	# Append the symlink and its bytecode to RECORD so that
756 | 	# pip uninstall removes them along with the rest of the package.
757 | 	for f in dist.files or ():
758 | 	    if f.match("flashattn?hopper-*.dist-info/RECORD"):
759 | 	        with f.locate().open("a", encoding="utf-8", newline="") as record:
760 | 	            for added in (new, compiled):
761 | 	                record.write(record_entry(added))
762 | 	        break
763 | 	else:
764 | 	    print("Warning: RECORD not found; new files were not registered")
765 | EOT
766 | 


--------------------------------------------------------------------------------
/torch/compiler_wrapper.f95:
--------------------------------------------------------------------------------
 1 | #ifndef WRAPPER_NATIVE
 2 | #define WRAPPER_NATIVE "skylake"
 3 | #endif
 4 | 
 5 | #ifndef WRAPPER_CC
 6 | #define WRAPPER_CC "gcc"
 7 | #endif
 8 | 
 9 | #ifndef WRAPPER_AVX
10 | #define WRAPPER_AVX "AVX256"
11 | #endif
12 | 
13 | PROGRAM compiler_wrapper
14 |     ! Wraps C compiler invocations,
15 |     ! replacing -D__AVX512__, -D__AVX256__, and -D__SCALAR__ preprocessor definitions
16 |     ! with -D__<WRAPPER_AVX>__, and -march=native with -march=<WRAPPER_NATIVE>,
17 |     ! for better reproducibility and compatibility.
18 |     IMPLICIT NONE
19 |     INTEGER :: i, exitcode = 0, full_length = 0, truncated = 0
20 |     CHARACTER(len=:), ALLOCATABLE :: arg, command
21 |     ALLOCATE(CHARACTER(len=128) :: arg)
22 |     command = WRAPPER_CC
23 |     ! Rebuild the command line, rewriting or dropping selected flags.
24 |     DO i = 1, COMMAND_ARGUMENT_COUNT()
25 |         DO
26 |             CALL GET_COMMAND_ARGUMENT(i, arg, full_length, truncated)
27 |             IF (truncated == 0) THEN
28 |                 EXIT
29 |             ELSE IF (truncated == -1) THEN  ! buffer too small: regrow and retry
30 |                 DEALLOCATE(arg)
31 |                 ALLOCATE(CHARACTER(len=full_length) :: arg)
32 |             ELSE
33 |                 CALL EXIT(95)  ! any other status: give up
34 |             END IF
35 |         END DO
36 |         IF (arg == "-march=native") THEN
37 |             command = command // (" '-march=" // WRAPPER_NATIVE // "'")
38 |         ELSE IF ( &
39 |             arg == "-D__AVX512__" &
40 |             .OR. arg == "-D__AVX256__" &
41 |             .OR. arg == "-D__SCALAR__" &
42 |         ) THEN
43 | #ifndef WRAPPER_NO_AVX
44 |             command = command // (" '-D__" // WRAPPER_AVX // "__'")
45 | #endif
46 |         ELSE
47 |             command = command // shell_escaped(arg(:full_length))  ! exact length, not the padded buffer
48 |         END IF
49 |     END DO
50 |     CALL SYSTEM(command, exitcode)
51 |     IF (exitcode > 255) THEN  ! too large to pass on as an exit status
52 |         exitcode = MAX(IAND(exitcode, 255), 1)  ! keep the low byte, but never report 0
53 |     END IF
54 |     CALL EXIT(exitcode)
55 | 
56 | 
57 |     CONTAINS
58 |         FUNCTION shell_escaped(str) RESULT(out)
59 |             ! Turns [str] into [ 'str'] and replaces all
60 |             ! internal ['] characters with ['"'"']
61 |             IMPLICIT NONE
62 |             CHARACTER(len=*), INTENT(IN) :: str
63 |             CHARACTER(len=:), ALLOCATABLE :: out
64 |             INTEGER :: old_i, out_i, old_len, out_len
65 | 
66 |             old_len = LEN(str)  ! every character counts, trailing blanks included
67 |             ! Figure out the new length to allocate by scanning `str`.
68 |             ! This always needs to add at least [ '] at the beginning
69 |             ! and ['] at the end, so the length increases by at least 3.
70 |             out_len = old_len + 3
71 |             DO old_i = 1, old_len
72 |                 IF (str(old_i:old_i) == "'") THEN
73 |                     out_len = out_len + 4
74 |                 END IF
75 |             END DO
76 |             ALLOCATE(CHARACTER(len=out_len) :: out)
77 | 
78 |             ! Copy over the string, performing necessary escapes.
79 |             out(1:2) = " '"
80 |             out_i = 3
81 |             DO old_i = 1, old_len
82 |                 IF (str(old_i:old_i) == "'") THEN
83 |                     ! Escape internal single-quotes
84 |                     out(out_i:out_i + 4) = '''"''"'''
85 |                     out_i = out_i + 5
86 |                 ELSE
87 |                     ! No escaping needed
88 |                     out(out_i:out_i) = str(old_i:old_i)
89 |                     out_i = out_i + 1
90 |                 END IF
91 |             END DO
92 |             out(out_i:out_i) = "'"
93 |         END FUNCTION
94 | END PROGRAM
95 | 


--------------------------------------------------------------------------------
/torch/effective_cpu_count.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | # Prints the number of CPUs available to this process: the cgroup CPU
 4 | # quota if one is set, otherwise the number of online processors.
 5 | 
 6 | # Prints the cgroup CPU quota as a whole number of CPUs (quota / period,
 7 | # rounded down, but never less than 1). Returns 1 and prints nothing when
 8 | # no quota file is readable or no positive numeric quota is set.
 9 | CPU_QUOTA() (
10 |     CGROUP='/sys/fs/cgroup';
11 |     CGROUP_V1="$CGROUP/cpu,cpuacct";
12 |     CGROUP_V1_QUOTA="$CGROUP_V1/cpu.cfs_quota_us";
13 |     CGROUP_V1_PERIOD="$CGROUP_V1/cpu.cfs_period_us";
14 |     CGROUP_V2="$CGROUP/user.slice/cpu.max";
15 |     if [ ! -d "$CGROUP" ]; then
16 |         return 1;
17 |     elif [ -f "$CGROUP_V1_QUOTA" ] && [ -f "$CGROUP_V1_PERIOD" ]; then
18 |         IFS='' read -r QUOTA 2> /dev/null < "$CGROUP_V1_QUOTA" || return 1;
19 |         IFS='' read -r PERIOD 2> /dev/null < "$CGROUP_V1_PERIOD" || return 1;
20 |     elif [ -f "$CGROUP_V2" ]; then
21 |         IFS=' ' read -r QUOTA PERIOD 2> /dev/null < "$CGROUP_V2" || return 1;
22 |     else
23 |         return 1;
24 |     fi;
25 | 
26 |     # Non-numeric values make these tests fail, as do non-positive ones,
27 |     # so both cases are reported as "no quota".
28 |     if [ "$QUOTA" -gt 0 ] 2> /dev/null && [ "$PERIOD" -gt 0 ] 2> /dev/null; then
29 |         CPUS=$((QUOTA / PERIOD));
30 |         # A quota below one full CPU floors to 0, which is not a usable
31 |         # job count; report a single CPU instead.
32 |         if [ "$CPUS" -lt 1 ]; then
33 |             CPUS=1;
34 |         fi;
35 |         echo "$CPUS";
36 |         return 0;
37 |     else
38 |         return 1;
39 |     fi;
40 | )
41 | 
42 | # Prefer the cgroup quota; fall back to the online processor count.
43 | EFFECTIVE_CPU_COUNT() {
44 |     CPU_QUOTA || getconf _NPROCESSORS_ONLN;
45 | }
46 | 
47 | EFFECTIVE_CPU_COUNT;
48 | 


--------------------------------------------------------------------------------
/torch/install_cudnn.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | # Usage: install_cudnn.sh <CUDA_VERSION> <dev|runtime>
 3 | CUDA_VERSION="$1";
 4 | if [ -z "$CUDA_VERSION" ]; then
 5 |     exit 14;  # CUDA version argument is missing
 6 | fi;
 7 | 
 8 | INSTALL_DEV="$2";
 9 | if [ "$INSTALL_DEV" = "dev" ]; then
10 |     echo "Ensuring installation of cuDNN (dev)";
11 |     DEV_SUFFIX="-dev";
12 |     DEV_PREFIX="";
13 | elif [ "$INSTALL_DEV" = "runtime" ]; then
14 |     echo "Ensuring installation of cuDNN (runtime)";
15 |     DEV_SUFFIX="";
16 |     DEV_PREFIX="lib";
17 | else
18 |     exit 15;  # second argument is neither "dev" nor "runtime"
19 | fi;
20 | # Prints the Version field dpkg reports for package $1; non-zero if none.
21 | CHECK_VERSION() {
22 |     dpkg-query --status "$1" 2>/dev/null \
23 |     | sed -ne 's/Version: //p' \
24 |     | grep .;
25 | }
26 | # Look for an existing cuDNN 8 or cuDNN 9 package; empty if neither is found.
27 | CUDA_MAJOR_VERSION=$(echo "$CUDA_VERSION" | cut -d. -f1);
28 | LIBCUDNN_VER="$(
29 |     CHECK_VERSION "libcudnn8${DEV_SUFFIX}" || \
30 |     CHECK_VERSION "libcudnn9${DEV_SUFFIX}-cuda-${CUDA_MAJOR_VERSION}" || \
31 |     :;
32 | )" || exit 16;
33 | # Install cuDNN 9 from apt only when no existing version was detected.
34 | if [ -z "$LIBCUDNN_VER" ]; then
35 |     apt-get -qq update && \
36 |     apt-get -qq install --no-upgrade -y "${DEV_PREFIX}cudnn9-cuda-${CUDA_MAJOR_VERSION}" && \
37 |     apt-get clean && \
38 |     ldconfig;
39 | else
40 |     echo "Found cuDNN version ${LIBCUDNN_VER}"
41 | fi;
42 | 


--------------------------------------------------------------------------------
/torch/scale.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | # Usage: scale.sh VAL [DIVISOR] [MAXIMUM] -- prints VAL divided by DIVISOR (rounded up), capped at MAXIMUM.
 3 | set -e;
 4 | 
 5 | VAL="$1";
 6 | DIVISOR="$2";
 7 | MAXIMUM="$3";
 8 | 
 9 | [ -n "$VAL" ];  # under set -e, an empty VAL ends the script with a non-zero status
10 | # Divide, rounding up; skipped when no divisor is given.
11 | if [ -n "$DIVISOR" ];
12 | then VAL="$((( $VAL + $DIVISOR - 1 ) / $DIVISOR))";
13 | fi;
14 | # Cap the result; skipped when no maximum is given.
15 | if [ -n "$MAXIMUM" ];
16 | then VAL="$((VAL > MAXIMUM ? MAXIMUM : VAL))";
17 | fi;
18 | 
19 | echo "$VAL";
20 | 


--------------------------------------------------------------------------------
/torch/torchaudio-cu125-pr3811.patch:
--------------------------------------------------------------------------------
 1 | From 7797f83e1d66ff78872763e1da3a5fb2f0534c40 Mon Sep 17 00:00:00 2001
 2 | From: Markus Hennerbichler <markushennerbichler@gmail.com>
 3 | Date: Mon, 15 Jul 2024 14:07:13 +0100
 4 | Subject: [PATCH] Fix CUDA 12.5 build
 5 | 
 6 | CUDA 12.5 removed the FLT_MAX symbol.
 7 | This was previously used without being explicitly imported.
 8 | FLT_MAX is defined in <float.h>, including this header fixes the issue
 9 | ---
10 |  src/libtorchaudio/cuctc/src/ctc_prefix_decoder_kernel_v2.cu | 1 +
11 |  1 file changed, 1 insertion(+)
12 | 
13 | diff --git a/src/libtorchaudio/cuctc/src/ctc_prefix_decoder_kernel_v2.cu b/src/libtorchaudio/cuctc/src/ctc_prefix_decoder_kernel_v2.cu
14 | index 4ca8f1bf24..e6192155a2 100644
15 | --- a/src/libtorchaudio/cuctc/src/ctc_prefix_decoder_kernel_v2.cu
16 | +++ b/src/libtorchaudio/cuctc/src/ctc_prefix_decoder_kernel_v2.cu
17 | @@ -24,6 +24,7 @@
18 |  // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
19 |  // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
20 |  #include <algorithm>
21 | +#include <float.h>
22 |  #include "ctc_fast_divmod.cuh"
23 |  #include "cub/cub.cuh"
24 |  #include "device_data_wrap.h"
25 | 


--------------------------------------------------------------------------------
/vllm-tensorizer/Dockerfile:
--------------------------------------------------------------------------------
 1 | ARG BASE_IMAGE="ghcr.io/coreweave/ml-containers/torch-extras:es-22.04-58a49a2-base-cuda12.1.1-torch2.1.2-vision0.16.2-audio2.1.2-flash_attn2.4.2"
 2 | 
 3 | FROM scratch as freezer
 4 | WORKDIR /
 5 | COPY --chmod=755 freeze.sh /
 6 | 
 7 | FROM ${BASE_IMAGE} as builder-base
 8 | 
 9 | ARG MAX_JOBS=""
10 | 
11 | # Dependencies requiring NVCC are built ahead of time in a separate stage
12 | # so that the ~2 GiB dev library installations don't have to be included
13 | # in the final image.
14 | RUN export \
15 |       CUDA_MAJOR_VERSION=$(echo $CUDA_VERSION | cut -d. -f1) \
16 |       CUDA_MINOR_VERSION=$(echo $CUDA_VERSION | cut -d. -f2) && \
17 |     export \
18 |       CUDA_PACKAGE_VERSION="${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}" && \
19 |     apt-get -qq update && apt-get install -y --no-install-recommends \
20 |       cuda-nvcc-${CUDA_PACKAGE_VERSION} \
21 |       cuda-nvml-dev-${CUDA_PACKAGE_VERSION} \
22 |       libcurand-dev-${CUDA_PACKAGE_VERSION} \
23 |       libcublas-dev-${CUDA_PACKAGE_VERSION} \
24 |       libcusparse-dev-${CUDA_PACKAGE_VERSION} \
25 |       libcusolver-dev-${CUDA_PACKAGE_VERSION} \
26 |       cuda-nvprof-${CUDA_PACKAGE_VERSION} \
27 |       cuda-profiler-api-${CUDA_PACKAGE_VERSION} \
28 |       libaio-dev \
29 |       ninja-build && \
30 |     apt-get clean
31 | 
32 | RUN ldconfig
33 | 
34 | RUN apt-get -qq update && \
35 |     apt-get -qq install -y --no-install-recommends \
36 |       python3-pip git ninja-build && \
37 |     apt-get clean && \
38 |     pip3 install -U --no-cache-dir pip packaging setuptools wheel
39 | 
40 | FROM alpine/git:2.36.3 as vllm-downloader
41 | WORKDIR /git
42 | ARG COMMIT_HASH
43 | RUN git clone --filter=blob:none --depth 1 --no-single-branch --no-checkout \
44 |       https://github.com/coreweave/vllm.git && \
45 |     cd vllm && \
46 |     git checkout "${COMMIT_HASH}" && \
47 |     git submodule update --init --recursive --jobs 8 \
48 |       --depth 1 --filter=blob:none
49 | 
50 | FROM builder-base as vllm-builder
51 | WORKDIR /workspace
52 | RUN --mount=type=bind,from=vllm-downloader,source=/git/vllm,target=/workspace,rw \
53 |     --mount=type=bind,from=freezer,target=/tmp/frozen,rw \
54 |     /tmp/frozen/freeze.sh torch torchaudio torchvision xformers > /tmp/frozen/constraints.txt && \
55 |     LIBRARY_PATH="/usr/local/cuda/lib64/stubs${LIBRARY_PATH:+:$LIBRARY_PATH}" \
56 |       python3 -m pip wheel -w /wheels \
57 |       -v --no-cache-dir --no-build-isolation --no-deps \
58 |       -c /tmp/frozen/constraints.txt \
59 |       ./
60 | 
61 | WORKDIR /wheels
62 | 
63 | FROM ${BASE_IMAGE} as base
64 | 
65 | WORKDIR /workspace
66 | 
67 | RUN apt-get -qq update && apt-get install -y --no-install-recommends curl && apt-get clean
68 | 
69 | RUN --mount=type=bind,from=freezer,target=/tmp/frozen \
70 |     /tmp/frozen/freeze.sh torch torchaudio torchvision xformers > /tmp/constraints.txt
71 | 
72 | RUN python3 -m pip install --no-cache-dir \
73 |       "fschat[model_worker] == 0.2.30" "triton == 2.1.0" \
74 |       -c /tmp/constraints.txt
75 | 
76 | RUN --mount=type=bind,from=vllm-builder,source=/wheels,target=/tmp/wheels \
77 |     python3 -m pip install --no-cache-dir /tmp/wheels/*.whl -c /tmp/constraints.txt && \
78 |     rm /tmp/constraints.txt
79 | 
80 | 
81 | EXPOSE 8080


--------------------------------------------------------------------------------
/vllm-tensorizer/freeze.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | # Usage: freeze.sh NAME... -- prints the `pip list --format freeze` lines that start with one of NAME.
 3 | PATTERN="";
 4 | for DEP in "$@"; do {
 5 |   PATTERN="${PATTERN:+$PATTERN|}${DEP}";  # join the names with "|"
 6 | }; done;
 7 | PATTERN="^(${PATTERN})\b";  # anchor at line start; the name must end at a word boundary
 8 | # Matching is case-insensitive; `|| :` keeps the status zero when nothing matches.
 9 | python3 -m pip list --format freeze --disable-pip-version-check \
10 |   | { grep -iE "${PATTERN}" || :; };
11 | 


--------------------------------------------------------------------------------