├── .github ├── configurations │ ├── torch-base.yml │ └── torch-nccl.yml └── workflows │ ├── bloom.yml │ ├── build.yml │ ├── cuda-ssh.yml │ ├── cw-mega-sam.yml │ ├── gpt-neox-determined.yml │ ├── gpt-neox-mpi.yml │ ├── hf-llm-inference.yml │ ├── megatron.yml │ ├── read-configuration.yml │ ├── sd-finetuner.yml │ ├── sd-inference.yml │ ├── sglang.yml │ ├── tensorizer.yml │ ├── torch-base.yml │ ├── torch-extras.yml │ ├── torch-nccl.yml │ ├── torch-nightly.yml │ ├── torch.yml │ └── vllm-tensorizer.yml ├── .gitignore ├── LICENSE ├── bloom ├── Dockerfile └── environment.yaml ├── catalog.yaml ├── cuda-ssh └── Dockerfile ├── cw-mega-sam ├── Dockerfile ├── cuda124.patch └── requirements.txt ├── docs └── README.md ├── gpt-neox-determined └── Dockerfile ├── gpt-neox-mpi └── Dockerfile ├── hf-llm-inference └── Dockerfile ├── megatron ├── Dockerfile └── requirements.txt ├── mkdocs.yml ├── sd-finetuner └── Dockerfile ├── sd-inference └── Dockerfile ├── sglang ├── Dockerfile ├── build.bash └── install.bash ├── tensorizer └── Dockerfile ├── torch-extras ├── Dockerfile ├── compiler_wrapper.f95 ├── effective_cpu_count.sh ├── install_cudnn.sh └── scale.sh ├── torch ├── Dockerfile ├── compiler_wrapper.f95 ├── effective_cpu_count.sh ├── install_cudnn.sh ├── scale.sh └── torchaudio-cu125-pr3811.patch └── vllm-tensorizer ├── Dockerfile └── freeze.sh /.github/configurations/torch-base.yml: -------------------------------------------------------------------------------- 1 | cuda: [ 12.9.0, 12.8.1, 12.6.3 ] 2 | os: [ ubuntu22.04 ] 3 | abi: [ 1 ] 4 | include: 5 | - torch: 2.7.0 6 | vision: 0.22.0 7 | audio: 2.7.0 8 | -------------------------------------------------------------------------------- /.github/configurations/torch-nccl.yml: -------------------------------------------------------------------------------- 1 | cuda: [ 12.9.0, 12.8.1, 12.6.3 ] 2 | os: [ ubuntu22.04 ] 3 | abi: [ 1 ] 4 | include: 5 | - torch: 2.7.0 6 | vision: 0.22.0 7 | audio: 2.7.0 8 | nccl: 2.27.3-1 9 | 
nccl-tests-hash: d82e3c0 10 | -------------------------------------------------------------------------------- /.github/workflows/bloom.yml: -------------------------------------------------------------------------------- 1 | on: 2 | workflow_dispatch: 3 | push: 4 | paths: 5 | - "bloom/**" 6 | - ".github/workflows/bloom.yml" 7 | - ".github/workflows/build.yml" 8 | 9 | 10 | jobs: 11 | build: 12 | uses: ./.github/workflows/build.yml 13 | secrets: inherit 14 | with: 15 | image-name: bloom 16 | folder: bloom 17 | build-args: "" 18 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: Build 2 | 3 | on: 4 | workflow_call: 5 | inputs: 6 | folder: 7 | required: true 8 | type: string 9 | image-name: 10 | required: true 11 | type: string 12 | build-args: 13 | required: false 14 | type: string 15 | tag-suffix: 16 | required: false 17 | type: string 18 | cache-key: 19 | required: false 20 | description: "Optional sub-key to append to the image name for build layer caching" 21 | type: string 22 | platforms: 23 | required: false 24 | description: "Platforms for which to build (default: linux/amd64,linux/arm64)" 25 | type: string 26 | default: linux/amd64,linux/arm64 27 | outputs: 28 | outcome: 29 | description: "The outcome of the build" 30 | value: ${{ jobs.build.outputs.outcome }} 31 | tags: 32 | description: "The resulting image tags" 33 | value: ${{ jobs.build.outputs.tags }} 34 | version: 35 | description: "The resulting image version" 36 | value: ${{ jobs.build.outputs.version }} 37 | 38 | jobs: 39 | build: 40 | name: Build Images 41 | runs-on: [ cw ] 42 | container: 'ghcr.io/coreweave/github-actions-images/github-base-runner:v1.9.0' 43 | timeout-minutes: 960 44 | defaults: 45 | run: 46 | shell: bash 47 | outputs: 48 | outcome: ${{ steps.docker-build.outcome }} 49 | tags: ${{ steps.meta.outputs.tags }} 50 | version: ${{ 
steps.meta.outputs.version }} 51 | steps: 52 | - uses: actions/checkout@v4 53 | - name: Fetch BuildKit Client Certs 54 | uses: dopplerhq/secrets-fetch-action@v1.2.0 55 | id: client-certs 56 | with: 57 | doppler-token: ${{ secrets.ORG_BUILDKIT_CLIENT_TOKEN }} 58 | doppler-project: ${{ secrets.BUILDKIT_CONSUMER_DOPPLER_PROJECT }} 59 | doppler-config: prod 60 | inject-env-vars: false 61 | - name: Set up Docker Buildx 62 | uses: docker/setup-buildx-action@v3.7.1 63 | with: 64 | driver: remote 65 | endpoint: ${{ secrets.BUILDKIT_CONSUMER_AMD64_ENDPOINT }} 66 | platforms: linux/amd64 67 | append: | 68 | - endpoint: ${{ secrets.BUILDKIT_CONSUMER_ARM64_ENDPOINT }} 69 | platforms: linux/arm64 70 | env: 71 | BUILDER_NODE_0_AUTH_TLS_CACERT: ${{ steps.client-certs.outputs.TLS_CACERT }} 72 | BUILDER_NODE_0_AUTH_TLS_CERT: ${{ steps.client-certs.outputs.TLS_CERT }} 73 | BUILDER_NODE_0_AUTH_TLS_KEY: ${{ steps.client-certs.outputs.TLS_KEY }} 74 | BUILDER_NODE_1_AUTH_TLS_CACERT: ${{ steps.client-certs.outputs.TLS_CACERT }} 75 | BUILDER_NODE_1_AUTH_TLS_CERT: ${{ steps.client-certs.outputs.TLS_CERT }} 76 | BUILDER_NODE_1_AUTH_TLS_KEY: ${{ steps.client-certs.outputs.TLS_KEY }} 77 | - name: Get base registry 78 | run: | 79 | echo "REGISTRY=ghcr.io/${GITHUB_REPOSITORY,,}" >> $GITHUB_ENV 80 | - name: Set tag prefix 81 | if: github.ref_name != 'main' 82 | run: | 83 | echo "TAG_PREFIX=${{ github.ref_name }}-" >> $GITHUB_ENV 84 | - name: Set tag suffix 85 | if: inputs.tag-suffix != '' 86 | run: | 87 | echo "TAG_SUFFIX=-${{ inputs.tag-suffix }}" >> $GITHUB_ENV 88 | - name: Set cache key 89 | if: inputs.cache-key != '' 90 | run: | 91 | echo "CACHE_KEY=${{ inputs.image-name }}-${{ inputs.cache-key }}" >> $GITHUB_ENV 92 | - name: Extract metadata (tags, labels) for Docker 93 | id: meta 94 | uses: docker/metadata-action@v5.5.1 95 | with: 96 | images: ${{ env.REGISTRY }}/${{ inputs.image-name }} 97 | tags: | 98 | type=sha,prefix=${{ env.TAG_PREFIX }},suffix=${{ env.TAG_SUFFIX }},format=short 99 | 
- name: Initialize registry credentials file 100 | env: 101 | USER: ${{ github.actor }} 102 | PASS: ${{ secrets.GITHUB_TOKEN }} 103 | run: | 104 | jq -n '.auths."ghcr.io" = { username: env.USER, password: env.PASS }' \ 105 | | install -m400 /dev/stdin ~/.docker/config.json 106 | - name: Build and push Docker image 107 | id: docker-build 108 | uses: docker/build-push-action@v6.9.0 109 | with: 110 | context: ${{ inputs.folder }} 111 | build-args: |- 112 | ${{ inputs.build-args }} 113 | push: true 114 | tags: ${{ steps.meta.outputs.tags }} 115 | labels: ${{ steps.meta.outputs.labels }} 116 | cache-from: type=registry,ref=${{ env.REGISTRY }}/buildcache:${{ env.CACHE_KEY || inputs.image-name }} 117 | cache-to: type=registry,ref=${{ env.REGISTRY }}/buildcache:${{ env.CACHE_KEY || inputs.image-name }},mode=max 118 | platforms: ${{ inputs.platforms }} 119 | - name: Clear registry credentials 120 | if: always() 121 | run: | 122 | rm -f ~/.docker/config.json && [ ! -e ~/.docker/config.json ] 123 | - uses: 8BitJonny/gh-get-current-pr@2.1.3 124 | id: PR 125 | with: 126 | filterOutClosed: true 127 | - name: Comment 128 | if: steps.PR.outputs.number 129 | uses: peter-evans/create-or-update-comment@v2.1.0 130 | with: 131 | issue-number: ${{ steps.PR.outputs.number }} 132 | body: > 133 | @${{ github.triggering_actor }} Build complete, ${{ steps.docker-build.outcome }}: 134 | ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} 135 | 136 | Image: `${{ fromJSON(steps.docker-build.outputs.metadata)['image.name'] }}` 137 | -------------------------------------------------------------------------------- /.github/workflows/cuda-ssh.yml: -------------------------------------------------------------------------------- 1 | on: 2 | workflow_dispatch: 3 | push: 4 | paths: 5 | - "cuda-ssh/**" 6 | - ".github/workflows/cuda-ssh.yml" 7 | - ".github/workflows/build.yml" 8 | 9 | 10 | jobs: 11 | build: 12 | strategy: 13 | matrix: 14 | tag: 15 | - 
ceeb8c2-base-cuda11.8.0-torch2.0.1-vision0.15.2-audio2.0.2 16 | - ceeb8c2-nccl-cuda11.8.0-nccl2.16.2-1-torch2.0.1-vision0.15.2-audio2.0.2 17 | 18 | uses: ./.github/workflows/build.yml 19 | secrets: inherit 20 | with: 21 | image-name: cuda-ssh 22 | folder: cuda-ssh 23 | tag-suffix: torch-${{ matrix.tag }} 24 | build-args: | 25 | BASE_IMAGE=ghcr.io/coreweave/ml-containers/torch:${{ matrix.tag }} 26 | -------------------------------------------------------------------------------- /.github/workflows/cw-mega-sam.yml: -------------------------------------------------------------------------------- 1 | on: 2 | workflow_dispatch: 3 | inputs: 4 | base-image: 5 | description: "Base image to use" 6 | required: true 7 | commit: 8 | description: "Commit of Mega-sam to include" 9 | required: true 10 | push: 11 | paths: 12 | - "cw-mega-sam/**" 13 | - ".github/workflows/cw-mega-sam.yml" 14 | - ".github/workflows/build.yml" 15 | 16 | jobs: 17 | build: 18 | uses: ./.github/workflows/build.yml 19 | secrets: inherit 20 | with: 21 | image-name: cw-mega-sam 22 | folder: cw-mega-sam 23 | build-args: | 24 | BASE_IMAGE=${{ inputs.base-image || 'ghcr.io/coreweave/ml-containers/torch-extras:es-actions-68fbfd1-nccl-cuda12.4.1-ubuntu22.04-nccl2.25.1-1-torch2.6.0-vision0.21.0-audio2.6.0-abi0'}} 25 | COMMIT=${{ inputs.commit || 'main'}} 26 | -------------------------------------------------------------------------------- /.github/workflows/gpt-neox-determined.yml: -------------------------------------------------------------------------------- 1 | on: 2 | workflow_dispatch: 3 | push: 4 | paths: 5 | - "gpt-neox-determined/**" 6 | - ".github/workflows/gpt-neox-determined.yml" 7 | - ".github/workflows/build.yml" 8 | 9 | 10 | jobs: 11 | build: 12 | uses: ./.github/workflows/build.yml 13 | secrets: inherit 14 | with: 15 | image-name: gpt-neox-determined 16 | folder: gpt-neox-determined 17 | build-args: "" 18 | -------------------------------------------------------------------------------- 
/.github/workflows/gpt-neox-mpi.yml: -------------------------------------------------------------------------------- 1 | on: 2 | workflow_dispatch: 3 | push: 4 | paths: 5 | - "gpt-neox-mpi/**" 6 | - ".github/workflows/gpt-neox-mpi.yml" 7 | - ".github/workflows/build.yml" 8 | 9 | 10 | jobs: 11 | build: 12 | uses: ./.github/workflows/build.yml 13 | secrets: inherit 14 | with: 15 | image-name: gpt-neox-mpi 16 | folder: gpt-neox-mpi 17 | build-args: "" 18 | -------------------------------------------------------------------------------- /.github/workflows/hf-llm-inference.yml: -------------------------------------------------------------------------------- 1 | on: 2 | workflow_dispatch: 3 | inputs: 4 | commit: 5 | description: 'Commit to build' 6 | required: true 7 | push: 8 | paths: 9 | - "hf-llm-inference/**" 10 | - ".github/workflows/hf-llm-inference.yml" 11 | - ".github/workflows/build.yml" 12 | 13 | 14 | jobs: 15 | build: 16 | uses: ./.github/workflows/build.yml 17 | secrets: inherit 18 | with: 19 | image-name: hf-llm-inference 20 | folder: hf-llm-inference 21 | build-args: | 22 | ${{ inputs.commit && 'COMMIT=' }}${{ inputs.commit }} 23 | -------------------------------------------------------------------------------- /.github/workflows/megatron.yml: -------------------------------------------------------------------------------- 1 | on: 2 | workflow_dispatch: 3 | inputs: 4 | base-image: 5 | description: 'Base image to use' 6 | required: true 7 | commit: 8 | description: 'Commit of Megatron to include' 9 | required: true 10 | push: 11 | paths: 12 | - "megatron/**" 13 | - ".github/workflows/megatron.yml" 14 | - ".github/workflows/build.yml" 15 | 16 | 17 | jobs: 18 | build: 19 | uses: ./.github/workflows/build.yml 20 | secrets: inherit 21 | with: 22 | image-name: megatron 23 | folder: megatron 24 | build-args: | 25 | BASE_IMAGE=${{ inputs.base-image || 
'ghcr.io/coreweave/ml-containers/torch-extras:bfe03aa-nccl-cuda12.4.1-ubuntu22.04-nccl2.21.5-1-torch2.4.0-vision0.19.0-audio2.4.0'}} 26 | COMMIT=${{ inputs.commit || 'main'}} -------------------------------------------------------------------------------- /.github/workflows/read-configuration.yml: -------------------------------------------------------------------------------- 1 | name: read-configuration 2 | 3 | on: 4 | workflow_call: 5 | inputs: 6 | path: 7 | required: true 8 | type: string 9 | filter: 10 | required: false 11 | type: string 12 | outputs: 13 | config: 14 | description: "The retrieved configuration, as JSON" 15 | value: ${{ jobs.read-file.outputs.config }} 16 | 17 | jobs: 18 | read-file: 19 | name: Read Configuration File 20 | runs-on: [ cw ] 21 | container: 'ghcr.io/coreweave/github-actions-images/github-base-runner:v1.4.0' 22 | defaults: 23 | run: 24 | shell: bash 25 | permissions: {} 26 | outputs: 27 | config: ${{ steps.read.outputs.contents }} 28 | steps: 29 | - uses: actions/checkout@v4 30 | - name: Read configuration 31 | id: read 32 | env: 33 | FILE_PATH: ${{ inputs.path }} 34 | FILTER: ${{ inputs.filter }} 35 | run: | 36 | set -x; 37 | if [ -n "$FILTER" ]; then 38 | CONTENTS="$(yq e "$FILE_PATH" --expression "$FILTER" -oj -I0)"; 39 | else 40 | CONTENTS="$(yq e "$FILE_PATH" -oj -I0)"; 41 | fi; 42 | echo "contents=$CONTENTS" >> "$GITHUB_OUTPUT"; 43 | 44 | { 45 | echo '## Configuration'; 46 | echo '```json'; 47 | echo "$CONTENTS" | jq .; 48 | echo '```'; 49 | } >> "$GITHUB_STEP_SUMMARY"; 50 | -------------------------------------------------------------------------------- /.github/workflows/sd-finetuner.yml: -------------------------------------------------------------------------------- 1 | on: 2 | workflow_dispatch: 3 | inputs: 4 | commit: 5 | description: 'Commit to build' 6 | required: true 7 | default: 'master' 8 | push: 9 | paths: 10 | - "sd-finetuner/**" 11 | - ".github/workflows/sd-finetuner.yml" 12 | - ".github/workflows/build.yml" 13 
| 14 | 15 | jobs: 16 | build: 17 | uses: ./.github/workflows/build.yml 18 | secrets: inherit 19 | with: 20 | image-name: sd-finetuner 21 | folder: sd-finetuner 22 | build-args: "${{ github.event.inputs.commit && 'COMMIT=' }}${{ github.event.inputs.commit }}" 23 | -------------------------------------------------------------------------------- /.github/workflows/sd-inference.yml: -------------------------------------------------------------------------------- 1 | on: 2 | workflow_dispatch: 3 | inputs: 4 | commit: 5 | description: 'Commit to build' 6 | required: true 7 | default: 'master' 8 | push: 9 | paths: 10 | - "sd-inference/**" 11 | - ".github/workflows/sd-inference.yml" 12 | - ".github/workflows/build.yml" 13 | 14 | 15 | jobs: 16 | build: 17 | uses: ./.github/workflows/build.yml 18 | secrets: inherit 19 | with: 20 | image-name: sd-inference 21 | folder: sd-inference 22 | build-args: | 23 | COMMIT=${{ github.event.inputs.commit }} 24 | -------------------------------------------------------------------------------- /.github/workflows/sglang.yml: -------------------------------------------------------------------------------- 1 | on: 2 | workflow_dispatch: 3 | inputs: 4 | tag: 5 | description: 'Tag for the build' 6 | required: true 7 | base-image: 8 | description: 'Base image from which to build' 9 | required: true 10 | builder-image: 11 | description: 'Image to use to compile wheels, if different from the base image' 12 | required: false 13 | push: 14 | paths: 15 | - "sglang/**" 16 | - ".github/workflows/sglang.yml" 17 | - ".github/workflows/build.yml" 18 | 19 | 20 | jobs: 21 | build: 22 | uses: ./.github/workflows/build.yml 23 | secrets: inherit 24 | with: 25 | image-name: sglang 26 | folder: sglang 27 | tag-suffix: ${{ inputs.tag || '386fabe-nccl-cuda12.8.0-ubuntu22.04-nccl2.25.1-1-torch2.6.0-vision0.21.0-audio2.6.0-abi1' }} 28 | build-args: | 29 | BASE_IMAGE=${{ inputs.base-image || 
'ghcr.io/coreweave/ml-containers/torch-extras:es-actions-386fabe-nccl-cuda12.8.0-ubuntu22.04-nccl2.25.1-1-torch2.6.0-vision0.21.0-audio2.6.0-abi1'}} 30 | ${{ inputs.base-image && 'BASE_IMAGE=' }}${{ inputs.base-image}} 31 | -------------------------------------------------------------------------------- /.github/workflows/tensorizer.yml: -------------------------------------------------------------------------------- 1 | on: 2 | workflow_dispatch: 3 | inputs: 4 | commit: 5 | description: 'Commit to build' 6 | required: true 7 | default: 'master' 8 | push: 9 | paths: 10 | - "tensorizer/**" 11 | - ".github/workflows/tensorizer.yml" 12 | - ".github/workflows/build.yml" 13 | 14 | 15 | jobs: 16 | build: 17 | uses: ./.github/workflows/build.yml 18 | secrets: inherit 19 | with: 20 | image-name: tensorizer 21 | folder: tensorizer 22 | build-args: "${{ github.event.inputs.commit && 'COMMIT=' }}${{ github.event.inputs.commit }}" 23 | -------------------------------------------------------------------------------- /.github/workflows/torch-base.yml: -------------------------------------------------------------------------------- 1 | name: torch-base 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | image-name: 7 | required: false 8 | description: "Custom name under which to publish the resulting container" 9 | type: string 10 | image-tag-suffix: 11 | required: false 12 | description: "Custom tag suffix listing library versions under which to publish the resulting container" 13 | type: string 14 | push: 15 | paths: 16 | - "torch/**" 17 | - ".github/configurations/torch-base.yml" 18 | - ".github/workflows/torch-base.yml" 19 | - ".github/workflows/torch.yml" 20 | - ".github/workflows/build.yml" 21 | 22 | 23 | jobs: 24 | get-config: 25 | name: Get torch:base Config 26 | uses: ./.github/workflows/read-configuration.yml 27 | with: 28 | path: ./.github/configurations/torch-base.yml 29 | build: 30 | name: Build torch:base 31 | needs: get-config 32 | strategy: 33 | matrix: ${{ 
fromJSON(needs.get-config.outputs.config) }} 34 | uses: ./.github/workflows/torch.yml 35 | secrets: inherit 36 | with: 37 | image-name: ${{ inputs.image-name }} 38 | tag: ${{ format('{0}-{1}', format('base-cuda{0}-{1}', matrix.cuda, matrix.os), inputs.image-tag-suffix || format('torch{0}-vision{1}-audio{2}-abi{3}', matrix.torch, matrix.vision, matrix.audio, matrix.abi)) }} 39 | builder-base-image: nvidia/cuda:${{ matrix.cuda }}-devel-${{ matrix.os }} 40 | base-image: nvidia/cuda:${{ matrix.cuda }}-base-${{ matrix.os }} 41 | torch-version: ${{ matrix.torch }} 42 | torchvision-version: ${{ matrix.vision }} 43 | torchaudio-version: ${{ matrix.audio }} 44 | additional-build-args: BUILD_CXX11_ABI=${{ matrix.abi }} 45 | cache-key: base-cuda${{ matrix.cuda }}-${{ matrix.os }} 46 | build-extras: true 47 | -------------------------------------------------------------------------------- /.github/workflows/torch-extras.yml: -------------------------------------------------------------------------------- 1 | name: torch-extras 2 | 3 | on: 4 | workflow_call: 5 | inputs: 6 | tag: 7 | required: true 8 | type: string 9 | base-image: 10 | required: true 11 | type: string 12 | image-name: 13 | required: false 14 | type: string 15 | skip-bases-check: 16 | required: false 17 | type: boolean 18 | default: true 19 | cache-key: 20 | required: false 21 | type: string 22 | 23 | workflow_dispatch: 24 | inputs: 25 | tag: 26 | required: false 27 | description: "Tag suffix to identify the build" 28 | type: string 29 | base-image: 30 | required: false 31 | description: "Base image for the build" 32 | type: string 33 | image-name: 34 | required: false 35 | description: "Custom name under which to publish the resulting container" 36 | type: string 37 | skip-bases-check: 38 | required: false 39 | description: "Build from one specific image rather than the most recent releases from the main branch" 40 | type: boolean 41 | default: true 42 | 43 | push: 44 | paths: 45 | - "torch-extras/**" 46 | - 
".github/workflows/torch-extras.yml" 47 | - ".github/workflows/build.yml" 48 | 49 | 50 | jobs: 51 | get-required-bases: 52 | name: Get Latest Required Base Images 53 | if: inputs.skip-bases-check != true 54 | runs-on: [ cw ] 55 | container: 'ghcr.io/coreweave/github-actions-images/github-base-runner:v1.4.0' 56 | defaults: 57 | run: 58 | shell: bash 59 | permissions: 60 | packages: read 61 | outputs: 62 | bases-list: ${{ steps.choose-bases.outputs.list }} 63 | steps: 64 | - uses: actions/checkout@v4 65 | with: 66 | fetch-depth: 0 67 | - name: Check if torch-extras needs to be rebuilt from previous bases 68 | id: check-changed 69 | run: | 70 | if [ "$EVENT_NAME" = 'push' ]; then \ 71 | if [ "$FORCE_PUSH" = '1' ] || \ 72 | [ "$BEFORE_HASH" = '0000000000000000000000000000000000000000' ] && [ -n "$FIRST_COMMIT" ]; then \ 73 | export BEFORE_HASH="$FIRST_COMMIT~"; 74 | fi && \ 75 | CHANGED_FILES="$(git diff --name-only "$BEFORE_HASH" "$AFTER_HASH")" && \ 76 | { \ 77 | echo "$CHANGED_FILES" \ 78 | | grep -P '^(\./)?(torch/|\.github/workflows/torch(-base)?\.yml|\.github/workflows/build\.yml)' > /dev/null \ 79 | && echo "BASE_PROVIDED=true" >> "$GITHUB_OUTPUT" \ 80 | || echo "BASE_PROVIDED=false" >> "$GITHUB_OUTPUT"; \ 81 | } && { \ 82 | echo "$CHANGED_FILES" \ 83 | | grep -P '^(\./)?(torch/|\.github/workflows/torch(-nccl)?\.yml|\.github/workflows/build\.yml)' > /dev/null \ 84 | && echo "NCCL_PROVIDED=true" >> "$GITHUB_OUTPUT" \ 85 | || echo "NCCL_PROVIDED=false" >> "$GITHUB_OUTPUT"; \ 86 | }; \ 87 | else \ 88 | echo "BASE_PROVIDED=false" >> "$GITHUB_OUTPUT" && \ 89 | echo "NCCL_PROVIDED=false" >> "$GITHUB_OUTPUT"; 90 | fi 91 | env: 92 | EVENT_NAME: ${{ github.event_name }} 93 | BEFORE_HASH: ${{ github.event.before }} 94 | AFTER_HASH: ${{ github.event.after }} 95 | FIRST_COMMIT: ${{ github.event.commits[0].id }} 96 | FORCE_PUSH: ${{ github.event.forced && '1' || '' }} 97 | - name: Get latest torch container releases 98 | if: steps.check-changed.outputs.BASE_PROVIDED != 
'true' || steps.check-changed.outputs.NCCL_PROVIDED != 'true' 99 | id: get-latest 100 | run: | 101 | RELEASES="$( \ 102 | /bin/curl -f -s --oauth2-bearer "$(echo "$BEARER_TOKEN" | base64 -w 0)" \ 103 | 'https://ghcr.io/v2/coreweave/ml-containers%2Ftorch/tags/list?n=100000' \ 104 | | jq -r '.["tags"][]' \ 105 | | grep -E '^[0-9a-f]{7}-(base|nccl)-' \ 106 | )" && \ 107 | BASE_RELEASES="$(echo "$RELEASES" | grep -E '^[0-9a-f]{7}-base-')" && \ 108 | NCCL_RELEASES="$(echo "$RELEASES" | grep -E '^[0-9a-f]{7}-nccl-')" && \ 109 | LATEST_BASE_COMMIT="$(echo "$BASE_RELEASES" | tail -1 | cut -c1-7)" && \ 110 | LATEST_NCCL_COMMIT="$(echo "$NCCL_RELEASES" | tail -1 | cut -c1-7)" && \ 111 | LATEST_BASE_IMAGES="$(echo "$BASE_RELEASES" | grep -F "${LATEST_BASE_COMMIT}-")" && \ 112 | LATEST_NCCL_IMAGES="$(echo "$NCCL_RELEASES" | grep -F "${LATEST_NCCL_COMMIT}-")" && \ 113 | echo "LATEST_BASE_IMAGES=$(echo $LATEST_BASE_IMAGES)" >> "$GITHUB_OUTPUT" && \ 114 | echo "LATEST_NCCL_IMAGES=$(echo $LATEST_NCCL_IMAGES)" >> "$GITHUB_OUTPUT" 115 | env: 116 | BEARER_TOKEN: ${{ secrets.GITHUB_TOKEN }} 117 | - name: Choose which torch containers to use as a build base 118 | if: steps.check-changed.outputs.BASE_PROVIDED != 'true' || steps.check-changed.outputs.NCCL_PROVIDED != 'true' 119 | id: choose-bases 120 | run: | 121 | TAG_TO_JSON() { 122 | TAG_PATTERN='^[0-9a-f]{7}-(.*)'; 123 | JSON_REPLACE='{"tag":"\1","image":"ghcr.io/coreweave/ml-containers/torch:\0"}'; 124 | sed -E -e "s@${TAG_PATTERN}@${JSON_REPLACE}@g"; 125 | } && \ 126 | SPLIT_TO_LINES() { xargs -n 1; } && \ 127 | JOIN_LINES() { tr '[:space:]' ',' | sed -e 's/,$//'; } && \ 128 | echo '## Pre-existing `ghcr.io/coreweave/ml-containers/torch` images to build from' >> "$GITHUB_STEP_SUMMARY" && \ 129 | echo "list=[$( \ 130 | ( \ 131 | if [ "$BASE_PROVIDED" = 'false' ]; then \ 132 | echo "$LATEST_BASE_IMAGES" | xargs -n 1 echo '-' >> "$GITHUB_STEP_SUMMARY" && \ 133 | echo "$LATEST_BASE_IMAGES"; \ 134 | fi && \ 135 | if [ "$NCCL_PROVIDED" = 
'false' ]; then \ 136 | echo "$LATEST_NCCL_IMAGES" | xargs -n 1 echo '-' >> "$GITHUB_STEP_SUMMARY" && \ 137 | echo "$LATEST_NCCL_IMAGES"; \ 138 | fi; \ 139 | ) | SPLIT_TO_LINES | TAG_TO_JSON | JOIN_LINES \ 140 | )]" >> "$GITHUB_OUTPUT"; 141 | env: 142 | BASE_PROVIDED: ${{ steps.check-changed.outputs.BASE_PROVIDED }} 143 | NCCL_PROVIDED: ${{ steps.check-changed.outputs.NCCL_PROVIDED }} 144 | LATEST_BASE_IMAGES: ${{ steps.get-latest.outputs.LATEST_BASE_IMAGES }} 145 | LATEST_NCCL_IMAGES: ${{ steps.get-latest.outputs.LATEST_NCCL_IMAGES }} 146 | 147 | build-call: 148 | name: Build torch-extras via Workflow Call 149 | if: inputs.skip-bases-check 150 | uses: ./.github/workflows/build.yml 151 | secrets: inherit 152 | with: 153 | image-name: ${{ inputs.image-name || 'torch-extras' }} 154 | folder: torch-extras 155 | tag-suffix: ${{ inputs.tag }} 156 | cache-key: ${{ inputs.cache-key }} 157 | build-args: | 158 | BASE_IMAGE=${{ inputs.base-image }} 159 | 160 | build-self: 161 | name: Build torch-extras via Event Trigger 162 | needs: get-required-bases 163 | if: needs.get-required-bases.outputs.bases-list && needs.get-required-bases.outputs.bases-list != '[]' 164 | strategy: 165 | matrix: 166 | bases: ${{ fromJSON(needs.get-required-bases.outputs.bases-list) }} 167 | uses: ./.github/workflows/build.yml 168 | secrets: inherit 169 | with: 170 | image-name: ${{ inputs.image-name || 'torch-extras' }} 171 | folder: torch-extras 172 | tag-suffix: ${{ matrix.bases.tag }} 173 | build-args: | 174 | BASE_IMAGE=${{ matrix.bases.image }} 175 | -------------------------------------------------------------------------------- /.github/workflows/torch-nccl.yml: -------------------------------------------------------------------------------- 1 | name: torch-nccl 2 | 3 | on: 4 | workflow_call: 5 | inputs: 6 | image-name: 7 | required: false 8 | type: string 9 | image-tag-suffix: 10 | required: false 11 | type: string 12 | workflow_dispatch: 13 | inputs: 14 | image-name: 15 | required: false 16 
| description: "Custom name under which to publish the resulting container" 17 | type: string 18 | image-tag-suffix: 19 | required: false 20 | description: "Custom tag suffix listing library versions under which to publish the resulting container" 21 | type: string 22 | push: 23 | paths: 24 | - "torch/**" 25 | - ".github/configurations/torch-nccl.yml" 26 | - ".github/workflows/torch-nccl.yml" 27 | - ".github/workflows/torch.yml" 28 | - ".github/workflows/build.yml" 29 | 30 | 31 | jobs: 32 | get-config: 33 | name: Get torch:nccl Config 34 | uses: ./.github/workflows/read-configuration.yml 35 | with: 36 | path: ./.github/configurations/torch-nccl.yml 37 | build: 38 | name: Build torch:nccl 39 | needs: get-config 40 | strategy: 41 | matrix: ${{ fromJSON(needs.get-config.outputs.config) }} 42 | uses: ./.github/workflows/torch.yml 43 | secrets: inherit 44 | with: 45 | image-name: ${{ inputs.image-name }} 46 | tag: ${{ format('{0}-{1}', format('nccl-cuda{0}-{1}-nccl{2}', matrix.cuda, matrix.os, matrix.nccl), inputs.image-tag-suffix || format('torch{0}-vision{1}-audio{2}-abi{3}', matrix.torch, matrix.vision, matrix.audio, matrix.abi)) }} 47 | builder-base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.cuda }}-devel-${{ matrix.os }}-nccl${{ matrix.nccl }}-${{ matrix.nccl-tests-hash }} 48 | base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.cuda }}-devel-${{ matrix.os }}-nccl${{ matrix.nccl }}-${{ matrix.nccl-tests-hash }} 49 | torch-version: ${{ matrix.torch }} 50 | torchvision-version: ${{ matrix.vision }} 51 | torchaudio-version: ${{ matrix.audio }} 52 | additional-build-args: BUILD_CXX11_ABI=${{ matrix.abi }} 53 | cache-key: nccl-cuda${{ matrix.cuda }}-${{ matrix.os }} 54 | build-extras: true 55 | -------------------------------------------------------------------------------- /.github/workflows/torch-nightly.yml: -------------------------------------------------------------------------------- 1 | name: torch-nightly 2 | 3 | on: 4 | workflow_dispatch: 5 | schedule: 6 | 
# At 05:00 UTC (midnight EST) 7 | - cron: "0 5 * * *" 8 | push: 9 | paths: 10 | - "torch/**" 11 | - ".github/configurations/torch-base.yml" 12 | - ".github/configurations/torch-nccl.yml" 13 | - ".github/workflows/torch-nightly.yml" 14 | - ".github/workflows/torch.yml" 15 | - ".github/workflows/build.yml" 16 | 17 | 18 | jobs: 19 | get-nightly-info: 20 | name: 21 | Get Nightly Info 22 | runs-on: [ cw ] 23 | container: 'ghcr.io/coreweave/github-actions-images/github-base-runner:v1.9.0' 24 | defaults: 25 | run: 26 | shell: bash 27 | outputs: 28 | pytorch-commit: ${{ steps.get-hash.outputs.pytorch-commit }} 29 | triton-commit: ${{ steps.get-hash.outputs.triton-commit }} 30 | torchvision-commit: ${{ steps.get-hash.outputs.torchvision-commit }} 31 | torchaudio-commit: ${{ steps.get-hash.outputs.torchaudio-commit }} 32 | version-string: ${{ steps.get-hash.outputs.version-string }} 33 | date: ${{ steps.get-date.outputs.date }} 34 | steps: 35 | - name: Get latest commit hashes 36 | id: get-hash 37 | run: | 38 | set -e; 39 | 40 | FORMAT_COMMIT_LINK() { 41 | printf '[`%.7s`](https://github.com/%s/tree/%s)\n' "$2" "$1" "$2"; 42 | }; 43 | 44 | LOG() { 45 | printf -- "$@" >> "$GITHUB_STEP_SUMMARY"; 46 | }; 47 | 48 | CLONE() { 49 | git clone --filter=blob:none --no-checkout --depth=1 \ 50 | "https://github.com/$1" \ 51 | "$2" > /dev/null 2> /dev/null && \ 52 | local COMMIT="$(git -C "$2" rev-parse HEAD)" && \ 53 | LOG 'Latest `%s` commit: %s\n' \ 54 | "$1" "$(FORMAT_COMMIT_LINK "$1" "$COMMIT")" && \ 55 | echo $COMMIT; 56 | }; 57 | 58 | GET_VERSION() { 59 | git -C "$1" show HEAD:version.txt 2> /dev/null; 60 | }; 61 | 62 | PYTORCH_COMMIT="$(CLONE pytorch/pytorch pytorch-git)"; 63 | PYTORCH_VERSION="$(GET_VERSION pytorch-git)"; 64 | TRITON_COMMIT_FILE=".ci/docker/ci_commit_pins/triton.txt"; 65 | TRITON_COMMIT="$(git -C pytorch-git show "HEAD:$TRITON_COMMIT_FILE" 2> /dev/null)"; 66 | rm -rf pytorch-git; 67 | 68 | LOG 'Corresponding `openai/triton` commit: %s\n' \ 69 | 
"$(FORMAT_COMMIT_LINK openai/triton "$TRITON_COMMIT")"; 70 | 71 | TORCHVISION_COMMIT="$(CLONE pytorch/vision torchvision-git)"; 72 | TORCHVISION_VERSION="$(GET_VERSION torchvision-git)"; 73 | rm -rf torchvision-git; 74 | 75 | TORCHAUDIO_COMMIT="$(CLONE pytorch/audio torchaudio-git)"; 76 | TORCHAUDIO_VERSION="$(GET_VERSION torchaudio-git)"; 77 | rm -rf torchaudio-git; 78 | 79 | echo "pytorch-commit=$PYTORCH_COMMIT" >> "$GITHUB_OUTPUT"; 80 | echo "triton-commit=$TRITON_COMMIT" >> "$GITHUB_OUTPUT"; 81 | echo "torchvision-commit=$TORCHVISION_COMMIT" >> "$GITHUB_OUTPUT"; 82 | echo "torchaudio-commit=$TORCHAUDIO_COMMIT" >> "$GITHUB_OUTPUT"; 83 | 84 | printf -- 'version-string=torch%s-vision%s-audio%s\n' \ 85 | "$PYTORCH_VERSION" "$TORCHVISION_VERSION" "$TORCHAUDIO_VERSION" \ 86 | >> "$GITHUB_OUTPUT"; 87 | - name: Get date 88 | id: get-date 89 | run: echo "date=$(date -u '+%y%m%d%H')" >> "$GITHUB_OUTPUT"; 90 | 91 | get-base-config: 92 | name: Get torch:base Config 93 | uses: ./.github/workflows/read-configuration.yml 94 | with: 95 | path: ./.github/configurations/torch-base.yml 96 | filter: 'del(.include) | .exclude |= . + [{"abi": "0"}]' 97 | get-nccl-config: 98 | name: Get torch:nccl Config 99 | uses: ./.github/workflows/read-configuration.yml 100 | with: 101 | path: ./.github/configurations/torch-nccl.yml 102 | filter: 'del( .include[] | ( .torch, .vision, .audio ) ) | .exclude |= . 
+ [{"abi": "0"}]' 103 | 104 | build-base: 105 | name: Build Nightly torch:base 106 | needs: 107 | - get-nightly-info 108 | - get-base-config 109 | strategy: 110 | fail-fast: false 111 | matrix: ${{ fromJSON(needs.get-base-config.outputs.config) }} 112 | uses: ./.github/workflows/torch.yml 113 | secrets: inherit 114 | with: 115 | image-name: nightly-torch 116 | tag: ${{ format('base-{0}-cuda{1}-{2}-{3}', needs.get-nightly-info.outputs.date, matrix.cuda, matrix.os, needs.get-nightly-info.outputs.version-string) }} 117 | builder-base-image: nvidia/cuda:${{ matrix.cuda }}-devel-${{ matrix.os }} 118 | base-image: nvidia/cuda:${{ matrix.cuda }}-base-${{ matrix.os }} 119 | torch-version: ${{ needs.get-nightly-info.outputs.pytorch-commit }} 120 | torchvision-version: ${{ needs.get-nightly-info.outputs.torchvision-commit }} 121 | torchaudio-version: ${{ needs.get-nightly-info.outputs.torchaudio-commit }} 122 | additional-build-args: BUILD_TRITON_VERSION=${{ needs.get-nightly-info.outputs.triton-commit }} 123 | cache-key: base-cuda${{ matrix.cuda }}-${{ matrix.os }} 124 | build-extras: true 125 | build-nccl: 126 | name: Build Nightly torch:nccl 127 | needs: 128 | - get-nightly-info 129 | - get-nccl-config 130 | strategy: 131 | fail-fast: false 132 | matrix: ${{ fromJSON(needs.get-nccl-config.outputs.config) }} 133 | uses: ./.github/workflows/torch.yml 134 | secrets: inherit 135 | with: 136 | image-name: nightly-torch 137 | tag: ${{ format('nccl-{0}-cuda{1}-{2}-nccl{3}-{4}', needs.get-nightly-info.outputs.date, matrix.cuda, matrix.os, matrix.nccl, needs.get-nightly-info.outputs.version-string ) }} 138 | builder-base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.cuda }}-devel-${{ matrix.os }}-nccl${{ matrix.nccl }}-${{ matrix.nccl-tests-hash }} 139 | base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.cuda }}-devel-${{ matrix.os }}-nccl${{ matrix.nccl }}-${{ matrix.nccl-tests-hash }} 140 | torch-version: ${{ needs.get-nightly-info.outputs.pytorch-commit }} 141 | 
torchvision-version: ${{ needs.get-nightly-info.outputs.torchvision-commit }} 142 | torchaudio-version: ${{ needs.get-nightly-info.outputs.torchaudio-commit }} 143 | additional-build-args: BUILD_TRITON_VERSION=${{ needs.get-nightly-info.outputs.triton-commit }} 144 | cache-key: nccl-cuda${{ matrix.cuda }}-${{ matrix.os }} 145 | build-extras: true 146 | -------------------------------------------------------------------------------- /.github/workflows/torch.yml: -------------------------------------------------------------------------------- 1 | on: 2 | workflow_call: 3 | inputs: 4 | tag: 5 | required: true 6 | type: string 7 | builder-base-image: 8 | required: true 9 | type: string 10 | base-image: 11 | required: true 12 | type: string 13 | torch-version: 14 | required: true 15 | type: string 16 | torchvision-version: 17 | required: true 18 | type: string 19 | torchaudio-version: 20 | required: true 21 | type: string 22 | additional-build-args: 23 | required: false 24 | type: string 25 | image-name: 26 | required: false 27 | type: string 28 | build-extras: 29 | required: false 30 | type: boolean 31 | default: false 32 | cache-key: 33 | required: false 34 | type: string 35 | 36 | workflow_dispatch: 37 | inputs: 38 | tag: 39 | required: true 40 | description: "Tag suffix to identify the build" 41 | type: string 42 | builder-base-image: 43 | required: true 44 | description: "Base image used during the compilation step" 45 | type: string 46 | base-image: 47 | required: true 48 | description: "Base image for the final image" 49 | type: string 50 | torch-version: 51 | required: true 52 | description: "Tagged version number from pytorch/pytorch to build" 53 | type: string 54 | torchvision-version: 55 | required: true 56 | description: "Tagged version number from pytorch/vision to build" 57 | type: string 58 | torchaudio-version: 59 | required: true 60 | description: "Tagged version number from pytorch/audio to build" 61 | type: string 62 | additional-build-args: 63 | 
required: false 64 | description: "Further --build-arg parameters for the build" 65 | type: string 66 | image-name: 67 | required: false 68 | description: "Custom name under which to publish the resulting container" 69 | type: string 70 | build-extras: 71 | required: false 72 | description: "Whether to build and push a torch-extras container as well" 73 | type: boolean 74 | default: false 75 | 76 | jobs: 77 | build: 78 | name: Build torch 79 | uses: ./.github/workflows/build.yml 80 | secrets: inherit 81 | with: 82 | image-name: ${{ inputs.image-name || 'torch' }} 83 | folder: torch 84 | tag-suffix: ${{ inputs.tag }} 85 | cache-key: ${{ inputs.cache-key }} 86 | build-args: | 87 | BUILD_CCACHE_SIZE=5Gi 88 | BUILDER_BASE_IMAGE=${{ inputs.builder-base-image }} 89 | FINAL_BASE_IMAGE=${{ inputs.base-image }} 90 | BUILD_TORCH_VERSION=${{ inputs.torch-version }} 91 | BUILD_TORCH_VISION_VERSION=${{ inputs.torchvision-version }} 92 | BUILD_TORCH_AUDIO_VERSION=${{ inputs.torchaudio-version }} 93 | ${{ inputs.additional-build-args }} 94 | build-extras: 95 | name: Build torch-extras 96 | if: inputs.build-extras 97 | needs: build 98 | uses: ./.github/workflows/torch-extras.yml 99 | secrets: inherit 100 | with: 101 | tag: ${{ inputs.tag }} 102 | base-image: ${{ needs.build.outputs.tags }} 103 | image-name: ${{ inputs.image-name && format('{0}-extras', inputs.image-name) || '' }} 104 | cache-key: ${{ inputs.cache-key }} 105 | -------------------------------------------------------------------------------- /.github/workflows/vllm-tensorizer.yml: -------------------------------------------------------------------------------- 1 | on: 2 | workflow_dispatch: 3 | inputs: 4 | commit: 5 | description: 'Commit to build' 6 | required: true 7 | push: 8 | paths: 9 | - "vllm-tensorizer/**" 10 | - ".github/workflows/vllm-tensorizer.yml" 11 | - ".github/workflows/build.yml" 12 | 13 | 14 | jobs: 15 | build: 16 | uses: ./.github/workflows/build.yml 17 | secrets: inherit 18 | with: 19 | 
image-name: vllm-tensorizer 20 | folder: vllm-tensorizer 21 | tag-suffix: ${{ inputs.commit || '19307ba71ddeb7e1cc6aec3c1baa8b50d59c1beb'}} 22 | build-args: | 23 | COMMIT_HASH=${{ inputs.commit || '19307ba71ddeb7e1cc6aec3c1baa8b50d59c1beb'}} -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # General 2 | .DS_Store 3 | .AppleDouble 4 | .LSOverride 5 | 6 | # Icon must end with two \r 7 | Icon 8 | 9 | # Thumbnails 10 | ._* 11 | 12 | # Files that might appear in the root of a volume 13 | .DocumentRevisions-V100 14 | .fseventsd 15 | .Spotlight-V100 16 | .TemporaryItems 17 | .Trashes 18 | .VolumeIcon.icns 19 | .com.apple.timemachine.donotpresent 20 | 21 | # Directories potentially created on remote AFP share 22 | .AppleDB 23 | .AppleDesktop 24 | Network Trash Folder 25 | Temporary Items 26 | .apdisk 27 | 28 | *~ 29 | 30 | # temporary files which can be created if a process still has a handle open of a deleted file 31 | .fuse_hidden* 32 | 33 | # KDE directory preferences 34 | .directory 35 | 36 | # Linux trash folder which might appear on any partition or disk 37 | .Trash-* 38 | 39 | # .nfs files are created when an open file is removed but is still being accessed 40 | .nfs* 41 | 42 | # Swap 43 | [._]*.s[a-v][a-z] 44 | !*.svg # comment out if you don't need vector files 45 | [._]*.sw[a-p] 46 | [._]s[a-rt-v][a-z] 47 | [._]ss[a-gi-z] 48 | [._]sw[a-p] 49 | 50 | # Session 51 | Session.vim 52 | Sessionx.vim 53 | 54 | # Temporary 55 | .netrwhist 56 | *~ 57 | # Auto-generated tag files 58 | tags 59 | # Persistent undo 60 | [._]*.un~ 61 | 62 | # -*- mode: gitignore; -*- 63 | *~ 64 | \#*\# 65 | /.emacs.desktop 66 | /.emacs.desktop.lock 67 | *.elc 68 | auto-save-list 69 | tramp 70 | .\#* 71 | 72 | # Org-mode 73 | .org-id-locations 74 | *_archive 75 | 76 | # flymake-mode 77 | *_flymake.* 78 | 79 | # eshell files 80 | /eshell/history 81 | /eshell/lastdir 82 
| 83 | # elpa packages 84 | /elpa/ 85 | 86 | # reftex files 87 | *.rel 88 | 89 | # AUCTeX auto folder 90 | /auto/ 91 | 92 | # cask packages 93 | .cask/ 94 | dist/ 95 | 96 | # Flycheck 97 | flycheck_*.el 98 | 99 | # server auth directory 100 | /server/ 101 | 102 | # projectiles files 103 | .projectile 104 | 105 | # directory configuration 106 | .dir-locals.el 107 | 108 | # network security 109 | /network-security.data 110 | 111 | # -*- mode: gitignore; -*- 112 | *~ 113 | \#*\# 114 | /.emacs.desktop 115 | /.emacs.desktop.lock 116 | *.elc 117 | auto-save-list 118 | tramp 119 | .\#* 120 | 121 | # Org-mode 122 | .org-id-locations 123 | *_archive 124 | 125 | # flymake-mode 126 | *_flymake.* 127 | 128 | # eshell files 129 | /eshell/history 130 | /eshell/lastdir 131 | 132 | # elpa packages 133 | /elpa/ 134 | 135 | # reftex files 136 | *.rel 137 | 138 | # AUCTeX auto folder 139 | /auto/ 140 | 141 | # cask packages 142 | .cask/ 143 | dist/ 144 | 145 | # Flycheck 146 | flycheck_*.el 147 | 148 | # server auth directory 149 | /server/ 150 | 151 | # projectiles files 152 | .projectile 153 | 154 | # directory configuration 155 | .dir-locals.el 156 | 157 | # network security 158 | /network-security.data 159 | 160 | # local environment files 161 | .env 162 | .env* 163 | .environment 164 | .environment* 165 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2022 CoreWeave 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this 
permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /bloom/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM coreweave/nccl-tests:2022-11-06_19-21-22.11_EST 2 | 3 | # setup python and conda 4 | 5 | RUN DEBIAN_FRONTEND=noninteractive apt-get -qq update && \ 6 | DEBIAN_FRONTEND=noninteractive apt-get -qq install -y --no-install-recommends \ 7 | python3 python3-dev python3-pip git libssl-dev pkg-config 8 | 9 | RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ 10 | bash Miniconda3-latest-Linux-x86_64.sh -b -p /opt/conda && \ 11 | rm Miniconda3-latest-Linux-x86_64.sh 12 | 13 | ENV PATH=/opt/conda/bin:$PATH 14 | 15 | # create conda environment from file. this step installs torch. 
16 | COPY ./environment.yaml /opt/nccl-tests/environment.yaml 17 | RUN conda env create -f /opt/nccl-tests/environment.yaml 18 | 19 | SHELL ["conda", "run", "-n", "tr11-176B-ml", "/bin/bash", "-c"] 20 | 21 | # setup rust and then tokenizers 22 | RUN conda install -y -c conda-forge rust 23 | 24 | RUN git clone https://github.com/huggingface/tokenizers && \ 25 | cd tokenizers && \ 26 | git checkout bigscience_fork && \ 27 | pip install setuptools_rust && \ 28 | pip install -e bindings/python 29 | 30 | # install apex 31 | RUN git clone https://github.com/NVIDIA/apex && \ 32 | cd apex && \ 33 | pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./ --upgrade 34 | 35 | # install deepspeed from here: 36 | # https://github.com/microsoft/DeepSpeed/tree/olruwase/elastic-ckpt-refresh 37 | RUN pip install git+https://github.com/microsoft/DeepSpeed.git@olruwase/elastic-ckpt-refresh --upgrade 38 | 39 | # clone bigscience repo 40 | RUN git clone https://github.com/bigscience-workshop/bigscience.git 41 | 42 | # https://github.com/bigscience-workshop/Megatron-DeepSpeed/tree/olruwase/ds_ckpt_reshape 43 | RUN git clone --single-branch --branch olruwase/ds_ckpt_reshape https://github.com/bigscience-workshop/Megatron-DeepSpeed.git 44 | -------------------------------------------------------------------------------- /bloom/environment.yaml: -------------------------------------------------------------------------------- 1 | name: tr11-176B-ml 2 | channels: 3 | - conda-forge 4 | - defaults 5 | dependencies: 6 | - _libgcc_mutex=0.1=conda_forge 7 | - _openmp_mutex 8 | - ca-certificates=2022.6.15=ha878542_0 9 | - certifi=2022.6.15=py38h578d9bd_0 10 | - ld_impl_linux-64 11 | - libffi 12 | - libgcc-ng 13 | - libgomp 14 | - libstdcxx-ng>=10.3.0 15 | - nccl=2.10.3.1=hdc17891_0 16 | - ncurses=6.3=h7f8727e_2 17 | - openssl=1.1.1q=h7f8727e_0 18 | - pip=21.2.4=py38h06a4308_0 19 | - python=3.8.12=h12debd9_0 20 | - python_abi=3.8=2_cp38 21 | -
readline=8.1.2=h7f8727e_1 22 | - setuptools=58.0.4=py38h06a4308_0 23 | - sqlite=3.37.2=hc218d9a_0 24 | - tk=8.6.11=h1ccaba5_0 25 | - wheel=0.37.1=pyhd3eb1b0_0 26 | - xz=5.2.5=h7b6447c_0 27 | - zlib=1.2.11=h7f8727e_4 28 | - pip: 29 | - absl-py==1.0.0 30 | - aiohttp==3.8.1 31 | - aiosignal==1.2.0 32 | - appdirs==1.4.4 33 | - astunparse==1.6.3 34 | - async-timeout==4.0.2 35 | - attrs==21.4.0 36 | - best-download==0.0.9 37 | - black==21.4b0 38 | - cachetools==5.0.0 39 | - chardet==4.0.0 40 | - charset-normalizer==2.0.12 41 | - click==8.0.4 42 | - colorama==0.4.4 43 | - cython==0.29.28 44 | - dataproperty==0.55.0 45 | - datasets==1.15.1 46 | - dill==0.3.4 47 | - dynet38==2.1 48 | - filelock==3.6.0 49 | - flatbuffers==2.0 50 | - frozenlist==1.3.0 51 | - fsspec==2022.2.0 52 | - gast==0.5.3 53 | - google-auth==2.6.0 54 | - google-auth-oauthlib==0.4.6 55 | - google-pasta==0.2.0 56 | - grpcio==1.44.0 57 | - h5py==3.6.0 58 | - hjson==3.0.2 59 | - huggingface-hub==0.9.0 60 | - idna==3.3 61 | - importlib-metadata==4.11.2 62 | - iniconfig==1.1.1 63 | - isort==5.10.1 64 | - jieba==0.42.1 65 | - joblib==1.1.0 66 | - jsonlines==2.0.0 67 | - keras==2.8.0 68 | - keras-preprocessing==1.1.2 69 | - libclang==14.0.1 70 | - lm-dataformat==0.0.20 71 | - lm-eval==0.2.0 72 | - markdown==3.3.6 73 | - mbstrdecoder==1.1.0 74 | - mock==4.0.3 75 | - msgfy==0.2.0 76 | - multidict==6.0.2 77 | - multiprocess==0.70.12.2 78 | - mypy-extensions==0.4.3 79 | - nagisa==0.2.7 80 | - ninja==1.10.2.3 81 | - nltk==3.7 82 | - numexpr==2.7.2 83 | - numpy==1.22.3 84 | - oauthlib==3.2.0 85 | - openai==0.6.4 86 | - opt-einsum==3.3.0 87 | - packaging==21.3 88 | - pandas==1.4.1 89 | - parameterized==0.8.1 90 | - pathspec==0.9.0 91 | - pathvalidate==2.5.0 92 | - pillow==9.0.1 93 | - pluggy==0.13.1 94 | - portalocker==2.4.0 95 | - protobuf==3.19.4 96 | - psutil==5.9.0 97 | - py==1.11.0 98 | - py-cpuinfo==8.0.0 99 | - py-spy==0.3.11 100 | - pyarrow==7.0.0 101 | - pyasn1==0.4.8 102 | - pyasn1-modules==0.2.8 103 | - 
pybind11==2.6.2 104 | - pycountry==20.7.3 105 | - pydantic==1.9.1 106 | - pyparsing==3.0.7 107 | - pytablewriter==0.58.0 108 | - pytest==6.2.3 109 | - pytest-instafail==0.4.2 110 | - python-dateutil==2.8.2 111 | - pytz==2021.3 112 | - pyyaml==6.0 113 | - regex==2022.3.2 114 | - rehash==1.0.0 115 | - requests==2.27.1 116 | - requests-oauthlib==1.3.1 117 | - responses==0.18.0 118 | - rouge-score==0.0.4 119 | - rsa==4.8 120 | - sacrebleu==1.5.0 121 | - sacremoses==0.0.47 122 | - scikit-learn==1.0.2 123 | - scipy==1.8.0 124 | - semantic-version==2.9.0 125 | - sentencepiece==0.1.96 126 | - setuptools-rust==1.1.2 127 | - six==1.16.0 128 | - sqlitedict==1.6.0 129 | - tabledata==1.3.0 130 | - tcolorpy==0.1.2 131 | - tensorboard==2.8.0 132 | - tensorboard-data-server==0.6.1 133 | - tensorboard-plugin-wit==1.8.1 134 | - tensorflow==2.8.0 135 | - tensorflow-io-gcs-filesystem==0.25.0 136 | - termcolor==1.1.0 137 | - tf-estimator-nightly==2.8.0.dev2021122109 138 | - tf-slim==1.1.0 139 | - threadpoolctl==3.1.0 140 | - toml==0.10.2 141 | - tomli==2.0.1 142 | - tqdm==4.63.0 143 | - tqdm-multiprocess==0.0.11 144 | - typepy==1.3.0 145 | - typing-extensions==4.1.1 146 | - ujson==5.2.0 147 | - urllib3==1.26.8 148 | - werkzeug==2.0.3 149 | - wrapt==1.14.0 150 | - xxhash==3.0.0 151 | - yarl==1.7.2 152 | - zipp==3.7.0 153 | - zstandard==0.15.2 -------------------------------------------------------------------------------- /catalog.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: backstage.io/v1alpha1 3 | kind: Component 4 | metadata: 5 | name: ml-containers 6 | annotations: 7 | backstage.io/techdocs-ref: dir:. 
8 | description: Optimized images for training/inference on CoreWeave infrastructure 9 | tags: 10 | - ml 11 | # links: 12 | # - title: Deployment Manifests 13 | # url: https://github.com/coreweave/awesome-turtles/tree/main/deploy 14 | # icon: github 15 | customer_impact: true 16 | stateless: false 17 | spec: 18 | type: service 19 | lifecycle: production 20 | owner: group:cw/team_ml -------------------------------------------------------------------------------- /cuda-ssh/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG BASE_IMAGE="ghcr.io/coreweave/ml-containers/torch:ceeb8c2-base-cuda11.8.0-torch2.0.1-vision0.15.2-audio2.0.2" 2 | 3 | FROM ${BASE_IMAGE} 4 | 5 | RUN apt-get -qq update && \ 6 | DEBIAN_FRONTEND=noninteractive \ 7 | apt-get -qq install --no-install-recommends -y \ 8 | # Critical packages: 9 | ssh ca-certificates tini bash \ 10 | # Helpful packages: 11 | libncurses5 curl wget sudo htop git rsync locales \ 12 | tmux unzip nano vim apt-utils iputils-ping && \ 13 | apt-get clean && \ 14 | # SSH passes the client's LANG and LC_* environment variables by default. 15 | # However, the only pre-installed locales on most container images are 16 | # C, C.UTF-8, and POSIX. This adds the en_US.UTF-8 locale as well, 17 | # and leaves locale-gen available to install others. 18 | locale-gen en_US.UTF-8 && \ 19 | # Wipe the server-side SSH keys on the container image level 20 | # to prevent leaking the private host keys, which could 21 | # potentially allow impersonation of the SSH server by an attacker. 
22 | rm /etc/ssh/ssh_host_* 23 | 24 | # Since there are no host keys, the SSH server 25 | # MUST be configured at runtime by running: 26 | # dpkg-reconfigure openssh-server 27 | # (Or by adding custom host key files to /etc/ssh/) before launching it with: 28 | # service ssh start 29 | # Or (blocking): 30 | # service ssh start -D 31 | 32 | RUN \ 33 | # Configure the privilege separation directory for sshd 34 | # See here for details: https://github.com/openssh/openssh-portable/blob/master/README.privsep 35 | install -d --mode=0755 --owner=0 --group=0 /var/run/sshd && \ 36 | # Configure an empty authorized keys file with correct permissions 37 | install -d --mode=0700 --owner=0 --group=0 /root/.ssh && \ 38 | install --mode=600 --owner=0 --group=0 /dev/null /root/.ssh/authorized_keys && \ 39 | # Allow only public key authentication 40 | install --mode=600 --owner=0 --group=0 /dev/null /etc/ssh/sshd_config.d/10-key-auth.conf && \ 41 | echo "PasswordAuthentication no" >> /etc/ssh/sshd_config.d/10-key-auth.conf && \ 42 | echo "PermitRootLogin without-password" >> /etc/ssh/sshd_config.d/10-key-auth.conf && \ 43 | # Prevent the user from being kicked off after login 44 | # See here for details: https://stackoverflow.com/questions/21391142 45 | sed -i -E -e \ 46 | 's:session(\s*)required(\s*)pam_loginuid\.so:session\1optional\2pam_loginuid.so:g' \ 47 | /etc/pam.d/sshd && \ 48 | # Fix sudo bug: https://github.com/sudo-project/sudo/issues/42 49 | echo 'Set disable_coredump false' >> /etc/sudo.conf 50 | 51 | RUN chsh -s /bin/bash root 52 | 53 | EXPOSE 22 54 | -------------------------------------------------------------------------------- /cw-mega-sam/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG BASE_IMAGE 2 | FROM $BASE_IMAGE 3 | 4 | RUN mkdir -p /work 5 | COPY ./cuda124.patch /work/cuda124.patch 6 | COPY ./requirements.txt /work/requirements.txt 7 | 8 | RUN pip install -r /work/requirements.txt 9 | RUN rm 
/work/requirements.txt 10 | 11 | ARG COMMIT 12 | RUN cd /work && git clone --recursive https://github.com/mega-sam/mega-sam && \ 13 | cd mega-sam && \ 14 | git checkout ${COMMIT} 15 | RUN cd /work/mega-sam && ls -la 16 | RUN cd /work/mega-sam && patch -p2 < /work/cuda124.patch 17 | 18 | 19 | # Start in /work: a shell-form ENTRYPOINT naming this directory cannot be executed and would also discard the CMD below. 20 | WORKDIR /work 21 | CMD echo "Hello! You should only need to run python setup.py install from the /work/mega-sam/base directory." 22 | 23 | 24 | -------------------------------------------------------------------------------- /cw-mega-sam/cuda124.patch: -------------------------------------------------------------------------------- 1 | diff -crB '--exclude=.git' ref/mega-sam/base/src/altcorr_kernel.cu mega-sam/base/src/altcorr_kernel.cu 2 | *** ref/mega-sam/base/src/altcorr_kernel.cu Mon Mar 10 18:15:59 2025 3 | --- mega-sam/base/src/altcorr_kernel.cu Mon Mar 10 17:10:59 2025 4 | *************** 5 | *** 304,310 **** 6 | const dim3 threads(BLOCK_H, BLOCK_W); 7 | 8 | 9 | ! AT_DISPATCH_FLOATING_TYPES_AND_HALF(fmap1.type(), "altcorr_forward_kernel", ([&] { 10 | altcorr_forward_kernel<<>>( 11 | fmap1.packed_accessor32(), 12 | fmap2.packed_accessor32(), 13 | --- 304,310 ---- 14 | const dim3 threads(BLOCK_H, BLOCK_W); 15 | 16 | 17 | ! AT_DISPATCH_FLOATING_TYPES_AND_HALF(fmap1.type().scalarType(), "altcorr_forward_kernel", ([&] { 18 | altcorr_forward_kernel<<>>( 19 | fmap1.packed_accessor32(), 20 | fmap2.packed_accessor32(), 21 | *************** 22 | *** 351,354 **** 23 | radius); 24 | 25 | return {fmap1_grad, fmap2_grad, coords_grad}; 26 | ! } 27 | \ No newline at end of file 28 | --- 351,354 ---- 29 | radius); 30 | 31 | return {fmap1_grad, fmap2_grad, coords_grad}; 32 | !
} 33 | diff -crB '--exclude=.git' ref/mega-sam/base/src/correlation_kernels.cu mega-sam/base/src/correlation_kernels.cu 34 | *** ref/mega-sam/base/src/correlation_kernels.cu Mon Mar 10 18:15:59 2025 35 | --- mega-sam/base/src/correlation_kernels.cu Mon Mar 10 17:16:42 2025 36 | *************** 37 | *** 141,147 **** 38 | torch::Tensor corr = torch::zeros( 39 | {batch_size, 2*radius+1, 2*radius+1, ht, wd}, opts); 40 | 41 | ! AT_DISPATCH_FLOATING_TYPES_AND_HALF(volume.type(), "sampler_forward_kernel", ([&] { 42 | corr_index_forward_kernel<<>>( 43 | volume.packed_accessor32(), 44 | coords.packed_accessor32(), 45 | --- 141,147 ---- 46 | torch::Tensor corr = torch::zeros( 47 | {batch_size, 2*radius+1, 2*radius+1, ht, wd}, opts); 48 | 49 | ! AT_DISPATCH_FLOATING_TYPES_AND_HALF(volume.type().scalarType(), "sampler_forward_kernel", ([&] { 50 | corr_index_forward_kernel<<>>( 51 | volume.packed_accessor32(), 52 | coords.packed_accessor32(), 53 | *************** 54 | *** 172,178 **** 55 | const dim3 threads(BLOCK, BLOCK); 56 | 57 | 58 | ! AT_DISPATCH_FLOATING_TYPES_AND_HALF(volume.type(), "sampler_backward_kernel", ([&] { 59 | corr_index_backward_kernel<<>>( 60 | coords.packed_accessor32(), 61 | corr_grad.packed_accessor32(), 62 | --- 172,178 ---- 63 | const dim3 threads(BLOCK, BLOCK); 64 | 65 | 66 | ! AT_DISPATCH_FLOATING_TYPES_AND_HALF(volume.type().scalarType(), "sampler_backward_kernel", ([&] { 67 | corr_index_backward_kernel<<>>( 68 | coords.packed_accessor32(), 69 | corr_grad.packed_accessor32(), 70 | *************** 71 | *** 181,184 **** 72 | })); 73 | 74 | return {volume_grad}; 75 | ! } 76 | \ No newline at end of file 77 | --- 181,184 ---- 78 | })); 79 | 80 | return {volume_grad}; 81 | ! 
} 82 | diff -crB '--exclude=.git' ref/mega-sam/base/thirdparty/lietorch/lietorch/src/lietorch_cpu.cpp mega-sam/base/thirdparty/lietorch/lietorch/src/lietorch_cpu.cpp 83 | *** ref/mega-sam/base/thirdparty/lietorch/lietorch/src/lietorch_cpu.cpp Mon Mar 10 18:16:06 2025 84 | --- mega-sam/base/thirdparty/lietorch/lietorch/src/lietorch_cpu.cpp Mon Mar 10 17:37:48 2025 85 | *************** 86 | *** 357,363 **** 87 | int batch_size = a.size(0); 88 | torch::Tensor X; 89 | 90 | ! DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, a.type(), "exp_forward_kernel", ([&] { 91 | X = torch::zeros({batch_size, group_t::N}, a.options()); 92 | exp_forward_kernel( 93 | a.data_ptr(), 94 | --- 357,363 ---- 95 | int batch_size = a.size(0); 96 | torch::Tensor X; 97 | 98 | ! DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, a.type().scalarType(), "exp_forward_kernel", ([&] { 99 | X = torch::zeros({batch_size, group_t::N}, a.options()); 100 | exp_forward_kernel( 101 | a.data_ptr(), 102 | *************** 103 | *** 372,378 **** 104 | int batch_size = a.size(0); 105 | torch::Tensor da = torch::zeros(a.sizes(), grad.options()); 106 | 107 | ! DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, a.type(), "exp_backward_kernel", ([&] { 108 | exp_backward_kernel( 109 | grad.data_ptr(), 110 | a.data_ptr(), 111 | --- 372,378 ---- 112 | int batch_size = a.size(0); 113 | torch::Tensor da = torch::zeros(a.sizes(), grad.options()); 114 | 115 | ! DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, a.type().scalarType(), "exp_backward_kernel", ([&] { 116 | exp_backward_kernel( 117 | grad.data_ptr(), 118 | a.data_ptr(), 119 | *************** 120 | *** 387,393 **** 121 | int batch_size = X.size(0); 122 | torch::Tensor a; 123 | 124 | ! DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type(), "log_forward_kernel", ([&] { 125 | a = torch::zeros({batch_size, group_t::K}, X.options()); 126 | log_forward_kernel( 127 | X.data_ptr(), 128 | --- 387,393 ---- 129 | int batch_size = X.size(0); 130 | torch::Tensor a; 131 | 132 | ! 
DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type().scalarType(), "log_forward_kernel", ([&] { 133 | a = torch::zeros({batch_size, group_t::K}, X.options()); 134 | log_forward_kernel( 135 | X.data_ptr(), 136 | *************** 137 | *** 402,408 **** 138 | int batch_size = X.size(0); 139 | torch::Tensor dX = torch::zeros(X.sizes(), grad.options()); 140 | 141 | ! DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type(), "log_backward_kernel", ([&] { 142 | log_backward_kernel( 143 | grad.data_ptr(), 144 | X.data_ptr(), 145 | --- 402,408 ---- 146 | int batch_size = X.size(0); 147 | torch::Tensor dX = torch::zeros(X.sizes(), grad.options()); 148 | 149 | ! DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type().scalarType(), "log_backward_kernel", ([&] { 150 | log_backward_kernel( 151 | grad.data_ptr(), 152 | X.data_ptr(), 153 | *************** 154 | *** 417,423 **** 155 | int batch_size = X.size(0); 156 | torch::Tensor Y = torch::zeros_like(X); 157 | 158 | ! DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type(), "inv_forward_kernel", ([&] { 159 | inv_forward_kernel( 160 | X.data_ptr(), 161 | Y.data_ptr(), 162 | --- 417,423 ---- 163 | int batch_size = X.size(0); 164 | torch::Tensor Y = torch::zeros_like(X); 165 | 166 | ! DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type().scalarType(), "inv_forward_kernel", ([&] { 167 | inv_forward_kernel( 168 | X.data_ptr(), 169 | Y.data_ptr(), 170 | *************** 171 | *** 431,437 **** 172 | int batch_size = X.size(0); 173 | torch::Tensor dX = torch::zeros(X.sizes(), grad.options()); 174 | 175 | ! DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type(), "inv_backward_kernel", ([&] { 176 | inv_backward_kernel( 177 | grad.data_ptr(), 178 | X.data_ptr(), 179 | --- 431,437 ---- 180 | int batch_size = X.size(0); 181 | torch::Tensor dX = torch::zeros(X.sizes(), grad.options()); 182 | 183 | ! 
DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type().scalarType(), "inv_backward_kernel", ([&] { 184 | inv_backward_kernel( 185 | grad.data_ptr(), 186 | X.data_ptr(), 187 | *************** 188 | *** 447,453 **** 189 | int batch_size = X.size(0); 190 | torch::Tensor Z = torch::zeros_like(X); 191 | 192 | ! DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type(), "mul_forward_kernel", ([&] { 193 | mul_forward_kernel( 194 | X.data_ptr(), 195 | Y.data_ptr(), 196 | --- 447,453 ---- 197 | int batch_size = X.size(0); 198 | torch::Tensor Z = torch::zeros_like(X); 199 | 200 | ! DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type().scalarType(), "mul_forward_kernel", ([&] { 201 | mul_forward_kernel( 202 | X.data_ptr(), 203 | Y.data_ptr(), 204 | *************** 205 | *** 463,469 **** 206 | torch::Tensor dX = torch::zeros(X.sizes(), grad.options()); 207 | torch::Tensor dY = torch::zeros(Y.sizes(), grad.options()); 208 | 209 | ! DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type(), "mul_backward_kernel", ([&] { 210 | mul_backward_kernel( 211 | grad.data_ptr(), 212 | X.data_ptr(), 213 | --- 463,469 ---- 214 | torch::Tensor dX = torch::zeros(X.sizes(), grad.options()); 215 | torch::Tensor dY = torch::zeros(Y.sizes(), grad.options()); 216 | 217 | ! DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type().scalarType(), "mul_backward_kernel", ([&] { 218 | mul_backward_kernel( 219 | grad.data_ptr(), 220 | X.data_ptr(), 221 | *************** 222 | *** 480,486 **** 223 | int batch_size = X.size(0); 224 | torch::Tensor b = torch::zeros(a.sizes(), a.options()); 225 | 226 | ! DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type(), "adj_forward_kernel", ([&] { 227 | adj_forward_kernel( 228 | X.data_ptr(), 229 | a.data_ptr(), 230 | --- 480,486 ---- 231 | int batch_size = X.size(0); 232 | torch::Tensor b = torch::zeros(a.sizes(), a.options()); 233 | 234 | ! 
DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type().scalarType(), "adj_forward_kernel", ([&] { 235 | adj_forward_kernel( 236 | X.data_ptr(), 237 | a.data_ptr(), 238 | *************** 239 | *** 496,502 **** 240 | torch::Tensor dX = torch::zeros(X.sizes(), grad.options()); 241 | torch::Tensor da = torch::zeros(a.sizes(), grad.options()); 242 | 243 | ! DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type(), "adj_backward_kernel", ([&] { 244 | adj_backward_kernel( 245 | grad.data_ptr(), 246 | X.data_ptr(), 247 | --- 496,502 ---- 248 | torch::Tensor dX = torch::zeros(X.sizes(), grad.options()); 249 | torch::Tensor da = torch::zeros(a.sizes(), grad.options()); 250 | 251 | ! DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type().scalarType(), "adj_backward_kernel", ([&] { 252 | adj_backward_kernel( 253 | grad.data_ptr(), 254 | X.data_ptr(), 255 | *************** 256 | *** 514,520 **** 257 | int batch_size = X.size(0); 258 | torch::Tensor b = torch::zeros(a.sizes(), a.options()); 259 | 260 | ! DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type(), "adjT_forward_kernel", ([&] { 261 | adjT_forward_kernel( 262 | X.data_ptr(), 263 | a.data_ptr(), 264 | --- 514,520 ---- 265 | int batch_size = X.size(0); 266 | torch::Tensor b = torch::zeros(a.sizes(), a.options()); 267 | 268 | ! DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type().scalarType(), "adjT_forward_kernel", ([&] { 269 | adjT_forward_kernel( 270 | X.data_ptr(), 271 | a.data_ptr(), 272 | *************** 273 | *** 530,536 **** 274 | torch::Tensor dX = torch::zeros(X.sizes(), grad.options()); 275 | torch::Tensor da = torch::zeros(a.sizes(), grad.options()); 276 | 277 | ! DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type(), "adjT_backward_kernel", ([&] { 278 | adjT_backward_kernel( 279 | grad.data_ptr(), 280 | X.data_ptr(), 281 | --- 530,536 ---- 282 | torch::Tensor dX = torch::zeros(X.sizes(), grad.options()); 283 | torch::Tensor da = torch::zeros(a.sizes(), grad.options()); 284 | 285 | ! 
DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type().scalarType(), "adjT_backward_kernel", ([&] { 286 | adjT_backward_kernel( 287 | grad.data_ptr(), 288 | X.data_ptr(), 289 | *************** 290 | *** 548,554 **** 291 | int batch_size = X.size(0); 292 | torch::Tensor q = torch::zeros(p.sizes(), p.options()); 293 | 294 | ! DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type(), "act_forward_kernel", ([&] { 295 | act_forward_kernel( 296 | X.data_ptr(), 297 | p.data_ptr(), 298 | --- 548,554 ---- 299 | int batch_size = X.size(0); 300 | torch::Tensor q = torch::zeros(p.sizes(), p.options()); 301 | 302 | ! DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type().scalarType(), "act_forward_kernel", ([&] { 303 | act_forward_kernel( 304 | X.data_ptr(), 305 | p.data_ptr(), 306 | *************** 307 | *** 564,570 **** 308 | torch::Tensor dX = torch::zeros(X.sizes(), grad.options()); 309 | torch::Tensor dp = torch::zeros(p.sizes(), grad.options()); 310 | 311 | ! DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type(), "act_backward_kernel", ([&] { 312 | act_backward_kernel( 313 | grad.data_ptr(), 314 | X.data_ptr(), 315 | --- 564,570 ---- 316 | torch::Tensor dX = torch::zeros(X.sizes(), grad.options()); 317 | torch::Tensor dp = torch::zeros(p.sizes(), grad.options()); 318 | 319 | ! DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type().scalarType(), "act_backward_kernel", ([&] { 320 | act_backward_kernel( 321 | grad.data_ptr(), 322 | X.data_ptr(), 323 | *************** 324 | *** 582,588 **** 325 | int batch_size = X.size(0); 326 | torch::Tensor q = torch::zeros(p.sizes(), p.options()); 327 | 328 | ! DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type(), "act4_forward_kernel", ([&] { 329 | act4_forward_kernel( 330 | X.data_ptr(), 331 | p.data_ptr(), 332 | --- 582,588 ---- 333 | int batch_size = X.size(0); 334 | torch::Tensor q = torch::zeros(p.sizes(), p.options()); 335 | 336 | ! 
DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type().scalarType(), "act4_forward_kernel", ([&] { 337 | act4_forward_kernel( 338 | X.data_ptr(), 339 | p.data_ptr(), 340 | *************** 341 | *** 598,604 **** 342 | torch::Tensor dX = torch::zeros(X.sizes(), grad.options()); 343 | torch::Tensor dp = torch::zeros(p.sizes(), grad.options()); 344 | 345 | ! DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type(), "act4_backward_kernel", ([&] { 346 | act4_backward_kernel( 347 | grad.data_ptr(), 348 | X.data_ptr(), 349 | --- 598,604 ---- 350 | torch::Tensor dX = torch::zeros(X.sizes(), grad.options()); 351 | torch::Tensor dp = torch::zeros(p.sizes(), grad.options()); 352 | 353 | ! DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type().scalarType(), "act4_backward_kernel", ([&] { 354 | act4_backward_kernel( 355 | grad.data_ptr(), 356 | X.data_ptr(), 357 | *************** 358 | *** 616,622 **** 359 | int batch_size = X.size(0); 360 | torch::Tensor T4x4 = torch::zeros({X.size(0), 4, 4}, X.options()); 361 | 362 | ! DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type(), "as_matrix_forward_kernel", ([&] { 363 | as_matrix_forward_kernel( 364 | X.data_ptr(), 365 | T4x4.data_ptr(), 366 | --- 616,622 ---- 367 | int batch_size = X.size(0); 368 | torch::Tensor T4x4 = torch::zeros({X.size(0), 4, 4}, X.options()); 369 | 370 | ! DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type().scalarType(), "as_matrix_forward_kernel", ([&] { 371 | as_matrix_forward_kernel( 372 | X.data_ptr(), 373 | T4x4.data_ptr(), 374 | *************** 375 | *** 631,637 **** 376 | int batch_size = X.size(0); 377 | torch::Tensor P; 378 | 379 | ! DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type(), "orthogonal_projector_kernel", ([&] { 380 | P = torch::zeros({X.size(0), group_t::N, group_t::N}, X.options()); 381 | orthogonal_projector_kernel(X.data_ptr(), P.data_ptr(), batch_size); 382 | })); 383 | --- 631,637 ---- 384 | int batch_size = X.size(0); 385 | torch::Tensor P; 386 | 387 | ! 
DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type().scalarType(), "orthogonal_projector_kernel", ([&] { 388 | P = torch::zeros({X.size(0), group_t::N, group_t::N}, X.options()); 389 | orthogonal_projector_kernel(X.data_ptr(), P.data_ptr(), batch_size); 390 | })); 391 | *************** 392 | *** 645,651 **** 393 | int batch_size = X.size(0); 394 | torch::Tensor b = torch::zeros(a.sizes(), a.options()); 395 | 396 | ! DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type(), "jleft_forward_kernel", ([&] { 397 | jleft_forward_kernel( 398 | X.data_ptr(), 399 | a.data_ptr(), 400 | --- 645,651 ---- 401 | int batch_size = X.size(0); 402 | torch::Tensor b = torch::zeros(a.sizes(), a.options()); 403 | 404 | ! DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type().scalarType(), "jleft_forward_kernel", ([&] { 405 | jleft_forward_kernel( 406 | X.data_ptr(), 407 | a.data_ptr(), 408 | *************** 409 | *** 654,657 **** 410 | })); 411 | 412 | return b; 413 | ! } 414 | \ No newline at end of file 415 | --- 654,657 ---- 416 | })); 417 | 418 | return b; 419 | ! } 420 | diff -crB '--exclude=.git' ref/mega-sam/base/thirdparty/lietorch/lietorch/src/lietorch_gpu.cu mega-sam/base/thirdparty/lietorch/lietorch/src/lietorch_gpu.cu 421 | *** ref/mega-sam/base/thirdparty/lietorch/lietorch/src/lietorch_gpu.cu Mon Mar 10 18:16:06 2025 422 | --- mega-sam/base/thirdparty/lietorch/lietorch/src/lietorch_gpu.cu Mon Mar 10 17:29:53 2025 423 | *************** 424 | *** 299,305 **** 425 | int batch_size = a.size(0); 426 | torch::Tensor X; 427 | 428 | ! DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, a.type(), "exp_forward_kernel", ([&] { 429 | X = torch::zeros({batch_size, group_t::N}, a.options()); 430 | exp_forward_kernel<<>>( 431 | a.data_ptr(), 432 | --- 299,305 ---- 433 | int batch_size = a.size(0); 434 | torch::Tensor X; 435 | 436 | ! 
DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, a.type().scalarType(), "exp_forward_kernel", ([&] { 437 | X = torch::zeros({batch_size, group_t::N}, a.options()); 438 | exp_forward_kernel<<>>( 439 | a.data_ptr(), 440 | *************** 441 | *** 314,320 **** 442 | int batch_size = a.size(0); 443 | torch::Tensor da = torch::zeros(a.sizes(), grad.options()); 444 | 445 | ! DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, a.type(), "exp_backward_kernel", ([&] { 446 | exp_backward_kernel<<>>( 447 | grad.data_ptr(), 448 | a.data_ptr(), 449 | --- 314,320 ---- 450 | int batch_size = a.size(0); 451 | torch::Tensor da = torch::zeros(a.sizes(), grad.options()); 452 | 453 | ! DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, a.type().scalarType(), "exp_backward_kernel", ([&] { 454 | exp_backward_kernel<<>>( 455 | grad.data_ptr(), 456 | a.data_ptr(), 457 | *************** 458 | *** 329,335 **** 459 | int batch_size = X.size(0); 460 | torch::Tensor a; 461 | 462 | ! DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type(), "log_forward_kernel", ([&] { 463 | a = torch::zeros({batch_size, group_t::K}, X.options()); 464 | log_forward_kernel<<>>( 465 | X.data_ptr(), 466 | --- 329,335 ---- 467 | int batch_size = X.size(0); 468 | torch::Tensor a; 469 | 470 | ! DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type().scalarType(), "log_forward_kernel", ([&] { 471 | a = torch::zeros({batch_size, group_t::K}, X.options()); 472 | log_forward_kernel<<>>( 473 | X.data_ptr(), 474 | *************** 475 | *** 344,350 **** 476 | int batch_size = X.size(0); 477 | torch::Tensor dX = torch::zeros(X.sizes(), grad.options()); 478 | 479 | ! DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type(), "log_backward_kernel", ([&] { 480 | log_backward_kernel<<>>( 481 | grad.data_ptr(), 482 | X.data_ptr(), 483 | --- 344,350 ---- 484 | int batch_size = X.size(0); 485 | torch::Tensor dX = torch::zeros(X.sizes(), grad.options()); 486 | 487 | ! 
DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type().scalarType(), "log_backward_kernel", ([&] { 488 | log_backward_kernel<<>>( 489 | grad.data_ptr(), 490 | X.data_ptr(), 491 | *************** 492 | *** 359,365 **** 493 | int batch_size = X.size(0); 494 | torch::Tensor Y = torch::zeros_like(X); 495 | 496 | ! DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type(), "inv_forward_kernel", ([&] { 497 | inv_forward_kernel<<>>( 498 | X.data_ptr(), 499 | Y.data_ptr(), 500 | --- 359,365 ---- 501 | int batch_size = X.size(0); 502 | torch::Tensor Y = torch::zeros_like(X); 503 | 504 | ! DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type().scalarType(), "inv_forward_kernel", ([&] { 505 | inv_forward_kernel<<>>( 506 | X.data_ptr(), 507 | Y.data_ptr(), 508 | *************** 509 | *** 373,379 **** 510 | int batch_size = X.size(0); 511 | torch::Tensor dX = torch::zeros(X.sizes(), grad.options()); 512 | 513 | ! DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type(), "inv_backward_kernel", ([&] { 514 | inv_backward_kernel<<>>( 515 | grad.data_ptr(), 516 | X.data_ptr(), 517 | --- 373,379 ---- 518 | int batch_size = X.size(0); 519 | torch::Tensor dX = torch::zeros(X.sizes(), grad.options()); 520 | 521 | ! DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type().scalarType(), "inv_backward_kernel", ([&] { 522 | inv_backward_kernel<<>>( 523 | grad.data_ptr(), 524 | X.data_ptr(), 525 | *************** 526 | *** 389,395 **** 527 | int batch_size = X.size(0); 528 | torch::Tensor Z = torch::zeros_like(X); 529 | 530 | ! DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type(), "mul_forward_kernel", ([&] { 531 | mul_forward_kernel<<>>( 532 | X.data_ptr(), 533 | Y.data_ptr(), 534 | --- 389,395 ---- 535 | int batch_size = X.size(0); 536 | torch::Tensor Z = torch::zeros_like(X); 537 | 538 | ! 
DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type().scalarType(), "mul_forward_kernel", ([&] { 539 | mul_forward_kernel<<>>( 540 | X.data_ptr(), 541 | Y.data_ptr(), 542 | *************** 543 | *** 405,411 **** 544 | torch::Tensor dX = torch::zeros(X.sizes(), grad.options()); 545 | torch::Tensor dY = torch::zeros(Y.sizes(), grad.options()); 546 | 547 | ! DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type(), "mul_backward_kernel", ([&] { 548 | mul_backward_kernel<<>>( 549 | grad.data_ptr(), 550 | X.data_ptr(), 551 | --- 405,411 ---- 552 | torch::Tensor dX = torch::zeros(X.sizes(), grad.options()); 553 | torch::Tensor dY = torch::zeros(Y.sizes(), grad.options()); 554 | 555 | ! DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type().scalarType(), "mul_backward_kernel", ([&] { 556 | mul_backward_kernel<<>>( 557 | grad.data_ptr(), 558 | X.data_ptr(), 559 | *************** 560 | *** 422,428 **** 561 | int batch_size = X.size(0); 562 | torch::Tensor b = torch::zeros(a.sizes(), a.options()); 563 | 564 | ! DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type(), "adj_forward_kernel", ([&] { 565 | adj_forward_kernel<<>>( 566 | X.data_ptr(), 567 | a.data_ptr(), 568 | --- 422,428 ---- 569 | int batch_size = X.size(0); 570 | torch::Tensor b = torch::zeros(a.sizes(), a.options()); 571 | 572 | ! DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type().scalarType(), "adj_forward_kernel", ([&] { 573 | adj_forward_kernel<<>>( 574 | X.data_ptr(), 575 | a.data_ptr(), 576 | *************** 577 | *** 438,444 **** 578 | torch::Tensor dX = torch::zeros(X.sizes(), grad.options()); 579 | torch::Tensor da = torch::zeros(a.sizes(), grad.options()); 580 | 581 | ! DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type(), "adj_backward_kernel", ([&] { 582 | adj_backward_kernel<<>>( 583 | grad.data_ptr(), 584 | X.data_ptr(), 585 | --- 438,444 ---- 586 | torch::Tensor dX = torch::zeros(X.sizes(), grad.options()); 587 | torch::Tensor da = torch::zeros(a.sizes(), grad.options()); 588 | 589 | ! 
DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type().scalarType(), "adj_backward_kernel", ([&] { 590 | adj_backward_kernel<<>>( 591 | grad.data_ptr(), 592 | X.data_ptr(), 593 | *************** 594 | *** 456,462 **** 595 | int batch_size = X.size(0); 596 | torch::Tensor b = torch::zeros(a.sizes(), a.options()); 597 | 598 | ! DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type(), "adjT_forward_kernel", ([&] { 599 | adjT_forward_kernel<<>>( 600 | X.data_ptr(), 601 | a.data_ptr(), 602 | --- 456,462 ---- 603 | int batch_size = X.size(0); 604 | torch::Tensor b = torch::zeros(a.sizes(), a.options()); 605 | 606 | ! DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type().scalarType(), "adjT_forward_kernel", ([&] { 607 | adjT_forward_kernel<<>>( 608 | X.data_ptr(), 609 | a.data_ptr(), 610 | *************** 611 | *** 472,478 **** 612 | torch::Tensor dX = torch::zeros(X.sizes(), grad.options()); 613 | torch::Tensor da = torch::zeros(a.sizes(), grad.options()); 614 | 615 | ! DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type(), "adjT_backward_kernel", ([&] { 616 | adjT_backward_kernel<<>>( 617 | grad.data_ptr(), 618 | X.data_ptr(), 619 | --- 472,478 ---- 620 | torch::Tensor dX = torch::zeros(X.sizes(), grad.options()); 621 | torch::Tensor da = torch::zeros(a.sizes(), grad.options()); 622 | 623 | ! DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type().scalarType(), "adjT_backward_kernel", ([&] { 624 | adjT_backward_kernel<<>>( 625 | grad.data_ptr(), 626 | X.data_ptr(), 627 | *************** 628 | *** 491,497 **** 629 | int batch_size = X.size(0); 630 | torch::Tensor q = torch::zeros(p.sizes(), p.options()); 631 | 632 | ! DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type(), "act_forward_kernel", ([&] { 633 | act_forward_kernel<<>>( 634 | X.data_ptr(), 635 | p.data_ptr(), 636 | --- 491,497 ---- 637 | int batch_size = X.size(0); 638 | torch::Tensor q = torch::zeros(p.sizes(), p.options()); 639 | 640 | ! 
DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type().scalarType(), "act_forward_kernel", ([&] { 641 | act_forward_kernel<<>>( 642 | X.data_ptr(), 643 | p.data_ptr(), 644 | *************** 645 | *** 507,513 **** 646 | torch::Tensor dX = torch::zeros(X.sizes(), grad.options()); 647 | torch::Tensor dp = torch::zeros(p.sizes(), grad.options()); 648 | 649 | ! DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type(), "act_backward_kernel", ([&] { 650 | act_backward_kernel<<>>( 651 | grad.data_ptr(), 652 | X.data_ptr(), 653 | --- 507,513 ---- 654 | torch::Tensor dX = torch::zeros(X.sizes(), grad.options()); 655 | torch::Tensor dp = torch::zeros(p.sizes(), grad.options()); 656 | 657 | ! DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type().scalarType(), "act_backward_kernel", ([&] { 658 | act_backward_kernel<<>>( 659 | grad.data_ptr(), 660 | X.data_ptr(), 661 | *************** 662 | *** 524,530 **** 663 | int batch_size = X.size(0); 664 | torch::Tensor q = torch::zeros(p.sizes(), p.options()); 665 | 666 | ! DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type(), "act4_forward_kernel", ([&] { 667 | act4_forward_kernel<<>>( 668 | X.data_ptr(), 669 | p.data_ptr(), 670 | --- 524,530 ---- 671 | int batch_size = X.size(0); 672 | torch::Tensor q = torch::zeros(p.sizes(), p.options()); 673 | 674 | ! DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type().scalarType(), "act4_forward_kernel", ([&] { 675 | act4_forward_kernel<<>>( 676 | X.data_ptr(), 677 | p.data_ptr(), 678 | *************** 679 | *** 540,546 **** 680 | torch::Tensor dX = torch::zeros(X.sizes(), grad.options()); 681 | torch::Tensor dp = torch::zeros(p.sizes(), grad.options()); 682 | 683 | ! DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type(), "act4_backward_kernel", ([&] { 684 | act4_backward_kernel<<>>( 685 | grad.data_ptr(), 686 | X.data_ptr(), 687 | --- 540,546 ---- 688 | torch::Tensor dX = torch::zeros(X.sizes(), grad.options()); 689 | torch::Tensor dp = torch::zeros(p.sizes(), grad.options()); 690 | 691 | ! 
DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type().scalarType(), "act4_backward_kernel", ([&] { 692 | act4_backward_kernel<<>>( 693 | grad.data_ptr(), 694 | X.data_ptr(), 695 | *************** 696 | *** 558,564 **** 697 | int batch_size = X.size(0); 698 | torch::Tensor T4x4 = torch::zeros({X.size(0), 4, 4}, X.options()); 699 | 700 | ! DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type(), "as_matrix_forward_kernel", ([&] { 701 | as_matrix_forward_kernel<<>>( 702 | X.data_ptr(), 703 | T4x4.data_ptr(), 704 | --- 558,564 ---- 705 | int batch_size = X.size(0); 706 | torch::Tensor T4x4 = torch::zeros({X.size(0), 4, 4}, X.options()); 707 | 708 | ! DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type().scalarType(), "as_matrix_forward_kernel", ([&] { 709 | as_matrix_forward_kernel<<>>( 710 | X.data_ptr(), 711 | T4x4.data_ptr(), 712 | *************** 713 | *** 573,579 **** 714 | int batch_size = X.size(0); 715 | torch::Tensor P; 716 | 717 | ! DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type(), "orthogonal_projector_kernel", ([&] { 718 | P = torch::zeros({X.size(0), group_t::N, group_t::N}, X.options()); 719 | orthogonal_projector_kernel<<>>( 720 | X.data_ptr(), 721 | --- 573,579 ---- 722 | int batch_size = X.size(0); 723 | torch::Tensor P; 724 | 725 | ! DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type().scalarType(), "orthogonal_projector_kernel", ([&] { 726 | P = torch::zeros({X.size(0), group_t::N, group_t::N}, X.options()); 727 | orthogonal_projector_kernel<<>>( 728 | X.data_ptr(), 729 | *************** 730 | *** 589,595 **** 731 | int batch_size = X.size(0); 732 | torch::Tensor b = torch::zeros(a.sizes(), a.options()); 733 | 734 | ! DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type(), "jleft_forward_kernel", ([&] { 735 | jleft_forward_kernel<<>>( 736 | X.data_ptr(), 737 | a.data_ptr(), 738 | --- 589,595 ---- 739 | int batch_size = X.size(0); 740 | torch::Tensor b = torch::zeros(a.sizes(), a.options()); 741 | 742 | ! 
DISPATCH_GROUP_AND_FLOATING_TYPES(group_id, X.type().scalarType(), "jleft_forward_kernel", ([&] { 743 | jleft_forward_kernel<<>>( 744 | X.data_ptr(), 745 | a.data_ptr(), 746 | -------------------------------------------------------------------------------- /cw-mega-sam/requirements.txt: -------------------------------------------------------------------------------- 1 | annotated-types 2 | archspec 3 | boltons 4 | Brotli 5 | certifi 6 | cffi 7 | charset-normalizer 8 | cryptography 9 | distro 10 | filelock==3.17.0 11 | frozendict 12 | fsspec==2025.3.0 13 | idna 14 | Jinja2==3.1.6 15 | jsonpatch 16 | jsonpointer==2.1 17 | markdown-it-py 18 | MarkupSafe==3.0.2 19 | mdurl 20 | mpmath==1.3.0 21 | networkx==3.4.2 22 | numpy==2.2.3 23 | nvidia-cublas-cu12==12.4.5.8 24 | nvidia-cuda-cupti-cu12==12.4.127 25 | nvidia-cuda-nvrtc-cu12==12.4.127 26 | nvidia-cuda-nvrtc-cu12==12.4.127 27 | nvidia-cuda-runtime-cu12==12.4.127 28 | nvidia-cudnn-cu12 29 | nvidia-cufft-cu12==11.2.1.3 30 | nvidia-curand-cu12==10.3.5.147 31 | nvidia-cusolver-cu12==11.6.1.9 32 | nvidia-cusparse-cu12==12.3.1.170 33 | nvidia-cusparselt-cu12==0.6.2 34 | nvidia-nccl-cu12==2.20.5 35 | nvidia-nvjitlink-cu12==12.4.127 36 | nvidia-nvtx-cu12==12.4.127 37 | packaging 38 | pillow==11.1.0 39 | platformdirs 40 | pluggy 41 | pycosat 42 | pycparser 43 | pydantic 44 | pydantic_core 45 | Pygments 46 | PySocks 47 | requests 48 | rich 49 | ruamel.yaml 50 | ruamel.yaml.clib 51 | setuptools==75.8.0 52 | sympy==1.13.1 53 | torch==2.6.0 54 | torchaudio==2.6.0 55 | torchvision==0.21.0 56 | tqdm 57 | triton==3.2.0 58 | truststore 59 | typing_extensions 60 | urllib3 61 | wheel==0.45.1 62 | zstandard 63 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # ml-containers 2 | 3 | Repository for building ML images at CoreWeave 4 | 5 | 6 | ## Index 7 | 8 | See the [list of all published 
images](https://github.com/orgs/coreweave/packages?repo_name=ml-containers). 9 | 10 | Special PyTorch Images: 11 | 12 | - [PyTorch Base Images](#pytorch-base-images) 13 | - [PyTorch Extras](#pytorch-extras) 14 | - [PyTorch Nightly](#pytorch-nightly) 15 | 16 | ### PyTorch Base Images 17 | 18 | - [`ghcr.io/coreweave/ml-containers/torch`](https://github.com/coreweave/ml-containers/pkgs/container/ml-containers%2Ftorch) 19 | 20 | CoreWeave provides custom builds of 21 | [PyTorch](https://github.com/pytorch/pytorch), 22 | [`torchvision`](https://github.com/pytorch/vision) 23 | and [`torchaudio`](https://github.com/pytorch/audio) 24 | tuned for our platform in a single container image, [`ml-containers/torch`](https://github.com/coreweave/ml-containers/pkgs/container/ml-containers%2Ftorch). 25 | 26 | Versions compiled against CUDA 11.8.0, 12.0.1, 12.1.1, and 12.2.2 are available in this repository, with two variants: 27 | 28 | 1. `base`: Tagged as `ml-containers/torch:a1b2c3d-base-...`. 29 | 1. Built from [`nvidia/cuda:...-base-ubuntu22.04`](https://hub.docker.com/r/nvidia/cuda/tags?name=base-ubuntu22.04) as a base. 30 | 2. Only includes essentials (CUDA, `torch`, `torchvision`, `torchaudio`), 31 | so it has a small image size, making it fast to launch. 32 | 2. `nccl`: Tagged as `ml-containers/torch:a1b2c3d-nccl-...`. 33 | 1. Built from [`ghcr.io/coreweave/nccl-tests`](https://github.com/coreweave/nccl-tests/pkgs/container/nccl-tests) as a base. 34 | 2. Ultimately inherits from [`nvidia/cuda:...-cudnn8-devel-ubuntu22.04`](https://hub.docker.com/r/nvidia/cuda/tags?name=cudnn8-devel-ubuntu22.04). 35 | 3. Larger, but includes development libraries and build tools such as `nvcc` necessary for compiling other PyTorch extensions. 36 | 4. These PyTorch builds are built on component libraries optimized for the CoreWeave cloud—see 37 | [`coreweave/nccl-tests`](https://github.com/coreweave/nccl-tests/blob/master/README.md). 
38 | 39 | > [!NOTE] 40 | > Most `torch` images have both a variant built on Ubuntu 22.04 and a variant built on Ubuntu 20.04. 41 | > - CUDA 11.8.0 is an exception, and is only available on Ubuntu 20.04. 42 | > - Ubuntu 22.04 images use Python 3.10. 43 | > - Ubuntu 20.04 images use Python 3.8. 44 | > - The base distribution is indicated in the container image tag. 45 | 46 | ### PyTorch Extras 47 | 48 | - [`ghcr.io/coreweave/ml-containers/torch-extras`](https://github.com/coreweave/ml-containers/pkgs/container/ml-containers%2Ftorch-extras) 49 | 50 | [`ml-containers/torch-extras`](https://github.com/coreweave/ml-containers/pkgs/container/ml-containers%2Ftorch-extras) 51 | extends the [`ml-containers/torch`](https://github.com/coreweave/ml-containers/pkgs/container/ml-containers%2Ftorch) 52 | images with a set of common PyTorch extensions: 53 | 54 | 1. [DeepSpeed](https://github.com/microsoft/DeepSpeed) 55 | 2. [FlashAttention](https://github.com/Dao-AILab/flash-attention) 56 | 3. [NVIDIA Apex](https://github.com/NVIDIA/apex) 57 | 58 | Each one is compiled specially against the custom PyTorch builds in [`ml-containers/torch`](https://github.com/coreweave/ml-containers/pkgs/container/ml-containers%2Ftorch). 59 | 60 | Both `base` and `nccl` editions are available for 61 | [`ml-containers/torch-extras`](https://github.com/coreweave/ml-containers/pkgs/container/ml-containers%2Ftorch-extras) 62 | matching those for 63 | [`ml-containers/torch`](https://github.com/coreweave/ml-containers/pkgs/container/ml-containers%2Ftorch). 64 | The `base` edition retains a small size, as a multi-stage build is used to avoid including 65 | CUDA development libraries in it, despite those libraries being required to build 66 | the extensions themselves. 
67 | 68 | ### PyTorch Nightly 69 | 70 | - [`ghcr.io/coreweave/ml-containers/nightly-torch`](https://github.com/coreweave/ml-containers/pkgs/container/ml-containers%2Fnightly-torch) 71 | - [`ghcr.io/coreweave/ml-containers/nightly-torch-extras`](https://github.com/coreweave/ml-containers/pkgs/container/ml-containers%2Fnightly-torch-extras) 72 | 73 | [`ml-containers/nightly-torch`](https://github.com/coreweave/ml-containers/pkgs/container/ml-containers%2Fnightly-torch) 74 | is an experimental, nightly release channel of the 75 | [PyTorch Base Images](#pytorch-base-images) in the style of PyTorch's 76 | own nightly preview builds, featuring the latest development versions of 77 | `torch`, `torchvision`, and `torchaudio` pulled daily from GitHub 78 | and compiled from source. 79 | 80 | [`ml-containers/nightly-torch-extras`](https://github.com/coreweave/ml-containers/pkgs/container/ml-containers%2Fnightly-torch-extras) 81 | is a version of [PyTorch Extras](#pytorch-extras) built on top of the 82 | [`ml-containers/nightly-torch`](https://github.com/coreweave/ml-containers/pkgs/container/ml-containers%2Fnightly-torch) 83 | container images. 84 | These are not nightly versions of the extensions themselves, but rather match 85 | the extension versions in the regular [PyTorch Extras](#pytorch-extras) containers. 86 | 87 | > ⚠ The *PyTorch Nightly* containers are based on unstable, experimental preview 88 | > builds of PyTorch, and should be expected to contain bugs and other issues. 89 | > For more stable containers, use the [PyTorch Base Images](#pytorch-base-images) 90 | > and [PyTorch Extras](#pytorch-extras) containers. 91 | 92 | 93 | ## Organization 94 | This repository contains multiple container image Dockerfiles, each of which is expected 95 | to be within its own folder along with any other needed files for the build.
96 | 97 | 98 | ## CI Builds (Actions) 99 | The current CI builds are set up to run when changes to files in the respective 100 | folders are detected so that only the changed container images are built. The 101 | actions are set up with an action per image utilizing a reusable base action 102 | [build.yml](.github/workflows/build.yml). The reusable action accepts several inputs: 103 | 104 | - `folder` - the folder containing the Dockerfile for the image 105 | - `image-name` - the name to use for the image 106 | - `build-args` - arguments to pass to the docker build 107 | 108 | Images built using the same source can share one action, since the main reason for 109 | the multiple actions is to handle only building the changed images. A build 110 | matrix can be helpful for these cases; see 111 | https://docs.github.com/en/actions/using-jobs/using-a-matrix-for-your-jobs. 112 | -------------------------------------------------------------------------------- /gpt-neox-determined/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM coreweave/nccl-tests:2022-09-28_16-34-19.392_EDT 2 | 3 | ENV DET_PYTHON_EXECUTABLE="/usr/bin/python3.8" 4 | ENV DET_SKIP_PIP_INSTALL="SKIP" 5 | 6 | # Run updates and install packages for build 7 | RUN echo "Dpkg::Options { "--force-confdef"; "--force-confnew"; };" > /etc/apt/apt.conf.d/local 8 | RUN apt-get -qq update && \ 9 | apt-get -qq install -y --no-install-recommends software-properties-common && \ 10 | add-apt-repository ppa:deadsnakes/ppa -y && \ 11 | add-apt-repository universe && \ 12 | apt-get -qq update && \ 13 | DEBIAN_FRONTEND=noninteractive apt-get install -y curl tzdata build-essential daemontools && \ 14 | apt-get install -y --no-install-recommends \ 15 | python3.8 \ 16 | python3.8-distutils \ 17 | python3.8-dev \ 18 | python3.8-venv \ 19 | git && \ 20 | apt-get clean 21 | 22 | # python3.8 -m ensurepip --default-pip && \ 23 | RUN curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py
24 | RUN python3.8 get-pip.py 25 | RUN python3.8 -m pip install --no-cache-dir --upgrade pip 26 | 27 | ARG PYTORCH_VERSION=1.12.1 28 | ARG TORCHVISION_VERSION=0.13.1 29 | ARG TORCHAUDIO_VERSION=0.12.1 30 | ARG TORCH_CUDA=116 31 | ARG TORCH_INDEX=whl 32 | # Install the CUDA-specific torch/torchvision/torchaudio wheels from the PyTorch package index 33 | RUN python3.8 -m pip install --no-cache-dir torch==${PYTORCH_VERSION}+cu${TORCH_CUDA} \ 34 | torchvision==${TORCHVISION_VERSION}+cu${TORCH_CUDA} \ 35 | torchaudio==${TORCHAUDIO_VERSION}+cu${TORCH_CUDA} \ 36 | --extra-index-url https://download.pytorch.org/${TORCH_INDEX}/cu${TORCH_CUDA} 37 | 38 | RUN python3.8 -m pip install --no-cache-dir packaging 39 | 40 | RUN mkdir -p /tmp/build && \ 41 | cd /tmp/build && \ 42 | git clone https://github.com/NVIDIA/apex && \ 43 | cd apex && \ 44 | python3.8 -m pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./ && \ 45 | cd /tmp && \ 46 | rm -r /tmp/build 47 | 48 | #### Python packages 49 | RUN python3.8 -m pip install --no-cache-dir determined==0.19.2 50 | 51 | #### Clone GPT-Neox for requirements 52 | RUN git clone https://github.com/EleutherAI/gpt-neox.git && cd gpt-neox && \ 53 | python3.8 -m pip install --no-cache-dir -r requirements/requirements.txt && \ 54 | python3.8 -m pip install --no-cache-dir -r requirements/requirements-onebitadam.txt && \ 55 | python3.8 -m pip install --no-cache-dir -r requirements/requirements-sparseattention.txt 56 | 57 | RUN python3.8 -m pip install --no-cache-dir pybind11 58 | RUN python3.8 -m pip install --no-cache-dir protobuf==3.19.4 59 | RUN update-alternatives --install /usr/bin/python3 python /usr/bin/python3.8 2 60 | RUN echo 2 | update-alternatives --config python 61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /gpt-neox-mpi/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ghcr.io/coreweave/nccl-tests:11.7.1-devel-ubuntu20.04-nccl2.14.3-1-a0cb1a6 2 | 3 | ENV
DEBIAN_FRONTEND=noninteractive 4 | 5 | #### System package (uses default Python 3 version in Ubuntu 20.04) 6 | RUN apt-get update -y && \ 7 | apt-get install -y \ 8 | git python3 python3-dev libpython3-dev python3-pip pdsh && \ 9 | update-alternatives --install /usr/bin/python python /usr/bin/python3 1 && \ 10 | update-alternatives --install /usr/bin/pip pip /usr/bin/pip3 1 && \ 11 | pip install --upgrade pip && \ 12 | pip install gpustat 13 | 14 | #### Python packages (version specifiers are quoted so the shell does not treat '>' as a redirect or '*' as a glob) 15 | RUN pip install torch==1.13.1+cu117 -f https://download.pytorch.org/whl/torch_stable.html && \ 16 | pip install 'packaging>=14.0' && pip cache purge 17 | 18 | ## Install APEX 19 | ARG APEX_COMMIT=537424d24d55e3a166c930828e4780549edc6151 20 | RUN pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" \ 21 | --global-option="--cuda_ext" git+https://github.com/NVIDIA/apex.git@${APEX_COMMIT} 22 | 23 | # Get the gpt-neox source code 24 | WORKDIR / 25 | RUN git clone https://github.com/EleutherAI/gpt-neox.git 26 | 27 | # Use the-eye.eu instead of the dead mystic.the-eye.eu mirror for dataset links 28 | RUN sed -i 's/mystic.the-eye/the-eye/g' /gpt-neox/tools/corpora.py 29 | 30 | RUN pip install -r /gpt-neox/requirements/requirements.txt && \ 31 | pip install -r /gpt-neox/requirements/requirements-onebitadam.txt && \ 32 | pip install -r /gpt-neox/requirements/requirements-sparseattention.txt && \ 33 | pip install 'protobuf==3.20.*' && \ 34 | pip install git+https://github.com/EleutherAI/best-download.git && \ 35 | pip cache purge 36 | 37 | RUN python /gpt-neox/megatron/fused_kernels/setup.py install 38 | 39 | # Clear staging 40 | RUN mkdir -p /tmp && chmod 0777 /tmp 41 | 42 | WORKDIR /gpt-neox 43 | -------------------------------------------------------------------------------- /hf-llm-inference/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM
ghcr.io/coreweave/ml-containers/torch:afecfe9-base-cuda12.0.1-torch2.0.0-vision0.15.1 2 | ENV DEBIAN_FRONTEND=noninteractive 3 | 4 | RUN apt-get -qq update && \ 5 | apt-get -qq install --no-install-recommends -y git curl && \ 6 | apt-get clean 7 | 8 | RUN mkdir /app 9 | WORKDIR /app 10 | 11 | ARG COMMIT=cfd8b249a6bac47e0b3dab6fa2be781965a69025 12 | RUN git clone --filter=blob:none https://github.com/coreweave/kubernetes-cloud && \ 13 | cd kubernetes-cloud && \ 14 | git checkout ${COMMIT} && \ 15 | cd .. && \ 16 | cp kubernetes-cloud/online-inference/hf-llm/service/* . && \ 17 | cp kubernetes-cloud/online-inference/hf-llm/serializer/serialize.py . && \ 18 | rm -rf kubernetes-cloud 19 | 20 | RUN pip3 install --no-cache-dir --upgrade pip && \ 21 | pip3 install --no-cache-dir -r requirements.txt 22 | -------------------------------------------------------------------------------- /megatron/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG BASE_IMAGE 2 | FROM $BASE_IMAGE 3 | 4 | COPY requirements.txt /tmp/requirements.txt 5 | 6 | RUN python3 -m pip install -U --no-cache-dir \ 7 | -r /tmp/requirements.txt \ 8 | && rm /tmp/requirements.txt 9 | 10 | ARG COMMIT 11 | RUN git clone https://github.com/NVIDIA/megatron-lm && \ 12 | cd megatron-lm && \ 13 | git checkout ${COMMIT} && \ 14 | rm -rf .git 15 | -------------------------------------------------------------------------------- /megatron/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.23.4 2 | pybind11==2.13.6 3 | pyyaml==6.0.2 4 | regex==2024.9.11 5 | tensorboard==2.18.0 6 | tensorboard-data-server==0.7.2 7 | transformers==4.45.2 8 | triton==3.0.0 9 | wandb==0.18.3 10 | sentencepiece==0.2.0 -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: ml-containers 2 | plugins: 3 | 
- techdocs-core
markdown_extensions:
  pymdownx.extra:
  pymdownx.superfences:
    custom_fences:
      - name: mermaid
        class: mermaid
        format: !!python/name:pymdownx.superfences.fence_code_format

--------------------------------------------------------------------------------
/sd-finetuner/Dockerfile:
--------------------------------------------------------------------------------
# Image that runs the Stable Diffusion finetuner from coreweave/kubernetes-cloud.
FROM gooseai/torch-base:6cfdc11

# CUDA 11.3 development packages needed to compile extensions at install time.
# NOTE(review): this installs without a preceding `apt-get update`, so it
# relies on the package lists already present in the base image - confirm
# this is intentional before changing it.
RUN apt-get install -y cuda-nvcc-11-3 cuda-nvml-dev-11-3 libcurand-dev-11-3 \
      libcublas-dev-11-3 libcusparse-dev-11-3 \
      libcusolver-dev-11-3 cuda-nvprof-11-3 \
      ninja-build git && \
    apt-get clean

RUN mkdir /app
WORKDIR /app

# Git ref of coreweave/kubernetes-cloud to take the finetuner sources from.
ARG COMMIT=master
RUN git clone https://github.com/coreweave/kubernetes-cloud.git && \
    cd kubernetes-cloud && \
    git checkout ${COMMIT} && \
    cd ..
RUN cp kubernetes-cloud/sd-finetuner-workflow/sd-finetuner/* .
RUN pip3 install --no-cache-dir -r requirements.txt

CMD [ "/usr/bin/python3", "finetuner.py" ]

--------------------------------------------------------------------------------
/sd-inference/Dockerfile:
--------------------------------------------------------------------------------
# Image that serves Stable Diffusion inference from coreweave/kubernetes-cloud.
FROM ghcr.io/coreweave/ml-containers/torch:afecfe9-base-cuda12.0.1-torch2.0.0-vision0.15.1
ENV DEBIAN_FRONTEND=noninteractive

# apt-get is used instead of apt because apt's command-line interface is not
# stable for scripted use; one index refresh is enough for both the upgrade
# and the install.
RUN apt-get update && apt-get upgrade -y && \
    apt-get install -y python3 python3-pip git curl && \
    apt-get clean

RUN mkdir /app
WORKDIR /app

# Git ref of coreweave/kubernetes-cloud to take the service sources from.
ARG COMMIT=master
RUN git clone https://github.com/coreweave/kubernetes-cloud && \
    cd kubernetes-cloud && \
    git checkout ${COMMIT} && \
    cd .. && \
    cp kubernetes-cloud/online-inference/stable-diffusion/service/* . && \
    cp kubernetes-cloud/online-inference/stable-diffusion/serializer/serialize.py . && \
    rm -rf kubernetes-cloud

RUN pip3 install --no-cache-dir --upgrade pip && \
    pip3 install --no-cache-dir -r requirements.txt

--------------------------------------------------------------------------------
/sglang/Dockerfile:
--------------------------------------------------------------------------------
# syntax=docker/dockerfile:1.2
ARG BASE_IMAGE
ARG BUILDER_IMAGE="${BASE_IMAGE}"

# Builder stage: compiles all wheels into /wheels using build.bash.
FROM ${BUILDER_IMAGE} AS builder

ARG BUILD_TORCH_CUDA_ARCH_LIST='8.0 8.6 8.9 9.0 10.0+PTX'

# Pinned commits; build.bash reads these from the RUN environment.
ARG FLASHINFER_COMMIT='c04755e21f4d6fb7813c703f2b00a7ef012be9b8'
ARG CUTLASS_COMMIT='b78588d1630aa6643bf021613717bafb705df4ef'
ARG VLLM_COMMIT='5095e966069b9e65b7c4c63427e06cebacaad0a0'
ARG SGLANG_COMMIT='4b6f62e2bc52a528551e9a21e7b0a4945c6115bb'
ARG DECORD_COMMIT='d2e56190286ae394032a8141885f76d5372bd44b'
# Building Triton is not currently enabled,
# but this is the commit that would be used if it were
ARG TRITON_COMMIT='1e0e51c4aeb3e1beea000da5d0e494f8b9ac40dd'

WORKDIR /build
COPY build.bash /build/
RUN mkdir /wheels && \
    bash build.bash -a "${BUILD_TORCH_CUDA_ARCH_LIST}" && \
    rm -rf /build/*
COPY install.bash /wheels/

# Final stage: installs the prebuilt wheels from the builder via a bind mount,
# so the build toolchain never lands in the final image.
FROM ${BASE_IMAGE}
RUN --mount=type=bind,from=builder,source=/wheels,target=/wheels \
    cd /wheels && \
    bash install.bash

--------------------------------------------------------------------------------
/sglang/build.bash:
--------------------------------------------------------------------------------
#!/bin/bash
# Builds wheels for flashinfer, vLLM, sglang (and optionally triton, plus
# xgrammar and decord on non-x86_64 hosts) into /wheels, with build logs
# under /wheels/logs.
#
# Options:
#   -a <list>  value for TORCH_CUDA_ARCH_LIST (default: '9.0 10.0+PTX')
#   -f         comment out compute_7x/8x gencode entries in sgl-kernel's setup.py
#   -t         also build triton
#
# Required environment: FLASHINFER_COMMIT, CUTLASS_COMMIT, VLLM_COMMIT,
# SGLANG_COMMIT; TRITON_COMMIT with -t; DECORD_COMMIT on non-x86_64 hosts.
set -xeo pipefail
export DEBIAN_FRONTEND=noninteractive

TORCH_CUDA_ARCH_LIST=''
FILTER_ARCHES=''
BUILD_TRITON=''

while getopts 'a:ft' OPT; do
  case "${OPT}" in
    a) TORCH_CUDA_ARCH_LIST="${OPTARG}" ;;
    f) FILTER_ARCHES='1' ;;
    t) BUILD_TRITON='1' ;;
    *) exit 92 ;;
  esac
done

# Fail fast: validate every pinned commit before starting any long build,
# instead of discovering a missing one hours in.
: "${FLASHINFER_COMMIT:?}" "${CUTLASS_COMMIT:?}" "${VLLM_COMMIT:?}" "${SGLANG_COMMIT:?}"
if [ "${BUILD_TRITON}" = 1 ]; then : "${TRITON_COMMIT:?}"; fi
if [ "$(uname -m)" != 'x86_64' ]; then : "${DECORD_COMMIT:?}"; fi

export NVCC_APPEND_FLAGS='-gencode=arch=compute_100,code=[sm_100,compute_100] -gencode=arch=compute_100a,code=sm_100a --diag-suppress 174'
export TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST:-9.0 10.0+PTX}"

mkdir -p /wheels/logs

# Build a wheel from directory $1 (default: .) into /wheels, without isolation.
_BUILD() { python3 -m build -w -n -v -o /wheels "${1:-.}"; }
# Append stdin to /wheels/logs/$1 while echoing it.
_LOG() { tee -a "/wheels/logs/${1:?}"; }
# Pin torch/torchvision/torchaudio to the versions already installed so that
# pip never replaces them while resolving build dependencies.
_CONSTRAINTS="$(python3 -m pip list | sed -En 's@^(torch(vision|audio)?)\s+(\S+)$@\1==\3@p')"
_PIP_INSTALL() {
  python3 -m pip install --no-cache-dir \
    --constraint=/dev/stdin <<< "${_CONSTRAINTS}" \
    "$@"
}
# Check out commit $1 in the current repository and re-sync its submodules.
# `git clone --recursive` initialises submodules at the default branch's pins;
# without the re-sync they would not match the pinned commit.
_CHECKOUT() {
  git checkout "${1:?}"
  git submodule update --init --recursive
}

_PIP_INSTALL -U pip setuptools wheel build pybind11 ninja cmake

# triton (not compatible with torch 2.6)
if [ "${BUILD_TRITON}" = 1 ]; then (
  : "${TRITON_COMMIT:?}"
  echo 'Building triton-lang/triton'
  git clone --recursive --filter=blob:none https://github.com/triton-lang/triton
  cd triton
  _CHECKOUT "${TRITON_COMMIT}"
  _BUILD python |& _LOG triton.log
); fi

# flashinfer
: "${FLASHINFER_COMMIT:?}"
: "${CUTLASS_COMMIT:?}"
(
  echo 'Building flashinfer-ai/flashinfer'
  git clone --recursive --filter=blob:none https://github.com/flashinfer-ai/flashinfer
  cd flashinfer
  _CHECKOUT "${FLASHINFER_COMMIT}"
  sed -i 's/name = "flashinfer-python"/name = "flashinfer"/' pyproject.toml
  # Deliberately override the cutlass submodule with the shared pin.
  git -C 3rdparty/cutlass checkout "${CUTLASS_COMMIT}"
  _PIP_INSTALL -U optree
  NVCC_APPEND_FLAGS="${NVCC_APPEND_FLAGS:+$NVCC_APPEND_FLAGS } --diag-suppress 20281,174" \
    FLASHINFER_ENABLE_AOT=1 _BUILD . |& _LOG flashinfer.log
)

# Setup cutlass repo for vLLM to use
git clone --recursive --filter=blob:none https://github.com/NVIDIA/cutlass
git -C cutlass checkout "${CUTLASS_COMMIT}"

# vLLM
: "${VLLM_COMMIT:?}"
(
  echo 'Building vllm-project/vllm'
  export VLLM_CUTLASS_SRC_DIR="${PWD}/cutlass"
  test -d "${VLLM_CUTLASS_SRC_DIR}"
  git clone --recursive --filter=blob:none https://github.com/vllm-project/vllm
  cd vllm
  _CHECKOUT "${VLLM_COMMIT}"
  # For lsmod
  # (separate commands: a failure inside an `a && b` list would not trip set -e)
  apt-get -qq update
  apt-get -qq install --no-install-recommends -y kmod
  python3 use_existing_torch.py
  _PIP_INSTALL -r requirements-build.txt
  USE_CUDNN=1 USE_CUSPARSELT=1 _BUILD . |& _LOG vllm.log
)

# sglang
: "${SGLANG_COMMIT:?}"
(
  echo 'Building sglang'
  git clone --recursive --filter=blob:none https://github.com/sgl-project/sglang
  cd sglang
  _CHECKOUT "${SGLANG_COMMIT}"
  (
    cd sgl-kernel
    git -C 3rdparty/cutlass checkout "${CUTLASS_COMMIT}"
    git -C 3rdparty/flashinfer/3rdparty/cutlass checkout "${CUTLASS_COMMIT}"

    ARCH_TRIPLE="$(gcc -print-multiarch)"
    LIB_DIR="/usr/lib/${ARCH_TRIPLE:?}"
    test -d "${LIB_DIR:?}"
    PYTHON_API_VER="$(
      python3 --version | sed -En 's@Python ([0-9])\.([0-9]+)\..*@cp\1\2@p'
    )"
    ARCH_FILTER=()
    if [ "${FILTER_ARCHES}" = 1 ]; then
      ARCH_FILTER=(-e 's@"-gencode=arch=compute_[78][0-9],code=sm_[78][0-9]",@#\0@')
    fi

    # Patch setup.py: optional arch filter, host library directory,
    # neutralise the manylinux2014_x86_64 line, and match the limited-API
    # tag to the running interpreter.
    sed -Ei \
      "${ARCH_FILTER[@]}" \
      -e 's@/usr/lib/x86_64-linux-gnu@'"${LIB_DIR}"'@' \
      -e 's@(\s+)(\w.+manylinux2014_x86_64.+)@\1pass # \2@' \
      -e 's@\{"py_limited_api": "cp39"}@{"py_limited_api": "'"${PYTHON_API_VER:-cp310}"'"}@' \
      setup.py
    SGL_KERNEL_ENABLE_BF16=1 SGL_KERNEL_ENABLE_FP8=1 SGL_KERNEL_ENABLE_SM90A=1 \
      _BUILD . |& _LOG sglang.log
  )
  _BUILD python |& _LOG sglang.log
)

# decord and xgrammar aren't available on PyPI for ARM64

if [ "$(uname -m)" != 'x86_64' ]; then
  # xgrammar (for sglang)
  (
    git clone --recursive --filter=blob:none -b v0.1.11 https://github.com/mlc-ai/xgrammar
    cd xgrammar
    (
      mkdir build
      cd build
      cmake -S.. -B. -DCMAKE_BUILD_TYPE=Release -GNinja |& _LOG xgrammar.log
      cmake --build . |& _LOG xgrammar.log
    )
    _BUILD python |& _LOG xgrammar.log
  )

  # decord (for sglang)
  : "${DECORD_COMMIT:?}"
  (
    apt-get -qq update
    apt-get -q install --no-install-recommends -y \
      build-essential python3-dev python3-setuptools \
      make cmake ffmpeg \
      libavcodec-dev libavfilter-dev libavformat-dev libavutil-dev
    git clone --recursive --filter=blob:none https://github.com/dmlc/decord
    cd decord
    _CHECKOUT "${DECORD_COMMIT}"
    (
      mkdir build
      cd build
      cmake -S.. -B. -DUSE_CUDA=0 -DCMAKE_BUILD_TYPE=Release -GNinja |& _LOG decord.log
      cmake --build . |& _LOG decord.log
      # install.bash picks this library up from /wheels.
      cp libdecord.so /wheels/libdecord.so
    )
    cd python
    _BUILD . |& _LOG decord.log
  )
fi

apt-get clean

--------------------------------------------------------------------------------
/sglang/install.bash:
--------------------------------------------------------------------------------
#!/bin/bash
# Installs the wheels produced by build.bash from /wheels, plus the remaining
# runtime dependencies of sglang from PyPI.
set -xeo pipefail
export DEBIAN_FRONTEND=noninteractive

# Pin torch/torchvision/torchaudio to the versions already installed.
_CONSTRAINTS="$(
  python3 -m pip list | sed -En 's@^(torch(vision|audio)?)\s+(\S+)$@\1==\3@p'
)"
_PIP_INSTALL() {
  python3 -m pip install --no-cache-dir \
    --constraint=/dev/stdin <<< "${_CONSTRAINTS}" \
    "$@"
}

_PIP_INSTALL /wheels/*.whl
# A locally built libdecord.so is only present when build.bash ran on a
# non-x86_64 host. Test for the file itself rather than its execute bit.
if [ -f /wheels/libdecord.so ]; then
  # Separate commands so that any failure trips set -e.
  apt-get -qq update
  apt-get -q install --no-install-recommends -y \
    libavfilter7 libavformat58
  apt-get clean
  cp /wheels/libdecord.so /usr/local/lib/
  ldconfig
fi

SGLANG_EXTRA_PIP_DEPENDENCIES=()
if [ "$(uname -m)" = 'x86_64' ]; then
  # On x86_64 these come from PyPI instead of being built locally.
  SGLANG_EXTRA_PIP_DEPENDENCIES=('decord' 'xgrammar>=0.1.10')
fi
_PIP_INSTALL \
  'aiohttp' 'fastapi' \
  'hf_transfer' 'huggingface_hub' 'interegular' 'modelscope' \
  'orjson' 'packaging' 'pillow' 'prometheus-client>=0.20.0' \
  'psutil' 'pydantic' 'python-multipart' 'pyzmq>=25.1.2' \
  'torchao>=0.7.0' 'uvicorn' 'uvloop' \
  'cuda-python' 'outlines>=0.0.44,<0.1.0' \
  "${SGLANG_EXTRA_PIP_DEPENDENCIES[@]}"

--------------------------------------------------------------------------------
/tensorizer/Dockerfile:
--------------------------------------------------------------------------------
# Torch base image with coreweave/tensorizer installed from source.
FROM ghcr.io/coreweave/ml-containers/torch:es-22.04-3ce72cc-base-cuda12.2.2-torch2.1.2-vision0.16.2-audio2.1.2
# Git ref of coreweave/tensorizer to install.
ARG COMMIT=main

RUN mkdir /app
WORKDIR /app

# --no-cache-dir keeps pip's download cache out of the image layer.
RUN git clone https://github.com/coreweave/tensorizer && \
    cd tensorizer && \
    git checkout ${COMMIT} && \
    pip3 install --no-cache-dir .
--------------------------------------------------------------------------------
/torch-extras/Dockerfile:
--------------------------------------------------------------------------------
# syntax=docker/dockerfile:1.2

# Global build arguments; each stage re-declares the ones it uses.
ARG BASE_IMAGE
ARG DEEPSPEED_VERSION="0.14.4"
ARG APEX_COMMIT="a1df80457ba67d60cbdb0d3ddfb08a2702c821a8"
ARG DEEPSPEED_KERNELS_COMMIT="e77acc40b104696d4e73229b787d1ef29a9685b1"
ARG DEEPSPEED_KERNELS_CUDA_ARCH_LIST="80;86;89;90"
ARG XFORMERS_VERSION="0.0.28.post1"
ARG BUILD_MAX_JOBS=""

# Fetch Apex at the pinned commit (with submodules), dropping docs directories.
FROM alpine/git:2.36.3 AS apex-downloader
WORKDIR /git
ARG APEX_COMMIT
RUN git clone --filter=blob:none --depth 1 --no-single-branch --no-checkout \
      https://github.com/NVIDIA/apex && \
    cd apex && \
    git checkout "${APEX_COMMIT}" && \
    git submodule update --init --recursive --jobs 8 \
      --depth 1 --filter=blob:none && \
    find -type d -name docs -prune -exec rm -r '{}' ';'


# Fetch DeepSpeed-Kernels at the pinned commit (with submodules).
FROM alpine/git:2.36.3 AS ds-kernels-downloader
WORKDIR /git
ARG DEEPSPEED_KERNELS_COMMIT
RUN git clone --filter=blob:none --depth 1 --no-single-branch --no-checkout \
      https://github.com/microsoft/DeepSpeed-Kernels ds-kernels && \
    cd ds-kernels && \
    git checkout "${DEEPSPEED_KERNELS_COMMIT}" && \
    git submodule update --init --recursive --jobs 8 \
      --depth 1 --filter=blob:none


# Dependencies requiring NVCC are built ahead of time in a separate stage
# so that the ~2 GiB dev library installations don't have to be included
# in the final image.
FROM ${BASE_IMAGE} AS builder-base
# NOTE(review): the CUDA dev packages are installed BEFORE `apt-get update`,
# i.e. against the package lists shipped in the base image. This looks
# deliberate (it keeps dev packages on the same index snapshot as the runtime
# libraries already installed), so the order is left untouched - confirm.
RUN export \
      CUDA_MAJOR_VERSION=$(echo $CUDA_VERSION | cut -d. -f1) \
      CUDA_MINOR_VERSION=$(echo $CUDA_VERSION | cut -d. -f2) && \
    export \
      CUDA_PACKAGE_VERSION="${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}" && \
    apt-get install -y --no-install-recommends --no-upgrade \
      cuda-nvcc-${CUDA_PACKAGE_VERSION} \
      cuda-nvml-dev-${CUDA_PACKAGE_VERSION} \
      libcurand-dev-${CUDA_PACKAGE_VERSION} \
      libcublas-dev-${CUDA_PACKAGE_VERSION} \
      libcusparse-dev-${CUDA_PACKAGE_VERSION} \
      libcusolver-dev-${CUDA_PACKAGE_VERSION} \
      cuda-profiler-api-${CUDA_PACKAGE_VERSION} \
      cuda-nvtx-${CUDA_PACKAGE_VERSION} \
      cuda-nvrtc-dev-${CUDA_PACKAGE_VERSION} && \
    apt-get -qq update && \
    apt-get install -y --no-install-recommends \
      libaio-dev \
      ninja-build && \
    apt-get clean

# Install the cuDNN dev package for building Apex
# The cuDNN runtime is installed in the base torch image
COPY --chmod=755 install_cudnn.sh /tmp/install_cudnn.sh
RUN /tmp/install_cudnn.sh "${CUDA_VERSION}" dev && \
    rm /tmp/install_cudnn.sh

# Add Kitware's apt repository to get a newer version of CMake
# (-n skips apt-add-repository's implicit index refresh; the explicit
# `apt-get update` that follows does it exactly once)
RUN apt-get -qq update && apt-get -qq install -y \
      software-properties-common lsb-release && \
    { wget -qO - https://apt.kitware.com/keys/kitware-archive-latest.asc \
      | gpg --dearmor -o /etc/apt/trusted.gpg.d/kitware.gpg; } && \
    apt-add-repository -n "deb https://apt.kitware.com/ubuntu/ $(lsb_release -cs) main" && \
    apt-get -qq update && \
    apt-get -qq install -y 'cmake=3.31.6-*' 'cmake-data=3.31.6-*' && \
    apt-get clean && \
    python3 -m pip install --no-cache-dir 'cmake==3.31.6'

# Update compiler (GCC) and linker (LLD) versions
# gfortran-11 is just for compiler_wrapper.f95
RUN LLVM_VERSION='18' && \
    apt-get -qq update && apt-get -qq install --no-install-recommends -y \
      gcc-11 g++-11 gfortran-11 "lld-$LLVM_VERSION" && \
    apt-get clean && \
    update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11 && \
    update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-11 11 && \
    update-alternatives --install \
      /usr/bin/gfortran gfortran /usr/bin/gfortran-11 11 && \
    update-alternatives --install /usr/bin/ld ld "/usr/bin/ld.lld-$LLVM_VERSION" 1

RUN mkdir /wheels /build
WORKDIR /build

# DeepSpeed forces -march=native into the compiler options,
# making the result dependent on the processor architecture
# used on the builder machine.
# The compiler wrapper normalizes -march=native to a fixed architecture
# (AMD64_NATIVE_ARCH on x86_64, ARM64_NATIVE_ARCH on aarch64)
# along with a couple other transformations before invoking GCC.
COPY compiler_wrapper.f95 .
ARG AMD64_NATIVE_ARCH="skylake"
ARG ARM64_NATIVE_ARCH="armv8.5-a+nopredres"
# On aarch64 the AVX definitions are dropped entirely (WRAPPER_NO_AVX);
# elsewhere they are rewritten to AVX256.
RUN if [ "$(uname -m)" = "aarch64" ]; then \
      NATIVE="WRAPPER_NATIVE=\"${ARM64_NATIVE_ARCH}\"" && \
      AVX='WRAPPER_NO_AVX'; \
    else \
      NATIVE="WRAPPER_NATIVE=\"${AMD64_NATIVE_ARCH}\"" && \
      AVX='WRAPPER_AVX="AVX256"'; \
    fi && \
    gfortran -ffree-line-length-512 -cpp -O3 "-D${NATIVE}" "-D${AVX}" ./compiler_wrapper.f95 -o ./compiler && rm ./compiler_wrapper.f95

COPY --chmod=755 effective_cpu_count.sh .
COPY --chmod=755 scale.sh .

# Extra nvcc flags shared by all builder stages, persisted to /build/nvcc.conf
# and loaded into NVCC_APPEND_FLAGS by each build step.
# When NV_CUDA_LIB_VERSION is 12.8.x or 12.9.x, sm_100 and sm_100a targets
# are appended as well.
ARG BUILD_NVCC_APPEND_FLAGS="-gencode=arch=compute_90a,code=sm_90a"
RUN FLAGS="$BUILD_NVCC_APPEND_FLAGS" && \
    case "${NV_CUDA_LIB_VERSION}" in 12.[89].*) \
      FLAGS="${FLAGS} -gencode=arch=compute_100,code=sm_100 -gencode=arch=compute_100a,code=sm_100a" ;; \
    esac && \
    echo "-Wno-deprecated-gpu-targets -diag-suppress 191,186,177${FLAGS:+ $FLAGS}" > /build/nvcc.conf
ARG BUILD_MAX_JOBS


# Builds the DeepSpeed-Kernels and DeepSpeed wheels into /wheels.
FROM builder-base as deepspeed-builder

ARG DEEPSPEED_KERNELS_CUDA_ARCH_LIST
# Build DeepSpeed-Kernels from the downloader stage's checkout and install it
# in this stage as well.
RUN --mount=type=bind,from=ds-kernels-downloader,source=/git/ds-kernels,target=ds-kernels/,rw \
    export NVCC_APPEND_FLAGS="$(cat /build/nvcc.conf)" && \
    cd ds-kernels && \
    export CUDA_ARCH_LIST="${DEEPSPEED_KERNELS_CUDA_ARCH_LIST}" && \
    echo "CUDA_ARCH_LIST: ${CUDA_ARCH_LIST}" && \
    python3 -m pip wheel -w /wheels \
      --no-cache-dir --no-build-isolation --no-deps . && \
    python3 -m pip install /wheels/*.whl

# DeepSpeed build flags
# See: https://www.deepspeed.ai/tutorials/advanced-install
ARG DS_BUILD_OPS="0"
ARG DS_BUILD_CCL_COMM="0"
ARG DS_BUILD_CPU_ADAM="1"
ARG DS_BUILD_CPU_LION="1"
# Requires CUTLASS
ARG DS_BUILD_EVOFORMER_ATTN="0"
ARG DS_BUILD_FUSED_ADAM="1"
ARG DS_BUILD_FUSED_LION="1"
ARG DS_BUILD_CPU_ADAGRAD="1"
ARG DS_BUILD_FUSED_LAMB="1"
ARG DS_BUILD_QUANTIZER="1"
ARG DS_BUILD_RANDOM_LTD="1"
# sparse_attn has issues with PyTorch >= 2.0.0 as of DeepSpeed 0.9.4
ARG DS_BUILD_SPARSE_ATTN="0"
ARG DS_BUILD_TRANSFORMER="1"
ARG DS_BUILD_TRANSFORMER_INFERENCE="1"
ARG DS_BUILD_STOCHASTIC_TRANSFORMER="1"
ARG DS_BUILD_UTILS="1"
ARG DS_BUILD_AIO="1"

ARG DEEPSPEED_VERSION

# bash with pipefail is needed for the ${!VAR} indirection and [[ ]] below.
# The torch version check redirects grep to /dev/null rather than using a
# quiet flag. NOTE(review): presumably so grep reads all of pip's output and
# pip never sees a closed pipe under pipefail - confirm.
SHELL ["/bin/bash", "-o", "pipefail", "-c"]
RUN export NVCC_APPEND_FLAGS="$(cat /build/nvcc.conf)" && \
    python3 -m pip install -U --no-cache-dir \
      setuptools wheel pip py-cpuinfo && \
    if python3 -m pip show torch | grep 'Version: 2\.[1-9]' > /dev/null; then \
      # DeepSpeed's AIO extension is incompatible with PyTorch 2.1.x's
      # requirement for C++17 (as of DeepSpeed 0.10.1).
      # See: https://github.com/microsoft/DeepSpeed/pull/3976
      # Note: the pattern above matches every torch version from 2.1 upward,
      # not only 2.1.x, so the DS_BUILD_AIO="1" default is overridden for all
      # of them.
      export DS_BUILD_AIO='0'; \
    fi && \
    { \
      # DeepSpeed doesn't handle blank environment variables
      # in the same way as unset ones, so clear any blank ones.
      for VAR in \
        DS_BUILD_OPS \
        DS_BUILD_CCL_COMM \
        DS_BUILD_CPU_ADAM \
        DS_BUILD_CPU_LION \
        DS_BUILD_EVOFORMER_ATTN \
        DS_BUILD_FUSED_ADAM \
        DS_BUILD_FUSED_LION \
        DS_BUILD_CPU_ADAGRAD \
        DS_BUILD_FUSED_LAMB \
        DS_BUILD_QUANTIZER \
        DS_BUILD_RANDOM_LTD \
        DS_BUILD_SPARSE_ATTN \
        DS_BUILD_TRANSFORMER \
        DS_BUILD_TRANSFORMER_INFERENCE \
        DS_BUILD_STOCHASTIC_TRANSFORMER \
        DS_BUILD_UTILS \
        DS_BUILD_AIO; \
      do if [[ -z ${!VAR} ]]; then unset ${VAR}; fi; done; \
    } && \
    CC=$(realpath -e ./compiler) \
    MAX_JOBS="${BUILD_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 4 24)}" \
    python3 -m pip wheel -w /wheels \
      --no-cache-dir --no-build-isolation --no-deps -v \
      deepspeed==${DEEPSPEED_VERSION} && \
    rm ./*
SHELL ["/bin/sh", "-c"]

WORKDIR /wheels


# Builds the Apex wheel into /wheels.
FROM builder-base as apex-builder

# Install the NCCL dev package at exactly the version of the libnccl2 runtime
# that is already installed.
RUN LIBNCCL2_VERSION=$(dpkg-query --showformat='${Version}' --show libnccl2) && \
    apt-get -qq update && apt-get install -y --no-install-recommends \
      libnccl-dev=$LIBNCCL2_VERSION && \
    apt-get clean

# --distributed_adam, --distributed_lamb, and --group_norm aren't documented
# in the Apex README, but are defined in its setup.py config.
209 | RUN --mount=type=bind,from=apex-downloader,source=/git/apex,target=apex/,rw \ 210 | export NVCC_APPEND_FLAGS="$(cat /build/nvcc.conf)" && \ 211 | python3 -m pip install -U --no-cache-dir \ 212 | packaging setuptools wheel pip && \ 213 | CUDA_MAJOR_VERSION=$(echo "${CUDA_VERSION}" | cut -d. -f1) && \ 214 | CHECK_VERSION() { \ 215 | dpkg-query --status "$1" 2>/dev/null \ 216 | | sed -ne 's/Version: //p' \ 217 | | grep .; \ 218 | } && \ 219 | LIBCUDNN_VER="$( \ 220 | CHECK_VERSION libcudnn8-dev || \ 221 | CHECK_VERSION "libcudnn9-dev-cuda-${CUDA_MAJOR_VERSION}" || \ 222 | :; \ 223 | )" && \ 224 | export CC=$(realpath -e ./compiler) && \ 225 | export MAX_JOBS="${BUILD_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 8 24)}" && \ 226 | printf -- '--config-settings="--build-option=%s" ' $( \ 227 | echo \ 228 | --cpp_ext \ 229 | --cuda_ext \ 230 | --distributed_adam \ 231 | --distributed_lamb \ 232 | --permutation_search \ 233 | --xentropy \ 234 | --focal_loss \ 235 | --group_norm \ 236 | --index_mul_2d \ 237 | --deprecated_fused_adam \ 238 | --deprecated_fused_lamb \ 239 | --fast_layer_norm \ 240 | --fmha \ 241 | --fast_multihead_attn \ 242 | --transducer \ 243 | --peer_memory \ 244 | --nccl_p2p \ 245 | --fast_bottleneck && \ 246 | if [ -n "$LIBCUDNN_VER" ]; then \ 247 | echo \ 248 | --bnp \ 249 | --cudnn_gbn \ 250 | --fused_conv_bias_relu; \ 251 | fi; \ 252 | ) > ./apex-extensions.conf && \ 253 | echo "Extensions: $(cat ./apex-extensions.conf)" && \ 254 | cd apex && \ 255 | xargs -a ../apex-extensions.conf python3 -m pip wheel -w /wheels -v --no-cache-dir --no-build-isolation --no-deps ./ 256 | 257 | WORKDIR /wheels 258 | 259 | FROM builder-base as xformers-builder 260 | 261 | ARG XFORMERS_VERSION 262 | 263 | SHELL ["/bin/bash", "-o", "pipefail", "-c"] 264 | RUN export NVCC_APPEND_FLAGS="$(cat /build/nvcc.conf)" && \ 265 | python3 -m pip install -U --no-cache-dir \ 266 | setuptools wheel pip && \ 267 | CC=$(realpath -e ./compiler) \ 268 | MAX_JOBS=1 \ 269 | 
PYTHONUNBUFFERED=1 \ 270 | XFORMERS_DISABLE_FLASH_ATTN=1 \ 271 | python3 -m pip wheel -w /wheels -v \ 272 | --no-cache-dir --no-build-isolation --no-deps \ 273 | --no-binary=xformers \ 274 | xformers==${XFORMERS_VERSION} 2> \ 275 | >(grep -Ev --line-buffered 'ptxas info\s*:|bytes spill stores' >&2) 276 | 277 | SHELL ["/bin/sh", "-c"] 278 | 279 | WORKDIR /build 280 | 281 | FROM ${BASE_IMAGE} 282 | 283 | RUN apt-get -qq update && \ 284 | apt-get install -y --no-install-recommends libaio-dev && \ 285 | apt-get clean 286 | 287 | 288 | RUN --mount=type=bind,from=deepspeed-builder,source=/wheels,target=/tmp/wheels \ 289 | python3 -m pip install --no-cache-dir /tmp/wheels/*.whl 290 | RUN --mount=type=bind,from=apex-builder,source=/wheels,target=/tmp/wheels \ 291 | python3 -m pip install --no-cache-dir /tmp/wheels/*.whl 292 | RUN --mount=type=bind,from=xformers-builder,source=/wheels,target=/tmp/wheels \ 293 | python3 -m pip install --no-cache-dir /tmp/wheels/*.whl 294 | -------------------------------------------------------------------------------- /torch-extras/compiler_wrapper.f95: -------------------------------------------------------------------------------- 1 | #ifndef WRAPPER_NATIVE 2 | #define WRAPPER_NATIVE "skylake" 3 | #endif 4 | 5 | #ifndef WRAPPER_CC 6 | #define WRAPPER_CC "gcc" 7 | #endif 8 | 9 | #ifndef WRAPPER_AVX 10 | #define WRAPPER_AVX "AVX256" 11 | #endif 12 | 13 | PROGRAM compiler_wrapper 14 | ! Wraps C compiler invocations, 15 | ! replacing -D__AVX512__, -D__AVX256__, and -D__SCALAR__ preprocessor definitions 16 | ! with -D____, and -march=native with -march=, 17 | ! for better reproducibility and compatibility. 
18 | IMPLICIT NONE 19 | INTEGER :: i, exitcode = 0, full_length = 0, truncated = 0 20 | CHARACTER(len=:), ALLOCATABLE :: arg, command 21 | ALLOCATE(CHARACTER(len=128) :: arg) 22 | command = WRAPPER_CC 23 | 24 | DO i = 1, COMMAND_ARGUMENT_COUNT() 25 | DO 26 | CALL GET_COMMAND_ARGUMENT(i, arg, full_length, truncated) 27 | IF (truncated == 0) THEN 28 | EXIT 29 | ELSE IF (truncated == -1) THEN 30 | DEALLOCATE(arg) 31 | ALLOCATE(CHARACTER(len=full_length) :: arg) 32 | ELSE 33 | CALL EXIT(95) 34 | END IF 35 | END DO 36 | IF (arg == "-march=native") THEN 37 | command = command // (" '-march=" // WRAPPER_NATIVE // "'") 38 | ELSE IF ( & 39 | arg == "-D__AVX512__" & 40 | .OR. arg == "-D__AVX256__" & 41 | .OR. arg == "-D__SCALAR__" & 42 | ) THEN 43 | #ifndef WRAPPER_NO_AVX 44 | command = command // (" '-D__" // WRAPPER_AVX // "__'") 45 | #endif 46 | ELSE 47 | command = command // shell_escaped(arg) 48 | END IF 49 | END DO 50 | CALL SYSTEM(command, exitcode) 51 | IF (exitcode > 255) THEN 52 | exitcode = MAX(IAND(exitcode, 255), 1) 53 | END IF 54 | CALL EXIT(exitcode) 55 | 56 | 57 | CONTAINS 58 | FUNCTION shell_escaped(str) RESULT(out) 59 | ! Turns [str] into [ 'str'] and replaces all 60 | ! internal ['] characters with ['"'"'] 61 | IMPLICIT NONE 62 | CHARACTER(len=*), INTENT(IN) :: str 63 | CHARACTER(len=:), ALLOCATABLE :: out 64 | INTEGER :: old_i, out_i, old_len, out_len 65 | 66 | old_len = LEN_TRIM(str) 67 | ! Figure out the new length to allocate by scanning `str`. 68 | ! This always needs to add at least [ '] at the beginning 69 | ! and ['] at the end, so the length increases by at least 3. 70 | out_len = old_len + 3 71 | DO old_i = 1, old_len 72 | IF (str(old_i:old_i) == "'") THEN 73 | out_len = out_len + 4 74 | END IF 75 | END DO 76 | ALLOCATE(CHARACTER(len=out_len) :: out) 77 | 78 | ! Copy over the string, performing necessary escapes. 79 | out(1:2) = " '" 80 | out_i = 3 81 | DO old_i = 1, old_len 82 | IF (str(old_i:old_i) == "'") THEN 83 | ! 
Escape internal single-quotes 84 | out(out_i:out_i + 4) = '''"''"''' 85 | out_i = out_i + 5 86 | ELSE 87 | ! No escaping needed 88 | out(out_i:out_i) = str(old_i:old_i) 89 | out_i = out_i + 1 90 | END IF 91 | END DO 92 | out(out_i:out_i) = "'" 93 | END FUNCTION 94 | END PROGRAM 95 | -------------------------------------------------------------------------------- /torch-extras/effective_cpu_count.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | CPU_QUOTA() ( 4 | CGROUP='/sys/fs/cgroup'; 5 | CGROUP_V1="$CGROUP/cpu,cpuacct"; 6 | CGROUP_V1_QUOTA="$CGROUP_V1/cpu.cfs_quota_us"; 7 | CGROUP_V1_PERIOD="$CGROUP_V1/cpu.cfs_period_us"; 8 | CGROUP_V2="$CGROUP/user.slice/cpu.max"; 9 | if [ ! -d "$CGROUP" ]; then 10 | return 1; 11 | elif [ -f "$CGROUP_V1_QUOTA" ] && [ -f "$CGROUP_V1_PERIOD" ]; then 12 | IFS='' read -r QUOTA 2> /dev/null < "$CGROUP_V1_QUOTA" || return 1; 13 | IFS='' read -r PERIOD 2> /dev/null < "$CGROUP_V1_PERIOD" || return 1; 14 | elif [ -f "$CGROUP_V2" ]; then 15 | IFS=' ' read -r QUOTA PERIOD 2> /dev/null < "$CGROUP_V2" || return 1; 16 | else 17 | return 1; 18 | fi; 19 | 20 | if [ "$QUOTA" -gt 0 ] 2> /dev/null && [ "$PERIOD" -gt 0 ] 2> /dev/null; then 21 | echo $((QUOTA / PERIOD)); 22 | return 0; 23 | else 24 | return 1; 25 | fi; 26 | ) 27 | 28 | EFFECTIVE_CPU_COUNT() { 29 | CPU_QUOTA || getconf _NPROCESSORS_ONLN; 30 | } 31 | 32 | EFFECTIVE_CPU_COUNT; 33 | -------------------------------------------------------------------------------- /torch-extras/install_cudnn.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | CUDA_VERSION="$1"; 4 | if [ -z "$CUDA_VERSION" ]; then 5 | exit 14; 6 | fi; 7 | 8 | INSTALL_DEV="$2"; 9 | if [ "$INSTALL_DEV" = "dev" ]; then 10 | echo "Ensuring installation of cuDNN (dev)"; 11 | DEV_SUFFIX="-dev"; 12 | DEV_PREFIX=""; 13 | elif [ "$INSTALL_DEV" = "runtime" ]; then 14 | echo "Ensuring installation of cuDNN (runtime)"; 15 | 
DEV_SUFFIX=""; 16 | DEV_PREFIX="lib"; 17 | else 18 | exit 15; 19 | fi; 20 | 21 | CHECK_VERSION() { 22 | dpkg-query --status "$1" 2>/dev/null \ 23 | | sed -ne 's/Version: //p' \ 24 | | grep .; 25 | } 26 | 27 | CUDA_MAJOR_VERSION=$(echo "$CUDA_VERSION" | cut -d. -f1); 28 | LIBCUDNN_VER="$( 29 | CHECK_VERSION "libcudnn8${DEV_SUFFIX}" || \ 30 | CHECK_VERSION "libcudnn9${DEV_SUFFIX}-cuda-${CUDA_MAJOR_VERSION}" || \ 31 | :; 32 | )" || exit 16; 33 | 34 | if [ -z "$LIBCUDNN_VER" ]; then 35 | apt-get -qq update && \ 36 | apt-get -qq install --no-upgrade -y "${DEV_PREFIX}cudnn9-cuda-${CUDA_MAJOR_VERSION}" && \ 37 | apt-get clean && \ 38 | ldconfig; 39 | else 40 | echo "Found cuDNN version ${LIBCUDNN_VER}" 41 | fi; 42 | -------------------------------------------------------------------------------- /torch-extras/scale.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | set -e; 4 | 5 | VAL="$1"; 6 | DIVISOR="$2"; 7 | MAXIMUM="$3"; 8 | 9 | [ -n "$VAL" ]; 10 | 11 | if [ -n "$DIVISOR" ]; 12 | then VAL="$((( $VAL + $DIVISOR - 1 ) / $DIVISOR))"; 13 | fi; 14 | 15 | if [ -n "$MAXIMUM" ]; 16 | then VAL="$((VAL > MAXIMUM ? 
MAXIMUM : VAL))"; 17 | fi; 18 | 19 | echo "$VAL"; 20 | -------------------------------------------------------------------------------- /torch/Dockerfile: -------------------------------------------------------------------------------- 1 | # syntax=docker/dockerfile:1.7 2 | ARG BUILDER_BASE_IMAGE="nvidia/cuda:12.9.0-devel-ubuntu22.04" 3 | ARG FINAL_BASE_IMAGE="nvidia/cuda:12.9.0-base-ubuntu22.04" 4 | 5 | ARG BUILD_TORCH_VERSION="2.7.0" 6 | ARG BUILD_TORCH_VISION_VERSION="0.22.0" 7 | ARG BUILD_TORCH_AUDIO_VERSION="2.7.0" 8 | ARG BUILD_TRANSFORMERENGINE_VERSION="1.13" 9 | ARG BUILD_FLASH_ATTN_VERSION="2.7.4.post1" 10 | ARG BUILD_FLASH_ATTN_3_VERSION="2.7.2.post1" 11 | ARG BUILD_TRITON_VERSION="" 12 | ARG BUILD_TRITON="1" 13 | ARG BUILD_TORCH_CUDA_ARCH_LIST="7.0 8.0 8.9 9.0 10.0+PTX" 14 | ARG BUILD_TRANSFORMERENGINE_CUDA_ARCH_LIST="70;80;89;90;100" 15 | 16 | ARG AOCL_BASE="/opt/aocl" 17 | ARG AOCL_VER="4.2.0" 18 | ARG AOCL_URL="https://download.amd.com/developer/eula/aocl/aocl-4-2/aocl-linux-aocc-4.2.0.tar.gz" 19 | 20 | # region Downloads 21 | 22 | # Clone PyTorch repositories independently from all other build steps 23 | # for cache-friendliness and parallelization 24 | FROM alpine/git:2.40.1 AS downloader-base 25 | WORKDIR /git 26 | RUN git config --global advice.detachedHead false 27 | 28 | COPY <<-"EOT" /git/clone.sh 29 | #!/bin/sh 30 | REPO="https://github.com/$1"; 31 | DEST="$2"; 32 | REF="$3"; 33 | 34 | CLONE() { git clone -j8 --depth=1 --filter=blob:none "$@"; }; 35 | 36 | # Try cloning REF as a tag prefixed with "v", otherwise fall back 37 | # to git checkout for commit hashes 38 | CLONE --recurse-submodules --shallow-submodules --also-filter-submodules --no-tags \ 39 | "$REPO" -b "v$REF" "$DEST" || { \ 40 | CLONE --no-single-branch --no-checkout "$REPO" "$DEST" && \ 41 | git -C "$DEST" checkout "$REF" && \ 42 | git -C "$DEST" submodule update --init --filter=blob:none --depth=1 --recursive --jobs 8; \ 43 | }; 44 | EOT 45 | 46 | RUN chmod 755 /git/clone.sh 47 


# PyTorch sources at the requested version, without git metadata.
FROM downloader-base AS pytorch-downloader
ARG BUILD_TORCH_VERSION
# Includes a patch for a foreach bug in PyTorch v2.5.1
RUN ./clone.sh pytorch/pytorch pytorch "${BUILD_TORCH_VERSION}" && \
    if [ "${BUILD_TORCH_VERSION}" = '2.5.1' ]; then \
      wget 'https://github.com/pytorch/pytorch/commit/1cdaf1d85f5e4b3f8952fd0737a1afeb16995d13.patch' -qO- \
      | git -C pytorch apply; \
    fi && \
    rm -rf pytorch/.git

# torchvision sources at the requested version, without git metadata.
FROM downloader-base AS torchvision-downloader
ARG BUILD_TORCH_VISION_VERSION
RUN ./clone.sh pytorch/vision vision "${BUILD_TORCH_VISION_VERSION}" && \
    rm -rf vision/.git

FROM downloader-base AS torchaudio-downloader
ARG BUILD_TORCH_AUDIO_VERSION
RUN ./clone.sh pytorch/audio audio "${BUILD_TORCH_AUDIO_VERSION}"
# The torchaudio build requires that this directory remain a full git repository,
# so no rm -rf audio/.git is done for this one.

# torchaudio is broken for CUDA 12.5+ without this patch (up to and including v2.5.0)
# See https://github.com/pytorch/audio/pull/3811
# Fixed as a side effect of https://github.com/pytorch/audio/pull/3843 in versions after v2.5.0
COPY torchaudio-cu125-pr3811.patch /git/patch
# Apply the patch only when the kernel source does not already contain the
# include it adds. The check must name the specific header: a bare
# '#include ' would match any include line and the patch would never apply.
# NOTE(review): <float.h> is the header this guard is believed to look for -
# confirm against torchaudio-cu125-pr3811.patch.
RUN if grep -qF '#include <float.h>' \
      'audio/src/libtorchaudio/cuctc/src/ctc_prefix_decoder_kernel_v2.cu'; \
    then :; else git -C audio apply -v --stat --apply /git/patch; \
    fi && \
    rm /git/patch

FROM downloader-base AS transformerengine-downloader
ARG BUILD_TRANSFORMERENGINE_VERSION
RUN ./clone.sh NVIDIA/TransformerEngine TransformerEngine "${BUILD_TRANSFORMERENGINE_VERSION}"

FROM downloader-base AS flash-attn-downloader
ARG BUILD_FLASH_ATTN_VERSION
RUN ./clone.sh Dao-AILab/flash-attention flash-attention "${BUILD_FLASH_ATTN_VERSION}"

# flash-attention 3 is optional: an empty version yields an empty directory
# so that later COPY --from instructions still succeed.
FROM downloader-base AS flash-attn-3-downloader
ARG BUILD_FLASH_ATTN_3_VERSION
RUN if [ -n "$BUILD_FLASH_ATTN_3_VERSION" ]; then \
      ./clone.sh Dao-AILab/flash-attention flash-attention "${BUILD_FLASH_ATTN_3_VERSION}"; \
    else \
      mkdir flash-attention; \
    fi

# Resolve the Triton ref: PyTorch's CI pin by default,
# overridden by BUILD_TRITON_VERSION when that is non-empty.
FROM downloader-base AS triton-version
ENV TRITON_COMMIT_FILE='.ci/docker/ci_commit_pins/triton.txt'
COPY --link --from=pytorch-downloader "/git/pytorch/${TRITON_COMMIT_FILE}" /git/version.txt
ARG BUILD_TRITON_VERSION
RUN if [ -n "${BUILD_TRITON_VERSION}" ]; then \
      echo "${BUILD_TRITON_VERSION}" > /git/version.txt; \
    fi

# Triton is cloned from its current repository location (triton-lang/triton);
# the former openai/triton path only works through a GitHub redirect.
FROM downloader-base AS triton-downloader
COPY --link --from=triton-version /git/version.txt /git/version.txt
ARG BUILD_TRITON
RUN if [ "${BUILD_TRITON}" = '1' ]; then \
      ./clone.sh triton-lang/triton triton "$(cat /git/version.txt)"; \
    else \
      mkdir triton; \
    fi

# Download AOCL and install its blis, libflame and utils libraries (LP64 only)
# under AOCL_BASE, with stable include/lib symlinks and a linker search-path
# snippet (aocl.conf).
FROM alpine/curl:8.7.1 AS aocl-downloader
WORKDIR /tmp/install

RUN apk add --no-cache bash

ARG AOCL_BASE
ARG AOCL_VER
ARG AOCL_URL

RUN curl -sSfo- "${AOCL_URL}" | tar xzf - --strip-components 1 && \
    INSTALL_LIB() { ./install.sh -l "$1" -t "${AOCL_BASE}" -i lp64; } && \
    INSTALL_LIB blis && \
    INSTALL_LIB libflame && \
    INSTALL_LIB utils && \
    . ./amd-libs.cfg && \
    rm -r "${AOCL_ROOT}/include_ILP64" && \
    rm -r "${AOCL_ROOT}/lib_ILP64" && \
    ln -s "${AOCL_ROOT}/amd-libs.cfg" "${AOCL_BASE}/amd-libs.cfg" && \
    ln -s "${AOCL_ROOT}/include" "${AOCL_BASE}/include" && \
    ln -s "${AOCL_ROOT}/lib" "${AOCL_BASE}/lib" && \
    echo "${AOCL_BASE}/lib" \
    | install -m 0644 /dev/stdin "${AOCL_BASE}/aocl.conf" && \
    rm -r ./*

# endregion Downloads

## Build PyTorch on a builder image.
# Shared builder base: system packages, CUDA dev libraries, cuDNN, CMake,
# ccache and an updated compiler/linker toolchain.
FROM ${BUILDER_BASE_IMAGE} AS builder-base-shared
ENV DEBIAN_FRONTEND=noninteractive

ARG BUILD_CCACHE_SIZE="1Gi"

# ninja-build, ccache, and lld are optional but improve the build
# The libomp.so symlink gives the versioned libomp.so.5 an unversioned name
# in the multiarch library directory.
RUN apt-get -qq update && apt-get -qq install -y \
      libncurses5 python3 python3-pip git apt-utils ssh ca-certificates \
      libomp5 libpng-dev libjpeg-dev pkg-config python3-distutils \
      build-essential ninja-build && \
    apt-get clean && \
    /usr/bin/python3 -m pip install --no-cache-dir --upgrade pip && \
    update-alternatives --install /usr/bin/python python /usr/bin/python3 1 && \
    update-alternatives --install /usr/bin/pip pip /usr/bin/pip3 1 && \
    ln -s libomp.so.5 "/usr/lib/$(gcc -print-multiarch)/libomp.so" && \
    ldconfig

COPY --link --chmod=755 install_cudnn.sh /tmp/install_cudnn.sh

# Install the NVTX and NVRTC dev packages matching CUDA_VERSION
# (package suffix "<major>-<minor>"), then the cuDNN dev package.
RUN export \
      CUDA_MAJOR_VERSION=$(echo "$CUDA_VERSION" | cut -d. -f1) \
      CUDA_MINOR_VERSION=$(echo "$CUDA_VERSION" | cut -d. -f2) && \
    export \
      CUDA_PACKAGE_VERSION="${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}" && \
    apt-get -qq update && \
    apt-get -qq install --no-upgrade -y \
      cuda-nvtx-${CUDA_PACKAGE_VERSION} \
      cuda-nvrtc-dev-${CUDA_PACKAGE_VERSION} && \
    /tmp/install_cudnn.sh "${CUDA_VERSION}" dev && \
    rm /tmp/install_cudnn.sh && \
    apt-get clean

# Add Kitware's apt repository to get a newer version of CMake
RUN apt-get -qq update && apt-get -qq install -y \
      software-properties-common lsb-release && \
    { wget -qO - https://apt.kitware.com/keys/kitware-archive-latest.asc \
      | gpg --dearmor -o /etc/apt/trusted.gpg.d/kitware.gpg; } && \
    apt-add-repository -n "deb https://apt.kitware.com/ubuntu/ $(lsb_release -cs) main" && \
    apt-get -qq update && \
    apt-get -qq install -y 'cmake=3.31.6-*' 'cmake-data=3.31.6-*' && \
    apt-get clean && \
    python3 -m pip install --no-cache-dir 'cmake==3.31.6'

# Build and install ccache 4.8.2 from source, then set its size limit
# (BUILD_CCACHE_SIZE) and remove the file-count limit (-F 0).
# NOTE(review): these settings are written before CCACHE_DIR is set below, so
# they presumably land in the default per-user configuration rather than under
# /ccache - confirm they are still in effect at build time.
RUN mkdir /tmp/ccache-install && \
    cd /tmp/ccache-install && \
    CCACHE_URL='https://github.com/ccache/ccache/releases/download/v4.8.2/ccache-4.8.2.tar.xz' && \
    wget -qO - "$CCACHE_URL" | tar --strip-components 1 -xJf - && \
    mkdir build && \
    cd build && \
    cmake -B. -S.. -DCMAKE_BUILD_TYPE=Release && \
    cmake --build . --config Release && \
    make install && \
    cd ../.. && \
    rm -rf /tmp/ccache-install && \
    ccache -M "${BUILD_CCACHE_SIZE}" && \
    ccache -F 0

# Build-time environment variables
ENV CCACHE_DIR=/ccache \
    CMAKE_C_COMPILER_LAUNCHER=ccache \
    CMAKE_CXX_COMPILER_LAUNCHER=ccache \
    CMAKE_CUDA_COMPILER_LAUNCHER=ccache

# Update compiler (GCC) and linker (LLD) versions
# SETUP_TOOLCHAIN is retried once after a random 10-20 second pause; the sed
# filter turns a "connection timed out" message from apt-add-repository into
# a failing exit status so that the retry is triggered.
# LLD becomes the default linker on every architecture except aarch64.
RUN LLVM_VERSION='18' && \
    CODENAME="$(lsb_release -cs)" && \
    wget -qO - 'https://apt.llvm.org/llvm-snapshot.gpg.key' > /etc/apt/trusted.gpg.d/apt.llvm.org.asc && \
    apt-add-repository -n "deb https://apt.llvm.org/$CODENAME/ llvm-toolchain-$CODENAME-$LLVM_VERSION main" && \
    SETUP_TOOLCHAIN() { \
      apt-add-repository -y ppa:ubuntu-toolchain-r/test 2>&1 \
      | sed -e '/connection timed out/{p; Q1}' && \
      apt-get -qq install --no-install-recommends -y \
        gcc-11 g++-11 gfortran-11 \
        "lld-$LLVM_VERSION" "libomp-$LLVM_VERSION-dev" && \
      apt-get clean; \
    } && \
    { SETUP_TOOLCHAIN || { sleep "$(shuf -i10-20 -n1)" && SETUP_TOOLCHAIN; }; } && \
    update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11 && \
    update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-11 11 && \
    update-alternatives --install /usr/bin/gfortran gfortran /usr/bin/gfortran-11 11 && \
    if [ "$(uname -m)" != 'aarch64' ]; then \
      update-alternatives --install /usr/bin/ld ld "/usr/bin/ld.lld-$LLVM_VERSION" 1; \
    fi && \
    ldconfig


FROM builder-base-shared AS builder-base-arm64
ARG BUILD_TORCH_CUDA_ARCH_LIST
# Strips everything up to and including the first " 9.0" from the list and
# prepends "9.0", i.e. drops the architectures listed before 9.0.
ENV TORCH_CUDA_ARCH_LIST="9.0${BUILD_TORCH_CUDA_ARCH_LIST#* 9.0}"
# There is currently no CPU BLAS used for ARM builds


FROM builder-base-shared AS builder-base-amd64
ARG BUILD_TORCH_CUDA_ARCH_LIST
ENV TORCH_CUDA_ARCH_LIST="${BUILD_TORCH_CUDA_ARCH_LIST}"
# Install AOCL-BLAS and AOCL-LAPACK
# See: https://www.amd.com/en/developer/aocl/dense.html
| ARG AOCL_BASE 238 | COPY --from=aocl-downloader "${AOCL_BASE}" "${AOCL_BASE}" 239 | 240 | # `ldconfig` lets the dynamic linker access AOCL libraries 241 | RUN install -m 0644 -t /etc/ld.so.conf.d "${AOCL_BASE}/aocl.conf" && \ 242 | ldconfig 243 | 244 | # These environment variables are only for the build stage, 245 | # and register paths to build-time AOCL resources. 246 | # This could alternatively be done by invoking `. "${AOCL_BASE}/amd-libs.cfg"` 247 | # in every RUN compilation step, but this will make sure it is never missed. 248 | # 249 | # PyTorch's logic to find LAPACK during CMake configuration 250 | # additionally requires its installed path to either be in: 251 | # - One of: 252 | # - /usr/local/lib, or 253 | # - /usr/lib, or 254 | # - /usr/local/lib64, or 255 | # - /usr/lib64, or 256 | # - /usr/lib/aarch64-linux-gnu, or 257 | # - $LD_LIBRARY_PATH 258 | # While skipping $LIBRARY_PATH, and ld's normal configured paths, 259 | # so it is necessary to add $LD_LIBRARY_PATH here as well. 260 | # See: https://github.com/pytorch/pytorch/blob/v2.3.0/cmake/Modules/FindLAPACK.cmake#L56-L59 261 | ENV C_INCLUDE_PATH="${AOCL_BASE}/include${C_INCLUDE_PATH:+:$C_INCLUDE_PATH}" \ 262 | CPLUS_INCLUDE_PATH="${AOCL_BASE}/include${CPLUS_INCLUDE_PATH:+:$CPLUS_INCLUDE_PATH}" \ 263 | LD_LIBRARY_PATH="${AOCL_BASE}/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}" \ 264 | LIBRARY_PATH="${AOCL_BASE}/lib${LIBRARY_PATH:+:$LIBRARY_PATH}" 265 | 266 | 267 | FROM builder-base-${TARGETARCH} AS builder-base 268 | RUN mkdir /build /build/dist 269 | WORKDIR /build 270 | COPY --chmod=755 effective_cpu_count.sh . 271 | COPY --chmod=755 scale.sh . 272 | COPY compiler_wrapper.f95 . 
273 | ARG AMD64_NATIVE_ARCH="skylake" 274 | ARG ARM64_NATIVE_ARCH="armv8.5-a+nopredres" 275 | RUN if [ "$(uname -m)" = "aarch64" ]; then \ 276 | NATIVE="WRAPPER_NATIVE=\"${ARM64_NATIVE_ARCH}\"" && \ 277 | AVX='WRAPPER_NO_AVX'; \ 278 | else \ 279 | NATIVE="WRAPPER_NATIVE=\"${AMD64_NATIVE_ARCH}\"" && \ 280 | AVX='WRAPPER_AVX="AVX256"'; \ 281 | fi && \ 282 | gfortran -ffree-line-length-512 -cpp -O3 "-D${NATIVE}" "-D${AVX}" ./compiler_wrapper.f95 -o ./compiler && rm ./compiler_wrapper.f95 283 | 284 | COPY <<-"EOT" /build/version-string.sh 285 | #!/bin/sh 286 | set -e; 287 | VERSION="$1"; 288 | 289 | IS_HASH() { 290 | echo "$1" | grep -qxiEe '[0-9a-f]{40}'; 291 | }; 292 | 293 | if IS_HASH "$VERSION"; then 294 | REAL_VERSION="$(cat ./version.txt)"; 295 | SHORT_HASH="$(echo "$VERSION" | cut -c1-7)"; 296 | echo "$REAL_VERSION+$SHORT_HASH"; 297 | else 298 | echo "$VERSION"; 299 | fi; 300 | EOT 301 | RUN chmod 755 /build/version-string.sh 302 | 303 | COPY <<-"EOT" /build/storage-info.sh 304 | #!/bin/sh 305 | set -e; 306 | TARGET="$(realpath "$1")"; 307 | 308 | STORAGE_INFO="$(df -h '--output=fstype,used,avail,pcent,target' "$TARGET")" || exit 0; 309 | printf 'Storage info for %s:\n%s\n' "$TARGET" "$STORAGE_INFO"; 310 | EOT 311 | RUN chmod 755 /build/storage-info.sh 312 | 313 | ## Build torch 314 | RUN --mount=type=bind,from=pytorch-downloader,source=/git/pytorch,target=pytorch/ \ 315 | pip3 install --no-cache-dir --upgrade numpy && \ 316 | cd pytorch && pip3 install --no-cache-dir -r requirements.txt 317 | 318 | # Build tool & library paths, shared for all libraries to be built 319 | ENV CMAKE_PREFIX_PATH=/usr/bin/ \ 320 | LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/cuda/lib64:/usr/local/lib \ 321 | CUDA_BIN_PATH=/usr/local/cuda/bin \ 322 | CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda/ \ 323 | CUDNN_LIB_DIR=/usr/local/cuda/lib64 324 | 325 | ARG BUILD_TRITON 326 | ARG BUILD_MAX_JOBS="" 327 | RUN --mount=type=bind,from=triton-downloader,source=/git/triton,target=triton/,rw \ 328 | 
--mount=type=cache,target=/ccache \ 329 | if [ "$BUILD_TRITON" = '1' ]; then \ 330 | pip3 install --no-cache-dir pybind11 && \ 331 | export MAX_JOBS="${BUILD_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 3 32)}" && \ 332 | cd triton/python && \ 333 | python3 -m pip wheel -w ../../dist/ --no-build-isolation --no-deps -vv . && \ 334 | pip3 install ../../dist/*.whl; \ 335 | fi 336 | 337 | ARG BUILD_TORCH_VERSION 338 | ENV TORCH_VERSION=$BUILD_TORCH_VERSION 339 | # Filter out the 10.0 arch on CUDA versions != 12.8 and != 12.9 340 | ENV TORCH_CUDA_ARCH_LIST="${CUDA_VERSION##12.8.*}||${TORCH_CUDA_ARCH_LIST/ 10.0/}||${TORCH_CUDA_ARCH_LIST}" 341 | ENV TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST#12.9.?}" 342 | ENV TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST#||*||}" 343 | ENV TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST%||*}" 344 | ENV TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST#*||}" 345 | 346 | RUN printf 'Arch: %s\nTORCH_CUDA_ARCH_LIST=%s\n' "$(uname -m)" "${TORCH_CUDA_ARCH_LIST}" 347 | 348 | ARG BUILD_NVCC_APPEND_FLAGS="-gencode=arch=compute_90a,code=sm_90a" 349 | # Add sm_100a build if NV_CUDA_LIB_VERSION matches 12.[89].* 350 | RUN FLAGS="$BUILD_NVCC_APPEND_FLAGS" && \ 351 | case "${NV_CUDA_LIB_VERSION}" in 12.[89].*) \ 352 | FLAGS="${FLAGS} -gencode=arch=compute_100,code=sm_100 -gencode=arch=compute_100a,code=sm_100a" ;; \ 353 | esac && \ 354 | echo "-Wno-deprecated-gpu-targets -diag-suppress 191,186,177${FLAGS:+ $FLAGS}" > /build/nvcc.conf 355 | 356 | # If the directory /opt/nccl-tests exists, 357 | # the base image is assumed to be nccl-tests, 358 | # so it uses the system's special NCCL and UCC installations for the build. 359 | # 360 | # Additionally, this RUN is executed with the downloaded PyTorch repository 361 | # mounted temporarily in "rw" mode, which allows ephemeral writes like 362 | # OverlayFS would that do not mutate the downloaded copy. 
363 | # This means the downloaded data never needs to be duplicated in the cache in 364 | # a layer of this build step, and temporary build files are automatically 365 | # cleaned up at the end of the step once the directory is detached. 366 | # 367 | # This step is itself cacheable as long as the downloaded files (and ARCH_LIST) 368 | # remain the same. 369 | # 370 | # NB: This cannot specify BLAS=FLAME directly, because PyTorch (v2.3.0)'s code 371 | # to explicitly choose a BLAS implementation is missing that option 372 | # (See: https://github.com/pytorch/pytorch/blob/v2.3.0/cmake/Dependencies.cmake#L195-L266), 373 | # and using BLAS=blis makes it ignore the libflame LAPACK library, because 374 | # that triggers its FindBLIS logic rather than FindBLAS, and FindLAPACK depends 375 | # on a variable set only during FindBLAS (BLAS_INFO=FLAME) 376 | # (See: https://github.com/pytorch/pytorch/blob/v2.3.0/cmake/Modules/FindLAPACK.cmake#L176-L189). 377 | # Thus, we have to force it to use its generic FindBLAS logic, 378 | # and narrow it down from there by specifying WITH_BLAS=FLAME 379 | # (See: https://github.com/pytorch/pytorch/blob/v2.3.0/cmake/Modules/FindBLAS.cmake#L259-L271). 380 | # Without WITH_BLAS, it would detect the BLAS implementation as 381 | # BLAS_INFO=blis instead of BLAS_INFO=FLAME and wouldn't include LAPACK either. 382 | ARG BUILD_CXX11_ABI="" 383 | SHELL ["/bin/bash", "-eo", "pipefail", "-c"] 384 | RUN --mount=type=bind,from=pytorch-downloader,source=/git/pytorch,target=pytorch/,rw \ 385 | --mount=type=cache,target=/ccache \ 386 | export MAX_JOBS="${BUILD_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 3 32)}" && \ 387 | echo "MAX_JOBS: ${MAX_JOBS}" && \ 388 | export NVCC_APPEND_FLAGS="$(cat /build/nvcc.conf)" && \ 389 | echo "NVCC_APPEND_FLAGS: ${NVCC_APPEND_FLAGS}" && \ 390 | if [ -n "${BUILD_CXX11_ABI}" ]; then \ 391 | export _GLIBCXX_USE_CXX11_ABI="${BUILD_CXX11_ABI}"; \ 392 | fi && \ 393 | ./storage-info.sh . 
&& \ 394 | cd pytorch && \ 395 | ../storage-info.sh . && \ 396 | mkdir build && \ 397 | ln -s /usr/bin/cc build/cc && \ 398 | ln -s /usr/bin/c++ build/c++ && \ 399 | if [ "$(uname -m)" = 'aarch64' ]; then \ 400 | export USE_PRIORITIZED_TEXT_FOR_LD=1; \ 401 | fi && \ 402 | { if [ -d /opt/nccl-tests ]; then \ 403 | export \ 404 | USE_DISTRIBUTED=1 \ 405 | USE_NCCL=1 USE_SYSTEM_NCCL=1 \ 406 | UCC_HOME=${HPCX_UCC_DIR} UCX_HOME=${HPCX_UCX_DIR} \ 407 | USE_NCCL_WITH_UCC=1 \ 408 | USE_UCC=1 USE_SYSTEM_UCC=1; fi; } && \ 409 | USE_CUDNN=1 \ 410 | BUILD_TORCH=ON \ 411 | BUILD_TEST=0 \ 412 | CUDA_HOST_COMPILER=cc \ 413 | USE_CUDA=1 \ 414 | USE_NNPACK=1 \ 415 | CC=cc \ 416 | CXX=c++ \ 417 | USE_BLAS=1 \ 418 | USE_LAPACK=1 \ 419 | WITH_BLAS=FLAME \ 420 | PYTORCH_BUILD_VERSION="$(../version-string.sh "$TORCH_VERSION")" \ 421 | PYTORCH_BUILD_NUMBER=0 \ 422 | TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \ 423 | python3 setup.py bdist_wheel --dist-dir ../dist 2>&1 \ 424 | | grep -Ev --line-buffered '^(ptxas /tmp/|copying .+/|creating build/)' 425 | SHELL ["/bin/sh", "-c"] 426 | RUN pip3 install --no-cache-dir --upgrade dist/torch*.whl 427 | 428 | RUN python3 -m pip install -U --no-cache-dir \ 429 | packaging setuptools wheel pip 430 | 431 | FROM builder-base AS torchvision-builder 432 | RUN rm ./dist/* 433 | 434 | ## Build torchvision 435 | ARG BUILD_TORCH_VISION_VERSION 436 | ENV TORCH_VISION_VERSION=$BUILD_TORCH_VISION_VERSION 437 | RUN pip3 install --no-cache-dir --upgrade \ 438 | matplotlib numpy typing_extensions requests pillow 439 | 440 | RUN --mount=type=bind,from=torchvision-downloader,source=/git/vision,target=vision/,rw \ 441 | --mount=type=cache,target=/ccache \ 442 | export MAX_JOBS="${BUILD_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 3 32)}" && \ 443 | echo "MAX_JOBS: ${MAX_JOBS}" && \ 444 | export NVCC_APPEND_FLAGS="$(cat /build/nvcc.conf)" && \ 445 | echo "NVCC_APPEND_FLAGS: ${NVCC_APPEND_FLAGS}" && \ 446 | cd vision && \ 447 | mkdir build && \ 448 | ln -s 
/usr/bin/cc build/cc && \ 449 | ln -s /usr/bin/c++ build/c++ && \ 450 | { if [ -d /opt/nccl-tests ]; then \ 451 | export \ 452 | USE_DISTRIBUTED=1 \ 453 | USE_NCCL=1 USE_SYSTEM_NCCL=1 \ 454 | UCC_HOME=${HPCX_UCC_DIR} UCX_HOME=${HPCX_UCX_DIR} \ 455 | USE_NCCL_WITH_UCC=1 \ 456 | USE_UCC=1 USE_SYSTEM_UCC=1; fi; } && \ 457 | USE_CUDNN=1 \ 458 | USE_OPENCV=1 \ 459 | BUILD_TORCH=ON \ 460 | BUILD_TEST=0 \ 461 | CUDA_HOST_COMPILER=cc \ 462 | USE_CUDA=1 \ 463 | FORCE_CUDA=1 \ 464 | USE_NNPACK=1 \ 465 | CC=cc \ 466 | CXX=c++ \ 467 | BUILD_VERSION="$(../version-string.sh "$TORCH_VISION_VERSION")" \ 468 | TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \ 469 | python3 setup.py bdist_wheel --dist-dir ../dist 470 | 471 | FROM builder-base AS torchaudio-builder 472 | RUN rm ./dist/* 473 | 474 | ## Build torchaudio 475 | ARG BUILD_TORCH_AUDIO_VERSION 476 | ENV TORCH_AUDIO_VERSION=$BUILD_TORCH_AUDIO_VERSION 477 | RUN pip3 install --no-cache-dir --upgrade \ 478 | matplotlib numpy typing_extensions requests pillow 479 | 480 | RUN --mount=type=bind,from=torchaudio-downloader,source=/git/audio,target=audio/,rw \ 481 | --mount=type=cache,target=/ccache \ 482 | export MAX_JOBS="${BUILD_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 3 32)}" && \ 483 | echo "MAX_JOBS: ${MAX_JOBS}" && \ 484 | export NVCC_APPEND_FLAGS="$(cat /build/nvcc.conf)" && \ 485 | echo "NVCC_APPEND_FLAGS: ${NVCC_APPEND_FLAGS}" && \ 486 | cd audio && \ 487 | mkdir build && \ 488 | ln -s /usr/bin/cc build/cc && \ 489 | ln -s /usr/bin/c++ build/c++ && \ 490 | { if [ -d /opt/nccl-tests ]; then \ 491 | export \ 492 | USE_DISTRIBUTED=1 \ 493 | USE_NCCL=1 USE_SYSTEM_NCCL=1 \ 494 | UCC_HOME=${HPCX_UCC_DIR} UCX_HOME=${HPCX_UCX_DIR} \ 495 | USE_NCCL_WITH_UCC=1 \ 496 | USE_UCC=1 USE_SYSTEM_UCC=1; fi; } && \ 497 | USE_CUDNN=1 \ 498 | USE_OPENCV=1 \ 499 | BUILD_TORCH=ON \ 500 | BUILD_TEST=0 \ 501 | CUDA_HOST_COMPILER=cc \ 502 | USE_CUDA=1 \ 503 | FORCE_CUDA=1 \ 504 | USE_NNPACK=1 \ 505 | CC=cc \ 506 | CXX=c++ \ 507 | 
BUILD_VERSION="$(../version-string.sh "$TORCH_AUDIO_VERSION")" \ 508 | TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \ 509 | python3 setup.py bdist_wheel --dist-dir ../dist 510 | 511 | FROM builder-base AS transformerengine-builder 512 | RUN rm ./dist/* 513 | 514 | # Build TransformerEngine 515 | ARG BUILD_TRANSFORMERENGINE_CUDA_ARCH_LIST 516 | ENV NVTE_CUDA_ARCHS=$BUILD_TRANSFORMERENGINE_CUDA_ARCH_LIST 517 | 518 | RUN --mount=type=bind,from=transformerengine-downloader,source=/git/TransformerEngine,target=TransformerEngine/,rw \ 519 | --mount=type=cache,target=/ccache \ 520 | export MAX_JOBS=$(($(./effective_cpu_count.sh) + 2)) && \ 521 | export MAX_JOBS="${BUILD_MAX_JOBS:-$MAX_JOBS}" && \ 522 | echo "MAX_JOBS: ${MAX_JOBS}" && \ 523 | export NVCC_APPEND_FLAGS="$(cat /build/nvcc.conf)" && \ 524 | echo "NVCC_APPEND_FLAGS: ${NVCC_APPEND_FLAGS}" && \ 525 | case "${CUDA_VERSION}" in 12.[0123456].*) \ 526 | export NVTE_CUDA_ARCHS="${NVTE_CUDA_ARCHS%;100*}" ;; \ 527 | esac && \ 528 | cd TransformerEngine && \ 529 | if python3 -c "import sys; sys.exit(sys.version_info.minor > 8)"; then \ 530 | sed -i "s/from functools import cache/from functools import lru_cache as cache/g" \ 531 | build_tools/utils.py; \ 532 | fi && \ 533 | python3 setup.py bdist_wheel --dist-dir /build/dist 534 | 535 | FROM builder-base AS flash-attn-builder-base 536 | RUN rm ./dist/* 537 | ENV PYTHONUNBUFFERED=1 538 | ENV FLASH_ATTENTION_FORCE_BUILD=TRUE 539 | ARG BUILD_FLASH_ATTN_MAX_JOBS="" 540 | 541 | COPY <<-"EOT" /build/fa-build.sh 542 | #!/bin/bash 543 | set -eo pipefail; 544 | if [ -n "$1" ]; then cd "$1"; fi; 545 | python3 setup.py bdist_wheel --dist-dir /build/dist \ 546 | | grep -Ev --line-buffered '^ptxas (/tmp/|(info|warning)\s*:)|bytes spill stores' 547 | EOT 548 | RUN chmod 755 /build/fa-build.sh 549 | 550 | FROM flash-attn-builder-base AS flash-attn-builder 551 | 552 | # Build flash-attn 553 | RUN 
--mount=type=bind,from=flash-attn-downloader,source=/git/flash-attention,target=flash-attention/,rw \ 554 | --mount=type=cache,target=/ccache \ 555 | export CC=$(realpath -e ./compiler) \ 556 | MAX_JOBS="${BUILD_FLASH_ATTN_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 8 12)}" && \ 557 | echo "MAX_JOBS: ${MAX_JOBS}" && \ 558 | export NVCC_APPEND_FLAGS="$(cat /build/nvcc.conf)" && \ 559 | echo "NVCC_APPEND_FLAGS: ${NVCC_APPEND_FLAGS}" && \ 560 | cd flash-attention && \ 561 | for EXT_DIR in $(realpath -s -e \ 562 | . \ 563 | csrc/ft_attention \ 564 | csrc/fused_dense_lib \ 565 | csrc/fused_softmax \ 566 | csrc/layer_norm \ 567 | csrc/rotary \ 568 | csrc/xentropy); \ 569 | do /build/fa-build.sh "$EXT_DIR" || exit 1; done 570 | 571 | FROM flash-attn-builder-base AS flash-attn-3-builder 572 | 573 | # Artificially sequence this build stage after the previous one 574 | # to prevent parallelism, because these are both very resource-intensive 575 | RUN --mount=type=bind,from=flash-attn-builder,source=/build,target=/build : 576 | 577 | # Build flash-attn v3 578 | RUN --mount=type=bind,from=flash-attn-3-downloader,source=/git/flash-attention,target=flash-attention/,rw \ 579 | --mount=type=cache,target=/ccache \ 580 | if [ ! 
-d flash-attention/hopper ]; then \ 581 | echo "Not compiling flash-attn v3" && exit 0; \ 582 | fi && \ 583 | export CC=$(realpath -e ./compiler) \ 584 | MAX_JOBS="${BUILD_FLASH_ATTN_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 10 6)}" && \ 585 | echo "MAX_JOBS: ${MAX_JOBS}" && \ 586 | export NVCC_APPEND_FLAGS="$(cat /build/nvcc.conf)" && \ 587 | echo "NVCC_APPEND_FLAGS: ${NVCC_APPEND_FLAGS}" && \ 588 | /build/fa-build.sh flash-attention/hopper 589 | 590 | FROM builder-base AS builder 591 | COPY --link --from=torchaudio-builder /build/dist/ /build/dist/ 592 | COPY --link --from=torchvision-builder /build/dist/ /build/dist/ 593 | COPY --link --from=transformerengine-builder /build/dist/ /build/dist/ 594 | COPY --link --from=flash-attn-builder /build/dist/ /build/dist/ 595 | COPY --link --from=flash-attn-3-builder /build/dist/ /build/dist/ 596 | 597 | ## Build the final torch image. 598 | FROM ${FINAL_BASE_IMAGE} as final-arm64 599 | ARG BUILD_TORCH_CUDA_ARCH_LIST 600 | ENV TORCH_CUDA_ARCH_LIST="9.0${BUILD_TORCH_CUDA_ARCH_LIST#* 9.0}" 601 | 602 | FROM ${FINAL_BASE_IMAGE} as final-amd64 603 | ARG BUILD_TORCH_CUDA_ARCH_LIST 604 | ENV TORCH_CUDA_ARCH_LIST="${BUILD_TORCH_CUDA_ARCH_LIST}" 605 | 606 | FROM final-${TARGETARCH} 607 | ENV DEBIAN_FRONTEND=noninteractive 608 | 609 | # Install core packages 610 | RUN apt-get -qq update && apt-get -qq install -y \ 611 | libncurses5 python3 python3-pip python3-distutils \ 612 | libomp5 libpng16-16 libjpeg-turbo8 libsodium23 \ 613 | curl git apt-utils ssh ca-certificates tmux nano vim-tiny sudo bash \ 614 | rsync htop wget unzip tini && \ 615 | apt-get clean && \ 616 | /usr/bin/python3 -m pip install --no-cache-dir --upgrade pip && \ 617 | update-alternatives --install /usr/bin/python python /usr/bin/python3 1 && \ 618 | update-alternatives --install /usr/bin/pip pip /usr/bin/pip3 1 && \ 619 | update-alternatives --install /usr/bin/vim vim /usr/bin/vim.tiny 1 && \ 620 | ln -s libomp.so.5 "/usr/lib/$(gcc 
-print-multiarch)/libomp.so" && \ 621 | ldconfig 622 | 623 | RUN apt-get -qq update && apt-get -qq install -y --no-install-recommends \ 624 | software-properties-common lsb-release && \ 625 | SETUP_LIBSTDCXX() { \ 626 | apt-add-repository -y ppa:ubuntu-toolchain-r/test 2>&1 \ 627 | | sed -e '/connection timed out/{p; Q1}' && \ 628 | apt-get -qq install -y --no-install-recommends libstdc++6 && \ 629 | apt-get clean; \ 630 | } && \ 631 | { SETUP_LIBSTDCXX || { sleep "$(shuf -i10-20 -n1)" && SETUP_LIBSTDCXX; }; } 632 | 633 | RUN LLVM_VERSION='18' && \ 634 | CODENAME="$(lsb_release -cs)" && \ 635 | wget -qO - 'https://apt.llvm.org/llvm-snapshot.gpg.key' > /etc/apt/trusted.gpg.d/apt.llvm.org.asc && \ 636 | apt-add-repository "deb https://apt.llvm.org/$CODENAME/ llvm-toolchain-$CODENAME-$LLVM_VERSION main" && \ 637 | apt-get -qq install -y --no-install-recommends "libomp5-$LLVM_VERSION" && \ 638 | apt-get clean 639 | 640 | # Install AOCL-BLAS and AOCL-LAPACK 641 | # See: https://www.amd.com/en/developer/aocl/dense.html 642 | ARG AOCL_BASE 643 | COPY --from=aocl-downloader "${AOCL_BASE}" "${AOCL_BASE}" 644 | 645 | # `ldconfig` lets the dynamic linker access AOCL libraries 646 | RUN install -m 0644 -t /etc/ld.so.conf.d "${AOCL_BASE}/aocl.conf" && \ 647 | ldconfig 648 | 649 | ARG BUILD_TORCH_VERSION 650 | ARG BUILD_TORCH_VISION_VERSION 651 | ARG BUILD_TORCH_AUDIO_VERSION 652 | ENV TORCH_VERSION=$BUILD_TORCH_VERSION 653 | ENV TORCH_VISION_VERSION=$BUILD_TORCH_VISION_VERSION 654 | ENV TORCH_AUDIO_VERSION=$BUILD_TORCH_AUDIO_VERSION 655 | # Filter out the 10.0 arch on CUDA versions != 12.8 and != 12.9 656 | ENV TORCH_CUDA_ARCH_LIST="${CUDA_VERSION##12.8.*}||${TORCH_CUDA_ARCH_LIST/ 10.0/}||${TORCH_CUDA_ARCH_LIST}" 657 | ENV TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST#12.9.?}" 658 | ENV TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST#||*||}" 659 | ENV TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST%||*}" 660 | ENV TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST#*||}" 661 | 662 | COPY 
--link --chmod=755 install_cudnn.sh /tmp/install_cudnn.sh 663 | # - libnvjitlink-X-Y only exists for CUDA versions >= 12-0. 664 | # - Don't mess with libnccl2 when using nccl-tests as a base, 665 | # checked via the existence of the directory "/opt/nccl-tests". 666 | RUN export \ 667 | CUDA_MAJOR_VERSION=$(echo $CUDA_VERSION | cut -d. -f1) \ 668 | CUDA_MINOR_VERSION=$(echo $CUDA_VERSION | cut -d. -f2) && \ 669 | export \ 670 | CUDA_PACKAGE_VERSION="${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}" && \ 671 | apt-get -qq update && \ 672 | apt-get -qq install --no-upgrade -y \ 673 | libcurand-${CUDA_PACKAGE_VERSION} \ 674 | libcufft-${CUDA_PACKAGE_VERSION} \ 675 | libcublas-${CUDA_PACKAGE_VERSION} \ 676 | cuda-nvrtc-${CUDA_PACKAGE_VERSION} \ 677 | libcusparse-${CUDA_PACKAGE_VERSION} \ 678 | libcusolver-${CUDA_PACKAGE_VERSION} \ 679 | libcufile-${CUDA_PACKAGE_VERSION} \ 680 | cuda-cupti-${CUDA_PACKAGE_VERSION} \ 681 | libnvjpeg-${CUDA_PACKAGE_VERSION} \ 682 | libnvtoolsext1 && \ 683 | { if [ $CUDA_MAJOR_VERSION -ge 12 ]; then \ 684 | apt-get -qq install --no-upgrade -y libnvjitlink-${CUDA_PACKAGE_VERSION}; fi; } && \ 685 | { if [ ! -d /opt/nccl-tests ]; then \ 686 | export NCCL_PACKAGE_VERSION="2.*+cuda${CUDA_MAJOR_VERSION}.${CUDA_MINOR_VERSION}" && \ 687 | apt-get -qq install --no-upgrade -y "libnccl2=$NCCL_PACKAGE_VERSION"; fi; } && \ 688 | /tmp/install_cudnn.sh "$CUDA_VERSION" runtime && \ 689 | rm /tmp/install_cudnn.sh && \ 690 | apt-get clean && \ 691 | ldconfig 692 | 693 | 694 | WORKDIR /usr/src/app 695 | 696 | # Install custom PyTorch wheels. 697 | RUN --mount=type=bind,from=builder,source=/build/dist,target=. \ 698 | pip3 install --no-cache-dir -U numpy packaging && \ 699 | pip3 install --no-cache-dir -U ./*.whl 700 | 701 | # Make a symlink to flash-attn v3 where TransformerEngine expects it, 702 | # and modify the installation record so that pip uninstall knows how to 703 | # fully remove it. 
704 | RUN <<-"EOT" python3 705 | #!/bin/env python3 706 | from base64 import urlsafe_b64encode as b64 707 | from hashlib import sha256 708 | from importlib import metadata 709 | from pathlib import Path 710 | from py_compile import compile 711 | 712 | dist = metadata.distribution("flashattn-hopper") 713 | p = dist.locate_file("flash_attn_interface.py") 714 | print("flash_attn_interface:", p) 715 | root = p.parent 716 | 717 | if not p.exists(): 718 | raise SystemExit("flash_attn_interface not found") 719 | if not p.is_file(): 720 | raise SystemExit("flash_attn_interface path is not a file") 721 | 722 | d = root / "flashattn_hopper" 723 | if d.exists(): 724 | raise SystemExit(f'"{d}" already exists') 725 | 726 | d.mkdir(mode=0o755, parents=False, exist_ok=False) 727 | new = d / p.name 728 | new.symlink_to(p) 729 | print(f"Created new symlink at {new}") 730 | 731 | compiled = Path(compile(new)) 732 | 733 | 734 | def record_entry(path: Path) -> str: 735 | content = path.read_bytes() 736 | digest = b64(sha256(content).digest()).rstrip(b"=").decode() 737 | package_path = path.relative_to(root).as_posix() 738 | return f"{package_path},sha256={digest},{len(content):d}\r\n" 739 | 740 | 741 | for f in dist.files: 742 | if f.match("flashattn?hopper-*.dist-info/RECORD"): 743 | with f.locate().open("a", encoding="utf-8", newline="") as record: 744 | for added in (new, compiled): 745 | record.write(record_entry(added)) 746 | break 747 | EOT 748 | -------------------------------------------------------------------------------- /torch/compiler_wrapper.f95: -------------------------------------------------------------------------------- 1 | #ifndef WRAPPER_NATIVE 2 | #define WRAPPER_NATIVE "skylake" 3 | #endif 4 | 5 | #ifndef WRAPPER_CC 6 | #define WRAPPER_CC "gcc" 7 | #endif 8 | 9 | #ifndef WRAPPER_AVX 10 | #define WRAPPER_AVX "AVX256" 11 | #endif 12 | 13 | PROGRAM compiler_wrapper 14 | ! Wraps C compiler invocations, 15 | ! 
replacing -D__AVX512__, -D__AVX256__, and -D__SCALAR__ preprocessor definitions 16 | ! with -D__<WRAPPER_AVX>__, and -march=native with -march=<WRAPPER_NATIVE>, 17 | ! for better reproducibility and compatibility. 18 | IMPLICIT NONE 19 | INTEGER :: i, exitcode = 0, full_length = 0, truncated = 0 20 | CHARACTER(len=:), ALLOCATABLE :: arg, command 21 | ALLOCATE(CHARACTER(len=128) :: arg) 22 | command = WRAPPER_CC 23 | 24 | DO i = 1, COMMAND_ARGUMENT_COUNT() 25 | DO 26 | CALL GET_COMMAND_ARGUMENT(i, arg, full_length, truncated) 27 | IF (truncated == 0) THEN 28 | EXIT 29 | ELSE IF (truncated == -1) THEN 30 | DEALLOCATE(arg) 31 | ALLOCATE(CHARACTER(len=full_length) :: arg) 32 | ELSE 33 | CALL EXIT(95) 34 | END IF 35 | END DO 36 | IF (arg == "-march=native") THEN 37 | command = command // (" '-march=" // WRAPPER_NATIVE // "'") 38 | ELSE IF ( & 39 | arg == "-D__AVX512__" & 40 | .OR. arg == "-D__AVX256__" & 41 | .OR. arg == "-D__SCALAR__" & 42 | ) THEN 43 | #ifndef WRAPPER_NO_AVX 44 | command = command // (" '-D__" // WRAPPER_AVX // "__'") 45 | #endif 46 | ELSE 47 | command = command // shell_escaped(arg) 48 | END IF 49 | END DO 50 | CALL SYSTEM(command, exitcode) 51 | IF (exitcode > 255) THEN 52 | exitcode = MAX(IAND(exitcode, 255), 1) 53 | END IF 54 | CALL EXIT(exitcode) 55 | 56 | 57 | CONTAINS 58 | FUNCTION shell_escaped(str) RESULT(out) 59 | ! Turns [str] into [ 'str'] and replaces all 60 | ! internal ['] characters with ['"'"'] 61 | IMPLICIT NONE 62 | CHARACTER(len=*), INTENT(IN) :: str 63 | CHARACTER(len=:), ALLOCATABLE :: out 64 | INTEGER :: old_i, out_i, old_len, out_len 65 | 66 | old_len = LEN_TRIM(str) 67 | ! Figure out the new length to allocate by scanning `str`. 68 | ! This always needs to add at least [ '] at the beginning 69 | ! and ['] at the end, so the length increases by at least 3. 70 | out_len = old_len + 3 71 | DO old_i = 1, old_len 72 | IF (str(old_i:old_i) == "'") THEN 73 | out_len = out_len + 4 74 | END IF 75 | END DO 76 | ALLOCATE(CHARACTER(len=out_len) :: out) 77 | 78 | ! 
Copy over the string, performing necessary escapes. 79 | out(1:2) = " '" 80 | out_i = 3 81 | DO old_i = 1, old_len 82 | IF (str(old_i:old_i) == "'") THEN 83 | ! Escape internal single-quotes 84 | out(out_i:out_i + 4) = '''"''"''' 85 | out_i = out_i + 5 86 | ELSE 87 | ! No escaping needed 88 | out(out_i:out_i) = str(old_i:old_i) 89 | out_i = out_i + 1 90 | END IF 91 | END DO 92 | out(out_i:out_i) = "'" 93 | END FUNCTION 94 | END PROGRAM 95 | -------------------------------------------------------------------------------- /torch/effective_cpu_count.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | CPU_QUOTA() ( 4 | CGROUP='/sys/fs/cgroup'; 5 | CGROUP_V1="$CGROUP/cpu,cpuacct"; 6 | CGROUP_V1_QUOTA="$CGROUP_V1/cpu.cfs_quota_us"; 7 | CGROUP_V1_PERIOD="$CGROUP_V1/cpu.cfs_period_us"; 8 | CGROUP_V2="$CGROUP/user.slice/cpu.max"; 9 | if [ ! -d "$CGROUP" ]; then 10 | return 1; 11 | elif [ -f "$CGROUP_V1_QUOTA" ] && [ -f "$CGROUP_V1_PERIOD" ]; then 12 | IFS='' read -r QUOTA 2> /dev/null < "$CGROUP_V1_QUOTA" || return 1; 13 | IFS='' read -r PERIOD 2> /dev/null < "$CGROUP_V1_PERIOD" || return 1; 14 | elif [ -f "$CGROUP_V2" ]; then 15 | IFS=' ' read -r QUOTA PERIOD 2> /dev/null < "$CGROUP_V2" || return 1; 16 | else 17 | return 1; 18 | fi; 19 | 20 | if [ "$QUOTA" -gt 0 ] 2> /dev/null && [ "$PERIOD" -gt 0 ] 2> /dev/null; then 21 | echo $((QUOTA >= PERIOD ? QUOTA / PERIOD : 1)); # whole CPUs, rounded down, but never 0 for a fractional (<1 CPU) quota 22 | return 0; 23 | else 24 | return 1; 25 | fi; 26 | ) 27 | 28 | EFFECTIVE_CPU_COUNT() { 29 | CPU_QUOTA || getconf _NPROCESSORS_ONLN; 30 | } 31 | 32 | EFFECTIVE_CPU_COUNT; 33 | -------------------------------------------------------------------------------- /torch/install_cudnn.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | CUDA_VERSION="$1"; 4 | if [ -z "$CUDA_VERSION" ]; then 5 | exit 14; 6 | fi; 7 | 8 | INSTALL_DEV="$2"; 9 | if [ "$INSTALL_DEV" = "dev" ]; then 10 | echo "Ensuring installation of cuDNN (dev)"; 11 
| DEV_SUFFIX="-dev"; 12 | DEV_PREFIX=""; 13 | elif [ "$INSTALL_DEV" = "runtime" ]; then 14 | echo "Ensuring installation of cuDNN (runtime)"; 15 | DEV_SUFFIX=""; 16 | DEV_PREFIX="lib"; 17 | else 18 | exit 15; 19 | fi; 20 | 21 | CHECK_VERSION() { 22 | dpkg-query --status "$1" 2>/dev/null \ 23 | | sed -ne 's/Version: //p' \ 24 | | grep .; 25 | } 26 | 27 | CUDA_MAJOR_VERSION=$(echo "$CUDA_VERSION" | cut -d. -f1); 28 | LIBCUDNN_VER="$( 29 | CHECK_VERSION "libcudnn8${DEV_SUFFIX}" || \ 30 | CHECK_VERSION "libcudnn9${DEV_SUFFIX}-cuda-${CUDA_MAJOR_VERSION}" || \ 31 | :; 32 | )" || exit 16; 33 | 34 | if [ -z "$LIBCUDNN_VER" ]; then 35 | apt-get -qq update && \ 36 | apt-get -qq install --no-upgrade -y "${DEV_PREFIX}cudnn9-cuda-${CUDA_MAJOR_VERSION}" && \ 37 | apt-get clean && \ 38 | ldconfig; 39 | else 40 | echo "Found cuDNN version ${LIBCUDNN_VER}" 41 | fi; 42 | -------------------------------------------------------------------------------- /torch/scale.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | set -e; 4 | 5 | VAL="$1"; 6 | DIVISOR="$2"; 7 | MAXIMUM="$3"; 8 | 9 | [ -n "$VAL" ]; 10 | 11 | if [ -n "$DIVISOR" ]; 12 | then VAL="$((( $VAL + $DIVISOR - 1 ) / $DIVISOR))"; 13 | fi; 14 | 15 | if [ -n "$MAXIMUM" ]; 16 | then VAL="$((VAL > MAXIMUM ? MAXIMUM : VAL))"; 17 | fi; 18 | 19 | echo "$VAL"; 20 | -------------------------------------------------------------------------------- /torch/torchaudio-cu125-pr3811.patch: -------------------------------------------------------------------------------- 1 | From 7797f83e1d66ff78872763e1da3a5fb2f0534c40 Mon Sep 17 00:00:00 2001 2 | From: Markus Hennerbichler 3 | Date: Mon, 15 Jul 2024 14:07:13 +0100 4 | Subject: [PATCH] Fix CUDA 12.5 build 5 | 6 | CUDA 12.5 removed the FLT_MAX symbol. 7 | This was previously used without being explicitly imported. 
8 | FLT_MAX is defined in <float.h>, including this header fixes the issue 9 | --- 10 | src/libtorchaudio/cuctc/src/ctc_prefix_decoder_kernel_v2.cu | 1 + 11 | 1 file changed, 1 insertion(+) 12 | 13 | diff --git a/src/libtorchaudio/cuctc/src/ctc_prefix_decoder_kernel_v2.cu b/src/libtorchaudio/cuctc/src/ctc_prefix_decoder_kernel_v2.cu 14 | index 4ca8f1bf24..e6192155a2 100644 15 | --- a/src/libtorchaudio/cuctc/src/ctc_prefix_decoder_kernel_v2.cu 16 | +++ b/src/libtorchaudio/cuctc/src/ctc_prefix_decoder_kernel_v2.cu 17 | @@ -24,6 +24,7 @@ 18 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 19 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 20 | #include <algorithm> 21 | +#include <float.h> 22 | #include "ctc_fast_divmod.cuh" 23 | #include "cub/cub.cuh" 24 | #include "device_data_wrap.h" 25 | -------------------------------------------------------------------------------- /vllm-tensorizer/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG BASE_IMAGE="ghcr.io/coreweave/ml-containers/torch-extras:es-22.04-58a49a2-base-cuda12.1.1-torch2.1.2-vision0.16.2-audio2.1.2-flash_attn2.4.2" 2 | 3 | FROM scratch as freezer 4 | WORKDIR / 5 | COPY --chmod=755 freeze.sh / 6 | 7 | FROM ${BASE_IMAGE} as builder-base 8 | 9 | ARG MAX_JOBS="" 10 | 11 | # Dependencies requiring NVCC are built ahead of time in a separate stage 12 | # so that the ~2 GiB dev library installations don't have to be included 13 | # in the final image. 14 | RUN export \ 15 | CUDA_MAJOR_VERSION=$(echo $CUDA_VERSION | cut -d. -f1) \ 16 | CUDA_MINOR_VERSION=$(echo $CUDA_VERSION | cut -d. 
-f2) && \ 17 | export \ 18 | CUDA_PACKAGE_VERSION="${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}" && \ 19 | apt-get -qq update && apt-get install -y --no-install-recommends \ 20 | cuda-nvcc-${CUDA_PACKAGE_VERSION} \ 21 | cuda-nvml-dev-${CUDA_PACKAGE_VERSION} \ 22 | libcurand-dev-${CUDA_PACKAGE_VERSION} \ 23 | libcublas-dev-${CUDA_PACKAGE_VERSION} \ 24 | libcusparse-dev-${CUDA_PACKAGE_VERSION} \ 25 | libcusolver-dev-${CUDA_PACKAGE_VERSION} \ 26 | cuda-nvprof-${CUDA_PACKAGE_VERSION} \ 27 | cuda-profiler-api-${CUDA_PACKAGE_VERSION} \ 28 | libaio-dev \ 29 | ninja-build && \ 30 | apt-get clean 31 | 32 | RUN ldconfig 33 | 34 | RUN apt-get -qq update && \ 35 | apt-get -qq install -y --no-install-recommends \ 36 | python3-pip git && \ 37 | apt-get clean && \ 38 | pip3 install -U --no-cache-dir pip packaging setuptools wheel 39 | 40 | FROM alpine/git:2.36.3 as vllm-downloader 41 | WORKDIR /git 42 | ARG COMMIT_HASH 43 | RUN git clone --filter=blob:none --depth 1 --no-single-branch --no-checkout \ 44 | https://github.com/coreweave/vllm.git && \ 45 | cd vllm && \ 46 | git checkout "${COMMIT_HASH}" && \ 47 | git submodule update --init --recursive --jobs 8 \ 48 | --depth 1 --filter=blob:none 49 | 50 | FROM builder-base as vllm-builder 51 | WORKDIR /workspace 52 | RUN --mount=type=bind,from=vllm-downloader,source=/git/vllm,target=/workspace,rw \ 53 | --mount=type=bind,from=freezer,target=/tmp/frozen,rw \ 54 | /tmp/frozen/freeze.sh torch torchaudio torchvision xformers > /tmp/frozen/constraints.txt && \ 55 | LIBRARY_PATH="/usr/local/cuda/lib64/stubs${LIBRARY_PATH:+:$LIBRARY_PATH}" \ 56 | python3 -m pip wheel -w /wheels \ 57 | -v --no-cache-dir --no-build-isolation --no-deps \ 58 | -c /tmp/frozen/constraints.txt \ 59 | ./ 60 | 61 | WORKDIR /wheels 62 | 63 | FROM ${BASE_IMAGE} as base 64 | 65 | WORKDIR /workspace 66 | 67 | RUN apt-get -qq update && apt-get install -y --no-install-recommends curl && apt-get clean && rm -rf /var/lib/apt/lists/* 68 | 69 | RUN
--mount=type=bind,from=freezer,target=/tmp/frozen \ 70 | /tmp/frozen/freeze.sh torch torchaudio torchvision xformers > /tmp/constraints.txt 71 | 72 | RUN python3 -m pip install --no-cache-dir \ 73 | "fschat[model_worker] == 0.2.30" "triton == 2.1.0" \ 74 | -c /tmp/constraints.txt 75 | 76 | RUN --mount=type=bind,from=vllm-builder,source=/wheels,target=/tmp/wheels \ 77 | python3 -m pip install --no-cache-dir /tmp/wheels/*.whl -c /tmp/constraints.txt && \ 78 | rm /tmp/constraints.txt 79 | 80 | 81 | EXPOSE 8080 -------------------------------------------------------------------------------- /vllm-tensorizer/freeze.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # 3 | # Usage: freeze.sh PACKAGE... 4 | # 5 | # Prints one `name==version` line, as reported by `pip list --format freeze`, 6 | # for each named package that is installed (names match case-insensitively). 7 | # Packages that are not installed produce no output and are not an error. 8 | 9 | # Join the requested names into a single regex alternation: a|b|c 10 | PATTERN=""; 11 | for DEP in "$@"; do { 12 | PATTERN="${PATTERN:+$PATTERN|}${DEP}"; 13 | }; done; 14 | # Anchor on the `==` separator so only exact names match; a `\b` anchor 15 | # would also accept e.g. `torch-foo` or `torch.bar` when asked for `torch`. 16 | PATTERN="^(${PATTERN})=="; 17 | 18 | # `|| :` keeps the exit status at zero when grep selects nothing. 19 | python3 -m pip list --format freeze --disable-pip-version-check \ 20 | | { grep -iE "${PATTERN}" || :; }; 21 | --------------------------------------------------------------------------------