├── .devcontainer
└── devcontainer.json
├── .github
├── ISSUE_TEMPLATE
│ └── bug-report.yml
├── PULL_REQUEST_TEMPLATE.md
└── workflows
│ ├── build-docker-images-release.yml
│ ├── build_and_run_tests.yml
│ ├── build_docker_images.yml
│ ├── build_documentation.yml
│ ├── build_pr_documentation.yml
│ ├── fp8_runner.yml
│ ├── gaudi3_scheduled.yml
│ ├── integration_tests.yml
│ ├── nightly.yml
│ ├── pr_style_bot.yml
│ ├── quality.yml
│ ├── run_merge_tests.yml
│ ├── self_hosted_integration_tests.yml
│ ├── stale.yml
│ ├── test.yml
│ ├── test_imports.yml
│ ├── trufflehog.yml
│ └── upload_pr_documentation.yml
├── .gitignore
├── .pre-commit-config.yaml
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── Makefile
├── README.md
├── benchmarks
├── README.md
├── big_model_inference
│ ├── README.md
│ ├── big_model_inference.py
│ └── measures_util.py
├── fp8
│ ├── ms_amp
│ │ ├── Dockerfile
│ │ ├── ddp.py
│ │ ├── distrib_deepspeed.py
│ │ ├── fp8_utils.py
│ │ └── non_distributed.py
│ ├── torchao
│ │ ├── Dockerfile
│ │ ├── README.md
│ │ ├── ddp.py
│ │ ├── distrib_deepspeed.py
│ │ ├── fp8_utils.py
│ │ ├── fsdp.py
│ │ └── non_distributed.py
│ └── transformer_engine
│ │ ├── Dockerfile
│ │ ├── README.md
│ │ ├── ddp.py
│ │ ├── distrib_deepspeed.py
│ │ ├── fp8_utils.py
│ │ ├── fsdp.py
│ │ └── non_distributed.py
├── fsdp2
│ ├── README.md
│ ├── imgs
│ │ ├── allocated_memory.png
│ │ └── reserved_memory.png
│ ├── main.py
│ ├── measure_utils.py
│ ├── utils.py
│ └── visualize.py
└── torch.compile
│ ├── README.md
│ ├── imgs
│ ├── compilation_time.png
│ └── speedup_factor.png
│ └── regional_compilation.py
├── docker
├── README.md
├── accelerate-cpu
│ └── Dockerfile
├── accelerate-gpu-deepspeed
│ └── Dockerfile
└── accelerate-gpu
│ └── Dockerfile
├── docs
├── Makefile
├── README.md
└── source
│ ├── _toctree.yml
│ ├── basic_tutorials
│ ├── execution.md
│ ├── install.md
│ ├── launch.md
│ ├── migration.md
│ ├── notebook.md
│ ├── overview.md
│ ├── tpu.md
│ └── troubleshooting.md
│ ├── concept_guides
│ ├── big_model_inference.md
│ ├── deferring_execution.md
│ ├── fsdp1_vs_fsdp2.md
│ ├── fsdp_and_deepspeed.md
│ ├── gradient_synchronization.md
│ ├── internal_mechanism.md
│ ├── low_precision_training.md
│ ├── performance.md
│ └── training_tpu.md
│ ├── imgs
│ ├── accelerate_logo.png
│ ├── course_banner.png
│ └── profile_export.png
│ ├── index.md
│ ├── package_reference
│ ├── accelerator.md
│ ├── big_modeling.md
│ ├── cli.md
│ ├── deepspeed.md
│ ├── fp8.md
│ ├── fsdp.md
│ ├── inference.md
│ ├── kwargs.md
│ ├── launchers.md
│ ├── logging.md
│ ├── megatron_lm.md
│ ├── state.md
│ ├── torch_wrappers.md
│ ├── tracking.md
│ └── utilities.md
│ ├── quicktour.md
│ └── usage_guides
│ ├── big_modeling.md
│ ├── checkpoint.md
│ ├── compilation.md
│ ├── ddp_comm_hook.md
│ ├── deepspeed.md
│ ├── deepspeed_multiple_model.md
│ ├── distributed_inference.md
│ ├── explore.md
│ ├── fsdp.md
│ ├── gaudi.md
│ ├── gradient_accumulation.md
│ ├── ipex.md
│ ├── local_sgd.md
│ ├── low_precision_training.md
│ ├── megatron_lm.md
│ ├── model_size_estimator.md
│ ├── mps.md
│ ├── profiler.md
│ ├── quantization.md
│ ├── sagemaker.md
│ ├── tracking.md
│ └── training_zoo.md
├── examples
├── README.md
├── by_feature
│ ├── README.md
│ ├── automatic_gradient_accumulation.py
│ ├── checkpointing.py
│ ├── cross_validation.py
│ ├── ddp_comm_hook.py
│ ├── deepspeed_with_config_support.py
│ ├── early_stopping.py
│ ├── fsdp_with_peak_mem_tracking.py
│ ├── gradient_accumulation.py
│ ├── gradient_accumulation_for_autoregressive_models.py
│ ├── local_sgd.py
│ ├── megatron_lm_gpt_pretraining.py
│ ├── memory.py
│ ├── multi_process_metrics.py
│ ├── profiler.py
│ ├── schedule_free.py
│ └── tracking.py
├── complete_cv_example.py
├── complete_nlp_example.py
├── config_yaml_templates
│ ├── README.md
│ ├── deepspeed.yaml
│ ├── fp8.yaml
│ ├── fsdp.yaml
│ ├── multi_gpu.yaml
│ ├── multi_node.yaml
│ ├── run_me.py
│ └── single_gpu.yaml
├── cv_example.py
├── deepspeed_config_templates
│ ├── zero_stage1_config.json
│ ├── zero_stage2_config.json
│ ├── zero_stage2_offload_config.json
│ ├── zero_stage3_config.json
│ └── zero_stage3_offload_config.json
├── inference
│ ├── distributed
│ │ ├── README.md
│ │ ├── distributed_image_generation.py
│ │ ├── distributed_speech_generation.py
│ │ ├── florence2.py
│ │ ├── llava_next_video.py
│ │ ├── phi2.py
│ │ └── stable_diffusion.py
│ └── pippy
│ │ ├── README.md
│ │ ├── bert.py
│ │ ├── gpt2.py
│ │ ├── llama.py
│ │ ├── requirements.txt
│ │ └── t5.py
├── multigpu_remote_launcher.py
├── nlp_example.py
├── requirements.txt
└── slurm
│ ├── fsdp_config.yaml
│ ├── submit_multicpu.sh
│ ├── submit_multigpu.sh
│ ├── submit_multinode.sh
│ └── submit_multinode_fsdp.sh
├── manim_animations
├── big_model_inference
│ ├── stage_1.py
│ ├── stage_2.py
│ ├── stage_3.py
│ ├── stage_4.py
│ └── stage_5.py
└── dataloaders
│ ├── stage_0.py
│ ├── stage_1.py
│ ├── stage_2.py
│ ├── stage_3.py
│ ├── stage_4.py
│ ├── stage_5.py
│ ├── stage_6.py
│ └── stage_7.py
├── pyproject.toml
├── setup.py
├── src
└── accelerate
│ ├── __init__.py
│ ├── accelerator.py
│ ├── big_modeling.py
│ ├── checkpointing.py
│ ├── commands
│ ├── __init__.py
│ ├── accelerate_cli.py
│ ├── config
│ │ ├── __init__.py
│ │ ├── cluster.py
│ │ ├── config.py
│ │ ├── config_args.py
│ │ ├── config_utils.py
│ │ ├── default.py
│ │ ├── sagemaker.py
│ │ └── update.py
│ ├── env.py
│ ├── estimate.py
│ ├── launch.py
│ ├── menu
│ │ ├── __init__.py
│ │ ├── cursor.py
│ │ ├── helpers.py
│ │ ├── input.py
│ │ ├── keymap.py
│ │ └── selection_menu.py
│ ├── merge.py
│ ├── test.py
│ ├── to_fsdp2.py
│ ├── tpu.py
│ └── utils.py
│ ├── data_loader.py
│ ├── hooks.py
│ ├── inference.py
│ ├── launchers.py
│ ├── local_sgd.py
│ ├── logging.py
│ ├── memory_utils.py
│ ├── optimizer.py
│ ├── scheduler.py
│ ├── state.py
│ ├── test_utils
│ ├── __init__.py
│ ├── examples.py
│ ├── scripts
│ │ ├── __init__.py
│ │ ├── external_deps
│ │ │ ├── __init__.py
│ │ │ ├── test_checkpointing.py
│ │ │ ├── test_ds_multiple_model.py
│ │ │ ├── test_metrics.py
│ │ │ ├── test_peak_memory_usage.py
│ │ │ ├── test_performance.py
│ │ │ ├── test_pippy.py
│ │ │ └── test_zero3_integration.py
│ │ ├── test_cli.py
│ │ ├── test_ddp_comm_hook.py
│ │ ├── test_distributed_data_loop.py
│ │ ├── test_merge_weights.py
│ │ ├── test_notebook.py
│ │ ├── test_ops.py
│ │ ├── test_script.py
│ │ └── test_sync.py
│ ├── testing.py
│ └── training.py
│ ├── tracking.py
│ └── utils
│ ├── __init__.py
│ ├── ao.py
│ ├── bnb.py
│ ├── constants.py
│ ├── dataclasses.py
│ ├── deepspeed.py
│ ├── environment.py
│ ├── fsdp_utils.py
│ ├── imports.py
│ ├── launch.py
│ ├── megatron_lm.py
│ ├── memory.py
│ ├── modeling.py
│ ├── offload.py
│ ├── operations.py
│ ├── other.py
│ ├── random.py
│ ├── rich.py
│ ├── torch_xla.py
│ ├── tqdm.py
│ ├── transformer_engine.py
│ └── versions.py
├── tests
├── __init__.py
├── deepspeed
│ ├── ds_config_zero2.json
│ ├── ds_config_zero2_model_only.json
│ ├── ds_config_zero3.json
│ ├── ds_config_zero3_model_only.json
│ ├── test_deepspeed.py
│ └── test_deepspeed_multiple_model.py
├── fsdp
│ └── test_fsdp.py
├── test_accelerator.py
├── test_big_modeling.py
├── test_cli.py
├── test_compile.py
├── test_configs
│ ├── 0_11_0.yaml
│ ├── 0_12_0.yaml
│ ├── 0_28_0_mpi.yaml
│ ├── 0_30_0_sagemaker.yaml
│ ├── 0_34_0_fp8.yaml
│ ├── README.md
│ ├── invalid_keys.yaml
│ ├── latest.yaml
│ ├── latest_fsdp.yaml
│ └── validate_launch_cmd.yaml
├── test_cpu.py
├── test_data_loader.py
├── test_examples.py
├── test_fp8.py
├── test_grad_sync.py
├── test_hooks.py
├── test_imports.py
├── test_kwargs_handlers.py
├── test_launch.py
├── test_load_checkpoint_and_dispatch_with_broadcast.py
├── test_logging.py
├── test_memory_utils.py
├── test_metrics.py
├── test_modeling_utils.py
├── test_multigpu.py
├── test_offload.py
├── test_optimizer.py
├── test_quantization.py
├── test_sagemaker.py
├── test_samples
│ ├── MRPC
│ │ ├── dev.csv
│ │ └── train.csv
│ └── test_command_file.sh
├── test_scheduler.py
├── test_state_checkpointing.py
├── test_tpu.py
├── test_tracking.py
├── test_utils.py
├── tp
│ └── test_tp.py
└── xla_spawn.py
└── utils
├── log_reports.py
└── stale.py
/.devcontainer/devcontainer.json:
--------------------------------------------------------------------------------
1 | // File only needed for VSCode users to have proper Docker based interpreters
2 | {
3 | "name": "accelerate_dev_environment",
4 | "build": {
5 | // ACTION NEEDED: comment/uncomment the relevant line depending on whether you are in a CPU/GPU environment
6 | "dockerfile": "../docker/accelerate-cpu/Dockerfile"
7 | // "dockerfile": "../docker/accelerate-gpu/Dockerfile"
8 | },
9 | "runArgs": [
10 | // ACTION NEEDED: uncomment the next line if your local machine has GPUs available
11 | // "--gpus", "all",
12 | // Enable the docker container to access system resources
13 | "--ipc", "host"
14 | ],
15 | "remoteEnv": {
16 | "PYTHONPATH": "${containerEnv:PATH}:${containerWorkspaceFolder}"
17 | },
18 | "customizations": {
19 | "vscode": {
20 | "extensions": [
21 | // Ensure we have IntelliSense in VSCode when running inside container
22 | "ms-python.python"
23 | ]
24 | }
25 | },
26 | "workspaceFolder": "/workspaces/accelerate",
27 | // Need git for VSCode to color code modifications. Only runs when building environment.
28 | "onCreateCommand": "apt-get update && apt-get install -y git && pip install -e '.[dev]'"
29 | }
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug-report.yml:
--------------------------------------------------------------------------------
1 | name: "\U0001F41B Bug Report"
2 | description: Submit a bug report to help us improve Accelerate
3 | body:
4 | - type: markdown
5 | attributes:
6 | value: |
7 | Thanks for taking the time to submit a bug report! 🐛
8 | If this is not a bug in the Accelerate library itself, but instead a general question about your code or about how to use the library, please use the [forums](https://discuss.huggingface.co/c/accelerate/18).
9 |
10 | - type: textarea
11 | id: system-info
12 | attributes:
13 | label: System Info
14 | description: Please share your accelerate configuration with us. You can run the command `accelerate env` and copy-paste its outputs below
15 | render: Shell
16 | placeholder: accelerate version, OS, python version, numpy version, torch version, and accelerate's configuration
17 | validations:
18 | required: true
19 |
20 | - type: checkboxes
21 | id: information-scripts-examples
22 | attributes:
23 | label: Information
24 | description: 'The problem arises when using:'
25 | options:
26 | - label: "The official example scripts"
27 | - label: "My own modified scripts"
28 |
29 | - type: checkboxes
30 | id: information-tasks
31 | attributes:
32 | label: Tasks
33 | description: "The tasks I am working on are:"
34 | options:
35 | - label: "One of the scripts in the examples/ folder of Accelerate or an officially supported `no_trainer` script in the `examples` folder of the `transformers` repo (such as `run_no_trainer_glue.py`)"
36 | - label: "My own task or dataset (give details below)"
37 |
38 | - type: textarea
39 | id: reproduction
40 | validations:
41 | required: true
42 | attributes:
43 | label: Reproduction
44 | description: |
45 | Please provide a code sample that reproduces the problem you ran into. It can be a Colab link or just a code snippet.
46 | If you have code snippets, error messages, stack traces please provide them here as well.
47 | Important! Use code tags to correctly format your code. See https://help.github.com/en/github/writing-on-github/creating-and-highlighting-code-blocks#syntax-highlighting
48 | Do not use screenshots, as they are hard to read and (more importantly) don't allow others to copy-and-paste your code.
49 |
50 | placeholder: |
51 | Steps to reproduce the behavior:
52 |
53 | 1.
54 | 2.
55 | 3.
56 |
57 | - type: textarea
58 | id: expected-behavior
59 | validations:
60 | required: true
61 | attributes:
62 | label: Expected behavior
63 | description: "A clear and concise description of what you would expect to happen."
64 |
--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | # What does this PR do?
2 |
3 |
12 |
13 |
14 |
15 | Fixes # (issue)
16 |
17 |
18 | ## Before submitting
19 | - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case).
20 | - [ ] Did you read the [contributor guideline](https://github.com/huggingface/accelerate/blob/main/CONTRIBUTING.md#submitting-a-pull-request-pr),
21 | Pull Request section?
22 | - [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link
23 | to it if that's the case.
24 | - [ ] Did you make sure to update the documentation with your changes? Here are the
25 | [documentation guidelines](https://github.com/huggingface/accelerate/tree/main/docs), and
26 | [here are tips on formatting docstrings](https://github.com/huggingface/accelerate/tree/main/docs#writing-documentation---specification).
27 | - [ ] Did you write any new necessary tests?
28 |
29 |
30 | ## Who can review?
31 |
32 | Anyone in the community is free to review the PR once the tests have passed. Feel free to tag
33 | members/contributors who may be interested in your PR.
34 |
35 |
--------------------------------------------------------------------------------
/.github/workflows/build-docker-images-release.yml:
--------------------------------------------------------------------------------
1 | name: Build Docker images (releases)
2 |
3 | on:
4 | workflow_dispatch:
5 | release:
6 | types: [published]
7 |
8 | concurrency:
9 | group: docker-image-builds
10 | cancel-in-progress: false
11 |
12 | jobs:
13 | get-version:
14 | runs-on: ubuntu-latest
15 | outputs:
16 | version: ${{ steps.step1.outputs.version }}
17 | steps:
18 | - uses: actions/checkout@v4
19 | - id: step1
20 | run: echo "version=$(python setup.py --version)" >> $GITHUB_OUTPUT
21 |
22 | version-cpu:
23 | name: "Latest Accelerate CPU [version]"
24 | runs-on:
25 | group: aws-general-8-plus
26 | needs: get-version
27 | steps:
28 | - name: Set up Docker Buildx
29 | uses: docker/setup-buildx-action@v2
30 | - name: Login to DockerHub
31 | uses: docker/login-action@v2
32 | with:
33 | username: ${{ secrets.DOCKERHUB_USERNAME }}
34 | password: ${{ secrets.DOCKERHUB_PASSWORD }}
35 |
36 | - name: Build and Push CPU
37 | uses: docker/build-push-action@v4
38 | with:
39 | file: docker/accelerate-cpu/Dockerfile
40 | push: true
41 | tags: huggingface/accelerate:cpu-release-${{ needs.get-version.outputs.version }}
42 |
43 | version-cuda:
44 | name: "Latest Accelerate GPU [version]"
45 | runs-on:
46 | group: aws-g6-4xlarge-plus
47 | needs: get-version
48 | steps:
49 | - name: Set up Docker Buildx
50 | uses: docker/setup-buildx-action@v2
51 | - name: Login to DockerHub
52 | uses: docker/login-action@v2
53 | with:
54 | username: ${{ secrets.DOCKERHUB_USERNAME }}
55 | password: ${{ secrets.DOCKERHUB_PASSWORD }}
56 |
57 | - name: Build and Push GPU
58 | uses: docker/build-push-action@v4
59 | with:
60 | file: docker/accelerate-gpu/Dockerfile
61 | push: true
62 | tags: huggingface/accelerate:gpu-release-${{needs.get-version.outputs.version}}
63 |
64 | version-cuda-deepspeed:
65 | name: "Latest Accelerate GPU DeepSpeed [version]"
66 | runs-on:
67 | group: aws-g6-4xlarge-plus
68 | needs: get-version
69 | steps:
70 | - name: Set up Docker Buildx
71 | uses: docker/setup-buildx-action@v2
72 | - name: Login to DockerHub
73 | uses: docker/login-action@v2
74 | with:
75 | username: ${{ secrets.DOCKERHUB_USERNAME }}
76 | password: ${{ secrets.DOCKERHUB_PASSWORD }}
77 |
78 | - name: Build and Push GPU
79 | uses: docker/build-push-action@v4
80 | with:
81 | file: docker/accelerate-gpu-deepspeed/Dockerfile
82 | push: true
83 | tags: huggingface/accelerate:gpu-deepspeed-release-${{needs.get-version.outputs.version}}
84 |
85 | version-cuda-fp8-transformerengine:
86 | name: "Latest Accelerate GPU FP8 TransformerEngine [version]"
87 | runs-on:
88 | group: aws-g6-4xlarge-plus
89 | needs: get-version
90 | steps:
91 | - name: Set up Docker Buildx
92 | uses: docker/setup-buildx-action@v2
93 | - name: Login to DockerHub
94 | uses: docker/login-action@v2
95 | with:
96 | username: ${{ secrets.DOCKERHUB_USERNAME }}
97 | password: ${{ secrets.DOCKERHUB_PASSWORD }}
98 |
99 | - name: Build and Push GPU
100 | uses: docker/build-push-action@v4
101 | with:
102 | file: docker/accelerate-gpu/Dockerfile
103 | push: true
104 | tags: huggingface/accelerate:gpu-fp8-transformerengine-release-${{needs.get-version.outputs.version}}
--------------------------------------------------------------------------------
/.github/workflows/build_and_run_tests.yml:
--------------------------------------------------------------------------------
1 | name: Trigger docker images and run tests
2 |
3 | on:
4 | push:
5 | branches:
6 | - main
7 | workflow_dispatch:
8 |
9 | env:
10 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
11 |
12 | jobs:
13 | check-for-source:
14 | runs-on: ubuntu-latest
15 | name: Check if setup was changed
16 | outputs:
17 | changed: ${{ steps.was_changed.outputs.changed }}
18 | steps:
19 | - uses: actions/checkout@v4
20 | with:
21 | fetch-depth: "2"
22 |
23 | - name: Get changed files
24 | id: changed-files
25 | uses: tj-actions/changed-files@3f54ebb830831fc121d3263c1857cfbdc310cdb9 #v42
26 |
27 | - name: Was setup changed
28 | id: was_changed
29 | run: |
30 | for file in ${{ steps.changed-files.outputs.all_changed_files }}; do
31 | if [ `basename "${file}"` == "setup.py" ]; then
32 | echo "changed=1" >> $GITHUB_OUTPUT
33 | fi
34 | done
35 |
36 | build-docker-containers:
37 | needs: check-for-source
38 | if: (github.event_name == 'push') && (needs.check-for-source.outputs.changed == '1')
39 | uses: ./.github/workflows/build_docker_images.yml
40 | secrets: inherit
41 |
42 | run-merge-tests:
43 | needs: build-docker-containers
44 | if: always()
45 | uses: ./.github/workflows/run_merge_tests.yml
46 |
47 | run-integration-tests:
48 | needs: build-docker-containers
49 | if: always()
50 | uses: ./.github/workflows/self_hosted_integration_tests.yml
51 |
--------------------------------------------------------------------------------
/.github/workflows/build_documentation.yml:
--------------------------------------------------------------------------------
1 | name: Build documentation
2 |
3 | on:
4 | push:
5 | branches:
6 | - main
7 | - doc-builder*
8 | - v*-release
9 |
10 | jobs:
11 | build:
12 | uses: huggingface/doc-builder/.github/workflows/build_main_documentation.yml@main
13 | with:
14 | commit_sha: ${{ github.sha }}
15 | package: accelerate
16 | custom_container: huggingface/transformers-doc-builder
17 | secrets:
18 | hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}
19 |
--------------------------------------------------------------------------------
/.github/workflows/build_pr_documentation.yml:
--------------------------------------------------------------------------------
1 | name: Build PR Documentation
2 |
3 | on:
4 | pull_request:
5 |
6 | concurrency:
7 | group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
8 | cancel-in-progress: true
9 |
10 | jobs:
11 | build:
12 | uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main
13 | with:
14 | commit_sha: ${{ github.event.pull_request.head.sha }}
15 | pr_number: ${{ github.event.number }}
16 | package: accelerate
17 | custom_container: huggingface/transformers-doc-builder
18 |
--------------------------------------------------------------------------------
/.github/workflows/fp8_runner.yml:
--------------------------------------------------------------------------------
1 | name: Test FP8 Runner
2 |
3 | on:
4 | workflow_dispatch:
5 |
6 | env:
7 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
8 | jobs:
9 | set-prev-day:
10 | runs-on: ubuntu-latest
11 | outputs:
12 | prev-day: ${{ steps.set-prev-day.outputs.prev-day }}
13 | steps:
14 | - name: Set PREV_DAY
15 | id: set-prev-day
16 | run: |
17 | PREV_DAY=$(date -d "yesterday" '+%Y-%m-%d')
18 | echo "prev-day=$PREV_DAY" >> $GITHUB_OUTPUT
19 | run-fp8-tests:
20 | needs: set-prev-day
21 | runs-on:
22 | group: aws-g6e-12xlarge
23 | container:
24 | image: huggingface/accelerate:gpu-fp8-transformerengine-nightly-${{ needs.set-prev-day.outputs.prev-day }}
25 | options: --gpus all --shm-size "16gb"
26 | steps:
27 | - uses: actions/checkout@v3
28 | - name: Install the library
29 | run: |
30 | pip install -e .[test_prod,test_fp8]
31 | - name: Show installed libraries
32 | run: |
33 | pip freeze
34 | - name: Run TE FP8 tests
35 | run: |
36 | python -m pytest -s -v ./tests/test_fp8.py
37 |
38 |
--------------------------------------------------------------------------------
/.github/workflows/gaudi3_scheduled.yml:
--------------------------------------------------------------------------------
1 | name: Gaudi3 tests (scheduled)
2 |
3 | on:
4 | workflow_dispatch:
5 | schedule: # every day at 6 AM UTC
6 | - cron: "0 6 * * *"
7 |
8 | concurrency:
9 | group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
10 | cancel-in-progress: true
11 |
12 | jobs:
13 | run-gaudi3-tests:
14 | runs-on:
15 | group: itac-bm-emr-gaudi3-dell-2gaudi
16 |
17 | container:
18 | image: docker://vault.habana.ai/gaudi-docker/1.20.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
19 | options: --runtime=habana --shm-size=64G --cap-add=sys_nice --env HABANA_VISIBLE_DEVICES
20 | env:
21 | OMPI_MCA_btl_vader_single_copy_mechanism: none
22 | PT_ENABLE_INT64_SUPPORT: 1
23 | PT_HPU_LAZY_MODE: 0
24 | RUN_SLOW: 1
25 |
26 | steps:
27 | - name: HL-SMI (1)
28 | run: |
29 | hl-smi
30 | echo "HABANA_VISIBLE_DEVICES=${HABANA_VISIBLE_DEVICES}"
31 | echo "HABANA_VISIBLE_MODULES=${HABANA_VISIBLE_MODULES}"
32 |
33 | - name: Extract HPU visible modules
34 | id: add-modules
35 | run: |
36 | export HABANA_VISIBLE_MODULES=$(hl-smi -Q module_id -f csv,noheader | tr '\n' ',' | sed 's/,$//')
37 | echo "HABANA_VISIBLE_MODULES=${HABANA_VISIBLE_MODULES}" >> $GITHUB_ENV
38 |
39 | - name: HL-SMI (2)
40 | run: |
41 | hl-smi
42 | echo "HABANA_VISIBLE_DEVICES=${HABANA_VISIBLE_DEVICES}"
43 | echo "HABANA_VISIBLE_MODULES=${HABANA_VISIBLE_MODULES}"
44 |
45 | - name: Checkout to Accelerate
46 | uses: actions/checkout@v4
47 |
48 | - name: Install Accelerate with Transformers & DeepSpeed
49 | run: |
50 | pip install -e .[testing] \
51 | git+https://github.com/HabanaAI/DeepSpeed.git@1.20.0 \
52 | git+https://github.com/huggingface/transformers.git
53 |
54 | - name: Run CLI tests
55 | if: ${{ !cancelled() && (success() || failure()) }}
56 | run: |
57 | make test_cli
58 |
59 | - name: Run Core tests
60 | if: ${{ !cancelled() && (success() || failure()) }}
61 | run: |
62 | make test_core
63 |
64 | - name: Run Big Modeling tests
65 | if: ${{ !cancelled() && (success() || failure()) }}
66 | run: |
67 | make test_big_modeling
68 |
69 | - name: Run FSDP integration tests
70 | if: ${{ !cancelled() && (success() || failure()) }}
71 | run: |
72 | make test_fsdp
73 |
74 | - name: Run DeepSpeed integration tests
75 | if: ${{ !cancelled() && (success() || failure()) }}
76 | run: |
77 | make test_deepspeed
78 |
79 | - name: Run Examples tests
80 | if: ${{ !cancelled() && (success() || failure()) }}
81 | run: |
82 | make test_examples
83 |
--------------------------------------------------------------------------------
/.github/workflows/integration_tests.yml:
--------------------------------------------------------------------------------
1 | # CI for specifically ensuring integrations work fine (`transformers` mainly)
2 | # Useful tips:
3 | # - New integrations to test should have its own job, and follow a strategy method where we check both
4 | # the pypi and github versions.
5 | # - When checking the latest release of the integration, use
6 | # git checkout $(git describe --tags `git rev-list --tags --max-count=1`) to get the latest release.
7 |
8 | name: Integration Tests
9 |
10 | on:
11 | pull_request:
12 | paths:
13 | - "src/**"
14 | - "tests/**"
15 | - ".github/**"
16 | - "examples/**"
17 | - "setup.py"
18 | types: [opened, synchronize, reopened]
19 |
20 | env:
21 | HF_HOME: ~/hf_cache
22 |
23 | jobs:
24 | run-trainer-tests:
25 | runs-on: ubuntu-latest
26 | strategy:
27 | fail-fast: false
28 | steps:
29 | - uses: actions/checkout@v4
30 | - name: Set up python 3.9
31 | uses: actions/setup-python@v5
32 | with:
33 | python-version: 3.9
34 | cache: 'pip'
35 | cache-dependency-path: 'setup.py'
36 |
37 | - name: Install Accelerate from source
38 | run: |
39 | pip install --upgrade pip
40 | pip install -e .
41 |
42 | - name: Clone and install transformers
43 | run: |
44 | cd ..
45 | git clone https://github.com/huggingface/transformers
46 | cd transformers
47 | pip install .[torch,testing]
48 |
49 | - name: Show installed libraries
50 | run: |
51 | pip freeze
52 |
53 | - name: Run Trainer tests
54 | env:
55 | WANDB_DISABLED: true
56 | run: |
57 | cd ../transformers
58 | pytest -sv tests/trainer
59 |
--------------------------------------------------------------------------------
/.github/workflows/pr_style_bot.yml:
--------------------------------------------------------------------------------
1 | # To run this bot, comment "@bot /style" on a PR
2 | name: Style Bot
3 |
4 | on:
5 | issue_comment:
6 | types: [created]
7 |
8 | permissions:
9 | contents: write
10 | pull-requests: write
11 |
12 | jobs:
13 | style:
14 | uses: huggingface/huggingface_hub/.github/workflows/style-bot-action.yml@main
15 | with:
16 | python_quality_dependencies: "[quality]"
17 | style_command_type: "default"
18 | secrets:
19 | bot_token: ${{ secrets.GITHUB_TOKEN }}
--------------------------------------------------------------------------------
/.github/workflows/quality.yml:
--------------------------------------------------------------------------------
1 | name: Quality Check
2 |
3 | on: [pull_request]
4 |
5 | jobs:
6 | quality:
7 | runs-on: ubuntu-latest
8 | steps:
9 | - uses: actions/checkout@v4
10 | - name: Set up Python 3.9
11 | uses: actions/setup-python@v5
12 | with:
13 | python-version: 3.9
14 | cache: 'pip'
15 | cache-dependency-path: 'setup.py'
16 | - name: Install Python dependencies
17 | run: pip install -e .[quality]
18 | - name: Run Quality check
19 | run: make quality
20 | - name: Check if failure
21 | if: ${{ failure() }}
22 | run: |
23 | echo "Quality check failed. Please ensure the right dependency versions are installed with 'pip install -e .[quality]' and rerun 'make style; make quality;'" >> $GITHUB_STEP_SUMMARY
24 |
25 |
--------------------------------------------------------------------------------
/.github/workflows/stale.yml:
--------------------------------------------------------------------------------
1 | name: Stale Bot
2 |
3 | on:
4 | schedule:
5 | - cron: "0 15 * * *"
6 | workflow_dispatch:
7 |
8 | jobs:
9 | close_stale_issues:
10 | name: Close Stale Issues
11 | if: github.repository == 'huggingface/accelerate'
12 | runs-on: ubuntu-latest
13 | permissions:
14 | issues: write
15 | pull-requests: write
16 | env:
17 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
18 | steps:
19 | - uses: actions/checkout@v4
20 |
21 | - name: Setup Python
22 | uses: actions/setup-python@v5
23 | with:
24 | python-version: 3.9
25 | cache: 'pip'
26 | cache-dependency-path: 'setup.py'
27 |
28 | - name: Install requirements
29 | run: |
30 | pip install PyGithub
31 | - name: Close stale issues
32 | run: |
33 | python utils/stale.py
34 |
--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
1 | name: Run Tests
2 |
3 | on:
4 | pull_request:
5 | paths:
6 | - "src/**"
7 | - "tests/**"
8 | - ".github/**"
9 | - "examples/**"
10 | - "setup.py"
11 | types: [opened, synchronize, reopened]
12 |
13 | env:
14 | HF_HOME: ~/hf_cache
15 | TESTING_MOCKED_DATALOADERS: "1"
16 | IS_GITHUB_CI: "1"
17 |
18 | jobs:
19 | run-tests:
20 | runs-on: ubuntu-latest
21 | strategy:
22 | fail-fast: false
23 | matrix:
24 | pytorch-version: [
25 | latest,
26 | minimum,
27 | ]
28 | test-kind: [
29 | test_prod,
30 | test_core,
31 | test_cli,
32 | test_big_modeling,
33 | test_deepspeed,
34 | test_fsdp,
35 | test_example_differences,
36 | test_checkpoint_step,
37 | test_checkpoint_epoch,
38 | test_rest
39 | ]
40 | steps:
41 | - uses: actions/checkout@v4
42 | - name: Set up python 3.9
43 | uses: actions/setup-python@v5
44 | with:
45 | python-version: 3.9
46 | cache: 'pip'
47 | cache-dependency-path: 'setup.py'
48 |
49 | - name: Install the library
50 | run: |
51 | if [[ ${{ matrix.test-kind }} = test_prod ]]; then pip install -e .[test_prod]; fi
52 | if [[ ${{ matrix.test-kind }} != test_prod ]]; then pip install -e .[testing,test_trackers]; fi
53 | if [[ ${{ matrix.test-kind }} = test_rest ]]; then pip uninstall comet_ml -y; fi
54 | if [[ ${{ matrix.pytorch-version }} = minimum ]]; then pip install torchvision==0.18.1 torch==2.3.1; fi
55 | pip install pytest-reportlog tabulate setuptools importlib_metadata
56 |
57 | - name: Show installed libraries
58 | run: |
59 | pip freeze
60 |
61 | - name: Run Tests
62 | env:
63 | PYTORCH_VERSION: ${{ matrix.pytorch-version }}
64 | run: |
65 | make ${{ matrix.test-kind }}
66 |
67 | - name: Generate Report
68 | if: always()
69 | run: |
70 | python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
71 |
--------------------------------------------------------------------------------
/.github/workflows/test_imports.yml:
--------------------------------------------------------------------------------
1 | name: Run Import Tests
2 |
3 | on:
4 | pull_request:
5 | paths:
6 | - "src/**"
7 | - "tests/**"
8 | - ".github/**"
9 | - "examples/**"
10 | - "setup.py"
11 | types: [opened, synchronize, reopened]
12 |
13 | env:
14 | HF_HOME: ~/hf_cache
15 | TESTING_MOCKED_DATALOADERS: "1"
16 | IS_GITHUB_CI: "1"
17 |
18 | jobs:
19 | run-tests:
20 | runs-on: ubuntu-latest
21 | strategy:
22 | fail-fast: false
23 | matrix:
24 | pytorch-version: [
25 | latest,
26 | minimum,
27 | ]
28 | steps:
29 | - uses: actions/checkout@v4
30 | - name: Set up python 3.9
31 | uses: actions/setup-python@v5
32 | with:
33 | python-version: 3.9
34 | cache: 'pip'
35 | cache-dependency-path: 'setup.py'
36 |
37 | - name: Install the library
38 | run: |
39 | pip install -e .
40 | pip install pytest-reportlog tabulate setuptools git+https://github.com/muellerzr/import-timer
41 |
42 | - name: Show installed libraries
43 | run: |
44 | pip freeze
45 |
46 | - name: Run Import Tests
47 | env:
48 | PYTORCH_VERSION: ${{ matrix.pytorch-version }}
49 | run: |
50 | pytest -sv tests/test_imports.py
51 |
52 | - name: Generate Report
53 | if: always()
54 | run: |
55 | python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
56 |
--------------------------------------------------------------------------------
/.github/workflows/trufflehog.yml:
--------------------------------------------------------------------------------
1 | on:
2 | push:
3 |
4 | name: Secret Leaks
5 |
6 | jobs:
7 | trufflehog:
8 | runs-on: ubuntu-latest
9 | steps:
10 | - name: Checkout code
11 | uses: actions/checkout@v4
12 | with:
13 | fetch-depth: 0
14 | - name: Secret Scanning
15 | uses: trufflesecurity/trufflehog@main
16 |
--------------------------------------------------------------------------------
/.github/workflows/upload_pr_documentation.yml:
--------------------------------------------------------------------------------
1 | name: Upload PR Documentation
2 |
3 | on:
4 | workflow_run:
5 | workflows: ["Build PR Documentation"]
6 | types:
7 | - completed
8 |
9 | jobs:
10 | build:
11 | uses: huggingface/doc-builder/.github/workflows/upload_pr_documentation.yml@main
12 | with:
13 | package_name: accelerate
14 | secrets:
15 | hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}
16 | comment_bot_token: ${{ secrets.COMMENT_BOT_TOKEN }}
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 |
131 | # VSCode
132 | .vscode
133 |
134 | # IntelliJ
135 | .idea
136 |
137 | # Mac .DS_Store
138 | .DS_Store
139 |
140 | # More test things
141 | wandb
142 |
143 | # ruff
144 | .ruff_cache
145 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 | - repo: https://github.com/astral-sh/ruff-pre-commit
3 | rev: v0.2.1
4 | hooks:
5 | - id: ruff
6 | args:
7 | - --fix
8 | - id: ruff-format
9 | - repo: https://github.com/pre-commit/pre-commit-hooks
10 | rev: v4.5.0
11 | hooks:
12 | - id: check-merge-conflict
13 | - id: check-yaml
14 |
--------------------------------------------------------------------------------
/benchmarks/README.md:
--------------------------------------------------------------------------------
1 | # Benchmarks
2 |
3 | The folders below contain suites to test various functionalities in Accelerate.
4 |
5 | See each folder's README.md for more information.
6 |
--------------------------------------------------------------------------------
/benchmarks/big_model_inference/README.md:
--------------------------------------------------------------------------------
1 | # Big model inference benchmarks
2 |
3 | Running inference with Accelerate on big models.
4 |
5 | ## Setup
6 |
7 | These benchmarks use the `transformers` library:
8 |
9 | ```bash
10 | pip install transformers
11 | ```
12 |
13 | To reproduce or test a new setup, run
14 |
15 | ```bash
16 | python big_model_inference.py model_name
17 | ```
18 |
19 | This script supports `gpt-j-6b`, `gpt-neox`, `opt` (30B version) and `T0pp` out of the box, but you can specify any valid checkpoint for `model_name`.
20 |
21 | To force a different `torch_dtype` than the one in the model config, pass `--torch_dtype xxx`.
22 |
23 | If you get an error related to disk offload, add the `--disk-offload` option.
24 |
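For instance, a combined invocation might look like the following (the checkpoint name here is only illustrative; the flags are the ones described above):

```bash
# Hypothetical example: a 30B OPT checkpoint in float16 with disk offload enabled
python big_model_inference.py facebook/opt-30b --torch_dtype float16 --disk-offload
```
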
25 | ## Results
26 |
27 | On a setup with two Titan RTX GPUs (24GB of VRAM each) and 32GB of CPU RAM, we get the following benchmarks (T0pp does not run in float16, which is why that configuration is not included).
28 |
29 | | Model | Model load time | Generation time | dtype | GPU 0 use | GPU 1 use | CPU use | Disk offload |
30 | |:-----:|:---------------:|:---------------:|:-----:|:---------:|:---------:|:-------:|:------------:|
31 | | GPT-J-6B | 8.7s | 0.05s per token | float16 | 11.7GB | 0GB | 0GB | no |
32 | | GPT-J-6B | 12.4s | 0.06s per token | float32 | 21.9GB | 1.5GB | 0GB | no |
33 | | GPT-Neo-X-20B | 30.9s | 0.08s per token | float16 | 21.5GB | 18GB | 0GB | no |
34 | | GPT-Neo-X-20B | 78.2s | 10.72s per token | float32 | 20.3GB | 22.7 GB | 24.4GB | yes |
35 | | T0pp (11B) | 29.4s | 0.05s per token | float32 | 21.1GB | 21.3GB | 0GB | no |
36 | | OPT-30B | 34.5s | 2.37s per token | float16 | 20.7GB | 22.3GB | 14.1GB | no |
37 | | OPT-30B | 112.3s | 33.9s per token | float32 | 20.2GB | 21.2GB | 23.5GB | yes |
38 |
39 | Note on the results:
40 | - using two GPUs instead of one does not slow down generation
41 | - using CPU offload slows down a bit (see OPT-30b)
42 | - using disk offload slows down a lot (need to implement prefetching)
43 |
44 | You will also note that Accelerate does not use any more GPU and CPU RAM than necessary:
45 | - peak GPU memory is exactly the size of the model put on a given GPU
46 | - peak CPU memory is either the size of the biggest checkpoint shard or the part of the model offloaded on CPU, whichever is bigger.
47 |
--------------------------------------------------------------------------------
/benchmarks/big_model_inference/measures_util.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | import gc
15 | import threading
16 | import time
17 |
18 | import psutil
19 | import torch
20 |
21 | from accelerate.test_utils.testing import get_backend
22 |
23 |
24 | torch_device_type, _, _ = get_backend()
25 | torch_accelerator_module = getattr(torch, torch_device_type, torch.cuda)
26 |
27 |
28 | class PeakCPUMemory:
29 | def __init__(self):
30 | self.process = psutil.Process()
31 | self.peak_monitoring = False
32 |
33 | def peak_monitor(self):
34 | self.cpu_memory_peak = -1
35 |
36 | while True:
37 | self.cpu_memory_peak = max(self.process.memory_info().rss, self.cpu_memory_peak)
38 |
39 | # can't sleep or will not catch the peak right (this comment is here on purpose)
40 | if not self.peak_monitoring:
41 | break
42 |
43 | def start(self):
44 | self.peak_monitoring = True
45 | self.thread = threading.Thread(target=self.peak_monitor)
46 | self.thread.daemon = True
47 | self.thread.start()
48 |
49 | def stop(self):
50 | self.peak_monitoring = False
51 | self.thread.join()
52 | return self.cpu_memory_peak
53 |
54 |
55 | cpu_peak_tracker = PeakCPUMemory()
56 |
57 |
58 | def start_measure():
59 | # Time
60 | measures = {"time": time.time()}
61 |
62 | gc.collect()
63 | torch_accelerator_module.empty_cache()
64 |
65 | # CPU mem
66 | measures["cpu"] = psutil.Process().memory_info().rss
67 | cpu_peak_tracker.start()
68 |
69 | # GPU mem
70 | for i in range(torch_accelerator_module.device_count()):
71 | measures[str(i)] = torch_accelerator_module.memory_allocated(i)
72 | torch_accelerator_module.reset_peak_memory_stats()
73 |
74 | return measures
75 |
76 |
77 | def end_measure(start_measures):
78 | # Time
79 | measures = {"time": time.time() - start_measures["time"]}
80 |
81 | gc.collect()
82 | torch_accelerator_module.empty_cache()
83 |
84 | # CPU mem
85 | measures["cpu"] = (psutil.Process().memory_info().rss - start_measures["cpu"]) / 2**20
86 | measures["cpu-peak"] = (cpu_peak_tracker.stop() - start_measures["cpu"]) / 2**20
87 |
88 | # GPU mem
89 | for i in range(torch_accelerator_module.device_count()):
90 | measures[str(i)] = (torch_accelerator_module.memory_allocated(i) - start_measures[str(i)]) / 2**20
91 | measures[f"{i}-peak"] = (torch_accelerator_module.max_memory_allocated(i) - start_measures[str(i)]) / 2**20
92 |
93 | return measures
94 |
95 |
96 | def log_measures(measures, description):
97 | print(f"{description}:")
98 | print(f"- Time: {measures['time']:.2f}s")
99 | for i in range(torch_accelerator_module.device_count()):
100 | print(f"- {torch_device_type} {i} allocated: {measures[str(i)]:.2f}MiB")
101 | peak = measures[f"{i}-peak"]
102 | print(f"- {torch_device_type} {i} peak: {peak:.2f}MiB")
103 | print(f"- CPU RAM allocated: {measures['cpu']:.2f}MiB")
104 | print(f"- CPU RAM peak: {measures['cpu-peak']:.2f}MiB")
105 |
--------------------------------------------------------------------------------
/benchmarks/fp8/ms_amp/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM ghcr.io/azure/msamp
2 |
3 | RUN pip install transformers evaluate datasets
4 | RUN git clone https://github.com/huggingface/accelerate
5 |
6 | RUN cd accelerate && \
7 | pip install -e . && \
8 | cd benchmarks/fp8
9 |
10 | CMD ["bash"]
11 |
12 |
13 |
--------------------------------------------------------------------------------
/benchmarks/fp8/torchao/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM nvcr.io/nvidia/pytorch:24.07-py3
2 |
3 | RUN pip install transformers evaluate datasets
4 | RUN git clone https://github.com/huggingface/accelerate.git
5 |
6 | RUN cd accelerate && \
7 | pip install -e . && \
8 | cd benchmarks/fp8
9 |
10 | CMD ["bash"]
11 |
12 |
13 |
--------------------------------------------------------------------------------
/benchmarks/fp8/torchao/README.md:
--------------------------------------------------------------------------------
1 | # FP8 Benchmarks
2 |
3 | Comparing and running [torchao](https://github.com/pytorch/ao/tree/main/torchao/float8) FP8 with accelerate
4 |
5 | ## Overview
6 |
7 | This repo provides scripts which compare native `torchao` model training against `accelerate`'s own integration. Each modeling type is segmented out via a script, supporting the following:
8 |
9 | * Single GPU training (`non_distributed.py`)
10 | * Multi-GPU training via DistributedDataParallel (`ddp.py`)
11 | * Fully Sharded Data Parallelism (`fsdp.py`)
12 | * DeepSpeed ZeRO 1-3 (`distrib_deepspeed.py`)
13 |
14 | To run them, it's recommended to use a docker image (see the attached `Dockerfile`) and not install `torchao` manually.
15 |
16 | ## Running:
17 |
18 | There are official Docker images located at `huggingface/accelerate:gpu-fp8-torchao-nightly` which can be used.
19 |
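For reference, a minimal sketch of pulling and starting that image interactively (this assumes a standard Docker setup with the NVIDIA container runtime; adjust the flags to your machine):

```bash
# Start an interactive shell in the nightly FP8 torchao image with all GPUs exposed
docker run --gpus all -it huggingface/accelerate:gpu-fp8-torchao-nightly
```
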
20 | You can run all scripts using the core `accelerate launch` command without any `accelerate config` being needed.
21 |
22 | For single GPU, run it via `python`:
23 |
24 | ```bash
25 | python non_distributed.py
26 | ```
27 |
28 | For the rest, run it via `accelerate launch`:
29 |
30 | ```bash
31 | accelerate launch ddp.py # or distrib_deepspeed.py, fsdp.py
32 | ```
--------------------------------------------------------------------------------
/benchmarks/fp8/transformer_engine/Dockerfile:
--------------------------------------------------------------------------------
1 | ARG BASE_YEAR=25
2 | ARG BASE_MONTH=03
3 |
4 | FROM nvcr.io/nvidia/pytorch:${BASE_YEAR}.${BASE_MONTH}-py3
5 |
6 | RUN pip install transformers evaluate datasets
7 | RUN git clone https://github.com/huggingface/accelerate.git
8 |
9 | RUN cd accelerate && \
10 | pip install -e . && \
11 | cd benchmarks/fp8
12 |
13 | CMD ["bash"]
14 |
15 |
16 |
--------------------------------------------------------------------------------
/benchmarks/fp8/transformer_engine/README.md:
--------------------------------------------------------------------------------
1 | # FP8 Benchmarks
2 |
3 | Comparing and running [TransformerEngine](https://github.com/NVIDIA/TransformerEngine) FP8 with accelerate
4 |
5 | ## Overview
6 |
7 | This repo provides scripts which compare native TransformerEngine model training against `accelerate`'s own integration. Each modeling type is segmented out via a script, supporting the following:
8 |
9 | * Single GPU training (`non_distributed.py`)
10 | * Multi-GPU training via DistributedDataParallel (`ddp.py`)
11 | * Fully Sharded Data Parallelism (`fsdp.py`)
12 | * DeepSpeed ZeRO 1-3 (`distrib_deepspeed.py`)
13 |
14 | To run them, it's recommended to use a docker image (see the attached `Dockerfile`) and not install `TransformerEngine` manually.
15 |
16 | ## Running:
17 |
18 | There are official Docker images located at `huggingface/accelerate:gpu-fp8-transformerengine-nightly` which can be used.
19 |
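For reference, a minimal sketch of pulling and starting that image interactively (this assumes a standard Docker setup with the NVIDIA container runtime; adjust the flags to your machine):

```bash
# Start an interactive shell in the nightly FP8 TransformerEngine image with all GPUs exposed
docker run --gpus all -it huggingface/accelerate:gpu-fp8-transformerengine-nightly
```
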
20 | You can run all scripts using the core `accelerate launch` command without any `accelerate config` being needed.
21 |
22 | For single GPU, run it via `python`:
23 |
24 | ```bash
25 | python non_distributed.py
26 | ```
27 |
28 | For the rest, run it via `accelerate launch`:
29 |
30 | ```bash
31 | accelerate launch ddp.py # or distrib_deepspeed.py, fsdp.py
32 | ```
--------------------------------------------------------------------------------
/benchmarks/fsdp2/imgs/allocated_memory.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/accelerate/3a82b056cf85b16976ca2760615897fe65ae5e64/benchmarks/fsdp2/imgs/allocated_memory.png
--------------------------------------------------------------------------------
/benchmarks/fsdp2/imgs/reserved_memory.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/accelerate/3a82b056cf85b16976ca2760615897fe65ae5e64/benchmarks/fsdp2/imgs/reserved_memory.png
--------------------------------------------------------------------------------
/benchmarks/torch.compile/imgs/compilation_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/accelerate/3a82b056cf85b16976ca2760615897fe65ae5e64/benchmarks/torch.compile/imgs/compilation_time.png
--------------------------------------------------------------------------------
/benchmarks/torch.compile/imgs/speedup_factor.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/accelerate/3a82b056cf85b16976ca2760615897fe65ae5e64/benchmarks/torch.compile/imgs/speedup_factor.png
--------------------------------------------------------------------------------
/benchmarks/torch.compile/regional_compilation.py:
--------------------------------------------------------------------------------
1 | # Copyright 2025 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import torch
16 | from torch.utils.benchmark import Compare, Timer
17 | from transformers import AutoConfig, AutoModelForCausalLM
18 |
19 | from accelerate.test_utils.testing import get_backend
20 | from accelerate.utils import compile_regions
21 |
22 |
23 | torch.set_float32_matmul_precision("high")
24 |
25 | COMPILE_ITERS = 2
26 | INFERENCE_ITERS = 100
27 |
28 | BASELINE = "Baseline"
29 | COMPILE_TIME = "Compile time"
30 | INFERENCE_TIME = "Inference time"
31 | FULL_COMPILATION = "Full compilation"
32 | REGIONAL_COMPILATION = "Regional compilation"
33 |
34 | INFERENCE_STMT = "model(input_ids, use_cache=False)"
35 | COMPILE_STMT = f"torch._dynamo.reset(); torch._inductor.utils.clear_inductor_caches(); {INFERENCE_STMT}"
36 |
37 | torch_device_type, _, _ = get_backend()
38 |
39 | results = []
40 | for model_id in [
41 | # non-gated llama models
42 | "NousResearch/Llama-3.2-1B",
43 | "NousResearch/Hermes-3-Llama-3.2-3B",
44 | "NousResearch/Hermes-3-Llama-3.1-8B",
45 | "NousResearch/Nous-Hermes-Llama2-13b",
46 | ]:
47 | with torch.device(torch_device_type):
48 | config = AutoConfig.from_pretrained(model_id)
49 | model = AutoModelForCausalLM.from_config(config).to(dtype=torch.float16).eval()
50 |
51 | full_compilation_model = torch.compile(model)
52 | regional_compilation_model = compile_regions(model)
53 |
54 | for model, sub_label, description, stmt, iters in [
55 | (model, BASELINE, INFERENCE_TIME, INFERENCE_STMT, INFERENCE_ITERS),
56 | (full_compilation_model, FULL_COMPILATION, COMPILE_TIME, COMPILE_STMT, COMPILE_ITERS),
57 | (full_compilation_model, FULL_COMPILATION, INFERENCE_TIME, INFERENCE_STMT, INFERENCE_ITERS),
58 | (regional_compilation_model, REGIONAL_COMPILATION, COMPILE_TIME, COMPILE_STMT, COMPILE_ITERS),
59 | (regional_compilation_model, REGIONAL_COMPILATION, INFERENCE_TIME, INFERENCE_STMT, INFERENCE_ITERS),
60 | ]:
61 | for batch_size, sequence_length in [(1, 128), (4, 128)]:
62 | input_ids = torch.randint(
63 | 0, 1000, size=(batch_size, sequence_length), dtype=torch.int64, device=torch_device_type
64 | )
65 | results.append(
66 | Timer(
67 | label=model_id,
68 | sub_label=sub_label,
69 | description=f"{description} ({batch_size}x{sequence_length})",
70 | globals={"model": model, "input_ids": input_ids},
71 | stmt=stmt,
72 | ).timeit(number=iters)
73 | )
74 |
75 | compare = Compare(results)
76 | compare.colorize()
77 | compare.print()
78 |
--------------------------------------------------------------------------------
/docker/README.md:
--------------------------------------------------------------------------------
1 |
16 |
17 | # Official Hugging Face Accelerate Docker Images
18 |
19 | Accelerate publishes a variety of docker images as part of our CI that users can also use. These are stable images that Accelerate can run on, each coming with a different setup configuration, all of which are officially hosted on [Docker Hub](https://hub.docker.com/r/huggingface/accelerate).
20 |
21 | A breakdown of each is given below.
22 |
23 | ## Naming Conventions
24 |
25 | Accelerate docker images follow a tagging convention of:
26 |
27 | ```bash
28 | huggingface/accelerate:{accelerator}-{nightly,release}
29 | ```
30 |
31 | `accelerator` in this instance is one of many applicable pre-configured backends:
32 | * `gpu`: Comes compiled off of the `nvidia/cuda` image and includes core parts like `bitsandbytes`. Runs off python 3.9.
33 | * `cpu`: Comes compiled off of `python:3.9-slim` and is designed for non-CUDA based workloads.
34 | * More to come soon
35 | * `gpu-deepspeed`: Comes compiled off of the `nvidia/cuda` image and includes core parts like `bitsandbytes` as well as the latest `deepspeed` version. Runs off python 3.10.
36 | * `gpu-fp8-transformerengine`: Comes compiled off of `nvcr.io/nvidia/pytorch` and is specifically for running the `benchmarks/fp8` scripts on devices which support FP8 operations using the `TransformerEngine` library (RTX 4090, H100, etc)
37 |
38 | ## Nightlies vs Releases
39 |
40 | With each release, a new build is pushed with the version number included in the tag. For a GPU-supported image of version 0.28.0, for instance, it would look like the following:
41 |
42 | ```bash
43 | huggingface/accelerate:gpu-release-0.28.0
44 | ```
45 |
46 | Nightlies contain two different image tags. There is a general `nightly` tag which is built each night, and a `nightly-YYYY-MM-DD` which corresponds to a build from a particular date.
47 |
48 | For instance, here is an example nightly CPU image from 3/14/2024
49 |
50 | ```bash
51 | huggingface/accelerate:cpu-nightly-2024-03-14
52 | ```
53 |
54 | ## Running the images
55 |
56 | Each image comes with `conda` and an `accelerate` environment that contains all of the installed dependencies.
57 |
58 | To pull down the latest nightly run:
59 |
60 | ```bash
61 | docker pull huggingface/accelerate:gpu-nightly
62 | ```
63 |
64 | To then run it in interactive mode with the GPUs available, run:
65 |
66 | ```bash
67 | docker container run --gpus all -it huggingface/accelerate:gpu-nightly
68 | ```
69 |
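As a quick sanity check once inside the container (assuming the conda setup from the Dockerfiles in this repo, where the `accelerate` environment is activated via `~/.profile`), you can verify the installation with:

```bash
source activate accelerate   # only needed if the environment is not already active
accelerate env               # prints the environment and current Accelerate configuration
```
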
70 | ## DEPRECATED IMAGES
71 |
72 | CPU and GPU docker images were hosted at `huggingface/accelerate-gpu` and `huggingface/accelerate-cpu`. These builds are now outdated and will not receive updates.
73 |
74 | The corresponding `huggingface/accelerate:{gpu,cpu}` builds use the same `Dockerfile`s, so switching is as simple as changing the docker image to one of the tags above. We will keep the old images around for posterity, but they will not receive updates going forward.
--------------------------------------------------------------------------------
/docker/accelerate-cpu/Dockerfile:
--------------------------------------------------------------------------------
1 | # Builds CPU-only Docker image of PyTorch
2 | # Uses multi-staged approach to reduce size
3 | # Stage 1
4 | FROM python:3.9-slim as compile-image
5 |
6 | ARG DEBIAN_FRONTEND=noninteractive
7 |
8 | RUN apt update
9 | RUN apt-get install -y --no-install-recommends \
10 | build-essential \
11 | git \
12 | gcc
13 |
14 | # Setup virtual environment for Docker
15 | ENV VIRTUAL_ENV=/opt/venv
16 | RUN python3 -m venv ${VIRTUAL_ENV}
17 | # Make sure we use the virtualenv
18 | ENV PATH="${VIRTUAL_ENV}/bin:$PATH"
19 | WORKDIR /workspace
20 | # Install specific CPU torch wheel to save on space
21 | RUN python3 -m pip install --upgrade --no-cache-dir pip
22 | RUN python3 -m pip install --no-cache-dir \
23 | jupyter \
24 | git+https://github.com/huggingface/accelerate#egg=accelerate[testing,test_trackers] \
25 | --extra-index-url https://download.pytorch.org/whl/cpu
26 |
27 | # Stage 2
28 | FROM python:3.9-slim AS build-image
29 | COPY --from=compile-image /opt/venv /opt/venv
30 | RUN useradd -ms /bin/bash user
31 | USER user
32 |
33 | # Make sure we use the virtualenv
34 | ENV PATH="/opt/venv/bin:$PATH"
35 | CMD ["/bin/bash"]
--------------------------------------------------------------------------------
/docker/accelerate-gpu-deepspeed/Dockerfile:
--------------------------------------------------------------------------------
1 | # Builds GPU docker image of PyTorch specifically
2 | # Uses multi-staged approach to reduce size
3 | # Stage 1
4 | # Use base conda image to reduce time
5 | FROM continuumio/miniconda3:latest AS compile-image
6 | # Specify py version
7 | # Note: DeepSpeed beyond v0.12.6 requires py 3.10
8 | ENV PYTHON_VERSION=3.10
9 | # Install apt libs
10 | RUN apt-get update && \
11 | apt-get install -y curl git wget && \
12 | apt-get clean && \
13 | rm -rf /var/lib/apt/lists*
14 |
15 | # Create our conda env
16 | RUN conda create --name accelerate python=${PYTHON_VERSION} ipython jupyter pip
17 | # We don't install pytorch here yet since CUDA isn't available
18 | # instead we use the direct torch wheel
19 | ENV PATH /opt/conda/envs/accelerate/bin:$PATH
20 | # Activate our bash shell
21 | RUN chsh -s /bin/bash
22 | SHELL ["/bin/bash", "-c"]
23 | # Activate the conda env, install mpi4py, and install torch + accelerate
24 | RUN source activate accelerate && conda install -c conda-forge mpi4py
25 | RUN source activate accelerate && \
26 | python3 -m pip install --no-cache-dir \
27 | git+https://github.com/huggingface/accelerate#egg=accelerate[testing,test_trackers,deepspeed] \
28 | --extra-index-url https://download.pytorch.org/whl/cu126
29 |
30 | RUN python3 -m pip install --no-cache-dir bitsandbytes
31 |
32 | # Stage 2
33 | FROM nvidia/cuda:12.6.3-cudnn-devel-ubuntu22.04 AS build-image
34 | COPY --from=compile-image /opt/conda /opt/conda
35 | ENV PATH /opt/conda/bin:$PATH
36 |
37 | # Install apt libs
38 | RUN apt-get update && \
39 | apt-get install -y curl git wget && \
40 | apt-get clean && \
41 | rm -rf /var/lib/apt/lists*
42 |
43 | RUN echo "source activate accelerate" >> ~/.profile
44 |
45 | # Launch an interactive shell (the accelerate conda env is activated via ~/.profile)
46 | CMD ["/bin/bash"]
--------------------------------------------------------------------------------
/docker/accelerate-gpu/Dockerfile:
--------------------------------------------------------------------------------
1 | # Builds GPU docker image of PyTorch specifically
2 | # Uses multi-staged approach to reduce size
3 | # Stage 1
4 | # Use base conda image to reduce time
5 | FROM continuumio/miniconda3:latest AS compile-image
6 | # Specify py version
7 | ENV PYTHON_VERSION=3.9
8 | # Install apt libs
9 | RUN apt-get update && \
10 | apt-get install -y curl git wget && \
11 | apt-get clean && \
12 | rm -rf /var/lib/apt/lists*
13 |
14 | # Create our conda env
15 | RUN conda create --name accelerate python=${PYTHON_VERSION} ipython jupyter pip
16 | # We don't install pytorch here yet since CUDA isn't available
17 | # instead we use the direct torch wheel
18 | ENV PATH /opt/conda/envs/accelerate/bin:$PATH
19 | # Activate our bash shell
20 | RUN chsh -s /bin/bash
21 | SHELL ["/bin/bash", "-c"]
22 | # Activate the conda env, install mpi4py, and install torch + accelerate
23 | RUN source activate accelerate && conda install -c conda-forge mpi4py
24 | RUN source activate accelerate && \
25 | python3 -m pip install --no-cache-dir \
26 | git+https://github.com/huggingface/accelerate#egg=accelerate[testing,test_trackers] \
27 | --extra-index-url https://download.pytorch.org/whl/cu126
28 |
29 | RUN python3 -m pip install --no-cache-dir bitsandbytes
30 |
31 | # Stage 2
32 | FROM nvidia/cuda:12.6.3-cudnn-devel-ubuntu22.04 AS build-image
33 | COPY --from=compile-image /opt/conda /opt/conda
34 | ENV PATH /opt/conda/bin:$PATH
35 |
36 | # Install apt libs
37 | RUN apt-get update && \
38 | apt-get install -y curl git wget && \
39 | apt-get clean && \
40 | rm -rf /var/lib/apt/lists*
41 |
42 | RUN echo "source activate accelerate" >> ~/.profile
43 |
44 | # Launch an interactive shell (the accelerate conda env is activated via ~/.profile)
45 | CMD ["/bin/bash"]
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line.
5 | SPHINXOPTS =
6 | SPHINXBUILD = sphinx-build
7 | SOURCEDIR = source
8 | BUILDDIR = _build
9 |
10 | # Put it first so that "make" without argument is like "make help".
11 | help:
12 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
13 |
14 | .PHONY: help Makefile
15 |
16 | # Catch-all target: route all unknown targets to Sphinx using the new
17 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
18 | %: Makefile
19 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
--------------------------------------------------------------------------------
/docs/source/basic_tutorials/overview.md:
--------------------------------------------------------------------------------
1 |
15 |
16 | # Overview
17 |
18 | Welcome to the Accelerate tutorials! These introductory guides will help you get up to speed on working with Accelerate.
19 | You'll learn how to modify your code so it works seamlessly with the API, how to launch your script properly,
20 | and more!
21 |
22 | These tutorials assume some basic knowledge of Python and familiarity with the PyTorch framework.
23 |
24 | If you have any questions about Accelerate, feel free to join and ask the community on our [forum](https://discuss.huggingface.co/c/accelerate/18).
--------------------------------------------------------------------------------
/docs/source/basic_tutorials/tpu.md:
--------------------------------------------------------------------------------
1 |
15 |
16 | # TPU training
17 |
18 | A [TPU (Tensor Processing Unit)](https://cloud.google.com/tpu/docs/intro-to-tpu) is a type of hardware specifically designed for training models efficiently. Accelerate supports TPU training, but there are a few things you should be aware of, namely graph compilation. This tutorial briefly discusses compilation, and for more details, take a look at the [Training on TPUs with Accelerate](../concept_guides/training_tpu) guide.
19 |
20 | ## Compilation
21 |
22 | A TPU creates a graph of all the operations in the training step such as the forward pass, backward pass and optimizer step. This is why the first training step always takes a while because building and compiling this graph takes time. But once compilation is complete, it is cached and all subsequent steps are much faster.
23 |
24 | The key is to avoid triggering compilation again, otherwise training becomes very slow. This means all your operations must be exactly the same:
25 |
26 | * all tensors in your batches must have the same length (for example, no dynamic padding for NLP tasks; see the sketch below)
27 | * your code must be static (for example, no layers with for loops that have different lengths depending on the input, such as an LSTM)
28 |
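For example, here is a minimal sketch of static padding for an NLP task, assuming a `transformers` tokenizer, a dataset with a `"text"` field, and an illustrative `max_length` of 128. Padding every batch to the same fixed length keeps tensor shapes constant across steps, so the compiled graph is reused:

```py
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def collate_fn(examples):
    # Pad every batch to the same fixed length so the compiled graph can be reused
    return tokenizer(
        [example["text"] for example in examples],
        padding="max_length",  # fixed-size padding instead of per-batch dynamic padding
        max_length=128,
        truncation=True,
        return_tensors="pt",
    )
```
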
29 | ## Weight tying
30 |
31 | A common language model design is to tie the weights of the embedding and softmax layers. However, moving the model to a TPU (either yourself or passing it to the [`~Accelerator.prepare`] method) breaks the weight tying and you'll need to retie the weights.
32 |
33 | To add special behavior (like weight tying) in your script for TPUs, set [`~Accelerator.distributed_type`] to `DistributedType.TPU` first. Then you can use the [`~transformers.PreTrainedModel.tie_weights`] method to tie the weights.
34 |
35 | ```py
36 | if accelerator.distributed_type == DistributedType.TPU:
37 | model.tie_weights()
38 | ```
39 |
--------------------------------------------------------------------------------
/docs/source/imgs/accelerate_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/accelerate/3a82b056cf85b16976ca2760615897fe65ae5e64/docs/source/imgs/accelerate_logo.png
--------------------------------------------------------------------------------
/docs/source/imgs/course_banner.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/accelerate/3a82b056cf85b16976ca2760615897fe65ae5e64/docs/source/imgs/course_banner.png
--------------------------------------------------------------------------------
/docs/source/imgs/profile_export.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/accelerate/3a82b056cf85b16976ca2760615897fe65ae5e64/docs/source/imgs/profile_export.png
--------------------------------------------------------------------------------
/docs/source/package_reference/accelerator.md:
--------------------------------------------------------------------------------
1 |
15 |
16 | # Accelerator
17 |
18 | The [`Accelerator`] is the main class for enabling distributed training on any type of training setup. Read the [Add Accelerator to your code](../basic_tutorials/migration) tutorial to learn more about how to add the [`Accelerator`] to your script.
19 |
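A minimal training-loop sketch (the `model`, `optimizer`, `dataloader`, and `loss_fn` names are illustrative and assumed to be defined already) looks roughly like this:

```py
from accelerate import Accelerator

accelerator = Accelerator()
# Wrap the core training objects so the same script runs on any distributed setup
model, optimizer, dataloader = accelerator.prepare(model, optimizer, dataloader)

for batch in dataloader:
    optimizer.zero_grad()
    outputs = model(batch)
    loss = loss_fn(outputs)
    # Use accelerator.backward() in place of loss.backward()
    accelerator.backward(loss)
    optimizer.step()
```
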
20 | ## Accelerator[[api]]
21 |
22 | [[autodoc]] Accelerator
23 |
24 | ## Utilities
25 |
26 | [[autodoc]] accelerate.utils.gather_object
27 |
--------------------------------------------------------------------------------
/docs/source/package_reference/big_modeling.md:
--------------------------------------------------------------------------------
1 |
15 |
16 | # Working with large models
17 |
18 | ## Dispatch and offload
19 |
20 | ### init_empty_weights
21 |
22 | [[autodoc]] big_modeling.init_empty_weights
23 |
24 | ### cpu_offload
25 |
26 | [[autodoc]] big_modeling.cpu_offload
27 |
28 | ### cpu_offload_with_hook
29 |
30 | [[autodoc]] big_modeling.cpu_offload_with_hook
31 |
32 | ### disk_offload
33 |
34 | [[autodoc]] big_modeling.disk_offload
35 |
36 | ### dispatch_model
37 |
38 | [[autodoc]] big_modeling.dispatch_model
39 |
40 | ### load_checkpoint_and_dispatch
41 |
42 | [[autodoc]] big_modeling.load_checkpoint_and_dispatch
43 |
44 | ### load_checkpoint_in_model
45 |
46 | [[autodoc]] big_modeling.load_checkpoint_in_model
47 |
48 | ### infer_auto_device_map
49 |
50 | [[autodoc]] utils.infer_auto_device_map
51 |
52 | ## Hooks
53 |
54 | ### ModelHook
55 |
56 | [[autodoc]] hooks.ModelHook
57 |
58 | ### AlignDevicesHook
59 |
60 | [[autodoc]] hooks.AlignDevicesHook
61 |
62 | ### SequentialHook
63 |
64 | [[autodoc]] hooks.SequentialHook
65 |
66 | ### LayerwiseCastingHook
67 |
68 | [[autodoc]] hooks.LayerwiseCastingHook
69 |
70 | ## Adding Hooks
71 |
72 | ### add_hook_to_module
73 |
74 | [[autodoc]] hooks.add_hook_to_module
75 |
76 | ### attach_execution_device_hook
77 |
78 | [[autodoc]] hooks.attach_execution_device_hook
79 |
80 | ### attach_align_device_hook
81 |
82 | [[autodoc]] hooks.attach_align_device_hook
83 |
84 | ### attach_align_device_hook_on_blocks
85 |
86 | [[autodoc]] hooks.attach_align_device_hook_on_blocks
87 |
88 | ### attach_layerwise_casting_hooks
89 |
90 | [[autodoc]] big_modeling.attach_layerwise_casting_hooks
91 |
92 | ## Removing Hooks
93 |
94 | ### remove_hook_from_module
95 |
96 | [[autodoc]] hooks.remove_hook_from_module
97 |
98 | ### remove_hook_from_submodules
99 |
100 | [[autodoc]] hooks.remove_hook_from_submodules
101 |
102 | ## Utilities
103 |
104 | ### has_offloaded_params
105 |
106 | [[autodoc]] utils.has_offloaded_params
107 |
108 | ### align_module_device
109 |
110 | [[autodoc]] utils.align_module_device
111 |
--------------------------------------------------------------------------------
/docs/source/package_reference/deepspeed.md:
--------------------------------------------------------------------------------
1 |
15 |
16 | # DeepSpeed utilities
17 |
18 | ## DeepSpeedPlugin
19 |
20 | [[autodoc]] utils.DeepSpeedPlugin
21 |
22 | ## get_active_deepspeed_plugin
23 |
24 | [[autodoc]] utils.get_active_deepspeed_plugin
25 |
26 | ## DeepSpeedEngineWrapper
27 |
28 | [[autodoc]] utils.deepspeed.DeepSpeedEngineWrapper
29 |
30 | ## DeepSpeedOptimizerWrapper
31 |
32 | [[autodoc]] utils.deepspeed.DeepSpeedOptimizerWrapper
33 |
34 | ## DeepSpeedSchedulerWrapper
35 |
36 | [[autodoc]] utils.deepspeed.DeepSpeedSchedulerWrapper
37 |
38 | ## DummyOptim
39 |
40 | [[autodoc]] utils.deepspeed.DummyOptim
41 |
42 | ## DummyScheduler
43 |
44 | [[autodoc]] utils.deepspeed.DummyScheduler
--------------------------------------------------------------------------------
/docs/source/package_reference/fp8.md:
--------------------------------------------------------------------------------
1 |
15 |
16 | # FP8
17 |
18 | Below are the functions and classes related to the underlying FP8 implementation.
19 |
20 | ## FP8RecipeKwargs
21 |
22 | [[autodoc]] utils.FP8RecipeKwargs
23 |
24 | ## convert_model
25 |
26 | [[autodoc]] utils.convert_model
27 |
28 | ## has_transformer_engine_layers
29 |
30 | [[autodoc]] utils.has_transformer_engine_layers
31 |
32 | ## contextual_fp8_autocast
33 |
34 | [[autodoc]] utils.contextual_fp8_autocast
35 |
36 | ## apply_fp8_autowrap
37 |
38 | [[autodoc]] utils.apply_fp8_autowrap
39 |
--------------------------------------------------------------------------------
/docs/source/package_reference/fsdp.md:
--------------------------------------------------------------------------------
1 |
15 |
16 | # Fully Sharded Data Parallel utilities
17 |
18 | ## enable_fsdp_ram_efficient_loading
19 |
20 | [[autodoc]] utils.enable_fsdp_ram_efficient_loading
21 |
22 | ## disable_fsdp_ram_efficient_loading
23 |
24 | [[autodoc]] utils.disable_fsdp_ram_efficient_loading
25 |
26 | ## merge_fsdp_weights
27 |
28 | [[autodoc]] utils.merge_fsdp_weights
29 |
30 | ## FullyShardedDataParallelPlugin
31 |
32 | [[autodoc]] utils.FullyShardedDataParallelPlugin
33 |
34 | ## fsdp2_load_full_state_dict
35 |
36 | [[autodoc]] utils.fsdp2_load_full_state_dict
37 |
38 | ## fsdp2_switch_optimizer_parameters
39 |
40 | [[autodoc]] utils.fsdp2_switch_optimizer_parameters
41 |
42 | ## fsdp2_prepare_model
43 |
44 | [[autodoc]] utils.fsdp2_prepare_model
45 |
46 | ## fsdp2_prepare_auto_wrap_policy
47 |
48 | [[autodoc]] utils.fsdp2_prepare_auto_wrap_policy
--------------------------------------------------------------------------------
/docs/source/package_reference/inference.md:
--------------------------------------------------------------------------------
1 |
15 |
16 | # Pipeline parallelism
17 |
18 | Accelerate supports pipeline parallelism for large-scale training with the PyTorch [torch.distributed.pipelining](https://pytorch.org/docs/stable/distributed.pipelining.html) API.
19 |
20 | ## prepare_pippy
21 |
22 | [[autodoc]] inference.prepare_pippy
23 |
--------------------------------------------------------------------------------
/docs/source/package_reference/kwargs.md:
--------------------------------------------------------------------------------
1 |
15 |
16 | # Kwargs handlers
17 |
18 | The following objects can be passed to the main [`Accelerator`] to customize how some PyTorch objects
19 | related to distributed training or mixed precision are created.
20 |
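For example, a minimal sketch of passing a kwargs handler when creating the [`Accelerator`]:

```py
from accelerate import Accelerator, DistributedDataParallelKwargs

# Customize how the underlying torch DDP wrapper is constructed
ddp_kwargs = DistributedDataParallelKwargs(find_unused_parameters=True)
accelerator = Accelerator(kwargs_handlers=[ddp_kwargs])
```
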
21 | ## AutocastKwargs
22 |
23 | [[autodoc]] AutocastKwargs
24 |
25 | ## DistributedDataParallelKwargs
26 |
27 | [[autodoc]] DistributedDataParallelKwargs
28 |
29 | ## FP8RecipeKwargs
30 |
31 | [[autodoc]] utils.FP8RecipeKwargs
32 |
33 | ## ProfileKwargs
34 |
35 | [[autodoc]] utils.ProfileKwargs
36 |
37 | ## GradScalerKwargs
38 |
39 | [[autodoc]] GradScalerKwargs
40 |
41 | ## InitProcessGroupKwargs
42 |
43 | [[autodoc]] InitProcessGroupKwargs
44 |
45 | ## KwargsHandler
46 |
47 | [[autodoc]] utils.KwargsHandler
48 |
--------------------------------------------------------------------------------
/docs/source/package_reference/launchers.md:
--------------------------------------------------------------------------------
1 |
15 |
16 | # Launchers
17 |
18 | Functions for launching training on distributed processes.
19 |
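For instance, a minimal sketch of launching a (hypothetical) `training_loop` function from a notebook on two processes:

```py
from accelerate import notebook_launcher

def training_loop(mixed_precision="fp16"):
    ...  # your usual Accelerate training code goes here

notebook_launcher(training_loop, args=("fp16",), num_processes=2)
```
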
20 | ## notebook_launcher
21 |
22 | [[autodoc]] accelerate.notebook_launcher
23 |
24 | ## debug_launcher
25 |
26 | [[autodoc]] accelerate.debug_launcher
--------------------------------------------------------------------------------
/docs/source/package_reference/logging.md:
--------------------------------------------------------------------------------
1 |
15 |
16 | # Logging
17 |
18 | Refer to the [Troubleshooting guide](../usage_guides/troubleshooting#logging) or to the example below to learn
19 | how to use Accelerate's logger.
20 |
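A minimal sketch of the pattern:

```py
from accelerate.logging import get_logger

logger = get_logger(__name__)
# Only logged once, on the main process, rather than once per process
logger.info("Starting training", main_process_only=True)
```
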
21 | [[autodoc]] logging.get_logger
--------------------------------------------------------------------------------
/docs/source/package_reference/megatron_lm.md:
--------------------------------------------------------------------------------
1 |
15 |
16 | # Megatron-LM utilities
17 |
18 | ## MegatronLMPlugin
19 |
20 | [[autodoc]] utils.MegatronLMPlugin
21 |
22 | ## MegatronLMDummyScheduler
23 |
24 | [[autodoc]] utils.MegatronLMDummyScheduler
25 |
26 | ## MegatronLMDummyDataLoader
27 |
28 | [[autodoc]] utils.MegatronLMDummyDataLoader
29 |
30 | ## AbstractTrainStep
31 |
32 | [[autodoc]] utils.AbstractTrainStep
33 |
34 | ## GPTTrainStep
35 |
36 | [[autodoc]] utils.GPTTrainStep
37 |
38 | ## BertTrainStep
39 |
40 | [[autodoc]] utils.BertTrainStep
41 |
42 | ## T5TrainStep
43 |
44 | [[autodoc]] utils.T5TrainStep
45 |
46 | ## avg_losses_across_data_parallel_group
47 |
48 | [[autodoc]] utils.avg_losses_across_data_parallel_group
49 |
--------------------------------------------------------------------------------
/docs/source/package_reference/state.md:
--------------------------------------------------------------------------------
1 |
15 |
16 | # Stateful Classes
17 |
18 | Below are variations of a [singleton class](https://en.wikipedia.org/wiki/Singleton_pattern) in the sense that all
19 | instances share the same state, which is initialized on the first instantiation.
20 |
21 | These classes are immutable and store information about certain configurations or
22 | states.
23 |
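For example (a minimal sketch), any two `PartialState` instances created in your code are views of the same underlying state:

```py
from accelerate import PartialState

state_a = PartialState()
state_b = PartialState()
# Both instances share the same singleton state
assert state_a.device == state_b.device
assert state_a.process_index == state_b.process_index
```
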
24 | ## PartialState
25 |
26 | [[autodoc]] state.PartialState
27 |
28 | ## AcceleratorState
29 |
30 | [[autodoc]] state.AcceleratorState
31 |
32 | ## GradientState
33 |
34 | [[autodoc]] state.GradientState
--------------------------------------------------------------------------------
/docs/source/package_reference/torch_wrappers.md:
--------------------------------------------------------------------------------
1 |
15 |
16 | # DataLoaders, Optimizers, and Schedulers
17 |
18 | The internal classes Accelerate uses to prepare objects for distributed training
19 | when calling [`~Accelerator.prepare`].
20 |
21 | ## DataLoader utilities
22 |
23 | [[autodoc]] data_loader.prepare_data_loader
24 | [[autodoc]] data_loader.skip_first_batches
25 |
26 | ## BatchSamplerShard
27 |
28 | [[autodoc]] data_loader.BatchSamplerShard
29 |
30 | ## IterableDatasetShard
31 |
32 | [[autodoc]] data_loader.IterableDatasetShard
33 |
34 | ## DataLoaderShard
35 |
36 | [[autodoc]] data_loader.DataLoaderShard
37 |
38 | ## DataLoaderDispatcher
39 |
40 | [[autodoc]] data_loader.DataLoaderDispatcher
41 |
42 | ## AcceleratedOptimizer
43 |
44 | [[autodoc]] optimizer.AcceleratedOptimizer
45 |
46 | ## AcceleratedScheduler
47 |
48 | [[autodoc]] scheduler.AcceleratedScheduler
--------------------------------------------------------------------------------
/docs/source/package_reference/tracking.md:
--------------------------------------------------------------------------------
1 |
15 |
16 | # Experiment Trackers
17 |
18 | ## GeneralTracker
19 |
20 | [[autodoc]] tracking.GeneralTracker
21 |
22 | ## TensorBoardTracker
23 |
24 | [[autodoc]] tracking.TensorBoardTracker
25 | - __init__
26 |
27 | ## WandBTracker
28 |
29 | [[autodoc]] tracking.WandBTracker
30 | - __init__
31 |
32 | ## CometMLTracker
33 |
34 | [[autodoc]] tracking.CometMLTracker
35 | - __init__
36 |
37 | ## AimTracker
38 |
39 | [[autodoc]] tracking.AimTracker
40 | - __init__
41 |
42 | ## MLflowTracker
43 |
44 | [[autodoc]] tracking.MLflowTracker
45 | - __init__
46 |
47 | ## ClearMLTracker
48 |
49 | [[autodoc]] tracking.ClearMLTracker
50 | - __init__
51 |
--------------------------------------------------------------------------------
/docs/source/usage_guides/explore.md:
--------------------------------------------------------------------------------
1 |
15 |
16 | # Start Here!
17 |
18 | Use the interactive tool below to get started learning about a particular
19 | feature of Accelerate and how to utilize it! It provides a code diff, an explanation
20 | of what is going on, as well as some useful links to explore further within
21 | the documentation!
22 |
23 | Most code examples start from the following Python code before integrating Accelerate in some way:
24 |
25 | ```python
26 | for batch in dataloader:
27 | optimizer.zero_grad()
28 | inputs, targets = batch
29 | inputs = inputs.to(device)
30 | targets = targets.to(device)
31 | outputs = model(inputs)
32 | loss = loss_function(outputs, targets)
33 | loss.backward()
34 | optimizer.step()
35 | scheduler.step()
36 | ```
37 |
38 |
39 |
44 |
45 |
46 |
51 |
52 |
--------------------------------------------------------------------------------
/docs/source/usage_guides/gaudi.md:
--------------------------------------------------------------------------------
1 |
15 |
16 | # Intel Gaudi
17 |
18 | Users can take advantage of Intel Gaudi AI accelerators for significantly faster and more cost-effective model training and inference.
19 | The Intel Gaudi AI accelerator family currently includes three product generations: [Intel Gaudi 1](https://habana.ai/products/gaudi/), [Intel Gaudi 2](https://habana.ai/products/gaudi2/), and [Intel Gaudi 3](https://habana.ai/products/gaudi3/). Each server is equipped with 8 devices, known as Habana Processing Units (HPUs), providing 128GB of memory on Gaudi 3, 96GB on Gaudi 2, and 32GB on the first-gen Gaudi. For more details on the underlying hardware architecture, check out the [Gaudi Architecture Overview](https://docs.habana.ai/en/latest/Gaudi_Overview/Gaudi_Architecture.html).
20 |
21 | ## How it works out of the box
22 |
23 | Gaudi support is enabled by default if an Intel Gaudi device is detected.
24 | To disable it, pass the `--cpu` flag to the `accelerate launch` command or answer the corresponding question in the `accelerate config` questionnaire.
25 |
26 | You can directly run the following script to test it out on Intel Gaudi:
27 |
28 | ```bash
29 | accelerate launch /examples/cv_example.py --data_dir images
30 | ```
31 |
32 | ## Limitations
33 |
34 | The following features are not part of the Accelerate library and require [Optimum for Intel Gaudi](https://huggingface.co/docs/optimum/main/en/habana/index):
35 |
36 | - `fast_ddp` which implements DDP by applying an all-reduce on gradients instead of the Torch DDP wrapper.
37 | - `minimize_memory` which is used for fp8 training and enables keeping fp8 weights in memory between the forward and backward passes, leading to a smaller memory footprint at the cost of additional fp8 casts.
38 | - `context_parallel_size` which is used for Context/Sequence Parallelism (CP/SP) and partitions the network inputs and activations along sequence dimension to reduce memory footprint and increase throughput.
39 |
--------------------------------------------------------------------------------
/docs/source/usage_guides/mps.md:
--------------------------------------------------------------------------------
1 |
15 |
16 | # Accelerated PyTorch Training on Mac
17 |
18 | With the PyTorch v1.12 release, developers and researchers can take advantage of Apple silicon GPUs for significantly faster model training.
19 | This unlocks the ability to perform machine learning workflows like prototyping and fine-tuning locally, right on your Mac.
20 | Apple's Metal Performance Shaders (MPS) backend for PyTorch enables this and can be used via the new `"mps"` device.
21 | It maps computational graphs and primitives onto the MPS Graph framework and the tuned kernels provided by MPS.
22 | For more information, please refer to the official documentation: [Introducing Accelerated PyTorch Training on Mac](https://pytorch.org/blog/introducing-accelerated-pytorch-training-on-mac/)
23 | and [MPS backend](https://pytorch.org/docs/stable/notes/mps.html).
24 |
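Before training, you can quickly verify (a minimal sketch using standard PyTorch APIs) that the `mps` device is available and usable:

```python
import torch

if torch.backends.mps.is_available():
    device = torch.device("mps")
    x = torch.ones(3, device=device)
    print(x * 2)  # tensor([2., 2., 2.], device='mps:0')
else:
    print("MPS device not available; check your PyTorch build and macOS version")
```
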
25 | ### Benefits of Training and Inference using Apple Silicon Chips
26 |
27 | 1. Enables users to train larger networks or batch sizes locally
28 | 2. Reduces data retrieval latency and gives the GPU direct access to the full memory store thanks to the unified memory architecture,
29 | thereby improving end-to-end performance.
30 | 3. Reduces costs associated with cloud-based development or the need for additional local GPUs.
31 |
32 | **Pre-requisites**: To install torch with MPS support,
33 | please follow this Medium article: [GPU-Acceleration Comes to PyTorch on M1 Macs](https://medium.com/towards-data-science/gpu-acceleration-comes-to-pytorch-on-m1-macs-195c399efcc1).
34 |
35 |
36 | ## How it works out of the box
37 | It is enabled by default on macOS machines with MPS-enabled Apple Silicon GPUs.
38 | To disable it, pass the `--cpu` flag to the `accelerate launch` command or answer the corresponding question in the `accelerate config` questionnaire.
39 |
40 | You can directly run the following script to test it out on MPS enabled Apple Silicon machines:
41 | ```bash
42 | accelerate launch /examples/cv_example.py --data_dir images
43 | ```
44 |
45 | ## A few caveats to be aware of
46 |
47 | 1. Distributed setups `gloo` and `nccl` do not work with the `mps` device.
48 | This means that currently only a single GPU of the `mps` device type can be used.
49 |
50 | Finally, please remember that `Accelerate` only integrates the MPS backend, so if you
51 | have any problems or questions regarding MPS backend usage, please file an issue on [PyTorch GitHub](https://github.com/pytorch/pytorch/issues).
--------------------------------------------------------------------------------
/examples/config_yaml_templates/README.md:
--------------------------------------------------------------------------------
1 | # Config Zoo
2 |
3 | This folder contains a variety of minimal configurations for `Accelerate`, each achieving a certain goal. You can use these
4 | config YAMLs directly, or build off of them for your own YAMLs.
5 |
6 | These are highly annotated versions, aiming to teach you what each section does.
7 |
8 | Each config can be run via `accelerate launch --config_file {file} run_me.py`
9 |
10 | `run_me.py` will then print out how the current environment is set up (the contents of the `AcceleratorState`).
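
For example, to run the FSDP template from this folder:

```bash
accelerate launch --config_file fsdp.yaml run_me.py
```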
--------------------------------------------------------------------------------
/examples/config_yaml_templates/deepspeed.yaml:
--------------------------------------------------------------------------------
1 | # Similar to FSDP, we set the distributed type as DEEPSPEED
2 | distributed_type: DEEPSPEED
3 | # With DeepSpeed, we utilize a deepspeed config file for the entire configuration
4 | deepspeed_config:
5 | # Can also be any of the config json's in accelerate/examples/deepspeed_config_templates
6 | deepspeed_config_file: ../deepspeed_config_templates/zero_stage1_config.json
7 | # If using ZeRO-3 and wanting to load big models in, this should be set to `true` so
8 | # `transformers` uses the right `init` function
9 | zero3_init_flag: false # true
10 |
11 | # Finally we need to specify the number of GPUs to use
12 | num_processes: 2
13 | # Optionally we can set the mixed precision now instead of in the deepspeed config file,
14 | # however this requires the `fp16` and `bf16` options to be set to `auto` in the deepspeed config file
15 | # mixed_precision: "bf16"
16 |
--------------------------------------------------------------------------------
/examples/config_yaml_templates/fp8.yaml:
--------------------------------------------------------------------------------
1 | # This config template simply sets up the TransformerEngine config (and a config for a single GPU),
2 | # this can interop with the other configs in this folder
3 | distributed_type: "NO"
4 | mixed_precision: "fp8"
5 | # Then we specify the fp8 configuration:
6 | fp8_config:
7 | backend: TE # Can be TE | MS-AMP
8 | # The following are TE specific arguments.
9 | # See https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/api/common.html#common-api for more details
10 | amax_history_len: 1024
11 | fp8_format: E4M3
12 | interval: 1
13 | margin: 0
14 | override_linear_precision: (false, false, false)
15 | # Generally this should always be set to `false` to have the most realistic fp8 eval performance
16 | use_autocast_during_eval: false
17 | # If using MS-AMP, we ignore all of the prior settings and set an opt_level
18 | #opt_level: O1
--------------------------------------------------------------------------------
/examples/config_yaml_templates/fsdp.yaml:
--------------------------------------------------------------------------------
1 | # Since we are doing FSDP (even though it's multi-GPU), we need to specify the distributed type as FSDP
2 | distributed_type: FSDP
3 | # Can be one of "no", "fp16", or "bf16" (see `fp8.yaml` for `fp8`, but it works for FSDP as well)
4 | mixed_precision: 'bf16'
5 | # Specify the number of GPUs to use
6 | num_processes: 2
7 | # Then we can specify the FSDP config
8 | fsdp_config:
9 | fsdp_activation_checkpointing: false
10 | fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
11 | fsdp_backward_prefetch: BACKWARD_PRE
12 | fsdp_cpu_ram_efficient_loading: true
13 | fsdp_forward_prefetch: false
14 | fsdp_offload_params: false
15 | fsdp_sharding_strategy: FULL_SHARD
16 | fsdp_state_dict_type: SHARDED_STATE_DICT
17 | fsdp_sync_module_states: true
18 | fsdp_use_orig_params: true
19 |
--------------------------------------------------------------------------------
/examples/config_yaml_templates/multi_gpu.yaml:
--------------------------------------------------------------------------------
1 | # Specify distributed_type as `MULTI_GPU` for DDP
2 | distributed_type: "MULTI_GPU"
3 | # Can be one of "no", "fp16", or "bf16" (see `fp8.yaml` for `fp8`)
4 | mixed_precision: "bf16"
5 | # Specify the number of GPUs to use
6 | num_processes: 2
--------------------------------------------------------------------------------
/examples/config_yaml_templates/multi_node.yaml:
--------------------------------------------------------------------------------
1 | # This config template is for a multi-node setup. This assumes DDP, but can be interop'd with the other configs in this folder
2 | # Generally it's recommended to look at the SLURM config template for a more robust multi-node setup
3 | distributed_type: MULTI_GPU
4 | # We need to specify the current machine's rank
5 | machine_rank: 0
6 | # We then need to specify the IP address and port of the main process
7 | main_process_ip: '1234'
8 | main_process_port: 9999
9 | # We need to specify the number of machines
10 | num_machines: 2
11 | # We need to specify the *total* number of processes
12 | num_processes: 8
13 | # And then we need to specify how rendezvous (rdzv) comms will be handled
14 | rdzv_backend: static # or c10d
15 | # Whether the compute nodes are on the same network (for cloud setups this will more than likely be false)
16 | same_network: false
17 |
--------------------------------------------------------------------------------
/examples/config_yaml_templates/run_me.py:
--------------------------------------------------------------------------------
1 | # Copyright 2024 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | """
16 | A base script which outputs the accelerate config for the given environment
17 | """
18 |
19 | from accelerate import Accelerator
20 |
21 |
22 | accelerator = Accelerator()
23 |
24 | accelerator.print(f"Accelerator state from the current environment:\n{accelerator.state}")
25 | if accelerator.fp8_recipe_handler is not None:
26 | accelerator.print(f"FP8 config:\n{accelerator.fp8_recipe_handler}")
27 | accelerator.end_training()
28 |
--------------------------------------------------------------------------------
/examples/config_yaml_templates/single_gpu.yaml:
--------------------------------------------------------------------------------
1 | # Since this is single GPU, we don't need distributed training
2 | distributed_type: "NO"
3 | # Can be one of "no", "fp16", or "bf16" (see `fp8.yaml` for `fp8`)
4 | mixed_precision: "bf16"
--------------------------------------------------------------------------------
/examples/deepspeed_config_templates/zero_stage1_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "fp16": {
3 | "enabled": true,
4 | "loss_scale": 0,
5 | "loss_scale_window": 1000,
6 | "initial_scale_power": 16,
7 | "hysteresis": 2,
8 | "min_loss_scale": 1
9 | },
10 | "optimizer": {
11 | "type": "AdamW",
12 | "params": {
13 | "lr": "auto",
14 | "weight_decay": "auto",
15 | "torch_adam": true,
16 | "adam_w_mode": true
17 | }
18 | },
19 | "scheduler": {
20 | "type": "WarmupDecayLR",
21 | "params": {
22 | "warmup_min_lr": "auto",
23 | "warmup_max_lr": "auto",
24 | "warmup_num_steps": "auto",
25 | "total_num_steps": "auto"
26 | }
27 | },
28 | "zero_optimization": {
29 | "stage": 1,
30 | "allgather_partitions": true,
31 | "allgather_bucket_size": 2e8,
32 | "overlap_comm": true,
33 | "reduce_scatter": true,
34 | "reduce_bucket_size": "auto",
35 | "contiguous_gradients": true
36 | },
37 | "gradient_accumulation_steps": 1,
38 | "gradient_clipping": "auto",
39 | "steps_per_print": 2000,
40 | "train_batch_size": "auto",
41 | "train_micro_batch_size_per_gpu": "auto",
42 | "wall_clock_breakdown": false
43 | }
--------------------------------------------------------------------------------
/examples/deepspeed_config_templates/zero_stage2_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "fp16": {
3 | "enabled": true,
4 | "loss_scale": 0,
5 | "loss_scale_window": 1000,
6 | "initial_scale_power": 16,
7 | "hysteresis": 2,
8 | "min_loss_scale": 1
9 | },
10 | "optimizer": {
11 | "type": "AdamW",
12 | "params": {
13 | "lr": "auto",
14 | "weight_decay": "auto",
15 | "torch_adam": true,
16 | "adam_w_mode": true
17 | }
18 | },
19 | "scheduler": {
20 | "type": "WarmupDecayLR",
21 | "params": {
22 | "warmup_min_lr": "auto",
23 | "warmup_max_lr": "auto",
24 | "warmup_num_steps": "auto",
25 | "total_num_steps": "auto"
26 | }
27 | },
28 | "zero_optimization": {
29 | "stage": 2,
30 | "allgather_partitions": true,
31 | "allgather_bucket_size": 2e8,
32 | "overlap_comm": true,
33 | "reduce_scatter": true,
34 | "reduce_bucket_size": "auto",
35 | "contiguous_gradients": true
36 | },
37 | "gradient_accumulation_steps": 1,
38 | "gradient_clipping": "auto",
39 | "steps_per_print": 2000,
40 | "train_batch_size": "auto",
41 | "train_micro_batch_size_per_gpu": "auto",
42 | "wall_clock_breakdown": false
43 | }
--------------------------------------------------------------------------------
/examples/deepspeed_config_templates/zero_stage2_offload_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "fp16": {
3 | "enabled": true,
4 | "loss_scale": 0,
5 | "loss_scale_window": 1000,
6 | "initial_scale_power": 16,
7 | "hysteresis": 2,
8 | "min_loss_scale": 1
9 | },
10 | "optimizer": {
11 | "type": "AdamW",
12 | "params": {
13 | "lr": "auto",
14 | "weight_decay": "auto",
15 | "torch_adam": true,
16 | "adam_w_mode": true
17 | }
18 | },
19 | "scheduler": {
20 | "type": "WarmupDecayLR",
21 | "params": {
22 | "warmup_min_lr": "auto",
23 | "warmup_max_lr": "auto",
24 | "warmup_num_steps": "auto",
25 | "total_num_steps": "auto"
26 | }
27 | },
28 | "zero_optimization": {
29 | "stage": 2,
30 | "offload_optimizer": {
31 | "device": "cpu",
32 | "pin_memory": true
33 | },
34 | "allgather_partitions": true,
35 | "allgather_bucket_size": 2e8,
36 | "overlap_comm": true,
37 | "reduce_scatter": true,
38 | "reduce_bucket_size": "auto",
39 | "contiguous_gradients": true
40 | },
41 | "gradient_accumulation_steps": 1,
42 | "gradient_clipping": "auto",
43 | "steps_per_print": 2000,
44 | "train_batch_size": "auto",
45 | "train_micro_batch_size_per_gpu": "auto",
46 | "wall_clock_breakdown": false
47 | }
--------------------------------------------------------------------------------
/examples/deepspeed_config_templates/zero_stage3_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "fp16": {
3 | "enabled": true,
4 | "loss_scale": 0,
5 | "loss_scale_window": 1000,
6 | "initial_scale_power": 16,
7 | "hysteresis": 2,
8 | "min_loss_scale": 1
9 | },
10 | "optimizer": {
11 | "type": "AdamW",
12 | "params": {
13 | "lr": "auto",
14 | "weight_decay": "auto"
15 | }
16 | },
17 | "scheduler": {
18 | "type": "WarmupDecayLR",
19 | "params": {
20 | "warmup_min_lr": "auto",
21 | "warmup_max_lr": "auto",
22 | "warmup_num_steps": "auto",
23 | "total_num_steps": "auto"
24 | }
25 | },
26 | "zero_optimization": {
27 | "stage": 3,
28 | "overlap_comm": true,
29 | "contiguous_gradients": true,
30 | "reduce_bucket_size": "auto",
31 | "stage3_prefetch_bucket_size": "auto",
32 | "stage3_param_persistence_threshold": "auto",
33 | "sub_group_size": 1e9,
34 | "stage3_max_live_parameters": 1e9,
35 | "stage3_max_reuse_distance": 1e9,
36 | "stage3_gather_16bit_weights_on_model_save": "auto"
37 | },
38 | "gradient_accumulation_steps": 1,
39 | "gradient_clipping": "auto",
40 | "steps_per_print": 2000,
41 | "train_batch_size": "auto",
42 | "train_micro_batch_size_per_gpu": "auto",
43 | "wall_clock_breakdown": false
44 | }
--------------------------------------------------------------------------------
/examples/deepspeed_config_templates/zero_stage3_offload_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "fp16": {
3 | "enabled": true,
4 | "loss_scale": 0,
5 | "loss_scale_window": 1000,
6 | "initial_scale_power": 16,
7 | "hysteresis": 2,
8 | "min_loss_scale": 1
9 | },
10 | "optimizer": {
11 | "type": "AdamW",
12 | "params": {
13 | "lr": "auto",
14 | "weight_decay": "auto"
15 | }
16 | },
17 | "scheduler": {
18 | "type": "WarmupDecayLR",
19 | "params": {
20 | "warmup_min_lr": "auto",
21 | "warmup_max_lr": "auto",
22 | "warmup_num_steps": "auto",
23 | "total_num_steps": "auto"
24 | }
25 | },
26 | "zero_optimization": {
27 | "stage": 3,
28 | "offload_optimizer": {
29 | "device": "cpu",
30 | "pin_memory": true
31 | },
32 | "offload_param": {
33 | "device": "cpu",
34 | "pin_memory": true
35 | },
36 | "overlap_comm": true,
37 | "contiguous_gradients": true,
38 | "reduce_bucket_size": "auto",
39 | "stage3_prefetch_bucket_size": "auto",
40 | "stage3_param_persistence_threshold": "auto",
41 | "sub_group_size": 1e9,
42 | "stage3_max_live_parameters": 1e9,
43 | "stage3_max_reuse_distance": 1e9,
44 | "stage3_gather_16bit_weights_on_model_save": "auto"
45 | },
46 | "gradient_accumulation_steps": 1,
47 | "gradient_clipping": "auto",
48 | "steps_per_print": 2000,
49 | "train_batch_size": "auto",
50 | "train_micro_batch_size_per_gpu": "auto",
51 | "wall_clock_breakdown": false
52 | }
--------------------------------------------------------------------------------
/examples/inference/distributed/README.md:
--------------------------------------------------------------------------------
1 | # Distributed inference examples
2 |
3 | This folder contains a variety of tutorials for running distributed inference with the following strategy:
4 |
5 | Load an entire model onto each GPU and send chunks of a batch through each GPU's model copy at a time.
6 |
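In code, that pattern boils down to something like the sketch below (adapted from `stable_diffusion.py` in this folder; the prompts are illustrative):

```python
import torch
from diffusers import DiffusionPipeline

from accelerate import PartialState

pipe = DiffusionPipeline.from_pretrained(
    "stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16
)
distributed_state = PartialState()
pipe.to(distributed_state.device)  # every process holds a full copy of the model

# Each process only receives and runs its own chunk of the prompt list
with distributed_state.split_between_processes(["a dog", "a cat", "a chicken", "a horse"]) as prompts:
    images = pipe(prompts).images
```
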
7 | ## Installation
8 |
9 | ```bash
10 | pip install accelerate torch
11 | ```
12 |
13 | ## Running code
14 |
15 | You can run each script with either `torchrun` or the recommended `accelerate launch` (no need to run `accelerate config` first):
16 |
17 | ```bash
18 | accelerate launch --num_processes {NUM_GPUS} phi2.py
19 | ```
20 |
21 | Or:
22 |
23 | ```bash
24 | torchrun --nproc-per-node {NUM_GPUS} phi2.py
25 | ```
26 |
--------------------------------------------------------------------------------
/examples/inference/distributed/stable_diffusion.py:
--------------------------------------------------------------------------------
1 | # Copyright 2024 The HuggingFace Inc. team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import torch
16 | from diffusers import DiffusionPipeline
17 |
18 | from accelerate import PartialState # Can also be Accelerator or AcceleratorState
19 |
20 |
21 | pipe = DiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16)
22 | distributed_state = PartialState()
23 | pipe.to(distributed_state.device)
24 |
25 | # Assume two processes
26 | # On the first GPU, the prompts will be ["a dog", "a cat"],
27 | # and on the second GPU it will be ["a chicken", "a chicken"].
28 | # Make sure to drop the final sample, as it will be a duplicate of the previous one.
29 | with distributed_state.split_between_processes(["a dog", "a cat", "a chicken"], apply_padding=True) as prompt:
30 | result = pipe(prompt).images
31 |
--------------------------------------------------------------------------------
/examples/inference/pippy/README.md:
--------------------------------------------------------------------------------
1 | # Distributed inference examples with PiPPy
2 |
3 | This folder contains a variety of tutorials for using the [PiPPy](https://github.com/PyTorch/PiPPy) pipeline parallelism library with accelerate. You will find examples covering:
4 |
5 | 1. How to trace the model using `accelerate.prepare_pippy`
6 | 2. How to specify inputs based on what the model expects (when to use `kwargs`, `args`, and such)
7 | 3. How to gather the results at the end (a condensed sketch follows below).
8 |
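In rough outline, a condensed sketch of what `bert.py` in this folder does:

```python
import torch
from transformers import AutoModelForMaskedLM

from accelerate import PartialState, prepare_pippy

model = AutoModelForMaskedLM.from_pretrained("bert-base-uncased").eval()

# 1. Trace the model with example inputs and split it across the available GPUs
example_input = torch.randint(0, model.config.vocab_size, (1, 512))
model = prepare_pippy(model, split_points="auto", example_args=(example_input,))

# 2. Real inputs are shaped for the number of processes (here 2) and placed on the first device
inputs = torch.randint(0, model.config.vocab_size, (2, 512)).to("cuda:0")
with torch.no_grad():
    output = model(inputs)

# 3. The outputs only live on the last process unless `gather_output=True` was passed
if PartialState().is_last_process:
    print(output)
```
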
9 | ## Installation
10 |
11 | This requires the `main` branch of accelerate (or a version of at least 0.27.0), a `pippy` version of 0.2.0 or greater, and at least Python 3.9. Please install using `pip install .` to pull from the `setup.py` in this repo, or run manually:
12 |
13 | ```bash
14 | pip install 'accelerate>=0.27.0' 'torchpippy>=0.2.0'
15 | ```
16 |
17 | ## Running code
18 |
19 | You can run each script with either `torchrun` or the recommended `accelerate launch` (no need to run `accelerate config` first):
20 |
21 | ```bash
22 | accelerate launch bert.py
23 | ```
24 |
25 | Or:
26 |
27 | ```bash
28 | accelerate launch --num_processes {NUM_GPUS} bert.py
29 | ```
30 |
31 | Or:
32 |
33 | ```bash
34 | torchrun --nproc-per-node {NUM_GPUS} bert.py
35 | ```
36 |
37 | ## General speedups
38 |
39 | One can expect PiPPy to outperform naive (sequential) model parallelism by a multiplicative factor, since all GPUs are kept busy with inputs at all times, rather than one input being passed through one GPU at a time while the others wait for the prior stage to finish.
40 |
41 | Below are some benchmarks we have found when using the accelerate-pippy integration for a few models when running on 2x 4090s:
42 |
43 | ### Bert
44 |
45 | | | Accelerate/Sequential | PiPPy + Accelerate |
46 | |---|---|---|
47 | | First batch | 0.2137s | 0.3119s |
48 | | Average of 5 batches | 0.0099s | **0.0062s** |
49 |
50 | ### GPT2
51 |
52 | | | Accelerate/Sequential | PiPPy + Accelerate |
53 | |---|---|---|
54 | | First batch | 0.1959s | 0.4189s |
55 | | Average of 5 batches | 0.0205s | **0.0126s** |
56 |
57 | ### T5
58 |
59 | | | Accelerate/Sequential | PiPPy + Accelerate |
60 | |---|---|---|
61 | | First batch | 0.2789s | 0.3809s |
62 | | Average of 5 batches | 0.0198s | **0.0166s** |
--------------------------------------------------------------------------------
/examples/inference/pippy/bert.py:
--------------------------------------------------------------------------------
1 | # Copyright 2024 The HuggingFace Inc. team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | import time
15 |
16 | import torch
17 | from transformers import AutoModelForMaskedLM
18 |
19 | from accelerate import PartialState, prepare_pippy
20 | from accelerate.test_utils import torch_device
21 | from accelerate.utils import set_seed
22 |
23 |
24 | synchronize_func = getattr(torch, torch_device, torch.cuda).synchronize
25 |
26 | # Set the random seed to have reproducible outputs
27 | set_seed(42)
28 |
29 | # Create an example model
30 | model = AutoModelForMaskedLM.from_pretrained("bert-base-uncased")
31 | model.eval()
32 |
33 | # Input configs
34 | # Create example inputs for the model
35 | input = torch.randint(
36 | low=0,
37 | high=model.config.vocab_size,
38 | size=(1, 512), # bs x seq_len
39 | device="cpu",
40 | dtype=torch.int64,
41 | requires_grad=False,
42 | )
43 |
44 |
45 | # Create a pipeline stage from the model
46 | # Using `auto` is equivalent to letting `device_map="auto"` figure
47 | # out device mapping and will also split the model according to the
48 | # number of total GPUs available if it fits on one GPU
49 | model = prepare_pippy(model, split_points="auto", example_args=(input,))
50 |
51 | # You can pass `gather_output=True` to have the output from the model
52 | # available on all GPUs
53 | # model = prepare_pippy(model, split_points="auto", example_args=(input,), gather_output=True)
54 |
55 | # Create new inputs of the expected size (n_processes)
56 | input = torch.randint(
57 | low=0,
58 | high=model.config.vocab_size,
59 | size=(2, 512), # bs x seq_len
60 | device="cpu",
61 | dtype=torch.int64,
62 | requires_grad=False,
63 | )
64 |
65 | # Move the inputs to the first device
66 | input = input.to(torch_device)
67 |
68 | # Take an average of 5 times
69 | # Measure first batch
70 | synchronize_func()
71 | start_time = time.time()
72 | with torch.no_grad():
73 | output = model(input)
74 | synchronize_func()
75 | end_time = time.time()
76 | first_batch = end_time - start_time
77 |
78 | # Now that the device/backend is init, measure after
79 | synchronize_func()
80 | start_time = time.time()
81 | for i in range(5):
82 | with torch.no_grad():
83 | output = model(input)
84 | synchronize_func()
85 | end_time = time.time()
86 |
87 | # The outputs are only on the final process by default
88 | if PartialState().is_last_process:
89 | output = torch.stack(tuple(output[0]))
90 | print(f"Time of first pass: {first_batch}")
91 | print(f"Average time per batch: {(end_time - start_time) / 5}")
92 | PartialState().destroy_process_group()
93 |
--------------------------------------------------------------------------------
/examples/inference/pippy/gpt2.py:
--------------------------------------------------------------------------------
1 | # Copyright 2024 The HuggingFace Inc. team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | import time
15 |
16 | import torch
17 | from transformers import AutoModelForSequenceClassification
18 |
19 | from accelerate import PartialState, prepare_pippy
20 | from accelerate.test_utils import torch_device
21 | from accelerate.utils import set_seed
22 |
23 |
24 | synchronize_func = getattr(torch, torch_device, torch.cuda).synchronize
25 |
26 | # Set the random seed to have reproducible outputs
27 | set_seed(42)
28 |
29 | # Create an example model
30 | model = AutoModelForSequenceClassification.from_pretrained("gpt2")
31 | model.eval()
32 |
33 | # Input configs
34 | # Create example inputs for the model
35 | input = torch.randint(
36 | low=0,
37 | high=model.config.vocab_size,
38 | size=(1, 1024), # bs x seq_len
39 | device="cpu",
40 | dtype=torch.int64,
41 | requires_grad=False,
42 | )
43 |
44 | # Create a pipeline stage from the model
45 | # Using `auto` is equivalent to letting `device_map="auto"` figure
46 | # out device mapping and will also split the model according to the
47 | # number of total GPUs available if it fits on one GPU
48 | model = prepare_pippy(model, split_points="auto", example_args=(input,))
49 |
50 | # You can pass `gather_output=True` to have the output from the model
51 | # available on all GPUs
52 | # model = prepare_pippy(model, split_points="auto", example_args=(input,), gather_output=True)
53 |
54 | # Create new inputs of the expected size (n_processes)
55 | input = torch.randint(
56 | low=0,
57 | high=model.config.vocab_size,
58 | size=(2, 1024), # bs x seq_len
59 | device="cpu",
60 | dtype=torch.int64,
61 | requires_grad=False,
62 | )
63 |
64 | # Move the inputs to the first device
65 | input = input.to(torch_device)
66 |
67 | # Take an average of 5 times
68 | # Measure first batch
69 | synchronize_func()
70 | start_time = time.time()
71 | with torch.no_grad():
72 | output = model(input)
73 | synchronize_func()
74 | end_time = time.time()
75 | first_batch = end_time - start_time
76 |
77 | # Now that device/backend is init, measure after
78 | synchronize_func()
79 | start_time = time.time()
80 | for i in range(5):
81 | with torch.no_grad():
82 | output = model(input)
83 | synchronize_func()
84 | end_time = time.time()
85 |
86 | # The outputs are only on the final process by default
87 | if PartialState().is_last_process:
88 | output = torch.stack(tuple(output[0]))
89 | print(f"Time of first pass: {first_batch}")
90 | print(f"Average time per batch: {(end_time - start_time) / 5}")
91 | PartialState().destroy_process_group()
92 |
--------------------------------------------------------------------------------
/examples/inference/pippy/llama.py:
--------------------------------------------------------------------------------
1 | # Copyright 2024 The HuggingFace Inc. team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | import torch
15 | from transformers import AutoModelForCausalLM, AutoTokenizer
16 |
17 | from accelerate import PartialState, prepare_pippy
18 |
19 |
20 | # sdpa implementation which is the default torch>2.1.2 fails with the tracing + attention mask kwarg
21 | # with attn_implementation="eager" mode, the forward is very slow for some reason
22 | model = AutoModelForCausalLM.from_pretrained(
23 | "meta-llama/Llama-2-7b-chat-hf", low_cpu_mem_usage=True, attn_implementation="sdpa"
24 | )
25 | model.eval()
26 |
27 | # Input configs
28 | # Create example inputs for the model
29 | tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
30 | prompts = ("I would like to", "I really like to") # bs = 2, sending 2 per process
31 | tokenizer.pad_token = tokenizer.eos_token
32 | inputs = tokenizer(prompts, return_tensors="pt", padding=True)
33 |
34 | # Create a pipeline stage from the model
35 | # Using `auto` is equivalent to letting `device_map="auto"` figure
36 | # out device mapping and will also split the model according to the
37 | # number of total GPUs available if it fits on one GPU
38 | model = prepare_pippy(model, split_points="auto", example_kwargs=inputs)
39 |
40 | # You can pass `gather_output=True` to have the output from the model
41 | # available on all GPUs
42 | # model = prepare_pippy(model, split_points="auto", example_args=(input,), gather_output=True)
43 |
44 | # currently we don't support `model.generate`
45 | # output = model.generate(**inputs, max_new_tokens=1)
46 | prompts = ("I would like to", "I really like to", "The weather is pretty") # bs = 3
47 | inputs = tokenizer(prompts, return_tensors="pt", padding=True)
48 | inputs = inputs.to(0)
49 | with torch.no_grad():
50 | output = model(**inputs)
51 |
52 | # The outputs are only on the final process by default
53 | if PartialState().is_last_process:
54 | next_token_logits = output[0][:, -1, :]
55 | next_token = torch.argmax(next_token_logits, dim=-1)
56 | print(tokenizer.batch_decode(next_token))
57 | PartialState().destroy_process_group()
58 |
--------------------------------------------------------------------------------
/examples/inference/pippy/requirements.txt:
--------------------------------------------------------------------------------
1 | accelerate
2 | torchpippy>=0.2.0
--------------------------------------------------------------------------------
/examples/inference/pippy/t5.py:
--------------------------------------------------------------------------------
1 | # Copyright 2024 The HuggingFace Inc. team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | import time
15 |
16 | import torch
17 | from packaging import version
18 | from transformers import AutoModelForSeq2SeqLM
19 |
20 | from accelerate import PartialState, prepare_pippy
21 | from accelerate import __version__ as accelerate_version
22 | from accelerate.utils import set_seed
23 |
24 |
25 | if version.parse(accelerate_version) > version.parse("0.33.0"):
26 | raise RuntimeError(
27 | "Using encoder/decoder models is not supported with the `torch.pipelining` integration or accelerate>=0.34.0. "
28 | "Please use a lower accelerate version and `torchpippy`, which this example uses."
29 | )
30 |
31 |
32 | # Set the random seed to have reproducible outputs
33 | set_seed(42)
34 |
35 | # Create an example model
36 | model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
37 | model.eval()
38 |
39 | # Input configs
40 | # Create example inputs for the model
41 | input = torch.randint(
42 | low=0,
43 | high=model.config.vocab_size,
44 | size=(2, 1024), # bs x seq_len
45 | device="cpu",
46 | dtype=torch.int64,
47 | requires_grad=False,
48 | )
49 |
50 | example_inputs = {"input_ids": input, "decoder_input_ids": input}
51 |
52 | # Create a pipeline stage from the model
53 | # Using `auto` is equivalent to letting `device_map="auto"` figure
54 | # out device mapping and will also split the model according to the
55 | # number of total GPUs available if it fits on one GPU
56 | model = prepare_pippy(
57 | model,
58 | no_split_module_classes=["T5Block"],
59 | example_kwargs=example_inputs,
60 | )
61 |
62 | # You can pass `gather_output=True` to have the output from the model
63 | # available on all GPUs
64 | # model = prepare_pippy(
65 | # model,
66 | # no_split_module_classes=["T5Block"],
67 | # example_kwargs=example_inputs,
68 | # gather_outputs=True
69 | # )
70 |
71 | # The model expects a tuple during real inference
72 | # with the data on the first device
73 | args = (example_inputs["input_ids"].to("cuda:0"), example_inputs["decoder_input_ids"].to("cuda:0"))
74 |
75 | # Take an average of 5 times
76 | # Measure first batch
77 | torch.cuda.synchronize()
78 | start_time = time.time()
79 | with torch.no_grad():
80 | output = model(*args)
81 | torch.cuda.synchronize()
82 | end_time = time.time()
83 | first_batch = end_time - start_time
84 |
85 | # Now that CUDA is init, measure after
86 | torch.cuda.synchronize()
87 | start_time = time.time()
88 | for i in range(5):
89 | with torch.no_grad():
90 | output = model(*args)
91 | torch.cuda.synchronize()
92 | end_time = time.time()
93 |
94 | # The outputs are only on the final process by default
95 | if PartialState().is_last_process:
96 | output = torch.stack(tuple(output[0]))
97 | print(f"Time of first pass: {first_batch}")
98 | print(f"Average time per batch: {(end_time - start_time) / 5}")
99 | PartialState().destroy_process_group()
100 |
--------------------------------------------------------------------------------
/examples/multigpu_remote_launcher.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | import argparse
15 |
16 | import runhouse as rh
17 | import torch
18 | from nlp_example import training_function
19 |
20 | from accelerate.utils import PrepareForLaunch, patch_environment
21 |
22 |
23 | def launch_train(*args):
24 | num_processes = torch.cuda.device_count()
25 | print(f"Device count: {num_processes}")
26 | with patch_environment(
27 | world_size=num_processes, master_addr="127.0.0.1", master_port="29500", mixed_precision=args[1].mixed_precision
28 | ):
29 | launcher = PrepareForLaunch(training_function, distributed_type="MULTI_GPU")
30 | torch.multiprocessing.start_processes(launcher, args=args, nprocs=num_processes, start_method="spawn")
31 |
32 |
33 | if __name__ == "__main__":
34 | # Refer to https://runhouse-docs.readthedocs-hosted.com/en/main/rh_primitives/cluster.html#hardware-setup
35 | # for cloud access setup instructions (if using on-demand hardware), and for API specifications.
36 |
37 | # on-demand GPU
38 | # gpu = rh.cluster(name='rh-cluster', instance_type='V100:1', provider='cheapest', use_spot=False) # single GPU
39 | gpu = rh.cluster(name="rh-cluster", instance_type="V100:4", provider="cheapest", use_spot=False) # multi GPU
40 | gpu.up_if_not()
41 |
42 | # on-prem GPU
43 | # gpu = rh.cluster(
44 | #       ips=["ip_addr"], ssh_creds={"ssh_user": "", "ssh_private_key": ""}, name="rh-cluster"
45 | # )
46 |
47 | # Set up remote function
48 | reqs = [
49 | "pip:./",
50 | "transformers",
51 | "datasets",
52 | "evaluate",
53 | "tqdm",
54 | "scipy",
55 | "scikit-learn",
56 | "tensorboard",
57 | "torch --upgrade --extra-index-url https://download.pytorch.org/whl/cu117",
58 | ]
59 | launch_train_gpu = rh.function(fn=launch_train, system=gpu, reqs=reqs, name="train_bert_glue")
60 |
61 | # Define train args/config, run train function
62 | train_args = argparse.Namespace(cpu=False, mixed_precision="fp16")
63 | config = {"lr": 2e-5, "num_epochs": 3, "seed": 42, "batch_size": 16}
64 | launch_train_gpu(config, train_args, stream_logs=True)
65 |
66 | # Alternatively, we can simply run the script as instructed in the README (possible here only because accelerate already provides a launcher CLI):
67 | # gpu.install_packages(reqs)
68 | # gpu.run(['accelerate launch --multi_gpu accelerate/examples/nlp_example.py'])
69 |
--------------------------------------------------------------------------------
/examples/requirements.txt:
--------------------------------------------------------------------------------
1 | accelerate # used to be installed in Amazon SageMaker environment
2 | evaluate
3 | datasets==2.3.2
4 | schedulefree
5 | huggingface_hub>=0.20.0
6 |
--------------------------------------------------------------------------------
/examples/slurm/fsdp_config.yaml:
--------------------------------------------------------------------------------
1 | distributed_type: FSDP
2 | fsdp_config:
3 | fsdp_activation_checkpointing: false
4 | fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
5 | fsdp_backward_prefetch: BACKWARD_PRE
6 | fsdp_cpu_ram_efficient_loading: true
7 | fsdp_forward_prefetch: false
8 | fsdp_offload_params: false
9 | fsdp_sharding_strategy: FULL_SHARD
10 | fsdp_state_dict_type: SHARDED_STATE_DICT
11 | fsdp_sync_module_states: true
12 | fsdp_use_orig_params: true
13 |
--------------------------------------------------------------------------------
/examples/slurm/submit_multicpu.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash -l
2 |
3 | #SBATCH --job-name=multicpu
4 | #SBATCH --nodes=2 # number of Nodes
5 | #SBATCH --ntasks-per-node=1 # number of MP tasks
6 | #SBATCH --exclusive
7 | #SBATCH --output=O-%x.%j
8 | #SBATCH --error=E-%x.%j
9 |
10 | ######################
11 | ### Set environment ###
12 | ######################
13 | source activateEnvironment.sh
14 |
15 | ######################
16 | #### Set network #####
17 | ######################
18 | head_node_ip=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
19 | ######################
20 |
21 | # Setup env variables for distributed jobs
22 | export MASTER_PORT="${MASTER_PORT:-29555}"
23 | echo "head_node_ip=${head_node_ip}"
24 | echo "MASTER_PORT=${MASTER_PORT}"
25 |
26 | INSTANCES_PER_NODE="${INSTANCES_PER_NODE:-1}"
27 |
28 | if [[ $SLURM_NNODES == 1 ]] && [[ $INSTANCES_PER_NODE == 1 ]]; then
29 | export CCL_WORKER_COUNT=0
30 | LAUNCHER=""
31 | else
32 | # Setup env variables for distributed jobs
33 | export CCL_WORKER_COUNT="${CCL_WORKER_COUNT:-2}"
34 | echo "CCL_WORKER_COUNT=${CCL_WORKER_COUNT}"
35 |
36 | # Write hostfile
37 | HOSTFILE_PATH=hostfile
38 |     scontrol show hostname $SLURM_JOB_NODELIST | perl -ne 'chomp; print "$_\n" x 1' > ${HOSTFILE_PATH}
39 |
40 | export LAUNCHER="accelerate launch \
41 | --num_processes $((SLURM_NNODES * ${INSTANCES_PER_NODE})) \
42 | --num_machines $SLURM_NNODES \
43 | --rdzv_backend c10d \
44 | --main_process_ip $head_node_ip \
45 | --main_process_port $MASTER_PORT \
46 | --mpirun_hostfile $HOSTFILE_PATH \
47 | --mpirun_ccl $CCL_WORKER_COUNT"
48 | fi
49 |
50 | # Set the training script and its arguments
51 | export ACCELERATE_DIR="${ACCELERATE_DIR:-/accelerate}"
52 | export SCRIPT="${ACCELERATE_DIR}/examples/complete_nlp_example.py"
53 | export SCRIPT_ARGS=" \
54 | --cpu \
55 | --output_dir ${ACCELERATE_DIR}/examples/output \
56 | "
57 |
58 | # This step is necessary because accelerate launch does not handle multiline arguments properly
59 | export CMD="$LAUNCHER $SCRIPT $SCRIPT_ARGS"
60 | # Print the command
61 | echo $CMD
62 | echo ""
63 |
64 | # Run the command
65 | eval $CMD
--------------------------------------------------------------------------------
/examples/slurm/submit_multigpu.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | #SBATCH --job-name=multigpu
4 | #SBATCH -D .
5 | #SBATCH --output=O-%x.%j
6 | #SBATCH --error=E-%x.%j
7 | #SBATCH --nodes=1
8 | #SBATCH --ntasks-per-node=1 # number of MP tasks
9 | #SBATCH --gres=gpu:4 # number of GPUs per node
10 | #SBATCH --cpus-per-task=160         # number of cores per task
11 | #SBATCH --time=01:59:00 # maximum execution time (HH:MM:SS)
12 |
13 | ######################
14 | ### Set environment ###
15 | ######################
16 | source activateEnvironment.sh
17 | export GPUS_PER_NODE=4
18 | ######################
19 |
20 | export ACCELERATE_DIR="${ACCELERATE_DIR:-/accelerate}"
21 | export SCRIPT="${ACCELERATE_DIR}/examples/complete_nlp_example.py"
22 | export SCRIPT_ARGS=" \
23 | --mixed_precision fp16 \
24 | --output_dir ${ACCELERATE_DIR}/examples/output \
25 | --with_tracking \
26 | "
27 |
28 | accelerate launch --num_processes $GPUS_PER_NODE $SCRIPT $SCRIPT_ARGS
--------------------------------------------------------------------------------
/examples/slurm/submit_multinode.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | #SBATCH --job-name=multinode
4 | #SBATCH -D .
5 | #SBATCH --output=O-%x.%j
6 | #SBATCH --error=E-%x.%j
7 | #SBATCH --nodes=4 # number of nodes
8 | #SBATCH --ntasks-per-node=1 # number of MP tasks
9 | #SBATCH --gres=gpu:4 # number of GPUs per node
10 | #SBATCH --cpus-per-task=160         # number of cores per task
11 | #SBATCH --time=01:59:00 # maximum execution time (HH:MM:SS)
12 |
13 | ######################
14 | ### Set environment ###
15 | ######################
16 | source activateEnvironment.sh
17 | export GPUS_PER_NODE=4
18 | ######################
19 |
20 | ######################
21 | #### Set network #####
22 | ######################
23 | head_node_ip=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
24 | ######################
25 |
26 | export LAUNCHER="accelerate launch \
27 | --num_processes $((SLURM_NNODES * GPUS_PER_NODE)) \
28 | --num_machines $SLURM_NNODES \
29 | --rdzv_backend c10d \
30 | --main_process_ip $head_node_ip \
31 | --main_process_port 29500 \
32 | "
33 | export ACCELERATE_DIR="${ACCELERATE_DIR:-/accelerate}"
34 | export SCRIPT="${ACCELERATE_DIR}/examples/complete_nlp_example.py"
35 | export SCRIPT_ARGS=" \
36 | --mixed_precision fp16 \
37 | --output_dir ${ACCELERATE_DIR}/examples/output \
38 | "
39 |
40 | # This step is necessary because accelerate launch does not handle multiline arguments properly
41 | export CMD="$LAUNCHER $SCRIPT $SCRIPT_ARGS"
42 | srun $CMD
--------------------------------------------------------------------------------
/examples/slurm/submit_multinode_fsdp.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | #SBATCH --job-name=multinode
4 | #SBATCH -D .
5 | #SBATCH --output=O-%x.%j
6 | #SBATCH --error=E-%x.%j
7 | #SBATCH --nodes=4 # number of nodes
8 | #SBATCH --ntasks-per-node=1 # number of MP tasks
9 | #SBATCH --gres=gpu:4 # number of GPUs per node
10 | #SBATCH --cpus-per-task=160         # number of cores per task
11 | #SBATCH --time=01:59:00 # maximum execution time (HH:MM:SS)
12 |
13 | ######################
14 | ### Set environment ###
15 | ######################
16 | source activateEnvironment.sh
17 | export GPUS_PER_NODE=4
18 | ######################
19 |
20 | ######################
21 | #### Set network #####
22 | ######################
23 | head_node_ip=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
24 | ######################
25 | export ACCELERATE_DIR="${ACCELERATE_DIR:-/accelerate}"
26 |
27 | export LAUNCHER="accelerate launch \
28 | --config_file ${ACCELERATE_DIR}/examples/slurm/fsdp_config.yaml \
29 | --num_processes $((SLURM_NNODES * GPUS_PER_NODE)) \
30 | --num_machines $SLURM_NNODES \
31 | --rdzv_backend c10d \
32 | --main_process_ip $head_node_ip \
33 | --main_process_port 29500 \
34 | "
35 | export SCRIPT="${ACCELERATE_DIR}/examples/complete_nlp_example.py"
36 | export SCRIPT_ARGS=" \
37 | --mixed_precision fp16 \
38 | --output_dir ${ACCELERATE_DIR}/examples/output \
39 | "
40 |
41 | # This step is necessary because accelerate launch does not handle multiline arguments properly
42 | export CMD="$LAUNCHER $SCRIPT $SCRIPT_ARGS"
43 | srun $CMD
--------------------------------------------------------------------------------
/manim_animations/dataloaders/stage_0.py:
--------------------------------------------------------------------------------
1 | # Copyright 2024 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from manim import *
16 |
17 |
18 | class Stage0(Scene):
19 | def construct(self):
20 | mascot = ImageMobject("mascot_bookie.png")
21 | mascot.scale(.35)
22 | mascot.move_to([-3.75,-1,0])
23 | text = Paragraph(
24 | "Distributed Training,\nHugging Face Accelerate,\nand PyTorch DataLoaders\n\nHow do they all interact?",
25 | font_size=36,
26 | line_spacing=1,
27 | alignment="center",
28 | weight=BOLD,
29 | )
30 | text.move_to([1.75,.5,0])
31 | self.add(mascot)
32 | self.add(text)
--------------------------------------------------------------------------------
/manim_animations/dataloaders/stage_1.py:
--------------------------------------------------------------------------------
1 | # Copyright 2024 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from manim import *
16 |
17 | class Stage1(Scene):
18 | def construct(self):
19 | mascot = ImageMobject("mascot_bookie.png")
20 | mascot.scale(.35)
21 | mascot.move_to([-3.75,-1,0])
22 | text = Paragraph(
23 | "Distributed Training,\nHugging Face Accelerate,\nand PyTorch DataLoaders\n\nHow do they all interact?",
24 | font_size=36,
25 | line_spacing=1,
26 | alignment="center",
27 | weight=BOLD,
28 | )
29 | text.move_to([1.75,.5,0])
30 | self.add(mascot)
31 | self.add(text)
--------------------------------------------------------------------------------
/manim_animations/dataloaders/stage_3.py:
--------------------------------------------------------------------------------
1 | # Copyright 2024 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from manim import *
16 |
17 | class Stage3(Scene):
18 | def construct(self):
19 | step_1 = MarkupText(
20 | f"To combat this, Accelerate employs one of two different\nSampler wrapper methods depending on the scenario:",
21 | font_size=24
22 | )
23 | step_1.move_to([0, 1.5, 0])
24 | self.add(step_1)
25 | step_2 = MarkupText(
26 | f"1. Sharding the dataset before drawing:\n\t● IterableDatasetShard\n\t● BatchSamplerShard",
27 | font_size=24,
28 | ).next_to(step_1, direction=DOWN, aligned_edge=LEFT)
29 | self.add(step_2)
30 | step_3 = MarkupText(
31 | f"\n\n2. Splitting the batch after drawing:\n\t● DataLoaderDispatcher",
32 | font_size=24,
33 | ).next_to(step_2, direction=DOWN, aligned_edge=LEFT)
34 | self.add(step_3)
--------------------------------------------------------------------------------
/manim_animations/dataloaders/stage_4.py:
--------------------------------------------------------------------------------
1 | # Copyright 2024 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from manim import *
16 |
17 | class Stage4(Scene):
18 | def construct(self):
19 |
20 | step_1 = MarkupText(
21 | f"To understand the next part fully, let's define two terms,\n`batch_size` and `global_batch_size`:",
22 | font_size=18
23 | )
24 | step_1.move_to([0, 1.5, 0])
25 | # ●
26 | step_2 = MarkupText(
27 | f"\n\n● `batch_size`: \n\tThis will be defined as the batch size seen on a given\n\t*individual* GPU",
28 | font_size=18,
29 | ).next_to(step_1, direction=DOWN, aligned_edge=LEFT)
30 |
31 | step_3 = MarkupText(
32 | f"\n\n● `global_batch_size`:\n\tThis will be defined as the *total* number of\n\tdifferent items seen in the dataset, across all GPUs",
33 | font_size=18,
34 | ).next_to(step_2, direction=DOWN, aligned_edge=LEFT)
35 |
36 | step_4 = MarkupText(
37 | f"\n\nSo if we have a dataset of 64 items, 8 GPUs, \nand a `batch_size` of 8, each *step* will go through\nthe entire dataset one time as 8*8=64",
38 | font_size=18,
39 | ).next_to(step_3, direction=DOWN, aligned_edge=LEFT)
40 | self.play(
41 | Write(step_1, run_time=4),
42 | )
43 | self.play(
44 | Write(step_2, run_time=4)
45 | )
46 | self.play(
47 | Write(step_3, run_time=4)
48 | )
49 | self.play(
50 | Write(step_4, run_time=6)
51 | )
52 | self.wait()
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.ruff]
2 | line-length = 119
3 | target-version = "py39"
4 |
5 | [tool.ruff.lint]
6 | preview = true
7 | extend-select = [
8 | "B009", # static getattr
9 | "B010", # static setattr
10 | "CPY", # Copyright
11 | "E", # PEP8 errors
12 | "F", # PEP8 formatting
13 | "I", # Import sorting
14 | "TID251", # Banned API
15 | "UP", # Pyupgrade
16 | "W", # PEP8 warnings
17 | ]
18 | ignore = [
19 | "E501", # Line length (handled by ruff-format)
20 | "E741", # Ambiguous variable name
21 | "W605", # Invalid escape sequence
22 | "UP007", # X | Y type annotations
23 | ]
24 |
25 | [tool.ruff.lint.per-file-ignores]
26 | "__init__.py" = [
27 | "F401", # Ignore seemingly unused imports (they're meant for re-export)
28 | ]
29 | "manim_animations/*" = ["ALL"]
30 |
31 | [tool.ruff.lint.isort]
32 | lines-after-imports = 2
33 | known-first-party = ["accelerate"]
34 |
35 | [tool.ruff.format]
36 | exclude = [
37 | "manim_animations/*"
38 | ]
39 |
40 | [tool.ruff.lint.flake8-tidy-imports.banned-api]
41 | "os.getenv".msg = "Use os.environ instead"
42 | "os.putenv".msg = "Use os.environ instead"
43 | "os.unsetenv".msg = "Use os.environ instead"
44 |
--------------------------------------------------------------------------------
/src/accelerate/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | __version__ = "1.8.0.dev0"
15 |
16 | from .accelerator import Accelerator
17 | from .big_modeling import (
18 | cpu_offload,
19 | cpu_offload_with_hook,
20 | disk_offload,
21 | dispatch_model,
22 | init_empty_weights,
23 | init_on_device,
24 | load_checkpoint_and_dispatch,
25 | )
26 | from .data_loader import skip_first_batches
27 | from .inference import prepare_pippy
28 | from .launchers import debug_launcher, notebook_launcher
29 | from .state import PartialState
30 | from .utils import (
31 | AutocastKwargs,
32 | DataLoaderConfiguration,
33 | DDPCommunicationHookType,
34 | DeepSpeedPlugin,
35 | DistributedDataParallelKwargs,
36 | DistributedType,
37 | FullyShardedDataParallelPlugin,
38 | GradScalerKwargs,
39 | InitProcessGroupKwargs,
40 | ProfileKwargs,
41 | find_executable_batch_size,
42 | infer_auto_device_map,
43 | is_rich_available,
44 | load_checkpoint_in_model,
45 | synchronize_rng_states,
46 | )
47 |
48 |
49 | if is_rich_available():
50 | from .utils import rich
51 |
--------------------------------------------------------------------------------
/src/accelerate/commands/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
--------------------------------------------------------------------------------
/src/accelerate/commands/accelerate_cli.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | # Copyright 2021 The HuggingFace Team. All rights reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | from accelerate.commands.config import get_config_parser
18 | from accelerate.commands.env import env_command_parser
19 | from accelerate.commands.estimate import estimate_command_parser
20 | from accelerate.commands.launch import launch_command_parser
21 | from accelerate.commands.merge import merge_command_parser
22 | from accelerate.commands.test import test_command_parser
23 | from accelerate.commands.to_fsdp2 import to_fsdp2_command_parser
24 | from accelerate.commands.tpu import tpu_command_parser
25 | from accelerate.commands.utils import CustomArgumentParser
26 |
27 |
28 | def main():
29 |     parser = CustomArgumentParser("Accelerate CLI tool", usage="accelerate <command> [<args>]", allow_abbrev=False)
30 | subparsers = parser.add_subparsers(help="accelerate command helpers")
31 |
32 | # Register commands
33 | get_config_parser(subparsers=subparsers)
34 | estimate_command_parser(subparsers=subparsers)
35 | env_command_parser(subparsers=subparsers)
36 | launch_command_parser(subparsers=subparsers)
37 | merge_command_parser(subparsers=subparsers)
38 | tpu_command_parser(subparsers=subparsers)
39 | test_command_parser(subparsers=subparsers)
40 | to_fsdp2_command_parser(subparsers=subparsers)
41 |
42 | # Let's go
43 | args = parser.parse_args()
44 |
45 | if not hasattr(args, "func"):
46 | parser.print_help()
47 | exit(1)
48 |
49 | # Run
50 | args.func(args)
51 |
52 |
53 | if __name__ == "__main__":
54 | main()
55 |
--------------------------------------------------------------------------------
/src/accelerate/commands/config/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | # Copyright 2021 The HuggingFace Team. All rights reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | import argparse
18 |
19 | from .config import config_command_parser
20 | from .config_args import default_config_file, load_config_from_file # noqa: F401
21 | from .default import default_command_parser
22 | from .update import update_command_parser
23 |
24 |
25 | def get_config_parser(subparsers=None):
26 | parent_parser = argparse.ArgumentParser(add_help=False, allow_abbrev=False)
27 | # The main config parser
28 | config_parser = config_command_parser(subparsers)
29 | # The subparser to add commands to
30 | subcommands = config_parser.add_subparsers(title="subcommands", dest="subcommand")
31 |
32 | # Then add other parsers with the parent parser
33 | default_command_parser(subcommands, parents=[parent_parser])
34 | update_command_parser(subcommands, parents=[parent_parser])
35 |
36 | return config_parser
37 |
38 |
39 | def main():
40 | config_parser = get_config_parser()
41 | args = config_parser.parse_args()
42 |
43 | if not hasattr(args, "func"):
44 | config_parser.print_help()
45 | exit(1)
46 |
47 | # Run
48 | args.func(args)
49 |
50 |
51 | if __name__ == "__main__":
52 | main()
53 |
--------------------------------------------------------------------------------
/src/accelerate/commands/config/config.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | # Copyright 2021 The HuggingFace Team. All rights reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | import argparse
18 | import os
19 |
20 | from accelerate.utils import ComputeEnvironment
21 |
22 | from .cluster import get_cluster_input
23 | from .config_args import cache_dir, default_config_file, default_yaml_config_file, load_config_from_file # noqa: F401
24 | from .config_utils import _ask_field, _ask_options, _convert_compute_environment # noqa: F401
25 | from .sagemaker import get_sagemaker_input
26 |
27 |
28 | description = "Launches a series of prompts to create and save a `default_config.yaml` configuration file for your training system. Should always be run first on your machine"
29 |
30 |
31 | def get_user_input():
32 | compute_environment = _ask_options(
33 | "In which compute environment are you running?",
34 | ["This machine", "AWS (Amazon SageMaker)"],
35 | _convert_compute_environment,
36 | )
37 | if compute_environment == ComputeEnvironment.AMAZON_SAGEMAKER:
38 | config = get_sagemaker_input()
39 | else:
40 | config = get_cluster_input()
41 | return config
42 |
43 |
44 | def config_command_parser(subparsers=None):
45 | if subparsers is not None:
46 | parser = subparsers.add_parser("config", description=description)
47 | else:
48 | parser = argparse.ArgumentParser("Accelerate config command", description=description)
49 |
50 | parser.add_argument(
51 | "--config_file",
52 | default=None,
53 | help=(
54 | "The path to use to store the config file. Will default to a file named default_config.yaml in the cache "
55 |             "location, which is the content of the `HF_HOME` environment variable suffixed with 'accelerate', or if you don't have "
56 | "such an environment variable, your cache directory ('~/.cache' or the content of `XDG_CACHE_HOME`) suffixed "
57 | "with 'huggingface'."
58 | ),
59 | )
60 |
61 | if subparsers is not None:
62 | parser.set_defaults(func=config_command)
63 | return parser
64 |
65 |
66 | def config_command(args):
67 | config = get_user_input()
68 | if args.config_file is not None:
69 | config_file = args.config_file
70 | else:
71 | if not os.path.isdir(cache_dir):
72 | os.makedirs(cache_dir)
73 | config_file = default_yaml_config_file
74 |
75 | if config_file.endswith(".json"):
76 | config.to_json_file(config_file)
77 | else:
78 | config.to_yaml_file(config_file)
79 | print(f"accelerate configuration saved at {config_file}")
80 |
81 |
82 | def main():
83 | parser = config_command_parser()
84 | args = parser.parse_args()
85 | config_command(args)
86 |
87 |
88 | if __name__ == "__main__":
89 | main()
90 |
--------------------------------------------------------------------------------
/src/accelerate/commands/config/update.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | # Copyright 2022 The HuggingFace Team. All rights reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | from pathlib import Path
18 |
19 | from .config_args import default_config_file, load_config_from_file
20 | from .config_utils import SubcommandHelpFormatter
21 |
22 |
23 | description = "Update an existing config file with the latest defaults while maintaining the old configuration."
24 |
25 |
26 | def update_config(args):
27 | """
28 | Update an existing config file with the latest defaults while maintaining the old configuration.
29 | """
30 | config_file = args.config_file
31 | if config_file is None and Path(default_config_file).exists():
32 | config_file = default_config_file
33 | elif not Path(config_file).exists():
34 | raise ValueError(f"The passed config file located at {config_file} doesn't exist.")
35 | config = load_config_from_file(config_file)
36 |
37 | if config_file.endswith(".json"):
38 | config.to_json_file(config_file)
39 | else:
40 | config.to_yaml_file(config_file)
41 | return config_file
42 |
43 |
44 | def update_command_parser(parser, parents):
45 | parser = parser.add_parser("update", parents=parents, help=description, formatter_class=SubcommandHelpFormatter)
46 | parser.add_argument(
47 | "--config_file",
48 | default=None,
49 | help=(
50 | "The path to the config file to update. Will default to a file named default_config.yaml in the cache "
51 |             "location, which is the content of the `HF_HOME` environment variable suffixed with 'accelerate', or if you don't have "
52 | "such an environment variable, your cache directory ('~/.cache' or the content of `XDG_CACHE_HOME`) suffixed "
53 | "with 'huggingface'."
54 | ),
55 | )
56 |
57 | parser.set_defaults(func=update_config_command)
58 | return parser
59 |
60 |
61 | def update_config_command(args):
62 | config_file = update_config(args)
63 |     print(f"Successfully updated the configuration file at {config_file}.")
64 |
--------------------------------------------------------------------------------
/src/accelerate/commands/menu/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2022 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | from .selection_menu import BulletMenu
15 |
--------------------------------------------------------------------------------
/src/accelerate/commands/menu/cursor.py:
--------------------------------------------------------------------------------
1 | # Copyright 2022 The HuggingFace Team and Brian Chao. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | """
16 | A utility for showing and hiding the terminal cursor on Windows and Linux, based on https://github.com/bchao1/bullet
17 | """
18 |
19 | import os
20 | import sys
21 | from contextlib import contextmanager
22 |
23 |
24 | # Windows only
25 | if os.name == "nt":
26 | import ctypes
27 | import msvcrt # noqa
28 |
29 | class CursorInfo(ctypes.Structure):
30 | # _fields is a specific attr expected by ctypes
31 | _fields_ = [("size", ctypes.c_int), ("visible", ctypes.c_byte)]
32 |
33 |
34 | def hide_cursor():
35 | if os.name == "nt":
36 | ci = CursorInfo()
37 | handle = ctypes.windll.kernel32.GetStdHandle(-11)
38 | ctypes.windll.kernel32.GetConsoleCursorInfo(handle, ctypes.byref(ci))
39 | ci.visible = False
40 | ctypes.windll.kernel32.SetConsoleCursorInfo(handle, ctypes.byref(ci))
41 | elif os.name == "posix":
42 | sys.stdout.write("\033[?25l")
43 | sys.stdout.flush()
44 |
45 |
46 | def show_cursor():
47 | if os.name == "nt":
48 | ci = CursorInfo()
49 | handle = ctypes.windll.kernel32.GetStdHandle(-11)
50 | ctypes.windll.kernel32.GetConsoleCursorInfo(handle, ctypes.byref(ci))
51 | ci.visible = True
52 | ctypes.windll.kernel32.SetConsoleCursorInfo(handle, ctypes.byref(ci))
53 | elif os.name == "posix":
54 | sys.stdout.write("\033[?25h")
55 | sys.stdout.flush()
56 |
57 |
58 | @contextmanager
59 | def hide():
60 | "Context manager to hide the terminal cursor"
61 | try:
62 | hide_cursor()
63 | yield
64 | finally:
65 | show_cursor()
66 |
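
A minimal usage sketch of the `hide` context manager above; `draw_menu` is a hypothetical placeholder and not part of this file.

```python
# Hide the terminal cursor while drawing, restoring it even if an exception is raised.
from accelerate.commands.menu.cursor import hide


def draw_menu():  # hypothetical placeholder for whatever gets rendered
    print("1) option A")
    print("2) option B")


with hide():
    draw_menu()
```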
--------------------------------------------------------------------------------
/src/accelerate/commands/menu/helpers.py:
--------------------------------------------------------------------------------
1 | # Copyright 2022 The HuggingFace Team and Brian Chao. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | """
16 | A variety of helper functions and constants when dealing with terminal menu choices, based on
17 | https://github.com/bchao1/bullet
18 | """
19 |
20 | import enum
21 | import shutil
22 | import sys
23 |
24 |
25 | TERMINAL_WIDTH, _ = shutil.get_terminal_size()
26 |
27 | CURSOR_TO_CHAR = {"UP": "A", "DOWN": "B", "RIGHT": "C", "LEFT": "D"}
28 |
29 |
30 | class Direction(enum.Enum):
31 | UP = 0
32 | DOWN = 1
33 |
34 |
35 | def forceWrite(content, end=""):
36 | sys.stdout.write(str(content) + end)
37 | sys.stdout.flush()
38 |
39 |
40 | def writeColor(content, color, end=""):
41 | forceWrite(f"\u001b[{color}m{content}\u001b[0m", end)
42 |
43 |
44 | def reset_cursor():
45 | forceWrite("\r")
46 |
47 |
48 | def move_cursor(num_lines: int, direction: str):
49 | forceWrite(f"\033[{num_lines}{CURSOR_TO_CHAR[direction.upper()]}")
50 |
51 |
52 | def clear_line():
53 | forceWrite(" " * TERMINAL_WIDTH)
54 | reset_cursor()
55 |
56 |
57 | def linebreak():
58 | reset_cursor()
59 | forceWrite("-" * TERMINAL_WIDTH)
60 |
--------------------------------------------------------------------------------
/src/accelerate/commands/menu/input.py:
--------------------------------------------------------------------------------
1 | # Copyright 2022 The HuggingFace Team and Brian Chao. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | """
16 | This file contains utilities for handling input from the user and registering specific keys to specific functions,
17 | based on https://github.com/bchao1/bullet
18 | """
19 |
20 | from .keymap import KEYMAP, get_character
21 |
22 |
23 | def mark(key: str):
24 | """
25 | Mark the function with the key code so it can be handled in the register
26 | """
27 |
28 | def decorator(func):
29 | handle = getattr(func, "handle_key", [])
30 | handle += [key]
31 | func.handle_key = handle
32 | return func
33 |
34 | return decorator
35 |
36 |
37 | def mark_multiple(*keys: list[str]):
38 | """
39 | Mark the function with the key codes so it can be handled in the register
40 | """
41 |
42 | def decorator(func):
43 | handle = getattr(func, "handle_key", [])
44 | handle += keys
45 | func.handle_key = handle
46 | return func
47 |
48 | return decorator
49 |
50 |
51 | class KeyHandler(type):
52 | """
53 | Metaclass that adds the key handlers to the class
54 | """
55 |
56 | def __new__(cls, name, bases, attrs):
57 | new_cls = super().__new__(cls, name, bases, attrs)
58 | if not hasattr(new_cls, "key_handler"):
59 | new_cls.key_handler = {}
60 | new_cls.handle_input = KeyHandler.handle_input
61 |
62 | for value in attrs.values():
63 | handled_keys = getattr(value, "handle_key", [])
64 | for key in handled_keys:
65 | new_cls.key_handler[key] = value
66 | return new_cls
67 |
68 | @staticmethod
69 | def handle_input(cls):
70 | "Finds and returns the selected character if it exists in the handler"
71 | char = get_character()
72 | if char != KEYMAP["undefined"]:
73 | char = ord(char)
74 | handler = cls.key_handler.get(char)
75 | if handler:
76 | cls.current_selection = char
77 | return handler(cls)
78 | else:
79 | return None
80 |
81 |
82 | def register(cls):
83 | """Adds KeyHandler metaclass to the class"""
84 | return KeyHandler(cls.__name__, cls.__bases__, cls.__dict__.copy())
85 |
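
The `mark`/`register` pattern above is easiest to see in a small, self-contained sketch. The class, method, and key code below are hypothetical and not part of this file; only `mark` and `register` come from this module.

```python
# `register` swaps in the KeyHandler metaclass, which collects every method tagged
# with `mark` into the class-level `key_handler` dispatch table.
from accelerate.commands.menu.input import mark, register


@register
class DemoMenu:
    current_selection = None

    @mark(10)  # 10 is a hypothetical key code; real menus use values from KEYMAP
    def accept(self):
        return f"accepted after key code {self.current_selection}"


# Calling DemoMenu.handle_input(DemoMenu) would block on get_character(),
# so we only inspect the table the metaclass built.
print(DemoMenu.key_handler)  # {10: <function DemoMenu.accept at ...>}
```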
--------------------------------------------------------------------------------
/src/accelerate/commands/merge.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | # Copyright 2024 The HuggingFace Team. All rights reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | from accelerate.commands.utils import CustomArgumentParser
17 | from accelerate.utils import merge_fsdp_weights
18 |
19 |
20 | description = """Utility to merge the weights from multiple FSDP checkpoints into a single combined checkpoint. Should be used if
21 | `SHARDED_STATE_DICT` was used for the model. Weights will be saved to `{output_path}`.
22 |
23 | This is a CPU-bound process and requires enough RAM to load the entire model state dict."""
24 |
25 |
26 | def merge_command(args):
27 | merge_fsdp_weights(
28 | args.checkpoint_directory, args.output_path, not args.unsafe_serialization, args.remove_checkpoint_dir
29 | )
30 |
31 |
32 | def merge_command_parser(subparsers=None):
33 | if subparsers is not None:
34 | parser = subparsers.add_parser("merge-weights", description=description)
35 | else:
36 | parser = CustomArgumentParser(description=description)
37 |
38 | parser.add_argument("checkpoint_directory", type=str, help="A directory containing sharded weights saved by FSDP.")
39 | parser.add_argument(
40 | "output_path",
41 | type=str,
42 |         help="The path to save the merged weights to.",
43 | )
44 | parser.add_argument(
45 | "--unsafe_serialization",
46 | action="store_true",
47 | default=False,
48 | help="Whether to save the merged weights as `.bin` rather than `.safetensors` (not recommended).",
49 | )
50 | parser.add_argument(
51 | "--remove_checkpoint_dir",
52 | action="store_true",
53 | help="Whether to remove the checkpoint directory after merging.",
54 | default=False,
55 | )
56 |
57 | if subparsers is not None:
58 | parser.set_defaults(func=merge_command)
59 | return parser
60 |
61 |
62 | def main():
63 | parser = merge_command_parser()
64 | args = parser.parse_args()
65 | merge_command(args)
66 |
67 |
68 | if __name__ == "__main__":
69 | main()
70 |
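
The same merge can also be done programmatically instead of through `accelerate merge-weights`. A minimal sketch mirroring the positional call in `merge_command` above; the paths are placeholders, not part of the repository.

```python
# Programmatic equivalent of `accelerate merge-weights` (placeholder paths).
from accelerate.utils import merge_fsdp_weights

merge_fsdp_weights(
    "outputs/checkpoint/pytorch_model_fsdp_0",  # directory holding the sharded FSDP weights
    "outputs/merged_model",                     # where the combined weights are written
    True,   # safe serialization (.safetensors), i.e. `not args.unsafe_serialization`
    False,  # keep the sharded checkpoint directory (`args.remove_checkpoint_dir`)
)
```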
--------------------------------------------------------------------------------
/src/accelerate/commands/test.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | # Copyright 2021 The HuggingFace Team. All rights reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | import argparse
18 |
19 | from accelerate.test_utils import execute_subprocess_async, path_in_accelerate_package
20 |
21 |
22 | def test_command_parser(subparsers=None):
23 | if subparsers is not None:
24 | parser = subparsers.add_parser("test")
25 | else:
26 | parser = argparse.ArgumentParser("Accelerate test command")
27 |
28 | parser.add_argument(
29 | "--config_file",
30 | default=None,
31 | help=(
32 | "The path to use to store the config file. Will default to a file named default_config.yaml in the cache "
33 |             "location, which is the content of the `HF_HOME` environment variable suffixed with 'accelerate', or if you don't have "
34 | "such an environment variable, your cache directory ('~/.cache' or the content of `XDG_CACHE_HOME`) suffixed "
35 | "with 'huggingface'."
36 | ),
37 | )
38 |
39 | if subparsers is not None:
40 | parser.set_defaults(func=test_command)
41 | return parser
42 |
43 |
44 | def test_command(args):
45 | script_name = path_in_accelerate_package("test_utils", "scripts", "test_script.py")
46 |
47 | if args.config_file is None:
48 | test_args = [script_name]
49 | else:
50 | test_args = f"--config_file={args.config_file} {script_name}".split()
51 |
52 | cmd = ["accelerate-launch"] + test_args
53 | result = execute_subprocess_async(cmd)
54 | if result.returncode == 0:
55 | print("Test is a success! You are ready for your distributed training!")
56 |
57 |
58 | def main():
59 | parser = test_command_parser()
60 | args = parser.parse_args()
61 | test_command(args)
62 |
63 |
64 | if __name__ == "__main__":
65 | main()
66 |
--------------------------------------------------------------------------------
/src/accelerate/memory_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright 2022 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import warnings
16 |
17 |
18 | warnings.warn(
19 |     "memory_utils has been reorganized to utils.memory. Import `find_executable_batch_size` from the main `__init__`: "
20 | "`from accelerate import find_executable_batch_size` to avoid this warning.",
21 | FutureWarning,
22 | )
23 |
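
For readers following the warning above, a minimal sketch of the recommended import and its typical decorator usage; the training-loop body is a placeholder.

```python
# Recommended replacement import and typical usage of find_executable_batch_size.
from accelerate import find_executable_batch_size


@find_executable_batch_size(starting_batch_size=128)
def train(batch_size):
    # On out-of-memory errors, the decorator halves `batch_size` and retries.
    ...


train()  # the decorator injects the current batch_size as the first argument
```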
--------------------------------------------------------------------------------
/src/accelerate/test_utils/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | from .testing import (
15 | DEFAULT_LAUNCH_COMMAND,
16 | are_the_same_tensors,
17 | assert_exception,
18 | capture_call_output,
19 | device_count,
20 | execute_subprocess_async,
21 | get_launch_command,
22 | get_torch_dist_unique_port,
23 | memory_allocated_func,
24 | path_in_accelerate_package,
25 | pytest_xdist_worker_id,
26 | require_bnb,
27 | require_cpu,
28 | require_cuda,
29 | require_cuda_or_hpu,
30 | require_cuda_or_xpu,
31 | require_fp8,
32 | require_fp16,
33 | require_huggingface_suite,
34 | require_mlu,
35 | require_mps,
36 | require_multi_device,
37 | require_multi_gpu,
38 | require_multi_gpu_or_xpu,
39 | require_multi_xpu,
40 | require_musa,
41 | require_non_cpu,
42 | require_non_hpu,
43 | require_non_torch_xla,
44 | require_non_xpu,
45 | require_npu,
46 | require_pippy,
47 | require_sdaa,
48 | require_single_device,
49 | require_single_gpu,
50 | require_single_xpu,
51 | require_torch_min_version,
52 | require_torchao,
53 | require_torchvision,
54 | require_tpu,
55 | require_transformer_engine,
56 | require_xpu,
57 | run_first,
58 | skip,
59 | slow,
60 | torch_device,
61 | )
62 | from .training import RegressionDataset, RegressionModel, RegressionModel4XPU
63 |
64 |
65 | from .scripts import test_script, test_sync, test_ops # isort: skip
66 |
--------------------------------------------------------------------------------
/src/accelerate/test_utils/scripts/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
--------------------------------------------------------------------------------
/src/accelerate/test_utils/scripts/external_deps/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
--------------------------------------------------------------------------------
/src/accelerate/test_utils/scripts/external_deps/test_zero3_integration.py:
--------------------------------------------------------------------------------
1 | # Copyright 2024 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import torch.distributed
16 |
17 | from accelerate.test_utils import require_huggingface_suite, torch_device
18 | from accelerate.utils import is_transformers_available
19 |
20 |
21 | if is_transformers_available():
22 | from transformers import AutoModel, TrainingArguments
23 |
24 |
25 | GPT2_TINY = "sshleifer/tiny-gpt2"
26 |
27 |
28 | @require_huggingface_suite
29 | def init_torch_dist_then_launch_deepspeed():
30 | if torch_device == "xpu":
31 | backend = "ccl"
32 | elif torch_device == "hpu":
33 | backend = "hccl"
34 | else:
35 | backend = "nccl"
36 |
37 | torch.distributed.init_process_group(backend=backend)
38 | deepspeed_config = {
39 | "zero_optimization": {
40 | "stage": 3,
41 | },
42 | "train_batch_size": "auto",
43 | "train_micro_batch_size_per_gpu": "auto",
44 | }
45 | train_args = TrainingArguments(
46 | output_dir="./",
47 | deepspeed=deepspeed_config,
48 | )
49 | model = AutoModel.from_pretrained(GPT2_TINY)
50 | assert train_args is not None
51 | assert model is not None
52 |
53 |
54 | def main():
55 | init_torch_dist_then_launch_deepspeed()
56 |
57 |
58 | if __name__ == "__main__":
59 | main()
60 |
--------------------------------------------------------------------------------
/src/accelerate/test_utils/scripts/test_cli.py:
--------------------------------------------------------------------------------
1 | # Copyright 2022 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | import torch
15 |
16 | from accelerate.utils import is_xpu_available
17 |
18 |
19 | def main():
20 | accelerator_type = "GPU"
21 | num_accelerators = 0
22 | if torch.cuda.is_available():
23 | num_accelerators = torch.cuda.device_count()
24 | accelerator_type = "GPU"
25 | elif is_xpu_available():
26 | num_accelerators = torch.xpu.device_count()
27 | accelerator_type = "XPU"
28 | print(f"Successfully ran on {num_accelerators} {accelerator_type}s")
29 |
30 |
31 | if __name__ == "__main__":
32 | main()
33 |
--------------------------------------------------------------------------------
/src/accelerate/test_utils/scripts/test_ddp_comm_hook.py:
--------------------------------------------------------------------------------
1 | # Copyright 2022 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | import torch
15 |
16 | from accelerate import Accelerator, DDPCommunicationHookType, DistributedDataParallelKwargs, PartialState
17 | from accelerate.utils import is_hpu_available
18 |
19 |
20 | class MockModel(torch.nn.Module):
21 | def __init__(self):
22 | super().__init__()
23 | torch.manual_seed(0)
24 | self.p = torch.nn.Parameter(torch.randn(40, 20))
25 |
26 | def forward(self, x, rank):
27 | return self.p * (x ** (1 + rank))
28 |
29 |
30 | def _run_and_get_grads(model, rank):
31 | torch.manual_seed(2024)
32 | input = torch.randn(40, 20)
33 | output = model(input, rank)
34 | output.mean().backward()
35 | param = next(model.parameters())
36 | return param.grad
37 |
38 |
39 | def test_ddp_comm_hook(comm_hook, comm_wrapper, comm_state_option):
40 | ddp_kwargs = DistributedDataParallelKwargs(
41 | comm_hook=comm_hook,
42 | comm_wrapper=comm_wrapper,
43 | comm_state_option=comm_state_option,
44 | )
45 | accelerator = Accelerator(kwargs_handlers=[ddp_kwargs])
46 |
47 | model = accelerator.prepare(MockModel())
48 | hook_grads = _run_and_get_grads(model, accelerator.local_process_index)
49 |
50 | reference_model = torch.nn.parallel.DistributedDataParallel(
51 | MockModel().to(accelerator.device),
52 | device_ids=[accelerator.local_process_index],
53 | output_device=accelerator.local_process_index,
54 | )
55 | reference_grads = _run_and_get_grads(reference_model, accelerator.local_process_index)
56 |
57 | torch.testing.assert_close(hook_grads, reference_grads, rtol=1e-2, atol=1e-2)
58 |
59 |
60 | def main():
61 | for comm_hook, comm_wrapper, comm_state_option in [
62 | (DDPCommunicationHookType.NO, DDPCommunicationHookType.NO, {}),
63 | (DDPCommunicationHookType.FP16, DDPCommunicationHookType.NO, {}),
64 | (DDPCommunicationHookType.BF16, DDPCommunicationHookType.NO, {}),
65 | (DDPCommunicationHookType.POWER_SGD, DDPCommunicationHookType.NO, {}),
66 | (DDPCommunicationHookType.POWER_SGD, DDPCommunicationHookType.FP16, {}),
67 | (DDPCommunicationHookType.POWER_SGD, DDPCommunicationHookType.BF16, {}),
68 | (DDPCommunicationHookType.POWER_SGD, DDPCommunicationHookType.NO, {"matrix_approximation_rank": 2}),
69 | (DDPCommunicationHookType.BATCHED_POWER_SGD, DDPCommunicationHookType.NO, {}),
70 | (DDPCommunicationHookType.BATCHED_POWER_SGD, DDPCommunicationHookType.FP16, {}),
71 | (DDPCommunicationHookType.BATCHED_POWER_SGD, DDPCommunicationHookType.BF16, {}),
72 | ]:
73 | if is_hpu_available():
74 | HPU_UNSUPPORTED_COMM_HOOKS = {DDPCommunicationHookType.FP16, DDPCommunicationHookType.BF16}
75 | if comm_hook in HPU_UNSUPPORTED_COMM_HOOKS or comm_wrapper in HPU_UNSUPPORTED_COMM_HOOKS:
76 | print(f"Skipping test DDP comm hook: {comm_hook}, comm wrapper: {comm_wrapper} on HPU")
77 | continue
78 |
79 | print(f"Test DDP comm hook: {comm_hook}, comm wrapper: {comm_wrapper}")
80 | test_ddp_comm_hook(comm_hook, comm_wrapper, comm_state_option)
81 | PartialState().destroy_process_group()
82 |
83 |
84 | if __name__ == "__main__":
85 | main()
86 |
--------------------------------------------------------------------------------
/src/accelerate/utils/rich.py:
--------------------------------------------------------------------------------
1 | # Copyright 2022 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from .imports import is_rich_available
16 |
17 |
18 | if is_rich_available():
19 | from rich.traceback import install
20 |
21 | install(show_locals=False)
22 |
23 | else:
24 | raise ModuleNotFoundError("To use the rich extension, install rich with `pip install rich`")
25 |
--------------------------------------------------------------------------------
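Note: since the module above does all of its work at import time, opting in from user code is just an import; a hedged sketch with a guard for environments where `rich` is absent:

```python
# Importing the module installs rich tracebacks (show_locals=False) as a side effect;
# it raises ModuleNotFoundError when `rich` is not installed, hence the guard.
try:
    import accelerate.utils.rich  # noqa: F401
except ModuleNotFoundError:
    pass  # keep the default Python traceback formatting
```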
/src/accelerate/utils/torch_xla.py:
--------------------------------------------------------------------------------
1 | # Copyright 2022 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import importlib.metadata
16 | import subprocess
17 | import sys
18 |
19 |
20 | def install_xla(upgrade: bool = False):
21 | """
22 | Helper function to install appropriate xla wheels based on the `torch` version in Google Colaboratory.
23 |
24 | Args:
25 | upgrade (`bool`, *optional*, defaults to `False`):
26 | Whether to upgrade `torch` and install the latest `torch_xla` wheels.
27 |
28 | Example:
29 |
30 | ```python
31 | >>> from accelerate.utils import install_xla
32 |
33 | >>> install_xla(upgrade=True)
34 | ```
35 | """
36 | in_colab = False
37 | if "IPython" in sys.modules:
38 | in_colab = "google.colab" in str(sys.modules["IPython"].get_ipython())
39 |
40 | if in_colab:
41 | if upgrade:
42 | torch_install_cmd = ["pip", "install", "-U", "torch"]
43 | subprocess.run(torch_install_cmd, check=True)
44 | # get the current version of torch
45 | torch_version = importlib.metadata.version("torch")
46 | torch_version_trunc = torch_version[: torch_version.rindex(".")]
47 | xla_wheel = f"https://storage.googleapis.com/tpu-pytorch/wheels/colab/torch_xla-{torch_version_trunc}-cp37-cp37m-linux_x86_64.whl"
48 | xla_install_cmd = ["pip", "install", xla_wheel]
49 | subprocess.run(xla_install_cmd, check=True)
50 | else:
51 | raise RuntimeError("`install_xla` utility works only on Google Colab.")
52 |
--------------------------------------------------------------------------------
/src/accelerate/utils/tqdm.py:
--------------------------------------------------------------------------------
1 | # Copyright 2022 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
16 | from .imports import is_tqdm_available
17 |
18 |
19 | if is_tqdm_available():
20 | from tqdm.auto import tqdm as _tqdm
21 |
22 | from ..state import PartialState
23 |
24 |
25 | def tqdm(*args, main_process_only: bool = True, **kwargs):
26 | """
27 | Wrapper around `tqdm.tqdm` that optionally displays only on the main process.
28 |
29 | Args:
30 | main_process_only (`bool`, *optional*, defaults to `True`):
31 | Whether to display the progress bar only on the main process.
32 | """
33 | if not is_tqdm_available():
34 | raise ImportError("Accelerate's `tqdm` module requires `tqdm` to be installed. Please run `pip install tqdm`.")
35 | if len(args) > 0 and isinstance(args[0], bool):
36 | raise ValueError(
37 | "Passing `True` or `False` as the first argument to Accelerate's `tqdm` wrapper is unsupported. "
38 | "Please use the `main_process_only` keyword argument instead."
39 | )
40 | disable = kwargs.pop("disable", False)
41 | if main_process_only and not disable:
42 | disable = PartialState().local_process_index != 0
43 | return _tqdm(*args, **kwargs, disable=disable)
44 |
--------------------------------------------------------------------------------
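Note: a short usage sketch of the wrapper above; the iterable and description are placeholders, and the import path simply mirrors the module shown:

```python
from accelerate.utils.tqdm import tqdm

# By default only the local main process (local_process_index == 0) renders the
# bar; every other rank gets `disable=True` injected by the wrapper.
for step in tqdm(range(100), desc="training"):
    pass

# Pass main_process_only=False to show a bar on every process instead.
for step in tqdm(range(100), main_process_only=False):
    pass
```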
/src/accelerate/utils/versions.py:
--------------------------------------------------------------------------------
1 | # Copyright 2022 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import importlib.metadata
16 | from typing import Union
17 |
18 | from packaging.version import Version, parse
19 |
20 | from .constants import STR_OPERATION_TO_FUNC
21 |
22 |
23 | torch_version = parse(importlib.metadata.version("torch"))
24 |
25 |
26 | def compare_versions(library_or_version: Union[str, Version], operation: str, requirement_version: str):
27 | """
28 | Compares a library version to some requirement using a given operation.
29 |
30 | Args:
31 | library_or_version (`str` or `packaging.version.Version`):
32 | A library name or a version to check.
33 | operation (`str`):
34 | A string representation of an operator, such as `">"` or `"<="`.
35 | requirement_version (`str`):
36 | The version to compare the library version against
37 | """
38 | if operation not in STR_OPERATION_TO_FUNC.keys():
39 | raise ValueError(f"`operation` must be one of {list(STR_OPERATION_TO_FUNC.keys())}, received {operation}")
40 | operation = STR_OPERATION_TO_FUNC[operation]
41 | if isinstance(library_or_version, str):
42 | library_or_version = parse(importlib.metadata.version(library_or_version))
43 | return operation(library_or_version, parse(requirement_version))
44 |
45 |
46 | def is_torch_version(operation: str, version: str):
47 | """
48 | Compares the current PyTorch version to a given reference with an operation.
49 |
50 | Args:
51 | operation (`str`):
52 | A string representation of an operator, such as `">"` or `"<="`
53 | version (`str`):
54 | A string version of PyTorch
55 | """
56 | return compare_versions(torch_version, operation, version)
57 |
--------------------------------------------------------------------------------
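Note: a brief sketch of how these helpers are typically used for version gating; the requirement strings below are illustrative, not values taken from the library:

```python
from accelerate.utils.versions import compare_versions, is_torch_version

# Gate a code path on the installed PyTorch version.
if is_torch_version(">=", "2.0.0"):
    print("running on PyTorch 2.x or newer")

# The same comparison works for any installed library, referenced by name.
if compare_versions("packaging", ">=", "20.0"):
    print("packaging satisfies the requirement")
```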
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2025 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
--------------------------------------------------------------------------------
/tests/deepspeed/ds_config_zero2.json:
--------------------------------------------------------------------------------
1 | {
2 | "fp16": {
3 | "enabled": "auto",
4 | "loss_scale": 0,
5 | "loss_scale_window": 1000,
6 | "initial_scale_power": 16,
7 | "hysteresis": 2,
8 | "min_loss_scale": 1
9 | },
10 | "bf16": {
11 | "enabled": "auto"
12 | },
13 | "optimizer": {
14 | "type": "AdamW",
15 | "params": {
16 | "lr": "auto",
17 | "weight_decay": "auto",
18 | "torch_adam": true,
19 | "adam_w_mode": true
20 | }
21 | },
22 | "scheduler": {
23 | "type": "WarmupLR",
24 | "params": {
25 | "warmup_min_lr": "auto",
26 | "warmup_max_lr": "auto",
27 | "warmup_num_steps": "auto"
28 | }
29 | },
30 | "zero_optimization": {
31 | "stage": 2,
32 | "offload_optimizer": {
33 | "device": "cpu",
34 | "pin_memory": true
35 | },
36 | "allgather_partitions": true,
37 | "allgather_bucket_size": 2e8,
38 | "overlap_comm": true,
39 | "reduce_scatter": true,
40 | "reduce_bucket_size": "auto",
41 | "contiguous_gradients": true
42 | },
43 | "gradient_accumulation_steps": 1,
44 | "gradient_clipping": "auto",
45 | "steps_per_print": 2000,
46 | "train_batch_size": "auto",
47 | "train_micro_batch_size_per_gpu": "auto",
48 | "wall_clock_breakdown": false
49 | }
--------------------------------------------------------------------------------
/tests/deepspeed/ds_config_zero2_model_only.json:
--------------------------------------------------------------------------------
1 | {
2 | "fp16": {
3 | "enabled": "auto",
4 | "loss_scale": 0,
5 | "loss_scale_window": 1000,
6 | "initial_scale_power": 16,
7 | "hysteresis": 2,
8 | "min_loss_scale": 1
9 | },
10 | "bf16": {
11 | "enabled": "auto"
12 | },
13 | "zero_optimization": {
14 | "stage": 2,
15 | "offload_optimizer": {
16 | "device": "cpu",
17 | "pin_memory": true
18 | },
19 | "allgather_partitions": true,
20 | "allgather_bucket_size": 2e8,
21 | "overlap_comm": true,
22 | "reduce_scatter": true,
23 | "reduce_bucket_size": "auto",
24 | "contiguous_gradients": true
25 | },
26 | "gradient_accumulation_steps": 1,
27 | "gradient_clipping": "auto",
28 | "steps_per_print": 2000,
29 | "train_batch_size": "auto",
30 | "train_micro_batch_size_per_gpu": "auto",
31 | "wall_clock_breakdown": false
32 | }
--------------------------------------------------------------------------------
/tests/deepspeed/ds_config_zero3.json:
--------------------------------------------------------------------------------
1 | {
2 | "fp16": {
3 | "enabled": "auto",
4 | "loss_scale": 0,
5 | "loss_scale_window": 1000,
6 | "initial_scale_power": 16,
7 | "hysteresis": 2,
8 | "min_loss_scale": 1
9 | },
10 | "bf16": {
11 | "enabled": "auto"
12 | },
13 | "optimizer": {
14 | "type": "AdamW",
15 | "params": {
16 | "lr": "auto",
17 | "weight_decay": "auto",
18 | "torch_adam": true,
19 | "adam_w_mode": true
20 | }
21 | },
22 | "scheduler": {
23 | "type": "WarmupLR",
24 | "params": {
25 | "warmup_min_lr": "auto",
26 | "warmup_max_lr": "auto",
27 | "warmup_num_steps": "auto"
28 | }
29 | },
30 | "zero_optimization": {
31 | "stage": 3,
32 | "offload_optimizer": {
33 | "device": "cpu",
34 | "pin_memory": true
35 | },
36 | "offload_param": {
37 | "device": "cpu",
38 | "pin_memory": true
39 | },
40 | "overlap_comm": true,
41 | "contiguous_gradients": true,
42 | "sub_group_size": 1e9,
43 | "reduce_bucket_size": "auto",
44 | "stage3_prefetch_bucket_size": "auto",
45 | "stage3_param_persistence_threshold": "auto",
46 | "stage3_max_live_parameters": 1e9,
47 | "stage3_max_reuse_distance": 1e9,
48 | "stage3_gather_16bit_weights_on_model_save": "auto"
49 | },
50 | "gradient_accumulation_steps": 1,
51 | "gradient_clipping": "auto",
52 | "steps_per_print": 2000,
53 | "train_batch_size": "auto",
54 | "train_micro_batch_size_per_gpu": "auto",
55 | "wall_clock_breakdown": false
56 | }
--------------------------------------------------------------------------------
/tests/deepspeed/ds_config_zero3_model_only.json:
--------------------------------------------------------------------------------
1 | {
2 | "fp16": {
3 | "enabled": "auto",
4 | "loss_scale": 0,
5 | "loss_scale_window": 1000,
6 | "initial_scale_power": 16,
7 | "hysteresis": 2,
8 | "min_loss_scale": 1
9 | },
10 | "bf16": {
11 | "enabled": "auto"
12 | },
13 | "zero_optimization": {
14 | "stage": 3,
15 | "offload_param": {
16 | "device": "cpu",
17 | "pin_memory": true
18 | },
19 | "overlap_comm": true,
20 | "sub_group_size": 1e9,
21 | "reduce_bucket_size": 1e9,
22 | "stage3_prefetch_bucket_size": 1e9,
23 | "stage3_param_persistence_threshold": 1e9,
24 | "stage3_max_live_parameters": 1e9,
25 | "stage3_max_reuse_distance": 1e9,
26 | "stage3_gather_16bit_weights_on_model_save": true
27 | },
28 | "train_micro_batch_size_per_gpu": 1
29 | }
--------------------------------------------------------------------------------
/tests/test_configs/0_11_0.yaml:
--------------------------------------------------------------------------------
1 | compute_environment: LOCAL_MACHINE
2 | deepspeed_config: {}
3 | distributed_type: 'NO'
4 | fsdp_config: {}
5 | machine_rank: 0
6 | main_process_ip: null
7 | main_process_port: null
8 | main_training_function: main
9 | mixed_precision: 'no'
10 | num_machines: 1
11 | num_processes: 1
12 | use_cpu: false
--------------------------------------------------------------------------------
/tests/test_configs/0_12_0.yaml:
--------------------------------------------------------------------------------
1 | compute_environment: LOCAL_MACHINE
2 | deepspeed_config: {}
3 | distributed_type: 'NO'
4 | downcast_bf16: 'no'
5 | fsdp_config: {}
6 | machine_rank: 0
7 | main_process_ip: null
8 | main_process_port: null
9 | main_training_function: main
10 | mixed_precision: 'no'
11 | num_machines: 1
12 | num_processes: 1
13 | use_cpu: false
--------------------------------------------------------------------------------
/tests/test_configs/0_28_0_mpi.yaml:
--------------------------------------------------------------------------------
1 | compute_environment: LOCAL_MACHINE
2 | debug: false
3 | distributed_type: MULTI_CPU
4 | downcast_bf16: 'no'
5 | ipex_config:
6 | ipex: true
7 | machine_rank: 0
8 | main_process_ip: 127.0.0.1
9 | main_process_port: 29500
10 | main_training_function: main
11 | mixed_precision: 'no'
12 | mpirun_config:
13 | mpirun_ccl: '1'
14 | mpirun_hostfile: /home/user/hostfile
15 | num_machines: 4
16 | num_processes: 16
17 | rdzv_backend: static
18 | same_network: true
19 | tpu_env: []
20 | tpu_use_cluster: false
21 | tpu_use_sudo: false
22 | use_cpu: true
23 |
--------------------------------------------------------------------------------
/tests/test_configs/0_30_0_sagemaker.yaml:
--------------------------------------------------------------------------------
1 | compute_environment: AMAZON_SAGEMAKER
2 | debug: false
3 | distributed_type: NO
4 | mixed_precision: fp16
5 | use_cpu: false
6 | ec2_instance_type: MY_TYPE
7 | iam_role_name: MY_ROLE
8 |
--------------------------------------------------------------------------------
/tests/test_configs/0_34_0_fp8.yaml:
--------------------------------------------------------------------------------
1 | compute_environment: LOCAL_MACHINE
2 | debug: false
3 | distributed_type: MULTI_GPU
4 | downcast_bf16: 'no'
5 | enable_cpu_affinity: false
6 | fp8_config:
7 | amax_compute_algo: max
8 | amax_history_len: 1024
9 | backend: TE
10 | fp8_format: E4M3
11 | interval: 1
12 | margin: 0
13 | override_linear_precision: (false, false, false)
14 | use_autocast_during_eval: false
15 | gpu_ids: all
16 | machine_rank: 0
17 | main_training_function: main
18 | mixed_precision: fp8
19 | num_machines: 1
20 | num_processes: 2
21 | rdzv_backend: static
22 | same_network: true
23 | tpu_env: []
24 | tpu_use_cluster: false
25 | tpu_use_sudo: false
26 | use_cpu: false
27 |
--------------------------------------------------------------------------------
/tests/test_configs/README.md:
--------------------------------------------------------------------------------
1 | This folder contains test configs for `accelerate config`. A config should be generated for each major version
2 | by running `accelerate config` and selecting the "No distributed training" option.
--------------------------------------------------------------------------------
/tests/test_configs/invalid_keys.yaml:
--------------------------------------------------------------------------------
1 | compute_environment: LOCAL_MACHINE
2 | deepspeed_config: {}
3 | distributed_type: 'NO'
4 | downcast_bf16: 'no'
5 | fsdp_config: {}
6 | machine_rank: 0
7 | main_process_ip: null
8 | main_process_port: null
9 | main_training_function: main
10 | mixed_precision: 'no'
11 | num_machines: 1
12 | num_processes: 1
13 | use_cpu: false
14 | invalid_key: "invalid_value"
15 | another_invalid_key: "another_invalid_value"
--------------------------------------------------------------------------------
/tests/test_configs/latest.yaml:
--------------------------------------------------------------------------------
1 | compute_environment: LOCAL_MACHINE
2 | deepspeed_config: {}
3 | distributed_type: 'NO'
4 | downcast_bf16: 'no'
5 | fsdp_config: {}
6 | gpu_ids: all
7 | machine_rank: 0
8 | main_process_ip: null
9 | main_process_port: null
10 | main_training_function: main
11 | megatron_lm_config: {}
12 | mixed_precision: 'no'
13 | num_machines: 1
14 | num_processes: 1
15 | rdzv_backend: static
16 | same_network: true
17 | use_cpu: false
18 | tpu_name: 'test-tpu'
19 | tpu_zone: 'us-central1-a'
20 | commands: null
21 | command_file: tests/test_samples/test_command_file.sh
--------------------------------------------------------------------------------
/tests/test_configs/latest_fsdp.yaml:
--------------------------------------------------------------------------------
1 | compute_environment: LOCAL_MACHINE
2 | debug: false
3 | distributed_type: FSDP
4 | downcast_bf16: 'no'
5 | enable_cpu_affinity: false
6 | fsdp_config:
7 | fsdp_activation_checkpointing: false
8 | fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
9 | fsdp_backward_prefetch: BACKWARD_PRE
10 | fsdp_cpu_ram_efficient_loading: true
11 | fsdp_forward_prefetch: false
12 | fsdp_offload_params: false
13 | fsdp_sharding_strategy: FULL_SHARD
14 | fsdp_state_dict_type: SHARDED_STATE_DICT
15 | fsdp_sync_module_states: true
16 | fsdp_transformer_layer_cls_to_wrap: BertLayer
17 | fsdp_use_orig_params: true
18 | machine_rank: 0
19 | main_training_function: main
20 | mixed_precision: 'no'
21 | num_machines: 1
22 | num_processes: 1
23 | rdzv_backend: static
24 | same_network: true
25 | tpu_env: []
26 | tpu_use_cluster: false
27 | tpu_use_sudo: false
28 | use_cpu: false
29 |
--------------------------------------------------------------------------------
/tests/test_configs/validate_launch_cmd.yaml:
--------------------------------------------------------------------------------
1 | compute_environment: LOCAL_MACHINE
2 | debug: true
3 | num_processes: 1
4 | distributed_type: 'NO'
5 | fsdp_config:
6 | fsdp_sync_module_states: false
7 | deepspeed_config:
8 | deepspeed_config_file: path/to/be/ignored
9 |
--------------------------------------------------------------------------------
/tests/test_cpu.py:
--------------------------------------------------------------------------------
1 | # Copyright 2022 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import unittest
16 |
17 | from accelerate import debug_launcher
18 | from accelerate.test_utils import require_cpu, test_ops, test_script
19 |
20 |
21 | @require_cpu
22 | class MultiCPUTester(unittest.TestCase):
23 | def test_cpu(self):
24 | debug_launcher(test_script.main)
25 |
26 | def test_ops(self):
27 | debug_launcher(test_ops.main)
28 |
--------------------------------------------------------------------------------
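Note: for orientation, a hedged sketch of calling `debug_launcher` outside the test suite, with a placeholder training function; the call signature mirrors how the tests in this file and the next invoke it:

```python
from accelerate import Accelerator, debug_launcher


def training_function():
    # Runs once per spawned CPU process; Accelerator picks up the emulated
    # distributed environment that debug_launcher configures.
    accelerator = Accelerator()
    print(f"process {accelerator.process_index} of {accelerator.num_processes}")


# Run the function on two CPU processes without going through `accelerate launch`.
debug_launcher(training_function, num_processes=2)
```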
/tests/test_grad_sync.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from accelerate import debug_launcher
16 | from accelerate.test_utils import (
17 | DEFAULT_LAUNCH_COMMAND,
18 | device_count,
19 | execute_subprocess_async,
20 | path_in_accelerate_package,
21 | require_cpu,
22 | require_multi_device,
23 | require_non_cpu,
24 | run_first,
25 | test_sync,
26 | )
27 | from accelerate.test_utils.testing import AccelerateTestCase
28 | from accelerate.utils import patch_environment
29 |
30 |
31 | class SyncScheduler(AccelerateTestCase):
32 | test_file_path = path_in_accelerate_package("test_utils", "scripts", "test_sync.py")
33 |
34 | @require_cpu
35 | def test_gradient_sync_cpu_noop(self):
36 | debug_launcher(test_sync.main, num_processes=1)
37 |
38 | @require_cpu
39 | def test_gradient_sync_cpu_multi(self):
40 | debug_launcher(test_sync.main)
41 |
42 | @require_non_cpu
43 | def test_gradient_sync_gpu(self):
44 | test_sync.main()
45 |
46 | @run_first
47 | @require_multi_device
48 | def test_gradient_sync_gpu_multi(self):
49 | print(f"Found {device_count} devices.")
50 | cmd = DEFAULT_LAUNCH_COMMAND + [self.test_file_path]
51 | with patch_environment(omp_num_threads=1):
52 | execute_subprocess_async(cmd)
53 |
--------------------------------------------------------------------------------
/tests/test_launch.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import argparse
16 | import unittest
17 |
18 | from accelerate.utils.launch import prepare_multi_gpu_env
19 |
20 |
21 | class TestPrepareMultiGpuEnv(unittest.TestCase):
22 | def test_auto_port_selection(self):
23 | args = argparse.Namespace(
24 | num_processes=1,
25 | num_machines=1,
26 | main_process_ip="127.0.0.1",
27 | main_process_port=0,
28 | machine_rank=0,
29 | module=False,
30 | no_python=False,
31 | debug=False,
32 | gpu_ids="all",
33 | mixed_precision="no",
34 | dynamo_backend="NO",
35 | dynamo_mode="default",
36 | dynamo_use_fullgraph=False,
37 | dynamo_use_dynamic=False,
38 | dynamo_use_regional_compilation=False,
39 | use_fsdp=False,
40 | fsdp_cpu_ram_efficient_loading=False,
41 | fsdp_sync_module_states=False,
42 | fsdp_version=None,
43 | fsdp_sharding_strategy=None,
44 | fsdp_reshard_after_forward=False,
45 | fsdp_offload_params=False,
46 | fsdp_min_num_params=0,
47 | fsdp_auto_wrap_policy=None,
48 | fsdp_transformer_layer_cls_to_wrap=None,
49 | fsdp_backward_prefetch=None,
50 | fsdp_state_dict_type=None,
51 | fsdp_forward_prefetch=False,
52 | fsdp_use_orig_params=False,
53 | fsdp_activation_checkpointing=False,
54 | use_tp=False,
55 | tp_size=1,
56 | use_megatron_lm=False,
57 | megatron_lm_tp_degree=1,
58 | megatron_lm_pp_degree=1,
59 | megatron_lm_gradient_clipping=1.0,
60 | megatron_lm_num_micro_batches=None,
61 | megatron_lm_sequence_parallelism=None,
62 | megatron_lm_recompute_activations=None,
63 | megatron_lm_use_distributed_optimizer=None,
64 | num_cpu_threads_per_process=1,
65 | enable_cpu_affinity=False,
66 | same_network=False,
67 | )
68 |
69 | prepare_multi_gpu_env(args)
70 | self.assertIn("master_port", args.__dict__)
71 | self.assertNotEqual(args.master_port, "0")
72 | self.assertTrue(args.master_port.isdigit())
73 |
--------------------------------------------------------------------------------
/tests/test_logging.py:
--------------------------------------------------------------------------------
1 | # Copyright 2024 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | import inspect
15 | import logging
16 | import os
17 |
18 | import pytest
19 |
20 | from accelerate import Accelerator
21 | from accelerate.logging import get_logger
22 | from accelerate.state import AcceleratorState
23 |
24 |
25 | def current_lineno() -> int:
26 | # A simple helper that returns the lineno of its call-site.
27 | caller_frame = inspect.currentframe().f_back
28 | caller_info = inspect.getframeinfo(caller_frame)
29 | return caller_info.lineno
30 |
31 |
32 | class CustomLogger(logging.LoggerAdapter):
33 | # Mocks a user-defined custom logger wrapper that sets `stacklevel=3`.
34 | def log(self, level, msg, *args, **kwargs):
35 | # E.g. the user wants to modify `stacklevel`, `accelerate.logging`
36 | # should respect the user's `stacklevel`. For the specific value
37 | # of `3`, calling `CustomLogger.log()`, etc., should log that callsite,
38 | # rather than the callsite of the following `self.logger.log()`.
39 | kwargs["stacklevel"] = 3
40 | self.logger.log(level, msg, *args, **kwargs)
41 |
42 |
43 | @pytest.fixture(scope="module")
44 | def accelerator():
45 | accelerator = Accelerator()
46 | yield accelerator
47 | AcceleratorState._reset_state(True)
48 |
49 |
50 | @pytest.mark.usefixtures("accelerator")
51 | def test_log_stack(caplog):
52 | logger = get_logger(__name__)
53 | logging.basicConfig(
54 | format="%(filename)s:%(name)s:%(lineno)s:%(funcName)s - %(message)s",
55 | datefmt="%m/%d %H:%M:%S",
56 | )
57 |
58 | message = "Test"
59 | lineno = current_lineno() + 1 # the next line is the actual callsite
60 | logger.warning(message)
61 |
62 | assert len(caplog.records) == 1
63 | rec = caplog.records[0]
64 | assert rec.levelname == logging.getLevelName(logging.WARNING)
65 | assert rec.filename == os.path.basename(__file__)
66 | assert rec.name == __name__
67 | assert rec.lineno == lineno
68 | assert rec.funcName == test_log_stack.__name__
69 | assert rec.message == message
70 |
71 |
72 | @pytest.mark.usefixtures("accelerator")
73 | def test_custom_stacklevel(caplog):
74 | wrapped_logger = get_logger(__name__)
75 | logging.basicConfig(
76 | format="%(filename)s:%(name)s:%(lineno)s:%(funcName)s - %(message)s",
77 | datefmt="%m/%d %H:%M:%S",
78 | )
79 | logger = CustomLogger(wrapped_logger, {})
80 |
81 | message = "Test"
82 | lineno = current_lineno() + 1 # the next line is the actual callsite
83 | logger.warning(message)
84 |
85 | # `CustomLogger.log` sets a custom `stacklevel=3`, so `logger.warning` should
86 | # log its callsite (rather than that of the `wrapped_logger`).
87 | assert len(caplog.records) == 1
88 | rec = caplog.records[0]
89 | assert rec.levelname == logging.getLevelName(logging.WARNING)
90 | assert rec.filename == os.path.basename(__file__)
91 | assert rec.name == __name__
92 | assert rec.lineno == lineno
93 | assert rec.funcName == test_custom_stacklevel.__name__
94 | assert rec.message == message
95 |
--------------------------------------------------------------------------------
/tests/test_metrics.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import unittest
16 |
17 | import numpy as np
18 | from packaging import version
19 |
20 | from accelerate import debug_launcher
21 | from accelerate.test_utils import (
22 | DEFAULT_LAUNCH_COMMAND,
23 | device_count,
24 | execute_subprocess_async,
25 | path_in_accelerate_package,
26 | require_cpu,
27 | require_huggingface_suite,
28 | require_multi_device,
29 | require_single_device,
30 | run_first,
31 | )
32 | from accelerate.utils import patch_environment
33 |
34 |
35 | @require_huggingface_suite
36 | @unittest.skipIf(version.parse(np.__version__) >= version.parse("2.0"), "Test requires numpy version < 2.0")
37 | class MetricTester(unittest.TestCase):
38 | def setUp(self):
39 | self.test_file_path = path_in_accelerate_package("test_utils", "scripts", "external_deps", "test_metrics.py")
40 |
41 | from accelerate.test_utils.scripts.external_deps import test_metrics # noqa: F401
42 |
43 | self.test_metrics = test_metrics
44 |
45 | @require_cpu
46 | def test_metric_cpu_noop(self):
47 | debug_launcher(self.test_metrics.main, num_processes=1)
48 |
49 | @require_cpu
50 | def test_metric_cpu_multi(self):
51 | debug_launcher(self.test_metrics.main)
52 |
53 | @require_single_device
54 | def test_metric_accelerator(self):
55 | self.test_metrics.main()
56 |
57 | @run_first
58 | @require_multi_device
59 | def test_metric_accelerator_multi(self):
60 | print(f"Found {device_count} devices.")
61 | cmd = DEFAULT_LAUNCH_COMMAND + [self.test_file_path]
62 | with patch_environment(omp_num_threads=1, ACCELERATE_LOG_LEVEL="INFO"):
63 | execute_subprocess_async(cmd)
64 |
--------------------------------------------------------------------------------
/tests/test_optimizer.py:
--------------------------------------------------------------------------------
1 | # Copyright 2022 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import pickle
16 |
17 | import torch
18 |
19 | from accelerate import Accelerator
20 | from accelerate.test_utils import require_cpu, require_fp16, require_non_cpu
21 | from accelerate.test_utils.testing import AccelerateTestCase
22 |
23 |
24 | @require_cpu
25 | class CPUOptimizerTester(AccelerateTestCase):
26 | def test_accelerated_optimizer_pickling(self):
27 | model = torch.nn.Linear(10, 10)
28 | optimizer = torch.optim.SGD(model.parameters(), 0.1)
29 | accelerator = Accelerator()
30 | optimizer = accelerator.prepare(optimizer)
31 | try:
32 | pickle.loads(pickle.dumps(optimizer))
33 | except Exception as e:
34 | self.fail(f"Accelerated optimizer pickling failed with {e}")
35 |
36 |
37 | @require_fp16
38 | @require_non_cpu
39 | class OptimizerTester(AccelerateTestCase):
40 | def test_accelerated_optimizer_step_was_skipped(self):
41 | model = torch.nn.Linear(5, 5)
42 | optimizer = torch.optim.SGD(model.parameters(), 0.1)
43 | accelerator = Accelerator(mixed_precision="fp16")
44 | model, optimizer = accelerator.prepare(model, optimizer)
45 |
46 | loss = model(torch.randn(2, 5, device=accelerator.device)).sum()
47 | accelerator.backward(loss)
48 | for p in model.parameters():
49 | # Fake the gradients, as if there's no overflow
50 | p.grad.fill_(0.01)
51 |
52 | optimizer.step()
53 | assert optimizer.step_was_skipped is False
54 |
55 | loss = model(torch.randn(2, 5, device=accelerator.device)).sum()
56 | accelerator.backward(loss)
57 | for p in model.parameters():
58 | p.grad.fill_(0.01)
59 | # Manually set the gradients to be NaN, as if there's an overflow
60 | p.grad[0] = torch.tensor(float("nan"))
61 |
62 | optimizer.step()
63 | assert optimizer.step_was_skipped is True
64 |
65 | loss = model(torch.randn(2, 5, device=accelerator.device)).sum()
66 | accelerator.backward(loss)
67 | for p in model.parameters():
68 | p.grad.fill_(0.01)
69 | # Manually set the gradients to be NaN, as if there's an overflow
70 | p.grad[0] = torch.tensor(float("nan"))
71 |
72 | optimizer.step()
73 | assert optimizer.step_was_skipped is True
74 |
75 | loss = model(torch.randn(2, 5, device=accelerator.device)).sum()
76 | accelerator.backward(loss)
77 | for p in model.parameters():
78 | # Fake the gradients, as if there's no overflow
79 | p.grad.fill_(0.01)
80 |
81 | optimizer.step()
82 | assert optimizer.step_was_skipped is False
83 |
--------------------------------------------------------------------------------
/tests/test_sagemaker.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | import unittest
15 | from dataclasses import dataclass
16 |
17 | import pytest
18 |
19 | from accelerate.commands.config.config_args import SageMakerConfig
20 | from accelerate.utils import ComputeEnvironment
21 | from accelerate.utils.launch import _convert_nargs_to_dict
22 |
23 |
24 | @dataclass
25 | class MockLaunchConfig(SageMakerConfig):
26 | compute_environment = ComputeEnvironment.AMAZON_SAGEMAKER
27 | fp16 = True
28 | ec2_instance_type = "ml.p3.2xlarge"
29 | iam_role_name = "accelerate_sagemaker_execution_role"
30 | profile = "hf-sm"
31 | region = "us-east-1"
32 | num_machines = 1
33 | base_job_name = "accelerate-sagemaker-1"
34 | pytorch_version = "1.6"
35 | transformers_version = "4.4"
36 | training_script = "train.py"
37 | success_training_script_args = [
38 | "--model_name_or_path",
39 | "bert",
40 | "--do_train",
41 | "False",
42 | "--epochs",
43 | "3",
44 | "--learning_rate",
45 | "5e-5",
46 | "--max_steps",
47 | "50.5",
48 | ]
49 | fail_training_script_args = [
50 | "--model_name_or_path",
51 | "bert",
52 | "--do_train",
53 | "--do_test",
54 | "False",
55 | "--do_predict",
56 | "--epochs",
57 | "3",
58 | "--learning_rate",
59 | "5e-5",
60 | "--max_steps",
61 | "50.5",
62 | ]
63 |
64 |
65 | class SageMakerLaunch(unittest.TestCase):
66 | def test_args_convert(self):
67 | # `_convert_nargs_to_dict` should infer the proper Python type for each string CLI argument.
68 | converted_args = _convert_nargs_to_dict(MockLaunchConfig.success_training_script_args)
69 | assert isinstance(converted_args["model_name_or_path"], str)
70 | assert isinstance(converted_args["do_train"], bool)
71 | assert isinstance(converted_args["epochs"], int)
72 | assert isinstance(converted_args["learning_rate"], float)
73 | assert isinstance(converted_args["max_steps"], float)
74 |
75 | with pytest.raises(ValueError):
76 | _convert_nargs_to_dict(MockLaunchConfig.fail_training_script_args)
77 |
--------------------------------------------------------------------------------
/tests/test_samples/MRPC/dev.csv:
--------------------------------------------------------------------------------
1 | label,sentence1,sentence2
2 | equivalent,He said the foodservice pie business doesn 't fit the company 's long-term growth strategy .,""" The foodservice pie business does not fit our long-term growth strategy ."
3 | not_equivalent,Magnarelli said Racicot hated the Iraqi regime and looked forward to using his long years of training in the war .,"His wife said he was "" 100 percent behind George Bush "" and looked forward to using his years of training in the war ."
4 | not_equivalent,"The dollar was at 116.92 yen against the yen , flat on the session , and at 1.2891 against the Swiss franc , also flat .","The dollar was at 116.78 yen JPY = , virtually flat on the session , and at 1.2871 against the Swiss franc CHF = , down 0.1 percent ."
5 | equivalent,The AFL-CIO is waiting until October to decide if it will endorse a candidate .,The AFL-CIO announced Wednesday that it will decide in October whether to endorse a candidate before the primaries .
6 | not_equivalent,No dates have been set for the civil or the criminal trial .,"No dates have been set for the criminal or civil cases , but Shanley has pleaded not guilty ."
7 | equivalent,Wal-Mart said it would check all of its million-plus domestic workers to ensure they were legally employed .,It has also said it would review all of its domestic employees more than 1 million to ensure they have legal status .
8 |
--------------------------------------------------------------------------------
/tests/test_samples/MRPC/train.csv:
--------------------------------------------------------------------------------
1 | label,sentence1,sentence2
2 | equivalent,He said the foodservice pie business doesn 't fit the company 's long-term growth strategy .,""" The foodservice pie business does not fit our long-term growth strategy ."
3 | not_equivalent,Magnarelli said Racicot hated the Iraqi regime and looked forward to using his long years of training in the war .,"His wife said he was "" 100 percent behind George Bush "" and looked forward to using his years of training in the war ."
4 | not_equivalent,"The dollar was at 116.92 yen against the yen , flat on the session , and at 1.2891 against the Swiss franc , also flat .","The dollar was at 116.78 yen JPY = , virtually flat on the session , and at 1.2871 against the Swiss franc CHF = , down 0.1 percent ."
5 | equivalent,The AFL-CIO is waiting until October to decide if it will endorse a candidate .,The AFL-CIO announced Wednesday that it will decide in October whether to endorse a candidate before the primaries .
6 | not_equivalent,No dates have been set for the civil or the criminal trial .,"No dates have been set for the criminal or civil cases , but Shanley has pleaded not guilty ."
7 | equivalent,Wal-Mart said it would check all of its million-plus domestic workers to ensure they were legally employed .,It has also said it would review all of its domestic employees more than 1 million to ensure they have legal status .
8 |
--------------------------------------------------------------------------------
/tests/test_samples/test_command_file.sh:
--------------------------------------------------------------------------------
1 | echo "hello world"
2 | echo "this is a second command"
--------------------------------------------------------------------------------
/tests/test_tpu.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import os
16 | import sys
17 | import unittest
18 |
19 | from accelerate.test_utils import execute_subprocess_async, path_in_accelerate_package, require_tpu
20 |
21 |
22 | class MultiTPUTester(unittest.TestCase):
23 | test_file_path = path_in_accelerate_package("test_utils", "scripts", "test_script.py")
24 | test_dir = os.path.dirname(__file__)
25 |
26 | @require_tpu
27 | def test_tpu(self):
28 | distributed_args = f"""
29 | {self.test_dir}/xla_spawn.py
30 | --num_cores 8
31 | {self.test_file_path}
32 | """.split()
33 | cmd = [sys.executable] + distributed_args
34 | execute_subprocess_async(cmd)
35 |
--------------------------------------------------------------------------------
/tests/tp/test_tp.py:
--------------------------------------------------------------------------------
1 | # Copyright 2022 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
16 | from accelerate.test_utils.testing import (
17 | TempDirTestCase,
18 | execute_subprocess_async,
19 | get_launch_command,
20 | path_in_accelerate_package,
21 | require_multi_device,
22 | require_non_torch_xla,
23 | require_tp,
24 | require_transformers,
25 | run_first,
26 | slow,
27 | )
28 | from accelerate.utils import patch_environment
29 |
30 |
31 | @require_non_torch_xla
32 | @require_multi_device
33 | @require_transformers
34 | @require_tp
35 | @run_first
36 | @slow
37 | class TPIntegrationTest(TempDirTestCase):
38 | test_scripts_folder = path_in_accelerate_package("test_utils", "scripts", "external_deps")
39 |
40 | def setUp(self):
41 | super().setUp()
42 | self.test_tp_size = 2
43 | self.model_name_or_path = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
44 | self.batch_size = 1
45 | from accelerate.utils import set_seed
46 |
47 | set_seed(42)
48 |
49 | def test_working_of_tp(self):
50 | self.test_file_path = self.test_scripts_folder / "test_performance.py"
51 | cmd = get_launch_command(num_processes=self.test_tp_size, num_machines=1, machine_rank=0)
52 | cmd.extend(
53 | [
54 | self.test_file_path,
55 | f"--output_dir={self.tmpdir}",
56 | f"--model_name_or_path={self.model_name_or_path}",
57 | "--add_pad_token=true",
58 | "--tp_plan=auto",
59 | f"--tp_size={self.test_tp_size}",
60 | ]
61 | )
62 | with patch_environment(omp_num_threads=1):
63 | execute_subprocess_async(cmd)
64 |
--------------------------------------------------------------------------------
/tests/xla_spawn.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | """
16 | A simple launcher script for TPU training
17 |
18 | Inspired by https://github.com/pytorch/pytorch/blob/master/torch/distributed/launch.py
19 |
20 | ::
21 | >>> python xla_spawn.py --num_cores=NUM_CORES_YOU_HAVE
22 | YOUR_TRAINING_SCRIPT.py (--arg1 --arg2 --arg3 and all other
23 | arguments of your training script)
24 |
25 | """
26 |
27 | import importlib
28 | import sys
29 | from argparse import REMAINDER, ArgumentParser
30 | from pathlib import Path
31 |
32 | import torch_xla.distributed.xla_multiprocessing as xmp
33 | from torch_xla import device_count
34 |
35 |
36 | def parse_args():
37 | """
38 | Helper function parsing the command line options
39 | @retval ArgumentParser
40 | """
41 | parser = ArgumentParser(
42 | description=(
43 | "PyTorch TPU distributed training launch helper utility that will spawn up multiple distributed processes"
44 | )
45 | )
46 |
47 | # Optional arguments for the launch helper
48 | num_devices = device_count()
49 | parser.add_argument(
50 | "--num_cores",
51 | type=int,
52 | default=num_devices,
53 | help="Number of TPU cores to use (1 or number of available devices).",
54 | )
55 |
56 | # positional
57 | parser.add_argument(
58 | "training_script",
59 | type=str,
60 | help=(
61 | "The full path to the single TPU training "
62 | "program/script to be launched in parallel, "
63 | "followed by all the arguments for the "
64 | "training script"
65 | ),
66 | )
67 |
68 | # rest from the training program
69 | parser.add_argument("training_script_args", nargs=REMAINDER)
70 |
71 | return parser.parse_args()
72 |
73 |
74 | def main():
75 | args = parse_args()
76 |
77 | # Import training_script as a module.
78 | script_fpath = Path(args.training_script)
79 | sys.path.append(str(script_fpath.parent.resolve()))
80 | mod_name = script_fpath.stem
81 | mod = importlib.import_module(mod_name)
82 |
83 | # Patch sys.argv
84 | sys.argv = [args.training_script] + args.training_script_args
85 | num_cores = args.num_cores
86 | if num_cores == device_count() and num_cores != 1:
87 | # There is an error in xmp.spawn that causes it to fail when num_cores is specified and not 1, so we set it to
88 | # None when it matches the number of devices.
89 | num_cores = None
90 | xmp.spawn(mod._mp_fn, args=(), nprocs=num_cores)
91 |
92 |
93 | if __name__ == "__main__":
94 | main()
95 |
--------------------------------------------------------------------------------
/utils/stale.py:
--------------------------------------------------------------------------------
1 | # Copyright 2022 The HuggingFace Team, the AllenNLP library authors. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """
15 | Script to close stale issues. Taken in part from the AllenNLP repository.
16 | https://github.com/allenai/allennlp.
17 | """
18 |
19 | import os
20 | from datetime import datetime as dt
21 | from datetime import timezone
22 |
23 | from github import Github
24 |
25 |
26 | LABELS_TO_EXEMPT = [
27 | "good first issue",
28 | "feature request",
29 | "wip",
30 | ]
31 |
32 |
33 | def main():
34 | g = Github(os.environ["GITHUB_TOKEN"])
35 | repo = g.get_repo("huggingface/accelerate")
36 | open_issues = repo.get_issues(state="open")
37 |
38 | for issue in open_issues:
39 | comments = sorted([comment for comment in issue.get_comments()], key=lambda i: i.created_at, reverse=True)
40 | last_comment = comments[0] if len(comments) > 0 else None
41 | current_time = dt.now(timezone.utc)
42 | days_since_updated = (current_time - issue.updated_at).days
43 | days_since_creation = (current_time - issue.created_at).days
44 | if (
45 | last_comment is not None
46 | and last_comment.user.login == "github-actions[bot]"
47 | and days_since_updated > 7
48 | and days_since_creation >= 30
49 | and not any(label.name.lower() in LABELS_TO_EXEMPT for label in issue.get_labels())
50 | ):
51 | # Close the issue: it has had 7 days of inactivity since the bot's stale comment.
52 | issue.edit(state="closed")
53 | elif (
54 | days_since_updated > 23
55 | and days_since_creation >= 30
56 | and not any(label.name.lower() in LABELS_TO_EXEMPT for label in issue.get_labels())
57 | ):
58 | # Add stale comment
59 | issue.create_comment(
60 | "This issue has been automatically marked as stale because it has not had "
61 | "recent activity. If you think this still needs to be addressed "
62 | "please comment on this thread.\n\nPlease note that issues that do not follow the "
63 | "[contributing guidelines](https://github.com/huggingface/accelerate/blob/main/CONTRIBUTING.md) "
64 | "are likely to be ignored."
65 | )
66 |
67 |
68 | if __name__ == "__main__":
69 | main()
70 |
--------------------------------------------------------------------------------