├── .devcontainer └── devcontainer.json ├── .github ├── ISSUE_TEMPLATE │ └── bug-report.yml ├── PULL_REQUEST_TEMPLATE.md └── workflows │ ├── build-docker-images-release.yml │ ├── build_and_run_tests.yml │ ├── build_docker_images.yml │ ├── build_documentation.yml │ ├── build_pr_documentation.yml │ ├── fp8_runner.yml │ ├── gaudi3_scheduled.yml │ ├── integration_tests.yml │ ├── nightly.yml │ ├── pr_style_bot.yml │ ├── quality.yml │ ├── run_merge_tests.yml │ ├── self_hosted_integration_tests.yml │ ├── stale.yml │ ├── test.yml │ ├── test_imports.yml │ ├── trufflehog.yml │ └── upload_pr_documentation.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── Makefile ├── README.md ├── benchmarks ├── README.md ├── big_model_inference │ ├── README.md │ ├── big_model_inference.py │ └── measures_util.py ├── fp8 │ ├── ms_amp │ │ ├── Dockerfile │ │ ├── ddp.py │ │ ├── distrib_deepspeed.py │ │ ├── fp8_utils.py │ │ └── non_distributed.py │ ├── torchao │ │ ├── Dockerfile │ │ ├── README.md │ │ ├── ddp.py │ │ ├── distrib_deepspeed.py │ │ ├── fp8_utils.py │ │ ├── fsdp.py │ │ └── non_distributed.py │ └── transformer_engine │ │ ├── Dockerfile │ │ ├── README.md │ │ ├── ddp.py │ │ ├── distrib_deepspeed.py │ │ ├── fp8_utils.py │ │ ├── fsdp.py │ │ └── non_distributed.py ├── fsdp2 │ ├── README.md │ ├── imgs │ │ ├── allocated_memory.png │ │ └── reserved_memory.png │ ├── main.py │ ├── measure_utils.py │ ├── utils.py │ └── visualize.py └── torch.compile │ ├── README.md │ ├── imgs │ ├── compilation_time.png │ └── speedup_factor.png │ └── regional_compilation.py ├── docker ├── README.md ├── accelerate-cpu │ └── Dockerfile ├── accelerate-gpu-deepspeed │ └── Dockerfile └── accelerate-gpu │ └── Dockerfile ├── docs ├── Makefile ├── README.md └── source │ ├── _toctree.yml │ ├── basic_tutorials │ ├── execution.md │ ├── install.md │ ├── launch.md │ ├── migration.md │ ├── notebook.md │ ├── overview.md │ ├── tpu.md │ └── troubleshooting.md │ ├── concept_guides │ ├── big_model_inference.md │ ├── deferring_execution.md │ ├── fsdp1_vs_fsdp2.md │ ├── fsdp_and_deepspeed.md │ ├── gradient_synchronization.md │ ├── internal_mechanism.md │ ├── low_precision_training.md │ ├── performance.md │ └── training_tpu.md │ ├── imgs │ ├── accelerate_logo.png │ ├── course_banner.png │ └── profile_export.png │ ├── index.md │ ├── package_reference │ ├── accelerator.md │ ├── big_modeling.md │ ├── cli.md │ ├── deepspeed.md │ ├── fp8.md │ ├── fsdp.md │ ├── inference.md │ ├── kwargs.md │ ├── launchers.md │ ├── logging.md │ ├── megatron_lm.md │ ├── state.md │ ├── torch_wrappers.md │ ├── tracking.md │ └── utilities.md │ ├── quicktour.md │ └── usage_guides │ ├── big_modeling.md │ ├── checkpoint.md │ ├── compilation.md │ ├── ddp_comm_hook.md │ ├── deepspeed.md │ ├── deepspeed_multiple_model.md │ ├── distributed_inference.md │ ├── explore.md │ ├── fsdp.md │ ├── gaudi.md │ ├── gradient_accumulation.md │ ├── ipex.md │ ├── local_sgd.md │ ├── low_precision_training.md │ ├── megatron_lm.md │ ├── model_size_estimator.md │ ├── mps.md │ ├── profiler.md │ ├── quantization.md │ ├── sagemaker.md │ ├── tracking.md │ └── training_zoo.md ├── examples ├── README.md ├── by_feature │ ├── README.md │ ├── automatic_gradient_accumulation.py │ ├── checkpointing.py │ ├── cross_validation.py │ ├── ddp_comm_hook.py │ ├── deepspeed_with_config_support.py │ ├── early_stopping.py │ ├── fsdp_with_peak_mem_tracking.py │ ├── gradient_accumulation.py │ ├── gradient_accumulation_for_autoregressive_models.py │ ├── local_sgd.py │ ├── 
megatron_lm_gpt_pretraining.py │ ├── memory.py │ ├── multi_process_metrics.py │ ├── profiler.py │ ├── schedule_free.py │ └── tracking.py ├── complete_cv_example.py ├── complete_nlp_example.py ├── config_yaml_templates │ ├── README.md │ ├── deepspeed.yaml │ ├── fp8.yaml │ ├── fsdp.yaml │ ├── multi_gpu.yaml │ ├── multi_node.yaml │ ├── run_me.py │ └── single_gpu.yaml ├── cv_example.py ├── deepspeed_config_templates │ ├── zero_stage1_config.json │ ├── zero_stage2_config.json │ ├── zero_stage2_offload_config.json │ ├── zero_stage3_config.json │ └── zero_stage3_offload_config.json ├── inference │ ├── distributed │ │ ├── README.md │ │ ├── distributed_image_generation.py │ │ ├── distributed_speech_generation.py │ │ ├── florence2.py │ │ ├── llava_next_video.py │ │ ├── phi2.py │ │ └── stable_diffusion.py │ └── pippy │ │ ├── README.md │ │ ├── bert.py │ │ ├── gpt2.py │ │ ├── llama.py │ │ ├── requirements.txt │ │ └── t5.py ├── multigpu_remote_launcher.py ├── nlp_example.py ├── requirements.txt └── slurm │ ├── fsdp_config.yaml │ ├── submit_multicpu.sh │ ├── submit_multigpu.sh │ ├── submit_multinode.sh │ └── submit_multinode_fsdp.sh ├── manim_animations ├── big_model_inference │ ├── stage_1.py │ ├── stage_2.py │ ├── stage_3.py │ ├── stage_4.py │ └── stage_5.py └── dataloaders │ ├── stage_0.py │ ├── stage_1.py │ ├── stage_2.py │ ├── stage_3.py │ ├── stage_4.py │ ├── stage_5.py │ ├── stage_6.py │ └── stage_7.py ├── pyproject.toml ├── setup.py ├── src └── accelerate │ ├── __init__.py │ ├── accelerator.py │ ├── big_modeling.py │ ├── checkpointing.py │ ├── commands │ ├── __init__.py │ ├── accelerate_cli.py │ ├── config │ │ ├── __init__.py │ │ ├── cluster.py │ │ ├── config.py │ │ ├── config_args.py │ │ ├── config_utils.py │ │ ├── default.py │ │ ├── sagemaker.py │ │ └── update.py │ ├── env.py │ ├── estimate.py │ ├── launch.py │ ├── menu │ │ ├── __init__.py │ │ ├── cursor.py │ │ ├── helpers.py │ │ ├── input.py │ │ ├── keymap.py │ │ └── selection_menu.py │ ├── merge.py │ ├── test.py │ ├── to_fsdp2.py │ ├── tpu.py │ └── utils.py │ ├── data_loader.py │ ├── hooks.py │ ├── inference.py │ ├── launchers.py │ ├── local_sgd.py │ ├── logging.py │ ├── memory_utils.py │ ├── optimizer.py │ ├── scheduler.py │ ├── state.py │ ├── test_utils │ ├── __init__.py │ ├── examples.py │ ├── scripts │ │ ├── __init__.py │ │ ├── external_deps │ │ │ ├── __init__.py │ │ │ ├── test_checkpointing.py │ │ │ ├── test_ds_multiple_model.py │ │ │ ├── test_metrics.py │ │ │ ├── test_peak_memory_usage.py │ │ │ ├── test_performance.py │ │ │ ├── test_pippy.py │ │ │ └── test_zero3_integration.py │ │ ├── test_cli.py │ │ ├── test_ddp_comm_hook.py │ │ ├── test_distributed_data_loop.py │ │ ├── test_merge_weights.py │ │ ├── test_notebook.py │ │ ├── test_ops.py │ │ ├── test_script.py │ │ └── test_sync.py │ ├── testing.py │ └── training.py │ ├── tracking.py │ └── utils │ ├── __init__.py │ ├── ao.py │ ├── bnb.py │ ├── constants.py │ ├── dataclasses.py │ ├── deepspeed.py │ ├── environment.py │ ├── fsdp_utils.py │ ├── imports.py │ ├── launch.py │ ├── megatron_lm.py │ ├── memory.py │ ├── modeling.py │ ├── offload.py │ ├── operations.py │ ├── other.py │ ├── random.py │ ├── rich.py │ ├── torch_xla.py │ ├── tqdm.py │ ├── transformer_engine.py │ └── versions.py ├── tests ├── __init__.py ├── deepspeed │ ├── ds_config_zero2.json │ ├── ds_config_zero2_model_only.json │ ├── ds_config_zero3.json │ ├── ds_config_zero3_model_only.json │ ├── test_deepspeed.py │ └── test_deepspeed_multiple_model.py ├── fsdp │ └── test_fsdp.py ├── test_accelerator.py ├── test_big_modeling.py ├── 
test_cli.py ├── test_compile.py ├── test_configs │ ├── 0_11_0.yaml │ ├── 0_12_0.yaml │ ├── 0_28_0_mpi.yaml │ ├── 0_30_0_sagemaker.yaml │ ├── 0_34_0_fp8.yaml │ ├── README.md │ ├── invalid_keys.yaml │ ├── latest.yaml │ ├── latest_fsdp.yaml │ └── validate_launch_cmd.yaml ├── test_cpu.py ├── test_data_loader.py ├── test_examples.py ├── test_fp8.py ├── test_grad_sync.py ├── test_hooks.py ├── test_imports.py ├── test_kwargs_handlers.py ├── test_launch.py ├── test_load_checkpoint_and_dispatch_with_broadcast.py ├── test_logging.py ├── test_memory_utils.py ├── test_metrics.py ├── test_modeling_utils.py ├── test_multigpu.py ├── test_offload.py ├── test_optimizer.py ├── test_quantization.py ├── test_sagemaker.py ├── test_samples │ ├── MRPC │ │ ├── dev.csv │ │ └── train.csv │ └── test_command_file.sh ├── test_scheduler.py ├── test_state_checkpointing.py ├── test_tpu.py ├── test_tracking.py ├── test_utils.py ├── tp │ └── test_tp.py └── xla_spawn.py └── utils ├── log_reports.py └── stale.py /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | // File only needed for VSCode users to have proper Docker based interpreters 2 | { 3 | "name": "accelerate_dev_environment", 4 | "build": { 5 | // ACTION NEEDED: comment/uncomment the relevant line depending on whether you are in a CPU/GPU environment 6 | "dockerfile": "../docker/accelerate-cpu/Dockerfile" 7 | // "dockerfile": "../docker/accelerate-gpu/Dockerfile" 8 | }, 9 | "runArgs": [ 10 | // ACTION NEEDED: uncomment the next line if your local machine has GPUs available 11 | // "--gpus", "all", 12 | // Enable the docker container to access system resources 13 | "--ipc", "host" 14 | ], 15 | "remoteEnv": { 16 | "PYTHONPATH": "${containerEnv:PATH}:${containerWorkspaceFolder}" 17 | }, 18 | "customizations": { 19 | "vscode": { 20 | "extensions": [ 21 | // Ensure we have IntelliSense in VSCode when running inside container 22 | "ms-python.python" 23 | ] 24 | } 25 | }, 26 | "workspaceFolder": "/workspaces/accelerate", 27 | // Need git for VSCode to color code modifications. Only runs when building environment. 28 | "onCreateCommand": "apt-get update && apt-get install -y git && pip install -e '.[dev]'" 29 | } -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug-report.yml: -------------------------------------------------------------------------------- 1 | name: "\U0001F41B Bug Report" 2 | description: Submit a bug report to help us improve Accelerate 3 | body: 4 | - type: markdown 5 | attributes: 6 | value: | 7 | Thanks for taking the time to submit a bug report! 🐛 8 | If this is not a bug related to the Accelerate library directly, but instead a general question about your code or the library specifically please use the [forums](https://discuss.huggingface.co/c/accelerate/18). 9 | 10 | - type: textarea 11 | id: system-info 12 | attributes: 13 | label: System Info 14 | description: Please share your accelerate configuration with us. 
You can run the command `accelerate env` and copy-paste its outputs below 15 | render: Shell 16 | placeholder: accelerate version, OS, python version, numpy version, torch version, and accelerate's configuration 17 | validations: 18 | required: true 19 | 20 | - type: checkboxes 21 | id: information-scripts-examples 22 | attributes: 23 | label: Information 24 | description: 'The problem arises when using:' 25 | options: 26 | - label: "The official example scripts" 27 | - label: "My own modified scripts" 28 | 29 | - type: checkboxes 30 | id: information-tasks 31 | attributes: 32 | label: Tasks 33 | description: "The tasks I am working on are:" 34 | options: 35 | - label: "One of the scripts in the examples/ folder of Accelerate or an officially supported `no_trainer` script in the `examples` folder of the `transformers` repo (such as `run_no_trainer_glue.py`)" 36 | - label: "My own task or dataset (give details below)" 37 | 38 | - type: textarea 39 | id: reproduction 40 | validations: 41 | required: true 42 | attributes: 43 | label: Reproduction 44 | description: | 45 | Please provide a code sample that reproduces the problem you ran into. It can be a Colab link or just a code snippet. 46 | If you have code snippets, error messages, stack traces please provide them here as well. 47 | Important! Use code tags to correctly format your code. See https://help.github.com/en/github/writing-on-github/creating-and-highlighting-code-blocks#syntax-highlighting 48 | Do not use screenshots, as they are hard to read and (more importantly) don't allow others to copy-and-paste your code. 49 | 50 | placeholder: | 51 | Steps to reproduce the behavior: 52 | 53 | 1. 54 | 2. 55 | 3. 56 | 57 | - type: textarea 58 | id: expected-behavior 59 | validations: 60 | required: true 61 | attributes: 62 | label: Expected behavior 63 | description: "A clear and concise description of what you would expect to happen." 64 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | # What does this PR do? 2 | 3 | 12 | 13 | 14 | 15 | Fixes # (issue) 16 | 17 | 18 | ## Before submitting 19 | - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). 20 | - [ ] Did you read the [contributor guideline](https://github.com/huggingface/accelerate/blob/main/CONTRIBUTING.md#submitting-a-pull-request-pr), 21 | Pull Request section? 22 | - [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link 23 | to it if that's the case. 24 | - [ ] Did you make sure to update the documentation with your changes? Here are the 25 | [documentation guidelines](https://github.com/huggingface/accelerate/tree/main/docs), and 26 | [here are tips on formatting docstrings](https://github.com/huggingface/accelerate/tree/main/docs#writing-documentation---specification). 27 | - [ ] Did you write any new necessary tests? 28 | 29 | 30 | ## Who can review? 31 | 32 | Anyone in the community is free to review the PR once the tests have passed. Feel free to tag 33 | members/contributors who may be interested in your PR. 
34 | 35 | -------------------------------------------------------------------------------- /.github/workflows/build-docker-images-release.yml: -------------------------------------------------------------------------------- 1 | name: Build Docker images (releases) 2 | 3 | on: 4 | workflow_dispatch: 5 | release: 6 | types: [published] 7 | 8 | concurrency: 9 | group: docker-image-builds 10 | cancel-in-progress: false 11 | 12 | jobs: 13 | get-version: 14 | runs-on: ubuntu-latest 15 | outputs: 16 | version: ${{ steps.step1.outputs.version }} 17 | steps: 18 | - uses: actions/checkout@4 19 | - id: step1 20 | run: echo "version=$(python setup.py --version)" >> $GITHUB_OUTPUT 21 | 22 | version-cpu: 23 | name: "Latest Accelerate CPU [version]" 24 | runs-on: 25 | group: aws-general-8-plus 26 | needs: get-version 27 | steps: 28 | - name: Set up Docker Buildx 29 | uses: docker/setup-buildx-action@v2 30 | - name: Login to DockerHub 31 | uses: docker/login-action@v2 32 | with: 33 | username: ${{ secrets.DOCKERHUB_USERNAME }} 34 | password: ${{ secrets.DOCKERHUB_PASSWORD }} 35 | 36 | - name: Build and Push CPU 37 | uses: docker/build-push-action@v4 38 | with: 39 | file: docker/accelerate-cpu/Dockerfile 40 | push: true 41 | tags: huggingface/accelerate:cpu-release-${{ needs.get-version.outputs.version }} 42 | 43 | version-cuda: 44 | name: "Latest Accelerate GPU [version]" 45 | runs-on: 46 | group: aws-g6-4xlarge-plus 47 | needs: get-version 48 | steps: 49 | - name: Set up Docker Buildx 50 | uses: docker/setup-buildx-action@v2 51 | - name: Login to DockerHub 52 | uses: docker/login-action@v2 53 | with: 54 | username: ${{ secrets.DOCKERHUB_USERNAME }} 55 | password: ${{ secrets.DOCKERHUB_PASSWORD }} 56 | 57 | - name: Build and Push GPU 58 | uses: docker/build-push-action@v4 59 | with: 60 | file: docker/accelerate-gpu/Dockerfile 61 | push: true 62 | tags: huggingface/accelerate:gpu-release-${{needs.get-version.outputs.version}} 63 | 64 | version-cuda-deepspeed: 65 | name: "Latest Accelerate GPU DeepSpeed [version]" 66 | runs-on: 67 | group: aws-g6-4xlarge-plus 68 | needs: get-version 69 | steps: 70 | - name: Set up Docker Buildx 71 | uses: docker/setup-buildx-action@v2 72 | - name: Login to DockerHub 73 | uses: docker/login-action@v2 74 | with: 75 | username: ${{ secrets.DOCKERHUB_USERNAME }} 76 | password: ${{ secrets.DOCKERHUB_PASSWORD }} 77 | 78 | - name: Build and Push GPU 79 | uses: docker/build-push-action@v4 80 | with: 81 | file: docker/accelerate-gpu-deepspeed/Dockerfile 82 | push: true 83 | tags: huggingface/accelerate:gpu-deepspeed-release-${{needs.get-version.outputs.version}} 84 | 85 | version-cuda-fp8-transformerengine: 86 | name: "Latest Accelerate GPU FP8 TransformerEngine [version]" 87 | runs-on: 88 | group: aws-g6-4xlarge-plus 89 | needs: get-version 90 | steps: 91 | - name: Set up Docker Buildx 92 | uses: docker/setup-buildx-action@v2 93 | - name: Login to DockerHub 94 | uses: docker/login-action@v2 95 | with: 96 | username: ${{ secrets.DOCKERHUB_USERNAME }} 97 | password: ${{ secrets.DOCKERHUB_PASSWORD }} 98 | 99 | - name: Build and Push GPU 100 | uses: docker/build-push-action@v4 101 | with: 102 | file: docker/accelerate-gpu/Dockerfile 103 | push: true 104 | tags: huggingface/accelerate:gpu-fp8-transformerengine-release-${{needs.get-version.outputs.version}} -------------------------------------------------------------------------------- /.github/workflows/build_and_run_tests.yml: -------------------------------------------------------------------------------- 1 | name: Trigger docker 
images and run tests 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | workflow_dispatch: 8 | 9 | env: 10 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 11 | 12 | jobs: 13 | check-for-source: 14 | runs-on: ubuntu-latest 15 | name: Check if setup was changed 16 | outputs: 17 | changed: ${{ steps.was_changed.outputs.changed }} 18 | steps: 19 | - uses: actions/checkout@v4 20 | with: 21 | fetch-depth: "2" 22 | 23 | - name: Get changed files 24 | id: changed-files 25 | uses: tj-actions/changed-files@3f54ebb830831fc121d3263c1857cfbdc310cdb9 #v42 26 | 27 | - name: Was setup changed 28 | id: was_changed 29 | run: | 30 | for file in ${{ steps.changed-files.outputs.all_changed_files }}; do 31 | if [ `basename "${file}"` == "setup.py" ]; then 32 | echo "changed=1" >> $GITHUB_OUTPUT 33 | fi 34 | done 35 | 36 | build-docker-containers: 37 | needs: check-for-source 38 | if: (github.event_name == 'push') && (needs.check-for-source.outputs.changed == '1') 39 | uses: ./.github/workflows/build_docker_images.yml 40 | secrets: inherit 41 | 42 | run-merge-tests: 43 | needs: build-docker-containers 44 | if: always() 45 | uses: ./.github/workflows/run_merge_tests.yml 46 | 47 | run-integration-tests: 48 | needs: build-docker-containers 49 | if: always() 50 | uses: ./.github/workflows/self_hosted_integration_tests.yml 51 | -------------------------------------------------------------------------------- /.github/workflows/build_documentation.yml: -------------------------------------------------------------------------------- 1 | name: Build documentation 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - doc-builder* 8 | - v*-release 9 | 10 | jobs: 11 | build: 12 | uses: huggingface/doc-builder/.github/workflows/build_main_documentation.yml@main 13 | with: 14 | commit_sha: ${{ github.sha }} 15 | package: accelerate 16 | custom_container: huggingface/transformers-doc-builder 17 | secrets: 18 | hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }} 19 | -------------------------------------------------------------------------------- /.github/workflows/build_pr_documentation.yml: -------------------------------------------------------------------------------- 1 | name: Build PR Documentation 2 | 3 | on: 4 | pull_request: 5 | 6 | concurrency: 7 | group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} 8 | cancel-in-progress: true 9 | 10 | jobs: 11 | build: 12 | uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main 13 | with: 14 | commit_sha: ${{ github.event.pull_request.head.sha }} 15 | pr_number: ${{ github.event.number }} 16 | package: accelerate 17 | custom_container: huggingface/transformers-doc-builder 18 | -------------------------------------------------------------------------------- /.github/workflows/fp8_runner.yml: -------------------------------------------------------------------------------- 1 | name: Test FP8 Runner 2 | 3 | on: 4 | workflow_dispatch: 5 | 6 | env: 7 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 8 | jobs: 9 | set-prev-day: 10 | runs-on: ubuntu-latest 11 | outputs: 12 | prev-day: ${{ steps.set-prev-day.outputs.prev-day }} 13 | steps: 14 | - name: Set PREV_DAY 15 | id: set-prev-day 16 | run: | 17 | PREV_DAY=$(date -d "yesterday" '+%Y-%m-%d') 18 | echo "prev-day=$PREV_DAY" >> $GITHUB_OUTPUT 19 | run-fp8-tests: 20 | needs: set-prev-day 21 | runs-on: 22 | group: aws-g6e-12xlarge 23 | container: 24 | image: huggingface/accelerate:gpu-fp8-transformerengine-nightly-${{ needs.set-prev-day.outputs.prev-day }} 25 | options: --gpus all --shm-size "16gb" 26 | steps: 27 | - 
uses: actions/checkout@v3 28 | - name: Install the library 29 | run: | 30 | pip install -e .[test_prod,test_fp8] 31 | - name: Show installed libraries 32 | run: | 33 | pip freeze 34 | - name: Run TE FP8 tests 35 | run: | 36 | python -m pytest -s -v ./tests/test_fp8.py 37 | 38 | -------------------------------------------------------------------------------- /.github/workflows/gaudi3_scheduled.yml: -------------------------------------------------------------------------------- 1 | name: Gaudi3 tests (scheduled) 2 | 3 | on: 4 | workflow_dispatch: 5 | schedule: # every day at 6 AM UTC 6 | - cron: "0 6 * * *" 7 | 8 | concurrency: 9 | group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} 10 | cancel-in-progress: true 11 | 12 | jobs: 13 | run-gaudi3-tests: 14 | runs-on: 15 | group: itac-bm-emr-gaudi3-dell-2gaudi 16 | 17 | container: 18 | image: docker://vault.habana.ai/gaudi-docker/1.20.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest 19 | options: --runtime=habana --shm-size=64G --cap-add=sys_nice --env HABANA_VISIBLE_DEVICES 20 | env: 21 | OMPI_MCA_btl_vader_single_copy_mechanism: none 22 | PT_ENABLE_INT64_SUPPORT: 1 23 | PT_HPU_LAZY_MODE: 0 24 | RUN_SLOW: 1 25 | 26 | steps: 27 | - name: HL-SMI (1) 28 | run: | 29 | hl-smi 30 | echo "HABANA_VISIBLE_DEVICES=${HABANA_VISIBLE_DEVICES}" 31 | echo "HABANA_VISIBLE_MODULES=${HABANA_VISIBLE_MODULES}" 32 | 33 | - name: Extract HPU visible modules 34 | id: add-modules 35 | run: | 36 | export HABANA_VISIBLE_MODULES=$(hl-smi -Q module_id -f csv,noheader | tr '\n' ',' | sed 's/,$//') 37 | echo "HABANA_VISIBLE_MODULES=${HABANA_VISIBLE_MODULES}" >> $GITHUB_ENV 38 | 39 | - name: HL-SMI (2) 40 | run: | 41 | hl-smi 42 | echo "HABANA_VISIBLE_DEVICES=${HABANA_VISIBLE_DEVICES}" 43 | echo "HABANA_VISIBLE_MODULES=${HABANA_VISIBLE_MODULES}" 44 | 45 | - name: Checkout to Accelerate 46 | uses: actions/checkout@v4 47 | 48 | - name: Install Accelerate with Transformers & DeepSpeed 49 | run: | 50 | pip install -e .[testing] \ 51 | git+https://github.com/HabanaAI/DeepSpeed.git@1.20.0 \ 52 | git+https://github.com/huggingface/transformers.git 53 | 54 | - name: Run CLI tests 55 | if: ${{ !cancelled() && (success() || failure()) }} 56 | run: | 57 | make test_cli 58 | 59 | - name: Run Core tests 60 | if: ${{ !cancelled() && (success() || failure()) }} 61 | run: | 62 | make test_core 63 | 64 | - name: Run Big Modeling tests 65 | if: ${{ !cancelled() && (success() || failure()) }} 66 | run: | 67 | make test_big_modeling 68 | 69 | - name: Run FSDP integration tests 70 | if: ${{ !cancelled() && (success() || failure()) }} 71 | run: | 72 | make test_fsdp 73 | 74 | - name: Run DeepSpeed integration tests 75 | if: ${{ !cancelled() && (success() || failure()) }} 76 | run: | 77 | make test_deepspeed 78 | 79 | - name: Run Examples tests 80 | if: ${{ !cancelled() && (success() || failure()) }} 81 | run: | 82 | make test_examples 83 | -------------------------------------------------------------------------------- /.github/workflows/integration_tests.yml: -------------------------------------------------------------------------------- 1 | # CI for specifically ensuring integrations work fine (`transformers` mainly) 2 | # Useful tips: 3 | # - New integrations to test should have its own job, and follow a strategy method where we check both 4 | # the pypi and github versions. 5 | # - When checking the latest release of the integration, use 6 | # git checkout $(git describe --tags `git rev-list --tags --max-count=1`) to get the latest release. 
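#   As a sketch (not a job that exists in this workflow), a new integration could follow that
#   strategy approach with a matrix along the lines of:
#     strategy:
#       matrix:
#         integration-source: [pypi, github]
#   and install the integration from PyPI or from a GitHub checkout depending on the value.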
7 | 8 | name: Integration Tests 9 | 10 | on: 11 | pull_request: 12 | paths: 13 | - "src/**" 14 | - "tests/**" 15 | - ".github/**" 16 | - "examples/**" 17 | - "setup.py" 18 | types: [opened, synchronize, reopened] 19 | 20 | env: 21 | HF_HOME: ~/hf_cache 22 | 23 | jobs: 24 | run-trainer-tests: 25 | runs-on: ubuntu-latest 26 | strategy: 27 | fail-fast: false 28 | steps: 29 | - uses: actions/checkout@v4 30 | - name: Set up python 3.9 31 | uses: actions/setup-python@v5 32 | with: 33 | python-version: 3.9 34 | cache: 'pip' 35 | cache-dependency-path: 'setup.py' 36 | 37 | - name: Install Accelerate from source 38 | run: | 39 | pip install --upgrade pip 40 | pip install -e . 41 | 42 | - name: Clone and install transformers 43 | run: | 44 | cd .. 45 | git clone https://github.com/huggingface/transformers 46 | cd transformers 47 | pip install .[torch,testing] 48 | 49 | - name: Show installed libraries 50 | run: | 51 | pip freeze 52 | 53 | - name: Run Trainer tests 54 | env: 55 | WANDB_DISABLED: true 56 | run: | 57 | cd ../transformers 58 | pytest -sv tests/trainer 59 | -------------------------------------------------------------------------------- /.github/workflows/pr_style_bot.yml: -------------------------------------------------------------------------------- 1 | # To run this bot, comment "@bot /style" on a PR 2 | name: Style Bot 3 | 4 | on: 5 | issue_comment: 6 | types: [created] 7 | 8 | permissions: 9 | contents: write 10 | pull-requests: write 11 | 12 | jobs: 13 | style: 14 | uses: huggingface/huggingface_hub/.github/workflows/style-bot-action.yml@main 15 | with: 16 | python_quality_dependencies: "[quality]" 17 | style_command_type: "default" 18 | secrets: 19 | bot_token: ${{ secrets.GITHUB_TOKEN }} -------------------------------------------------------------------------------- /.github/workflows/quality.yml: -------------------------------------------------------------------------------- 1 | name: Quality Check 2 | 3 | on: [pull_request] 4 | 5 | jobs: 6 | quality: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v4 10 | - name: Set up Python 3.9 11 | uses: actions/setup-python@v5 12 | with: 13 | python-version: 3.9 14 | cache: 'pip' 15 | cache-dependency-path: 'setup.py' 16 | - name: Install Python dependencies 17 | run: pip install -e .[quality] 18 | - name: Run Quality check 19 | run: make quality 20 | - name: Check if failure 21 | if: ${{ failure() }} 22 | run: | 23 | echo "Quality check failed. 
Please ensure the right dependency versions are installed with 'pip install -e .[quality]' and rerun 'make style; make quality;'" >> $GITHUB_STEP_SUMMARY 24 | 25 | -------------------------------------------------------------------------------- /.github/workflows/stale.yml: -------------------------------------------------------------------------------- 1 | name: Stale Bot 2 | 3 | on: 4 | schedule: 5 | - cron: "0 15 * * *" 6 | workflow_dispatch: 7 | 8 | jobs: 9 | close_stale_issues: 10 | name: Close Stale Issues 11 | if: github.repository == 'huggingface/accelerate' 12 | runs-on: ubuntu-latest 13 | permissions: 14 | issues: write 15 | pull-requests: write 16 | env: 17 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 18 | steps: 19 | - uses: actions/checkout@v4 20 | 21 | - name: Setup Python 22 | uses: actions/setup-python@v5 23 | with: 24 | python-version: 3.9 25 | cache: 'pip' 26 | cache-dependency-path: 'setup.py' 27 | 28 | - name: Install requirements 29 | run: | 30 | pip install PyGithub 31 | - name: Close stale issues 32 | run: | 33 | python utils/stale.py 34 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Run Tests 2 | 3 | on: 4 | pull_request: 5 | paths: 6 | - "src/**" 7 | - "tests/**" 8 | - ".github/**" 9 | - "examples/**" 10 | - "setup.py" 11 | types: [opened, synchronize, reopened] 12 | 13 | env: 14 | HF_HOME: ~/hf_cache 15 | TESTING_MOCKED_DATALOADERS: "1" 16 | IS_GITHUB_CI: "1" 17 | 18 | jobs: 19 | run-tests: 20 | runs-on: ubuntu-latest 21 | strategy: 22 | fail-fast: false 23 | matrix: 24 | pytorch-version: [ 25 | latest, 26 | minimum, 27 | ] 28 | test-kind: [ 29 | test_prod, 30 | test_core, 31 | test_cli, 32 | test_big_modeling, 33 | test_deepspeed, 34 | test_fsdp, 35 | test_example_differences, 36 | test_checkpoint_step, 37 | test_checkpoint_epoch, 38 | test_rest 39 | ] 40 | steps: 41 | - uses: actions/checkout@v4 42 | - name: Set up python 3.9 43 | uses: actions/setup-python@v5 44 | with: 45 | python-version: 3.9 46 | cache: 'pip' 47 | cache-dependency-path: 'setup.py' 48 | 49 | - name: Install the library 50 | run: | 51 | if [[ ${{ matrix.test-kind }} = test_prod ]]; then pip install -e .[test_prod]; fi 52 | if [[ ${{ matrix.test-kind }} != test_prod ]]; then pip install -e .[testing,test_trackers]; fi 53 | if [[ ${{ matrix.test-kind }} = test_rest ]]; then pip uninstall comet_ml -y; fi 54 | if [[ ${{ matrix.pytorch-version }} = minimum ]]; then pip install torchvision==0.18.1 torch==2.3.1; fi 55 | pip install pytest-reportlog tabulate setuptools importlib_metadata 56 | 57 | - name: Show installed libraries 58 | run: | 59 | pip freeze 60 | 61 | - name: Run Tests 62 | env: 63 | PYTORCH_VERSION: ${{ matrix.pytorch-version }} 64 | run: | 65 | make ${{ matrix.test-kind }} 66 | 67 | - name: Generate Report 68 | if: always() 69 | run: | 70 | python utils/log_reports.py >> $GITHUB_STEP_SUMMARY 71 | -------------------------------------------------------------------------------- /.github/workflows/test_imports.yml: -------------------------------------------------------------------------------- 1 | name: Run Import Tests 2 | 3 | on: 4 | pull_request: 5 | paths: 6 | - "src/**" 7 | - "tests/**" 8 | - ".github/**" 9 | - "examples/**" 10 | - "setup.py" 11 | types: [opened, synchronize, reopened] 12 | 13 | env: 14 | HF_HOME: ~/hf_cache 15 | TESTING_MOCKED_DATALOADERS: "1" 16 | IS_GITHUB_CI: "1" 17 | 18 | jobs: 19 | run-tests: 20 | runs-on: 
ubuntu-latest 21 | strategy: 22 | fail-fast: false 23 | matrix: 24 | pytorch-version: [ 25 | latest, 26 | minimum, 27 | ] 28 | steps: 29 | - uses: actions/checkout@v4 30 | - name: Set up python 3.9 31 | uses: actions/setup-python@v5 32 | with: 33 | python-version: 3.9 34 | cache: 'pip' 35 | cache-dependency-path: 'setup.py' 36 | 37 | - name: Install the library 38 | run: | 39 | pip install -e . 40 | pip install pytest-reportlog tabulate setuptools git+https://github.com/muellerzr/import-timer 41 | 42 | - name: Show installed libraries 43 | run: | 44 | pip freeze 45 | 46 | - name: Run Import Tests 47 | env: 48 | PYTORCH_VERSION: ${{ matrix.pytorch-version }} 49 | run: | 50 | pytest -sv tests/test_imports.py 51 | 52 | - name: Generate Report 53 | if: always() 54 | run: | 55 | python utils/log_reports.py >> $GITHUB_STEP_SUMMARY 56 | -------------------------------------------------------------------------------- /.github/workflows/trufflehog.yml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | 4 | name: Secret Leaks 5 | 6 | jobs: 7 | trufflehog: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - name: Checkout code 11 | uses: actions/checkout@v4 12 | with: 13 | fetch-depth: 0 14 | - name: Secret Scanning 15 | uses: trufflesecurity/trufflehog@main 16 | -------------------------------------------------------------------------------- /.github/workflows/upload_pr_documentation.yml: -------------------------------------------------------------------------------- 1 | name: Upload PR Documentation 2 | 3 | on: 4 | workflow_run: 5 | workflows: ["Build PR Documentation"] 6 | types: 7 | - completed 8 | 9 | jobs: 10 | build: 11 | uses: huggingface/doc-builder/.github/workflows/upload_pr_documentation.yml@main 12 | with: 13 | package_name: accelerate 14 | secrets: 15 | hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }} 16 | comment_bot_token: ${{ secrets.COMMENT_BOT_TOKEN }} -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # VSCode 132 | .vscode 133 | 134 | # IntelliJ 135 | .idea 136 | 137 | # Mac .DS_Store 138 | .DS_Store 139 | 140 | # More test things 141 | wandb 142 | 143 | # ruff 144 | .ruff_cache 145 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/astral-sh/ruff-pre-commit 3 | rev: v0.2.1 4 | hooks: 5 | - id: ruff 6 | args: 7 | - --fix 8 | - id: ruff-format 9 | - repo: https://github.com/pre-commit/pre-commit-hooks 10 | rev: v4.5.0 11 | hooks: 12 | - id: check-merge-conflict 13 | - id: check-yaml 14 | -------------------------------------------------------------------------------- /benchmarks/README.md: -------------------------------------------------------------------------------- 1 | # Benchmarks 2 | 3 | The folders below contain suites to test various functionalities in Accelerate. 4 | 5 | See their relevant README.md's for more information. 6 | -------------------------------------------------------------------------------- /benchmarks/big_model_inference/README.md: -------------------------------------------------------------------------------- 1 | # Big model inference benchmarks 2 | 3 | Running inference with Accelerate on big models. 
4 | 5 | ## Setup 6 | 7 | These benchmarks use the `transformers` library: 8 | 9 | ```bash 10 | pip install transformers 11 | ``` 12 | 13 | To reproduce or test a new setup, run 14 | 15 | ```py 16 | python big_model_inference.py model_name 17 | ``` 18 | 19 | This script supports `gpt-j-6b`, `gpt-neox`, `opt` (30B version) and `T0pp` out of the box, but you can specify any valid checkpoint for `model_name`. 20 | 21 | To force a different `torch_dtype` than the one in the config: `--torch_dtype xxx`. 22 | 23 | If you get an error linked to disk offload, you need to add the option `--disk-offload` 24 | 25 | ## Results 26 | 27 | On a setup with two Titan RTXs (24GB of RAM) and 32GB of RAM, we get the following benchmarks (T0pp does not run in float16, which is why it's not included). 28 | 29 | | Model | Model load time | Generation time | dtype | GPU 0 use | GPU 1 use | CPU use | Disk offload | 30 | |:-----:|:---------------:|:---------------:|:-----:|:---------:|:---------:|:-------:|:------------:| 31 | | GPT-J-6B | 8.7s | 0.05s per token | float16 | 11.7GB | 0GB | 0GB | no | 32 | | GPT-J-6B | 12.4s | 0.06s per token | float32 | 21.9GB | 1.5GB | 0GB | no | 33 | | GPT-Neo-X-20B | 30.9s | 0.08s per token | float16 | 21.5GB | 18GB | 0GB | no | 34 | | GPT-Neo-X-20B | 78.2s | 10.72s per token | float32 | 20.3GB | 22.7 GB | 24.4GB | yes | 35 | | T0pp (11B) | 29.4s | 0.05s per token | float32 | 21.1GB | 21.3GB | 0GB | no | 36 | | OPT-30B | 34.5s | 2.37s per token | float16 | 20.7GB | 22.3GB | 14.1GB | no | 37 | | OPT-30B | 112.3s | 33.9s per token | float32 | 20.2GB | 21.2GB | 23.5GB | yes | 38 | 39 | Note on the results: 40 | - using two GPUs instead of one does not slow down generation 41 | - using CPU offload slows down a bit (see OPT-30b) 42 | - using disk offload slows down a lot (need to implement prefetching) 43 | 44 | You will also note that Accelerate does not use anymore GPU and CPU RAM than necessary: 45 | - peak GPU memory is exactly the size of the model put on a given GPU 46 | - peak CPU memory is either the size of the biggest checkpoint shard or the part of the model offloaded on CPU, whichever is bigger. 47 | -------------------------------------------------------------------------------- /benchmarks/big_model_inference/measures_util.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
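# Measurement helpers for the big-model-inference benchmark: PeakCPUMemory polls the process RSS
# from a background thread to capture the CPU peak, start_measure()/end_measure() snapshot
# wall-clock time plus CPU and per-accelerator-device memory (allocated and peak, reported in MiB),
# and log_measures() prints a summary of those numbers.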
14 | import gc 15 | import threading 16 | import time 17 | 18 | import psutil 19 | import torch 20 | 21 | from accelerate.test_utils.testing import get_backend 22 | 23 | 24 | torch_device_type, _, _ = get_backend() 25 | torch_accelerator_module = getattr(torch, torch_device_type, torch.cuda) 26 | 27 | 28 | class PeakCPUMemory: 29 | def __init__(self): 30 | self.process = psutil.Process() 31 | self.peak_monitoring = False 32 | 33 | def peak_monitor(self): 34 | self.cpu_memory_peak = -1 35 | 36 | while True: 37 | self.cpu_memory_peak = max(self.process.memory_info().rss, self.cpu_memory_peak) 38 | 39 | # can't sleep or will not catch the peak right (this comment is here on purpose) 40 | if not self.peak_monitoring: 41 | break 42 | 43 | def start(self): 44 | self.peak_monitoring = True 45 | self.thread = threading.Thread(target=self.peak_monitor) 46 | self.thread.daemon = True 47 | self.thread.start() 48 | 49 | def stop(self): 50 | self.peak_monitoring = False 51 | self.thread.join() 52 | return self.cpu_memory_peak 53 | 54 | 55 | cpu_peak_tracker = PeakCPUMemory() 56 | 57 | 58 | def start_measure(): 59 | # Time 60 | measures = {"time": time.time()} 61 | 62 | gc.collect() 63 | torch_accelerator_module.empty_cache() 64 | 65 | # CPU mem 66 | measures["cpu"] = psutil.Process().memory_info().rss 67 | cpu_peak_tracker.start() 68 | 69 | # GPU mem 70 | for i in range(torch_accelerator_module.device_count()): 71 | measures[str(i)] = torch_accelerator_module.memory_allocated(i) 72 | torch_accelerator_module.reset_peak_memory_stats() 73 | 74 | return measures 75 | 76 | 77 | def end_measure(start_measures): 78 | # Time 79 | measures = {"time": time.time() - start_measures["time"]} 80 | 81 | gc.collect() 82 | torch_accelerator_module.empty_cache() 83 | 84 | # CPU mem 85 | measures["cpu"] = (psutil.Process().memory_info().rss - start_measures["cpu"]) / 2**20 86 | measures["cpu-peak"] = (cpu_peak_tracker.stop() - start_measures["cpu"]) / 2**20 87 | 88 | # GPU mem 89 | for i in range(torch_accelerator_module.device_count()): 90 | measures[str(i)] = (torch_accelerator_module.memory_allocated(i) - start_measures[str(i)]) / 2**20 91 | measures[f"{i}-peak"] = (torch_accelerator_module.max_memory_allocated(i) - start_measures[str(i)]) / 2**20 92 | 93 | return measures 94 | 95 | 96 | def log_measures(measures, description): 97 | print(f"{description}:") 98 | print(f"- Time: {measures['time']:.2f}s") 99 | for i in range(torch_accelerator_module.device_count()): 100 | print(f"- {torch_device_type} {i} allocated: {measures[str(i)]:.2f}MiB") 101 | peak = measures[f"{i}-peak"] 102 | print(f"- {torch_device_type} {i} peak: {peak:.2f}MiB") 103 | print(f"- CPU RAM allocated: {measures['cpu']:.2f}MiB") 104 | print(f"- CPU RAM peak: {measures['cpu-peak']:.2f}MiB") 105 | -------------------------------------------------------------------------------- /benchmarks/fp8/ms_amp/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ghcr.io/azure/msamp 2 | 3 | RUN pip install transformers evaluate datasets 4 | RUN git clone https://github.com/huggingface/accelerate 5 | 6 | RUN cd accelerate && \ 7 | pip install -e . 
&& \ 8 | cd benchmarks/fp8 9 | 10 | CMD ["bash"] 11 | 12 | 13 | -------------------------------------------------------------------------------- /benchmarks/fp8/torchao/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvcr.io/nvidia/pytorch:24.07-py3 2 | 3 | RUN pip install transformers evaluate datasets 4 | RUN git clone https://github.com/huggingface/accelerate.git 5 | 6 | RUN cd accelerate && \ 7 | pip install -e . && \ 8 | cd benchmarks/fp8 9 | 10 | RUN /bin/bash 11 | 12 | 13 | -------------------------------------------------------------------------------- /benchmarks/fp8/torchao/README.md: -------------------------------------------------------------------------------- 1 | # FP8 Benchmarks 2 | 3 | Comparing and running [torchao](https://github.com/pytorch/ao/tree/main/torchao/float8) FP8 with accelerate 4 | 5 | ## Overview 6 | 7 | This repo provides scripts which compare native `torchao` model training against `accelerate`'s own integration. Each modeling type is segmented out via a script, supporting the following: 8 | 9 | * Single GPU training (`non_distributed.py`) 10 | * Multi-GPU training via DistributedDataParallelism (`ddp.py`) 11 | * Fully Sharded Data Parallelism (`fsdp.py`) 12 | * DeepSpeed ZeRO 1-3 (`deepspeed.py`) 13 | 14 | To run them, it's recommended to use a docker image (see the attached `Dockerfile`) and not install `torchao` manually. 15 | 16 | ## Running: 17 | 18 | There are official Docker images located at `huggingface/accelerate:gpu-fp8-torchao-nightly` which can be used. 19 | 20 | You can run all scripts using the core `accelerate launch` command without any `accelerate config` being needed. 21 | 22 | For single GPU, run it via `python`: 23 | 24 | ```bash 25 | python non_distributed.py 26 | ``` 27 | 28 | For the rest, run it via `accelerate launch`: 29 | 30 | ```bash 31 | accelerate launch ddp.py # or distrib_deepspeed.py, ddp.py 32 | ``` -------------------------------------------------------------------------------- /benchmarks/fp8/transformer_engine/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG BASE_YEAR=25 2 | ARG BASE_MONTH=03 3 | 4 | FROM nvcr.io/nvidia/pytorch:${BASE_YEAR}.${BASE_MONTH}-py3 5 | 6 | RUN pip install transformers evaluate datasets 7 | RUN git clone https://github.com/huggingface/accelerate.git 8 | 9 | RUN cd accelerate && \ 10 | pip install -e . && \ 11 | cd benchmarks/fp8 12 | 13 | RUN /bin/bash 14 | 15 | 16 | -------------------------------------------------------------------------------- /benchmarks/fp8/transformer_engine/README.md: -------------------------------------------------------------------------------- 1 | # FP8 Benchmarks 2 | 3 | Comparing and running [TransformerEngine](https://github.com/NVIDIA/TransformerEngine) FP8 with accelerate 4 | 5 | ## Overview 6 | 7 | This repo provides scripts which compare native TransformerEngine model training against `accelerate`'s own integration. Each modeling type is segmented out via a script, supporting the following: 8 | 9 | * Single GPU training (`non_distributed.py`) 10 | * Multi-GPU training via DistributedDataParallelism (`ddp.py`) 11 | * Fully Sharded Data Parallelism (`fsdp.py`) 12 | * DeepSpeed ZeRO 1-3 (`deepspeed.py`) 13 | 14 | To run them, it's recommended to use a docker image (see the attached `Dockerfile`) and not install `TransformerEngine` manually. 
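As a rough sketch, one way to build and enter that image locally (the image tag and flags below are illustrative, not an official image):

```bash
# build the benchmark image from the attached Dockerfile
docker build -t accelerate-fp8-te .
# run it interactively with GPU access so the FP8 scripts can see the devices
docker run --gpus all -it accelerate-fp8-te
```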
15 | 16 | ## Running: 17 | 18 | There are official Docker images located at `huggingface/accelerate:gpu-fp8-transformerengine-nightly` which can be used. 19 | 20 | You can run all scripts using the core `accelerate launch` command without any `accelerate config` being needed. 21 | 22 | For single GPU, run it via `python`: 23 | 24 | ```bash 25 | python non_distributed.py 26 | ``` 27 | 28 | For the rest, run it via `accelerate launch`: 29 | 30 | ```bash 31 | accelerate launch ddp.py # or distrib_deepspeed.py, ddp.py 32 | ``` -------------------------------------------------------------------------------- /benchmarks/fsdp2/imgs/allocated_memory.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/accelerate/3a82b056cf85b16976ca2760615897fe65ae5e64/benchmarks/fsdp2/imgs/allocated_memory.png -------------------------------------------------------------------------------- /benchmarks/fsdp2/imgs/reserved_memory.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/accelerate/3a82b056cf85b16976ca2760615897fe65ae5e64/benchmarks/fsdp2/imgs/reserved_memory.png -------------------------------------------------------------------------------- /benchmarks/torch.compile/imgs/compilation_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/accelerate/3a82b056cf85b16976ca2760615897fe65ae5e64/benchmarks/torch.compile/imgs/compilation_time.png -------------------------------------------------------------------------------- /benchmarks/torch.compile/imgs/speedup_factor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/accelerate/3a82b056cf85b16976ca2760615897fe65ae5e64/benchmarks/torch.compile/imgs/speedup_factor.png -------------------------------------------------------------------------------- /benchmarks/torch.compile/regional_compilation.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
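# This benchmark compares a non-compiled baseline, full torch.compile, and Accelerate's
# compile_regions (regional compilation) on several Llama-family checkpoints, timing compilation
# and inference at a couple of batch-size/sequence-length combinations and printing the results
# as a torch.utils.benchmark comparison table.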
14 | 15 | import torch 16 | from torch.utils.benchmark import Compare, Timer 17 | from transformers import AutoConfig, AutoModelForCausalLM 18 | 19 | from accelerate.test_utils.testing import get_backend 20 | from accelerate.utils import compile_regions 21 | 22 | 23 | torch.set_float32_matmul_precision("high") 24 | 25 | COMPILE_ITERS = 2 26 | INFERENCE_ITERS = 100 27 | 28 | BASELINE = "Baseline" 29 | COMPILE_TIME = "Compile time" 30 | INFRENCE_TIME = "Inference time" 31 | FULL_COMPILATION = "Full compilation" 32 | REGIONAL_COMPILATION = "Regional compilation" 33 | 34 | INFRENCE_STMT = "model(input_ids, use_cache=False)" 35 | COMPILE_STMT = f"torch._dynamo.reset(); torch._inductor.utils.clear_inductor_caches(); {INFRENCE_STMT}" 36 | 37 | torch_device_type, _, _ = get_backend() 38 | 39 | results = [] 40 | for model_id in [ 41 | # non-gated llama models 42 | "NousResearch/Llama-3.2-1B", 43 | "NousResearch/Hermes-3-Llama-3.2-3B", 44 | "NousResearch/Hermes-3-Llama-3.1-8B", 45 | "NousResearch/Nous-Hermes-Llama2-13b", 46 | ]: 47 | with torch.device(torch_device_type): 48 | config = AutoConfig.from_pretrained(model_id) 49 | model = AutoModelForCausalLM.from_config(config).to(dtype=torch.float16).eval() 50 | 51 | full_compilation_model = torch.compile(model) 52 | regional_compilation_model = compile_regions(model) 53 | 54 | for model, sub_label, description, stmt, iters in [ 55 | (model, BASELINE, INFRENCE_TIME, INFRENCE_STMT, INFERENCE_ITERS), 56 | (full_compilation_model, FULL_COMPILATION, COMPILE_TIME, COMPILE_STMT, COMPILE_ITERS), 57 | (full_compilation_model, FULL_COMPILATION, INFRENCE_TIME, INFRENCE_STMT, INFERENCE_ITERS), 58 | (regional_compilation_model, REGIONAL_COMPILATION, COMPILE_TIME, COMPILE_STMT, COMPILE_ITERS), 59 | (regional_compilation_model, REGIONAL_COMPILATION, INFRENCE_TIME, INFRENCE_STMT, INFERENCE_ITERS), 60 | ]: 61 | for batch_size, sequence_length in [(1, 128), (4, 128)]: 62 | input_ids = torch.randint( 63 | 0, 1000, size=(batch_size, sequence_length), dtype=torch.int64, device=torch_device_type 64 | ) 65 | results.append( 66 | Timer( 67 | label=model_id, 68 | sub_label=sub_label, 69 | description=f"{description} ({batch_size}x{sequence_length})", 70 | globals={"model": model, "input_ids": input_ids}, 71 | stmt=stmt, 72 | ).timeit(number=iters) 73 | ) 74 | 75 | compare = Compare(results) 76 | compare.colorize() 77 | compare.print() 78 | -------------------------------------------------------------------------------- /docker/README.md: -------------------------------------------------------------------------------- 1 | 16 | 17 | # Official Hugging Face Accelerate Docker Images 18 | 19 | Accelerate publishes a variety of docker versions as part of our CI that users can also use. These are stable images that Accelerate can run off of which comes with a variety of different setup configurations, all of which are officially hosted on [Docker Hub](https://hub.docker.com/r/huggingface/accelerate). 20 | 21 | A breakdown of each are given below 22 | 23 | ## Naming Conventions 24 | 25 | Accelerate docker images follow a tagging convention of: 26 | 27 | ```bash 28 | huggingface/accelerate:{accelerator}-{nightly,release} 29 | ``` 30 | 31 | `accelerator` in this instance is one of many applical pre-configured backend supports: 32 | * `gpu`: Comes compiled off of the `nvidia/cuda` image and includes core parts like `bitsandbytes`. Runs off python 3.9. 33 | * `cpu`: Comes compiled off of `python:3.9-slim` and is designed for non-CUDA based workloads. 
34 | * More to come soon 35 | * `gpu-deepspeed`: Comes compiled off of the `nvidia/cuda` image and includes core parts like `bitsandbytes` as well as the latest `deepspeed` version. Runs off python 3.10. 36 | * `gpu-fp8-transformerengine`: Comes compiled off of `nvcr.io/nvidia/pytorch` and is specifically for running the `benchmarks/fp8` scripts on devices which support FP8 operations using the `TransformerEngine` library (RTX 4090, H100, etc) 37 | 38 | ## Nightlies vs Releases 39 | 40 | Each release a new build is pushed with a version number included in the name. For a GPU-supported image of version 0.28.0 for instance, it would look like the following: 41 | 42 | ```bash 43 | huggingface/accelerate:gpu-release-0.28.0 44 | ``` 45 | 46 | Nightlies contain two different image tags. There is a general `nightly` tag which is built each night, and a `nightly-YYYY-MM-DD` which corresponds to a build from a particular date. 47 | 48 | For instance, here is an example nightly CPU image from 3/14/2024 49 | 50 | ```bash 51 | huggingface/accelerate:cpu-nightly-2024-03-14 52 | ``` 53 | 54 | ## Running the images 55 | 56 | Each image comes compiled with `conda` and an `accelerate` environment contains all of the installed dependencies. 57 | 58 | To pull down the latest nightly run: 59 | 60 | ```bash 61 | docker pull huggingface/accelerate:gpu-nightly 62 | ``` 63 | 64 | To then run it in interactive mode with GPU-memory available, run: 65 | 66 | ```bash 67 | docker container run --gpus all -it huggingface/accelerate:gpu-nightly 68 | ``` 69 | 70 | ## DEPRECATED IMAGES 71 | 72 | CPU and GPU docker images were hosted at `huggingface/accelerate-gpu` and `huggingface/accelerate-cpu`. These builds are now outdated and will not receive updates. 73 | 74 | The builds at the corresponding `huggingface/accelerate:{gpu,cpu}` contain the same `Dockerfile`, so it's as simple as changing the docker image to the desired ones from above. We will not be deleting these images for posterity, but they will not be receiving updates going forward. 
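In practice, migrating is just a matter of swapping the image reference, for example (the old tag below is illustrative):

```bash
# deprecated repository, no longer updated
docker pull huggingface/accelerate-gpu:latest
# current equivalent under the consolidated repository
docker pull huggingface/accelerate:gpu-nightly
```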
-------------------------------------------------------------------------------- /docker/accelerate-cpu/Dockerfile: -------------------------------------------------------------------------------- 1 | # Builds CPU-only Docker image of PyTorch 2 | # Uses multi-staged approach to reduce size 3 | # Stage 1 4 | FROM python:3.9-slim as compile-image 5 | 6 | ARG DEBIAN_FRONTEND=noninteractive 7 | 8 | RUN apt update 9 | RUN apt-get install -y --no-install-recommends \ 10 | build-essential \ 11 | git \ 12 | gcc 13 | 14 | # Setup virtual environment for Docker 15 | ENV VIRTUAL_ENV=/opt/venv 16 | RUN python3 -m venv ${VIRTUAL_ENV} 17 | # Make sure we use the virtualenv 18 | ENV PATH="${VIRTUAL_ENV}/bin:$PATH" 19 | WORKDIR /workspace 20 | # Install specific CPU torch wheel to save on space 21 | RUN python3 -m pip install --upgrade --no-cache-dir pip 22 | RUN python3 -m pip install --no-cache-dir \ 23 | jupyter \ 24 | git+https://github.com/huggingface/accelerate#egg=accelerate[testing,test_trackers] \ 25 | --extra-index-url https://download.pytorch.org/whl/cpu 26 | 27 | # Stage 2 28 | FROM python:3.9-slim AS build-image 29 | COPY --from=compile-image /opt/venv /opt/venv 30 | RUN useradd -ms /bin/bash user 31 | USER user 32 | 33 | # Make sure we use the virtualenv 34 | ENV PATH="/opt/venv/bin:$PATH" 35 | CMD ["/bin/bash"] -------------------------------------------------------------------------------- /docker/accelerate-gpu-deepspeed/Dockerfile: -------------------------------------------------------------------------------- 1 | # Builds GPU docker image of PyTorch specifically 2 | # Uses multi-staged approach to reduce size 3 | # Stage 1 4 | # Use base conda image to reduce time 5 | FROM continuumio/miniconda3:latest AS compile-image 6 | # Specify py version 7 | # Note: DeepSpeed beyond v0.12.6 requires py 3.10 8 | ENV PYTHON_VERSION=3.10 9 | # Install apt libs 10 | RUN apt-get update && \ 11 | apt-get install -y curl git wget && \ 12 | apt-get clean && \ 13 | rm -rf /var/lib/apt/lists* 14 | 15 | # Create our conda env 16 | RUN conda create --name accelerate python=${PYTHON_VERSION} ipython jupyter pip 17 | # We don't install pytorch here yet since CUDA isn't available 18 | # instead we use the direct torch wheel 19 | ENV PATH /opt/conda/envs/accelerate/bin:$PATH 20 | # Activate our bash shell 21 | RUN chsh -s /bin/bash 22 | SHELL ["/bin/bash", "-c"] 23 | # Activate the conda env, install mpy4pi, and install torch + accelerate 24 | RUN source activate accelerate && conda install -c conda-forge mpi4py 25 | RUN source activate accelerate && \ 26 | python3 -m pip install --no-cache-dir \ 27 | git+https://github.com/huggingface/accelerate#egg=accelerate[testing,test_trackers,deepspeed] \ 28 | --extra-index-url https://download.pytorch.org/whl/cu126 29 | 30 | RUN python3 -m pip install --no-cache-dir bitsandbytes 31 | 32 | # Stage 2 33 | FROM nvidia/cuda:12.6.3-cudnn-devel-ubuntu22.04 AS build-image 34 | COPY --from=compile-image /opt/conda /opt/conda 35 | ENV PATH /opt/conda/bin:$PATH 36 | 37 | # Install apt libs 38 | RUN apt-get update && \ 39 | apt-get install -y curl git wget && \ 40 | apt-get clean && \ 41 | rm -rf /var/lib/apt/lists* 42 | 43 | RUN echo "source activate accelerate" >> ~/.profile 44 | 45 | # Activate the virtualenv 46 | CMD ["/bin/bash"] -------------------------------------------------------------------------------- /docker/accelerate-gpu/Dockerfile: -------------------------------------------------------------------------------- 1 | # Builds GPU docker image of PyTorch specifically 
2 | # Uses multi-staged approach to reduce size 3 | # Stage 1 4 | # Use base conda image to reduce time 5 | FROM continuumio/miniconda3:latest AS compile-image 6 | # Specify py version 7 | ENV PYTHON_VERSION=3.9 8 | # Install apt libs 9 | RUN apt-get update && \ 10 | apt-get install -y curl git wget && \ 11 | apt-get clean && \ 12 | rm -rf /var/lib/apt/lists* 13 | 14 | # Create our conda env 15 | RUN conda create --name accelerate python=${PYTHON_VERSION} ipython jupyter pip 16 | # We don't install pytorch here yet since CUDA isn't available 17 | # instead we use the direct torch wheel 18 | ENV PATH /opt/conda/envs/accelerate/bin:$PATH 19 | # Activate our bash shell 20 | RUN chsh -s /bin/bash 21 | SHELL ["/bin/bash", "-c"] 22 | # Activate the conda env, install mpy4pi, and install torch + accelerate 23 | RUN source activate accelerate && conda install -c conda-forge mpi4py 24 | RUN source activate accelerate && \ 25 | python3 -m pip install --no-cache-dir \ 26 | git+https://github.com/huggingface/accelerate#egg=accelerate[testing,test_trackers] \ 27 | --extra-index-url https://download.pytorch.org/whl/cu126 28 | 29 | RUN python3 -m pip install --no-cache-dir bitsandbytes 30 | 31 | # Stage 2 32 | FROM nvidia/cuda:12.6.3-cudnn-devel-ubuntu22.04 AS build-image 33 | COPY --from=compile-image /opt/conda /opt/conda 34 | ENV PATH /opt/conda/bin:$PATH 35 | 36 | # Install apt libs 37 | RUN apt-get update && \ 38 | apt-get install -y curl git wget && \ 39 | apt-get clean && \ 40 | rm -rf /var/lib/apt/lists* 41 | 42 | RUN echo "source activate accelerate" >> ~/.profile 43 | 44 | # Activate the virtualenv 45 | CMD ["/bin/bash"] -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SOURCEDIR = source 8 | BUILDDIR = _build 9 | 10 | # Put it first so that "make" without argument is like "make help". 11 | help: 12 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 13 | 14 | .PHONY: help Makefile 15 | 16 | # Catch-all target: route all unknown targets to Sphinx using the new 17 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 18 | %: Makefile 19 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /docs/source/basic_tutorials/overview.md: -------------------------------------------------------------------------------- 1 | 15 | 16 | # Overview 17 | 18 | Welcome to the Accelerate tutorials! These introductory guides will help catch you up to speed on working with Accelerate. 19 | You'll learn how to modify your code to have it work with the API seamlessly, how to launch your script properly, 20 | and more! 21 | 22 | These tutorials assume some basic knowledge of Python and familiarity with the PyTorch framework. 23 | 24 | If you have any questions about Accelerate, feel free to join and ask the community on our [forum](https://discuss.huggingface.co/c/accelerate/18). 
-------------------------------------------------------------------------------- /docs/source/basic_tutorials/tpu.md: -------------------------------------------------------------------------------- 1 | 15 | 16 | # TPU training 17 | 18 | A [TPU (Tensor Processing Unit)](https://cloud.google.com/tpu/docs/intro-to-tpu) is a type of hardware specifically designed for training models efficiently. Accelerate supports TPU training, but there are a few things you should be aware of, namely graph compilation. This tutorial briefly discusses compilation, and for more details, take a look at the [Training on TPUs with Accelerate](../concept_guides/training_tpu) guide. 19 | 20 | ## Compilation 21 | 22 | A TPU creates a graph of all the operations in the training step such as the forward pass, backward pass and optimizer step. This is why the first training step always takes a while because building and compiling this graph takes time. But once compilation is complete, it is cached and all subsequent steps are much faster. 23 | 24 | The key is to avoid compiling your code again or else training is super slow. This means all your operations must be exactly the same: 25 | 26 | * all tensors in your batches must have the same length (for example, no dynamic padding for NLP tasks) 27 | * your code must be static (for example, no layers with for loops that have different lengths depending on the input such as a LSTM) 28 | 29 | ## Weight tying 30 | 31 | A common language model design is to tie the weights of the embedding and softmax layers. However, moving the model to a TPU (either yourself or passing it to the [`~Accelerator.prepare`] method) breaks the weight tying and you'll need to retie the weights. 32 | 33 | To add special behavior (like weight tying) in your script for TPUs, set [`~Accelerator.distributed_type`] to `DistributedType.TPU` first. Then you can use the [`~transformers.PreTrainedModel.tie_weights`] method to tie the weights. 34 | 35 | ```py 36 | if accelerator.distributed_type == DistributedType.TPU: 37 | model.tie_weights() 38 | ``` 39 | -------------------------------------------------------------------------------- /docs/source/imgs/accelerate_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/accelerate/3a82b056cf85b16976ca2760615897fe65ae5e64/docs/source/imgs/accelerate_logo.png -------------------------------------------------------------------------------- /docs/source/imgs/course_banner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/accelerate/3a82b056cf85b16976ca2760615897fe65ae5e64/docs/source/imgs/course_banner.png -------------------------------------------------------------------------------- /docs/source/imgs/profile_export.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/accelerate/3a82b056cf85b16976ca2760615897fe65ae5e64/docs/source/imgs/profile_export.png -------------------------------------------------------------------------------- /docs/source/package_reference/accelerator.md: -------------------------------------------------------------------------------- 1 | 15 | 16 | # Accelerator 17 | 18 | The [`Accelerator`] is the main class for enabling distributed training on any type of training setup. 
Read the [Add Accelerator to your code](../basic_tutorials/migration) tutorial to learn more about how to add the [`Accelerator`] to your script. 19 | 20 | ## Accelerator[[api]] 21 | 22 | [[autodoc]] Accelerator 23 | 24 | ## Utilities 25 | 26 | [[autodoc]] accelerate.utils.gather_object 27 | -------------------------------------------------------------------------------- /docs/source/package_reference/big_modeling.md: -------------------------------------------------------------------------------- 1 | 15 | 16 | # Working with large models 17 | 18 | ## Dispatch and offload 19 | 20 | ### init_empty_weights 21 | 22 | [[autodoc]] big_modeling.init_empty_weights 23 | 24 | ### cpu_offload 25 | 26 | [[autodoc]] big_modeling.cpu_offload 27 | 28 | ### cpu_offload_with_hook 29 | 30 | [[autodoc]] big_modeling.cpu_offload_with_hook 31 | 32 | ### disk_offload 33 | 34 | [[autodoc]] big_modeling.disk_offload 35 | 36 | ### dispatch_model 37 | 38 | [[autodoc]] big_modeling.dispatch_model 39 | 40 | ### load_checkpoint_and_dispatch 41 | 42 | [[autodoc]] big_modeling.load_checkpoint_and_dispatch 43 | 44 | ### load_checkpoint_in_model 45 | 46 | [[autodoc]] big_modeling.load_checkpoint_in_model 47 | 48 | ### infer_auto_device_map 49 | 50 | [[autodoc]] utils.infer_auto_device_map 51 | 52 | ## Hooks 53 | 54 | ### ModelHook 55 | 56 | [[autodoc]] hooks.ModelHook 57 | 58 | ### AlignDevicesHook 59 | 60 | [[autodoc]] hooks.AlignDevicesHook 61 | 62 | ### SequentialHook 63 | 64 | [[autodoc]] hooks.SequentialHook 65 | 66 | ### LayerwiseCastingHook 67 | 68 | [[autodoc]] hooks.LayerwiseCastingHook 69 | 70 | ## Adding Hooks 71 | 72 | ### add_hook_to_module 73 | 74 | [[autodoc]] hooks.add_hook_to_module 75 | 76 | ### attach_execution_device_hook 77 | 78 | [[autodoc]] hooks.attach_execution_device_hook 79 | 80 | ### attach_align_device_hook 81 | 82 | [[autodoc]] hooks.attach_align_device_hook 83 | 84 | ### attach_align_device_hook_on_blocks 85 | 86 | [[autodoc]] hooks.attach_align_device_hook_on_blocks 87 | 88 | ### attach_layerwise_casting_hooks 89 | 90 | [[autodoc]] big_modeling.attach_layerwise_casting_hooks 91 | 92 | ## Removing Hooks 93 | 94 | ### remove_hook_from_module 95 | 96 | [[autodoc]] hooks.remove_hook_from_module 97 | 98 | ### remove_hook_from_submodules 99 | 100 | [[autodoc]] hooks.remove_hook_from_submodules 101 | 102 | ## Utilities 103 | 104 | ### has_offloaded_params 105 | 106 | [[autodoc]] utils.has_offloaded_params 107 | 108 | ### align_module_device 109 | 110 | [[autodoc]] utils.align_module_device 111 | -------------------------------------------------------------------------------- /docs/source/package_reference/deepspeed.md: -------------------------------------------------------------------------------- 1 | 15 | 16 | # DeepSpeed utilities 17 | 18 | ## DeepSpeedPlugin 19 | 20 | ## get_active_deepspeed_plugin 21 | 22 | [[autodoc]] utils.get_active_deepspeed_plugin 23 | 24 | [[autodoc]] utils.DeepSpeedPlugin 25 | 26 | [[autodoc]] utils.deepspeed.DummyScheduler 27 | 28 | ## DeepSpeedEnginerWrapper 29 | 30 | [[autodoc]] utils.deepspeed.DeepSpeedEngineWrapper 31 | 32 | ## DeepSpeedOptimizerWrapper 33 | 34 | [[autodoc]] utils.deepspeed.DeepSpeedOptimizerWrapper 35 | 36 | ## DeepSpeedSchedulerWrapper 37 | 38 | [[autodoc]] utils.deepspeed.DeepSpeedSchedulerWrapper 39 | 40 | ## DummyOptim 41 | 42 | [[autodoc]] utils.deepspeed.DummyOptim 43 | 44 | ## DummyScheduler -------------------------------------------------------------------------------- /docs/source/package_reference/fp8.md: 
-------------------------------------------------------------------------------- 1 | 15 | 16 | # FP8 17 | 18 | Below are functions and classes relative to the underlying FP8 implementation 19 | 20 | ## FP8RecipeKwargs 21 | 22 | [[autodoc]] utils.FP8RecipeKwargs 23 | 24 | ## convert_model 25 | 26 | [[autodoc]] utils.convert_model 27 | 28 | ## has_transformer_engine_layers 29 | 30 | [[autodoc]] utils.has_transformer_engine_layers 31 | 32 | ## contextual_fp8_autocast 33 | 34 | [[autodoc]] utils.contextual_fp8_autocast 35 | 36 | ## apply_fp8_autowrap 37 | 38 | [[autodoc]] utils.apply_fp8_autowrap 39 | -------------------------------------------------------------------------------- /docs/source/package_reference/fsdp.md: -------------------------------------------------------------------------------- 1 | 15 | 16 | # Fully Sharded Data Parallel utilities 17 | 18 | ## enable_fsdp_ram_efficient_loading 19 | 20 | [[autodoc]] utils.enable_fsdp_ram_efficient_loading 21 | 22 | ## disable_fsdp_ram_efficient_loading 23 | 24 | [[autodoc]] utils.disable_fsdp_ram_efficient_loading 25 | 26 | ## merge_fsdp_weights 27 | 28 | [[autodoc]] utils.merge_fsdp_weights 29 | 30 | ## FullyShardedDataParallelPlugin 31 | 32 | [[autodoc]] utils.FullyShardedDataParallelPlugin 33 | 34 | ## fsdp2_load_full_state_dict 35 | 36 | [[autodoc]] utils.fsdp2_load_full_state_dict 37 | 38 | ## fsdp2_switch_optimizer_parameters 39 | 40 | [[autodoc]] utils.fsdp2_switch_optimizer_parameters 41 | 42 | ## fsdp2_prepare_model 43 | 44 | [[autodoc]] utils.fsdp2_prepare_model 45 | 46 | ## fsdp2_prepare_auto_wrap_policy 47 | -------------------------------------------------------------------------------- /docs/source/package_reference/inference.md: -------------------------------------------------------------------------------- 1 | 15 | 16 | # Pipeline parallelism 17 | 18 | Accelerate supports pipeline parallelism for large-scale training with the PyTorch [torch.distributed.pipelining](https://pytorch.org/docs/stable/distributed.pipelining.html) API. 19 | 20 | ## prepare_pippy 21 | 22 | [[autodoc]] inference.prepare_pippy 23 | -------------------------------------------------------------------------------- /docs/source/package_reference/kwargs.md: -------------------------------------------------------------------------------- 1 | 15 | 16 | # Kwargs handlers 17 | 18 | The following objects can be passed to the main [`Accelerator`] to customize how some PyTorch objects 19 | related to distributed training or mixed precision are created. 20 | 21 | ## AutocastKwargs 22 | 23 | [[autodoc]] AutocastKwargs 24 | 25 | ## DistributedDataParallelKwargs 26 | 27 | [[autodoc]] DistributedDataParallelKwargs 28 | 29 | ## FP8RecipeKwargs 30 | 31 | [[autodoc]] utils.FP8RecipeKwargs 32 | 33 | ## ProfileKwargs 34 | 35 | [[autodoc]] utils.ProfileKwargs 36 | 37 | ## GradScalerKwargs 38 | 39 | [[autodoc]] GradScalerKwargs 40 | 41 | ## InitProcessGroupKwargs 42 | 43 | [[autodoc]] InitProcessGroupKwargs 44 | 45 | ## KwargsHandler 46 | 47 | [[autodoc]] utils.KwargsHandler 48 | -------------------------------------------------------------------------------- /docs/source/package_reference/launchers.md: -------------------------------------------------------------------------------- 1 | 15 | 16 | # Launchers 17 | 18 | Functions for launching training on distributed processes. 
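As a quick illustration of the most common entry point, here is a minimal sketch (not taken from the reference entries below; the `train` function and its `learning_rate` argument are placeholders):

```python
from accelerate import Accelerator, notebook_launcher


def train(learning_rate):
    # A hypothetical training function; in practice you would build your
    # dataloaders/model/optimizer here and pass them through `accelerator.prepare`.
    accelerator = Accelerator()
    accelerator.print(f"Running on {accelerator.num_processes} process(es) with lr={learning_rate}")


# Spawns the requested number of worker processes (e.g. one per GPU) and runs `train` in each one.
notebook_launcher(train, args=(1e-3,), num_processes=2)
```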
19 | 20 | ## notebook_launcher 21 | 22 | [[autodoc]] accelerate.notebook_launcher 23 | 24 | ## debug_launcher 25 | 26 | [[autodoc]] accelerate.debug_launcher -------------------------------------------------------------------------------- /docs/source/package_reference/logging.md: -------------------------------------------------------------------------------- 1 | 15 | 16 | # Logging 17 | 18 | Refer to the [Troubleshooting guide](../usage_guides/troubleshooting#logging) or to the example below to learn 19 | how to use Accelerate's logger. 20 | 21 | [[autodoc]] logging.get_logger -------------------------------------------------------------------------------- /docs/source/package_reference/megatron_lm.md: -------------------------------------------------------------------------------- 1 | 15 | 16 | # Megatron-LM utilities 17 | 18 | ## MegatronLMPlugin 19 | 20 | [[autodoc]] utils.MegatronLMPlugin 21 | 22 | ## MegatronLMDummyScheduler 23 | 24 | [[autodoc]] utils.MegatronLMDummyScheduler 25 | 26 | ## MegatronLMDummyDataLoader 27 | 28 | [[autodoc]] utils.MegatronLMDummyDataLoader 29 | 30 | ## AbstractTrainStep 31 | 32 | [[autodoc]] utils.AbstractTrainStep 33 | 34 | ## GPTTrainStep 35 | 36 | [[autodoc]] utils.GPTTrainStep 37 | 38 | ## BertTrainStep 39 | 40 | [[autodoc]] utils.BertTrainStep 41 | 42 | ## T5TrainStep 43 | 44 | [[autodoc]] utils.T5TrainStep 45 | 46 | ## avg_losses_across_data_parallel_group 47 | 48 | [[autodoc]] utils.avg_losses_across_data_parallel_group 49 | -------------------------------------------------------------------------------- /docs/source/package_reference/state.md: -------------------------------------------------------------------------------- 1 | 15 | 16 | # Stateful Classes 17 | 18 | Below are variations of a [singleton class](https://en.wikipedia.org/wiki/Singleton_pattern) in the sense that all 19 | instances share the same state, which is initialized on the first instantiation. 20 | 21 | These classes are immutable and store information about certain configurations or 22 | states. 23 | 24 | ## PartialState 25 | 26 | [[autodoc]] state.PartialState 27 | 28 | ## AcceleratorState 29 | 30 | [[autodoc]] state.AcceleratorState 31 | 32 | ## GradientState 33 | 34 | [[autodoc]] state.GradientState -------------------------------------------------------------------------------- /docs/source/package_reference/torch_wrappers.md: -------------------------------------------------------------------------------- 1 | 15 | 16 | # DataLoaders, Optimizers, and Schedulers 17 | 18 | The internal classes Accelerate uses to prepare objects for distributed training 19 | when calling [`~Accelerator.prepare`]. 
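To see which wrapper a given object becomes, the minimal sketch below (using tiny placeholder PyTorch objects purely for illustration) can help; the exact wrapper can vary with the configuration (for example, `DataLoaderDispatcher` instead of `DataLoaderShard` when batches are dispatched from the main process):

```python
import torch

from accelerate import Accelerator


# Tiny placeholder objects, only to show which wrapper classes `prepare` returns
model = torch.nn.Linear(4, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1)
dataloader = torch.utils.data.DataLoader(torch.arange(16.0).reshape(8, 2), batch_size=2)

accelerator = Accelerator()
model, optimizer, dataloader, scheduler = accelerator.prepare(model, optimizer, dataloader, scheduler)

# Typically prints the wrapper classes documented on this page, e.g.
# DataLoaderShard, AcceleratedOptimizer and AcceleratedScheduler
print(type(dataloader), type(optimizer), type(scheduler))
```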
20 | 21 | ## DataLoader utilities 22 | 23 | [[autodoc]] data_loader.prepare_data_loader 24 | [[autodoc]] data_loader.skip_first_batches 25 | 26 | ## BatchSamplerShard 27 | 28 | [[autodoc]] data_loader.BatchSamplerShard 29 | 30 | ## IterableDatasetShard 31 | 32 | [[autodoc]] data_loader.IterableDatasetShard 33 | 34 | ## DataLoaderShard 35 | 36 | [[autodoc]] data_loader.DataLoaderShard 37 | 38 | ## DataLoaderDispatcher 39 | 40 | [[autodoc]] data_loader.DataLoaderDispatcher 41 | 42 | ## AcceleratedOptimizer 43 | 44 | [[autodoc]] optimizer.AcceleratedOptimizer 45 | 46 | ## AcceleratedScheduler 47 | 48 | [[autodoc]] scheduler.AcceleratedScheduler -------------------------------------------------------------------------------- /docs/source/package_reference/tracking.md: -------------------------------------------------------------------------------- 1 | 15 | 16 | # Experiment Trackers 17 | 18 | ## GeneralTracker 19 | 20 | [[autodoc]] tracking.GeneralTracker 21 | 22 | ## TensorBoardTracker 23 | 24 | [[autodoc]] tracking.TensorBoardTracker 25 | - __init__ 26 | 27 | ## WandBTracker 28 | 29 | [[autodoc]] tracking.WandBTracker 30 | - __init__ 31 | 32 | ## CometMLTracker 33 | 34 | [[autodoc]] tracking.CometMLTracker 35 | - __init__ 36 | 37 | ## AimTracker 38 | 39 | [[autodoc]] tracking.AimTracker 40 | - __init__ 41 | 42 | ## MLflowTracker 43 | 44 | [[autodoc]] tracking.MLflowTracker 45 | - __init__ 46 | 47 | ## ClearMLTracker 48 | 49 | [[autodoc]] tracking.ClearMLTracker 50 | - __init__ 51 | -------------------------------------------------------------------------------- /docs/source/usage_guides/explore.md: -------------------------------------------------------------------------------- 1 | 15 | 16 | # Start Here! 17 | 18 | Please use the interactive tool below to help you get started with learning about a particular 19 | feature of Accelerate and how to utilize it! It will provide you with a code diff, an explanation 20 | towards what is going on, as well as provide you with some useful links to explore more within 21 | the documentation! 22 | 23 | Most code examples start from the following python code before integrating Accelerate in some way: 24 | 25 | ```python 26 | for batch in dataloader: 27 | optimizer.zero_grad() 28 | inputs, targets = batch 29 | inputs = inputs.to(device) 30 | targets = targets.to(device) 31 | outputs = model(inputs) 32 | loss = loss_function(outputs, targets) 33 | loss.backward() 34 | optimizer.step() 35 | scheduler.step() 36 | ``` 37 | 38 |
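For reference, the Accelerate-integrated version of that loop usually ends up looking roughly like the sketch below (this is the typical migration pattern, not output from the tool itself, and it assumes the same `model`, `dataloader`, `optimizer`, `scheduler`, and `loss_function` objects as the snippet above):

```python
from accelerate import Accelerator

accelerator = Accelerator()
# `prepare` moves and wraps everything for the current setup,
# so the manual `.to(device)` calls are no longer needed
model, optimizer, dataloader, scheduler = accelerator.prepare(model, optimizer, dataloader, scheduler)

for batch in dataloader:
    optimizer.zero_grad()
    inputs, targets = batch
    outputs = model(inputs)
    loss = loss_function(outputs, targets)
    accelerator.backward(loss)  # replaces loss.backward()
    optimizer.step()
    scheduler.step()
```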
39 | 44 |
45 | 52 | -------------------------------------------------------------------------------- /docs/source/usage_guides/gaudi.md: -------------------------------------------------------------------------------- 1 | 15 | 16 | # Intel Gaudi 17 | 18 | Users can take advantage of Intel Gaudi AI accelerators for significantly faster and cost-effective model training and inference. 19 | The Intel Gaudi AI accelerator family currently includes three product generations: [Intel Gaudi 1](https://habana.ai/products/gaudi/), [Intel Gaudi 2](https://habana.ai/products/gaudi2/), and [Intel Gaudi 3](https://habana.ai/products/gaudi3/). Each server is equipped with 8 devices, known as Habana Processing Units (HPUs), providing 128GB of memory on Gaudi 3, 96GB on Gaudi 2, and 32GB on the first-gen Gaudi. For more details on the underlying hardware architecture, check out the [Gaudi Architecture Overview](https://docs.habana.ai/en/latest/Gaudi_Overview/Gaudi_Architecture.html). 20 | 21 | ## How it works out of the box 22 | 23 | It is enabled by default if an Intel Gaudi device is detected. 24 | To disable it, pass `--cpu` flag to `accelerate launch` command or answer the corresponding question when answering the `accelerate config` questionnaire. 25 | 26 | You can directly run the following script to test it out on Intel Gaudi: 27 | 28 | ```bash 29 | accelerate launch /examples/cv_example.py --data_dir images 30 | ``` 31 | 32 | ## Limitations 33 | 34 | The following features are not part of the Accelerate library and requires [Optimum for Intel Gaudi](https://huggingface.co/docs/optimum/main/en/habana/index): 35 | 36 | - `fast_ddp` which implements DDP by applying an all-reduce on gradients instead of the Torch DDP wrapper. 37 | - `minimize_memory` which is used for fp8 training and enables keeping fp8 weights in memory between the forward and backward passes, leading to a smaller memory footprint at the cost of additional fp8 casts. 38 | - `context_parallel_size` which is used for Context/Sequence Parallelism (CP/SP) and partitions the network inputs and activations along sequence dimension to reduce memory footprint and increase throughput. 39 | -------------------------------------------------------------------------------- /docs/source/usage_guides/mps.md: -------------------------------------------------------------------------------- 1 | 15 | 16 | # Accelerated PyTorch Training on Mac 17 | 18 | With PyTorch v1.12 release, developers and researchers can take advantage of Apple silicon GPUs for significantly faster model training. 19 | This unlocks the ability to perform machine learning workflows like prototyping and fine-tuning locally, right on Mac. 20 | Apple's Metal Performance Shaders (MPS) as a backend for PyTorch enables this and can be used via the new `"mps"` device. 21 | This will map computational graphs and primitives on the MPS Graph framework and tuned kernels provided by MPS. 22 | For more information please refer official documents [Introducing Accelerated PyTorch Training on Mac](https://pytorch.org/blog/introducing-accelerated-pytorch-training-on-mac/) 23 | and [MPS BACKEND](https://pytorch.org/docs/stable/notes/mps.html). 24 | 25 | ### Benefits of Training and Inference using Apple Silicon Chips 26 | 27 | 1. Enables users to train larger networks or batch sizes locally 28 | 2. Reduces data retrieval latency and provides the GPU with direct access to the full memory store due to unified memory architecture. 29 | Therefore, improving end-to-end performance. 30 | 3. 
Reduces costs associated with cloud-based development or the need for additional local GPUs. 31 | 32 | **Prerequisites**: To install torch with MPS support, 33 | please follow this Medium article: [GPU-Acceleration Comes to PyTorch on M1 Macs](https://medium.com/towards-data-science/gpu-acceleration-comes-to-pytorch-on-m1-macs-195c399efcc1). 34 | 35 | 36 | ## How it works out of the box 37 | It is enabled by default on macOS machines with MPS-enabled Apple Silicon GPUs. 38 | To disable it, pass the `--cpu` flag to the `accelerate launch` command or answer the corresponding question in the `accelerate config` questionnaire. 39 | 40 | You can directly run the following script to test it out on MPS-enabled Apple Silicon machines: 41 | ```bash 42 | accelerate launch /examples/cv_example.py --data_dir images 43 | ``` 44 | 45 | ## A few caveats to be aware of 46 | 47 | 1. The distributed setups `gloo` and `nccl` do not work with the `mps` device. 48 | This means that currently only a single GPU of the `mps` device type can be used. 49 | 50 | Finally, remember that `Accelerate` only integrates the MPS backend; if you 51 | have any problems or questions regarding MPS backend usage, please file an issue with [PyTorch GitHub](https://github.com/pytorch/pytorch/issues). -------------------------------------------------------------------------------- /examples/config_yaml_templates/README.md: -------------------------------------------------------------------------------- 1 | # Config Zoo 2 | 3 | This folder contains a variety of minimal configurations for `Accelerate`, each achieving a certain goal. You can use these 4 | config YAMLs directly, or build off of them for your own YAMLs. 5 | 6 | These are highly annotated versions, aiming to teach you what each section does. 
7 | 8 | Each config can be run via `accelerate launch --config_file {file} run_me.py` 9 | 10 | `run_me.py` will then print out how the current environment is setup (the contents of the `AcceleratorState`) -------------------------------------------------------------------------------- /examples/config_yaml_templates/deepspeed.yaml: -------------------------------------------------------------------------------- 1 | # Similar to FSDP, we set the distributed type as DEEPSPEED 2 | distributed_type: DEEPSPEED 3 | # With DeepSpeed, we utilize a deepspeed config file for the entire configuration 4 | deepspeed_config: 5 | # Can also be any of the config json's in accelerate/examples/deepspeed_config_templates 6 | deepspeed_config_file: ../deepspeed_config_templates/zero_stage1_config.json 7 | # If using ZeRO-3 and wanting to load big models in, this should be set to `true` so 8 | # `transformers` uses the right `init` function 9 | zero3_init_flag: false # true 10 | 11 | # Finally we need to specify the number of GPUs to use 12 | num_processes: 2 13 | # Optionally we can set the mixed precision now instead of in the deepspeed config file, 14 | # however this requires the `fp16` and `bf16` options to be set to `auto` in the deepspeed config file 15 | # mixed_precision: "bf16" 16 | -------------------------------------------------------------------------------- /examples/config_yaml_templates/fp8.yaml: -------------------------------------------------------------------------------- 1 | # This config template simply setups up the TransformersEngine config (and a config for a single GPU), 2 | # this can interop with the other configs in this folder 3 | distributed_type: "NO" 4 | mixed_precision: "fp8" 5 | # Then we specify the fp8 configuration: 6 | fp8_config: 7 | backend: TE # Can be TE | MS-AMP 8 | # The following are TE specific arguments. 
9 | # See https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/api/common.html#common-api for more details 10 | amax_history_len: 1024 11 | fp8_format: E4M3 12 | interval: 1 13 | margin: 0 14 | override_linear_precision: (false, false, false) 15 | # Generally this should always be set to `false` to have the most realistic fp8 eval performance 16 | use_autocast_during_eval: false 17 | # If using MS-AMP, we ignore all of the prior and set a opt_level 18 | #opt_level: O1 -------------------------------------------------------------------------------- /examples/config_yaml_templates/fsdp.yaml: -------------------------------------------------------------------------------- 1 | # Since we are doing FSDP (even though it's multi-GPU), we need to specify the distributed type as FSDP 2 | distributed_type: FSDP 3 | # Can be one of "no", "fp16", or "bf16" (see `transformer_engine.yaml` for `fp8`, but it works for FSDP as well) 4 | mixed_precision: 'bf16' 5 | # Specify the number of GPUs to use 6 | num_processes: 2 7 | # Then we can specify the FSDP config 8 | fsdp_config: 9 | fsdp_activation_checkpointing: false 10 | fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP 11 | fsdp_backward_prefetch: BACKWARD_PRE 12 | fsdp_cpu_ram_efficient_loading: true 13 | fsdp_forward_prefetch: false 14 | fsdp_offload_params: false 15 | fsdp_sharding_strategy: FULL_SHARD 16 | fsdp_state_dict_type: SHARDED_STATE_DICT 17 | fsdp_sync_module_states: true 18 | fsdp_use_orig_params: true 19 | -------------------------------------------------------------------------------- /examples/config_yaml_templates/multi_gpu.yaml: -------------------------------------------------------------------------------- 1 | # Specify distributed_type as `MULTI_GPU` for DDP 2 | distributed_type: "MULTI_GPU" 3 | # Can be one of "no", "fp16", or "bf16" (see `transformer_engine.yaml` for `fp8`) 4 | mixed_precision: "bf16" 5 | # Specify the number of GPUs to use 6 | num_processes: 2 -------------------------------------------------------------------------------- /examples/config_yaml_templates/multi_node.yaml: -------------------------------------------------------------------------------- 1 | # This config template is for a multi-node setup. This assumes DDP, but can be interop'd with the other configs in this folder 2 | # Generally it's recommended to look at the SLURM config template for a more robust multi-node setup 3 | distributed_type: MULTI_GPU 4 | # We need to specify the current machine's rank 5 | machine_rank: 0 6 | # We then need to specify the IP address and port of the main process 7 | main_process_ip: '1234' 8 | main_process_port: 9999 9 | # We need to specify the number of machines 10 | num_machines: 2 11 | # We need to specify the *total* number of processes 12 | num_processes: 8 13 | # And then we need to specify how rdvz comms will be handled 14 | rdzv_backend: static # or c10d 15 | # If the compute nodes are on the same network (cloud will more than likely be false) 16 | same_network: false 17 | -------------------------------------------------------------------------------- /examples/config_yaml_templates/run_me.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """ 16 | A base script which outputs the accelerate config for the given environment 17 | """ 18 | 19 | from accelerate import Accelerator 20 | 21 | 22 | accelerator = Accelerator() 23 | 24 | accelerator.print(f"Accelerator state from the current environment:\n{accelerator.state}") 25 | if accelerator.fp8_recipe_handler is not None: 26 | accelerator.print(f"FP8 config:\n{accelerator.fp8_recipe_handler}") 27 | accelerator.end_training() 28 | -------------------------------------------------------------------------------- /examples/config_yaml_templates/single_gpu.yaml: -------------------------------------------------------------------------------- 1 | # Since this is single GPU, we don't need distributed training 2 | distributed_type: "NO" 3 | # Can be one of "no", "fp16", or "bf16" (see `transformer_engine.yaml` for `fp8`) 4 | mixed_precision: "bf16" -------------------------------------------------------------------------------- /examples/deepspeed_config_templates/zero_stage1_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": true, 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "optimizer": { 11 | "type": "AdamW", 12 | "params": { 13 | "lr": "auto", 14 | "weight_decay": "auto", 15 | "torch_adam": true, 16 | "adam_w_mode": true 17 | } 18 | }, 19 | "scheduler": { 20 | "type": "WarmupDecayLR", 21 | "params": { 22 | "warmup_min_lr": "auto", 23 | "warmup_max_lr": "auto", 24 | "warmup_num_steps": "auto", 25 | "total_num_steps": "auto" 26 | } 27 | }, 28 | "zero_optimization": { 29 | "stage": 1, 30 | "allgather_partitions": true, 31 | "allgather_bucket_size": 2e8, 32 | "overlap_comm": true, 33 | "reduce_scatter": true, 34 | "reduce_bucket_size": "auto", 35 | "contiguous_gradients": true 36 | }, 37 | "gradient_accumulation_steps": 1, 38 | "gradient_clipping": "auto", 39 | "steps_per_print": 2000, 40 | "train_batch_size": "auto", 41 | "train_micro_batch_size_per_gpu": "auto", 42 | "wall_clock_breakdown": false 43 | } -------------------------------------------------------------------------------- /examples/deepspeed_config_templates/zero_stage2_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": true, 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "optimizer": { 11 | "type": "AdamW", 12 | "params": { 13 | "lr": "auto", 14 | "weight_decay": "auto", 15 | "torch_adam": true, 16 | "adam_w_mode": true 17 | } 18 | }, 19 | "scheduler": { 20 | "type": "WarmupDecayLR", 21 | "params": { 22 | "warmup_min_lr": "auto", 23 | "warmup_max_lr": "auto", 24 | "warmup_num_steps": "auto", 25 | "total_num_steps": "auto" 26 | } 27 | }, 28 | "zero_optimization": { 29 | "stage": 2, 30 | "allgather_partitions": true, 31 | "allgather_bucket_size": 2e8, 32 | "overlap_comm": true, 33 | "reduce_scatter": true, 34 | "reduce_bucket_size": 
"auto", 35 | "contiguous_gradients": true 36 | }, 37 | "gradient_accumulation_steps": 1, 38 | "gradient_clipping": "auto", 39 | "steps_per_print": 2000, 40 | "train_batch_size": "auto", 41 | "train_micro_batch_size_per_gpu": "auto", 42 | "wall_clock_breakdown": false 43 | } -------------------------------------------------------------------------------- /examples/deepspeed_config_templates/zero_stage2_offload_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": true, 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "optimizer": { 11 | "type": "AdamW", 12 | "params": { 13 | "lr": "auto", 14 | "weight_decay": "auto", 15 | "torch_adam": true, 16 | "adam_w_mode": true 17 | } 18 | }, 19 | "scheduler": { 20 | "type": "WarmupDecayLR", 21 | "params": { 22 | "warmup_min_lr": "auto", 23 | "warmup_max_lr": "auto", 24 | "warmup_num_steps": "auto", 25 | "total_num_steps": "auto" 26 | } 27 | }, 28 | "zero_optimization": { 29 | "stage": 2, 30 | "offload_optimizer": { 31 | "device": "cpu", 32 | "pin_memory": true 33 | }, 34 | "allgather_partitions": true, 35 | "allgather_bucket_size": 2e8, 36 | "overlap_comm": true, 37 | "reduce_scatter": true, 38 | "reduce_bucket_size": "auto", 39 | "contiguous_gradients": true 40 | }, 41 | "gradient_accumulation_steps": 1, 42 | "gradient_clipping": "auto", 43 | "steps_per_print": 2000, 44 | "train_batch_size": "auto", 45 | "train_micro_batch_size_per_gpu": "auto", 46 | "wall_clock_breakdown": false 47 | } -------------------------------------------------------------------------------- /examples/deepspeed_config_templates/zero_stage3_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": true, 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "optimizer": { 11 | "type": "AdamW", 12 | "params": { 13 | "lr": "auto", 14 | "weight_decay": "auto" 15 | } 16 | }, 17 | "scheduler": { 18 | "type": "WarmupDecayLR", 19 | "params": { 20 | "warmup_min_lr": "auto", 21 | "warmup_max_lr": "auto", 22 | "warmup_num_steps": "auto", 23 | "total_num_steps": "auto" 24 | } 25 | }, 26 | "zero_optimization": { 27 | "stage": 3, 28 | "overlap_comm": true, 29 | "contiguous_gradients": true, 30 | "reduce_bucket_size": "auto", 31 | "stage3_prefetch_bucket_size": "auto", 32 | "stage3_param_persistence_threshold": "auto", 33 | "sub_group_size": 1e9, 34 | "stage3_max_live_parameters": 1e9, 35 | "stage3_max_reuse_distance": 1e9, 36 | "stage3_gather_16bit_weights_on_model_save": "auto" 37 | }, 38 | "gradient_accumulation_steps": 1, 39 | "gradient_clipping": "auto", 40 | "steps_per_print": 2000, 41 | "train_batch_size": "auto", 42 | "train_micro_batch_size_per_gpu": "auto", 43 | "wall_clock_breakdown": false 44 | } -------------------------------------------------------------------------------- /examples/deepspeed_config_templates/zero_stage3_offload_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": true, 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "optimizer": { 11 | "type": "AdamW", 12 | "params": { 13 | "lr": "auto", 14 | "weight_decay": "auto" 15 | } 16 | }, 17 | "scheduler": { 18 | "type": 
"WarmupDecayLR", 19 | "params": { 20 | "warmup_min_lr": "auto", 21 | "warmup_max_lr": "auto", 22 | "warmup_num_steps": "auto", 23 | "total_num_steps": "auto" 24 | } 25 | }, 26 | "zero_optimization": { 27 | "stage": 3, 28 | "offload_optimizer": { 29 | "device": "cpu", 30 | "pin_memory": true 31 | }, 32 | "offload_param": { 33 | "device": "cpu", 34 | "pin_memory": true 35 | }, 36 | "overlap_comm": true, 37 | "contiguous_gradients": true, 38 | "reduce_bucket_size": "auto", 39 | "stage3_prefetch_bucket_size": "auto", 40 | "stage3_param_persistence_threshold": "auto", 41 | "sub_group_size": 1e9, 42 | "stage3_max_live_parameters": 1e9, 43 | "stage3_max_reuse_distance": 1e9, 44 | "stage3_gather_16bit_weights_on_model_save": "auto" 45 | }, 46 | "gradient_accumulation_steps": 1, 47 | "gradient_clipping": "auto", 48 | "steps_per_print": 2000, 49 | "train_batch_size": "auto", 50 | "train_micro_batch_size_per_gpu": "auto", 51 | "wall_clock_breakdown": false 52 | } -------------------------------------------------------------------------------- /examples/inference/distributed/README.md: -------------------------------------------------------------------------------- 1 | # Distributed inference examples 2 | 3 | This folder contains a variety of tutorials for running distributed inference with the following strategy: 4 | 5 | Load an entire model onto each GPU and sending chunks of a batch through each GPU’s model copy at a time 6 | 7 | ## Installation 8 | 9 | ```bash 10 | pip install accelerate torch 11 | ``` 12 | 13 | ## Running code 14 | 15 | You can either use `torchrun` or the recommended way of `accelerate launch` (without needing to run `accelerate config`) on each script: 16 | 17 | ```bash 18 | accelerate launch --num_processes {NUM_GPUS} phi2.py 19 | ``` 20 | 21 | Or: 22 | 23 | ```bash 24 | torchrun --nproc-per-node {NUM_GPUS} phi2.py 25 | ``` 26 | -------------------------------------------------------------------------------- /examples/inference/distributed/stable_diffusion.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Inc. team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import torch 16 | from diffusers import DiffusionPipeline 17 | 18 | from accelerate import PartialState # Can also be Accelerator or AcceleratorState 19 | 20 | 21 | pipe = DiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16) 22 | distributed_state = PartialState() 23 | pipe.to(distributed_state.device) 24 | 25 | # Assume two processes 26 | # On the first GPU, the prompts will be ["a dog", "a cat"], 27 | # and on the second GPU it will be ["a chicken", "a chicken"]. 28 | # Make sure to drop the final sample, as it will be a duplicate of the previous one. 
29 | with distributed_state.split_between_processes(["a dog", "a cat", "a chicken"], apply_padding=True) as prompt: 30 | result = pipe(prompt).images 31 | -------------------------------------------------------------------------------- /examples/inference/pippy/README.md: -------------------------------------------------------------------------------- 1 | # Distributed inference examples with PiPPy 2 | 3 | This repo contains a variety of tutorials for using the [PiPPy](https://github.com/PyTorch/PiPPy) pipeline parallelism library with accelerate. You will find examples covering: 4 | 5 | 1. How to trace the model using `accelerate.prepare_pippy` 6 | 2. How to specify inputs based on what the model expects (when to use `kwargs`, `args`, and such) 7 | 3. How to gather the results at the end. 8 | 9 | ## Installation 10 | 11 | This requires the `main` branch of accelerate (or a version at least 0.27.0), `pippy` version of 0.2.0 or greater, and at least python 3.9. Please install using `pip install .` to pull from the `setup.py` in this repo, or run manually: 12 | 13 | ```bash 14 | pip install 'accelerate>=0.27.0' 'torchpippy>=0.2.0' 15 | ``` 16 | 17 | ## Running code 18 | 19 | You can either use `torchrun` or the recommended way of `accelerate launch` (without needing to run `accelerate config`) on each script: 20 | 21 | ```bash 22 | accelerate launch bert.py 23 | ``` 24 | 25 | Or: 26 | 27 | ```bash 28 | accelerate launch --num_processes {NUM_GPUS} bert.py 29 | ``` 30 | 31 | Or: 32 | 33 | ```bash 34 | torchrun --nproc-per-node {NUM_GPUS} bert.py 35 | ``` 36 | 37 | ## General speedups 38 | 39 | One can expect that PiPPy will outperform native model parallism by a multiplicative factor since all GPUs are running at all times with inputs, rather than one input being passed through a GPU at a time waiting for the prior to finish. 40 | 41 | Below are some benchmarks we have found when using the accelerate-pippy integration for a few models when running on 2x4090's: 42 | 43 | ### Bert 44 | 45 | | | Accelerate/Sequential | PiPPy + Accelerate | 46 | |---|---|---| 47 | | First batch | 0.2137s | 0.3119s | 48 | | Average of 5 batches | 0.0099s | **0.0062s** | 49 | 50 | ### GPT2 51 | 52 | | | Accelerate/Sequential | PiPPy + Accelerate | 53 | |---|---|---| 54 | | First batch | 0.1959s | 0.4189s | 55 | | Average of 5 batches | 0.0205s | **0.0126s** | 56 | 57 | ### T5 58 | 59 | | | Accelerate/Sequential | PiPPy + Accelerate | 60 | |---|---|---| 61 | | First batch | 0.2789s | 0.3809s | 62 | | Average of 5 batches | 0.0198s | **0.0166s** | -------------------------------------------------------------------------------- /examples/inference/pippy/bert.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Inc. team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | import time 15 | 16 | import torch 17 | from transformers import AutoModelForMaskedLM 18 | 19 | from accelerate import PartialState, prepare_pippy 20 | from accelerate.test_utils import torch_device 21 | from accelerate.utils import set_seed 22 | 23 | 24 | synchronize_func = getattr(torch, torch_device, torch.cuda).synchronize 25 | 26 | # Set the random seed to have reproducable outputs 27 | set_seed(42) 28 | 29 | # Create an example model 30 | model = AutoModelForMaskedLM.from_pretrained("bert-base-uncased") 31 | model.eval() 32 | 33 | # Input configs 34 | # Create example inputs for the model 35 | input = torch.randint( 36 | low=0, 37 | high=model.config.vocab_size, 38 | size=(1, 512), # bs x seq_len 39 | device="cpu", 40 | dtype=torch.int64, 41 | requires_grad=False, 42 | ) 43 | 44 | 45 | # Create a pipeline stage from the model 46 | # Using `auto` is equivalent to letting `device_map="auto"` figure 47 | # out device mapping and will also split the model according to the 48 | # number of total GPUs available if it fits on one GPU 49 | model = prepare_pippy(model, split_points="auto", example_args=(input,)) 50 | 51 | # You can pass `gather_output=True` to have the output from the model 52 | # available on all GPUs 53 | # model = prepare_pippy(model, split_points="auto", example_args=(input,), gather_output=True) 54 | 55 | # Create new inputs of the expected size (n_processes) 56 | input = torch.randint( 57 | low=0, 58 | high=model.config.vocab_size, 59 | size=(2, 512), # bs x seq_len 60 | device="cpu", 61 | dtype=torch.int64, 62 | requires_grad=False, 63 | ) 64 | 65 | # Move the inputs to the first device 66 | input = input.to(torch_device) 67 | 68 | # Take an average of 5 times 69 | # Measure first batch 70 | synchronize_func() 71 | start_time = time.time() 72 | with torch.no_grad(): 73 | output = model(input) 74 | synchronize_func() 75 | end_time = time.time() 76 | first_batch = end_time - start_time 77 | 78 | # Now that hpu is init, measure after 79 | synchronize_func() 80 | start_time = time.time() 81 | for i in range(5): 82 | with torch.no_grad(): 83 | output = model(input) 84 | synchronize_func() 85 | end_time = time.time() 86 | 87 | # The outputs are only on the final process by default 88 | if PartialState().is_last_process: 89 | output = torch.stack(tuple(output[0])) 90 | print(f"Time of first pass: {first_batch}") 91 | print(f"Average time per batch: {(end_time - start_time) / 5}") 92 | PartialState().destroy_process_group() 93 | -------------------------------------------------------------------------------- /examples/inference/pippy/gpt2.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Inc. team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | import time 15 | 16 | import torch 17 | from transformers import AutoModelForSequenceClassification 18 | 19 | from accelerate import PartialState, prepare_pippy 20 | from accelerate.test_utils import torch_device 21 | from accelerate.utils import set_seed 22 | 23 | 24 | synchronize_func = getattr(torch, torch_device, torch.cuda).synchronize 25 | 26 | # Set the random seed to have reproducable outputs 27 | set_seed(42) 28 | 29 | # Create an example model 30 | model = AutoModelForSequenceClassification.from_pretrained("gpt2") 31 | model.eval() 32 | 33 | # Input configs 34 | # Create example inputs for the model 35 | input = torch.randint( 36 | low=0, 37 | high=model.config.vocab_size, 38 | size=(1, 1024), # bs x seq_len 39 | device="cpu", 40 | dtype=torch.int64, 41 | requires_grad=False, 42 | ) 43 | 44 | # Create a pipeline stage from the model 45 | # Using `auto` is equivalent to letting `device_map="auto"` figure 46 | # out device mapping and will also split the model according to the 47 | # number of total GPUs available if it fits on one GPU 48 | model = prepare_pippy(model, split_points="auto", example_args=(input,)) 49 | 50 | # You can pass `gather_output=True` to have the output from the model 51 | # available on all GPUs 52 | # model = prepare_pippy(model, split_points="auto", example_args=(input,), gather_output=True) 53 | 54 | # Create new inputs of the expected size (n_processes) 55 | input = torch.randint( 56 | low=0, 57 | high=model.config.vocab_size, 58 | size=(2, 1024), # bs x seq_len 59 | device="cpu", 60 | dtype=torch.int64, 61 | requires_grad=False, 62 | ) 63 | 64 | # Move the inputs to the first device 65 | input = input.to(torch_device) 66 | 67 | # Take an average of 5 times 68 | # Measure first batch 69 | synchronize_func() 70 | start_time = time.time() 71 | with torch.no_grad(): 72 | output = model(input) 73 | synchronize_func() 74 | end_time = time.time() 75 | first_batch = end_time - start_time 76 | 77 | # Now that device/backend is init, measure after 78 | synchronize_func() 79 | start_time = time.time() 80 | for i in range(5): 81 | with torch.no_grad(): 82 | output = model(input) 83 | synchronize_func() 84 | end_time = time.time() 85 | 86 | # The outputs are only on the final process by default 87 | if PartialState().is_last_process: 88 | output = torch.stack(tuple(output[0])) 89 | print(f"Time of first pass: {first_batch}") 90 | print(f"Average time per batch: {(end_time - start_time) / 5}") 91 | PartialState().destroy_process_group() 92 | -------------------------------------------------------------------------------- /examples/inference/pippy/llama.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Inc. team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | import torch 15 | from transformers import AutoModelForCausalLM, AutoTokenizer 16 | 17 | from accelerate import PartialState, prepare_pippy 18 | 19 | 20 | # sdpa implementation which is the default torch>2.1.2 fails with the tracing + attention mask kwarg 21 | # with attn_implementation="eager" mode, the forward is very slow for some reason 22 | model = AutoModelForCausalLM.from_pretrained( 23 | "meta-llama/Llama-2-7b-chat-hf", low_cpu_mem_usage=True, attn_implementation="sdpa" 24 | ) 25 | model.eval() 26 | 27 | # Input configs 28 | # Create example inputs for the model 29 | tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf") 30 | prompts = ("I would like to", "I really like to") # bs = 2, sending 2 per process 31 | tokenizer.pad_token = tokenizer.eos_token 32 | inputs = tokenizer(prompts, return_tensors="pt", padding=True) 33 | 34 | # Create a pipeline stage from the model 35 | # Using `auto` is equivalent to letting `device_map="auto"` figure 36 | # out device mapping and will also split the model according to the 37 | # number of total GPUs available if it fits on one GPU 38 | model = prepare_pippy(model, split_points="auto", example_kwargs=inputs) 39 | 40 | # You can pass `gather_output=True` to have the output from the model 41 | # available on all GPUs 42 | # model = prepare_pippy(model, split_points="auto", example_args=(input,), gather_output=True) 43 | 44 | # currently we don't support `model.generate` 45 | # output = model.generate(**inputs, max_new_tokens=1) 46 | prompts = ("I would like to", "I really like to", "The weather is pretty") # bs = 3 47 | inputs = tokenizer(prompts, return_tensors="pt", padding=True) 48 | inputs = inputs.to(0) 49 | with torch.no_grad(): 50 | output = model(**inputs) 51 | 52 | # The outputs are only on the final process by default 53 | if PartialState().is_last_process: 54 | next_token_logits = output[0][:, -1, :] 55 | next_token = torch.argmax(next_token_logits, dim=-1) 56 | print(tokenizer.batch_decode(next_token)) 57 | PartialState().destroy_process_group() 58 | -------------------------------------------------------------------------------- /examples/inference/pippy/requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate 2 | pippy>=0.2.0 -------------------------------------------------------------------------------- /examples/inference/pippy/t5.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Inc. team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | import time 15 | 16 | import torch 17 | from packaging import version 18 | from transformers import AutoModelForSeq2SeqLM 19 | 20 | from accelerate import PartialState, prepare_pippy 21 | from accelerate import __version__ as accelerate_version 22 | from accelerate.utils import set_seed 23 | 24 | 25 | if version.parse(accelerate_version) > version.parse("0.33.0"): 26 | raise RuntimeError( 27 | "Using encoder/decoder models is not supported with the `torch.pipelining` integration or accelerate>=0.34.0. " 28 | "Please use a lower accelerate version and `torchpippy`, which this example uses." 29 | ) 30 | 31 | 32 | # Set the random seed to have reproducable outputs 33 | set_seed(42) 34 | 35 | # Create an example model 36 | model = AutoModelForSeq2SeqLM.from_pretrained("t5-small") 37 | model.eval() 38 | 39 | # Input configs 40 | # Create example inputs for the model 41 | input = torch.randint( 42 | low=0, 43 | high=model.config.vocab_size, 44 | size=(2, 1024), # bs x seq_len 45 | device="cpu", 46 | dtype=torch.int64, 47 | requires_grad=False, 48 | ) 49 | 50 | example_inputs = {"input_ids": input, "decoder_input_ids": input} 51 | 52 | # Create a pipeline stage from the model 53 | # Using `auto` is equivalent to letting `device_map="auto"` figure 54 | # out device mapping and will also split the model according to the 55 | # number of total GPUs available if it fits on one GPU 56 | model = prepare_pippy( 57 | model, 58 | no_split_module_classes=["T5Block"], 59 | example_kwargs=example_inputs, 60 | ) 61 | 62 | # You can pass `gather_output=True` to have the output from the model 63 | # available on all GPUs 64 | # model = prepare_pippy( 65 | # model, 66 | # no_split_module_classes=["T5Block"], 67 | # example_kwargs=example_inputs, 68 | # gather_outputs=True 69 | # ) 70 | 71 | # The model expects a tuple during real inference 72 | # with the data on the first device 73 | args = (example_inputs["input_ids"].to("cuda:0"), example_inputs["decoder_input_ids"].to("cuda:0")) 74 | 75 | # Take an average of 5 times 76 | # Measure first batch 77 | torch.cuda.synchronize() 78 | start_time = time.time() 79 | with torch.no_grad(): 80 | output = model(*args) 81 | torch.cuda.synchronize() 82 | end_time = time.time() 83 | first_batch = end_time - start_time 84 | 85 | # Now that CUDA is init, measure after 86 | torch.cuda.synchronize() 87 | start_time = time.time() 88 | for i in range(5): 89 | with torch.no_grad(): 90 | output = model(*args) 91 | torch.cuda.synchronize() 92 | end_time = time.time() 93 | 94 | # The outputs are only on the final process by default 95 | if PartialState().is_last_process: 96 | output = torch.stack(tuple(output[0])) 97 | print(f"Time of first pass: {first_batch}") 98 | print(f"Average time per batch: {(end_time - start_time) / 5}") 99 | PartialState().destroy_process_group() 100 | -------------------------------------------------------------------------------- /examples/multigpu_remote_launcher.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import argparse 15 | 16 | import runhouse as rh 17 | import torch 18 | from nlp_example import training_function 19 | 20 | from accelerate.utils import PrepareForLaunch, patch_environment 21 | 22 | 23 | def launch_train(*args): 24 | num_processes = torch.cuda.device_count() 25 | print(f"Device count: {num_processes}") 26 | with patch_environment( 27 | world_size=num_processes, master_addr="127.0.0.1", master_port="29500", mixed_precision=args[1].mixed_precision 28 | ): 29 | launcher = PrepareForLaunch(training_function, distributed_type="MULTI_GPU") 30 | torch.multiprocessing.start_processes(launcher, args=args, nprocs=num_processes, start_method="spawn") 31 | 32 | 33 | if __name__ == "__main__": 34 | # Refer to https://runhouse-docs.readthedocs-hosted.com/en/main/rh_primitives/cluster.html#hardware-setup 35 | # for cloud access setup instructions (if using on-demand hardware), and for API specifications. 36 | 37 | # on-demand GPU 38 | # gpu = rh.cluster(name='rh-cluster', instance_type='V100:1', provider='cheapest', use_spot=False) # single GPU 39 | gpu = rh.cluster(name="rh-cluster", instance_type="V100:4", provider="cheapest", use_spot=False) # multi GPU 40 | gpu.up_if_not() 41 | 42 | # on-prem GPU 43 | # gpu = rh.cluster( 44 | # ips=["ip_addr"], ssh_creds={ssh_user:"", ssh_private_key:""}, name="rh-cluster" 45 | # ) 46 | 47 | # Set up remote function 48 | reqs = [ 49 | "pip:./", 50 | "transformers", 51 | "datasets", 52 | "evaluate", 53 | "tqdm", 54 | "scipy", 55 | "scikit-learn", 56 | "tensorboard", 57 | "torch --upgrade --extra-index-url https://download.pytorch.org/whl/cu117", 58 | ] 59 | launch_train_gpu = rh.function(fn=launch_train, system=gpu, reqs=reqs, name="train_bert_glue") 60 | 61 | # Define train args/config, run train function 62 | train_args = argparse.Namespace(cpu=False, mixed_precision="fp16") 63 | config = {"lr": 2e-5, "num_epochs": 3, "seed": 42, "batch_size": 16} 64 | launch_train_gpu(config, train_args, stream_logs=True) 65 | 66 | # Alternatively, we can just run as instructed in the README (but only because there's already a wrapper CLI): 67 | # gpu.install_packages(reqs) 68 | # gpu.run(['accelerate launch --multi_gpu accelerate/examples/nlp_example.py']) 69 | -------------------------------------------------------------------------------- /examples/requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate # used to be installed in Amazon SageMaker environment 2 | evaluate 3 | datasets==2.3.2 4 | schedulefree 5 | huggingface_hub>=0.20.0 6 | -------------------------------------------------------------------------------- /examples/slurm/fsdp_config.yaml: -------------------------------------------------------------------------------- 1 | distributed_type: FSDP 2 | fsdp_config: 3 | fsdp_activation_checkpointing: false 4 | fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP 5 | fsdp_backward_prefetch: BACKWARD_PRE 6 | fsdp_cpu_ram_efficient_loading: true 7 | fsdp_forward_prefetch: false 8 | fsdp_offload_params: false 9 | fsdp_sharding_strategy: FULL_SHARD 10 | 
fsdp_state_dict_type: SHARDED_STATE_DICT 11 |   fsdp_sync_module_states: true 12 |   fsdp_use_orig_params: true 13 | -------------------------------------------------------------------------------- /examples/slurm/submit_multicpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | 3 | #SBATCH --job-name=multicpu 4 | #SBATCH --nodes=2                    # number of Nodes 5 | #SBATCH --ntasks-per-node=1          # number of MP tasks 6 | #SBATCH --exclusive 7 | #SBATCH --output=O-%x.%j 8 | #SBATCH --error=E-%x.%j 9 | 10 | ###################### 11 | ### Set environment ### 12 | ###################### 13 | source activateEnvironment.sh 14 | 15 | ###################### 16 | #### Set network ##### 17 | ###################### 18 | head_node_ip=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) 19 | ###################### 20 | 21 | # Setup env variables for distributed jobs 22 | export MASTER_PORT="${MASTER_PORT:-29555}" 23 | echo "head_node_ip=${head_node_ip}" 24 | echo "MASTER_PORT=${MASTER_PORT}" 25 | 26 | INSTANCES_PER_NODE="${INSTANCES_PER_NODE:-1}" 27 | 28 | if [[ $SLURM_NNODES == 1 ]] && [[ $INSTANCES_PER_NODE == 1 ]]; then 29 |     export CCL_WORKER_COUNT=0 30 |     LAUNCHER="" 31 | else 32 |     # Setup env variables for distributed jobs 33 |     export CCL_WORKER_COUNT="${CCL_WORKER_COUNT:-2}" 34 |     echo "CCL_WORKER_COUNT=${CCL_WORKER_COUNT}" 35 | 36 |     # Write hostfile 37 |     HOSTFILE_PATH=hostfile 38 |     scontrol show hostname $SLURM_JOB_NODELIST | perl -ne 'chomp; print "$_\n"x1'> ${HOSTFILE_PATH} 39 | 40 |     export LAUNCHER="accelerate launch \ 41 |         --num_processes $((SLURM_NNODES * ${INSTANCES_PER_NODE})) \ 42 |         --num_machines $SLURM_NNODES \ 43 |         --rdzv_backend c10d \ 44 |         --main_process_ip $head_node_ip \ 45 |         --main_process_port $MASTER_PORT \ 46 |         --mpirun_hostfile $HOSTFILE_PATH \ 47 |         --mpirun_ccl $CCL_WORKER_COUNT" 48 | fi 49 | 50 | # This step is necessary because accelerate launch does not handle multiline arguments properly 51 | export ACCELERATE_DIR="${ACCELERATE_DIR:-/accelerate}" 52 | export SCRIPT="${ACCELERATE_DIR}/examples/complete_nlp_example.py" 53 | export SCRIPT_ARGS=" \ 54 |     --cpu \ 55 |     --output_dir ${ACCELERATE_DIR}/examples/output \ 56 |     " 57 | 58 | # This step is necessary because accelerate launch does not handle multiline arguments properly 59 | export CMD="$LAUNCHER $SCRIPT $SCRIPT_ARGS" 60 | # Print the command 61 | echo $CMD 62 | echo "" 63 | 64 | # Run the command 65 | eval $CMD -------------------------------------------------------------------------------- /examples/slurm/submit_multigpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH --job-name=multigpu 4 | #SBATCH -D .
5 | #SBATCH --output=O-%x.%j 6 | #SBATCH --error=E-%x.%j 7 | #SBATCH --nodes=1 8 | #SBATCH --ntasks-per-node=1          # number of MP tasks 9 | #SBATCH --gres=gpu:4                 # number of GPUs per node 10 | #SBATCH --cpus-per-task=160          # number of cores per tasks 11 | #SBATCH --time=01:59:00              # maximum execution time (HH:MM:SS) 12 | 13 | ###################### 14 | ### Set environment ### 15 | ###################### 16 | source activateEnvironment.sh 17 | export GPUS_PER_NODE=4 18 | ###################### 19 | 20 | export ACCELERATE_DIR="${ACCELERATE_DIR:-/accelerate}" 21 | export SCRIPT="${ACCELERATE_DIR}/examples/complete_nlp_example.py" 22 | export SCRIPT_ARGS=" \ 23 |     --mixed_precision fp16 \ 24 |     --output_dir ${ACCELERATE_DIR}/examples/output \ 25 |     --with_tracking \ 26 |     " 27 | 28 | accelerate launch --num_processes $GPUS_PER_NODE $SCRIPT $SCRIPT_ARGS -------------------------------------------------------------------------------- /examples/slurm/submit_multinode.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH --job-name=multinode 4 | #SBATCH -D . 5 | #SBATCH --output=O-%x.%j 6 | #SBATCH --error=E-%x.%j 7 | #SBATCH --nodes=4                    # number of nodes 8 | #SBATCH --ntasks-per-node=1          # number of MP tasks 9 | #SBATCH --gres=gpu:4                 # number of GPUs per node 10 | #SBATCH --cpus-per-task=160          # number of cores per tasks 11 | #SBATCH --time=01:59:00              # maximum execution time (HH:MM:SS) 12 | 13 | ###################### 14 | ### Set environment ### 15 | ###################### 16 | source activateEnvironment.sh 17 | export GPUS_PER_NODE=4 18 | ###################### 19 | 20 | ###################### 21 | #### Set network ##### 22 | ###################### 23 | head_node_ip=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) 24 | ###################### 25 | 26 | export LAUNCHER="accelerate launch \ 27 |     --num_processes $((SLURM_NNODES * GPUS_PER_NODE)) \ 28 |     --num_machines $SLURM_NNODES \ 29 |     --rdzv_backend c10d \ 30 |     --main_process_ip $head_node_ip \ 31 |     --main_process_port 29500 \ 32 |     " 33 | export ACCELERATE_DIR="${ACCELERATE_DIR:-/accelerate}" 34 | export SCRIPT="${ACCELERATE_DIR}/examples/complete_nlp_example.py" 35 | export SCRIPT_ARGS=" \ 36 |     --mixed_precision fp16 \ 37 |     --output_dir ${ACCELERATE_DIR}/examples/output \ 38 |     " 39 | 40 | # This step is necessary because accelerate launch does not handle multiline arguments properly 41 | export CMD="$LAUNCHER $SCRIPT $SCRIPT_ARGS" 42 | srun $CMD -------------------------------------------------------------------------------- /examples/slurm/submit_multinode_fsdp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH --job-name=multinode 4 | #SBATCH -D .
5 | #SBATCH --output=O-%x.%j 6 | #SBATCH --error=E-%x.%j 7 | #SBATCH --nodes=4 # number of nodes 8 | #SBATCH --ntasks-per-node=1 # number of MP tasks 9 | #SBATCH --gres=gpu:4 # number of GPUs per node 10 | #SBATCH --cpus-per-task=160 # number of cores per tasks 11 | #SBATCH --time=01:59:00 # maximum execution time (HH:MM:SS) 12 | 13 | ###################### 14 | ### Set environment ### 15 | ###################### 16 | source activateEnvironment.sh 17 | export GPUS_PER_NODE=4 18 | ###################### 19 | 20 | ###################### 21 | #### Set network ##### 22 | ###################### 23 | head_node_ip=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) 24 | ###################### 25 | export ACCELERATE_DIR="${ACCELERATE_DIR:-/accelerate}" 26 | 27 | export LAUNCHER="accelerate launch \ 28 | --config_file ${ACCELERATE_DIR}/examples/slurm/fsdp_config.yaml \ 29 | --num_processes $((SLURM_NNODES * GPUS_PER_NODE)) \ 30 | --num_machines $SLURM_NNODES \ 31 | --rdzv_backend c10d \ 32 | --main_process_ip $head_node_ip \ 33 | --main_process_port 29500 \ 34 | " 35 | export SCRIPT="${ACCELERATE_DIR}/examples/complete_nlp_example.py" 36 | export SCRIPT_ARGS=" \ 37 | --mixed_precision fp16 \ 38 | --output_dir ${ACCELERATE_DIR}/examples/output \ 39 | " 40 | 41 | # This step is necessary because accelerate launch does not handle multiline arguments properly 42 | export CMD="$LAUNCHER $SCRIPT $SCRIPT_ARGS" 43 | srun $CMD -------------------------------------------------------------------------------- /manim_animations/dataloaders/stage_0.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from manim import * 16 | 17 | 18 | class Stage0(Scene): 19 | def construct(self): 20 | mascot = ImageMobject("mascot_bookie.png") 21 | mascot.scale(.35) 22 | mascot.move_to([-3.75,-1,0]) 23 | text = Paragraph( 24 | "Distributed Training,\nHugging Face Accelerate,\nand PyTorch DataLoaders\n\nHow do they all interact?", 25 | font_size=36, 26 | line_spacing=1, 27 | alignment="center", 28 | weight=BOLD, 29 | ) 30 | text.move_to([1.75,.5,0]) 31 | self.add(mascot) 32 | self.add(text) -------------------------------------------------------------------------------- /manim_animations/dataloaders/stage_1.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from manim import * 16 | 17 | class Stage01(Scene): 18 | def construct(self): 19 | mascot = ImageMobject("mascot_bookie.png") 20 | mascot.scale(.35) 21 | mascot.move_to([-3.75,-1,0]) 22 | text = Paragraph( 23 | "Distributed Training,\nHugging Face Accelerate,\nand PyTorch DataLoaders\n\nHow do they all interact?", 24 | font_size=36, 25 | line_spacing=1, 26 | alignment="center", 27 | weight=BOLD, 28 | ) 29 | text.move_to([1.75,.5,0]) 30 | self.add(mascot) 31 | self.add(text) -------------------------------------------------------------------------------- /manim_animations/dataloaders/stage_3.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from manim import * 16 | 17 | class Stage3(Scene): 18 | def construct(self): 19 | step_1 = MarkupText( 20 | f"To combat this, Accelerate employs one of two different\nSampler wrapper methods depending on the scenario:", 21 | font_size=24 22 | ) 23 | step_1.move_to([0, 1.5, 0]) 24 | self.add(step_1) 25 | step_2 = MarkupText( 26 | f"1. Sharding the dataset before drawing:\n\t● IterableDatasetShard\n\t● BatchSamplerShard", 27 | font_size=24, 28 | ).next_to(step_1, direction=DOWN, aligned_edge=LEFT) 29 | self.add(step_2) 30 | step_3 = MarkupText( 31 | f"\n\n2. Splitting the batch after drawing:\n\t● DataLoaderDispatcher", 32 | font_size=24, 33 | ).next_to(step_2, direction=DOWN, aligned_edge=LEFT) 34 | self.add(step_3) -------------------------------------------------------------------------------- /manim_animations/dataloaders/stage_4.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
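# Scene 4 of the dataloader animation: it introduces `batch_size` (the batch seen on a single
# GPU) and `global_batch_size` (the total number of items drawn across all GPUs per step), then
# walks through the 64-item / 8-GPU / batch-size-8 example spelled out in the text objects below.
# (Assuming the standard manim CLI, a scene like this can be rendered with
#  `manim -pql stage_4.py Stage4`.)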
14 | 15 | from manim import * 16 | 17 | class Stage4(Scene): 18 | def construct(self): 19 | 20 | step_1 = MarkupText( 21 | f"To understand the next part fully, let's define two terms,\n`batch_size` and `global_batch_size`:", 22 | font_size=18 23 | ) 24 | step_1.move_to([0, 1.5, 0]) 25 | # 26 | step_2 = MarkupText( 27 | f"\n\n● `batch_size`: \n\tThis will be defined as the batch size seen on a given\n\t*individual* GPU", 28 | font_size=18, 29 | ).next_to(step_1, direction=DOWN, aligned_edge=LEFT) 30 | 31 | step_3 = MarkupText( 32 | f"\n\n● `global_batch_size`:\n\tThis will be defined as the *total* number of\n\tdifferent items seen in the dataset, across all GPUs", 33 | font_size=18, 34 | ).next_to(step_2, direction=DOWN, aligned_edge=LEFT) 35 | 36 | step_4 = MarkupText( 37 | f"\n\nSo if we have a dataset of 64 items, 8 GPUs, \nand a `batch_size` of 8, each *step* will go through\nthe entire dataset one time as 8*8=64", 38 | font_size=18, 39 | ).next_to(step_3, direction=DOWN, aligned_edge=LEFT) 40 | self.play( 41 | Write(step_1, run_time=4), 42 | ) 43 | self.play( 44 | Write(step_2, run_time=4) 45 | ) 46 | self.play( 47 | Write(step_3, run_time=4) 48 | ) 49 | self.play( 50 | Write(step_4, run_time=6) 51 | ) 52 | self.wait() -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.ruff] 2 | line-length = 119 3 | target-version = "py39" 4 | 5 | [tool.ruff.lint] 6 | preview = true 7 | extend-select = [ 8 | "B009", # static getattr 9 | "B010", # static setattr 10 | "CPY", # Copyright 11 | "E", # PEP8 errors 12 | "F", # PEP8 formatting 13 | "I", # Import sorting 14 | "TID251", # Banned API 15 | "UP", # Pyupgrade 16 | "W", # PEP8 warnings 17 | ] 18 | ignore = [ 19 | "E501", # Line length (handled by ruff-format) 20 | "E741", # Ambiguous variable name 21 | "W605", # Invalid escape sequence 22 | "UP007", # X | Y type annotations 23 | ] 24 | 25 | [tool.ruff.lint.per-file-ignores] 26 | "__init__.py" = [ 27 | "F401", # Ignore seemingly unused imports (they're meant for re-export) 28 | ] 29 | "manim_animations/*" = ["ALL"] 30 | 31 | [tool.ruff.lint.isort] 32 | lines-after-imports = 2 33 | known-first-party = ["accelerate"] 34 | 35 | [tool.ruff.format] 36 | exclude = [ 37 | "manim_animations/*" 38 | ] 39 | 40 | [tool.ruff.lint.flake8-tidy-imports.banned-api] 41 | "os.getenv".msg = "Use os.environ instead" 42 | "os.putenv".msg = "Use os.environ instead" 43 | "os.unsetenv".msg = "Use os.environ instead" 44 | -------------------------------------------------------------------------------- /src/accelerate/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
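# Top-level package init: re-exports the public API (the `Accelerator`, the big-model inference
# helpers, `prepare_pippy`, the notebook/debug launchers, `PartialState`, and the most commonly
# used utilities) so user code can simply do `from accelerate import Accelerator`.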
14 | __version__ = "1.8.0.dev0" 15 | 16 | from .accelerator import Accelerator 17 | from .big_modeling import ( 18 | cpu_offload, 19 | cpu_offload_with_hook, 20 | disk_offload, 21 | dispatch_model, 22 | init_empty_weights, 23 | init_on_device, 24 | load_checkpoint_and_dispatch, 25 | ) 26 | from .data_loader import skip_first_batches 27 | from .inference import prepare_pippy 28 | from .launchers import debug_launcher, notebook_launcher 29 | from .state import PartialState 30 | from .utils import ( 31 | AutocastKwargs, 32 | DataLoaderConfiguration, 33 | DDPCommunicationHookType, 34 | DeepSpeedPlugin, 35 | DistributedDataParallelKwargs, 36 | DistributedType, 37 | FullyShardedDataParallelPlugin, 38 | GradScalerKwargs, 39 | InitProcessGroupKwargs, 40 | ProfileKwargs, 41 | find_executable_batch_size, 42 | infer_auto_device_map, 43 | is_rich_available, 44 | load_checkpoint_in_model, 45 | synchronize_rng_states, 46 | ) 47 | 48 | 49 | if is_rich_available(): 50 | from .utils import rich 51 | -------------------------------------------------------------------------------- /src/accelerate/commands/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /src/accelerate/commands/accelerate_cli.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright 2021 The HuggingFace Team. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
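# Entry point for the `accelerate` command-line tool. Each subcommand module imported below
# exposes a `*_command_parser` helper that registers itself on the shared argparse parser, and
# `main()` dispatches to whichever subcommand was selected via `args.func`. For example,
# `accelerate config`, `accelerate launch <script>`, and `accelerate env` all route through here.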
16 | 17 | from accelerate.commands.config import get_config_parser 18 | from accelerate.commands.env import env_command_parser 19 | from accelerate.commands.estimate import estimate_command_parser 20 | from accelerate.commands.launch import launch_command_parser 21 | from accelerate.commands.merge import merge_command_parser 22 | from accelerate.commands.test import test_command_parser 23 | from accelerate.commands.to_fsdp2 import to_fsdp2_command_parser 24 | from accelerate.commands.tpu import tpu_command_parser 25 | from accelerate.commands.utils import CustomArgumentParser 26 | 27 | 28 | def main(): 29 |     parser = CustomArgumentParser("Accelerate CLI tool", usage="accelerate <command> [<args>]", allow_abbrev=False) 30 |     subparsers = parser.add_subparsers(help="accelerate command helpers") 31 | 32 |     # Register commands 33 |     get_config_parser(subparsers=subparsers) 34 |     estimate_command_parser(subparsers=subparsers) 35 |     env_command_parser(subparsers=subparsers) 36 |     launch_command_parser(subparsers=subparsers) 37 |     merge_command_parser(subparsers=subparsers) 38 |     tpu_command_parser(subparsers=subparsers) 39 |     test_command_parser(subparsers=subparsers) 40 |     to_fsdp2_command_parser(subparsers=subparsers) 41 | 42 |     # Let's go 43 |     args = parser.parse_args() 44 | 45 |     if not hasattr(args, "func"): 46 |         parser.print_help() 47 |         exit(1) 48 | 49 |     # Run 50 |     args.func(args) 51 | 52 | 53 | if __name__ == "__main__": 54 |     main() 55 | -------------------------------------------------------------------------------- /src/accelerate/commands/config/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright 2021 The HuggingFace Team. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | #     http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License.
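# Builds the parser for `accelerate config`: the bare command runs the interactive questionnaire
# from config.py, while the `default` and `update` subcommands are attached as subparsers below.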
16 | 17 | import argparse 18 | 19 | from .config import config_command_parser 20 | from .config_args import default_config_file, load_config_from_file # noqa: F401 21 | from .default import default_command_parser 22 | from .update import update_command_parser 23 | 24 | 25 | def get_config_parser(subparsers=None): 26 | parent_parser = argparse.ArgumentParser(add_help=False, allow_abbrev=False) 27 | # The main config parser 28 | config_parser = config_command_parser(subparsers) 29 | # The subparser to add commands to 30 | subcommands = config_parser.add_subparsers(title="subcommands", dest="subcommand") 31 | 32 | # Then add other parsers with the parent parser 33 | default_command_parser(subcommands, parents=[parent_parser]) 34 | update_command_parser(subcommands, parents=[parent_parser]) 35 | 36 | return config_parser 37 | 38 | 39 | def main(): 40 | config_parser = get_config_parser() 41 | args = config_parser.parse_args() 42 | 43 | if not hasattr(args, "func"): 44 | config_parser.print_help() 45 | exit(1) 46 | 47 | # Run 48 | args.func(args) 49 | 50 | 51 | if __name__ == "__main__": 52 | main() 53 | -------------------------------------------------------------------------------- /src/accelerate/commands/config/config.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright 2021 The HuggingFace Team. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | import argparse 18 | import os 19 | 20 | from accelerate.utils import ComputeEnvironment 21 | 22 | from .cluster import get_cluster_input 23 | from .config_args import cache_dir, default_config_file, default_yaml_config_file, load_config_from_file # noqa: F401 24 | from .config_utils import _ask_field, _ask_options, _convert_compute_environment # noqa: F401 25 | from .sagemaker import get_sagemaker_input 26 | 27 | 28 | description = "Launches a series of prompts to create and save a `default_config.yaml` configuration file for your training system. Should always be ran first on your machine" 29 | 30 | 31 | def get_user_input(): 32 | compute_environment = _ask_options( 33 | "In which compute environment are you running?", 34 | ["This machine", "AWS (Amazon SageMaker)"], 35 | _convert_compute_environment, 36 | ) 37 | if compute_environment == ComputeEnvironment.AMAZON_SAGEMAKER: 38 | config = get_sagemaker_input() 39 | else: 40 | config = get_cluster_input() 41 | return config 42 | 43 | 44 | def config_command_parser(subparsers=None): 45 | if subparsers is not None: 46 | parser = subparsers.add_parser("config", description=description) 47 | else: 48 | parser = argparse.ArgumentParser("Accelerate config command", description=description) 49 | 50 | parser.add_argument( 51 | "--config_file", 52 | default=None, 53 | help=( 54 | "The path to use to store the config file. 
Will default to a file named default_config.yaml in the cache " 55 | "location, which is the content of the environment `HF_HOME` suffixed with 'accelerate', or if you don't have " 56 | "such an environment variable, your cache directory ('~/.cache' or the content of `XDG_CACHE_HOME`) suffixed " 57 | "with 'huggingface'." 58 | ), 59 | ) 60 | 61 | if subparsers is not None: 62 | parser.set_defaults(func=config_command) 63 | return parser 64 | 65 | 66 | def config_command(args): 67 | config = get_user_input() 68 | if args.config_file is not None: 69 | config_file = args.config_file 70 | else: 71 | if not os.path.isdir(cache_dir): 72 | os.makedirs(cache_dir) 73 | config_file = default_yaml_config_file 74 | 75 | if config_file.endswith(".json"): 76 | config.to_json_file(config_file) 77 | else: 78 | config.to_yaml_file(config_file) 79 | print(f"accelerate configuration saved at {config_file}") 80 | 81 | 82 | def main(): 83 | parser = config_command_parser() 84 | args = parser.parse_args() 85 | config_command(args) 86 | 87 | 88 | if __name__ == "__main__": 89 | main() 90 | -------------------------------------------------------------------------------- /src/accelerate/commands/config/update.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright 2022 The HuggingFace Team. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | from pathlib import Path 18 | 19 | from .config_args import default_config_file, load_config_from_file 20 | from .config_utils import SubcommandHelpFormatter 21 | 22 | 23 | description = "Update an existing config file with the latest defaults while maintaining the old configuration." 24 | 25 | 26 | def update_config(args): 27 | """ 28 | Update an existing config file with the latest defaults while maintaining the old configuration. 29 | """ 30 | config_file = args.config_file 31 | if config_file is None and Path(default_config_file).exists(): 32 | config_file = default_config_file 33 | elif not Path(config_file).exists(): 34 | raise ValueError(f"The passed config file located at {config_file} doesn't exist.") 35 | config = load_config_from_file(config_file) 36 | 37 | if config_file.endswith(".json"): 38 | config.to_json_file(config_file) 39 | else: 40 | config.to_yaml_file(config_file) 41 | return config_file 42 | 43 | 44 | def update_command_parser(parser, parents): 45 | parser = parser.add_parser("update", parents=parents, help=description, formatter_class=SubcommandHelpFormatter) 46 | parser.add_argument( 47 | "--config_file", 48 | default=None, 49 | help=( 50 | "The path to the config file to update. Will default to a file named default_config.yaml in the cache " 51 | "location, which is the content of the environment `HF_HOME` suffixed with 'accelerate', or if you don't have " 52 | "such an environment variable, your cache directory ('~/.cache' or the content of `XDG_CACHE_HOME`) suffixed " 53 | "with 'huggingface'." 
54 |         ), 55 |     ) 56 | 57 |     parser.set_defaults(func=update_config_command) 58 |     return parser 59 | 60 | 61 | def update_config_command(args): 62 |     config_file = update_config(args) 63 |     print(f"Successfully updated the configuration file at {config_file}.") 64 | -------------------------------------------------------------------------------- /src/accelerate/commands/menu/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | #     http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from .selection_menu import BulletMenu 15 | -------------------------------------------------------------------------------- /src/accelerate/commands/menu/cursor.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The HuggingFace Team and Brian Chao. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | #     http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
14 | 15 | """ 16 | A utility for showing and hiding the terminal cursor on Windows and Linux, based on https://github.com/bchao1/bullet 17 | """ 18 | 19 | import os 20 | import sys 21 | from contextlib import contextmanager 22 | 23 | 24 | # Windows only 25 | if os.name == "nt": 26 | import ctypes 27 | import msvcrt # noqa 28 | 29 | class CursorInfo(ctypes.Structure): 30 | # _fields is a specific attr expected by ctypes 31 | _fields_ = [("size", ctypes.c_int), ("visible", ctypes.c_byte)] 32 | 33 | 34 | def hide_cursor(): 35 | if os.name == "nt": 36 | ci = CursorInfo() 37 | handle = ctypes.windll.kernel32.GetStdHandle(-11) 38 | ctypes.windll.kernel32.GetConsoleCursorInfo(handle, ctypes.byref(ci)) 39 | ci.visible = False 40 | ctypes.windll.kernel32.SetConsoleCursorInfo(handle, ctypes.byref(ci)) 41 | elif os.name == "posix": 42 | sys.stdout.write("\033[?25l") 43 | sys.stdout.flush() 44 | 45 | 46 | def show_cursor(): 47 | if os.name == "nt": 48 | ci = CursorInfo() 49 | handle = ctypes.windll.kernel32.GetStdHandle(-11) 50 | ctypes.windll.kernel32.GetConsoleCursorInfo(handle, ctypes.byref(ci)) 51 | ci.visible = True 52 | ctypes.windll.kernel32.SetConsoleCursorInfo(handle, ctypes.byref(ci)) 53 | elif os.name == "posix": 54 | sys.stdout.write("\033[?25h") 55 | sys.stdout.flush() 56 | 57 | 58 | @contextmanager 59 | def hide(): 60 | "Context manager to hide the terminal cursor" 61 | try: 62 | hide_cursor() 63 | yield 64 | finally: 65 | show_cursor() 66 | -------------------------------------------------------------------------------- /src/accelerate/commands/menu/helpers.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The HuggingFace Team and Brian Chao. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | """ 16 | A variety of helper functions and constants when dealing with terminal menu choices, based on 17 | https://github.com/bchao1/bullet 18 | """ 19 | 20 | import enum 21 | import shutil 22 | import sys 23 | 24 | 25 | TERMINAL_WIDTH, _ = shutil.get_terminal_size() 26 | 27 | CURSOR_TO_CHAR = {"UP": "A", "DOWN": "B", "RIGHT": "C", "LEFT": "D"} 28 | 29 | 30 | class Direction(enum.Enum): 31 | UP = 0 32 | DOWN = 1 33 | 34 | 35 | def forceWrite(content, end=""): 36 | sys.stdout.write(str(content) + end) 37 | sys.stdout.flush() 38 | 39 | 40 | def writeColor(content, color, end=""): 41 | forceWrite(f"\u001b[{color}m{content}\u001b[0m", end) 42 | 43 | 44 | def reset_cursor(): 45 | forceWrite("\r") 46 | 47 | 48 | def move_cursor(num_lines: int, direction: str): 49 | forceWrite(f"\033[{num_lines}{CURSOR_TO_CHAR[direction.upper()]}") 50 | 51 | 52 | def clear_line(): 53 | forceWrite(" " * TERMINAL_WIDTH) 54 | reset_cursor() 55 | 56 | 57 | def linebreak(): 58 | reset_cursor() 59 | forceWrite("-" * TERMINAL_WIDTH) 60 | -------------------------------------------------------------------------------- /src/accelerate/commands/menu/input.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The HuggingFace Team and Brian Chao. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | """ 16 | This file contains utilities for handling input from the user and registering specific keys to specific functions, 17 | based on https://github.com/bchao1/bullet 18 | """ 19 | 20 | from .keymap import KEYMAP, get_character 21 | 22 | 23 | def mark(key: str): 24 | """ 25 | Mark the function with the key code so it can be handled in the register 26 | """ 27 | 28 | def decorator(func): 29 | handle = getattr(func, "handle_key", []) 30 | handle += [key] 31 | func.handle_key = handle 32 | return func 33 | 34 | return decorator 35 | 36 | 37 | def mark_multiple(*keys: list[str]): 38 | """ 39 | Mark the function with the key codes so it can be handled in the register 40 | """ 41 | 42 | def decorator(func): 43 | handle = getattr(func, "handle_key", []) 44 | handle += keys 45 | func.handle_key = handle 46 | return func 47 | 48 | return decorator 49 | 50 | 51 | class KeyHandler(type): 52 | """ 53 | Metaclass that adds the key handlers to the class 54 | """ 55 | 56 | def __new__(cls, name, bases, attrs): 57 | new_cls = super().__new__(cls, name, bases, attrs) 58 | if not hasattr(new_cls, "key_handler"): 59 | new_cls.key_handler = {} 60 | new_cls.handle_input = KeyHandler.handle_input 61 | 62 | for value in attrs.values(): 63 | handled_keys = getattr(value, "handle_key", []) 64 | for key in handled_keys: 65 | new_cls.key_handler[key] = value 66 | return new_cls 67 | 68 | @staticmethod 69 | def handle_input(cls): 70 | "Finds and returns the selected character if it exists in the handler" 71 | char = get_character() 72 | if char != KEYMAP["undefined"]: 73 | char = ord(char) 74 | handler = cls.key_handler.get(char) 75 | if handler: 76 | cls.current_selection = char 77 | return handler(cls) 78 | else: 79 | return None 80 | 81 | 82 | def register(cls): 83 | """Adds KeyHandler metaclass to the class""" 84 | return KeyHandler(cls.__name__, cls.__bases__, cls.__dict__.copy()) 85 | -------------------------------------------------------------------------------- /src/accelerate/commands/merge.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright 2024 The HuggingFace Team. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | from accelerate.commands.utils import CustomArgumentParser 17 | from accelerate.utils import merge_fsdp_weights 18 | 19 | 20 | description = """Utility to merge the weights from multiple FSDP checkpoints into a single combined checkpoint. Should be used if 21 | `SHARDED_STATE_DICT` was used for the model. Weights will be saved to `{output_path}`. 
22 | 23 | This is a CPU-bound process and requires enough RAM to load the entire model state dict.""" 24 | 25 | 26 | def merge_command(args): 27 | merge_fsdp_weights( 28 | args.checkpoint_directory, args.output_path, not args.unsafe_serialization, args.remove_checkpoint_dir 29 | ) 30 | 31 | 32 | def merge_command_parser(subparsers=None): 33 | if subparsers is not None: 34 | parser = subparsers.add_parser("merge-weights", description=description) 35 | else: 36 | parser = CustomArgumentParser(description=description) 37 | 38 | parser.add_argument("checkpoint_directory", type=str, help="A directory containing sharded weights saved by FSDP.") 39 | parser.add_argument( 40 | "output_path", 41 | type=str, 42 | help="The path to save the merged weights. Defaults to the current directory. ", 43 | ) 44 | parser.add_argument( 45 | "--unsafe_serialization", 46 | action="store_true", 47 | default=False, 48 | help="Whether to save the merged weights as `.bin` rather than `.safetensors` (not recommended).", 49 | ) 50 | parser.add_argument( 51 | "--remove_checkpoint_dir", 52 | action="store_true", 53 | help="Whether to remove the checkpoint directory after merging.", 54 | default=False, 55 | ) 56 | 57 | if subparsers is not None: 58 | parser.set_defaults(func=merge_command) 59 | return parser 60 | 61 | 62 | def main(): 63 | parser = merge_command_parser() 64 | args = parser.parse_args() 65 | merge_command(args) 66 | 67 | 68 | if __name__ == "__main__": 69 | main() 70 | -------------------------------------------------------------------------------- /src/accelerate/commands/test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright 2021 The HuggingFace Team. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | import argparse 18 | 19 | from accelerate.test_utils import execute_subprocess_async, path_in_accelerate_package 20 | 21 | 22 | def test_command_parser(subparsers=None): 23 | if subparsers is not None: 24 | parser = subparsers.add_parser("test") 25 | else: 26 | parser = argparse.ArgumentParser("Accelerate test command") 27 | 28 | parser.add_argument( 29 | "--config_file", 30 | default=None, 31 | help=( 32 | "The path to use to store the config file. Will default to a file named default_config.yaml in the cache " 33 | "location, which is the content of the environment `HF_HOME` suffixed with 'accelerate', or if you don't have " 34 | "such an environment variable, your cache directory ('~/.cache' or the content of `XDG_CACHE_HOME`) suffixed " 35 | "with 'huggingface'." 
36 |         ), 37 |     ) 38 | 39 |     if subparsers is not None: 40 |         parser.set_defaults(func=test_command) 41 |     return parser 42 | 43 | 44 | def test_command(args): 45 |     script_name = path_in_accelerate_package("test_utils", "scripts", "test_script.py") 46 | 47 |     if args.config_file is None: 48 |         test_args = [script_name] 49 |     else: 50 |         test_args = f"--config_file={args.config_file} {script_name}".split() 51 | 52 |     cmd = ["accelerate-launch"] + test_args 53 |     result = execute_subprocess_async(cmd) 54 |     if result.returncode == 0: 55 |         print("Test is a success! You are ready for your distributed training!") 56 | 57 | 58 | def main(): 59 |     parser = test_command_parser() 60 |     args = parser.parse_args() 61 |     test_command(args) 62 | 63 | 64 | if __name__ == "__main__": 65 |     main() 66 | -------------------------------------------------------------------------------- /src/accelerate/memory_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | #     http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import warnings 16 | 17 | 18 | warnings.warn( 19 |     "memory_utils has been reorganized to utils.memory. Import `find_executable_batch_size` from the main `__init__`: " 20 |     "`from accelerate import find_executable_batch_size` to avoid this warning.", 21 |     FutureWarning, 22 | ) 23 | -------------------------------------------------------------------------------- /src/accelerate/test_utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | #     http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
14 | from .testing import ( 15 | DEFAULT_LAUNCH_COMMAND, 16 | are_the_same_tensors, 17 | assert_exception, 18 | capture_call_output, 19 | device_count, 20 | execute_subprocess_async, 21 | get_launch_command, 22 | get_torch_dist_unique_port, 23 | memory_allocated_func, 24 | path_in_accelerate_package, 25 | pytest_xdist_worker_id, 26 | require_bnb, 27 | require_cpu, 28 | require_cuda, 29 | require_cuda_or_hpu, 30 | require_cuda_or_xpu, 31 | require_fp8, 32 | require_fp16, 33 | require_huggingface_suite, 34 | require_mlu, 35 | require_mps, 36 | require_multi_device, 37 | require_multi_gpu, 38 | require_multi_gpu_or_xpu, 39 | require_multi_xpu, 40 | require_musa, 41 | require_non_cpu, 42 | require_non_hpu, 43 | require_non_torch_xla, 44 | require_non_xpu, 45 | require_npu, 46 | require_pippy, 47 | require_sdaa, 48 | require_single_device, 49 | require_single_gpu, 50 | require_single_xpu, 51 | require_torch_min_version, 52 | require_torchao, 53 | require_torchvision, 54 | require_tpu, 55 | require_transformer_engine, 56 | require_xpu, 57 | run_first, 58 | skip, 59 | slow, 60 | torch_device, 61 | ) 62 | from .training import RegressionDataset, RegressionModel, RegressionModel4XPU 63 | 64 | 65 | from .scripts import test_script, test_sync, test_ops # isort: skip 66 | -------------------------------------------------------------------------------- /src/accelerate/test_utils/scripts/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /src/accelerate/test_utils/scripts/external_deps/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /src/accelerate/test_utils/scripts/external_deps/test_zero3_integration.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import torch.distributed 16 | 17 | from accelerate.test_utils import require_huggingface_suite, torch_device 18 | from accelerate.utils import is_transformers_available 19 | 20 | 21 | if is_transformers_available(): 22 | from transformers import AutoModel, TrainingArguments 23 | 24 | 25 | GPT2_TINY = "sshleifer/tiny-gpt2" 26 | 27 | 28 | @require_huggingface_suite 29 | def init_torch_dist_then_launch_deepspeed(): 30 | if torch_device == "xpu": 31 | backend = "ccl" 32 | elif torch_device == "hpu": 33 | backend = "hccl" 34 | else: 35 | backend = "nccl" 36 | 37 | torch.distributed.init_process_group(backend=backend) 38 | deepspeed_config = { 39 | "zero_optimization": { 40 | "stage": 3, 41 | }, 42 | "train_batch_size": "auto", 43 | "train_micro_batch_size_per_gpu": "auto", 44 | } 45 | train_args = TrainingArguments( 46 | output_dir="./", 47 | deepspeed=deepspeed_config, 48 | ) 49 | model = AutoModel.from_pretrained(GPT2_TINY) 50 | assert train_args is not None 51 | assert model is not None 52 | 53 | 54 | def main(): 55 | init_torch_dist_then_launch_deepspeed() 56 | 57 | 58 | if __name__ == "__main__": 59 | main() 60 | -------------------------------------------------------------------------------- /src/accelerate/test_utils/scripts/test_cli.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import torch 15 | 16 | from accelerate.utils import is_xpu_available 17 | 18 | 19 | def main(): 20 | accelerator_type = "GPU" 21 | num_accelerators = 0 22 | if torch.cuda.is_available(): 23 | num_accelerators = torch.cuda.device_count() 24 | accelerator_type = "GPU" 25 | elif is_xpu_available(): 26 | num_accelerators = torch.xpu.device_count() 27 | accelerator_type = "XPU" 28 | print(f"Successfully ran on {num_accelerators} {accelerator_type}s") 29 | 30 | 31 | if __name__ == "__main__": 32 | main() 33 | -------------------------------------------------------------------------------- /src/accelerate/test_utils/scripts/test_ddp_comm_hook.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import torch 15 | 16 | from accelerate import Accelerator, DDPCommunicationHookType, DistributedDataParallelKwargs, PartialState 17 | from accelerate.utils import is_hpu_available 18 | 19 | 20 | class MockModel(torch.nn.Module): 21 | def __init__(self): 22 | super().__init__() 23 | torch.manual_seed(0) 24 | self.p = torch.nn.Parameter(torch.randn(40, 20)) 25 | 26 | def forward(self, x, rank): 27 | return self.p * (x ** (1 + rank)) 28 | 29 | 30 | def _run_and_get_grads(model, rank): 31 | torch.manual_seed(2024) 32 | input = torch.randn(40, 20) 33 | output = model(input, rank) 34 | output.mean().backward() 35 | param = next(model.parameters()) 36 | return param.grad 37 | 38 | 39 | def test_ddp_comm_hook(comm_hook, comm_wrapper, comm_state_option): 40 | ddp_kwargs = DistributedDataParallelKwargs( 41 | comm_hook=comm_hook, 42 | comm_wrapper=comm_wrapper, 43 | comm_state_option=comm_state_option, 44 | ) 45 | accelerator = Accelerator(kwargs_handlers=[ddp_kwargs]) 46 | 47 | model = accelerator.prepare(MockModel()) 48 | hook_grads = _run_and_get_grads(model, accelerator.local_process_index) 49 | 50 | reference_model = torch.nn.parallel.DistributedDataParallel( 51 | MockModel().to(accelerator.device), 52 | device_ids=[accelerator.local_process_index], 53 | output_device=accelerator.local_process_index, 54 | ) 55 | reference_grads = _run_and_get_grads(reference_model, accelerator.local_process_index) 56 | 57 | torch.testing.assert_close(hook_grads, reference_grads, rtol=1e-2, atol=1e-2) 58 | 59 | 60 | def main(): 61 | for comm_hook, comm_wrapper, comm_state_option in [ 62 | (DDPCommunicationHookType.NO, DDPCommunicationHookType.NO, {}), 63 | (DDPCommunicationHookType.FP16, DDPCommunicationHookType.NO, {}), 64 | (DDPCommunicationHookType.BF16, DDPCommunicationHookType.NO, {}), 65 | (DDPCommunicationHookType.POWER_SGD, DDPCommunicationHookType.NO, {}), 66 | (DDPCommunicationHookType.POWER_SGD, DDPCommunicationHookType.FP16, {}), 67 | (DDPCommunicationHookType.POWER_SGD, DDPCommunicationHookType.BF16, {}), 68 | (DDPCommunicationHookType.POWER_SGD, DDPCommunicationHookType.NO, {"matrix_approximation_rank": 2}), 69 | (DDPCommunicationHookType.BATCHED_POWER_SGD, DDPCommunicationHookType.NO, {}), 70 | (DDPCommunicationHookType.BATCHED_POWER_SGD, DDPCommunicationHookType.FP16, {}), 71 | (DDPCommunicationHookType.BATCHED_POWER_SGD, DDPCommunicationHookType.BF16, {}), 72 | ]: 73 | if is_hpu_available(): 74 | HPU_UNSUPPORTED_COMM_HOOKS = {DDPCommunicationHookType.FP16, DDPCommunicationHookType.BF16} 75 | if comm_hook in HPU_UNSUPPORTED_COMM_HOOKS or comm_wrapper in HPU_UNSUPPORTED_COMM_HOOKS: 76 | print(f"Skipping test DDP comm hook: {comm_hook}, comm wrapper: {comm_wrapper} on HPU") 77 | continue 78 | 79 | print(f"Test DDP comm hook: {comm_hook}, comm wrapper: {comm_wrapper}") 80 | test_ddp_comm_hook(comm_hook, comm_wrapper, comm_state_option) 81 | PartialState().destroy_process_group() 82 | 83 | 84 | if __name__ == "__main__": 85 | main() 86 | -------------------------------------------------------------------------------- /src/accelerate/utils/rich.py: 
-------------------------------------------------------------------------------- 1 | # Copyright 2022 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from .imports import is_rich_available 16 | 17 | 18 | if is_rich_available(): 19 | from rich.traceback import install 20 | 21 | install(show_locals=False) 22 | 23 | else: 24 | raise ModuleNotFoundError("To use the rich extension, install rich with `pip install rich`") 25 | -------------------------------------------------------------------------------- /src/accelerate/utils/torch_xla.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import importlib.metadata 16 | import subprocess 17 | import sys 18 | 19 | 20 | def install_xla(upgrade: bool = False): 21 | """ 22 | Helper function to install appropriate xla wheels based on the `torch` version in Google Colaboratory. 23 | 24 | Args: 25 | upgrade (`bool`, *optional*, defaults to `False`): 26 | Whether to upgrade `torch` and install the latest `torch_xla` wheels. 27 | 28 | Example: 29 | 30 | ```python 31 | >>> from accelerate.utils import install_xla 32 | 33 | >>> install_xla(upgrade=True) 34 | ``` 35 | """ 36 | in_colab = False 37 | if "IPython" in sys.modules: 38 | in_colab = "google.colab" in str(sys.modules["IPython"].get_ipython()) 39 | 40 | if in_colab: 41 | if upgrade: 42 | torch_install_cmd = ["pip", "install", "-U", "torch"] 43 | subprocess.run(torch_install_cmd, check=True) 44 | # get the current version of torch 45 | torch_version = importlib.metadata.version("torch") 46 | torch_version_trunc = torch_version[: torch_version.rindex(".")] 47 | xla_wheel = f"https://storage.googleapis.com/tpu-pytorch/wheels/colab/torch_xla-{torch_version_trunc}-cp37-cp37m-linux_x86_64.whl" 48 | xla_install_cmd = ["pip", "install", xla_wheel] 49 | subprocess.run(xla_install_cmd, check=True) 50 | else: 51 | raise RuntimeError("`install_xla` utility works only on google colab.") 52 | -------------------------------------------------------------------------------- /src/accelerate/utils/tqdm.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The HuggingFace Team. All rights reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | from .imports import is_tqdm_available 17 | 18 | 19 | if is_tqdm_available(): 20 | from tqdm.auto import tqdm as _tqdm 21 | 22 | from ..state import PartialState 23 | 24 | 25 | def tqdm(*args, main_process_only: bool = True, **kwargs): 26 | """ 27 | Wrapper around `tqdm.tqdm` that optionally displays only on the main process. 28 | 29 | Args: 30 | main_process_only (`bool`, *optional*): 31 | Whether to display the progress bar only on the main process 32 | """ 33 | if not is_tqdm_available(): 34 | raise ImportError("Accelerate's `tqdm` module requires `tqdm` to be installed. Please run `pip install tqdm`.") 35 | if len(args) > 0 and isinstance(args[0], bool): 36 | raise ValueError( 37 | "Passing `True` or `False` as the first argument to Accelerate's `tqdm` wrapper is unsupported. " 38 | "Please use the `main_process_only` keyword argument instead." 39 | ) 40 | disable = kwargs.pop("disable", False) 41 | if main_process_only and not disable: 42 | disable = PartialState().local_process_index != 0 43 | return _tqdm(*args, **kwargs, disable=disable) 44 | -------------------------------------------------------------------------------- /src/accelerate/utils/versions.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import importlib.metadata 16 | from typing import Union 17 | 18 | from packaging.version import Version, parse 19 | 20 | from .constants import STR_OPERATION_TO_FUNC 21 | 22 | 23 | torch_version = parse(importlib.metadata.version("torch")) 24 | 25 | 26 | def compare_versions(library_or_version: Union[str, Version], operation: str, requirement_version: str): 27 | """ 28 | Compares a library version to some requirement using a given operation. 29 | 30 | Args: 31 | library_or_version (`str` or `packaging.version.Version`): 32 | A library name or a version to check. 33 | operation (`str`): 34 | A string representation of an operator, such as `">"` or `"<="`. 
35 | requirement_version (`str`): 36 | The version to compare the library version against 37 | """ 38 | if operation not in STR_OPERATION_TO_FUNC.keys(): 39 | raise ValueError(f"`operation` must be one of {list(STR_OPERATION_TO_FUNC.keys())}, received {operation}") 40 | operation = STR_OPERATION_TO_FUNC[operation] 41 | if isinstance(library_or_version, str): 42 | library_or_version = parse(importlib.metadata.version(library_or_version)) 43 | return operation(library_or_version, parse(requirement_version)) 44 | 45 | 46 | def is_torch_version(operation: str, version: str): 47 | """ 48 | Compares the current PyTorch version to a given reference with an operation. 49 | 50 | Args: 51 | operation (`str`): 52 | A string representation of an operator, such as `">"` or `"<="` 53 | version (`str`): 54 | A string version of PyTorch 55 | """ 56 | return compare_versions(torch_version, operation, version) 57 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /tests/deepspeed/ds_config_zero2.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "optimizer": { 14 | "type": "AdamW", 15 | "params": { 16 | "lr": "auto", 17 | "weight_decay": "auto", 18 | "torch_adam": true, 19 | "adam_w_mode": true 20 | } 21 | }, 22 | "scheduler": { 23 | "type": "WarmupLR", 24 | "params": { 25 | "warmup_min_lr": "auto", 26 | "warmup_max_lr": "auto", 27 | "warmup_num_steps": "auto" 28 | } 29 | }, 30 | "zero_optimization": { 31 | "stage": 2, 32 | "offload_optimizer": { 33 | "device": "cpu", 34 | "pin_memory": true 35 | }, 36 | "allgather_partitions": true, 37 | "allgather_bucket_size": 2e8, 38 | "overlap_comm": true, 39 | "reduce_scatter": true, 40 | "reduce_bucket_size": "auto", 41 | "contiguous_gradients": true 42 | }, 43 | "gradient_accumulation_steps": 1, 44 | "gradient_clipping": "auto", 45 | "steps_per_print": 2000, 46 | "train_batch_size": "auto", 47 | "train_micro_batch_size_per_gpu": "auto", 48 | "wall_clock_breakdown": false 49 | } -------------------------------------------------------------------------------- /tests/deepspeed/ds_config_zero2_model_only.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "zero_optimization": { 14 | "stage": 
2, 15 | "offload_optimizer": { 16 | "device": "cpu", 17 | "pin_memory": true 18 | }, 19 | "allgather_partitions": true, 20 | "allgather_bucket_size": 2e8, 21 | "overlap_comm": true, 22 | "reduce_scatter": true, 23 | "reduce_bucket_size": "auto", 24 | "contiguous_gradients": true 25 | }, 26 | "gradient_accumulation_steps": 1, 27 | "gradient_clipping": "auto", 28 | "steps_per_print": 2000, 29 | "train_batch_size": "auto", 30 | "train_micro_batch_size_per_gpu": "auto", 31 | "wall_clock_breakdown": false 32 | } -------------------------------------------------------------------------------- /tests/deepspeed/ds_config_zero3.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "optimizer": { 14 | "type": "AdamW", 15 | "params": { 16 | "lr": "auto", 17 | "weight_decay": "auto", 18 | "torch_adam": true, 19 | "adam_w_mode": true 20 | } 21 | }, 22 | "scheduler": { 23 | "type": "WarmupLR", 24 | "params": { 25 | "warmup_min_lr": "auto", 26 | "warmup_max_lr": "auto", 27 | "warmup_num_steps": "auto" 28 | } 29 | }, 30 | "zero_optimization": { 31 | "stage": 3, 32 | "offload_optimizer": { 33 | "device": "cpu", 34 | "pin_memory": true 35 | }, 36 | "offload_param": { 37 | "device": "cpu", 38 | "pin_memory": true 39 | }, 40 | "overlap_comm": true, 41 | "contiguous_gradients": true, 42 | "sub_group_size": 1e9, 43 | "reduce_bucket_size": "auto", 44 | "stage3_prefetch_bucket_size": "auto", 45 | "stage3_param_persistence_threshold": "auto", 46 | "stage3_max_live_parameters": 1e9, 47 | "stage3_max_reuse_distance": 1e9, 48 | "stage3_gather_16bit_weights_on_model_save": "auto" 49 | }, 50 | "gradient_accumulation_steps": 1, 51 | "gradient_clipping": "auto", 52 | "steps_per_print": 2000, 53 | "train_batch_size": "auto", 54 | "train_micro_batch_size_per_gpu": "auto", 55 | "wall_clock_breakdown": false 56 | } -------------------------------------------------------------------------------- /tests/deepspeed/ds_config_zero3_model_only.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "zero_optimization": { 14 | "stage": 3, 15 | "offload_param": { 16 | "device": "cpu", 17 | "pin_memory": true 18 | }, 19 | "overlap_comm": true, 20 | "sub_group_size": 1e9, 21 | "reduce_bucket_size": 1e9, 22 | "stage3_prefetch_bucket_size": 1e9, 23 | "stage3_param_persistence_threshold": 1e9, 24 | "stage3_max_live_parameters": 1e9, 25 | "stage3_max_reuse_distance": 1e9, 26 | "stage3_gather_16bit_weights_on_model_save": true 27 | }, 28 | "train_micro_batch_size_per_gpu": 1 29 | } -------------------------------------------------------------------------------- /tests/test_configs/0_11_0.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | deepspeed_config: {} 3 | distributed_type: 'NO' 4 | fsdp_config: {} 5 | machine_rank: 0 6 | main_process_ip: null 7 | main_process_port: null 8 | main_training_function: main 9 | mixed_precision: 'no' 10 | num_machines: 1 11 | num_processes: 1 12 | use_cpu: false 
-------------------------------------------------------------------------------- /tests/test_configs/0_12_0.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | deepspeed_config: {} 3 | distributed_type: 'NO' 4 | downcast_bf16: 'no' 5 | fsdp_config: {} 6 | machine_rank: 0 7 | main_process_ip: null 8 | main_process_port: null 9 | main_training_function: main 10 | mixed_precision: 'no' 11 | num_machines: 1 12 | num_processes: 1 13 | use_cpu: false -------------------------------------------------------------------------------- /tests/test_configs/0_28_0_mpi.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | distributed_type: MULTI_CPU 4 | downcast_bf16: 'no' 5 | ipex_config: 6 | ipex: true 7 | machine_rank: 0 8 | main_process_ip: 127.0.0.1 9 | main_process_port: 29500 10 | main_training_function: main 11 | mixed_precision: 'no' 12 | mpirun_config: 13 | mpirun_ccl: '1' 14 | mpirun_hostfile: /home/user/hostfile 15 | num_machines: 4 16 | num_processes: 16 17 | rdzv_backend: static 18 | same_network: true 19 | tpu_env: [] 20 | tpu_use_cluster: false 21 | tpu_use_sudo: false 22 | use_cpu: true 23 | -------------------------------------------------------------------------------- /tests/test_configs/0_30_0_sagemaker.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: AMAZON_SAGEMAKER 2 | debug: false 3 | distributed_type: NO 4 | mixed_precision: fp16 5 | debug: false 6 | use_cpu: false 7 | ec2_instance_type: MY_TYPE 8 | iam_role_name: MY_ROLE 9 | -------------------------------------------------------------------------------- /tests/test_configs/0_34_0_fp8.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | distributed_type: MULTI_GPU 4 | downcast_bf16: 'no' 5 | enable_cpu_affinity: false 6 | fp8_config: 7 | amax_compute_algo: max 8 | amax_history_len: 1024 9 | backend: TE 10 | fp8_format: E4M3 11 | interval: 1 12 | margin: 0 13 | override_linear_precision: (false, false, false) 14 | use_autocast_during_eval: false 15 | gpu_ids: all 16 | machine_rank: 0 17 | main_training_function: main 18 | mixed_precision: fp8 19 | num_machines: 1 20 | num_processes: 2 21 | rdzv_backend: static 22 | same_network: true 23 | tpu_env: [] 24 | tpu_use_cluster: false 25 | tpu_use_sudo: false 26 | use_cpu: false 27 | -------------------------------------------------------------------------------- /tests/test_configs/README.md: -------------------------------------------------------------------------------- 1 | This folder contains test configs for `accelerate config`. These should be generated for each major version 2 | and are written based on `accelerate config` and selecting the "No distributed training" option. 
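3 | 4 | For example (a sketch rather than an exact recipe — the interactive prompts and the exact set of emitted fields vary between releases, and the target filename below is a placeholder), a new fixture could be generated with something like: 5 | 6 | ```bash 7 | # <version>.yaml is a hypothetical name; match the release being snapshotted, e.g. 0_34_0.yaml 8 | accelerate config --config_file tests/test_configs/<version>.yaml 9 | ``` 10 | 11 | Answer the prompts, choosing "No distributed training", and commit the resulting YAML alongside the existing fixtures.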
-------------------------------------------------------------------------------- /tests/test_configs/invalid_keys.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | deepspeed_config: {} 3 | distributed_type: 'NO' 4 | downcast_bf16: 'no' 5 | fsdp_config: {} 6 | machine_rank: 0 7 | main_process_ip: null 8 | main_process_port: null 9 | main_training_function: main 10 | mixed_precision: 'no' 11 | num_machines: 1 12 | num_processes: 1 13 | use_cpu: false 14 | invalid_key: "invalid_value" 15 | another_invalid_key: "another_invalid_value" -------------------------------------------------------------------------------- /tests/test_configs/latest.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | deepspeed_config: {} 3 | distributed_type: 'NO' 4 | downcast_bf16: 'no' 5 | fsdp_config: {} 6 | gpu_ids: all 7 | machine_rank: 0 8 | main_process_ip: null 9 | main_process_port: null 10 | main_training_function: main 11 | megatron_lm_config: {} 12 | mixed_precision: 'no' 13 | num_machines: 1 14 | num_processes: 1 15 | rdzv_backend: static 16 | same_network: true 17 | use_cpu: false 18 | tpu_name: 'test-tpu' 19 | tpu_zone: 'us-central1-a' 20 | commands: null 21 | command_file: tests/test_samples/test_command_file.sh -------------------------------------------------------------------------------- /tests/test_configs/latest_fsdp.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | distributed_type: FSDP 4 | downcast_bf16: 'no' 5 | enable_cpu_affinity: false 6 | fsdp_config: 7 | fsdp_activation_checkpointing: false 8 | fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP 9 | fsdp_backward_prefetch: BACKWARD_PRE 10 | fsdp_cpu_ram_efficient_loading: true 11 | fsdp_forward_prefetch: false 12 | fsdp_offload_params: false 13 | fsdp_sharding_strategy: FULL_SHARD 14 | fsdp_state_dict_type: SHARDED_STATE_DICT 15 | fsdp_sync_module_states: true 16 | fsdp_transformer_layer_cls_to_wrap: BertLayer 17 | fsdp_use_orig_params: true 18 | machine_rank: 0 19 | main_training_function: main 20 | mixed_precision: 'no' 21 | num_machines: 1 22 | num_processes: 1 23 | rdzv_backend: static 24 | same_network: true 25 | tpu_env: [] 26 | tpu_use_cluster: false 27 | tpu_use_sudo: false 28 | use_cpu: false 29 | -------------------------------------------------------------------------------- /tests/test_configs/validate_launch_cmd.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: true 3 | num_processes: 1 4 | distributed_type: 'NO' 5 | fsdp_config: 6 | fsdp_sync_module_states: false 7 | deepspeed_config: 8 | deepspeed_config_file: path/to/be/ignored 9 | -------------------------------------------------------------------------------- /tests/test_cpu.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import unittest 16 | 17 | from accelerate import debug_launcher 18 | from accelerate.test_utils import require_cpu, test_ops, test_script 19 | 20 | 21 | @require_cpu 22 | class MultiCPUTester(unittest.TestCase): 23 | def test_cpu(self): 24 | debug_launcher(test_script.main) 25 | 26 | def test_ops(self): 27 | debug_launcher(test_ops.main) 28 | -------------------------------------------------------------------------------- /tests/test_grad_sync.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from accelerate import debug_launcher 16 | from accelerate.test_utils import ( 17 | DEFAULT_LAUNCH_COMMAND, 18 | device_count, 19 | execute_subprocess_async, 20 | path_in_accelerate_package, 21 | require_cpu, 22 | require_multi_device, 23 | require_non_cpu, 24 | run_first, 25 | test_sync, 26 | ) 27 | from accelerate.test_utils.testing import AccelerateTestCase 28 | from accelerate.utils import patch_environment 29 | 30 | 31 | class SyncScheduler(AccelerateTestCase): 32 | test_file_path = path_in_accelerate_package("test_utils", "scripts", "test_sync.py") 33 | 34 | @require_cpu 35 | def test_gradient_sync_cpu_noop(self): 36 | debug_launcher(test_sync.main, num_processes=1) 37 | 38 | @require_cpu 39 | def test_gradient_sync_cpu_multi(self): 40 | debug_launcher(test_sync.main) 41 | 42 | @require_non_cpu 43 | def test_gradient_sync_gpu(self): 44 | test_sync.main() 45 | 46 | @run_first 47 | @require_multi_device 48 | def test_gradient_sync_gpu_multi(self): 49 | print(f"Found {device_count} devices.") 50 | cmd = DEFAULT_LAUNCH_COMMAND + [self.test_file_path] 51 | with patch_environment(omp_num_threads=1): 52 | execute_subprocess_async(cmd) 53 | -------------------------------------------------------------------------------- /tests/test_launch.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import argparse 16 | import unittest 17 | 18 | from accelerate.utils.launch import prepare_multi_gpu_env 19 | 20 | 21 | class TestPrepareMultiGpuEnv(unittest.TestCase): 22 | def test_auto_port_selection(self): 23 | args = argparse.Namespace( 24 | num_processes=1, 25 | num_machines=1, 26 | main_process_ip="127.0.0.1", 27 | main_process_port=0, 28 | machine_rank=0, 29 | module=False, 30 | no_python=False, 31 | debug=False, 32 | gpu_ids="all", 33 | mixed_precision="no", 34 | dynamo_backend="NO", 35 | dynamo_mode="default", 36 | dynamo_use_fullgraph=False, 37 | dynamo_use_dynamic=False, 38 | dynamo_use_regional_compilation=False, 39 | use_fsdp=False, 40 | fsdp_cpu_ram_efficient_loading=False, 41 | fsdp_sync_module_states=False, 42 | fsdp_version=None, 43 | fsdp_sharding_strategy=None, 44 | fsdp_reshard_after_forward=False, 45 | fsdp_offload_params=False, 46 | fsdp_min_num_params=0, 47 | fsdp_auto_wrap_policy=None, 48 | fsdp_transformer_layer_cls_to_wrap=None, 49 | fsdp_backward_prefetch=None, 50 | fsdp_state_dict_type=None, 51 | fsdp_forward_prefetch=False, 52 | fsdp_use_orig_params=False, 53 | fsdp_activation_checkpointing=False, 54 | use_tp=False, 55 | tp_size=1, 56 | use_megatron_lm=False, 57 | megatron_lm_tp_degree=1, 58 | megatron_lm_pp_degree=1, 59 | megatron_lm_gradient_clipping=1.0, 60 | megatron_lm_num_micro_batches=None, 61 | megatron_lm_sequence_parallelism=None, 62 | megatron_lm_recompute_activations=None, 63 | megatron_lm_use_distributed_optimizer=None, 64 | num_cpu_threads_per_process=1, 65 | enable_cpu_affinity=False, 66 | same_network=False, 67 | ) 68 | 69 | prepare_multi_gpu_env(args) 70 | self.assertIn("master_port", args.__dict__) 71 | self.assertNotEqual(args.master_port, "0") 72 | self.assertTrue(args.master_port.isdigit()) 73 | -------------------------------------------------------------------------------- /tests/test_logging.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import inspect 15 | import logging 16 | import os 17 | 18 | import pytest 19 | 20 | from accelerate import Accelerator 21 | from accelerate.logging import get_logger 22 | from accelerate.state import AcceleratorState 23 | 24 | 25 | def current_lineno() -> int: 26 | # A simple helper that returns the lineno of its call-site. 27 | caller_frame = inspect.currentframe().f_back 28 | caller_info = inspect.getframeinfo(caller_frame) 29 | return caller_info.lineno 30 | 31 | 32 | class CustomLogger(logging.LoggerAdapter): 33 | # Mocks a user-defined custom logger wrapper that sets `stacklevel=3`. 34 | def log(self, level, msg, *args, **kwargs): 35 | # E.g. the user wants to modify `stacklevel`, `accelerate.logging` 36 | # should respect the user's `stacklevel`.
For the specific value 37 | # of `3`, calling `CustomLogger.log()`, etc., should log that callsite, 38 | # rather than the callsite of the following `self.logger.log()`. 39 | kwargs["stacklevel"] = 3 40 | self.logger.log(level, msg, *args, **kwargs) 41 | 42 | 43 | @pytest.fixture(scope="module") 44 | def accelerator(): 45 | accelerator = Accelerator() 46 | yield accelerator 47 | AcceleratorState._reset_state(True) 48 | 49 | 50 | @pytest.mark.usefixtures("accelerator") 51 | def test_log_stack(caplog): 52 | logger = get_logger(__name__) 53 | logging.basicConfig( 54 | format="%(filename)s:%(name)s:%(lineno)s:%(funcName)s - %(message)s", 55 | datefmt="%m/%d %H:%M:%S", 56 | ) 57 | 58 | message = "Test" 59 | lineno = current_lineno() + 1 # the next line is the actual callsite 60 | logger.warning(message) 61 | 62 | assert len(caplog.records) == 1 63 | rec = caplog.records[0] 64 | assert rec.levelname == logging.getLevelName(logging.WARNING) 65 | assert rec.filename == os.path.basename(__file__) 66 | assert rec.name == __name__ 67 | assert rec.lineno == lineno 68 | assert rec.funcName == test_log_stack.__name__ 69 | assert rec.message == message 70 | 71 | 72 | @pytest.mark.usefixtures("accelerator") 73 | def test_custom_stacklevel(caplog): 74 | wrapped_logger = get_logger(__name__) 75 | logging.basicConfig( 76 | format="%(filename)s:%(name)s:%(lineno)s:%(funcName)s - %(message)s", 77 | datefmt="%m/%d %H:%M:%S", 78 | ) 79 | logger = CustomLogger(wrapped_logger, {}) 80 | 81 | message = "Test" 82 | lineno = current_lineno() + 1 # the next line is the actual callsite 83 | logger.warning(message) 84 | 85 | # `CustomLogger.log` sets a custom `stacklevel=3`, so `logger.warning` should 86 | # log its callsite (rather than that of the `wrapped_logger`). 87 | assert len(caplog.records) == 1 88 | rec = caplog.records[0] 89 | assert rec.levelname == logging.getLevelName(logging.WARNING) 90 | assert rec.filename == os.path.basename(__file__) 91 | assert rec.name == __name__ 92 | assert rec.lineno == lineno 93 | assert rec.funcName == test_custom_stacklevel.__name__ 94 | assert rec.message == message 95 | -------------------------------------------------------------------------------- /tests/test_metrics.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
14 | 15 | import unittest 16 | 17 | import numpy as np 18 | from packaging import version 19 | 20 | from accelerate import debug_launcher 21 | from accelerate.test_utils import ( 22 | DEFAULT_LAUNCH_COMMAND, 23 | device_count, 24 | execute_subprocess_async, 25 | path_in_accelerate_package, 26 | require_cpu, 27 | require_huggingface_suite, 28 | require_multi_device, 29 | require_single_device, 30 | run_first, 31 | ) 32 | from accelerate.utils import patch_environment 33 | 34 | 35 | @require_huggingface_suite 36 | @unittest.skipIf(version.parse(np.__version__) >= version.parse("2.0"), "Test requires numpy version < 2.0") 37 | class MetricTester(unittest.TestCase): 38 | def setUp(self): 39 | self.test_file_path = path_in_accelerate_package("test_utils", "scripts", "external_deps", "test_metrics.py") 40 | 41 | from accelerate.test_utils.scripts.external_deps import test_metrics # noqa: F401 42 | 43 | self.test_metrics = test_metrics 44 | 45 | @require_cpu 46 | def test_metric_cpu_noop(self): 47 | debug_launcher(self.test_metrics.main, num_processes=1) 48 | 49 | @require_cpu 50 | def test_metric_cpu_multi(self): 51 | debug_launcher(self.test_metrics.main) 52 | 53 | @require_single_device 54 | def test_metric_accelerator(self): 55 | self.test_metrics.main() 56 | 57 | @run_first 58 | @require_multi_device 59 | def test_metric_accelerator_multi(self): 60 | print(f"Found {device_count} devices.") 61 | cmd = DEFAULT_LAUNCH_COMMAND + [self.test_file_path] 62 | with patch_environment(omp_num_threads=1, ACCELERATE_LOG_LEVEL="INFO"): 63 | execute_subprocess_async(cmd) 64 | -------------------------------------------------------------------------------- /tests/test_optimizer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import pickle 16 | 17 | import torch 18 | 19 | from accelerate import Accelerator 20 | from accelerate.test_utils import require_cpu, require_fp16, require_non_cpu 21 | from accelerate.test_utils.testing import AccelerateTestCase 22 | 23 | 24 | @require_cpu 25 | class CPUOptimizerTester(AccelerateTestCase): 26 | def test_accelerated_optimizer_pickling(self): 27 | model = torch.nn.Linear(10, 10) 28 | optimizer = torch.optim.SGD(model.parameters(), 0.1) 29 | accelerator = Accelerator() 30 | optimizer = accelerator.prepare(optimizer) 31 | try: 32 | pickle.loads(pickle.dumps(optimizer)) 33 | except Exception as e: 34 | self.fail(f"Accelerated optimizer pickling failed with {e}") 35 | 36 | 37 | @require_fp16 38 | @require_non_cpu 39 | class OptimizerTester(AccelerateTestCase): 40 | def test_accelerated_optimizer_step_was_skipped(self): 41 | model = torch.nn.Linear(5, 5) 42 | optimizer = torch.optim.SGD(model.parameters(), 0.1) 43 | accelerator = Accelerator(mixed_precision="fp16") 44 | model, optimizer = accelerator.prepare(model, optimizer) 45 | 46 | loss = model(torch.randn(2, 5, device=accelerator.device)).sum() 47 | accelerator.backward(loss) 48 | for p in model.parameters(): 49 | # Fake the gradients, as if there's no overflow 50 | p.grad.fill_(0.01) 51 | 52 | optimizer.step() 53 | assert optimizer.step_was_skipped is False 54 | 55 | loss = model(torch.randn(2, 5, device=accelerator.device)).sum() 56 | accelerator.backward(loss) 57 | for p in model.parameters(): 58 | p.grad.fill_(0.01) 59 | # Manually set the gradients to be NaN, as if there's an overflow 60 | p.grad[0] = torch.tensor(float("nan")) 61 | 62 | optimizer.step() 63 | assert optimizer.step_was_skipped is True 64 | 65 | loss = model(torch.randn(2, 5, device=accelerator.device)).sum() 66 | accelerator.backward(loss) 67 | for p in model.parameters(): 68 | p.grad.fill_(0.01) 69 | # Manually set the gradients to be NaN, as if there's an overflow 70 | p.grad[0] = torch.tensor(float("nan")) 71 | 72 | optimizer.step() 73 | assert optimizer.step_was_skipped is True 74 | 75 | loss = model(torch.randn(2, 5, device=accelerator.device)).sum() 76 | accelerator.backward(loss) 77 | for p in model.parameters(): 78 | # Fake the gradients, as if there's no overflow 79 | p.grad.fill_(0.01) 80 | 81 | optimizer.step() 82 | assert optimizer.step_was_skipped is False 83 | -------------------------------------------------------------------------------- /tests/test_sagemaker.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | import unittest 15 | from dataclasses import dataclass 16 | 17 | import pytest 18 | 19 | from accelerate.commands.config.config_args import SageMakerConfig 20 | from accelerate.utils import ComputeEnvironment 21 | from accelerate.utils.launch import _convert_nargs_to_dict 22 | 23 | 24 | @dataclass 25 | class MockLaunchConfig(SageMakerConfig): 26 | compute_environment = ComputeEnvironment.AMAZON_SAGEMAKER 27 | fp16 = True 28 | ec2_instance_type = "ml.p3.2xlarge" 29 | iam_role_name = "accelerate_sagemaker_execution_role" 30 | profile = "hf-sm" 31 | region = "us-east-1" 32 | num_machines = 1 33 | base_job_name = "accelerate-sagemaker-1" 34 | pytorch_version = "1.6" 35 | transformers_version = "4.4" 36 | training_script = "train.py" 37 | success_training_script_args = [ 38 | "--model_name_or_path", 39 | "bert", 40 | "--do_train", 41 | "False", 42 | "--epochs", 43 | "3", 44 | "--learning_rate", 45 | "5e-5", 46 | "--max_steps", 47 | "50.5", 48 | ] 49 | fail_training_script_args = [ 50 | "--model_name_or_path", 51 | "bert", 52 | "--do_train", 53 | "--do_test", 54 | "False", 55 | "--do_predict", 56 | "--epochs", 57 | "3", 58 | "--learning_rate", 59 | "5e-5", 60 | "--max_steps", 61 | "50.5", 62 | ] 63 | 64 | 65 | class SageMakerLaunch(unittest.TestCase): 66 | def test_args_convert(self): 67 | # If no defaults are changed, `to_kwargs` returns an empty dict. 68 | converted_args = _convert_nargs_to_dict(MockLaunchConfig.success_training_script_args) 69 | assert isinstance(converted_args["model_name_or_path"], str) 70 | assert isinstance(converted_args["do_train"], bool) 71 | assert isinstance(converted_args["epochs"], int) 72 | assert isinstance(converted_args["learning_rate"], float) 73 | assert isinstance(converted_args["max_steps"], float) 74 | 75 | with pytest.raises(ValueError): 76 | _convert_nargs_to_dict(MockLaunchConfig.fail_training_script_args) 77 | -------------------------------------------------------------------------------- /tests/test_samples/MRPC/dev.csv: -------------------------------------------------------------------------------- 1 | label,sentence1,sentence2 2 | equivalent,He said the foodservice pie business doesn 't fit the company 's long-term growth strategy .,""" The foodservice pie business does not fit our long-term growth strategy ." 3 | not_equivalent,Magnarelli said Racicot hated the Iraqi regime and looked forward to using his long years of training in the war .,"His wife said he was "" 100 percent behind George Bush "" and looked forward to using his years of training in the war ." 4 | not_equivalent,"The dollar was at 116.92 yen against the yen , flat on the session , and at 1.2891 against the Swiss franc , also flat .","The dollar was at 116.78 yen JPY = , virtually flat on the session , and at 1.2871 against the Swiss franc CHF = , down 0.1 percent ." 5 | equivalent,The AFL-CIO is waiting until October to decide if it will endorse a candidate .,The AFL-CIO announced Wednesday that it will decide in October whether to endorse a candidate before the primaries . 6 | not_equivalent,No dates have been set for the civil or the criminal trial .,"No dates have been set for the criminal or civil cases , but Shanley has pleaded not guilty ." 7 | equivalent,Wal-Mart said it would check all of its million-plus domestic workers to ensure they were legally employed .,It has also said it would review all of its domestic employees more than 1 million to ensure they have legal status . 
8 | -------------------------------------------------------------------------------- /tests/test_samples/MRPC/train.csv: -------------------------------------------------------------------------------- 1 | label,sentence1,sentence2 2 | equivalent,He said the foodservice pie business doesn 't fit the company 's long-term growth strategy .,""" The foodservice pie business does not fit our long-term growth strategy ." 3 | not_equivalent,Magnarelli said Racicot hated the Iraqi regime and looked forward to using his long years of training in the war .,"His wife said he was "" 100 percent behind George Bush "" and looked forward to using his years of training in the war ." 4 | not_equivalent,"The dollar was at 116.92 yen against the yen , flat on the session , and at 1.2891 against the Swiss franc , also flat .","The dollar was at 116.78 yen JPY = , virtually flat on the session , and at 1.2871 against the Swiss franc CHF = , down 0.1 percent ." 5 | equivalent,The AFL-CIO is waiting until October to decide if it will endorse a candidate .,The AFL-CIO announced Wednesday that it will decide in October whether to endorse a candidate before the primaries . 6 | not_equivalent,No dates have been set for the civil or the criminal trial .,"No dates have been set for the criminal or civil cases , but Shanley has pleaded not guilty ." 7 | equivalent,Wal-Mart said it would check all of its million-plus domestic workers to ensure they were legally employed .,It has also said it would review all of its domestic employees more than 1 million to ensure they have legal status . 8 | -------------------------------------------------------------------------------- /tests/test_samples/test_command_file.sh: -------------------------------------------------------------------------------- 1 | echo "hello world" 2 | echo "this is a second command" -------------------------------------------------------------------------------- /tests/test_tpu.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | import sys 17 | import unittest 18 | 19 | from accelerate.test_utils import execute_subprocess_async, path_in_accelerate_package, require_tpu 20 | 21 | 22 | class MultiTPUTester(unittest.TestCase): 23 | test_file_path = path_in_accelerate_package("test_utils", "scripts", "test_script.py") 24 | test_dir = os.path.dirname(__file__) 25 | 26 | @require_tpu 27 | def test_tpu(self): 28 | distributed_args = f""" 29 | {self.test_dir}/xla_spawn.py 30 | --num_cores 8 31 | {self.test_file_path} 32 | """.split() 33 | cmd = [sys.executable] + distributed_args 34 | execute_subprocess_async(cmd) 35 | -------------------------------------------------------------------------------- /tests/tp/test_tp.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The HuggingFace Team. 
All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | from accelerate.test_utils.testing import ( 17 | TempDirTestCase, 18 | execute_subprocess_async, 19 | get_launch_command, 20 | path_in_accelerate_package, 21 | require_multi_device, 22 | require_non_torch_xla, 23 | require_tp, 24 | require_transformers, 25 | run_first, 26 | slow, 27 | ) 28 | from accelerate.utils import patch_environment 29 | 30 | 31 | @require_non_torch_xla 32 | @require_multi_device 33 | @require_transformers 34 | @require_tp 35 | @run_first 36 | @slow 37 | class TPIntegrationTest(TempDirTestCase): 38 | test_scripts_folder = path_in_accelerate_package("test_utils", "scripts", "external_deps") 39 | 40 | def setUp(self): 41 | super().setUp() 42 | self.test_tp_size = 2 43 | self.model_name_or_path = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" 44 | self.batch_size = 1 45 | from accelerate.utils import set_seed 46 | 47 | set_seed(42) 48 | 49 | def test_working_of_tp(self): 50 | self.test_file_path = self.test_scripts_folder / "test_performance.py" 51 | cmd = get_launch_command(num_processes=self.test_tp_size, num_machines=1, machine_rank=0) 52 | cmd.extend( 53 | [ 54 | self.test_file_path, 55 | f"--output_dir={self.tmpdir}", 56 | f"--model_name_or_path={self.model_name_or_path}", 57 | "--add_pad_token=true", 58 | "--tp_plan=auto", 59 | f"--tp_size={self.test_tp_size}", 60 | ] 61 | ) 62 | with patch_environment(omp_num_threads=1): 63 | execute_subprocess_async(cmd) 64 | -------------------------------------------------------------------------------- /tests/xla_spawn.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | """ 16 | A simple launcher script for TPU training 17 | 18 | Inspired by https://github.com/pytorch/pytorch/blob/master/torch/distributed/launch.py 19 | 20 | :: 21 | >>> python xla_spawn.py --num_cores=NUM_CORES_YOU_HAVE 22 | YOUR_TRAINING_SCRIPT.py (--arg1 --arg2 --arg3 and all other 23 | arguments of your training script) 24 | 25 | """ 26 | 27 | import importlib 28 | import sys 29 | from argparse import REMAINDER, ArgumentParser 30 | from pathlib import Path 31 | 32 | import torch_xla.distributed.xla_multiprocessing as xmp 33 | from torch_xla import device_count 34 | 35 | 36 | def parse_args(): 37 | """ 38 | Helper function parsing the command line options 39 | @retval ArgumentParser 40 | """ 41 | parser = ArgumentParser( 42 | description=( 43 | "PyTorch TPU distributed training launch helper utility that will spawn up multiple distributed processes" 44 | ) 45 | ) 46 | 47 | # Optional arguments for the launch helper 48 | num_devices = device_count() 49 | parser.add_argument( 50 | "--num_cores", 51 | type=int, 52 | default=num_devices, 53 | help="Number of TPU cores to use (1 or number of available devices).", 54 | ) 55 | 56 | # positional 57 | parser.add_argument( 58 | "training_script", 59 | type=str, 60 | help=( 61 | "The full path to the single TPU training " 62 | "program/script to be launched in parallel, " 63 | "followed by all the arguments for the " 64 | "training script" 65 | ), 66 | ) 67 | 68 | # rest from the training program 69 | parser.add_argument("training_script_args", nargs=REMAINDER) 70 | 71 | return parser.parse_args() 72 | 73 | 74 | def main(): 75 | args = parse_args() 76 | 77 | # Import training_script as a module. 78 | script_fpath = Path(args.training_script) 79 | sys.path.append(str(script_fpath.parent.resolve())) 80 | mod_name = script_fpath.stem 81 | mod = importlib.import_module(mod_name) 82 | 83 | # Patch sys.argv 84 | sys.argv = [args.training_script] + args.training_script_args 85 | num_cores = args.num_cores 86 | if num_cores == device_count() and num_cores != 1: 87 | # There is an error in xmp.spawn that causes it to fail when num_cores is specified and not 1, so we set it to 88 | # None when it matches the number of devices. 89 | num_cores = None 90 | xmp.spawn(mod._mp_fn, args=(), nprocs=num_cores) 91 | 92 | 93 | if __name__ == "__main__": 94 | main() 95 | -------------------------------------------------------------------------------- /utils/stale.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The HuggingFace Team, the AllenNLP library authors. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """ 15 | Script to close stale issues. Taken in part from the AllenNLP repository. 16 | https://github.com/allenai/allennlp.
17 | """ 18 | 19 | import os 20 | from datetime import datetime as dt 21 | from datetime import timezone 22 | 23 | from github import Github 24 | 25 | 26 | LABELS_TO_EXEMPT = [ 27 | "good first issue", 28 | "feature request", 29 | "wip", 30 | ] 31 | 32 | 33 | def main(): 34 | g = Github(os.environ["GITHUB_TOKEN"]) 35 | repo = g.get_repo("huggingface/accelerate") 36 | open_issues = repo.get_issues(state="open") 37 | 38 | for issue in open_issues: 39 | comments = sorted([comment for comment in issue.get_comments()], key=lambda i: i.created_at, reverse=True) 40 | last_comment = comments[0] if len(comments) > 0 else None 41 | current_time = dt.now(timezone.utc) 42 | days_since_updated = (current_time - issue.updated_at).days 43 | days_since_creation = (current_time - issue.created_at).days 44 | if ( 45 | last_comment is not None 46 | and last_comment.user.login == "github-actions[bot]" 47 | and days_since_updated > 7 48 | and days_since_creation >= 30 49 | and not any(label.name.lower() in LABELS_TO_EXEMPT for label in issue.get_labels()) 50 | ): 51 | # Close issue since it has been 7 days of inactivity since bot mention. 52 | issue.edit(state="closed") 53 | elif ( 54 | days_since_updated > 23 55 | and days_since_creation >= 30 56 | and not any(label.name.lower() in LABELS_TO_EXEMPT for label in issue.get_labels()) 57 | ): 58 | # Add stale comment 59 | issue.create_comment( 60 | "This issue has been automatically marked as stale because it has not had " 61 | "recent activity. If you think this still needs to be addressed " 62 | "please comment on this thread.\n\nPlease note that issues that do not follow the " 63 | "[contributing guidelines](https://github.com/huggingface/accelerate/blob/main/CONTRIBUTING.md) " 64 | "are likely to be ignored." 65 | ) 66 | 67 | 68 | if __name__ == "__main__": 69 | main() 70 | --------------------------------------------------------------------------------