├── .github ├── pull_request_template.md └── workflows │ └── aggregate-prs.yml ├── .gitignore ├── CODEOWNERS ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── NOTICE ├── README.md ├── inference-benchmarking ├── Readme.md ├── accuracy.py ├── clients │ ├── __init__.py │ ├── base.py │ ├── lm_eval │ │ ├── __init__.py │ │ ├── client.py │ │ └── scripts │ │ │ ├── __init__.py │ │ │ ├── run_lm_eval.sh │ │ │ └── setup_lm_eval.sh │ └── long_bench │ │ ├── __init__.py │ │ ├── client.py │ │ └── prompts │ │ ├── 0shot.txt │ │ ├── 0shot_cot.txt │ │ ├── 0shot_cot_ans.txt │ │ ├── 0shot_no_context.txt │ │ ├── 0shot_rag.txt │ │ └── __init__.py ├── config.yaml ├── requirements.txt ├── server │ ├── __init__.py │ ├── scripts │ │ ├── __init__.py │ │ └── start_server.sh │ └── vllm.py ├── server_config.py └── utils │ ├── __init__.py │ ├── artifacts.py │ ├── parser.py │ ├── process.py │ └── s3.py ├── releasenotes.md ├── tensorflow-neuron ├── README.md └── inference │ └── unet │ └── UnetTF2.ipynb ├── torch-neuron ├── README.md └── inference │ ├── beit │ └── BEiT.ipynb │ ├── bertbasecased │ └── BertBaseCased.ipynb │ ├── bertlargeuncased │ └── BertLargeUncased.ipynb │ ├── clip │ └── CLIP_Model_HF.ipynb │ ├── common │ ├── processing.py │ └── wrapper.py │ ├── craft │ └── Craft.ipynb │ ├── efficientnet │ └── EfficientNet.ipynb │ ├── fairseq │ └── Fairseq.ipynb │ ├── gfl_mmdet │ └── GFL.ipynb │ ├── hrnet │ └── HRnet.ipynb │ ├── marianmt │ └── MarianMT.ipynb │ ├── rcnn │ └── Rcnn.ipynb │ ├── resnet │ └── Resnet.ipynb │ ├── resnext │ └── Resnext.ipynb │ ├── robertabase │ └── RobertaBase.ipynb │ ├── ssd │ └── SSD300VGG16.ipynb │ ├── trocr │ └── TrOCR.ipynb │ ├── vgg │ └── VGG.ipynb │ ├── vit │ └── ViT.ipynb │ ├── yolof_detectron2 │ └── YoloF.ipynb │ ├── yolov5 │ └── Yolov5.ipynb │ ├── yolov6 │ └── Yolov6.ipynb │ └── yolov7 │ └── Yolov7.ipynb └── torch-neuronx ├── README.md ├── inference ├── customop_mlp │ ├── README.md │ ├── neuron-multicore │ │ ├── build.py │ │ ├── inference.py │ │ ├── model.py │ │ ├── my_ops.py │ │ ├── relu.cpp │ │ └── shape.cpp │ ├── neuron-tcm │ │ ├── build.py │ │ ├── inference.py │ │ ├── model.py │ │ ├── my_ops.py │ │ ├── relu.cpp │ │ └── shape.cpp │ └── neuron │ │ ├── build.py │ │ ├── inference.py │ │ ├── model.py │ │ ├── my_ops.py │ │ ├── relu.cpp │ │ └── shape.cpp ├── hf_pretrained_bert_inference_on_trn1.ipynb ├── hf_pretrained_clip_base_inference_on_inf2.ipynb ├── hf_pretrained_clip_large_inference_on_inf2.ipynb ├── hf_pretrained_distilbert_Inference_on_trn1.ipynb ├── hf_pretrained_gpt2_feature_extraction_on_trn1.ipynb ├── hf_pretrained_perceiver_language_inference.ipynb ├── hf_pretrained_perceiver_multimodal_inference.ipynb ├── hf_pretrained_perceiver_vision_inference.ipynb ├── hf_pretrained_pixart_alpha_inference_on_inf2.ipynb ├── hf_pretrained_pixart_sigma_1k │ ├── compile_latency_optimized.sh │ ├── compile_throughput_optimized.sh │ ├── hf_pretrained_pixart_sigma_1k_latency_optimized.ipynb │ ├── hf_pretrained_pixart_sigma_1k_throughput_optimized.ipynb │ ├── neuron_pixart_sigma │ │ ├── cache_hf_model.py │ │ ├── compile_decoder.py │ │ ├── compile_text_encoder.py │ │ ├── compile_transformer_latency_optimized.py │ │ ├── compile_transformer_throughput_optimized.py │ │ ├── neuron_commons.py │ │ └── neuron_parallel_utils.py │ └── requirements.txt ├── hf_pretrained_pixart_sigma_inference_on_inf2.ipynb ├── hf_pretrained_roberta_inference_on_frn1.ipynb ├── hf_pretrained_sd15_512_inference.ipynb ├── hf_pretrained_sd2_512_inference.ipynb ├── hf_pretrained_sd2_768_inference.ipynb ├── 
hf_pretrained_sd2_inpainting_936_624_inference.ipynb ├── hf_pretrained_sd_x4_upscaler_inference.ipynb ├── hf_pretrained_sdxl_base_1024_inference.ipynb ├── hf_pretrained_sdxl_base_and_refiner_1024_inference.ipynb ├── hf_pretrained_vit_inference_on_inf2.ipynb ├── hf_pretrained_wav2vec2_conformer_relpos_inference_on_inf2.ipynb ├── hf_pretrained_wav2vec2_conformer_rope_inference_on_inf2.ipynb ├── pretrained_unet_inference_on_trn1.ipynb ├── sd2_inpainting_mask.png ├── sd2_inpainting_photo.png ├── tv_pretrained_resnet50_inference_on_trn1.ipynb └── tv_pretrained_vgg_inference_on_trn1.ipynb ├── microbenchmark ├── matmult_linear.py ├── microbenchmark.ipynb └── ubench_utils.py ├── training ├── aws-batch │ ├── all-reduce │ │ ├── README.md │ │ ├── build_configs_and_setup.sh │ │ ├── docker │ │ │ ├── Dockerfile │ │ │ ├── allreduce.py │ │ │ └── allreduce.sh │ │ ├── submit_job.sh │ │ └── templates │ │ │ ├── build_docker_image.sh │ │ │ ├── compute_env.json │ │ │ ├── create_resources.sh │ │ │ ├── job_def.json │ │ │ ├── job_queue.json │ │ │ └── launch_template.json │ └── llama2 │ │ ├── README.md │ │ ├── config.txt │ │ ├── docker │ │ ├── Dockerfile │ │ └── llama_batch_training.sh │ │ ├── images │ │ └── aws-batch.png │ │ ├── scripts │ │ ├── build_and_push_docker_image.sh │ │ ├── cleanup.sh │ │ ├── create_resources.sh │ │ ├── download_and_tokenize_data.sh │ │ └── submit_batch_job.sh │ │ ├── setup.sh │ │ └── templates │ │ ├── compute_env.json │ │ ├── job_def.json │ │ ├── job_queue.json │ │ └── launch_template.json ├── common │ ├── hf_utils.py │ └── vision_utils.py ├── customop_mlp │ ├── README.md │ ├── neuron │ │ ├── build.py │ │ ├── model.py │ │ ├── my_ops.py │ │ ├── relu.cpp │ │ ├── shape.cpp │ │ └── train.py │ └── pytorch │ │ ├── build.py │ │ ├── model.py │ │ ├── my_ops.py │ │ ├── relu.cpp │ │ └── train_cpu.py ├── dp_bert_hf_pretrain │ ├── adamw_fp32_optim_params.py │ ├── adamw_fp32_params_copy.py │ ├── dp_bert_large_hf_pretrain_hdf5.py │ ├── dp_bert_large_hf_pretrain_hdf5_THIRD-PARTY-LICENSES.txt │ ├── lamb.py │ ├── requirements.txt │ ├── run_dp_bert_large_hf_pretrain_bf16_s128.sh │ ├── run_dp_bert_large_hf_pretrain_bf16_s128_lamb.sh │ ├── run_dp_bert_large_hf_pretrain_bf16_s512_lamb_phase2.sh │ └── run_dp_bert_large_hf_pretrain_bf16_s512_phase2.sh ├── hf_bert_jp │ └── bert-jp-tutorial.ipynb ├── hf_contrastive_image_text │ ├── CLIPBase.ipynb │ ├── CLIPLarge.ipynb │ └── run_clip.py ├── hf_image_classification │ ├── VisionPerceiverConv.ipynb │ ├── run_image_classification.py │ └── vit.ipynb ├── hf_language_modeling │ └── gpt2 │ │ ├── gpt2.ipynb │ │ └── run_clm.patch ├── hf_sentiment_analysis │ ├── .gitignore │ ├── 01-hf-single-neuron.ipynb │ ├── 02-hf-distributed-training.ipynb │ ├── README.md │ ├── code │ │ ├── 01-trainium-single-core │ │ │ └── train.py │ │ └── 02-trainium-distributed-training │ │ │ └── train.py │ └── data │ │ ├── data.csv │ │ ├── test.csv │ │ └── train.csv ├── hf_summarization │ ├── BartLarge.ipynb │ ├── T5Large.ipynb │ └── run_summarization.py ├── hf_text_classification │ ├── AlbertBase.ipynb │ ├── BertBaseCased.ipynb │ ├── BertBaseUncased.ipynb │ ├── BertLargeCased.ipynb │ ├── BertLargeUncased.ipynb │ ├── CamembertBase.ipynb │ ├── DistilbertBaseUncased.ipynb │ ├── ElectraSmall.ipynb │ ├── LanguagePerceiver.ipynb │ ├── README.md │ ├── RobertaBase.ipynb │ ├── RobertaLarge.ipynb │ ├── XlmRobertaBase.ipynb │ └── run_glue.py ├── llama2 │ ├── adamw_fp32_optim_params.py │ ├── convert_checkpoints.py │ ├── get_dataset.py │ ├── modeling_llama_nxd.py │ └── requirements.txt ├── mnist_mlp │ ├── eval.py 
│ ├── eval_using_trace.py │ ├── model.py │ ├── train.py │ ├── train_cpu.py │ ├── train_torchrun.py │ └── train_xmp.py ├── resnet50 │ ├── resnet50.ipynb │ └── run_image_classification.py ├── stable_diffusion │ ├── requirements.txt │ ├── run.py │ └── sd_training_neuron.py ├── tp_dp_bert_hf_pretrain │ ├── requirements.txt │ └── tp_dp_bert_large_hf_pretrain_hdf5.py ├── tp_dp_gpt_neox_hf_pretrain │ └── common │ │ ├── adamw_fp32_optim_params.py │ │ ├── get_dataset.py │ │ └── requirements.txt ├── unet_image_segmentation │ ├── model.py │ ├── train.py │ └── unet.ipynb └── zero1_gpt2 │ ├── config_1p5B_gpt2.json │ ├── neuron_utils.py │ ├── requirements.txt │ ├── run_clm.sh │ ├── run_clm.slurm │ ├── run_clm_compile.slurm │ ├── run_clm_no_trainer.py │ └── uncomment_gradaccum.sh └── transformers-neuronx ├── README.md └── inference ├── codellama-13b-16k-sampling.ipynb ├── facebook-opt-13b-sampling.ipynb ├── facebook-opt-30b-sampling.ipynb ├── facebook-opt-66b-sampling.ipynb ├── gpt-j-6b-sampling-dp.ipynb ├── gpt-j-6b-sampling.ipynb ├── gpt-j-dp.py ├── llama-3.1-405b-multinode-16k-sampling.ipynb ├── llama-3.1-70b-64k-sampling.ipynb ├── llama-3.1-70b-eagle-speculative-decoding.ipynb ├── llama-3.1-70b-speculative-decoding.ipynb ├── llama-3.1-8b-128k-sampling.ipynb ├── llama-3.1-8b-32k-sampling.ipynb ├── llama-70b-sampling.ipynb ├── meta-llama-2-13b-sampling.ipynb ├── meta-llama-3-70b-sampling.ipynb ├── meta-llama-3-8b-sampling.ipynb ├── meta-llama-3.1-70b-sampling.ipynb ├── meta-llama-3.1-8b-sampling.ipynb ├── mistralai-Mistral-7b-Instruct-v0.2.ipynb ├── mixtral-8x7b-sampling.ipynb └── speculative_sampling.ipynb /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | 2 | *Description:* 3 | 4 | *Issue #, sim, or t.corp if available:* 5 | 6 | * Link to RTD for my changes: https://github.com/aws-neuron/aws-neuron-samples-staging/YOUR_BRANCH_NAME/ 7 | 8 | * Submitter Checklist 9 | * Tested on: Neuron SDK release version, instance type. 10 | * I've completely filled out the form above! 11 | **(MANDATORY)** The PR needs test run output: 12 | 13 | * I have provided the output with expected metrics in a metrics.json file 14 | 15 | * I have attached metrics.json in the PR 16 | 17 | * I have attached golden_step_loss.txt 18 | 19 | * I have added a screenshot of the plotted loss curve 20 | 21 | * (If applicable) I've automated a test to safeguard my changes from regression. 22 | * (If applicable) I've posted test collateral to prove my change was effective and not harmful. 23 | * (If applicable) I've added someone from QA to the list of reviewers. Do this if you didn't make an automated test or feel it's appropriate for another reason. 24 | * (If applicable) I've reviewed the licenses of updated and new binaries and their dependencies to make sure all licenses are on the pre-approved Amazon license list.
25 | * Reviewer Checklist 26 | * I've verified the changes render correctly on RTD (link above) 27 | * I've ensured the submitter completed the form 28 | * (If appropriate) I've verified the metrics.json file provided by the submitter 29 | 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /.github/workflows/aggregate-prs.yml: -------------------------------------------------------------------------------- 1 | name: Merge PR into Dynamic Branch on Label 2 | 3 | on: 4 | pull_request_target: 5 | types: [labeled, synchronize] 6 | branches: 7 | - master 8 | 9 | jobs: 10 | merge-to-dynamic-branch: 11 | if: github.event.label.name != 'do-not-merge' # Excludes PRs labeled with do-not-merge 12 | runs-on: ubuntu-latest 13 | steps: 14 | - name: Checkout Repository 15 | uses: actions/checkout@v2 16 | with: 17 | ref: ${{ github.event.pull_request.head.ref }} 18 | fetch-depth: 0 19 | 20 | - name: Configure Git 21 | run: | 22 | git config user.name "GitHub Actions" 23 | git config user.email "actions@github.com" 24 | 25 | - name: Check PR Labels and Merge for New Commit Events 26 | if: github.event.action == 'synchronize' 27 | run: | 28 | LABELS_JSON=$(gh pr view ${{ github.event.pull_request.number }} --json labels) 29 | LABELS=$(echo "$LABELS_JSON" | jq -r '.labels[].name') 30 | for LABEL_BRANCH in $LABELS; do 31 | # Check if the branch exists 32 | if git show-ref --verify --quiet refs/heads/$LABEL_BRANCH; then 33 | echo "Branch $LABEL_BRANCH already exists." 34 | else 35 | echo "Branch $LABEL_BRANCH does not exist, creating it." 36 | git branch $LABEL_BRANCH origin/master 37 | fi 38 | git checkout $LABEL_BRANCH 39 | 40 | # Merge PR changes into dynamic branch 41 | git merge ${{ github.event.pull_request.head.sha }} --no-ff --no-commit 42 | git commit -m "Merged PR #${{ github.event.pull_request.number }} due to new commits on labeled PR" 43 | git push origin $LABEL_BRANCH 44 | done 45 | env: 46 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 47 | 48 | - name: Merge for Labeled Event 49 | if: github.event.action == 'labeled' 50 | run: | 51 | LABEL_BRANCH=${{ github.event.label.name }} 52 | # Check if the branch exists 53 | if git show-ref --verify --quiet refs/heads/$LABEL_BRANCH; then 54 | echo "Branch $LABEL_BRANCH already exists." 55 | else 56 | echo "Branch $LABEL_BRANCH does not exist, creating it." 57 | git branch $LABEL_BRANCH origin/master 58 | fi 59 | git checkout $LABEL_BRANCH 60 | 61 | # Merge PR changes into dynamic branch 62 | git merge ${{ github.event.pull_request.head.sha }} --no-ff --no-commit 63 | git commit -m "Merged PR #${{ github.event.pull_request.number }} due to label '$LABEL_BRANCH'" 64 | git push origin $LABEL_BRANCH 65 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .ipynb_checkpoints 3 | **/__pycache__ 4 | -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | # This file defines code owners for the repository. It allows setting code reviewers for all pull requests that merge to the master branch 2 | # Each line is a file pattern followed by one or more owners.
3 | 4 | # Reference guide - https://docs.github.com/en/github/creating-cloning-and-archiving-repositories/creating-a-repository-on-github/about-code-owners#example-[…]ners-file 5 | # Example - These owners will be the default owners for everything in 6 | # the repo. Unless a later match takes precedence, 7 | # @global-owner1 and @global-owner2 will be requested for 8 | # review when someone opens a pull request. 9 | # * @global-owner1 @global-owner2 10 | 11 | * @aws-maens @natemail-aws @rgrandhiamzn 12 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 5 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *main* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional documentation on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/).
41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to contribute to. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project, we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public GitHub issue. 55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 60 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Amazon Software License 1.0 2 | 3 | This Amazon Software License ("License") governs your use, reproduction, and 4 | distribution of the accompanying software as specified below. 5 | 6 | 1. Definitions 7 | 8 | "Licensor" means any person or entity that distributes its Work. 9 | 10 | "Software" means the original work of authorship made available under this 11 | License. 12 | 13 | "Work" means the Software and any additions to or derivative works of the 14 | Software that are made available under this License. 15 | 16 | The terms "reproduce," "reproduction," "derivative works," and 17 | "distribution" have the meaning as provided under U.S. copyright law; 18 | provided, however, that for the purposes of this License, derivative works 19 | shall not include works that remain separable from, or merely link (or bind 20 | by name) to the interfaces of, the Work. 21 | 22 | Works, including the Software, are "made available" under this License by 23 | including in or with the Work either (a) a copyright notice referencing the 24 | applicability of this License to the Work, or (b) a copy of this License. 25 | 26 | 2. License Grants 27 | 28 | 2.1 Copyright Grant. Subject to the terms and conditions of this License, 29 | each Licensor grants to you a perpetual, worldwide, non-exclusive, 30 | royalty-free, copyright license to reproduce, prepare derivative works of, 31 | publicly display, publicly perform, sublicense and distribute its Work and 32 | any resulting derivative works in any form. 33 | 34 | 2.2 Patent Grant. Subject to the terms and conditions of this License, each 35 | Licensor grants to you a perpetual, worldwide, non-exclusive, royalty-free 36 | patent license to make, have made, use, sell, offer for sale, import, and 37 | otherwise transfer its Work, in whole or in part. The foregoing license 38 | applies only to the patent claims licensable by Licensor that would be 39 | infringed by Licensor's Work (or portion thereof) individually and 40 | excluding any combinations with any other materials or technology. 41 | 42 | 3. Limitations 43 | 44 | 3.1 Redistribution.
You may reproduce or distribute the Work only if 45 | (a) you do so under this License, (b) you include a complete copy of this 46 | License with your distribution, and (c) you retain without modification 47 | any copyright, patent, trademark, or attribution notices that are present 48 | in the Work. 49 | 50 | 3.2 Derivative Works. You may specify that additional or different terms 51 | apply to the use, reproduction, and distribution of your derivative works 52 | of the Work ("Your Terms") only if (a) Your Terms provide that the use 53 | limitation in Section 3.3 applies to your derivative works, and (b) you 54 | identify the specific derivative works that are subject to Your Terms. 55 | Notwithstanding Your Terms, this License (including the redistribution 56 | requirements in Section 3.1) will continue to apply to the Work itself. 57 | 58 | 3.3 Use Limitation. The Work and any derivative works thereof only may be 59 | used or intended for use with the web services, computing platforms or 60 | applications provided by Amazon.com, Inc. or its affiliates, including 61 | Amazon Web Services, Inc. 62 | 63 | 3.4 Patent Claims. If you bring or threaten to bring a patent claim against 64 | any Licensor (including any claim, cross-claim or counterclaim in a 65 | lawsuit) to enforce any patents that you allege are infringed by any Work, 66 | then your rights under this License from such Licensor (including the 67 | grants in Sections 2.1 and 2.2) will terminate immediately. 68 | 69 | 3.5 Trademarks. This License does not grant any rights to use any 70 | Licensor's or its affiliates' names, logos, or trademarks, except as 71 | necessary to reproduce the notices described in this License. 72 | 73 | 3.6 Termination. If you violate any term of this License, then your rights 74 | under this License (including the grants in Sections 2.1 and 2.2) will 75 | terminate immediately. 76 | 77 | 4. Disclaimer of Warranty. 78 | 79 | THE WORK IS PROVIDED "AS IS" WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 80 | EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF 81 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR 82 | NON-INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER 83 | THIS LICENSE. SOME STATES' CONSUMER LAWS DO NOT ALLOW EXCLUSION OF AN 84 | IMPLIED WARRANTY, SO THIS DISCLAIMER MAY NOT APPLY TO YOU. 85 | 86 | 5. Limitation of Liability. 87 | 88 | EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL 89 | THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE 90 | SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT, 91 | INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF OR 92 | RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK (INCLUDING 93 | BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION, LOST PROFITS 94 | OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER COMMERCIAL DAMAGES 95 | OR LOSSES), EVEN IF THE LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF 96 | SUCH DAMAGES. 97 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # AWS Neuron Samples 2 | 3 | This repository contains samples for [AWS Neuron](https://aws.amazon.com/machine-learning/neuron/), the software development kit (SDK) that enables machine learning (ML) inference and training workloads on the AWS ML accelerator chips [Inferentia](https://aws.amazon.com/machine-learning/inferentia/) and [Trainium](https://aws.amazon.com/machine-learning/trainium/). 4 | 5 | The samples in this repository provide an indication of the types of deep learning models that can be used with Trainium and Inferentia, but do not represent an exhaustive list of supported models. If you have additional model samples that you would like to contribute to this repository, please submit a pull request following the repository's contribution [guidelines](CONTRIBUTING.md). 6 | 7 | Samples are organized by use case (training, inference) and deep learning framework (PyTorch, TensorFlow) below: 8 | 9 | ## Training 10 | 11 | | Framework | Description | Instance Type | 12 | | --- | --- | --- | 13 | | [PyTorch NeuronX (torch-neuronx)](torch-neuronx/README.md#training) | Sample scripts for training various PyTorch models on AWS Trainium | Trn1, Trn1n & Inf2 | 14 | 15 | | Usage | Description | Instance Type | 16 | | --- | --- | --- | 17 | | [Nemo Megatron for Neuron](https://github.com/aws-neuron/neuronx-nemo-megatron) | A library that enables large-scale distributed training of language models such as Llama and is adapted from Nemo Megatron. | Trn1, Trn1n | 18 | | [AWS Neuron samples for ParallelCluster](https://github.com/aws-neuron/aws-neuron-parallelcluster-samples) | How to use AWS ParallelCluster to build an HPC compute cluster that uses Trn1 compute nodes to run your distributed ML training job. | Trn1, Trn1n | 19 | | [AWS Neuron samples for EKS](https://github.com/aws-neuron/aws-neuron-eks-samples) | The samples in this repository demonstrate the types of patterns that can be used to deliver inference and distributed training on EKS using Inferentia and Trainium. | Trn1, Trn1n | 20 | | [AWS Neuron samples for SageMaker](https://github.com/aws-neuron/aws-neuron-sagemaker-samples) | SageMaker samples using ml.trn1 instances for machine learning (ML) training workloads on the AWS ML accelerator chip Trainium.
| Trn1, Trn1n | 21 | 22 | 23 | ## Inference 24 | 25 | | Framework | Description | Instance Type | 26 | | --- | --- | --- | 27 | | [PyTorch NeuronX (torch-neuronx)](torch-neuronx/README.md#inference) | Sample Jupyter notebooks demonstrating model compilation and inference for various PyTorch models on AWS Inferentia2 and Trainium | Inf2 & Trn1 | 28 | | [PyTorch NeuronX (transformers-neuronx)](transformers-neuronx) | Sample Jupyter notebooks demonstrating tensor parallel inference for various PyTorch large language models (LLMs) on AWS Inferentia2 and Trainium | Inf2 & Trn1 | 29 | | [PyTorch Neuron (torch-neuron)](torch-neuron) | Sample Jupyter notebooks demonstrating model compilation and inference for various PyTorch models on AWS Inferentia | Inf1 | 30 | | [TensorFlow Neuron (tensorflow-neuron)](tensorflow-neuron) | Sample Jupyter notebooks demonstrating model compilation and inference for various TensorFlow models on AWS Inferentia | Inf1 | 31 | 32 | | Usage | Description | Instance Type | 33 | | --- | --- | --- | 34 | | [AWS Neuron samples for SageMaker](https://github.com/aws-neuron/aws-neuron-sagemaker-samples) | SageMaker samples using ml.inf2 and ml.trn1 instances for machine learning (ML) inference workloads on the AWS ML accelerator chips Inferentia2 and Trainium. | Inf2 & Trn1 | 35 | 36 | 37 | ## Getting Help 38 | 39 | If you encounter issues with any of the samples in this repository, please open an issue via the GitHub Issues feature. 40 | 41 | ## Contributing 42 | 43 | Please refer to the [CONTRIBUTING](CONTRIBUTING.md) document for details on contributing additional samples to this repository. 44 | 45 | 46 | ## Release Notes 47 | 48 | Please refer to the [Change Log](releasenotes.md). 49 | 50 | ## Known Issues 51 | 52 | | Model | Framework | Training/Inference | Instance Type | Status | 53 | | --- | --- | --- | --- | --- | 54 | | Fairseq | PyTorch | Inference | Inf1 | RuntimeError: No operations were successfully partitioned and compiled to neuron for this model - aborting trace! | 55 | | Yolof | PyTorch | Inference | Inf1 | RuntimeError: No operations were successfully partitioned and compiled to neuron for this model - aborting trace! | 56 | -------------------------------------------------------------------------------- /inference-benchmarking/Readme.md: -------------------------------------------------------------------------------- 1 | # Inference benchmarking 2 | 3 | This folder contains scripts to evaluate the accuracy of LLM inference with open-source datasets. Please refer to the [Accuracy Eval Developer Guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/developer_guides/accuracy-eval-with-datasets.html) or the [Accuracy Evaluation Tutorial](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/tutorials/trn1-llama3.1-70b-instruct-accuracy-eval.html) for details on how to use these scripts. In the future, we will expand this folder with performance benchmarking scripts based on tools such as LLMPerf, as well as additional accuracy evaluation scripts.
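The snippet below is a minimal sketch (assumed usage, run from this folder) of how the `config.yaml` file in this directory can be loaded programmatically with the `ConfigParser` helper from `utils/parser.py`; the printed fields come from the `ServerConfig` and `AccuracyScenario` dataclasses used by these scripts.

```python
# Minimal sketch: load the benchmark configuration into typed dataclasses.
# Assumes it is run from the inference-benchmarking folder so that the
# local `utils` package and `accuracy.py` module are importable.
from utils.parser import ConfigParser

server_config, test_config = ConfigParser.parse_config("config.yaml")
print(f"Server: {server_config.name} (port {server_config.server_port})")
for name, scenario in test_config.accuracy.items():
    # `client` is one of the evaluation clients under clients/, e.g. "lm_eval"
    print(f"Accuracy scenario '{name}' uses client '{scenario.client}'")
```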
-------------------------------------------------------------------------------- /inference-benchmarking/clients/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-neuron/aws-neuron-samples/15be8c363a3cbcf7d0795f782085a3b0b919e599/inference-benchmarking/clients/__init__.py -------------------------------------------------------------------------------- /inference-benchmarking/clients/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from pathlib import Path 3 | from typing import Any, Dict 4 | 5 | 6 | class EvalClient(ABC): 7 | """Base class for evaluation clients""" 8 | 9 | def __init__(self): 10 | self.scripts_dir = Path(__file__).parent 11 | 12 | @abstractmethod 13 | def setup(self) -> None: 14 | """Setup the client (install dependencies, etc.)""" 15 | pass 16 | 17 | @abstractmethod 18 | def run(self, server_port: int, scenario_config: Dict[str, Any]) -> Dict[str, Any]: 19 | """ 20 | Run evaluation and return standardized results 21 | 22 | Returns: 23 | Dict with standardized format: 24 | { 25 | "metrics": { 26 | "metric_name": value, 27 | ... 28 | }, 29 | "metadata": { 30 | "scenario": str, 31 | "client": str, 32 | "timestamp": str, 33 | ... 34 | }, 35 | "raw_results": Dict # Original client output 36 | } 37 | """ 38 | pass 39 | 40 | def _get_script_path(self, script_name: str) -> str: 41 | return str(self.scripts_dir / script_name) 42 | -------------------------------------------------------------------------------- /inference-benchmarking/clients/lm_eval/__init__.py: -------------------------------------------------------------------------------- 1 | from .client import LMEvalClient 2 | -------------------------------------------------------------------------------- /inference-benchmarking/clients/lm_eval/scripts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-neuron/aws-neuron-samples/15be8c363a3cbcf7d0795f782085a3b0b919e599/inference-benchmarking/clients/lm_eval/scripts/__init__.py -------------------------------------------------------------------------------- /inference-benchmarking/clients/lm_eval/scripts/run_lm_eval.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Define default values 4 | model=${1} 5 | model_path=${2} 6 | max_concurrent_req=${3:-1} 7 | port=${4:-8000} 8 | task_name=${5:-"gsm8k_cot"} 9 | results_dir=${6} 10 | timeout=${7:-7200} 11 | limit=${8:-200} 12 | use_chat=${9:-true} 13 | 14 | source ~/lm_eval_venv/bin/activate 15 | 16 | echo "Running LM Eval Client for model: ${model}, model_path: ${model_path}, max_concurrent_req: ${max_concurrent_req}, port: ${port}, task_name: ${task_name}, results_dir: ${results_dir}, timeout: ${timeout}, limit: ${limit}, use_chat: ${use_chat}" 17 | 18 | set -x 19 | 20 | export OPENAI_API_KEY=EMPTY 21 | export OPENAI_API_BASE="http://localhost:${port}/v1" 22 | 23 | # Set the endpoint based on use_chat 24 | if [ "$use_chat" = true ] ; then 25 | endpoint="chat/completions" 26 | model_type="local-chat-completions" 27 | additional_args="--apply_chat_template" 28 | echo "Starting lm_eval with chat completions" 29 | else 30 | endpoint="completions" 31 | model_type="local-completions" 32 | additional_args="" 33 | echo "Starting lm_eval without chat completions" 34 | fi 35 | 36 | # Common arguments with dynamic endpoint 37 | 
common_args=( 38 | "--tasks ${task_name}" 39 | "--model_args model=${model_path},base_url=http://localhost:${port}/v1/${endpoint},tokenized_requests=False,tokenizer_backend=None,num_concurrent=${max_concurrent_req},timeout=${timeout}" 40 | "--log_samples" 41 | "--output_path ${results_dir}" 42 | "--limit ${limit}" 43 | ) 44 | 45 | # Execute the command 46 | python -m lm_eval \ 47 | --model ${model_type} \ 48 | ${common_args[@]} \ 49 | ${additional_args} -------------------------------------------------------------------------------- /inference-benchmarking/clients/lm_eval/scripts/setup_lm_eval.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Install lm_eval into a dedicated virtual environment 3 | cd ~ 4 | python3 -m venv ~/lm_eval_venv 5 | source ~/lm_eval_venv/bin/activate 6 | pip install -U pip 7 | pip install lm_eval[api]==0.4.7 -------------------------------------------------------------------------------- /inference-benchmarking/clients/long_bench/__init__.py: -------------------------------------------------------------------------------- 1 | from .client import LongBenchClient 2 | -------------------------------------------------------------------------------- /inference-benchmarking/clients/long_bench/prompts/0shot.txt: -------------------------------------------------------------------------------- 1 | Please read the following text and answer the question below. 2 | 3 | 4 | $DOC$ 5 | 6 | 7 | What is the correct answer to this question: $Q$ 8 | Choices: 9 | (A) $C_A$ 10 | (B) $C_B$ 11 | (C) $C_C$ 12 | (D) $C_D$ 13 | 14 | Format your response as follows: "The correct answer is (insert answer here)". -------------------------------------------------------------------------------- /inference-benchmarking/clients/long_bench/prompts/0shot_cot.txt: -------------------------------------------------------------------------------- 1 | Please read the following text and answer the questions below. 2 | 3 | 4 | $DOC$ 5 | 6 | 7 | What is the correct answer to this question: $Q$ 8 | Choices: 9 | (A) $C_A$ 10 | (B) $C_B$ 11 | (C) $C_C$ 12 | (D) $C_D$ 13 | 14 | Let’s think step by step: -------------------------------------------------------------------------------- /inference-benchmarking/clients/long_bench/prompts/0shot_cot_ans.txt: -------------------------------------------------------------------------------- 1 | Please read the following text and answer the questions below. 2 | 3 | The text is too long and omitted here. 4 | 5 | What is the correct answer to this question: $Q$ 6 | Choices: 7 | (A) $C_A$ 8 | (B) $C_B$ 9 | (C) $C_C$ 10 | (D) $C_D$ 11 | 12 | Let’s think step by step: $COT$ 13 | 14 | Based on the above, what is the single, most likely answer choice? Format your response as follows: "The correct answer is (insert answer here)". -------------------------------------------------------------------------------- /inference-benchmarking/clients/long_bench/prompts/0shot_no_context.txt: -------------------------------------------------------------------------------- 1 | What is the correct answer to this question: $Q$ 2 | Choices: 3 | (A) $C_A$ 4 | (B) $C_B$ 5 | (C) $C_C$ 6 | (D) $C_D$ 7 | 8 | What is the single, most likely answer choice? Format your response as follows: "The correct answer is (insert answer here)".
-------------------------------------------------------------------------------- /inference-benchmarking/clients/long_bench/prompts/0shot_rag.txt: -------------------------------------------------------------------------------- 1 | Please read the following retrieved text chunks and answer the question below. 2 | 3 | 4 | $DOC$ 5 | 6 | 7 | What is the correct answer to this question: $Q$ 8 | Choices: 9 | (A) $C_A$ 10 | (B) $C_B$ 11 | (C) $C_C$ 12 | (D) $C_D$ 13 | 14 | Format your response as follows: "The correct answer is (insert answer here)". -------------------------------------------------------------------------------- /inference-benchmarking/clients/long_bench/prompts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-neuron/aws-neuron-samples/15be8c363a3cbcf7d0795f782085a3b0b919e599/inference-benchmarking/clients/long_bench/prompts/__init__.py -------------------------------------------------------------------------------- /inference-benchmarking/config.yaml: -------------------------------------------------------------------------------- 1 | server: 2 | name: "Meta-llama3.1-8B-Instruct" 3 | model_path: "/home/ubuntu/models/Meta-llama3.1-8B-Instruct/" 4 | model_s3_path: null 5 | compiled_model_path: "/home/ubuntu/traced_models/Meta-llama3.1-8B-Instruct/" 6 | max_seq_len: 16384 7 | context_encoding_len: 16384 8 | tp_degree: 32 9 | n_vllm_threads: 32 10 | server_port: 8000 11 | continuous_batch_size: 1 12 | 13 | test: 14 | accuracy: 15 | mytest: 16 | client: "lm_eval" 17 | datasets: ["gsm8k_cot", "mmlu_flan_n_shot_generative_computer_security"] 18 | max_concurrent_requests: 1 19 | timeout: 3600 20 | client_params: 21 | limit: 200 22 | use_chat: True -------------------------------------------------------------------------------- /inference-benchmarking/requirements.txt: -------------------------------------------------------------------------------- 1 | datasets 2 | tiktoken 3 | torch 4 | openai 5 | transformers 6 | psutil 7 | botocore -------------------------------------------------------------------------------- /inference-benchmarking/server/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-neuron/aws-neuron-samples/15be8c363a3cbcf7d0795f782085a3b0b919e599/inference-benchmarking/server/__init__.py -------------------------------------------------------------------------------- /inference-benchmarking/server/scripts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-neuron/aws-neuron-samples/15be8c363a3cbcf7d0795f782085a3b0b919e599/inference-benchmarking/server/scripts/__init__.py -------------------------------------------------------------------------------- /inference-benchmarking/server/scripts/start_server.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | model_id=${1} 3 | port=${2:-8000} 4 | cores=${3:-0-31} 5 | max_seq_len=${4:-2048} 6 | cont_batch_size=${5:-32} 7 | tp_size=${6:-32} 8 | n_threads=${7:-32} 9 | file_path="${8:-/home/ubuntu/vllmlogs.log}" 10 | 11 | # Shift positional arguments out of the way before parsing named arguments 12 | shift 8 13 | set -x 14 | 15 | # Default value for override_neuron_config 16 | override_neuron_config="{}" 17 | 18 | # Parse named arguments 19 | while [[ "$#" -gt 0 ]]; do 20 | case $1 in 21 | --speculative-model) draft_model_id="$2"; shift ;; 
22 | --num-speculative-tokens) num_speculative_tokens="$2"; shift ;; 23 | --chat-template) chat_template="$2"; shift ;; 24 | --enable-chunked-prefill) enable_chunked_prefill="$2"; shift ;; 25 | --max-num-batched-tokens) max_num_batched_tokens="$2"; shift ;; 26 | --block-size) block_size="$2"; shift ;; 27 | --num-gpu-blocks-override) num_gpu_blocks_override="$2"; shift ;; 28 | --override-neuron-config) override_neuron_config="$2"; shift ;; 29 | *) echo "Unknown parameter: $1"; exit 1 ;; # Handle unknown parameters 30 | esac 31 | shift # Move to the next argument 32 | done 33 | 34 | # Build base command arguments 35 | cmd_args=( 36 | --model "${model_id}" 37 | --tensor-parallel-size "${tp_size}" 38 | --max-num-seqs "${cont_batch_size}" 39 | --max-model-len "${max_seq_len}" 40 | --port "${port}" 41 | --device "neuron" 42 | --use-v2-block-manager 43 | --disable-log-requests 44 | ) 45 | 46 | # Conditionally add speculative decoding settings when a draft model is given 47 | [ -n "$draft_model_id" ] && { 48 | echo "Setting draft model to: ${draft_model_id}" 49 | cmd_args+=(--speculative-max-model-len "${max_seq_len}") 50 | cmd_args+=(--speculative-model "${draft_model_id}") 51 | cmd_args+=(--num-speculative-tokens "${num_speculative_tokens}") 52 | } 53 | 54 | # Conditionally add chunked prefill settings 55 | [ -n "$enable_chunked_prefill" ] && { 56 | echo "Setting chunked prefill args" 57 | cmd_args+=(--enable-chunked-prefill "${enable_chunked_prefill}") 58 | cmd_args+=(--max-num-batched-tokens "${max_num_batched_tokens}") 59 | cmd_args+=(--block-size "${block_size}") 60 | cmd_args+=(--num-gpu-blocks-override "${num_gpu_blocks_override}") 61 | } 62 | 63 | # Conditionally add override config args 64 | if [[ "${override_neuron_config}" != "{}" ]]; then 65 | cmd_args+=(--override-neuron-config "${override_neuron_config}") 66 | fi 67 | 68 | [ -n "$chat_template" ] && cmd_args+=(--chat-template "${chat_template}") 69 | 70 | echo "Starting VLLM Server for model: ${model_id}" 71 | 72 | export NEURON_RT_DBG_RDH_CC=0 73 | export NEURON_RT_INSPECT_ENABLE=0 74 | export XLA_HANDLE_SPECIAL_SCALAR=1 75 | export UNSAFE_FP8FNCAST=1 76 | export VLLM_NEURON_FRAMEWORK="neuronx-distributed-inference" 77 | 78 | # Execute the command with all arguments 79 | python3 -m vllm.entrypoints.openai.api_server "${cmd_args[@]}" 2>&1 | tee "${file_path}" -------------------------------------------------------------------------------- /inference-benchmarking/server_config.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import Any, Dict, List, Optional 3 | 4 | 5 | @dataclass 6 | class ServerConfig: 7 | name: str 8 | model_path: str 9 | model_s3_path: str 10 | max_seq_len: int 11 | context_encoding_len: int 12 | tp_degree: int 13 | n_vllm_threads: int 14 | server_port: int 15 | continuous_batch_size: int = 1 16 | 17 | # Optional configurations 18 | draft_model_path: Optional[str] = None 19 | draft_model_s3_path: Optional[str] = None 20 | sharded_weights_path: Optional[str] = None 21 | sharded_weights_s3_path: Optional[str] = None 22 | spec_len: Optional[int] = None 23 | speculation_type: Optional[str] = None 24 | compiled_model_path: Optional[str] = None 25 | inference_demo_script: Optional[str] = None 26 | inference_demo_args: Optional[str] = None 27 | scratchpad_page_size: Optional[int] = None 28 | enable_scratchpad_single_core_debugging: Optional[bool] = False 29 | custom_chat_template_path: Optional[str]
= None 30 | 31 | def __post_init__(self): 32 | if self.max_seq_len <= 0: 33 | raise ValueError("max_seq_len must be positive") 34 | if self.context_encoding_len <= 0: 35 | raise ValueError("context_encoding_len must be positive") 36 | if self.tp_degree <= 0: 37 | raise ValueError("tp_degree must be positive") 38 | if self.n_vllm_threads <= 0: 39 | raise ValueError("n_vllm_threads must be positive") 40 | if self.continuous_batch_size <= 0: 41 | raise ValueError("continuous_batch_size must be positive") 42 | if self.server_port < 0 or self.server_port > 65535: 43 | raise ValueError("server_port must be between 0 and 65535") 44 | 45 | # Validate optional configurations 46 | if self.spec_len is not None and self.spec_len <= 0: 47 | raise ValueError("spec_len must be positive if specified") 48 | if self.speculation_type and self.speculation_type not in ["eagle"]: 49 | raise ValueError("speculation_type must be 'eagle' if specified") 50 | -------------------------------------------------------------------------------- /inference-benchmarking/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .process import ( 2 | check_server_terminated, 3 | find_free_port, 4 | is_port_available, 5 | kill_process_and_children, 6 | ) 7 | from .s3 import S3Utils, download_from_s3, get_instance_region 8 | 9 | __all__ = [ 10 | # S3 utilities 11 | "download_from_s3", 12 | "get_instance_region", 13 | "S3Utils", 14 | # System utilities 15 | "kill_process_and_children", 16 | "is_port_available", 17 | "find_free_port", 18 | "check_server_terminated", 19 | ] 20 | -------------------------------------------------------------------------------- /inference-benchmarking/utils/artifacts.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import subprocess 3 | from datetime import datetime 4 | from pathlib import Path 5 | from typing import Any, Dict, List, Optional 6 | 7 | 8 | 9 | from .s3 import download_from_s3 10 | 11 | 12 | class ArtifactManager: 13 | """Manages model artifacts and test artifacts""" 14 | 15 | def __init__(self, base_dir: Optional[Path] = None): 16 | self.base_dir = base_dir or Path("artifacts") 17 | self.base_dir.mkdir(parents=True, exist_ok=True) 18 | 19 | def download_model_artifacts(self, model_config: Dict[str, Any]) -> None: 20 | """Download model and related artifacts""" 21 | print(model_config) 22 | # Download main model 23 | if model_config.get("model_s3_path"): 24 | download_from_s3(model_config["model_s3_path"], model_config["model_path"]) 25 | 26 | # Download draft model if specified 27 | if model_config.get("draft_model_s3_path"): 28 | download_from_s3(model_config["draft_model_s3_path"], model_config["draft_model_path"]) 29 | 30 | # Download sharded weights if specified 31 | if model_config.get("sharded_weights_s3_path"): 32 | download_from_s3( 33 | model_config["sharded_weights_s3_path"], model_config["sharded_weights_path"] 34 | ) 35 | 36 | def save_artifacts(self, artifacts: Dict[str, Path], destination: str) -> None: 37 | """Save artifacts to specified destination""" 38 | for name, path in artifacts.items(): 39 | if path.is_file(): 40 | shutil.copy2(path, self.base_dir / destination / name) 41 | elif path.is_dir(): 42 | shutil.copytree(path, self.base_dir / destination / name) 43 | 44 | def upload_to_s3(self, local_path: Path, s3_path: str, recursive: bool = False) -> bool: 45 | """Upload artifacts to S3""" 46 | cmd = ["aws", "s3"] 47 | cmd.extend(["sync" if recursive else 
"cp"]) 48 | cmd.extend([str(local_path), s3_path]) 49 | 50 | try: 51 | subprocess.run(cmd, check=True) 52 | return True 53 | except subprocess.CalledProcessError as e: 54 | print(f"Failed to upload to S3: {e}") 55 | return False 56 | 57 | def cleanup(self, paths: List[Path]) -> None: 58 | """Cleanup artifact paths""" 59 | for path in paths: 60 | if path.is_file(): 61 | path.unlink() 62 | elif path.is_dir(): 63 | shutil.rmtree(path) 64 | -------------------------------------------------------------------------------- /inference-benchmarking/utils/parser.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import Any, Dict 3 | 4 | import yaml 5 | 6 | import sys 7 | sys.path.append("../") 8 | 9 | from accuracy import AccuracyScenario 10 | from server_config import ServerConfig 11 | 12 | 13 | @dataclass 14 | class TestConfig: 15 | accuracy: Dict[str, AccuracyScenario] = field(default_factory=dict) 16 | upload_artifacts: bool = False 17 | 18 | def __post_init__(self): 19 | # Ensure at least one type of test is configured 20 | if not self.accuracy and not self.performance: 21 | raise ValueError("At least one test type (accuracy or performance) must be configured") 22 | 23 | 24 | class ConfigParser: 25 | @staticmethod 26 | def parse_config(config_path: str) -> tuple[ServerConfig, TestConfig]: 27 | with open(config_path) as f: 28 | config = yaml.safe_load(f) 29 | 30 | # Validation happens during dataclass instantiation 31 | server_config = ServerConfig(**config["server"]) 32 | test_config = TestConfig( 33 | accuracy={ 34 | name: AccuracyScenario(**scenario_config) 35 | for name, scenario_config in config["test"].get("accuracy", {}).items() 36 | }, 37 | ) 38 | 39 | return server_config, test_config 40 | -------------------------------------------------------------------------------- /inference-benchmarking/utils/process.py: -------------------------------------------------------------------------------- 1 | import errno 2 | import os 3 | import signal 4 | import socket 5 | import time 6 | 7 | import psutil 8 | import requests 9 | 10 | 11 | def kill_process_and_children(pid): 12 | try: 13 | print(f"Terminating process with pid {pid}") 14 | parent = psutil.Process(pid) 15 | children = parent.children(recursive=True) 16 | 17 | # Send SIGTERM to parent and children 18 | for process in [parent] + children: 19 | print(f"Sending SIGTERM to process with PID: {process.pid}") 20 | process.send_signal(signal.SIGTERM) 21 | 22 | # Wait for processes to terminate 23 | gone, alive = psutil.wait_procs([parent] + children, timeout=30) 24 | 25 | # If any processes are still alive, send SIGKILL 26 | for process in alive: 27 | print(f"Process with PID: {process.pid} did not terminate, sending SIGKILL") 28 | process.send_signal(signal.SIGKILL) 29 | 30 | print( 31 | f"Successfully terminated process with PID: {pid} and its children: {[child.pid for child in children]}" 32 | ) 33 | except Exception as e: 34 | print(f"Failed to terminate process with PID: {pid}. Exception {e}") 35 | 36 | 37 | def check_server_terminated(url, retries=2, delay=30): 38 | print("Checking if server is in terminated state") 39 | for i in range(retries): 40 | try: 41 | response = requests.get(url) 42 | if response.status_code == 200: 43 | print( 44 | f"Attempt {i + 1}/{retries}: Server is not terminated yet. Re-checking in {delay} seconds..." 
45 | ) 46 | except requests.ConnectionError: 47 | print("Server is in terminated state.") 48 | return True 49 | time.sleep(delay) 50 | 51 | print("Server did not respond within the retry limit.") 52 | return False 53 | 54 | 55 | def is_port_available(port): 56 | sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 57 | try: 58 | sock.bind(("localhost", port)) 59 | return True 60 | except socket.error as e: 61 | if e.errno == errno.EADDRINUSE: 62 | return False 63 | else: 64 | # Handle other potential errors 65 | print(f"Unexpected error checking port {port}: {e}") 66 | return False 67 | finally: 68 | sock.close() 69 | 70 | 71 | def find_free_port(start_port=8000, max_port=65535): 72 | for port in range(start_port, max_port): 73 | if is_port_available(port): 74 | return port 75 | raise RuntimeError("Unable to find a free port") 76 | -------------------------------------------------------------------------------- /releasenotes.md: -------------------------------------------------------------------------------- 1 | # Change Log 2 | 3 | ## September, 15th 2023 4 | * Added notebook script to fine-tune ``deepmind/language-perceiver`` model using ``torch-neuronx``. 5 | * Added notebook script to fine-tune ``clip-large`` model using ``torch-neuronx``. 6 | * Added ``SD XL Base+Refiner`` inference sample script using ``torch-neuronx``. 7 | * Upgraded default ``diffusers`` library from 0.14.0 to latest 0.20.2 in ``Stable Diffusion 1.5`` and ``Stable Diffusion 2.1`` inference scripts. 8 | * Removed the deprecated ``--model-type=transformer-inference`` flag from ``Llama-2-13B`` model inference sample using ``transformers-neuronx`` 9 | 10 | 11 | 12 | ## August, 28th 2023 13 | * Added sample script for LLaMA V2 13B model inference using transformers-neuronx 14 | * Added samples for training GPT-NEOX 20B and 6.9B models using neuronx-distributed 15 | * Added sample scripts for CLIP and Stable Diffusion XL inference using torch-neuronx 16 | * Added sample scripts for vision and language Perceiver models inference using torch-neuronx 17 | * Added camembert training/finetuning example for Trn1 under hf_text_classification in torch-neuronx 18 | * Updated Fine-tuning Hugging Face BERT Japanese model sample in torch-neuronx 19 | * Updated OPT and GPT-J transformers-neuronx inference samples to install transformers-neuronx from whl instead of using github repo 20 | * Upgraded numpy package to 1.21.6 in GPT-2 and several training samples under hf_text_classification in torch-neuronx 21 | * Removed pinning of torch-neuron and tensorflow-neuron libraries and other minor changes in several of torch-neuron and tensorflow-neuron Inf1 inference samples. 22 | 23 | 24 | ## February, 23rd 2023 25 | * Added OPT-13B, OPT-30B, OPT-66B inference examples under transformers-neuronx 26 | * Added distilbert-base-uncased training/finetuning example for Trn1 under torch-neuronx 27 | 28 | ## November, 7th 2022 29 | 30 | * Added Fine-tuning Hugging Face BERT Japanese model sample 31 | 32 | ## November, 4th 2022 33 | * Added HuggingFace Vision Transformer (ViT) training examples for Trn1 under torch-neuronx. 34 | 35 | ## October, 27th 2022 36 | * Added HuggingFace GPT2 training examples for Trn1 under torch-neuronx. 37 | * Added 7 Pytorch training examples for Trn1 under torch-neuronx. 38 | 39 | ## October, 10th 2022 40 | 41 | * Added 20 Pytorch inference examples for Inf1 under torch-neuron. 42 | * Added 1 TensorFlow inference example for Inf1 under tensorflow-neuron.
43 | * Added 2 Pytorch inference examples for Inf1 under torch-neuronx. 44 | 45 | # Known Issues 46 | 47 | * NA 48 | 49 | -------------------------------------------------------------------------------- /tensorflow-neuron/README.md: -------------------------------------------------------------------------------- 1 | # TensorFlow Neuron (tensorflow-neuron) Samples for AWS Inf1 2 | 3 | This directory contains Jupyter notebooks that demonstrate model compilation and inference using TensorFlow Neuron for a variety of popular deep learning models. These samples can be run on [AWS Inferentia](https://aws.amazon.com/machine-learning/inferentia/) (inf1 instances) using [Amazon SageMaker](https://aws.amazon.com/sagemaker) or [Amazon EC2](https://aws.amazon.com/ec2/). 4 | 5 | For each sample you will also find additional information such as the model type, configuration used to compile the model, framework version, and a link to the original model implementation. 6 | 7 | The following samples are available: 8 | 9 | |Model Name |Model Type |Input Shape |NeuronSDK Version |Framework / Version |Original Implementation | 10 | |--- |--- |--- |--- |--- |--- | 11 | |[U-Net](inference/unet) |CV - Semantic Segmentation |1,3,224,224 |2.5.2.2.1.14.0 |Tensorflow 2.5.2 |[link](https://github.com/jakeret/unet)| 12 | 13 | 14 | ### Configuring the environment 15 | 16 | In order to run the samples, you first need to [set up a TensorFlow Neuron development environment](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-intro/get-started.html). 17 | 18 | -------------------------------------------------------------------------------- /torch-neuron/README.md: -------------------------------------------------------------------------------- 1 | # PyTorch Neuron (torch-neuron) Samples for AWS Inf1 2 | 3 | This directory contains Jupyter notebooks that demonstrate model compilation and inference using PyTorch Neuron for a variety of popular deep learning models. These samples can be run on [AWS Inferentia](https://aws.amazon.com/machine-learning/inferentia/) (inf1 instances) using [Amazon SageMaker](https://aws.amazon.com/sagemaker) or [Amazon EC2](https://aws.amazon.com/ec2/). 4 | 5 | For each sample you will also find additional information such as the model type, configuration used to compile the model, framework version, and a link to the original model implementation. 
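At a high level (an illustrative sketch, not code from any one notebook here), these samples follow the same compile-then-run pattern: trace the model with `torch.neuron.trace`, save the compiled artifact, and reload it for inference. The model, input shape, and file name below are placeholders.

```python
# Illustrative torch-neuron flow for Inf1: trace a torchvision ResNet-50
# with an example input, save it, and run inference with the traced model.
import torch
import torch_neuron  # registers the torch.neuron.trace API
from torchvision import models

model = models.resnet50(pretrained=True).eval()
example = torch.rand(1, 3, 224, 224)  # matches the ResNet input shape in the table below

# Compile: supported operators are partitioned onto the NeuronCore.
model_neuron = torch.neuron.trace(model, example_inputs=[example])
model_neuron.save("resnet50_neuron.pt")

# Inference: the saved artifact loads like any TorchScript module.
output = torch.jit.load("resnet50_neuron.pt")(example)
```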
6 | 7 | The following samples are available: 8 | 9 | |Model Name |Model Type |Input Shape |NeuronSDK Version |Framework / Version |Original Implementation | 10 | |--- |--- |--- |--- |--- |--- | 11 | |[BERT-base](inference/bertbasecased) |NLP |max_length=128 |1.10.1.2.2.0.0 |Pytorch 1.10.2 |[link](https://huggingface.co/bert-base-cased)| 12 | |[BERT-large](inference/bertlargeuncased) |NLP |max_length=128 |1.10.1.2.2.0.0 |Pytorch 1.10.2 |[link](https://huggingface.co/bert-large-uncased)| 13 | |[CRAFT](inference/craft) |CV - Text Detection |1,3,800,800 - max_length=32|1.10.2.2.3.0.0 |Pytorch 1.10.2 |[link](https://github.com/clovaai/CRAFT-pytorch)| 14 | |[EfficientNet](inference/efficientnet) |CV - Image Classification |1,3,224,224 |1.10.1.2.2.0.0 |Pytorch 1.10.1 |[link](https://pytorch.org/vision/stable/models/efficientnet.html)| 15 | |[GFL](inference/gfl_mmdet) |CV - Object Detection |1,3,800,1216 |1.10.2.2.3.0.0 |Pytorch 1.10.2 |[link](https://github.com/open-mmlab/mmdetection/blob/master/configs/gfl/README.md)| 16 | |[HRNet](inference/hrnet) |CV - Pose Estimation |1,3,384,288 |1.10.2.2.3.0.0 |Pytorch 1.10.2 |[link](https://github.com/leoxiaobin/deep-high-resolution-net.pytorch.git)| 17 | |[MarianMT](inference/marianmt) |NLP |max_length=32 |1.7.\*|Pytorch 1.7|[link](https://huggingface.co/Helsinki-NLP/opus-mt-en-de)| 18 | |[R-CNN](inference/rcnn) |CV - Image Classification, Detection, and Segmentation |1,3,800,800 |1.11.0.2.5.2.0 |Pytorch 1.11.0 |[link](https://github.com/facebookresearch/detectron2)| 19 | |[ResNet (18,34,50,101,152)](inference/resnet)|CV - Image Classification |1,3,224,224 |1.10.1.2.2.0.0 |Pytorch 1.10.1 |[link](https://pytorch.org/vision/stable/models/resnet.html)| 20 | |[ResNeXt](inference/resnext) |CV - Image Classification |1,3,224,224 |1.10.1.2.2.0.0 |Pytorch 1.10.1 |[link](https://pytorch.org/vision/stable/models/resnext.html)| 21 | |[Roberta-base](inference/robertabase) |NLP |max_length=128|1.10.1.2.2.0.0 |Pytorch 1.10.2|[link](https://huggingface.co/roberta-base)| 22 | |[SSD (SSD300-VGG16)](inference/ssd) |CV - Object detection |1,3,300,300 |1.10.2.2.3.0.0 |Pytorch 1.10.2 |[link](https://pytorch.org/vision/stable/models/ssd.html)| 23 | |[TrOCR](inference/trocr) |CV - OCR |1,3,384,384 |1.10.2.2.3.0.0 |Pytorch 1.10.2 |[link](https://huggingface.co/docs/transformers/en/model_doc/trocr)| 24 | |[VGG16](inference/vgg) |CV - Image Classification |1,3,224,224 |1.10.1.2.2.0.0 |Pytorch 1.10.1 |[link](https://pytorch.org/vision/stable/models/vgg.html)| 25 | |[ViT](inference/vit) |CV - Image Classification |1,3,224,224 |1.10.2.2.3.0.0 |Pytorch 1.10.2 |[link](https://huggingface.co/docs/transformers/model_doc/vit)| 26 | |[YOLOv5](inference/yolov5) |CV - Object Detection |1,3,640,640 |1.10.1.2.2.0.0 |Pytorch 1.10.1 |[link](https://github.com/ultralytics/yolov5/releases/tag/v5.0)| 27 | |[YOLOv6](inference/yolov6) |CV - Object Detection |1,3,640,640 |1.11.0.2.3.0.0 |Pytorch 1.11.0 |[link](https://github.com/meituan/YOLOv6.git)| 28 | |[YOLOv7](inference/yolov7) |CV - Object Detection+Pose Estimation |1,3,960,960 |1.10.1.2.2.0.0 |Pytorch 1.10.1 |[link](https://github.com/WongKinYiu/yolov7)| 29 | |[YOLOF](inference/yolof_detectron2) |CV - Object Detection |1,3,300,300 |1.10.1.2.2.0.0 |Pytorch 1.10.1 |[link](https://github.com/chensnathan/YOLOF)| 30 | |[Fairseq](inference/fairseq) |NLP|max_length=32|1.10.1.*|Pytorch 1.10.1 |[link](https://github.com/facebookresearch/fairseq)| 31 | 32 | ### Configuring the environment 33 | 34 | In order to run the samples, you first need to [set up a
-------------------------------------------------------------------------------- /torch-neuronx/inference/customop_mlp/README.md: -------------------------------------------------------------------------------- 1 | # Torch Neuron CustomOp MLP 2 | 3 | This folder contains inference examples of Torch custom operators for a multi-layer perceptron (MLP) model. 4 | 5 | - The `neuron` folder contains an MLP model with ReLU implemented as a CustomOp using the element-wise accessor. 6 | - The `neuron-tcm` folder contains the same model, but ReLU is implemented using the TCM accessor. 7 | - The `neuron-multicore` folder contains the same model, but ReLU is implemented using the TCM accessor and the multicore capability. --------------------------------------------------------------------------------
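Each variant follows the same two-step flow: running `build.py` once compiles `relu.cpp`/`shape.cpp` into `librelu.so`, and `inference.py` then loads the op through `my_ops.py`. A minimal sketch of that flow, assuming the folder layout above and an instance with an attached NeuronCore (illustrative only, not an additional sample):

```python
# Sketch: build the CustomOp once, then call it on an XLA (Neuron) device.
import subprocess

# Step 1: compile relu.cpp/shape.cpp into librelu.so in the current directory.
subprocess.run(["python", "build.py"], check=True)

# Step 2: my_ops.py loads librelu.so and wraps the op in an autograd.Function.
import torch
import my_ops

x = torch.randn(4, 28 * 28).to('xla')  # move input to a NeuronCore
y = my_ops.Relu.apply(x)               # dispatches to torch.ops.my_ops.relu_forward
print(y.cpu())
```

--------------------------------------------------------------------------------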
/torch-neuronx/inference/customop_mlp/neuron-multicore/build.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch_neuronx 3 | from torch_neuronx.xla_impl import custom_op 4 | 5 | custom_op.load( 6 | name='relu', 7 | compute_srcs=['relu.cpp'], 8 | shape_srcs=['shape.cpp'], 9 | build_directory=os.getcwd(), 10 | multicore=True, 11 | verbose=True 12 | ) 13 | -------------------------------------------------------------------------------- /torch-neuronx/inference/customop_mlp/neuron-multicore/inference.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import torch 4 | from model import MLP 5 | 6 | from torchvision.datasets import mnist 7 | from torch.utils.data import DataLoader 8 | from torchvision.transforms import ToTensor 9 | 10 | # XLA imports 11 | import torch_xla.core.xla_model as xm 12 | 13 | # Global constants 14 | EPOCHS = 4 15 | WARMUP_STEPS = 2 16 | BATCH_SIZE = 32 17 | 18 | # Load MNIST inference dataset 19 | inf_dataset = mnist.MNIST(root='./MNIST_DATA_inf', 20 | train=False, download=True, transform=ToTensor()) 21 | 22 | def main(): 23 | # Prepare data loader 24 | inf_loader = DataLoader(inf_dataset, batch_size=BATCH_SIZE) 25 | 26 | # Fix the random number generator seeds for reproducibility 27 | torch.manual_seed(0) 28 | 29 | # XLA: Specify XLA device (defaults to a NeuronCore on Trn1 instance) 30 | device = 'xla' 31 | 32 | # Initialize with random weights and move model to device 33 | model = MLP() 34 | torch.nn.init.xavier_normal_(model.fc1.weight) 35 | torch.nn.init.xavier_normal_(model.fc2.weight) 36 | torch.nn.init.xavier_normal_(model.fc3.weight) 37 | model = model.to(device) 38 | 39 | # Run the inference loop 40 | print('---------- Inference ---------------') 41 | model.eval() 42 | for _ in range(EPOCHS): 43 | start = time.time() 44 | for idx, (inf_x, _) in enumerate(inf_loader): 45 | inf_x = inf_x.view(inf_x.size(0), -1) 46 | inf_x = inf_x.to(device) 47 | output = model(inf_x) 48 | xm.mark_step() # XLA: collect ops and run them in XLA runtime 49 | if idx < WARMUP_STEPS: # skip warmup iterations 50 | start = time.time() 51 | # Compute statistics for this epoch 52 | interval = idx - WARMUP_STEPS # skip warmup iterations 53 | throughput = interval / (time.time() - start) 54 | print("Inf throughput (iter/sec): {}".format(throughput)) 55 | 56 | print('----------End Inference ---------------') 57 | 58 | if __name__ == '__main__': 59 | main() 60 | 61 | -------------------------------------------------------------------------------- /torch-neuronx/inference/customop_mlp/neuron-multicore/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.nn import functional as F 4 | import my_ops 5 | 6 | # Declare 3-layer MLP for MNIST dataset 7 | class MLP(nn.Module): 8 | def __init__(self, input_size = 28 * 28, output_size = 10, layers = [4096, 2048]): 9 | super(MLP, self).__init__() 10 | self.fc1 = nn.Linear(input_size, layers[0]) 11 | self.fc2 = nn.Linear(layers[0], layers[1]) 12 | self.fc3 = nn.Linear(layers[1], output_size) 13 | 14 | def forward(self, x): 15 | f1 = self.fc1(x) 16 | r1 = my_ops.Relu.apply(f1) 17 | f2 = self.fc2(r1) 18 | r2 = my_ops.Relu.apply(f2) 19 | f3 = self.fc3(r2) 20 | return torch.log_softmax(f3, dim=1) 21 | -------------------------------------------------------------------------------- /torch-neuronx/inference/customop_mlp/neuron-multicore/my_ops.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch_neuronx 3 | from torch_neuronx.xla_impl import custom_op 4 | 5 | custom_op.load_library('librelu.so') 6 | 7 | class Relu(torch.autograd.Function): 8 | @staticmethod 9 | def forward(ctx, input): 10 | ctx.save_for_backward(input) 11 | return torch.ops.my_ops.relu_forward(input) 12 | -------------------------------------------------------------------------------- /torch-neuronx/inference/customop_mlp/neuron-multicore/relu.cpp: -------------------------------------------------------------------------------- 1 | #include <stdint.h> 2 | #include <stdlib.h> 3 | #include <torch/torch.h> 4 | #include <neuron/neuron-utils.hpp> 5 | 6 | torch::Tensor relu_forward(const torch::Tensor& t_in) { 7 | size_t num_elem = t_in.numel(); 8 | torch::Tensor t_out = get_dst_tensor(); 9 | 10 | uint32_t cpu_id = get_cpu_id(); 11 | uint32_t cpu_count = get_cpu_count(); 12 | uint32_t partition = num_elem / cpu_count; 13 | size_t start_offset = (size_t)partition * cpu_id; // compute this core's offset before the last core's partition is enlarged 14 | if (cpu_id == cpu_count - 1) { 15 | partition = num_elem - partition * (cpu_count - 1); 16 | } 17 | 18 | static constexpr size_t buffer_size = 1024; 19 | float *tcm_buffer = (float*)torch::neuron::tcm_malloc(sizeof(float) * buffer_size); 20 | 21 | if (tcm_buffer != nullptr) { 22 | auto t_in_tcm_acc = t_in.tcm_accessor(); 23 | auto t_out_tcm_acc = t_out.tcm_accessor(); 24 | 25 | for (size_t i = 0; i < partition; i += buffer_size) { 26 | size_t remaining_elem = partition - i; 27 | size_t copy_size = (remaining_elem > buffer_size) ? buffer_size : remaining_elem; 28 | 29 | t_in_tcm_acc.tensor_to_tcm(tcm_buffer, start_offset + i, copy_size); 30 | for (size_t j = 0; j < copy_size; j++) { 31 | tcm_buffer[j] = tcm_buffer[j] > 0.0 ? tcm_buffer[j] : 0.0; 32 | } 33 | t_out_tcm_acc.tcm_to_tensor(tcm_buffer, start_offset + i, copy_size); 34 | } 35 | } 36 | torch::neuron::tcm_free(tcm_buffer); 37 | return t_out; 38 | } 39 | -------------------------------------------------------------------------------- /torch-neuronx/inference/customop_mlp/neuron-multicore/shape.cpp: -------------------------------------------------------------------------------- 1 | #include <stdint.h> 2 | #include <stdlib.h> 3 | #include <torch/torch.h> 4 | #include "torchneuron/register.h" 5 | 6 | torch::Tensor relu_fwd_shape(torch::Tensor t_in) { 7 | torch::Tensor t_out = torch::zeros(t_in.sizes(), torch::kFloat); 8 | return t_out; 9 | } 10 | 11 | NEURON_LIBRARY(my_ops, m) { 12 | m.def("relu_forward", &relu_fwd_shape, "relu_forward"); 13 | } 14 | --------------------------------------------------------------------------------
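In the multicore variant, `relu_forward` runs on every core: `get_cpu_id()`/`get_cpu_count()` select each core's slice of the flattened tensor, and the last core absorbs the remainder. The slicing arithmetic, checked in plain Python with hypothetical sizes:

```python
# Sketch: per-core partitioning used by the multicore relu.cpp (illustrative sizes).
num_elem, cpu_count = 32 * 4096 + 5, 8          # deliberately not divisible
base = num_elem // cpu_count                    # slice size for cores 0..N-2
for cpu_id in range(cpu_count):
    start = base * cpu_id                       # each core's fixed starting offset
    size = base if cpu_id < cpu_count - 1 else num_elem - base * (cpu_count - 1)
    assert start + size <= num_elem
print(base, num_elem - base * (cpu_count - 1))  # 16384 elements/core, 16389 on the last
```

--------------------------------------------------------------------------------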
/torch-neuronx/inference/customop_mlp/neuron-tcm/build.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch_neuronx 3 | from torch_neuronx.xla_impl import custom_op 4 | 5 | custom_op.load( 6 | name='relu', 7 | compute_srcs=['relu.cpp'], 8 | shape_srcs=['shape.cpp'], 9 | build_directory=os.getcwd(), 10 | verbose=True 11 | ) 12 | -------------------------------------------------------------------------------- /torch-neuronx/inference/customop_mlp/neuron-tcm/inference.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import torch 4 | from model import MLP 5 | 6 | from torchvision.datasets import mnist 7 | from torch.utils.data import DataLoader 8 | from torchvision.transforms import ToTensor 9 | 10 | # XLA imports 11 | import torch_xla.core.xla_model as xm 12 | 13 | # Global constants 14 | EPOCHS = 4 15 | WARMUP_STEPS = 2 16 | BATCH_SIZE = 32 17 | 18 | # Load MNIST inference dataset 19 | inf_dataset = mnist.MNIST(root='./MNIST_DATA_inf', 20 | train=False, download=True, transform=ToTensor()) 21 | 22 | def main(): 23 | # Prepare data loader 24 | inf_loader = DataLoader(inf_dataset, batch_size=BATCH_SIZE) 25 | 26 | # Fix the random number generator seeds for reproducibility 27 | torch.manual_seed(0) 28 | 29 | # XLA: Specify XLA device (defaults to a NeuronCore on Trn1 instance) 30 | device = 'xla' 31 | 32 | # Initialize with random weights and move model to device 33 | model = MLP() 34 | torch.nn.init.xavier_normal_(model.fc1.weight) 35 | torch.nn.init.xavier_normal_(model.fc2.weight) 36 | torch.nn.init.xavier_normal_(model.fc3.weight) 37 | model = model.to(device) 38 | 39 | # Run the inference loop 40 | print('---------- Inference ---------------') 41 | model.eval() 42 | for _ in range(EPOCHS): 43 | start = time.time() 44 | for idx, (inf_x, _) in enumerate(inf_loader): 45 | inf_x = inf_x.view(inf_x.size(0), -1) 46 | inf_x = inf_x.to(device) 47 | output = model(inf_x) 48 | xm.mark_step() # XLA: collect ops and run them in XLA runtime 49 | if idx < WARMUP_STEPS: # skip warmup iterations 50 | start = time.time() 51 | # Compute statistics for this epoch 52 | interval = idx - WARMUP_STEPS # skip warmup iterations 53 | throughput = interval / (time.time() - start) 54 | print("Inf throughput (iter/sec): {}".format(throughput)) 55 | 56 | print('----------End Inference ---------------') 57 | 58 | if __name__ == '__main__': 59 | main() 60 | 61 | -------------------------------------------------------------------------------- /torch-neuronx/inference/customop_mlp/neuron-tcm/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.nn import functional as F 4 | import my_ops 5 | 6 | # Declare 3-layer MLP for MNIST dataset 7 | class MLP(nn.Module): 8 | def __init__(self, input_size = 28 * 28, output_size = 10, layers = [4096, 2048]): 9 | super(MLP, self).__init__() 10 | self.fc1 = nn.Linear(input_size, layers[0]) 11 | self.fc2 = nn.Linear(layers[0], layers[1]) 12 | self.fc3 = nn.Linear(layers[1], output_size) 13 | 14 | def forward(self, x): 15 | f1 = self.fc1(x) 16 | r1 = my_ops.Relu.apply(f1) 17 | f2 = self.fc2(r1) 18 | r2 = my_ops.Relu.apply(f2) 19 | f3 = self.fc3(r2) 20 | return torch.log_softmax(f3, dim=1) 21 | -------------------------------------------------------------------------------- /torch-neuronx/inference/customop_mlp/neuron-tcm/my_ops.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch_neuronx 3 | from torch_neuronx.xla_impl import custom_op 4 | 5 | custom_op.load_library('librelu.so') 6 | 7 | class Relu(torch.autograd.Function): 8 | @staticmethod 9 | def forward(ctx, input): 10 | ctx.save_for_backward(input) 11 | return torch.ops.my_ops.relu_forward(input) 12 | -------------------------------------------------------------------------------- /torch-neuronx/inference/customop_mlp/neuron-tcm/relu.cpp: -------------------------------------------------------------------------------- 1 | #include <stdint.h> 2 | #include <stdlib.h> 3 | #include <torch/torch.h> 4 | #include <neuron/neuron-utils.hpp> 5 | 6 | torch::Tensor relu_forward(const torch::Tensor& t_in) { 7 | size_t num_elem = t_in.numel(); 8 | torch::Tensor t_out = torch::zeros(t_in.sizes(), torch::kFloat); 9 | 10 | static constexpr size_t buffer_size = 1024; 11 | float *tcm_buffer = (float*)torch::neuron::tcm_malloc(sizeof(float) * buffer_size); 12 | 13 | if (tcm_buffer != nullptr) { 14 | auto t_in_tcm_acc = t_in.tcm_accessor(); 15 | auto t_out_tcm_acc = t_out.tcm_accessor(); 16 | 17 | for (size_t i = 0; i < num_elem; i += buffer_size) { 18 | size_t remaining_elem = num_elem - i; 19 | size_t copy_size = (remaining_elem > buffer_size) ? buffer_size : remaining_elem; 20 | 21 | t_in_tcm_acc.tensor_to_tcm(tcm_buffer, i, copy_size); 22 | for (size_t j = 0; j < copy_size; j++) { 23 | tcm_buffer[j] = tcm_buffer[j] > 0.0 ? tcm_buffer[j] : 0.0; 24 | } 25 | t_out_tcm_acc.tcm_to_tensor(tcm_buffer, i, copy_size); 26 | } 27 | } 28 | torch::neuron::tcm_free(tcm_buffer); 29 | return t_out; 30 | } 31 | -------------------------------------------------------------------------------- /torch-neuronx/inference/customop_mlp/neuron-tcm/shape.cpp: -------------------------------------------------------------------------------- 1 | #include <stdint.h> 2 | #include <stdlib.h> 3 | #include <torch/torch.h> 4 | #include "torchneuron/register.h" 5 | 6 | torch::Tensor relu_fwd_shape(torch::Tensor t_in) { 7 | torch::Tensor t_out = torch::zeros(t_in.sizes(), torch::kFloat); 8 | return t_out; 9 | } 10 | 11 | NEURON_LIBRARY(my_ops, m) { 12 | m.def("relu_forward", &relu_fwd_shape, "relu_forward"); 13 | } 14 | --------------------------------------------------------------------------------
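Both TCM variants stream the tensor through a fixed 1024-float scratch buffer: copy a chunk into TCM, apply ReLU in place, copy it back. The chunking logic, sketched in plain Python with a NumPy stand-in for the TCM round-trip (the buffer size is taken from relu.cpp; everything else is illustrative):

```python
# Sketch: the tensor->TCM->tensor chunk loop from relu.cpp, in NumPy.
import numpy as np

buffer_size = 1024                        # floats per TCM round-trip, as in relu.cpp
x = np.random.randn(3000).astype(np.float32)
out = np.empty_like(x)
for i in range(0, x.size, buffer_size):
    chunk = x[i:i + buffer_size].copy()   # tensor_to_tcm
    np.maximum(chunk, 0.0, out=chunk)     # in-place ReLU on the TCM buffer
    out[i:i + buffer_size] = chunk        # tcm_to_tensor
assert (out == np.maximum(x, 0.0)).all()
```

--------------------------------------------------------------------------------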
/torch-neuronx/inference/customop_mlp/neuron/build.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch_neuronx 3 | from torch_neuronx.xla_impl import custom_op 4 | 5 | custom_op.load( 6 | name='relu', 7 | compute_srcs=['relu.cpp'], 8 | shape_srcs=['shape.cpp'], 9 | build_directory=os.getcwd() 10 | ) 11 | -------------------------------------------------------------------------------- /torch-neuronx/inference/customop_mlp/neuron/inference.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import torch 4 | from model import MLP 5 | 6 | from torchvision.datasets import mnist 7 | from torch.utils.data import DataLoader 8 | from torchvision.transforms import ToTensor 9 | 10 | # XLA imports 11 | import torch_xla.core.xla_model as xm 12 | 13 | # Global constants 14 | EPOCHS = 4 15 | WARMUP_STEPS = 2 16 | BATCH_SIZE = 32 17 | 18 | # Load MNIST inference dataset 19 | inf_dataset = mnist.MNIST(root='./MNIST_DATA_inf', 20 | train=False, download=True, transform=ToTensor()) 21 | 22 | def main(): 23 | # Prepare data loader 24 | inf_loader = DataLoader(inf_dataset, batch_size=BATCH_SIZE) 25 | 26 | # Fix the random number generator seeds for reproducibility 27 | torch.manual_seed(0) 28 | 29 | # XLA: Specify XLA device (defaults to a NeuronCore on Trn1 instance) 30 | device = 'xla' 31 | 32 | # Initialize with random weights and move model to device 33 | model = MLP() 34 | torch.nn.init.xavier_normal_(model.fc1.weight) 35 | torch.nn.init.xavier_normal_(model.fc2.weight) 36 | torch.nn.init.xavier_normal_(model.fc3.weight) 37 | model = model.to(device) 38 | 39 | # Run the inference loop 40 | print('---------- Inference ---------------') 41 | model.eval() 42 | for _ in range(EPOCHS): 43 | start = time.time() 44 | for idx, (inf_x, _) in enumerate(inf_loader): 45 | inf_x = inf_x.view(inf_x.size(0), -1) 46 | inf_x = inf_x.to(device) 47 | output = model(inf_x) 48 | xm.mark_step() # XLA: collect ops and run them in XLA runtime 49 | if idx < WARMUP_STEPS: # skip warmup iterations 50 | start = time.time() 51 | # Compute statistics for this epoch 52 | interval = idx - WARMUP_STEPS # skip warmup iterations 53 | throughput = interval / (time.time() - start) 54 | print("Inf throughput (iter/sec): {}".format(throughput)) 55 | 56 | print('----------End Inference ---------------') 57 | 58 | if __name__ == '__main__': 59 | main() 60 | 61 | -------------------------------------------------------------------------------- /torch-neuronx/inference/customop_mlp/neuron/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.nn import functional as F 4 | import my_ops 5 | 6 | # Declare 3-layer MLP for MNIST dataset 7 | class MLP(nn.Module): 8 | def __init__(self, input_size = 28 * 28, output_size = 10, layers = [4096, 2048]): 9 | super(MLP, self).__init__() 10 | self.fc1 = nn.Linear(input_size, layers[0]) 11 | self.fc2 = nn.Linear(layers[0], layers[1]) 12 | self.fc3 = nn.Linear(layers[1], output_size) 13 | 14 | def forward(self, x): 15 | f1 = self.fc1(x) 16 | r1 = my_ops.Relu.apply(f1) 17 | f2 = self.fc2(r1) 18 | r2 = my_ops.Relu.apply(f2) 19 | f3 = self.fc3(r2) 20 | return torch.log_softmax(f3, dim=1) 21 | -------------------------------------------------------------------------------- /torch-neuronx/inference/customop_mlp/neuron/my_ops.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch_neuronx 3 | from torch_neuronx.xla_impl import custom_op 4 | 5 | custom_op.load_library('librelu.so') 6 | 7 | class Relu(torch.autograd.Function): 8 | @staticmethod 9 | def forward(ctx, input): 10 | ctx.save_for_backward(input) 11 | return torch.ops.my_ops.relu_forward(input) 12 | 13 | @staticmethod 14 | def backward(ctx, grad): 15 | input, = ctx.saved_tensors 16 | return torch.ops.my_ops.relu_backward(grad, input) # one gradient per forward input 17 | 18 | 19 | --------------------------------------------------------------------------------
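Because `my_ops.Relu` is a standard `torch.autograd.Function`, its forward/backward contract can be prototyped entirely on CPU before involving the Neuron build. A pure-PyTorch stand-in (illustrative; the real op dispatches to `torch.ops.my_ops.*`), verified with `gradcheck`:

```python
# Sketch: CPU stand-in for the CustomOp, verified with gradcheck.
import torch

class ReluCPU(torch.autograd.Function):
    @staticmethod
    def forward(ctx, input):
        ctx.save_for_backward(input)
        return input.clamp(min=0.0)   # mirrors relu_forward

    @staticmethod
    def backward(ctx, grad):
        input, = ctx.saved_tensors
        return grad * (input > 0)     # mirrors relu_backward

x = torch.randn(4, 6, dtype=torch.double, requires_grad=True)
assert torch.autograd.gradcheck(ReluCPU.apply, (x,))
```

--------------------------------------------------------------------------------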
/torch-neuronx/inference/customop_mlp/neuron/relu.cpp: -------------------------------------------------------------------------------- 1 | #include <stdint.h> 2 | #include <stdlib.h> 3 | #include <torch/torch.h> 4 | 5 | torch::Tensor relu_forward(const torch::Tensor& t_in) { 6 | torch::Tensor t_out = torch::zeros(t_in.sizes(), torch::kFloat); 7 | auto t_in_acc = t_in.accessor<float, 2>(); 8 | auto t_out_acc = t_out.accessor<float, 2>(); 9 | auto shape = t_in.sizes(); 10 | for (int i = 0; i < shape[0]; i++) { 11 | for (int j = 0; j < shape[1]; j++) { 12 | t_out_acc[i][j] = t_in_acc[i][j] > 0.0 ? t_in_acc[i][j] : 0.0; 13 | } 14 | } 15 | return t_out; 16 | } 17 | 18 | torch::Tensor relu_backward(const torch::Tensor& t_grad, const torch::Tensor& t_in) { 19 | torch::Tensor t_out = torch::zeros(t_in.sizes(), torch::kFloat); 20 | auto t_in_acc = t_in.accessor<float, 2>(); 21 | auto t_grad_acc = t_grad.accessor<float, 2>(); 22 | auto t_out_acc = t_out.accessor<float, 2>(); 23 | auto shape = t_in.sizes(); 24 | for (int i = 0; i < shape[0]; i++) { 25 | for (int j = 0; j < shape[1]; j++) { 26 | t_out_acc[i][j] = t_in_acc[i][j] > 0.0 ? t_grad_acc[i][j] : 0.0; 27 | } 28 | } 29 | return t_out; 30 | } 31 | -------------------------------------------------------------------------------- /torch-neuronx/inference/customop_mlp/neuron/shape.cpp: -------------------------------------------------------------------------------- 1 | #include <stdint.h> 2 | #include <stdlib.h> 3 | #include <torch/torch.h> 4 | #include "torchneuron/register.h" 5 | 6 | torch::Tensor relu_fwd_shape(torch::Tensor t_in) { 7 | torch::Tensor t_out = torch::zeros(t_in.sizes(), torch::kFloat); 8 | return t_out; 9 | } 10 | 11 | torch::Tensor relu_bwd_shape(torch::Tensor t_grad, torch::Tensor t_in) { 12 | torch::Tensor t_out = torch::zeros(t_in.sizes(), torch::kFloat); 13 | return t_out; 14 | } 15 | 16 | NEURON_LIBRARY(my_ops, m) { 17 | m.def("relu_forward", &relu_fwd_shape, "relu_forward"); 18 | m.def("relu_backward", &relu_bwd_shape, "relu_backward"); 19 | } 20 | --------------------------------------------------------------------------------
/torch-neuronx/inference/hf_pretrained_pixart_sigma_1k/compile_latency_optimized.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export PYTHONPATH=$(pwd):$PYTHONPATH 4 | 5 | echo "compiling text encoder" 6 | python neuron_pixart_sigma/compile_text_encoder.py \ 7 | --compiled_models_dir "compile_workdir_latency_optimized" 8 | 9 | echo "compiling transformer" 10 | python neuron_pixart_sigma/compile_transformer_latency_optimized.py \ 11 | --compiled_models_dir "compile_workdir_latency_optimized" 12 | 13 | echo "compiling decoder" 14 | python neuron_pixart_sigma/compile_decoder.py \ 15 | --compiled_models_dir "compile_workdir_latency_optimized" -------------------------------------------------------------------------------- /torch-neuronx/inference/hf_pretrained_pixart_sigma_1k/compile_throughput_optimized.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export PYTHONPATH=$(pwd):$PYTHONPATH 4 | 5 | echo "compiling text encoder" 6 | python neuron_pixart_sigma/compile_text_encoder.py \ 7 | --compiled_models_dir "compile_workdir_throughput_optimized" 8 | 9 | echo "compiling transformer" 10 | python neuron_pixart_sigma/compile_transformer_throughput_optimized.py \ 11 | --compiled_models_dir "compile_workdir_throughput_optimized" 12 | 13 | echo "compiling decoder" 14 | python neuron_pixart_sigma/compile_decoder.py \ 15 | --compiled_models_dir "compile_workdir_throughput_optimized" -------------------------------------------------------------------------------- /torch-neuronx/inference/hf_pretrained_pixart_sigma_1k/neuron_pixart_sigma/cache_hf_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from diffusers import PixArtSigmaPipeline 3 | 4 | pipe: PixArtSigmaPipeline = PixArtSigmaPipeline.from_pretrained( 5 | "PixArt-alpha/PixArt-Sigma-XL-2-1024-MS", 6 | torch_dtype=torch.bfloat16, 7 | cache_dir="pixart_sigma_hf_cache_dir_1024") --------------------------------------------------------------------------------
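The compile scripts produce three sets of artifacts: a traced decoder and post_quant_conv saved with `torch.jit.save`, and tensor-parallel text encoder and transformer saved with `neuronx_distributed.trace.parallel_model_save`. A minimal loading sketch, assuming the default `compiled_models` directory and that `parallel_model_load` is available as the counterpart of `parallel_model_save` (the notebooks in this folder show the full pipeline wiring):

```python
# Sketch: reloading the compiled PixArt pieces; paths match the defaults above.
import torch
import neuronx_distributed

base = "compiled_models"
decoder = torch.jit.load(f"{base}/decoder/model.pt")
post_quant_conv = torch.jit.load(f"{base}/post_quant_conv/model.pt")
# Tensor-parallel artifacts saved with parallel_model_save load as a unit:
text_encoder = neuronx_distributed.trace.parallel_model_load(f"{base}/text_encoder")
transformer = neuronx_distributed.trace.parallel_model_load(f"{base}/transformer")
```

--------------------------------------------------------------------------------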
/torch-neuronx/inference/hf_pretrained_pixart_sigma_1k/neuron_pixart_sigma/compile_decoder.py: --------------------------------------------------------------------------------
1 | import os 2 | os.environ["NEURON_FUSE_SOFTMAX"] = "1" 3 | os.environ["NEURON_CUSTOM_SILU"] = "1" 4 | os.environ["NEURON_RT_VIRTUAL_CORE_SIZE"] = "2" # Comment this line out if using trn1/inf2 5 | os.environ["NEURON_LOGICAL_NC_CONFIG"] = "2" # Comment this line out if using trn1/inf2 6 | compiler_flags = """ --verbose=INFO --target=trn2 --lnc=2 --model-type=unet-inference --enable-fast-loading-neuron-binaries """ # Use these compiler flags for trn2 7 | # compiler_flags = """ --verbose=INFO --target=trn1 --model-type=unet-inference --enable-fast-loading-neuron-binaries """ # Use these compiler flags for trn1/inf2 8 | os.environ["NEURON_CC_FLAGS"] = os.environ.get("NEURON_CC_FLAGS", "") + compiler_flags 9 | 10 | from diffusers import PixArtSigmaPipeline 11 | import torch 12 | import argparse 13 | import torch_neuronx 14 | from diffusers.models.autoencoders.vae import Decoder 15 | from neuron_commons import attention_wrapper, f32Wrapper 16 | 17 | torch.nn.functional.scaled_dot_product_attention = attention_wrapper 18 | 19 | def upcast_norms_to_f32(decoder: Decoder): 20 | for upblock in decoder.up_blocks: 21 | for resnet in upblock.resnets: 22 | orig_resnet_norm1 = resnet.norm1 23 | orig_resnet_norm2 = resnet.norm2 24 | resnet.norm1 = f32Wrapper(orig_resnet_norm1) 25 | resnet.norm2 = f32Wrapper(orig_resnet_norm2) 26 | for attn in decoder.mid_block.attentions: 27 | orig_group_norm = attn.group_norm 28 | attn.group_norm = f32Wrapper(orig_group_norm) 29 | for resnet in decoder.mid_block.resnets: 30 | orig_resnet_norm1 = resnet.norm1 31 | orig_resnet_norm2 = resnet.norm2 32 | resnet.norm1 = f32Wrapper(orig_resnet_norm1) 33 | resnet.norm2 = f32Wrapper(orig_resnet_norm2) 34 | orig_conv_norm_out = decoder.conv_norm_out 35 | decoder.conv_norm_out = f32Wrapper(orig_conv_norm_out) 36 | 37 | def compile_decoder(args): 38 | latent_height = args.height//8 39 | latent_width = args.width//8 40 | compiler_workdir = args.compiler_workdir 41 | compiled_models_dir = args.compiled_models_dir 42 | 43 | batch_size = 1 44 | dtype = torch.bfloat16 45 | pipe: PixArtSigmaPipeline = PixArtSigmaPipeline.from_pretrained( 46 | "PixArt-alpha/PixArt-Sigma-XL-2-1024-MS", 47 | cache_dir="pixart_sigma_hf_cache_dir_1024", 48 | local_files_only=True, 49 | torch_dtype=dtype) 50 | 51 | decoder: Decoder = pipe.vae.decoder 52 | decoder.eval() 53 | upcast_norms_to_f32(decoder) 54 | 55 | with torch.no_grad(): 56 | sample_inputs = torch.rand((batch_size, 4, latent_height, latent_width), dtype=dtype) 57 | compiled_decoder = torch_neuronx.trace( 58 | decoder, 59 | sample_inputs, 60 | compiler_workdir=f"{compiler_workdir}/decoder", 61 | compiler_args=compiler_flags, 62 | inline_weights_to_neff=False) 63 | 64 | compiled_model_dir = f"{compiled_models_dir}/decoder" 65 | if not os.path.exists(compiled_model_dir): 66 | os.makedirs(compiled_model_dir) 67 | torch.jit.save(compiled_decoder, f"{compiled_model_dir}/model.pt") 68 | 69 | compiled_post_quant_conv = torch_neuronx.trace( 70 | pipe.vae.post_quant_conv, 71 | sample_inputs, 72 | compiler_workdir=f"{compiler_workdir}/post_quant_conv", 73 | compiler_args=compiler_flags, 74 | inline_weights_to_neff=False) 75 | 76 | compiled_model_dir = f"{compiled_models_dir}/post_quant_conv" 77 | if not os.path.exists(compiled_model_dir): 78 | os.makedirs(compiled_model_dir) 79 | torch.jit.save(compiled_post_quant_conv, f"{compiled_model_dir}/model.pt") 80 | 81 | if __name__ == "__main__": 82 | parser = argparse.ArgumentParser() 83 | parser.add_argument("--height", help="height of generated image.", type=int, default=1024) 84 | parser.add_argument("--width", help="width of generated image.", type=int, default=1024) 85 | parser.add_argument("--num_images_per_prompt", help="number of images per prompt.", type=int, default=1) 86 | parser.add_argument("--compiler_workdir", help="dir for compiler artifacts.",
type=str, default="compiler_workdir") 87 | parser.add_argument("--compiled_models_dir", help="dir for compiled artifacts.", type=str, default="compiled_models") 88 | args = parser.parse_args() 89 | compile_decoder(args) -------------------------------------------------------------------------------- /torch-neuronx/inference/hf_pretrained_pixart_sigma_1k/neuron_pixart_sigma/compile_text_encoder.py: -------------------------------------------------------------------------------- 1 | import os 2 | os.environ["NEURON_FUSE_SOFTMAX"] = "1" 3 | os.environ["NEURON_CUSTOM_SILU"] = "1" 4 | os.environ["NEURON_RT_VIRTUAL_CORE_SIZE"] = "2" # Comment this line out if using trn1/inf2 5 | os.environ["NEURON_LOGICAL_NC_CONFIG"] = "2" # Comment this line out if using trn1/inf2 6 | compiler_flags = """ --verbose=INFO --target=trn2 --lnc=2 --model-type=unet-inference --enable-fast-loading-neuron-binaries """ # Use these compiler flags for trn2 7 | # compiler_flags = """ --verbose=INFO --target=trn1 --model-type=unet-inference --enable-fast-loading-neuron-binaries """ # Use these compiler flags for trn1/inf2 8 | os.environ["NEURON_CC_FLAGS"] = os.environ.get("NEURON_CC_FLAGS", "") + compiler_flags 9 | 10 | from diffusers import PixArtSigmaPipeline 11 | import torch 12 | import argparse 13 | import torch_neuronx 14 | import neuronx_distributed 15 | from transformers.models.t5 import T5EncoderModel 16 | from torch import nn 17 | from functools import partial 18 | 19 | from transformers.models.t5.modeling_t5 import T5EncoderModel, T5Block, T5LayerSelfAttention, T5LayerFF 20 | 21 | from neuron_commons import attention_wrapper, f32Wrapper 22 | from neuron_parallel_utils import get_sharded_data, shard_t5_self_attention, shard_t5_ff 23 | 24 | torch.nn.functional.scaled_dot_product_attention = attention_wrapper 25 | 26 | 27 | class TracingT5WrapperTP(nn.Module): 28 | def __init__(self, t: T5EncoderModel, seqlen: int): 29 | super().__init__() 30 | self.t = t 31 | self.device = t.device 32 | precomputed_bias = self.t.encoder.block[0].layer[0].SelfAttention.compute_bias(seqlen, seqlen) 33 | precomputed_bias_tp = get_sharded_data(precomputed_bias, 1) 34 | self.t.encoder.block[0].layer[0].SelfAttention.compute_bias = lambda *args, **kwargs: precomputed_bias_tp 35 | 36 | def forward(self, text_input_ids, prompt_attention_mask): 37 | return self.t( 38 | text_input_ids, 39 | attention_mask=prompt_attention_mask 40 | ) 41 | 42 | def get_text_encoder(tp_degree: int, sequence_length: int): 43 | pipe: PixArtSigmaPipeline = PixArtSigmaPipeline.from_pretrained( 44 | "PixArt-alpha/PixArt-Sigma-XL-2-1024-MS", 45 | cache_dir="pixart_sigma_hf_cache_dir_1024", 46 | local_files_only=True, 47 | torch_dtype=torch.bfloat16) 48 | text_encoder: T5EncoderModel = pipe.text_encoder 49 | text_encoder.eval() 50 | for idx, block in enumerate(text_encoder.encoder.block): 51 | block: T5Block = block 52 | block.layer[1].DenseReluDense.act = torch.nn.GELU(approximate="tanh") 53 | selfAttention: T5LayerSelfAttention = block.layer[0].SelfAttention 54 | ff: T5LayerFF = block.layer[1] 55 | layer_norm_0 = block.layer[0].layer_norm.to(torch.float32) 56 | layer_norm_1 = block.layer[1].layer_norm.to(torch.float32) 57 | block.layer[1] = shard_t5_ff(ff) 58 | block.layer[0].SelfAttention = shard_t5_self_attention(tp_degree, selfAttention) 59 | block.layer[0].layer_norm = f32Wrapper(layer_norm_0) 60 | block.layer[1].layer_norm = f32Wrapper(layer_norm_1) 61 | final_layer_norm = pipe.text_encoder.encoder.final_layer_norm.to(torch.float32) 62 | 
pipe.text_encoder.encoder.final_layer_norm = f32Wrapper(final_layer_norm) 63 | return TracingT5WrapperTP(text_encoder, sequence_length), {} 64 | 65 | def compile_text_encoder(args): 66 | batch_size = 1 # batch_size = args.num_prompts 67 | sequence_length = args.max_sequence_length 68 | tp_degree = 4 # Use tensor parallel degree as 4 for trn2 69 | # tp_degree = 8 # Use tensor parallel degree as 8 for trn1/inf2 70 | os.environ["LOCAL_WORLD_SIZE"] = "4" 71 | get_text_encoder_f = partial(get_text_encoder, tp_degree, sequence_length) 72 | 73 | compiler_workdir = args.compiler_workdir 74 | compiled_models_dir = args.compiled_models_dir 75 | 76 | with torch.no_grad(): 77 | sample_inputs = torch.ones((batch_size, sequence_length), dtype=torch.int64), \ 78 | torch.ones((batch_size, sequence_length), dtype=torch.int64) 79 | compiled_text_encoder = neuronx_distributed.trace.parallel_model_trace( 80 | get_text_encoder_f, 81 | sample_inputs, 82 | compiler_workdir=f"{compiler_workdir}/text_encoder", 83 | compiler_args=compiler_flags, 84 | tp_degree=tp_degree, 85 | ) 86 | compiled_model_dir = f"{compiled_models_dir}/text_encoder" 87 | if not os.path.exists(compiled_model_dir): 88 | os.makedirs(compiled_model_dir) 89 | neuronx_distributed.trace.parallel_model_save( 90 | compiled_text_encoder, f"{compiled_model_dir}") 91 | 92 | if __name__ == "__main__": 93 | parser = argparse.ArgumentParser() 94 | parser.add_argument("--num_prompts", help="number of prompts", type=int, default=1) 95 | parser.add_argument("--max_sequence_length", help="max sequence length.", type=int, default=300) 96 | parser.add_argument("--compiler_workdir", help="dir for compiler artifacts.", type=str, default="compiler_workdir") 97 | parser.add_argument("--compiled_models_dir", help="dir for compiled artifacts.", type=str, default="compiled_models") 98 | args = parser.parse_args() 99 | compile_text_encoder(args) -------------------------------------------------------------------------------- /torch-neuronx/inference/hf_pretrained_pixart_sigma_1k/neuron_pixart_sigma/compile_transformer_latency_optimized.py: -------------------------------------------------------------------------------- 1 | import os 2 | os.environ["NEURON_FUSE_SOFTMAX"] = "1" 3 | os.environ["NEURON_CUSTOM_SILU"] = "1" 4 | os.environ["NEURON_RT_VIRTUAL_CORE_SIZE"] = "2" # Comment this line out if using trn1/inf2 5 | os.environ["NEURON_LOGICAL_NC_CONFIG"] = "2" # Comment this line out if using trn1/inf2 6 | compiler_flags = """ --verbose=INFO --target=trn2 --lnc=2 --internal-hlo2tensorizer-options='--fuse-dot-logistic=false' --model-type=unet-inference --enable-fast-loading-neuron-binaries """ # Use these compiler flags for trn2 7 | # compiler_flags = """ --verbose=INFO --target=trn1 --model-type=unet-inference --enable-fast-loading-neuron-binaries """ # Use these compiler flags for trn1/inf2 8 | os.environ["NEURON_CC_FLAGS"] = os.environ.get("NEURON_CC_FLAGS", "") + compiler_flags 9 | 10 | from diffusers import PixArtSigmaPipeline 11 | import torch 12 | import argparse 13 | import neuronx_distributed 14 | 15 | from torch import nn 16 | from functools import partial 17 | 18 | from neuron_commons import attention_wrapper_for_transformer 19 | from neuron_parallel_utils import shard_transformer_attn, shard_transformer_feedforward 20 | 21 | from diffusers.models.transformers.pixart_transformer_2d import PixArtTransformer2DModel 22 | torch.nn.functional.scaled_dot_product_attention = attention_wrapper_for_transformer 23 | 24 | class TracingTransformerWrapper(nn.Module): 25 | 
def __init__(self, transformer): 26 | super().__init__() 27 | self.transformer = transformer 28 | self.config = transformer.config 29 | self.dtype = transformer.dtype 30 | self.device = transformer.device 31 | 32 | def forward(self, hidden_states=None, encoder_hidden_states=None, timestep=None, encoder_attention_mask=None, **kwargs): 33 | return self.transformer( 34 | hidden_states=hidden_states, 35 | encoder_hidden_states=encoder_hidden_states, 36 | timestep=timestep, 37 | encoder_attention_mask=encoder_attention_mask, 38 | added_cond_kwargs={"resolution": None, "aspect_ratio": None}, 39 | return_dict=False) 40 | 41 | def get_transformer_model(tp_degree: int): 42 | pipe: PixArtSigmaPipeline = PixArtSigmaPipeline.from_pretrained( 43 | "PixArt-alpha/PixArt-Sigma-XL-2-1024-MS", 44 | torch_dtype=torch.bfloat16, 45 | local_files_only=True, 46 | cache_dir="pixart_sigma_hf_cache_dir_1024") 47 | # 28 of these. 48 | for block_idx, block in enumerate(pipe.transformer.transformer_blocks): 49 | block.attn1 = shard_transformer_attn(tp_degree, block.attn1) 50 | block.attn2 = shard_transformer_attn(tp_degree, block.attn2) 51 | block.ff = shard_transformer_feedforward(block.ff) 52 | 53 | mod_pipe_transformer_f = TracingTransformerWrapper(pipe.transformer) 54 | return mod_pipe_transformer_f, {} 55 | 56 | def compile_transformer(args): 57 | tp_degree = 4 58 | # tp_degree = 8 # Use tensor parallel degree as 8 for trn1/inf2 59 | os.environ["LOCAL_WORLD_SIZE"] = "4" # Use tensor parallel degree as 4 for trn2 60 | latent_height = args.height//8 61 | latent_width = args.width//8 62 | num_prompts = 1 63 | num_images_per_prompt = args.num_images_per_prompt 64 | max_sequence_length = args.max_sequence_length 65 | hidden_size = 4096 66 | compiler_workdir = args.compiler_workdir 67 | compiled_models_dir = args.compiled_models_dir 68 | batch_size = 2 69 | sample_hidden_states = torch.ones((batch_size, 4, latent_height, latent_width), dtype=torch.bfloat16) 70 | sample_encoder_hidden_states = torch.ones((batch_size, max_sequence_length, hidden_size), dtype=torch.bfloat16) 71 | sample_timestep = torch.ones((batch_size), dtype=torch.int64) 72 | sample_encoder_attention_mask = torch.ones((batch_size, max_sequence_length), dtype=torch.int64) 73 | 74 | get_transformer_model_f = partial(get_transformer_model, tp_degree) 75 | with torch.no_grad(): 76 | sample_inputs = sample_hidden_states, sample_encoder_hidden_states, sample_timestep, sample_encoder_attention_mask 77 | compiled_transformer = neuronx_distributed.trace.parallel_model_trace( 78 | get_transformer_model_f, 79 | sample_inputs, 80 | compiler_workdir=f"{compiler_workdir}/transformer", 81 | compiler_args=compiler_flags, 82 | tp_degree=tp_degree, 83 | inline_weights_to_neff=False, 84 | ) 85 | compiled_model_dir = f"{compiled_models_dir}/transformer" 86 | if not os.path.exists(compiled_model_dir): 87 | os.makedirs(compiled_model_dir) 88 | neuronx_distributed.trace.parallel_model_save( 89 | compiled_transformer, f"{compiled_model_dir}") 90 | 91 | if __name__ == "__main__": 92 | parser = argparse.ArgumentParser() 93 | parser.add_argument("--height", help="height of generated image.", type=int, default=1024) 94 | parser.add_argument("--width", help="width of generated image.", type=int, default=1024) 95 | parser.add_argument("--num_images_per_prompt", help="number of images per prompt.", type=int, default=1) 96 | parser.add_argument("--max_sequence_length", help="max sequence length.", type=int, default=300) 97 | parser.add_argument("--compiler_workdir", help="dir for 
compiler artifacts.", type=str, default="compiler_workdir") 98 | parser.add_argument("--compiled_models_dir", help="dir for compiled artifacts.", type=str, default="compiled_models") 99 | args = parser.parse_args() 100 | compile_transformer(args) -------------------------------------------------------------------------------- /torch-neuronx/inference/hf_pretrained_pixart_sigma_1k/neuron_pixart_sigma/compile_transformer_throughput_optimized.py: -------------------------------------------------------------------------------- 1 | import os 2 | os.environ["NEURON_FUSE_SOFTMAX"] = "1" 3 | os.environ["NEURON_CUSTOM_SILU"] = "1" 4 | os.environ["NEURON_RT_VIRTUAL_CORE_SIZE"] = "2" # Comment this line out if using trn1/inf2 5 | os.environ["NEURON_LOGICAL_NC_CONFIG"] = "2" # Comment this line out if using trn1/inf2 6 | compiler_flags = """ --verbose=INFO --target=trn2 --lnc=2 --model-type=unet-inference --enable-fast-loading-neuron-binaries """ # Use these compiler flags for trn2 7 | # compiler_flags = """ --verbose=INFO --target=trn1 --model-type=unet-inference --enable-fast-loading-neuron-binaries """ # Use these compiler flags for trn1/inf2 8 | os.environ["NEURON_CC_FLAGS"] = os.environ.get("NEURON_CC_FLAGS", "") + compiler_flags 9 | 10 | from diffusers import PixArtSigmaPipeline 11 | from diffusers.models.transformers.pixart_transformer_2d import PixArtTransformer2DModel 12 | import torch 13 | import argparse 14 | import torch_neuronx 15 | from torch import nn 16 | from functools import partial 17 | 18 | from neuron_commons import attention_wrapper, attention_wrapper_for_transformer 19 | from neuron_parallel_utils import shard_transformer_attn, shard_transformer_feedforward, get_sharded_data 20 | 21 | torch.nn.functional.scaled_dot_product_attention = attention_wrapper_for_transformer 22 | 23 | class TracingTransformerWrapper(nn.Module): 24 | def __init__(self, transformer): 25 | super().__init__() 26 | self.transformer = transformer 27 | self.config = transformer.config 28 | self.dtype = transformer.dtype 29 | self.device = transformer.device 30 | 31 | def forward(self, hidden_states=None, encoder_hidden_states=None, timestep=None, encoder_attention_mask=None, **kwargs): 32 | return self.transformer( 33 | hidden_states=hidden_states, 34 | encoder_hidden_states=encoder_hidden_states, 35 | timestep=timestep, 36 | encoder_attention_mask=encoder_attention_mask, 37 | added_cond_kwargs={"resolution": None, "aspect_ratio": None}, 38 | return_dict=False) 39 | 40 | def get_transformer_model(): 41 | pipe: PixArtSigmaPipeline = PixArtSigmaPipeline.from_pretrained( 42 | "PixArt-alpha/PixArt-Sigma-XL-2-1024-MS", 43 | torch_dtype=torch.bfloat16, 44 | local_files_only=True, 45 | cache_dir="pixart_sigma_hf_cache_dir_1024") 46 | mod_pipe_transformer_f = TracingTransformerWrapper(pipe.transformer) 47 | return mod_pipe_transformer_f 48 | 49 | def compile_transformer(args): 50 | latent_height = args.height//8 51 | latent_width = args.width//8 52 | num_prompts = 1 53 | num_images_per_prompt = args.num_images_per_prompt 54 | max_sequence_length = args.max_sequence_length 55 | hidden_size = 4096 56 | compiler_workdir = args.compiler_workdir 57 | compiled_models_dir = args.compiled_models_dir 58 | batch_size = 2 59 | sample_hidden_states = torch.ones((batch_size, 4, latent_height, latent_width), dtype=torch.bfloat16) 60 | sample_encoder_hidden_states = torch.ones((batch_size, max_sequence_length, hidden_size), dtype=torch.bfloat16) 61 | sample_timestep = torch.ones((batch_size), dtype=torch.int64) 62 | 
sample_encoder_attention_mask = torch.ones((batch_size, max_sequence_length), dtype=torch.int64) 63 | get_transformer_model_f = get_transformer_model() #, tp_degree) 64 | with torch.no_grad(): 65 | sample_inputs = sample_hidden_states, sample_encoder_hidden_states, sample_timestep, sample_encoder_attention_mask 66 | compiled_transformer = torch_neuronx.trace( 67 | get_transformer_model_f, 68 | sample_inputs, 69 | compiler_workdir=f"{compiler_workdir}/transformer", 70 | compiler_args=compiler_flags, 71 | inline_weights_to_neff=False) 72 | 73 | compiled_model_dir = f"{compiled_models_dir}/transformer" 74 | if not os.path.exists(compiled_model_dir): 75 | os.makedirs(compiled_model_dir) 76 | torch.jit.save(compiled_transformer, f"{compiled_model_dir}/model.pt") 77 | 78 | if __name__ == "__main__": 79 | parser = argparse.ArgumentParser() 80 | parser.add_argument("--height", help="height of generated image.", type=int, default=1024) 81 | parser.add_argument("--width", help="width of generated image.", type=int, default=1024) 82 | parser.add_argument("--num_images_per_prompt", help="number of images per prompt.", type=int, default=1) 83 | parser.add_argument("--max_sequence_length", help="max sequence length.", type=int, default=300) 84 | parser.add_argument("--compiler_workdir", help="dir for compiler artifacts.", type=str, default="compiler_workdir") 85 | parser.add_argument("--compiled_models_dir", help="dir for compiled artifacts.", type=str, default="compiled_models") 86 | args = parser.parse_args() 87 | compile_transformer(args) -------------------------------------------------------------------------------- /torch-neuronx/inference/hf_pretrained_pixart_sigma_1k/neuron_pixart_sigma/neuron_commons.py: -------------------------------------------------------------------------------- 1 | from diffusers import PixArtSigmaPipeline, Transformer2DModel 2 | from transformers.models.t5.modeling_t5 import T5EncoderModel 3 | from torch import nn 4 | 5 | class InferenceTextEncoderWrapper(nn.Module): 6 | def __init__(self, dtype, t: T5EncoderModel, seqlen: int): 7 | super().__init__() 8 | self.dtype = dtype 9 | self.device = t.device 10 | self.t = t 11 | def forward(self, text_input_ids, attention_mask=None): 12 | return [self.t(text_input_ids, attention_mask)['last_hidden_state'].to(self.dtype)] 13 | 14 | class InferenceTransformerWrapper(nn.Module): 15 | def __init__(self, transformer: Transformer2DModel): 16 | super().__init__() 17 | self.transformer = transformer 18 | self.config = transformer.config 19 | self.dtype = transformer.dtype 20 | self.device = transformer.device 21 | def forward(self, hidden_states, encoder_hidden_states=None, timestep=None, 22 | encoder_attention_mask=None, added_cond_kwargs=None, 23 | return_dict=False): 24 | output = self.transformer( 25 | hidden_states, 26 | encoder_hidden_states, 27 | timestep, 28 | encoder_attention_mask) 29 | return output 30 | 31 | class SimpleWrapper(nn.Module): 32 | def __init__(self, model): 33 | super().__init__() 34 | self.model = model 35 | def forward(self, x): 36 | output = self.model(x) 37 | return output 38 | 39 | import torch 40 | import math 41 | from torch import nn 42 | 43 | from neuronxcc.starfish.penguin.targets.nki.private_api import vnc 44 | from torch_neuronx.xla_impl.ops import nki_jit 45 | from neuronxcc.nki._private_kernels.attention import attention_isa_kernel 46 | _flash_fwd_call = nki_jit()(attention_isa_kernel) 47 | 48 | 49 | def neuron_scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=None, 
is_causal=None): 50 | orig_shape = None 51 | if len(query.shape) == 4: 52 | orig_shape = query.shape 53 | def to3d(x): 54 | return x.reshape(-1, x.shape[2], x.shape[3]) 55 | query, key, value = map(to3d, [query, key, value]) 56 | if query.size() == key.size(): 57 | attention_scores = torch.bmm(key, query.transpose(-1, -2)) * ( 58 | 1 / math.sqrt(query.size(-1)) 59 | ) 60 | attention_probs = attention_scores.softmax(dim=1).permute(0, 2, 1) 61 | else: 62 | attention_scores = torch.bmm(query, key.transpose(-1, -2)) * ( 63 | 1 / math.sqrt(query.size(-1)) 64 | ) 65 | attention_probs = attention_scores.softmax(dim=-1) 66 | attn_out = torch.bmm(attention_probs, value) 67 | if orig_shape: 68 | attn_out = attn_out.reshape( 69 | orig_shape[0], orig_shape[1], attn_out.shape[1], attn_out.shape[2] 70 | ) 71 | return attn_out 72 | 73 | 74 | def attention_wrapper_sharded_without_swap(query, key, value): 75 | bs, n_head, q_len, d_head = query.shape 76 | q = query.clone().permute(0, 1, 3, 2).reshape((bs*n_head, d_head, q_len)) 77 | k = key.clone().permute(0, 1, 3, 2).reshape((bs*n_head, d_head, q_len)) 78 | v = value.clone().reshape((bs*n_head, q_len, d_head)) 79 | attn_output = torch.zeros((bs*n_head, q_len, d_head), dtype=torch.bfloat16, device=q.device) 80 | use_sharded_attention_kernel = True # set True for trn2 81 | # use_sharded_attention_kernel = False # trn1/inf2 do not need the sharded kernel, so set this to False there 82 | if use_sharded_attention_kernel: 83 | grid = (vnc(2),) 84 | _flash_fwd_call[grid](q, k, v, 0.117, attn_output, kernel_name="AttentionMMSoftmaxMMWithoutSwap") # 0.117 is the softmax scale, approximately 1/sqrt(d_head=72) 85 | else: 86 | _flash_fwd_call(q, k, v, 0.117, attn_output, kernel_name="AttentionMMSoftmaxMMWithoutSwap") 87 | attn_output = attn_output.reshape((bs, n_head, q_len, d_head)) 88 | return attn_output 89 | 90 | 91 | sdpa_original = torch.nn.functional.scaled_dot_product_attention 92 | def attention_wrapper(query, key, value, attn_mask=None, dropout_p=None, is_causal=None): 93 | if attn_mask is not None: 94 | return sdpa_original(query, key, value, attn_mask=attn_mask, dropout_p=dropout_p, is_causal=is_causal) 95 | else: 96 | return neuron_scaled_dot_product_attention(query, key, value, attn_mask=attn_mask, dropout_p=dropout_p, is_causal=is_causal) 97 | 98 | def attention_wrapper_for_transformer(query, key, value, attn_mask=None, dropout_p=None, is_causal=None): 99 | if attn_mask is not None: 100 | return sdpa_original(query, key, value, attn_mask=attn_mask, dropout_p=dropout_p, is_causal=is_causal) 101 | else: 102 | return attention_wrapper_sharded_without_swap(query, key, value) 103 | 104 | class f32Wrapper(nn.Module): 105 | def __init__(self, original): 106 | super().__init__() 107 | self.original = original 108 | def forward(self, x): 109 | t = x.dtype 110 | y = x.to(torch.float32) 111 | output = self.original(y) 112 | return output.type(t) 113 | 114 | --------------------------------------------------------------------------------
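`neuron_scaled_dot_product_attention` is a plain bmm/softmax/bmm re-expression of SDPA; the `query.size() == key.size()` branch just reorders the same computation for self-attention. For the standard branch, equivalence with `torch.nn.functional.scaled_dot_product_attention` can be checked on CPU (an illustrative sanity check, not part of the sample):

```python
# Sketch: check the bmm/softmax/bmm formulation against torch's SDPA on CPU.
import math
import torch
import torch.nn.functional as F

torch.manual_seed(0)
q = torch.randn(2, 8, 16, 64)   # (batch, heads, seq, d_head)
k, v = torch.randn_like(q), torch.randn_like(q)

def bmm_attention(q, k, v):
    b, h, s, d = q.shape
    q3, k3, v3 = (t.reshape(b * h, s, d) for t in (q, k, v))
    probs = (torch.bmm(q3, k3.transpose(-1, -2)) / math.sqrt(d)).softmax(dim=-1)
    return torch.bmm(probs, v3).reshape(b, h, s, d)

ref = F.scaled_dot_product_attention(q, k, v)
assert torch.allclose(bmm_attention(q, k, v), ref, atol=1e-5)
```

--------------------------------------------------------------------------------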
/torch-neuronx/inference/hf_pretrained_pixart_sigma_1k/requirements.txt: -------------------------------------------------------------------------------- 1 | diffusers==0.31.0 2 | transformers==4.36.2 -------------------------------------------------------------------------------- /torch-neuronx/inference/sd2_inpainting_mask.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-neuron/aws-neuron-samples/15be8c363a3cbcf7d0795f782085a3b0b919e599/torch-neuronx/inference/sd2_inpainting_mask.png -------------------------------------------------------------------------------- /torch-neuronx/inference/sd2_inpainting_photo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-neuron/aws-neuron-samples/15be8c363a3cbcf7d0795f782085a3b0b919e599/torch-neuronx/inference/sd2_inpainting_photo.png -------------------------------------------------------------------------------- /torch-neuronx/microbenchmark/ubench_utils.py: -------------------------------------------------------------------------------- 1 | from numbers import Number 2 | import time 3 | 4 | 5 | class Timer: 6 | """ 7 | A simple Timer with high-enough granularity for performance 8 | measurements. The timer is designed to be used as a context manager. 9 | 10 | Example usage: 11 | with ubench_utils.Timer() as benchmark_timer: 12 | time.sleep(1) 13 | time.sleep(4) 14 | 15 | act_time = benchmark_timer() 16 | print("Sleeping for 5 seconds actually took {:.2g} seconds".format(act_time)) 17 | """ 18 | 19 | def __enter__(self): 20 | self.start = time.perf_counter() 21 | self.end = 0.0 22 | return lambda: self.end - self.start 23 | 24 | def __exit__(self, *args): 25 | self.end = time.perf_counter() 26 | 27 | --------------------------------------------------------------------------------
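Note that the callable returned by `Timer.__enter__` only yields a valid elapsed time after the `with` block exits (before that, `self.end` is still 0.0). A small throughput-style usage sketch, pure Python with `time.sleep` standing in for a compiled-model invocation:

```python
# Sketch: using Timer to derive throughput for a batch of iterations.
import time
import ubench_utils

iters = 5
with ubench_utils.Timer() as elapsed:
    for _ in range(iters):
        time.sleep(0.1)                 # stand-in for a compiled-model invocation

print(f"{iters / elapsed():.1f} iter/s")  # roughly 10 iter/s in this toy example
```

--------------------------------------------------------------------------------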
/torch-neuronx/training/aws-batch/all-reduce/README.md: -------------------------------------------------------------------------------- 1 | # AWS Batch / trn1 allreduce example 2 | 3 | This package shows how to run a multi-node allreduce test using trn1.32xlarge instances in AWS Batch. A successful allreduce test indicates that the Neuron driver, Neuron SDK, and EFA driver are installed properly, and the required EFA device configuration + connectivity is in place to support multi-node training. 4 | 5 | It is expected that these scripts will be run from an x86_64-based Linux instance. 6 | 7 | Note: to use trn1n.32xlarge instances, the launch template and job definition will need to be adjusted to use 16 EFA devices (currently using 8 EFA devices for trn1.32xlarge). 8 | 9 | Prereqs: 10 | * Existing VPC with subnet and appropriate [EFA security group](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa-start.html#efa-start-security). The subnet must be private, and the VPC must have a NAT gateway to provide internet connectivity for the private subnet. 11 | * ECR repo 12 | * AWS CLI installed and configured with permissions for Batch and ECR 13 | * Docker installed 14 | * jq installed 15 | 16 | Steps: 17 | * Modify `build_configs_and_setup.sh` with your account/region/etc. 18 | * Run `./build_configs_and_setup.sh` to create the configs/scripts using your config details 19 | * Run `./create_resources.sh` to create the various AWS Batch resources (job definition, compute environment, ...) 20 | * Run `./build_docker_image.sh` to build a training container using the latest Neuron Deep Learning Container (DLC) and push the image to ECR 21 | * Run `./submit_job.sh` to submit a basic 4-node allreduce job in the provisioned Batch environment -------------------------------------------------------------------------------- /torch-neuronx/training/aws-batch/all-reduce/build_configs_and_setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eu 3 | 4 | # ECR repo and image details. You can locate the correct Neuron DLC image for 'training' on AWS DLC github page - https://github.com/aws/deep-learning-containers/blob/master/available_images.md#neuron-containers 5 | export BASE_IMAGE_REPO=763104351884.dkr.ecr.us-west-2.amazonaws.com 6 | export BASE_IMAGE_NAME=pytorch-training-neuronx 7 | export BASE_IMAGE_TAG=1.13.1-neuronx-py310-sdk2.15.0-ubuntu20.04 8 | 9 | # Configure your account specific settings below 10 | export REGION= 11 | export ACCOUNT= 12 | export INSTANCE_ROLE= 13 | export SUBNET= 14 | export SG= 15 | export ECR_REPO= 16 | 17 | ECS_AMI_NAME=/aws/service/ecs/optimized-ami/amazon-linux-2/recommended/image_id 18 | export ECS_AMI=$(aws ssm get-parameter --region $REGION --name $ECS_AMI_NAME | jq -r .Parameter.Value) 19 | export USERDATA=$(cat << EOF | base64 -w0 20 | "MIME-Version: 1.0 21 | Content-Type: multipart/mixed; boundary="==MYBOUNDARY==" 22 | 23 | --==MYBOUNDARY== 24 | Content-Type: text/cloud-boothook; charset="us-ascii" 25 | 26 | cloud-init-per once yum_wget yum install -y wget 27 | cloud-init-per once wget_efa wget -q --timeout=20 https://s3-us-west-2.amazonaws.com/aws-efa-installer/aws-efa-installer-latest.tar.gz -O /tmp/aws-efa-installer-latest.tar.gz 28 | cloud-init-per once tar_efa tar -xf /tmp/aws-efa-installer-latest.tar.gz -C /tmp 29 | pushd /tmp/aws-efa-installer 30 | cloud-init-per once install_efa ./efa_installer.sh -y 31 | popd 32 | 33 | cloud-init-per once efa_info /opt/amazon/efa/bin/fi_info -p efa 34 | 35 | cloud-init-per once neuron_driver1 echo -e "[neuron]\nname=Neuron YUM Repository\nbaseurl=https://yum.repos.neuron.amazonaws.com\nenabled=1\nmetadata_expire=0" | tee /etc/yum.repos.d/neuron.repo > /dev/null 36 | cloud-init-per once neuron_driver2 rpm --import https://yum.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB 37 | cloud-init-per once neuron_driver3 yum update -y 38 | cloud-init-per once neuron_driver4 yum install kernel-devel-$(uname -r) kernel-headers-$(uname -r) -y 39 | cloud-init-per once neuron_driver5 yum erase aws-neuronx-dkms -y 40 | cloud-init-per once neuron_driver6 yum install aws-neuronx-dkms-2.* -y 41 | 42 | --==MYBOUNDARY==--" 43 | EOF 44 | ) 45 | 46 | # Apply variable substitutions to template files and resource creation script 47 | mkdir -p ./build 48 | 49 | for i in ./templates/*.json; do 50 | echo $i -\> ./build/`basename $i`; 51 | envsubst < $i > ./build/`basename $i`; 52 | done 53 | 54 | envsubst < ./templates/create_resources.sh > ./create_resources.sh \ 55 | && chmod u+x ./create_resources.sh \ 56 | && echo ./templates/create_resources.sh -\> ./create_resources.sh 57 | envsubst < ./templates/build_docker_image.sh > ./build_docker_image.sh \ 58 | && chmod u+x ./build_docker_image.sh \ 59 | && echo ./templates/build_docker_image.sh -\> ./build_docker_image.sh -------------------------------------------------------------------------------- /torch-neuronx/training/aws-batch/all-reduce/docker/Dockerfile:
-------------------------------------------------------------------------------- 1 | ARG BASE_IMAGE_REPO 2 | ARG BASE_IMAGE_NAME 3 | ARG BASE_IMAGE_TAG 4 | ARG DEBIAN_FRONTEND=noninteractive 5 | 6 | FROM ${BASE_IMAGE_REPO}/${BASE_IMAGE_NAME}:${BASE_IMAGE_TAG} 7 | 8 | COPY ./allreduce* / 9 | 10 | WORKDIR / 11 | RUN chmod +x allreduce.sh 12 | CMD ["/allreduce.sh"] -------------------------------------------------------------------------------- /torch-neuronx/training/aws-batch/all-reduce/docker/allreduce.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch_xla.core.xla_model as xm 3 | import torch.distributed as dist 4 | import torch_xla.distributed.xla_backend 5 | import os 6 | 7 | def rprint(txt): 8 | rank = os.environ.get("LOCAL_RANK", "unk") 9 | if rank == "0": # compare as a string: LOCAL_RANK may be unset ("unk") 10 | print(f"{rank}: {txt}", flush=True) 11 | 12 | dist.init_process_group('xla') 13 | rprint("Before 1st rendezvous") 14 | xm.rendezvous('first') 15 | 16 | device = xm.xla_device() 17 | for c in range(1000000): 18 | ones = torch.ones((2, 3)) 19 | xones = ones.to(device) 20 | result = xm.all_reduce('sum', xones) 21 | xm.mark_step() 22 | result_cpu = result.cpu() 23 | expected = torch.ones((2, 3)) * int(os.environ.get("WORLD_SIZE", 0)) 24 | assert torch.all(result_cpu == expected), f'ERROR: {result_cpu} != {expected}' 25 | if c % 100 == 0: 26 | rprint(f"result OK step {c}: {result}") 27 | 28 | rprint("Before final rendezvous") 29 | xm.rendezvous('final') -------------------------------------------------------------------------------- /torch-neuronx/training/aws-batch/all-reduce/docker/allreduce.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -o pipefail 3 | ulimit -n 65535 4 | sysctl -w net.ipv4.ip_local_reserved_ports=41000 5 | 6 | export FI_EFA_USE_DEVICE_RDMA=1 7 | export FI_PROVIDER=efa 8 | export FI_EFA_FORK_SAFE=1 9 | export CCOM_SOCKET_IFNAME=eth0 10 | 11 | export PROCESSES_PER_NODE=32 12 | if [ -v AWS_BATCH_JOB_MAIN_NODE_PRIVATE_IPV4_ADDRESS ] 13 | then 14 | export MASTER_ADDR=$AWS_BATCH_JOB_MAIN_NODE_PRIVATE_IPV4_ADDRESS 15 | else 16 | export MASTER_ADDR=`ip -f inet addr show eth0 | grep -Po 'inet \K[\d.]+'` 17 | fi 18 | export MASTER_PORT=41000 19 | export NODEID=$AWS_BATCH_JOB_NODE_INDEX 20 | export NTASKS=$AWS_BATCH_JOB_NUM_NODES 21 | 22 | DISTRIBUTED_ARGS="--nproc_per_node $PROCESSES_PER_NODE --nnodes $NTASKS --node_rank $NODEID --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 23 | echo $DISTRIBUTED_ARGS 24 | 25 | export MALLOC_ARENA_MAX=128 26 | export XLA_USE_BF16=1 27 | export TF_NUM_INTEROP_THREADS=8192 28 | 29 | set 30 | echo "Starting the job..."
31 | torchrun $DISTRIBUTED_ARGS allreduce.py -------------------------------------------------------------------------------- /torch-neuronx/training/aws-batch/all-reduce/submit_job.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eu 3 | 4 | # submitting aws batch job 5 | aws batch submit-job \ 6 | --job-name aws-batch-trn1-job \ 7 | --job-queue aws-batch-job-queue \ 8 | --job-definition aws-batch-job-definition \ 9 | --node-overrides numNodes=4 -------------------------------------------------------------------------------- /torch-neuronx/training/aws-batch/all-reduce/templates/build_docker_image.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eu 3 | # Build a Neuron container image for running all-reduce test on AWS Batch 4 | # and push the image to ECR 5 | export DOCKER_BUILDKIT=1 6 | 7 | # Authenticate with ECR, build & push the image 8 | pushd ./docker 9 | aws ecr get-login-password --region $REGION | docker login --username AWS \ 10 | --password-stdin $BASE_IMAGE_REPO \ 11 | && docker build . -t aws-batch:latest \ 12 | --build-arg BASE_IMAGE_REPO=$BASE_IMAGE_REPO \ 13 | --build-arg BASE_IMAGE_NAME=$BASE_IMAGE_NAME \ 14 | --build-arg BASE_IMAGE_TAG=$BASE_IMAGE_TAG 15 | 16 | aws ecr get-login-password --region $REGION | docker login --username AWS \ 17 | --password-stdin $ECR_REPO \ 18 | && docker tag aws-batch:latest $ECR_REPO:latest \ 19 | && docker push $ECR_REPO:latest 20 | popd -------------------------------------------------------------------------------- /torch-neuronx/training/aws-batch/all-reduce/templates/compute_env.json: -------------------------------------------------------------------------------- 1 | { 2 | "computeEnvironmentName" : "aws-batch-compute-environment", 3 | "computeResources" : { 4 | "desiredvCpus" : 0, 5 | "instanceRole" : "$INSTANCE_ROLE", 6 | "instanceTypes" : [ 7 | "trn1.32xlarge" 8 | ], 9 | "launchTemplate" : { 10 | "launchTemplateName" : "aws-batch-launch-template", 11 | "version" : "$Latest" 12 | }, 13 | "maxvCpus" : 2088, 14 | "minvCpus" : 0, 15 | "subnets" : [ 16 | "$SUBNET" 17 | ], 18 | "type" : "EC2" 19 | }, 20 | "state" : "ENABLED", 21 | "type" : "MANAGED" 22 | } 23 | -------------------------------------------------------------------------------- /torch-neuronx/training/aws-batch/all-reduce/templates/create_resources.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eu 3 | 4 | if [ ! `which jq` ] 5 | then 6 | echo "Please install jq and re-run this script" && exit 7 | fi 8 | 9 | aws ec2 create-placement-group --group-name "aws-batch-placement-group" --strategy "cluster" --region $REGION # creating the aws placement group 10 | aws ec2 create-launch-template --cli-input-json file://build/launch_template.json # creating the aws launch template 11 | aws batch create-compute-environment --cli-input-json file://build/compute_env.json # creating the aws batch compute environment 12 | aws batch register-job-definition --cli-input-json file://build/job_def.json # creating the aws batch job definition 13 | while [[ ! $(aws batch describe-compute-environments --compute-environments aws-batch-compute-environment | jq -r ".computeEnvironments[].status") =~ VALID ]] 14 | do 15 | echo -n "." 
16 | sleep 2 17 | done 18 | aws batch create-job-queue --cli-input-json file://build/job_queue.json # creating the aws batch job queue -------------------------------------------------------------------------------- /torch-neuronx/training/aws-batch/all-reduce/templates/job_def.json: -------------------------------------------------------------------------------- 1 | { 2 | "jobDefinitionName" : "aws-batch-job-definition", 3 | "nodeProperties" : { 4 | "mainNode" : 0, 5 | "nodeRangeProperties" : [ 6 | { 7 | "container" : { 8 | "image" : "$ECR_REPO:latest", 9 | "linuxParameters" : { 10 | "devices" : [ 11 | { 12 | "hostPath" : "/dev/infiniband/uverbs0" 13 | }, 14 | { 15 | "hostPath" : "/dev/infiniband/uverbs1" 16 | }, 17 | { 18 | "hostPath" : "/dev/infiniband/uverbs2" 19 | }, 20 | { 21 | "hostPath" : "/dev/infiniband/uverbs3" 22 | }, 23 | { 24 | "hostPath" : "/dev/infiniband/uverbs4" 25 | }, 26 | { 27 | "hostPath" : "/dev/infiniband/uverbs5" 28 | }, 29 | { 30 | "hostPath" : "/dev/infiniband/uverbs6" 31 | }, 32 | { 33 | "hostPath" : "/dev/infiniband/uverbs7" 34 | }, 35 | { 36 | "hostPath": "/dev/neuron0" 37 | }, 38 | { 39 | "hostPath": "/dev/neuron1" 40 | }, 41 | { 42 | "hostPath": "/dev/neuron2" 43 | }, 44 | { 45 | "hostPath": "/dev/neuron3" 46 | }, 47 | { 48 | "hostPath": "/dev/neuron4" 49 | }, 50 | { 51 | "hostPath": "/dev/neuron5" 52 | }, 53 | { 54 | "hostPath": "/dev/neuron6" 55 | }, 56 | { 57 | "hostPath": "/dev/neuron7" 58 | }, 59 | { 60 | "hostPath": "/dev/neuron8" 61 | }, 62 | { 63 | "hostPath": "/dev/neuron9" 64 | }, 65 | { 66 | "hostPath": "/dev/neuron10" 67 | }, 68 | { 69 | "hostPath": "/dev/neuron11" 70 | }, 71 | { 72 | "hostPath": "/dev/neuron12" 73 | }, 74 | { 75 | "hostPath": "/dev/neuron13" 76 | }, 77 | { 78 | "hostPath": "/dev/neuron14" 79 | }, 80 | { 81 | "hostPath": "/dev/neuron15" 82 | } 83 | ] 84 | }, 85 | "memory" : 500000, 86 | "ulimits" : [ 87 | { 88 | "hardLimit" : -1, 89 | "name" : "memlock", 90 | "softLimit" : -1 91 | } 92 | ], 93 | "user" : "root", 94 | "vcpus" : 96, 95 | "instanceType" : "trn1.32xlarge" 96 | }, 97 | "targetNodes" : "0:" 98 | } 99 | ], 100 | "numNodes" : 4 101 | }, 102 | "type" : "multinode" 103 | } -------------------------------------------------------------------------------- /torch-neuronx/training/aws-batch/all-reduce/templates/job_queue.json: -------------------------------------------------------------------------------- 1 | { 2 | "computeEnvironmentOrder" : [ 3 | { 4 | "computeEnvironment" : "aws-batch-compute-environment", 5 | "order" : 1 6 | } 7 | ], 8 | "jobQueueName" : "aws-batch-job-queue", 9 | "priority" : 10, 10 | "state" : "ENABLED" 11 | } 12 | 13 | -------------------------------------------------------------------------------- /torch-neuronx/training/aws-batch/all-reduce/templates/launch_template.json: -------------------------------------------------------------------------------- 1 | { 2 | "LaunchTemplateName" : "aws-batch-launch-template", 3 | "LaunchTemplateData" : { 4 | "IamInstanceProfile" : { 5 | "Arn" : "$INSTANCE_ROLE" 6 | }, 7 | "InstanceType" : "trn1.32xlarge", 8 | "ImageId" : "$ECS_AMI", 9 | "NetworkInterfaces" : [ 10 | { 11 | "Description" : "NetworkInterfaces Configuration For EFA and Batch", 12 | "DeviceIndex" : 0, 13 | "Groups" : [ 14 | "$SG" 15 | ], 16 | "InterfaceType" : "efa", 17 | "NetworkCardIndex" : 0, 18 | "SubnetId" : "$SUBNET" 19 | }, 20 | { 21 | "Description" : "NetworkInterfaces Configuration For EFA and Batch", 22 | "DeviceIndex" : 1, 23 | "Groups" : [ 24 | "$SG" 25 | ], 26 | "InterfaceType" : "efa", 27 
| "NetworkCardIndex" : 1, 28 | "SubnetId" : "$SUBNET" 29 | }, 30 | { 31 | "Description" : "NetworkInterfaces Configuration For EFA and Batch", 32 | "DeviceIndex" : 1, 33 | "Groups" : [ 34 | "$SG" 35 | ], 36 | "InterfaceType" : "efa", 37 | "NetworkCardIndex" : 2, 38 | "SubnetId" : "$SUBNET" 39 | }, 40 | { 41 | "Description" : "NetworkInterfaces Configuration For EFA and Batch", 42 | "DeviceIndex" : 1, 43 | "Groups" : [ 44 | "$SG" 45 | ], 46 | "InterfaceType" : "efa", 47 | "NetworkCardIndex" : 3, 48 | "SubnetId" : "$SUBNET" 49 | }, 50 | { 51 | "Description" : "NetworkInterfaces Configuration For EFA and Batch", 52 | "DeviceIndex" : 1, 53 | "Groups" : [ 54 | "$SG" 55 | ], 56 | "InterfaceType" : "efa", 57 | "NetworkCardIndex" : 4, 58 | "SubnetId" : "$SUBNET" 59 | }, 60 | { 61 | "Description" : "NetworkInterfaces Configuration For EFA and Batch", 62 | "DeviceIndex" : 1, 63 | "Groups" : [ 64 | "$SG" 65 | ], 66 | "InterfaceType" : "efa", 67 | "NetworkCardIndex" : 5, 68 | "SubnetId" : "$SUBNET" 69 | }, 70 | { 71 | "Description" : "NetworkInterfaces Configuration For EFA and Batch", 72 | "DeviceIndex" : 1, 73 | "Groups" : [ 74 | "$SG" 75 | ], 76 | "InterfaceType" : "efa", 77 | "NetworkCardIndex" : 6, 78 | "SubnetId" : "$SUBNET" 79 | }, 80 | { 81 | "Description" : "NetworkInterfaces Configuration For EFA and Batch", 82 | "DeviceIndex" : 1, 83 | "Groups" : [ 84 | "$SG" 85 | ], 86 | "InterfaceType" : "efa", 87 | "NetworkCardIndex" : 7, 88 | "SubnetId" : "$SUBNET" 89 | } 90 | ], 91 | "Placement" : { 92 | "GroupName" : "aws-batch-placement-group" 93 | }, 94 | "TagSpecifications" : [ 95 | { 96 | "ResourceType" : "instance", 97 | "Tags" : [ 98 | { 99 | "Key" : "from-lt", 100 | "Value" : "networkInterfacesConfig-EFA-Batch" 101 | } 102 | ] 103 | } 104 | ], 105 | "UserData" : "$USERDATA" 106 | } 107 | } -------------------------------------------------------------------------------- /torch-neuronx/training/aws-batch/llama2/config.txt: -------------------------------------------------------------------------------- 1 | REGION=us-east-1 2 | SUBNET=subnet-012345abcd5689 3 | SG=sg-012345abcd5689 4 | ECR_REPO=1010101010.dkr.ecr.us-east-1.amazonaws.com/your-docker-repo 5 | INSTANCE_ROLE=arn:aws:iam::1010101010:instance-profile/your-instance-role 6 | DO_PRE_COMPILATION=true 7 | TOKENIZED_DATASET_URI=s3://your/s3/location/to/store/tokenized/dataset/ 8 | NEURON_COMPILE_CACHE_URI=s3://your/s3/location/to/store/compile-cache/ 9 | CHECKPOINT_SAVE_URI=s3://your/s3/location/to/store/tokenized/checkpoints/ -------------------------------------------------------------------------------- /torch-neuronx/training/aws-batch/llama2/docker/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG BASE_IMAGE_REPO 2 | ARG BASE_IMAGE_NAME 3 | ARG BASE_IMAGE_TAG 4 | ARG DEBIAN_FRONTEND=noninteractive 5 | 6 | FROM ${BASE_IMAGE_REPO}/${BASE_IMAGE_NAME}:${BASE_IMAGE_TAG} 7 | 8 | COPY ./llama2 / 9 | 10 | WORKDIR / 11 | RUN chmod +x /llama_batch_training.sh 12 | CMD ["/llama_batch_training.sh"] -------------------------------------------------------------------------------- /torch-neuronx/training/aws-batch/llama2/docker/llama_batch_training.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -euo pipefail 3 | 4 | python -m pip install neuronx_distributed --extra-index-url https://pip.repos.neuron.amazonaws.com 5 | SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) 6 | ulimit -n 65535 7 | sysctl -w 
net.ipv4.ip_local_reserved_ports=41000 8 | 9 | export FI_EFA_USE_DEVICE_RDMA=1 10 | export FI_PROVIDER=efa 11 | export FI_EFA_FORK_SAFE=1 12 | export CCOM_SOCKET_IFNAME=eth0 13 | 14 | export MASTER_PORT=41000 15 | export NODEID=$AWS_BATCH_JOB_NODE_INDEX 16 | export NTASKS=$AWS_BATCH_JOB_NUM_NODES 17 | 18 | export MALLOC_ARENA_MAX=64 19 | export XLA_USE_BF16=1 20 | export TF_NUM_INTEROP_THREADS=8192 21 | export PROCESSES_PER_NODE=32 22 | export NEURON_CC_FLAGS="--model-type transformer --distribution-strategy=llm-training --cache_dir=$NEURON_COMPILE_CACHE_URI" 23 | export NEURON_FUSE_SOFTMAX=1 24 | export NEURON_RT_ASYNC_EXEC_MAX_INFLIGHT_REQUESTS=3 25 | export NUM_NEURONCORES=32 26 | 27 | export NEURON_RT_NUM_CORES=32 28 | export NUM_NEURONCORES=$NEURON_RT_NUM_CORES 29 | export TPU_NUM_DEVICES=$NEURON_RT_NUM_CORES 30 | export TPU_CHIPS_PER_HOST_BOUNDS=$NEURON_RT_NUM_CORES 31 | export NEURON_RT_ROOT_COMM_ID=localhost:48620 32 | 33 | # TP degree 34 | TP_DEGREE=8 35 | # 0: bf16; 1: mixed precision 36 | USE_MIX_PRECISION=1 37 | # 0: use pure DP; 1: use ZeRO-1 38 | USE_ZERO_1=1 39 | # global batch size 40 | GBS=1024 41 | # micro batch size 42 | MBS=1 43 | # number of steps to run 44 | TOTAL_STEPS=10000 45 | # warmup steps 46 | WARMUP_STEPS=100 47 | # learning rate 48 | LR=3.0e-4 49 | # model path 50 | MODEL_PATH=$SCRIPT_DIR 51 | # data path 52 | DATA_PATH="$HOME/examples_datasets/wikicorpus_llama2_7B_tokenized_4k" 53 | # sequence length 54 | SEQ_LEN=4096 55 | # pre-compilation steps 56 | PRE_COMPILATION_STEPS_COUNT=2 57 | # training job steps 58 | STEPS_THIS_RUN=-1 59 | # output directory 60 | OUTPUT_DIR="/llama_checkpoints" 61 | # S3 checkpoint directory 62 | CURRENT_BATCH_JOB_ID=$(echo "$AWS_BATCH_JOB_ID" | sed 's/#.*//') 63 | CHECKPOINT_PATH="$CHECKPOINT_SAVE_URI$CURRENT_BATCH_JOB_ID" 64 | 65 | if [ -v AWS_BATCH_JOB_MAIN_NODE_PRIVATE_IPV4_ADDRESS ] 66 | then 67 | export MASTER_ADDR=$AWS_BATCH_JOB_MAIN_NODE_PRIVATE_IPV4_ADDRESS 68 | else 69 | export MASTER_ADDR=`ip -f inet addr show eth0 | grep -Po 'inet \K[\d.]+'` 70 | fi 71 | 72 | DP=$(($NEURON_RT_NUM_CORES * $NTASKS / $TP_DEGREE)) 73 | ACC_STEPS=$(($GBS / $MBS / $DP)) 74 | 75 | EXTRA_ARGS=" " 76 | if [ $USE_MIX_PRECISION -gt 0 ]; then 77 | EXTRA_ARGS+=" --use_mix_precision" 78 | fi 79 | if [ $USE_ZERO_1 -gt 0 ]; then 80 | EXTRA_ARGS+=" --use_zero_1" 81 | fi 82 | 83 | DISTRIBUTED_ARGS="--nproc_per_node $PROCESSES_PER_NODE --nnodes $NTASKS --node_rank $NODEID --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 84 | 85 | TRAINING_ARGS="--model_path $MODEL_PATH --data_dir $DATA_PATH --tensor_parallel_size $TP_DEGREE --batch_size $MBS \ 86 | --max_steps $TOTAL_STEPS --warmup_steps $WARMUP_STEPS --lr $LR --grad_accum_usteps $ACC_STEPS --seq_len $SEQ_LEN --sequence_parallel_enabled \ 87 | --selective_checkpoint_enabled --logging_interval 10 --output_dir $OUTPUT_DIR $EXTRA_ARGS" 88 | 89 | TORCH_RUN_COMMAND="torchrun $DISTRIBUTED_ARGS tp_zero1_llama2_7b_hf_pretrain.py $TRAINING_ARGS" 90 | 91 | set 92 | echo "Installing all dependencies..." 93 | python3 -m pip install -r requirements.txt 94 | 95 | # Downloading the pre-tokenized dataset from s3 96 | echo "Downloading tokenized dataset..." 97 | aws s3 cp $TOKENIZED_DATASET_URI $DATA_PATH --recursive --only-show-errors 98 | 99 | # Running Pre-Compilation 100 | if [ "$DO_PRE_COMPILATION" = true ]; then 101 | echo "Starting neuron parallel compilation..." 
102 | neuron_parallel_compile $TORCH_RUN_COMMAND --steps_this_run $PRE_COMPILATION_STEPS_COUNT 103 | fi 104 | 105 | # Running Training Job 106 | echo "Starting the training job..." 107 | $TORCH_RUN_COMMAND --steps_this_run $STEPS_THIS_RUN 108 | 109 | # Uploading checkpoints to S3 110 | aws s3 cp $OUTPUT_DIR $CHECKPOINT_PATH --recursive --only-show-errors 111 | echo "Saved the checkpoints to $CHECKPOINT_PATH" -------------------------------------------------------------------------------- /torch-neuronx/training/aws-batch/llama2/images/aws-batch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-neuron/aws-neuron-samples/15be8c363a3cbcf7d0795f782085a3b0b919e599/torch-neuronx/training/aws-batch/llama2/images/aws-batch.png -------------------------------------------------------------------------------- /torch-neuronx/training/aws-batch/llama2/scripts/build_and_push_docker_image.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | export DOCKER_BUILDKIT=1 5 | 6 | pushd ./docker 7 | # Build a Neuron container image for running the Llama2 training job on AWS Batch and push the image to ECR 8 | # Authenticate with ECR, build & push the image 9 | aws ecr get-login-password --region $REGION | docker login --username AWS \ 10 | --password-stdin $BASE_IMAGE_REPO \ 11 | && docker build . -t aws-batch:latest \ 12 | --build-arg BASE_IMAGE_REPO=$BASE_IMAGE_REPO \ 13 | --build-arg BASE_IMAGE_NAME=$BASE_IMAGE_NAME \ 14 | --build-arg BASE_IMAGE_TAG=$BASE_IMAGE_TAG 15 | 16 | aws ecr get-login-password --region $REGION | docker login --username AWS \ 17 | --password-stdin $ECR_REPO \ 18 | && docker tag aws-batch:latest $ECR_REPO:latest \ 19 | && docker push $ECR_REPO:latest 20 | popd -------------------------------------------------------------------------------- /torch-neuronx/training/aws-batch/llama2/scripts/cleanup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | aws ec2 delete-placement-group --group-name $PLACEMENT_GROUP_NAME # deleting the placement group 5 | aws ec2 delete-launch-template --launch-template-name $LAUNCH_TEMPLATE_NAME # deleting the launch template 6 | 7 | aws batch update-job-queue --job-queue $JOB_QUEUE_NAME --state DISABLED # disabling the job queue 8 | while [[ ! $( aws batch describe-job-queues --job-queues $JOB_QUEUE_NAME | jq -r ".jobQueues[].state") =~ DISABLED ]] 9 | do 10 | echo -n "." 11 | sleep 2 12 | done 13 | aws batch delete-job-queue --job-queue $JOB_QUEUE_NAME # deleting the job queue 14 | while [[ $(aws batch describe-job-queues --job-queues $JOB_QUEUE_NAME | jq -r '.jobQueues | length') -ne 0 ]]; do 15 | echo -n "." 16 | sleep 5 17 | done 18 | 19 | aws batch update-compute-environment --compute-environment $COMPUTE_ENV_NAME --state DISABLED # disabling the compute environment 20 | while [[ ! $(aws batch describe-compute-environments --compute-environments $COMPUTE_ENV_NAME | jq -r ".computeEnvironments[].status") =~ VALID ]] 21 | do 22 | echo -n "." 23 | sleep 5 24 | done 25 | aws batch delete-compute-environment --compute-environment $COMPUTE_ENV_NAME # deleting the compute environment 26 | aws batch deregister-job-definition --job-definition $JOB_DEF_NAME # deregistering the aws batch job definition 27 | echo -e "\nCleaned up all the resources."
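Note: `cleanup.sh` above and `create_resources.sh` below repeat the same describe/jq/sleep polling pattern for every AWS Batch resource. A small shell helper could factor that pattern out; the sketch below is illustrative only (the `wait_for` function is hypothetical and not part of this repo):

```
# Hypothetical helper: poll a describe command until the jq-extracted field
# matches the expected value, printing a dot between attempts.
wait_for () {
  local describe="$1" filter="$2" expected="$3"
  while [[ ! $($describe | jq -r "$filter") =~ $expected ]]; do
    echo -n "."
    sleep 2
  done
}

# Example: block until the compute environment becomes VALID
wait_for "aws batch describe-compute-environments --compute-environments $COMPUTE_ENV_NAME" \
         '.computeEnvironments[].status' VALID
```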
-------------------------------------------------------------------------------- /torch-neuronx/training/aws-batch/llama2/scripts/create_resources.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | if [ ! `which jq` ] 5 | then 6 | echo "Please install jq and re-run this script" && exit 1 7 | fi 8 | 9 | aws ec2 create-placement-group --group-name $PLACEMENT_GROUP_NAME --strategy cluster # creating the placement group 10 | aws ec2 create-launch-template --cli-input-json file://build/launch_template.json # creating the aws launch template 11 | aws batch create-compute-environment --cli-input-json file://build/compute_env.json # creating the aws batch compute environment 12 | 13 | aws batch register-job-definition --cli-input-json file://build/job_def.json # creating the aws batch job definition 14 | while [[ ! $(aws batch describe-compute-environments --compute-environments $COMPUTE_ENV_NAME | jq -r ".computeEnvironments[].status") =~ VALID ]] 15 | do 16 | echo -n "." 17 | sleep 2 18 | done 19 | 20 | aws batch create-job-queue --cli-input-json file://build/job_queue.json # creating the aws batch job queue 21 | while [[ ! $(aws batch describe-job-queues --job-queues $JOB_QUEUE_NAME | jq -r ".jobQueues[].status") =~ VALID ]] 22 | do 23 | echo -n "." 24 | sleep 2 25 | done -------------------------------------------------------------------------------- /torch-neuronx/training/aws-batch/llama2/scripts/download_and_tokenize_data.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -eu 3 | 4 | # installing the requirements 5 | python3 -m pip install transformers regex datasets sentencepiece protobuf==3.20.* 6 | 7 | # downloading and tokenizing the dataset 8 | cd ./data 9 | python3 get_dataset.py 10 | 11 | # pushing the tokenized dataset to predefined S3 location 12 | aws s3 cp ~/examples_datasets/wikicorpus_llama2_7B_tokenized_4k/ $TOKENIZED_DATASET_URI --recursive --only-show-errors 13 | echo "Dataset has been processed and tokenized data has been uploaded to $TOKENIZED_DATASET_URI successfully." 14 | -------------------------------------------------------------------------------- /torch-neuronx/training/aws-batch/llama2/scripts/submit_batch_job.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eu 3 | 4 | # submitting aws batch job 5 | aws batch submit-job \ 6 | --job-name $JOB_NAME \ 7 | --job-queue $JOB_QUEUE_NAME \ 8 | --job-definition $JOB_DEF_NAME \ 9 | --node-overrides numNodes=4 -------------------------------------------------------------------------------- /torch-neuronx/training/aws-batch/llama2/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | # Read variables from config file 5 | source config.txt 6 | 7 | export REGION 8 | export SUBNET 9 | export SG 10 | export ECR_REPO 11 | export INSTANCE_ROLE 12 | export DO_PRE_COMPILATION 13 | export TOKENIZED_DATASET_URI 14 | export NEURON_COMPILE_CACHE_URI 15 | export CHECKPOINT_SAVE_URI 16 | 17 | # ECR repo and image details.
You can locate the correct Neuron DLC image for 'training' on the AWS DLC GitHub page - https://github.com/aws/deep-learning-containers/blob/master/available_images.md#neuron-containers 18 | export BASE_IMAGE_REPO=763104351884.dkr.ecr.$REGION.amazonaws.com 19 | export BASE_IMAGE_NAME=pytorch-training-neuronx 20 | export BASE_IMAGE_TAG=1.13.1-neuronx-py310-sdk2.18.0-ubuntu20.04 21 | export ECS_AMI_NAME=/aws/service/ecs/optimized-ami/amazon-linux-2023/recommended/image_id 22 | export ECS_AMI=$(aws ssm get-parameter --region $REGION --name $ECS_AMI_NAME | jq -r .Parameter.Value) 23 | 24 | export PLACEMENT_GROUP_NAME=aws-batch-placement-group 25 | export LAUNCH_TEMPLATE_NAME=aws-batch-launch-template 26 | export COMPUTE_ENV_NAME=aws-batch-compute-environment 27 | export JOB_QUEUE_NAME=aws-batch-job-queue 28 | export JOB_DEF_NAME=aws-batch-job-definition 29 | export JOB_NAME=aws-batch-job 30 | 31 | export USER_DATA=$(cat << EOF | base64 -w0 32 | MIME-Version: 1.0 33 | Content-Type: multipart/mixed; boundary="==MYBOUNDARY==" 34 | 35 | --==MYBOUNDARY== 36 | Content-Type: text/cloud-boothook; charset="us-ascii" 37 | 38 | #!/bin/bash 39 | sudo yum install -y libibverbs-utils rdma-core-devel ibacm infiniband-diags-compat librdmacm-utils 40 | cloud-init-per once yum_wget yum install -y wget 41 | cloud-init-per once wget_efa wget -q --timeout=20 https://s3-us-west-2.amazonaws.com/aws-efa-installer/aws-efa-installer-latest.tar.gz -O /tmp/aws-efa-installer-latest.tar.gz 42 | cloud-init-per once tar_efa tar -xf /tmp/aws-efa-installer-latest.tar.gz -C /tmp 43 | pushd /tmp/aws-efa-installer 44 | cloud-init-per once install_efa ./efa_installer.sh -y 45 | popd 46 | 47 | cloud-init-per once efa_info /opt/amazon/efa/bin/fi_info -p efa 48 | 49 | cloud-init-per once neuron_driver1 echo -e "[neuron]\nname=Neuron YUM Repository\nbaseurl=https://yum.repos.neuron.amazonaws.com\nenabled=1\nmetadata_expire=0" | tee /etc/yum.repos.d/neuron.repo > /dev/null 50 | cloud-init-per once neuron_driver2 rpm --import https://yum.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB 51 | cloud-init-per once neuron_driver3 yum update -y 52 | cloud-init-per once neuron_driver4 yum install kernel-devel-\$(uname -r) kernel-headers-\$(uname -r) -y 53 | cloud-init-per once neuron_driver5 yum erase aws-neuronx-dkms -y 54 | cloud-init-per once neuron_driver6 yum install aws-neuronx-dkms-2.* -y 55 | 56 | --==MYBOUNDARY==-- 57 | EOF 58 | ) 59 | 60 | # Creating directories required for setup 61 | mkdir -p ./data 62 | mkdir -p ./build 63 | mkdir -p ./docker/llama2 64 | 65 | # Locating and moving the tokenizer to the required directory 66 | if [[ ! -e "tokenizer.model" ]]; then 67 | echo "Tokenizer file does not exist.
Please ensure the tokenizer file is placed in the root directory and named 'tokenizer.model'." 68 | exit 1 69 | fi 70 | mv tokenizer.model ./data/ 71 | 72 | # Downloading the sample files required for data pre-processing 73 | wget -q -P ./data/ https://raw.githubusercontent.com/aws-neuron/neuronx-distributed/master/examples/training/llama2/get_dataset.py 74 | wget -q -P ./data/ https://raw.githubusercontent.com/aws-neuron/neuronx-distributed/master/examples/training/llama2/tp_zero1_llama2_7b_hf_pretrain/config.json 75 | 76 | # Substituting environment variables in the required files 77 | for template in ./templates/*.json; do envsubst < $template > ./build/`basename $template`; done 78 | for script in ./scripts/*.sh; do envsubst < $script > ./`basename $script`; chmod u+x ./`basename $script`; done 79 | envsubst '$DO_PRE_COMPILATION $NEURON_COMPILE_CACHE_URI $CHECKPOINT_SAVE_URI $TOKENIZED_DATASET_URI' < ./docker/llama_batch_training.sh > ./docker/llama2/llama_batch_training.sh 80 | 81 | # Downloading the sample files required for Llama training 82 | pushd . > /dev/null 83 | cd ./docker/llama2 84 | wget -q https://raw.githubusercontent.com/aws-neuron/neuronx-distributed/master/examples/training/llama2/tp_zero1_llama2_7b_hf_pretrain/tp_zero1_llama2_7b_hf_pretrain.py 85 | wget -q https://raw.githubusercontent.com/aws-neuron/neuronx-distributed/master/examples/training/llama2/tp_zero1_llama2_7b_hf_pretrain/logger.py 86 | wget -q https://raw.githubusercontent.com/aws-neuron/neuronx-distributed/master/examples/training/llama2/modeling_llama_nxd.py 87 | wget -q https://raw.githubusercontent.com/aws-neuron/neuronx-distributed/master/examples/training/llama2/requirements.txt 88 | wget -q https://raw.githubusercontent.com/aws-neuron/neuronx-distributed/master/examples/training/llama2/tp_zero1_llama2_7b_hf_pretrain/config.json 89 | wget -q https://raw.githubusercontent.com/aws-neuron/neuronx-distributed/master/examples/training/llama2/training_utils.py 90 | popd > /dev/null 91 | echo "Setup has been completed successfully."
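Note: the Llama2 AWS Batch scripts are meant to be run from the `llama2` directory in sequence. The order below is an assumption inferred from the dependencies between the scripts (setup.sh renders the envsubst-ed copies of the other scripts into the current directory with values from config.txt baked in):

```
bash setup.sh                      # export config, render templates into ./build and scripts into ./
./build_and_push_docker_image.sh   # build the training image and push it to ECR
./create_resources.sh              # placement group, launch template, compute env, job def, job queue
./download_and_tokenize_data.sh    # tokenize the dataset and upload it to S3
./submit_batch_job.sh              # submit the 4-node training job
./cleanup.sh                       # tear down all AWS Batch resources when finished
```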
-------------------------------------------------------------------------------- /torch-neuronx/training/aws-batch/llama2/templates/compute_env.json: -------------------------------------------------------------------------------- 1 | { 2 | "computeEnvironmentName" : "$COMPUTE_ENV_NAME", 3 | "computeResources" : { 4 | "desiredvCpus" : 0, 5 | "maxvCpus" : 2088, 6 | "minvCpus" : 0, 7 | "instanceRole" : "$INSTANCE_ROLE", 8 | "instanceTypes" : [ 9 | "trn1.32xlarge" 10 | ], 11 | "launchTemplate" : { 12 | "launchTemplateName" : "$LAUNCH_TEMPLATE_NAME", 13 | "version" : "$Latest" 14 | }, 15 | "subnets" : [ 16 | "$SUBNET" 17 | ], 18 | "type" : "EC2" 19 | }, 20 | "state" : "ENABLED", 21 | "type" : "MANAGED" 22 | } -------------------------------------------------------------------------------- /torch-neuronx/training/aws-batch/llama2/templates/job_def.json: -------------------------------------------------------------------------------- 1 | { 2 | "jobDefinitionName" : "$JOB_DEF_NAME", 3 | "nodeProperties" : { 4 | "mainNode" : 0, 5 | "nodeRangeProperties" : [ 6 | { 7 | "container" : { 8 | "image" : "$ECR_REPO:latest", 9 | "linuxParameters" : { 10 | "devices" : [ 11 | { 12 | "hostPath" : "/dev/infiniband/uverbs0" 13 | }, 14 | { 15 | "hostPath" : "/dev/infiniband/uverbs1" 16 | }, 17 | { 18 | "hostPath" : "/dev/infiniband/uverbs2" 19 | }, 20 | { 21 | "hostPath" : "/dev/infiniband/uverbs3" 22 | }, 23 | { 24 | "hostPath" : "/dev/infiniband/uverbs4" 25 | }, 26 | { 27 | "hostPath" : "/dev/infiniband/uverbs5" 28 | }, 29 | { 30 | "hostPath" : "/dev/infiniband/uverbs6" 31 | }, 32 | { 33 | "hostPath" : "/dev/infiniband/uverbs7" 34 | }, 35 | { 36 | "hostPath": "/dev/neuron0" 37 | }, 38 | { 39 | "hostPath": "/dev/neuron1" 40 | }, 41 | { 42 | "hostPath": "/dev/neuron2" 43 | }, 44 | { 45 | "hostPath": "/dev/neuron3" 46 | }, 47 | { 48 | "hostPath": "/dev/neuron4" 49 | }, 50 | { 51 | "hostPath": "/dev/neuron5" 52 | }, 53 | { 54 | "hostPath": "/dev/neuron6" 55 | }, 56 | { 57 | "hostPath": "/dev/neuron7" 58 | }, 59 | { 60 | "hostPath": "/dev/neuron8" 61 | }, 62 | { 63 | "hostPath": "/dev/neuron9" 64 | }, 65 | { 66 | "hostPath": "/dev/neuron10" 67 | }, 68 | { 69 | "hostPath": "/dev/neuron11" 70 | }, 71 | { 72 | "hostPath": "/dev/neuron12" 73 | }, 74 | { 75 | "hostPath": "/dev/neuron13" 76 | }, 77 | { 78 | "hostPath": "/dev/neuron14" 79 | }, 80 | { 81 | "hostPath": "/dev/neuron15" 82 | } 83 | ] 84 | }, 85 | "memory" : 500000, 86 | "ulimits" : [ 87 | { 88 | "hardLimit" : -1, 89 | "name" : "memlock", 90 | "softLimit" : -1 91 | } 92 | ], 93 | "user" : "root", 94 | "vcpus" : 96, 95 | "instanceType" : "trn1.32xlarge" 96 | }, 97 | "targetNodes" : "0:" 98 | } 99 | ], 100 | "numNodes" : 4 101 | }, 102 | "type" : "multinode" 103 | } -------------------------------------------------------------------------------- /torch-neuronx/training/aws-batch/llama2/templates/job_queue.json: -------------------------------------------------------------------------------- 1 | { 2 | "computeEnvironmentOrder" : [ 3 | { 4 | "computeEnvironment" : "$COMPUTE_ENV_NAME", 5 | "order" : 1 6 | } 7 | ], 8 | "jobQueueName" : "$JOB_QUEUE_NAME", 9 | "priority" : 1, 10 | "state" : "ENABLED" 11 | } -------------------------------------------------------------------------------- /torch-neuronx/training/aws-batch/llama2/templates/launch_template.json: -------------------------------------------------------------------------------- 1 | { 2 | "DryRun": false, 3 | "LaunchTemplateName": "$LAUNCH_TEMPLATE_NAME", 4 | "VersionDescription": "Override Template", 5 | 
"LaunchTemplateData": { 6 | "IamInstanceProfile": { 7 | "Arn": "$INSTANCE_ROLE" 8 | }, 9 | "InstanceType" : "trn1.32xlarge", 10 | "Placement" : { 11 | "GroupName" : "$PLACEMENT_GROUP_NAME" 12 | }, 13 | "BlockDeviceMappings": [ 14 | { 15 | "DeviceName": "/dev/xvda", 16 | "Ebs": { 17 | "VolumeSize": 200, 18 | "DeleteOnTermination": true 19 | } 20 | } 21 | ], 22 | "ImageId": "$ECS_AMI", 23 | "Monitoring": { 24 | "Enabled": true 25 | }, 26 | "DisableApiTermination": false, 27 | "InstanceInitiatedShutdownBehavior": "stop", 28 | "UserData": "$USER_DATA", 29 | "TagSpecifications": [ 30 | { 31 | "ResourceType": "instance", 32 | "Tags": [ 33 | { 34 | "Key": "purpose", 35 | "Value": "batch multinode training" 36 | } 37 | ] 38 | } 39 | ], 40 | "MetadataOptions": { 41 | "HttpTokens": "required", 42 | "HttpPutResponseHopLimit": 5, 43 | "HttpEndpoint": "enabled" 44 | }, 45 | "NetworkInterfaces" : [ 46 | { 47 | "Description" : "NetworkInterfaces Configuration For EFA and Batch", 48 | "DeviceIndex" : 0, 49 | "Groups" : [ 50 | "$SG" 51 | ], 52 | "InterfaceType" : "efa", 53 | "NetworkCardIndex" : 0, 54 | "SubnetId" : "$SUBNET" 55 | }, 56 | { 57 | "Description" : "NetworkInterfaces Configuration For EFA and Batch", 58 | "DeviceIndex" : 1, 59 | "Groups" : [ 60 | "$SG" 61 | ], 62 | "InterfaceType" : "efa", 63 | "NetworkCardIndex" : 1, 64 | "SubnetId" : "$SUBNET" 65 | }, 66 | { 67 | "Description" : "NetworkInterfaces Configuration For EFA and Batch", 68 | "DeviceIndex" : 1, 69 | "Groups" : [ 70 | "$SG" 71 | ], 72 | "InterfaceType" : "efa", 73 | "NetworkCardIndex" : 2, 74 | "SubnetId" : "$SUBNET" 75 | }, 76 | { 77 | "Description" : "NetworkInterfaces Configuration For EFA and Batch", 78 | "DeviceIndex" : 1, 79 | "Groups" : [ 80 | "$SG" 81 | ], 82 | "InterfaceType" : "efa", 83 | "NetworkCardIndex" : 3, 84 | "SubnetId" : "$SUBNET" 85 | }, 86 | { 87 | "Description" : "NetworkInterfaces Configuration For EFA and Batch", 88 | "DeviceIndex" : 1, 89 | "Groups" : [ 90 | "$SG" 91 | ], 92 | "InterfaceType" : "efa", 93 | "NetworkCardIndex" : 4, 94 | "SubnetId" : "$SUBNET" 95 | }, 96 | { 97 | "Description" : "NetworkInterfaces Configuration For EFA and Batch", 98 | "DeviceIndex" : 1, 99 | "Groups" : [ 100 | "$SG" 101 | ], 102 | "InterfaceType" : "efa", 103 | "NetworkCardIndex" : 5, 104 | "SubnetId" : "$SUBNET" 105 | }, 106 | { 107 | "Description" : "NetworkInterfaces Configuration For EFA and Batch", 108 | "DeviceIndex" : 1, 109 | "Groups" : [ 110 | "$SG" 111 | ], 112 | "InterfaceType" : "efa", 113 | "NetworkCardIndex" : 6, 114 | "SubnetId" : "$SUBNET" 115 | }, 116 | { 117 | "Description" : "NetworkInterfaces Configuration For EFA and Batch", 118 | "DeviceIndex" : 1, 119 | "Groups" : [ 120 | "$SG" 121 | ], 122 | "InterfaceType" : "efa", 123 | "NetworkCardIndex" : 7, 124 | "SubnetId" : "$SUBNET" 125 | } 126 | ] 127 | }, 128 | "TagSpecifications": [ 129 | { 130 | "ResourceType": "launch-template", 131 | "Tags": [ 132 | { 133 | "Key": "purpose", 134 | "Value": "batch training" 135 | } 136 | ] 137 | } 138 | ] 139 | } 140 | -------------------------------------------------------------------------------- /torch-neuronx/training/common/hf_utils.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import Optional 3 | 4 | from torch.utils.data import DataLoader, Dataset 5 | import torch_xla.distributed.parallel_loader as xpl 6 | from transformers import Trainer, TrainingArguments 7 | 8 | 9 | @dataclass 10 | class 
TrnTrainingArguments(TrainingArguments): 11 | loader_prefetch_size: Optional[int] = field( 12 | default=8, 13 | metadata={"help": "The max capacity of the queue used by the thread which is reading samples from the loader."}, 14 | ) 15 | device_prefetch_size: Optional[int] = field( 16 | default=4, 17 | metadata={"help": "The max size of the per-device queues, where the worker threads deposit tensors which have already been sent to devices."}, 18 | ) 19 | host_to_device_transfer_threads: Optional[int] = field( 20 | default=1, 21 | metadata={"help": "The number of threads that work in parallel to transfer data from loader queue to device queue."}, 22 | ) 23 | @property 24 | def _no_sync_in_gradient_accumulation(self): 25 | return False 26 | 27 | 28 | class TrnTrainer(Trainer): 29 | def get_train_dataloader(self) -> DataLoader: 30 | train_loader = super().get_train_dataloader() 31 | kwargs = { 32 | "loader_prefetch_size": self.args.loader_prefetch_size, 33 | "device_prefetch_size": self.args.device_prefetch_size, 34 | "host_to_device_transfer_threads": self.args.host_to_device_transfer_threads 35 | } 36 | if isinstance(train_loader, xpl.MpDeviceLoader): 37 | train_loader._parallel_loader_kwargs = kwargs 38 | return train_loader 39 | 40 | def get_eval_dataloader(self, eval_dataset: Optional[Dataset] = None) -> DataLoader: 41 | eval_loader = super().get_eval_dataloader(eval_dataset) 42 | kwargs = { 43 | "loader_prefetch_size": self.args.loader_prefetch_size, 44 | "device_prefetch_size": self.args.device_prefetch_size, 45 | "host_to_device_transfer_threads": self.args.host_to_device_transfer_threads 46 | } 47 | if isinstance(eval_loader, xpl.MpDeviceLoader): 48 | eval_loader._parallel_loader_kwargs = kwargs 49 | return eval_loader 50 | 51 | def get_test_dataloader(self, test_dataset: Dataset) -> DataLoader: 52 | test_loader = super().get_test_dataloader(test_dataset) 53 | kwargs = { 54 | "loader_prefetch_size": self.args.loader_prefetch_size, 55 | "device_prefetch_size": self.args.device_prefetch_size, 56 | "host_to_device_transfer_threads": self.args.host_to_device_transfer_threads 57 | } 58 | if isinstance(test_loader, xpl.MpDeviceLoader): 59 | test_loader._parallel_loader_kwargs = kwargs 60 | return test_loader -------------------------------------------------------------------------------- /torch-neuronx/training/customop_mlp/README.md: -------------------------------------------------------------------------------- 1 | # Torch Neuron CustomOp MLP 2 | 3 | This folder contains examples of Torch custom operators for a multi-layer perceptron (MLP) model. 4 | 5 | - The `pytorch` folder contains a basic PyTorch (non-neuron) CPU-based MLP model with a custom Relu operator and training script. 6 | - The `neuron` folder contains the same model converted to Neuron, with an XLA-based training script for trn1-based instances.
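A typical run of the `neuron` variant (a sketch; the file names come from this folder) first builds the shared library and then launches training on a trn1 instance:

```
cd neuron
python build.py    # compiles relu.cpp/shape.cpp into librelu.so via torch_neuronx custom_op
python train.py    # trains the MLP on MNIST using the custom Relu op on a NeuronCore
```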
-------------------------------------------------------------------------------- /torch-neuronx/training/customop_mlp/neuron/build.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch_neuronx 3 | from torch_neuronx.xla_impl import custom_op 4 | 5 | custom_op.load( 6 | name='relu', 7 | compute_srcs=['relu.cpp'], 8 | shape_srcs=['shape.cpp'], 9 | build_directory=os.getcwd() 10 | ) 11 | -------------------------------------------------------------------------------- /torch-neuronx/training/customop_mlp/neuron/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.nn import functional as F 4 | import my_ops 5 | 6 | # Declare 3-layer MLP for MNIST dataset 7 | class MLP(nn.Module): 8 | def __init__(self, input_size = 28 * 28, output_size = 10, layers = [120, 84]): 9 | super(MLP, self).__init__() 10 | self.fc1 = nn.Linear(input_size, layers[0]) 11 | self.fc2 = nn.Linear(layers[0], layers[1]) 12 | self.fc3 = nn.Linear(layers[1], output_size) 13 | 14 | def forward(self, x): 15 | f1 = self.fc1(x) 16 | r1 = my_ops.Relu.apply(f1) 17 | f2 = self.fc2(r1) 18 | r2 = my_ops.Relu.apply(f2) 19 | f3 = self.fc3(r2) 20 | return torch.log_softmax(f3, dim=1) 21 | -------------------------------------------------------------------------------- /torch-neuronx/training/customop_mlp/neuron/my_ops.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch_neuronx 3 | from torch_neuronx.xla_impl import custom_op 4 | 5 | custom_op.load_library('librelu.so') 6 | 7 | class Relu(torch.autograd.Function): 8 | @staticmethod 9 | def forward(ctx, input): 10 | ctx.save_for_backward(input) 11 | return torch.ops.my_ops.relu_forward(input) 12 | 13 | @staticmethod 14 | def backward(ctx, grad): 15 | input, = ctx.saved_tensors 16 | return torch.ops.my_ops.relu_backward(grad, input), None 17 | 18 | 19 | -------------------------------------------------------------------------------- /torch-neuronx/training/customop_mlp/neuron/relu.cpp: -------------------------------------------------------------------------------- 1 | #include <stdint.h> 2 | #include <stdlib.h> 3 | #include <torch/torch.h> 4 | 5 | torch::Tensor relu_forward(const torch::Tensor& t_in) { 6 | torch::Tensor t_out = torch::zeros(t_in.sizes(), torch::kFloat); 7 | auto t_in_acc = t_in.accessor<float, 2>(); 8 | auto t_out_acc = t_out.accessor<float, 2>(); 9 | auto shape = t_in.sizes(); 10 | for (int i = 0; i < shape[0]; i++) { 11 | for (int j = 0; j < shape[1]; j++) { 12 | t_out_acc[i][j] = t_in_acc[i][j] > 0.0 ? t_in_acc[i][j] : 0.0; 13 | } 14 | } 15 | return t_out; 16 | } 17 | 18 | torch::Tensor relu_backward(const torch::Tensor& t_grad, const torch::Tensor& t_in) { 19 | torch::Tensor t_out = torch::zeros(t_in.sizes(), torch::kFloat); 20 | auto t_in_acc = t_in.accessor<float, 2>(); 21 | auto t_grad_acc = t_grad.accessor<float, 2>(); 22 | auto t_out_acc = t_out.accessor<float, 2>(); 23 | auto shape = t_in.sizes(); 24 | for (int i = 0; i < shape[0]; i++) { 25 | for (int j = 0; j < shape[1]; j++) { 26 | t_out_acc[i][j] = t_in_acc[i][j] > 0.0 ?
t_grad_acc[i][j] : 0.0; 27 | } 28 | } 29 | return t_out; 30 | } 31 | -------------------------------------------------------------------------------- /torch-neuronx/training/customop_mlp/neuron/shape.cpp: -------------------------------------------------------------------------------- 1 | #include <stdint.h> 2 | #include <stdlib.h> 3 | #include <torch/torch.h> 4 | #include "torchneuron/register.h" 5 | 6 | torch::Tensor relu_fwd_shape(torch::Tensor t_in) { 7 | torch::Tensor t_out = torch::zeros(t_in.sizes(), torch::kFloat); 8 | return t_out; 9 | } 10 | 11 | torch::Tensor relu_bwd_shape(torch::Tensor t_grad, torch::Tensor t_in) { 12 | torch::Tensor t_out = torch::zeros(t_in.sizes(), torch::kFloat); 13 | return t_out; 14 | } 15 | 16 | NEURON_LIBRARY(my_ops, m) { 17 | m.def("relu_forward", &relu_fwd_shape, "relu_forward"); 18 | m.def("relu_backward", &relu_bwd_shape, "relu_backward"); 19 | } 20 | -------------------------------------------------------------------------------- /torch-neuronx/training/customop_mlp/neuron/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import torch 4 | from model import MLP 5 | 6 | from torchvision.datasets import mnist 7 | from torch.utils.data import DataLoader 8 | from torchvision.transforms import ToTensor 9 | 10 | # XLA imports 11 | import torch_xla.core.xla_model as xm 12 | 13 | # Global constants 14 | EPOCHS = 4 15 | WARMUP_STEPS = 2 16 | BATCH_SIZE = 32 17 | 18 | # Load MNIST train dataset 19 | train_dataset = mnist.MNIST(root='./MNIST_DATA_train', 20 | train=True, download=True, transform=ToTensor()) 21 | 22 | def main(): 23 | # Prepare data loader 24 | train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE) 25 | 26 | # Fix the random number generator seeds for reproducibility 27 | torch.manual_seed(0) 28 | 29 | # XLA: Specify XLA device (defaults to a NeuronCore on Trn1 instance) 30 | device = 'xla' 31 | 32 | # Move model to device and declare optimizer and loss function 33 | model = MLP().to(device) 34 | optimizer = torch.optim.SGD(model.parameters(), lr=0.01) 35 | loss_fn = torch.nn.NLLLoss() 36 | 37 | # Run the training loop 38 | print('----------Training ---------------') 39 | model.train() 40 | for epoch in range(EPOCHS): 41 | start = time.time() 42 | for idx, (train_x, train_label) in enumerate(train_loader): 43 | optimizer.zero_grad() 44 | train_x = train_x.view(train_x.size(0), -1) 45 | train_x = train_x.to(device) 46 | train_label = train_label.to(device) 47 | output = model(train_x) 48 | loss = loss_fn(output, train_label) 49 | loss.backward() 50 | optimizer.step() 51 | xm.mark_step() # XLA: collect ops and run them in XLA runtime 52 | if idx < WARMUP_STEPS: # skip warmup iterations 53 | start = time.time() 54 | # Compute statistics for the last epoch 55 | interval = idx - WARMUP_STEPS # skip warmup iterations 56 | throughput = interval / (time.time() - start) 57 | print("Train throughput (iter/sec): {}".format(throughput)) 58 | print("Final loss is {:0.4f}".format(loss.detach().to('cpu'))) 59 | 60 | # Save checkpoint for evaluation 61 | os.makedirs("checkpoints", exist_ok=True) 62 | checkpoint = {'state_dict': model.state_dict()} 63 | # XLA: use xm.save instead of torch.save to ensure states are moved back to cpu 64 | # This can prevent "XRT memory handle not found" at end of test.py execution 65 | xm.save(checkpoint,'checkpoints/checkpoint.pt') 66 | 67 | print('----------End Training ---------------') 68 | 69 | if __name__ == '__main__': 70 | main() 71 | 72 |
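Before a full training run, the custom op can be sanity-checked against `torch.relu`. The snippet below is a minimal sketch (not part of the repo); it assumes `build.py` has already produced `librelu.so` and that a NeuronCore is available:

```
# Minimal sketch (assumption): compare the custom Relu with torch.relu on device.
import torch
import torch_xla.core.xla_model as xm
import my_ops  # loads librelu.so built by build.py

device = xm.xla_device()
x = torch.randn(4, 8)
y = my_ops.Relu.apply(x.to(device))
xm.mark_step()  # execute the pending XLA graph
assert torch.allclose(y.cpu(), torch.relu(x)), "custom Relu does not match torch.relu"
print("custom Relu matches torch.relu")
```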
-------------------------------------------------------------------------------- /torch-neuronx/training/customop_mlp/pytorch/build.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from torch.utils import cpp_extension 4 | 5 | cpp_extension.load( 6 | name='librelu', 7 | sources=['relu.cpp'], 8 | is_python_module=False, 9 | build_directory=os.getcwd() 10 | ) 11 | -------------------------------------------------------------------------------- /torch-neuronx/training/customop_mlp/pytorch/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.nn import functional as F 4 | import my_ops 5 | 6 | # Declare 3-layer MLP for MNIST dataset 7 | class MLP(nn.Module): 8 | def __init__(self, input_size = 28 * 28, output_size = 10, layers = [120, 84]): 9 | super(MLP, self).__init__() 10 | self.fc1 = nn.Linear(input_size, layers[0]) 11 | self.fc2 = nn.Linear(layers[0], layers[1]) 12 | self.fc3 = nn.Linear(layers[1], output_size) 13 | 14 | def forward(self, x): 15 | f1 = self.fc1(x) 16 | r1 = my_ops.Relu.apply(f1) 17 | f2 = self.fc2(r1) 18 | r2 = my_ops.Relu.apply(f2) 19 | f3 = self.fc3(r2) 20 | return torch.log_softmax(f3, dim=1) 21 | -------------------------------------------------------------------------------- /torch-neuronx/training/customop_mlp/pytorch/my_ops.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | torch.ops.load_library('librelu.so') 4 | 5 | class Relu(torch.autograd.Function): 6 | @staticmethod 7 | def forward(ctx, input): 8 | ctx.save_for_backward(input) 9 | return torch.ops.my_ops.relu_forward(input) 10 | 11 | @staticmethod 12 | def backward(ctx, grad): 13 | input, = ctx.saved_tensors 14 | return torch.ops.my_ops.relu_backward(grad, input), None 15 | 16 | 17 | -------------------------------------------------------------------------------- /torch-neuronx/training/customop_mlp/pytorch/relu.cpp: -------------------------------------------------------------------------------- 1 | #include <stdint.h> 2 | #include <stdlib.h> 3 | #include <torch/torch.h> 4 | 5 | torch::Tensor relu_forward(const torch::Tensor& t_in) { 6 | torch::Tensor t_out = torch::zeros(t_in.sizes(), torch::kFloat); 7 | auto t_in_acc = t_in.accessor<float, 2>(); 8 | auto t_out_acc = t_out.accessor<float, 2>(); 9 | auto shape = t_in.sizes(); 10 | for (int i = 0; i < shape[0]; i++) { 11 | for (int j = 0; j < shape[1]; j++) { 12 | t_out_acc[i][j] = t_in_acc[i][j] > 0.0 ? t_in_acc[i][j] : 0.0; 13 | } 14 | } 15 | return t_out; 16 | } 17 | 18 | torch::Tensor relu_backward(const torch::Tensor& t_grad, const torch::Tensor& t_in) { 19 | torch::Tensor t_out = torch::zeros(t_in.sizes(), torch::kFloat); 20 | auto t_in_acc = t_in.accessor<float, 2>(); 21 | auto t_grad_acc = t_grad.accessor<float, 2>(); 22 | auto t_out_acc = t_out.accessor<float, 2>(); 23 | auto shape = t_in.sizes(); 24 | for (int i = 0; i < shape[0]; i++) { 25 | for (int j = 0; j < shape[1]; j++) { 26 | t_out_acc[i][j] = t_in_acc[i][j] > 0.0 ?
t_grad_acc[i][j] : 0.0; 27 | } 28 | } 29 | return t_out; 30 | } 31 | 32 | TORCH_LIBRARY(my_ops, m) { 33 | m.def("relu_forward", &relu_forward); 34 | m.def("relu_backward", &relu_backward); 35 | } 36 | -------------------------------------------------------------------------------- /torch-neuronx/training/customop_mlp/pytorch/train_cpu.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import torch 4 | from model import MLP 5 | 6 | from torchvision.datasets import mnist 7 | from torch.utils.data import DataLoader 8 | from torchvision.transforms import ToTensor 9 | 10 | # Global constants 11 | EPOCHS = 4 12 | WARMUP_STEPS = 2 13 | BATCH_SIZE = 32 14 | 15 | # Load MNIST train dataset 16 | train_dataset = mnist.MNIST(root='./MNIST_DATA_train', 17 | train=True, download=True, transform=ToTensor()) 18 | 19 | def main(): 20 | # Prepare data loader 21 | train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE) 22 | 23 | # Fix the random number generator seeds for reproducibility 24 | torch.manual_seed(0) 25 | 26 | # Move model to device and declare optimizer and loss function 27 | device = 'cpu' 28 | model = MLP().to(device) 29 | optimizer = torch.optim.SGD(model.parameters(), lr=0.01) 30 | loss_fn = torch.nn.NLLLoss() 31 | 32 | # Run the training loop 33 | print('----------Training ---------------') 34 | model.train() 35 | for epoch in range(EPOCHS): 36 | start = time.time() 37 | for idx, (train_x, train_label) in enumerate(train_loader): 38 | optimizer.zero_grad() 39 | train_x = train_x.view(train_x.size(0), -1) 40 | train_x = train_x.to(device) 41 | train_label = train_label.to(device) 42 | output = model(train_x) 43 | loss = loss_fn(output, train_label) 44 | loss.backward() 45 | optimizer.step() 46 | if idx < WARMUP_STEPS: # skip warmup iterations 47 | start = time.time() 48 | 49 | # Compute statistics for the last epoch 50 | interval = idx - WARMUP_STEPS # skip warmup iterations 51 | throughput = interval / (time.time() - start) 52 | print("Train throughput (iter/sec): {}".format(throughput)) 53 | print("Final loss is {:0.4f}".format(loss.detach().to('cpu'))) 54 | 55 | # Save checkpoint for evaluation 56 | os.makedirs("checkpoints", exist_ok=True) 57 | checkpoint = {'state_dict': model.state_dict()} 58 | torch.save(checkpoint,'checkpoints/checkpoint.pt') 59 | print('----------End Training ---------------') 60 | 61 | if __name__ == '__main__': 62 | main() 63 | -------------------------------------------------------------------------------- /torch-neuronx/training/dp_bert_hf_pretrain/dp_bert_large_hf_pretrain_hdf5_THIRD-PARTY-LICENSES.txt: -------------------------------------------------------------------------------- 1 | ** PyTorch-XLA MNIST data parallel training script; version 8151971 -- https://github.com/pytorch/xla/blob/master/test/test_train_mp_mnist.py 2 | 3 | Copyright (c) 2018 Google Inc. 4 | All rights reserved. 5 | 6 | 7 | Redistribution and use in source and binary forms, with or without 8 | modification, are permitted provided that the following conditions are met: 9 | 10 | 1. Redistributions of source code must retain the above copyright 11 | notice, this list of conditions and the following disclaimer. 12 | 13 | 2. Redistributions in binary form must reproduce the above copyright 14 | notice, this list of conditions and the following disclaimer in the 15 | documentation and/or other materials provided with the distribution. 16 | 17 | 3. 
Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories 18 | America 19 | and IDIAP Research Institute nor the names of its contributors may be 20 | used to endorse or promote products derived from this software without 21 | specific prior written permission. 22 | 23 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 24 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 26 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 27 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 28 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 29 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 30 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 31 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 32 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 33 | POSSIBILITY OF SUCH DAMAGE. 34 | 35 | ------ 36 | 37 | ** NVidia DeepLearningExamples BERT pretraining script; version 7a4c425 -- https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/LanguageModeling/BERT/run_pretraining.py 38 | 39 | 40 | # Licensed under the Apache License, Version 2.0 (the "License"); 41 | # you may not use this file except in compliance with the License. 42 | # You may obtain a copy of the License at 43 | # 44 | # http://www.apache.org/licenses/LICENSE-2.0 45 | # 46 | # Unless required by applicable law or agreed to in writing, software 47 | # distributed under the License is distributed on an "AS IS" BASIS, 48 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 49 | # See the License for the specific language governing permissions and 50 | # limitations under the License. 51 | * For NVidia DeepLearningExamples BERT pretraining script see also this required 52 | NOTICE: 53 | # coding=utf-8 54 | # Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. 55 | # Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. 56 | team. 
57 | 58 | -------------------------------------------------------------------------------- /torch-neuronx/training/dp_bert_hf_pretrain/requirements.txt: -------------------------------------------------------------------------------- 1 | graphviz 2 | tensorboard==2.14 3 | transformers==4.44.0 4 | evaluate 5 | pillow 6 | pytest 7 | accelerate 8 | datasets==2.19.1 9 | sentencepiece==0.2.0 10 | h5py 11 | requests==2.31.0 12 | huggingface-hub==0.24.5 13 | -------------------------------------------------------------------------------- /torch-neuronx/training/dp_bert_hf_pretrain/run_dp_bert_large_hf_pretrain_bf16_s128.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -o pipefail 3 | 4 | pip3 list | grep -e neuron > run_installed_neuron_pkgs.txt 5 | #apt list | grep neuron >> run_installed_neuron_pkgs.txt 6 | 7 | export NEURON_RT_EXEC_TIMEOUT=600 8 | export NEURON_RT_STOCHASTIC_ROUNDING_SEED=0 9 | export TF_GRPC_DEFAULT_OPTIONS=grpc.keepalive_time_ms=60000,grpc.keepalive_timeout_ms=14400000,grpc.http2.max_pings_without_data=0,grpc.http2.min_ping_interval_without_data_ms=600000 10 | 11 | IMDS_TOKEN=`curl -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600"` 12 | INSTANCEID=`curl -H "X-aws-ec2-metadata-token: $IMDS_TOKEN" -v http://169.254.169.254/latest/meta-data/instance-id` 13 | WORLD_SIZE_JOB=1 14 | RANK_NODE=0 15 | MAX_STEPS=28125 16 | 17 | if [ "$1" == "amp" ]; then 18 | echo "Enable PyTorch Autocast (AMP)" 19 | BATCH_SIZE=16 20 | GRAD_ACCUM_USTEPS=32 21 | ADD_ARGS="--enable_pt_autocast" 22 | elif [ "$1" == "fp32paramscopy" ]; then 23 | echo "Enable BF16 with FP32 copy of weights" 24 | BATCH_SIZE=16 25 | GRAD_ACCUM_USTEPS=32 26 | ADD_ARGS="--optimizer=AdamW_FP32ParamsCopy" 27 | elif [ "$1" == "fp32" ]; then 28 | echo "Enable Full FP32" 29 | BATCH_SIZE=8 30 | GRAD_ACCUM_USTEPS=64 31 | ADD_ARGS="--optimizer=AdamW --enable_fp32" 32 | # XLA_DOWNCAST_BF16 is deprecated in torch-xla 2.4+ 33 | # Switched to using model.to(torch.bfloat16) 34 | else 35 | echo "Enable Full BF16 (model.to(torch.bfloat16)) and FP32 optimizer parameters" 36 | BATCH_SIZE=16 37 | GRAD_ACCUM_USTEPS=32 38 | ADD_ARGS="" 39 | # XLA_DOWNCAST_BF16 is deprecated in torch-xla 2.4+ 40 | # Switched to using model.to(torch.bfloat16) 41 | fi 42 | 43 | if [ -e /opt/aws/neuron/bin/neuron-ls ]; then 44 | NUM_DEVICES=`/opt/aws/neuron/bin/neuron-ls -j | jq '. | length'` 45 | NC_PER_DEVICE=`/opt/aws/neuron/bin/neuron-ls -j | jq '.[0].nc_count'` 46 | echo "Found $NC_PER_DEVICE NeuronCores per device" 47 | 48 | if [ -z "$NUM_DEVICES" ] || [ "$NUM_DEVICES" == "0" ] || [ -z "$NC_PER_DEVICE" ] || [ "$NC_PER_DEVICE" == "null" ]; then 49 | NUM_NEURONCORES=32 50 | echo "Unable to extract device count and nc_count from neuron-ls json output; using default $NUM_NEURONCORES NeuronCores" 51 | else 52 | let NUM_NEURONCORES=$NUM_DEVICES*$NC_PER_DEVICE 53 | echo "Found $NUM_NEURONCORES NeuronCores" 54 | fi 55 | else 56 | NUM_NEURONCORES=32 57 | echo "neuron-ls not installed (aws-neuronx-tools); using default $NUM_NEURONCORES NeuronCores" 58 | fi 59 | DISTRIBUTED_ARGS="--nproc_per_node $NUM_NEURONCORES" 60 | OUTPUT_DIR=output 61 | LOG_FILE=log_ph1_bf16 62 | if [ ! -z "$NEURON_EXTRACT_GRAPHS_ONLY" ]; then 63 | LOG_FILE=${LOG_FILE}_compile 64 | fi 65 | 66 | if [ ! 
-z "$SLURM_NTASKS" ]; then 67 | export FI_EFA_USE_DEVICE_RDMA=1 68 | export FI_PROVIDER=efa 69 | export FI_EFA_FORK_SAFE=1 70 | export BUCKET_CAP_MB=512 71 | export XLA_TRANSFER_SEED_ASYNC=1 72 | WORLD_SIZE_JOB=$SLURM_NTASKS 73 | RANK_NODE=$SLURM_NODEID 74 | MASTER_ADDR=(`scontrol show hostnames $SLURM_JOB_NODELIST`) 75 | MASTER_PORT=2022 76 | GRAD_ACCUM_USTEPS=$(($GRAD_ACCUM_USTEPS/$WORLD_SIZE_JOB)) 77 | DISTRIBUTED_ARGS="--nproc_per_node $NUM_NEURONCORES --nnodes $WORLD_SIZE_JOB --node_rank $RANK_NODE --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 78 | echo $DISTRIBUTED_ARGS 79 | OUTPUT_DIR=output_$SLURM_JOB_ID 80 | LOG_FILE=${LOG_FILE}_${RANK_NODE}_${WORLD_SIZE_JOB} 81 | if [ -z "$NEURON_COMPILE_CACHE_URL" ]; then 82 | CACHE_DIR=$HOME/neuron_cache/bert/`hostname` 83 | export NEURON_CC_FLAGS="--cache_dir=$CACHE_DIR" 84 | fi 85 | export HF_HOME=/tmp/hf_cache/ 86 | mkdir -p $HF_HOME 87 | if [ -e $HOME/.cache/huggingface ]; then 88 | rsync -av $HOME/.cache/huggingface/ $HF_HOME 89 | fi 90 | # HF ver > 4.22: Move cache ahead of time to prevent multiple workers moving at the same time 91 | python -c "import transformers.utils as utils; utils.move_cache()" 92 | fi 93 | 94 | HOST=`hostname` 95 | echo "Hostname: $HOST (instance ID: $INSTANCEID)" 96 | 97 | steps_this_run=$MAX_STEPS 98 | if [ ! -z "$NEURON_EXTRACT_GRAPHS_ONLY" ]; then 99 | steps_this_run=5 100 | fi 101 | 102 | update_test_variables=../../load_test_variables.sh 103 | if [ -e $update_test_variables ]; then 104 | . ./$update_test_variables $@ || echo "Unable to find test env." 105 | fi 106 | mkdir -p $OUTPUT_DIR 107 | if [ -z "$json" ]; then json="$OUTPUT_DIR/results.json" && rm -f $json; fi 108 | 109 | sudo sysctl -w net.ipv4.ip_local_reserved_ports=48620 || exit 1 110 | torchrun $DISTRIBUTED_ARGS dp_bert_large_hf_pretrain_hdf5.py $ADD_ARGS --output_dir $OUTPUT_DIR --steps_this_run $steps_this_run --metrics_file $json --batch_size=$BATCH_SIZE --grad_accum_usteps=$GRAD_ACCUM_USTEPS |& tee $OUTPUT_DIR/$LOG_FILE 111 | 112 | ret_val=${PIPESTATUS[0]} 113 | echo $ret_val 114 | if [ $ret_val -eq 0 ]; then 115 | success=1 116 | else 117 | success=0 118 | fi 119 | 120 | if [ -z "$NEURON_EXTRACT_GRAPHS_ONLY" ]; then 121 | dump_to_s3_update_json_scr=../../dump_to_s3_update_test_json.sh 122 | if [ -e $dump_to_s3_update_json_scr ]; then 123 | $dump_to_s3_update_json_scr $@ --key=inference_success --value=$success || echo "Unable to update test result JSON." 124 | else 125 | echo "WARNING: Script $dump_to_s3_update_json_scr not found. Not updating test result JSON." 
126 | fi 127 | fi 128 | 129 | # copy final checkpoint for ph2 130 | if [ -e $OUTPUT_DIR/ckpt_28125.pt ]; then cp -f $OUTPUT_DIR/ckpt_28125.pt ../; fi 131 | 132 | exit $ret_val 133 | -------------------------------------------------------------------------------- /torch-neuronx/training/dp_bert_hf_pretrain/run_dp_bert_large_hf_pretrain_bf16_s128_lamb.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -o pipefail 3 | 4 | pip3 list | grep -e neuron > run_installed_neuron_pkgs.txt 5 | #apt list | grep neuron >> run_installed_neuron_pkgs.txt 6 | 7 | export NEURON_RT_EXEC_TIMEOUT=600 8 | export NEURON_RT_STOCHASTIC_ROUNDING_SEED=0 9 | export TF_GRPC_DEFAULT_OPTIONS=grpc.keepalive_time_ms=60000,grpc.keepalive_timeout_ms=14400000,grpc.http2.max_pings_without_data=0,grpc.http2.min_ping_interval_without_data_ms=600000 10 | 11 | IMDS_TOKEN=`curl -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600"` 12 | INSTANCEID=`curl -H "X-aws-ec2-metadata-token: $IMDS_TOKEN" -v http://169.254.169.254/latest/meta-data/instance-id` 13 | WORLD_SIZE_JOB=1 14 | RANK_NODE=0 15 | MAX_STEPS=7032 16 | 17 | #keep the GBS=64k to benchmark over GPU 18 | if [ "$1" == "amp" ]; then 19 | echo "Enable PyTorch Autocast (AMP)" 20 | BATCH_SIZE=16 21 | GRAD_ACCUM_USTEPS=128 22 | ADD_ARGS="--enable_pt_autocast" 23 | elif [ "$1" == "fp32" ]; then 24 | echo "Enable Full FP32" 25 | BATCH_SIZE=8 26 | GRAD_ACCUM_USTEPS=256 27 | ADD_ARGS="--optimizer=AdamW --enable_fp32" 28 | # XLA_DOWNCAST_BF16 is deprecated in torch-xla 2.4+ 29 | # Switched to using model.to(torch.bfloat16) 30 | else 31 | echo "Enable Full BF16 (model.to(torch.bfloat16)) and FP32 optimizer parameters" 32 | BATCH_SIZE=16 33 | GRAD_ACCUM_USTEPS=128 34 | ADD_ARGS="" 35 | # XLA_DOWNCAST_BF16 is deprecated in torch-xla 2.4+ 36 | # Switched to using model.to(torch.bfloat16) 37 | fi 38 | 39 | if [ -e /opt/aws/neuron/bin/neuron-ls ]; then 40 | NUM_DEVICES=`/opt/aws/neuron/bin/neuron-ls -j | jq '. | length'` 41 | NC_PER_DEVICE=`/opt/aws/neuron/bin/neuron-ls -j | jq '.[0].nc_count'` 42 | echo "Found $NC_PER_DEVICE NeuronCores per device" 43 | 44 | if [ -z "$NUM_DEVICES" ] || [ "$NUM_DEVICES" == "0" ] || [ -z "$NC_PER_DEVICE" ] || [ "$NC_PER_DEVICE" == "null" ]; then 45 | NUM_NEURONCORES=32 46 | echo "Unable to extract device count and nc_count from neuron-ls json output; using default $NUM_NEURONCORES NeuronCores" 47 | else 48 | let NUM_NEURONCORES=$NUM_DEVICES*$NC_PER_DEVICE 49 | echo "Found $NUM_NEURONCORES NeuronCores" 50 | fi 51 | else 52 | NUM_NEURONCORES=32 53 | echo "neuron-ls not installed (aws-neuronx-tools); using default $NUM_NEURONCORES NeuronCores" 54 | fi 55 | 56 | DISTRIBUTED_ARGS="--nproc_per_node $NUM_NEURONCORES" 57 | OUTPUT_DIR=output 58 | OPT=LAMB 59 | LOG_FILE=log_ph1_bf16 60 | if [ ! -z "$NEURON_EXTRACT_GRAPHS_ONLY" ]; then 61 | LOG_FILE=${LOG_FILE}_compile 62 | fi 63 | 64 | if [ ! 
-z "$SLURM_NTASKS" ]; then 65 | export FI_EFA_USE_DEVICE_RDMA=1 66 | export FI_PROVIDER=efa 67 | export FI_EFA_FORK_SAFE=1 68 | export BUCKET_CAP_MB=512 69 | export XLA_TRANSFER_SEED_ASYNC=1 70 | WORLD_SIZE_JOB=$SLURM_NTASKS 71 | RANK_NODE=$SLURM_NODEID 72 | MASTER_ADDR=(`scontrol show hostnames $SLURM_JOB_NODELIST`) 73 | MASTER_PORT=2022 74 | GRAD_ACCUM_USTEPS=$(($GRAD_ACCUM_USTEPS/$WORLD_SIZE_JOB)) 75 | DISTRIBUTED_ARGS="--nproc_per_node $NUM_NEURONCORES --nnodes $WORLD_SIZE_JOB --node_rank $RANK_NODE --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 76 | echo $DISTRIBUTED_ARGS 77 | OUTPUT_DIR=output_$SLURM_JOB_ID 78 | LOG_FILE=${LOG_FILE}_${RANK_NODE}_${WORLD_SIZE_JOB} 79 | if [ -z "$NEURON_COMPILE_CACHE_URL" ]; then 80 | CACHE_DIR=$HOME/neuron_cache/bert/`hostname` 81 | export NEURON_CC_FLAGS="--cache_dir=$CACHE_DIR" 82 | fi 83 | export HF_HOME=/tmp/hf_cache/ 84 | mkdir -p $HF_HOME 85 | if [ -e $HOME/.cache/huggingface ]; then 86 | rsync -av $HOME/.cache/huggingface/ $HF_HOME 87 | fi 88 | # HF ver > 4.22: Move cache ahead of time to prevent multiple workers moving at the same time 89 | python -c "import transformers.utils as utils; utils.move_cache()" 90 | fi 91 | 92 | HOST=`hostname` 93 | echo "Hostname: $HOST (instance ID: $INSTANCEID)" 94 | 95 | steps_this_run=$MAX_STEPS 96 | if [ ! -z "$NEURON_EXTRACT_GRAPHS_ONLY" ]; then 97 | steps_this_run=5 98 | fi 99 | 100 | update_test_variables=../../load_test_variables.sh 101 | if [ -e $update_test_variables ]; then 102 | . ./$update_test_variables $@ || echo "Unable to find test env." 103 | fi 104 | mkdir -p $OUTPUT_DIR 105 | if [ -z "$json" ]; then json="$OUTPUT_DIR/results.json" && rm -f $json; fi 106 | 107 | sudo sysctl -w net.ipv4.ip_local_reserved_ports=48620 || exit 1 108 | torchrun $DISTRIBUTED_ARGS dp_bert_large_hf_pretrain_hdf5.py $ADD_ARGS --optimizer $OPT --lr 6e-3 --output_dir $OUTPUT_DIR --max_steps $MAX_STEPS --steps_this_run $steps_this_run --metrics_file $json --batch_size=$BATCH_SIZE --grad_accum_usteps=$GRAD_ACCUM_USTEPS |& tee $OUTPUT_DIR/$LOG_FILE 109 | 110 | ret_val=${PIPESTATUS[0]} 111 | echo $ret_val 112 | if [ $ret_val -eq 0 ]; then 113 | success=1 114 | else 115 | success=0 116 | fi 117 | 118 | if [ -z "$NEURON_EXTRACT_GRAPHS_ONLY" ]; then 119 | dump_to_s3_update_json_scr=../../dump_to_s3_update_test_json.sh 120 | if [ -e $dump_to_s3_update_json_scr ]; then 121 | $dump_to_s3_update_json_scr $@ --key=inference_success --value=$success || echo "Unable to update test result JSON." 122 | else 123 | echo "WARNING: Script $dump_to_s3_update_json_scr not found. Not updating test result JSON." 
124 | fi 125 | fi 126 | 127 | # copy final checkpoint for ph2 128 | if [ -e $OUTPUT_DIR/ckpt_$MAX_STEPS.pt ]; then cp -f $OUTPUT_DIR/ckpt_$MAX_STEPS.pt ../; fi 129 | 130 | exit $ret_val 131 | -------------------------------------------------------------------------------- /torch-neuronx/training/hf_language_modeling/gpt2/run_clm.patch: -------------------------------------------------------------------------------- 1 | diff --git a/examples/pytorch/language-modeling/run_clm.py b/examples/pytorch/language-modeling/run_clm.py 2 | index 3cf36ec9d..960b3a169 100755 3 | --- a/examples/pytorch/language-modeling/run_clm.py 4 | +++ b/examples/pytorch/language-modeling/run_clm.py 5 | @@ -53,6 +53,22 @@ from transformers.trainer_utils import get_last_checkpoint 6 | from transformers.utils import check_min_version, send_example_telemetry 7 | from transformers.utils.versions import require_version 8 | 9 | +from importlib.metadata import version 10 | + 11 | +if version("torch") >= "2.0": 12 | + import copy 13 | + import torch_xla.core.xla_model as xm 14 | + def mesh_reduce(tag, data, reduce_fn): 15 | + xm.rendezvous(tag) 16 | + xdatain = copy.deepcopy(data) 17 | + xdatain = xdatain.to("xla") 18 | + xdata = xm.all_gather(xdatain, pin_layout=False) 19 | + cpu_xdata = xdata.detach().to("cpu") 20 | + cpu_xdata_split = torch.split(cpu_xdata, xdatain.shape[0]) 21 | + xldata = [x for x in cpu_xdata_split] 22 | + return reduce_fn(xldata) 23 | + xm.mesh_reduce = mesh_reduce 24 | + 25 | 26 | # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 27 | check_min_version("4.27.0") 28 | -------------------------------------------------------------------------------- /torch-neuronx/training/hf_sentiment_analysis/.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | ### Python template 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | #*.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | db.sqlite3-journal 62 | 63 | # Flask stuff: 64 | instance/ 65 | .webassets-cache 66 | 67 | # Scrapy stuff: 68 | .scrapy 69 | 70 | # Sphinx documentation 71 | docs/_build/ 72 | 73 | # PyBuilder 74 | target/ 75 | 76 | # Jupyter Notebook 77 | .ipynb_checkpoints 78 | 79 | # IPython 80 | profile_default/ 81 | ipython_config.py 82 | 83 | # pyenv 84 | .python-version 85 | 86 | # pipenv 87 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
88 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 89 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 90 | # install all needed dependencies. 91 | #Pipfile.lock 92 | 93 | # celery beat schedule file 94 | celerybeat-schedule 95 | 96 | # SageMath parsed files 97 | *.sage.py 98 | 99 | # Environments 100 | .env 101 | .venv 102 | env/ 103 | venv/ 104 | ENV/ 105 | env.bak/ 106 | venv.bak/ 107 | 108 | # Spyder project settings 109 | .spyderproject 110 | .spyproject 111 | 112 | # Rope project settings 113 | .ropeproject 114 | 115 | # mkdocs documentation 116 | /site 117 | 118 | # mypy 119 | .mypy_cache/ 120 | .dmypy.json 121 | dmypy.json 122 | 123 | # Pyre type checker 124 | .pyre/ 125 | 126 | .idea/ 127 | .vscode/ 128 | deploy/ 129 | test/ 130 | **/.DS_Store 131 | cdk.out/ 132 | 133 | */**/models 134 | *.pem -------------------------------------------------------------------------------- /torch-neuronx/training/hf_sentiment_analysis/README.md: -------------------------------------------------------------------------------- 1 | # Hugging Face BERT Sentiment Analysis - AWS Trainium 2 | 3 | ## Introduction 4 | 5 | In this example, we will go through the steps required to easily adapt your PyTorch code for training a Machine Learning 6 | (ML) model by using [Hugging Face](https://huggingface.co/blog/the-partnership-amazon-sagemaker-and-hugging-face) with BERT as the 7 | model type, on an Amazon EC2 instance powered by the AWS Trainium chip. 8 | 9 | In this repository, we are sharing code examples for: 10 | 1. Training a BERT ML model by using PyTorch and Hugging Face 11 | 1. Code: [single Neuron Core](code/01-trainium-single-core/train.py) 12 | 2. Notebook: [notebook single Neuron Core](./01-hf-single-neuron.ipynb) 13 | 2. Distributed training of a BERT ML model by using PyTorch and Hugging Face 14 | 1. Code: [distributed training on Neuron Cores](code/02-trainium-distributed-training/train.py) 15 | 2. Notebook: [notebook distributed training on Neuron Cores](./02-hf-distributed-training.ipynb) 16 | 17 | ## Infrastructure Setup for AWS Trainium 18 | 19 | ### Prerequisites 20 | 21 | * Instance Image: [Deep Learning AMI Neuron PyTorch 1.11](https://aws.amazon.com/releasenotes/aws-deep-learning-ami-neuron-pytorch-1-11-amazon-linux-2/) 22 | * Instance Type: trn1.32xlarge 23 | * Git installed on the EC2 instance 24 | 25 | ``` 26 | git --version 27 | ``` 28 | 29 | ### Activate pre-built PyTorch environment 30 | 31 | ``` 32 | source /opt/aws_neuron_venv_pytorch/bin/activate 33 | ``` 34 | 35 | ### Check AWS Neuron SDK installation 36 | 37 | ``` 38 | neuron-ls 39 | 40 | neuron-top 41 | ``` 42 | 43 | ## ML Training on a single Neuron Core 44 | 45 | Activate the [pre-built PyTorch environment](#activate-pre-built-pytorch-environment) 46 | 47 | Test the code execution by using the provided [notebook](./01-hf-single-neuron.ipynb) 48 | 49 | ### Command line execution example 50 | 51 | ``` 52 | cd code/01-trainium-single-core 53 | 54 | python3 train.py 55 | ``` 56 | 57 | ## Distributed Training on all available Neuron Cores 58 | 59 | Activate the [pre-built PyTorch environment](#activate-pre-built-pytorch-environment) 60 | 61 | Test the code execution by using the provided [notebook](./02-hf-distributed-training.ipynb) 62 | 63 | ### Command line execution example 64 | 65 | ``` 66 | cd code/02-trainium-distributed-training 67 | 68 | export TOKENIZERS_PARALLELISM=false 69 | 70 | torchrun --nproc_per_node=32 train.py 71 | ``` 72 | 73 | ## Errors 74 | 75 | 1. Flush Neuron Cores 76 | 77 | ``` 78 | sudo rmmod neuron; sudo modprobe neuron 79 | ```
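Before looking at the code, it is worth seeing how the per-worker settings combine in the distributed run. The arithmetic below is illustrative only, using values that appear in the training scripts that follow (per-worker batch size 8, base learning rate 1.45e-4, 32 workers from `torchrun --nproc_per_node=32`):

```python
# Illustrative arithmetic for the distributed run (values from the scripts below).
per_worker_batch = 8       # batch_size in train.py
workers = 32               # torchrun --nproc_per_node=32 on a trn1.32xlarge
base_lr = 1.45e-4          # learning rate used by the single-core script

global_batch = per_worker_batch * workers  # 256 sequences per optimizer step
scaled_lr = base_lr * workers              # linear LR scaling, as in the distributed script

print(global_batch, scaled_lr)             # 256 0.00464
```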
-------------------------------------------------------------------------------- /torch-neuronx/training/hf_sentiment_analysis/code/01-trainium-single-core/train.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from datasets import Dataset, DatasetDict 3 | import logging 4 | import os 5 | import pandas as pd 6 | from time import gmtime, strftime 7 | from tqdm.auto import tqdm 8 | import torch 9 | import torch_xla.core.xla_model as xm 10 | import torch_xla.runtime as xr 11 | from torch.optim import AdamW 12 | from torch.utils.data import DataLoader 13 | from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_scheduler 14 | 15 | logging.basicConfig(level=logging.INFO) 16 | logger = logging.getLogger(__name__) 17 | 18 | model_name = "bert-base-cased" 19 | ## define xla as device for using AWS Trainium Neuron Cores 20 | device = "xla" 21 | 22 | batch_size = 8 23 | num_epochs = 6 24 | 25 | logger.info("Device: {}".format(device)) 26 | 27 | ## tokenize_and_encode 28 | # params: 29 | # data: DatasetDict 30 | # This method returns a dictionary of input_ids, token_type_ids, attention_mask 31 | def tokenize_and_encode(data): 32 | results = tokenizer(data["text"], padding="max_length", truncation=True) 33 | return results 34 | 35 | if __name__ == '__main__': 36 | path = os.path.abspath("data") 37 | csv_path = path + "/train.csv" 38 | 39 | train = pd.read_csv( 40 | csv_path, 41 | sep=',', 42 | quotechar='"', 43 | quoting=csv.QUOTE_ALL, 44 | escapechar='\\', 45 | encoding='utf-8' 46 | ) 47 | 48 | train_dataset = Dataset.from_dict(train) 49 | 50 | hg_dataset = DatasetDict({"train": train_dataset}) 51 | 52 | ## Loading Hugging Face AutoTokenizer for the defined model 53 | tokenizer = AutoTokenizer.from_pretrained(model_name) 54 | 55 | ds_encoded = hg_dataset.map(tokenize_and_encode, batched=True, remove_columns=["text"]) 56 | 57 | ds_encoded.set_format("torch") 58 | 59 | ## Creating a DataLoader object for iterating over it during the training epochs 60 | train_dl = DataLoader(ds_encoded["train"], shuffle=True, batch_size=batch_size) 61 | 62 | ## Loading Hugging Face pre-trained model for sequence classification for the defined model 63 | model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3) 64 | model.to(device) 65 | 66 | current_timestamp = strftime("%Y-%m-%d-%H-%M", gmtime()) 67 | 68 | optimizer = AdamW(model.parameters(), lr=1.45e-4) 69 | 70 | num_training_steps = num_epochs * len(train_dl) 71 | progress_bar = tqdm(range(num_training_steps)) 72 | lr_scheduler = get_scheduler( 73 | name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps 74 | ) 75 | 76 | logger.info("Start training: {}".format(strftime("%Y-%m-%d %H:%M:%S", gmtime()))) 77 | 78 | ## Start model training and defining the training loop 79 | model.train() 80 | for epoch in range(num_epochs): 81 | for batch in train_dl: 82 | batch = {k: v.to(device) for k, v in batch.items()} 83 | outputs = model(**batch) 84 | loss = outputs.loss 85 | loss.backward() 86 | optimizer.step() 87 | lr_scheduler.step() 88 | ## xm.mark_step executes the current graph, updates the model params, and notifies the Neuron Core that the step has ended 89 | xm.mark_step() 90 | optimizer.zero_grad() 91 | progress_bar.update(1) 92 | 93 | logger.info("Epoch {}, rank {}, Loss {:0.4f}".format(epoch, xr.global_ordinal(), loss.detach().to("cpu"))) 94 | 95 | logger.info("End training: {}".format(strftime("%Y-%m-%d %H:%M:%S", gmtime()))) 96 | 97 | ## Save with xm.save to be sure only one copy of the model is saved 98 | os.makedirs("./../../models/checkpoints/{}".format(current_timestamp), exist_ok=True) 99 | checkpoint = {"state_dict": model.state_dict()} 100 | xm.save(checkpoint, "./../../models/checkpoints/{}/checkpoint.pt".format(current_timestamp)) 101 |
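The checkpoint written above contains only a `state_dict`, so it can be reloaded outside the XLA device for evaluation or export. A minimal sketch (the timestamped directory is whatever the training run created; `num_labels=3` matches the script):

```python
import torch
from transformers import AutoModelForSequenceClassification

# Rebuild the same architecture, then restore the trained weights on CPU.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=3)
checkpoint = torch.load("models/checkpoints/<timestamp>/checkpoint.pt", map_location="cpu")
model.load_state_dict(checkpoint["state_dict"])
model.eval()
```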
logger.info("End training: {}".format(strftime("%Y-%m-%d %H:%M:%S", gmtime()))) 96 | 97 | ## Using XLA for saving model after training for being sure only one copy of the model is saved 98 | os.makedirs("./../../models/checkpoints/{}".format(current_timestamp), exist_ok=True) 99 | checkpoint = {"state_dict": model.state_dict()} 100 | xm.save(checkpoint, "./../../models/checkpoints/{}/checkpoint.pt".format(current_timestamp)) 101 | -------------------------------------------------------------------------------- /torch-neuronx/training/hf_sentiment_analysis/code/02-trainium-distributed-training/train.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from datasets import Dataset, DatasetDict 3 | import logging 4 | import os 5 | import pandas as pd 6 | from time import gmtime, strftime 7 | from tqdm.auto import tqdm 8 | import torch 9 | import torch_xla.core.xla_model as xm 10 | import torch_xla.distributed.parallel_loader as pl 11 | import torch_xla.distributed.xla_backend 12 | import torch_xla.runtime as xr 13 | from torch.optim import AdamW 14 | from torch.utils.data import DataLoader 15 | from torch.utils.data.distributed import DistributedSampler 16 | from transformers import AutoTokenizer, AutoModelForSequenceClassification 17 | 18 | logging.basicConfig(level=logging.INFO) 19 | logger = logging.getLogger(__name__) 20 | 21 | torch.manual_seed(0) 22 | 23 | model_name = "bert-base-cased" 24 | ## define xla as device for using AWS Trainium Neuron Cores 25 | device = "xla" 26 | 27 | torch.distributed.init_process_group(device) 28 | 29 | # Get the global number of workes. 30 | world_size = xr.world_size() 31 | logger.info("Workers: {}".format(world_size)) 32 | 33 | batch_size = 8 34 | num_epochs = 6 35 | 36 | logger.info("Device: {}".format(device)) 37 | 38 | ## tokenize_and_encode 39 | # params: 40 | # data: DatasetDict 41 | # This method returns a dictionary of input_ids, token_type_ids, attention_mask 42 | def tokenize_and_encode(data): 43 | results = tokenizer(data["text"], padding="max_length", truncation=True) 44 | return results 45 | 46 | if __name__ == '__main__': 47 | path = os.path.abspath("data") 48 | csv_path = path + "/train.csv" 49 | 50 | train = pd.read_csv( 51 | csv_path, 52 | sep=',', 53 | quotechar='"', 54 | quoting=csv.QUOTE_ALL, 55 | escapechar='\\', 56 | encoding='utf-8' 57 | ) 58 | 59 | train_dataset = Dataset.from_dict(train) 60 | 61 | hg_dataset = DatasetDict({"train": train_dataset}) 62 | 63 | ## Loading Hugging Face AutoTokenizer for the defined model 64 | tokenizer = AutoTokenizer.from_pretrained(model_name, force_download=True) 65 | 66 | ds_encoded = hg_dataset.map(tokenize_and_encode, batched=True, remove_columns=["text"]) 67 | 68 | ds_encoded.set_format("torch") 69 | 70 | ## Create a subsed of data sampler, for parallelizing the training across multiple cores 71 | if world_size > 1: 72 | train_sampler = DistributedSampler( 73 | ds_encoded["train"], 74 | num_replicas=world_size, 75 | rank=xr.global_ordinal(), 76 | shuffle=True, 77 | ) 78 | 79 | ## Creating a DataLoader object for iterating over it during the training epochs 80 | train_dl = DataLoader( 81 | ds_encoded["train"], 82 | batch_size=batch_size, 83 | sampler=train_sampler, 84 | shuffle=False if train_sampler else True) 85 | 86 | ## Loading a subset of the data in the different Neuron Cores provided as input 87 | train_device_loader = pl.MpDeviceLoader(train_dl, device) 88 | 89 | ## Loading Hugging Face pre-trained model for sequence classification for 
90 | model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3, force_download=True).to(device) 91 | 92 | current_timestamp = strftime("%Y-%m-%d-%H-%M", gmtime()) 93 | 94 | optimizer = AdamW(model.parameters(), lr=1.45e-4 * world_size) 95 | 96 | num_training_steps = num_epochs * len(train_dl) 97 | progress_bar = tqdm(range(num_training_steps)) 98 | 99 | logger.info("Start training: {}".format(strftime("%Y-%m-%d %H:%M:%S", gmtime()))) 100 | 101 | ## Start model training and defining the training loop 102 | model.train() 103 | for epoch in range(num_epochs): 104 | for batch in train_device_loader: 105 | batch = {k: v.to(device) for k, v in batch.items()} 106 | outputs = model(**batch) 107 | optimizer.zero_grad() 108 | loss = outputs.loss 109 | loss.backward() 110 | ## xm.optimizer_step performs an allreduce of the gradients computed on the different cores before applying the update 111 | xm.optimizer_step(optimizer) 112 | progress_bar.update(1) 113 | 114 | logger.info("Epoch {}, rank {}, Loss {:0.4f}".format(epoch, xr.global_ordinal(), loss.detach().to("cpu"))) 115 | 116 | logger.info("End training: {}".format(strftime("%Y-%m-%d %H:%M:%S", gmtime()))) 117 | 118 | ## Save with xm.save to be sure only one copy of the model is saved 119 | os.makedirs("./../../models/checkpoints/{}".format(current_timestamp), exist_ok=True) 120 | checkpoint = {"state_dict": model.state_dict()} 121 | xm.save(checkpoint, "./../../models/checkpoints/{}/checkpoint.pt".format(current_timestamp)) 122 | -------------------------------------------------------------------------------- /torch-neuronx/training/hf_text_classification/README.md: -------------------------------------------------------------------------------- 1 | # Hugging Face Text Classification 2 | 3 | This folder contains various examples of Hugging Face models that can be trained with AWS Trainium for a Text Classification task. Each Jupyter notebook contains a specific example of training a model using the Hugging Face Trainer API and uses a slightly modified script called [run_glue.py](run_glue.py) to fine-tune the pretrained model. 4 | 5 | The following models are currently supported and tested with AWS Trainium: 6 | - [BERT base cased](BertBaseCased.ipynb) 7 | - [BERT base uncased](BertBaseUncased.ipynb) 8 | - [BERT large cased](BertLargeCased.ipynb) 9 | - [BERT large uncased](BertLargeUncased.ipynb) 10 | - [RoBERTa base](RobertaBase.ipynb) 11 | - [RoBERTa large](RobertaLarge.ipynb) 12 | - [XLM RoBERTa base](XlmRobertaBase.ipynb) 13 | - [ALBERT base](AlbertBase.ipynb) 14 | - [DistilBERT base uncased](DistilbertBaseUncased.ipynb) 15 |
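If you have not used run_glue.py before, the core of what these notebooks do reduces to the standard Hugging Face Trainer loop. A minimal sketch (the SST-2 dataset, model choice, and hyperparameters here are placeholders, not the notebooks' exact settings):

```python
from datasets import load_dataset
from transformers import (AutoModelForSequenceClassification, AutoTokenizer,
                          Trainer, TrainingArguments)

raw = load_dataset("glue", "sst2")
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def tokenize(batch):
    # SST-2 has a single "sentence" field; pairwise tasks pass two fields.
    return tokenizer(batch["sentence"], truncation=True, padding="max_length", max_length=128)

encoded = raw.map(tokenize, batched=True)
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)

args = TrainingArguments(output_dir="out", per_device_train_batch_size=8, num_train_epochs=1)
trainer = Trainer(model=model, args=args,
                  train_dataset=encoded["train"], eval_dataset=encoded["validation"])
trainer.train()
```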
4 | -------------------------------------------------------------------------------- /torch-neuronx/training/llama2/get_dataset.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | from transformers import AutoTokenizer 3 | from itertools import chain 4 | import os 5 | 6 | dataset_name = "wikicorpus" 7 | dataset_config_name = "raw_en" 8 | save_path = "~/examples_datasets/wikicorpus_llama2_7B_tokenized_4k" 9 | tokenizer_path = os.getcwd() 10 | 11 | save_path = os.path.expanduser(save_path) 12 | tokenizer_path = os.path.expanduser(tokenizer_path) 13 | if not os.path.exists(save_path): 14 | os.makedirs(save_path) 15 | 16 | block_size = 4096 17 | 18 | raw_datasets = load_dataset(dataset_name, dataset_config_name) 19 | 20 | tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) 21 | 22 | column_names = raw_datasets["train"].column_names 23 | text_column_name = "text" if "text" in column_names else column_names[0] 24 | 25 | def tokenize_function(examples): 26 | return tokenizer(examples[text_column_name]) 27 | 28 | tokenized_datasets = raw_datasets.map( 29 | tokenize_function, 30 | batched=True, 31 | remove_columns=column_names, 32 | load_from_cache_file=True, 33 | desc="Running tokenizer on dataset", 34 | ) 35 | 36 | if block_size > tokenizer.model_max_length: 37 | print("block_size > tokenizer.model_max_length") 38 | block_size = min(block_size, tokenizer.model_max_length) 39 | 40 | # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size. 41 | def group_texts(examples): 42 | # Concatenate all texts. 43 | concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()} 44 | total_length = len(concatenated_examples[list(examples.keys())[0]]) 45 | # We drop the small remainder, and if the total_length < block_size we exclude this batch and return an empty dict. 46 | # We could add padding if the model supported it instead of this drop; you can customize this part to your needs. 47 | total_length = (total_length // block_size) * block_size 48 | # Split by chunks of max_len.
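# (Illustrative note, not part of the original script: with block_size = 4096,
#  a concatenated batch of 10,000 tokens is truncated to 8,192 tokens and split
#  into two 4,096-token chunks. The labels built below are a plain copy of
#  input_ids; causal-LM models shift them internally when computing the loss.)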
49 | result = { 50 | k: [t[i : i + block_size] for i in range(0, total_length, block_size)] 51 | for k, t in concatenated_examples.items() 52 | } 53 | result["labels"] = result["input_ids"].copy() 54 | return result 55 | 56 | lm_datasets = tokenized_datasets.map( 57 | group_texts, 58 | batched=True, 59 | load_from_cache_file=True, 60 | desc=f"Grouping texts in chunks of {block_size}", 61 | ) 62 | 63 | train_dataset = lm_datasets["train"] 64 | print(len(train_dataset)) 65 | 66 | train_dataset.save_to_disk(save_path) 67 | 68 | -------------------------------------------------------------------------------- /torch-neuronx/training/llama2/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers==4.31.0 2 | regex 3 | tensorboard 4 | datasets 5 | sentencepiece 6 | -------------------------------------------------------------------------------- /torch-neuronx/training/mnist_mlp/eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import torch 4 | from model import MLP 5 | 6 | from torchvision.datasets import mnist 7 | from torch.utils.data import DataLoader 8 | from torchvision.transforms import ToTensor 9 | 10 | # XLA imports 11 | import torch_xla.core.xla_model as xm 12 | 13 | # Load MNIST test dataset 14 | test_dataset = mnist.MNIST(root='./MNIST_DATA_test', \ 15 | train=False, download=True, transform=ToTensor()) 16 | 17 | def main(): 18 | # Fix the random number generator seeds for reproducibility 19 | torch.manual_seed(0) 20 | 21 | # XLA: Specify XLA device (defaults to a NeuronCore on Trn1 instance) 22 | device = xm.xla_device() 23 | # Move model to device 24 | model = MLP().to(device) 25 | 26 | # Load check point 27 | checkpoint = torch.load('checkpoints/checkpoint.pt', map_location='cpu') 28 | model.load_state_dict(checkpoint['state_dict']) 29 | 30 | # Prepare data loader 31 | test_loader = DataLoader(test_dataset, batch_size=32) 32 | 33 | # Run the evaluation loop 34 | print('----------Evaluating---------------') 35 | match_count = 0 36 | model.eval() 37 | start = time.time() 38 | for idx, (test_x, test_label) in enumerate(test_loader): 39 | test_x = test_x.view(test_x.size(0), -1) 40 | test_x = test_x.to(device) 41 | test_pred = model(test_x) 42 | pred_label = torch.argmax(test_pred, dim=1) 43 | match_count += sum(pred_label == test_label.to(device)) 44 | xm.mark_step() # XLA: collect ops and run them in XLA runtime 45 | if idx < 2: # skip warmup iterations 46 | start = time.time() 47 | 48 | # Compute statistics 49 | interval = idx - 2 # skip warmup iterations 50 | throughput = interval / (time.time() - start) 51 | print("Test throughput (iter/sec): {}".format(throughput)) 52 | 53 | accuracy = match_count / (idx * 32) 54 | print("Accuracy: {}".format(accuracy)) 55 | assert(accuracy > 0.92) 56 | print('----------Done Evaluating---------------') 57 | 58 | if __name__ == '__main__': 59 | main() 60 | -------------------------------------------------------------------------------- /torch-neuronx/training/mnist_mlp/eval_using_trace.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import torch 4 | from model import MLP 5 | 6 | from torchvision.datasets import mnist 7 | from torch.utils.data import DataLoader 8 | from torchvision.transforms import ToTensor 9 | 10 | # Load MNIST test dataset 11 | test_dataset = mnist.MNIST(root='./MNIST_DATA_test', \ 12 | train=False, download=True, 
transform=ToTensor()) 13 | 14 | def main(): 15 | # Fix the random number generator seeds for reproducibility 16 | torch.manual_seed(0) 17 | 18 | # Use cpu device for trace API 19 | device = "cpu" 20 | # Move model to device 21 | model = MLP().to(device) 22 | 23 | # Load check point 24 | checkpoint = torch.load('checkpoints/checkpoint.pt', map_location='cpu') 25 | model.load_state_dict(checkpoint['state_dict']) 26 | 27 | # Prepare data loader 28 | test_loader = DataLoader(test_dataset, batch_size=32, drop_last=True) 29 | 30 | # Run the evaluation loop 31 | print('----------Evaluating---------------') 32 | match_count = 0 33 | model.eval() 34 | start = time.time() 35 | for idx, (test_x, test_label) in enumerate(test_loader): 36 | test_x = test_x.view(test_x.size(0), -1) 37 | test_x = test_x.to(device) 38 | if idx == 0: 39 | import torch_neuronx 40 | model = torch_neuronx.trace(model, test_x) 41 | test_pred = model(test_x) 42 | pred_label = torch.argmax(test_pred, dim=1) 43 | match_count += sum(pred_label == test_label.to(device)) 44 | if idx < 2: # skip warmup iterations 45 | start = time.time() 46 | 47 | # Compute statistics 48 | interval = idx - 2 # skip warmup iterations 49 | throughput = interval / (time.time() - start) 50 | print("Test throughput (iter/sec): {}".format(throughput)) 51 | 52 | accuracy = match_count / (idx * 32) 53 | print("Accuracy: {}".format(accuracy)) 54 | assert(accuracy > 0.92) 55 | print('----------Done Evaluating---------------') 56 | 57 | if __name__ == '__main__': 58 | main() 59 | -------------------------------------------------------------------------------- /torch-neuronx/training/mnist_mlp/model.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | 4 | # Declare 3-layer MLP for MNIST dataset 5 | class MLP(nn.Module): 6 | def __init__(self, input_size = 28 * 28, output_size = 10, layers = [120, 84]): 7 | super(MLP, self).__init__() 8 | self.fc1 = nn.Linear(input_size, layers[0]) 9 | self.fc2 = nn.Linear(layers[0], layers[1]) 10 | self.fc3 = nn.Linear(layers[1], output_size) 11 | 12 | def forward(self, x): 13 | x = F.relu(self.fc1(x)) 14 | x = F.relu(self.fc2(x)) 15 | x = self.fc3(x) 16 | return F.log_softmax(x, dim=1) 17 | -------------------------------------------------------------------------------- /torch-neuronx/training/mnist_mlp/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import torch 4 | from model import MLP 5 | 6 | from torchvision.datasets import mnist 7 | from torch.utils.data import DataLoader 8 | from torchvision.transforms import ToTensor 9 | 10 | # XLA imports 11 | import torch_xla.core.xla_model as xm 12 | 13 | # Global constants 14 | EPOCHS = 4 15 | WARMUP_STEPS = 2 16 | BATCH_SIZE = 32 17 | 18 | # Load MNIST train dataset 19 | train_dataset = mnist.MNIST(root='./MNIST_DATA_train', 20 | train=True, download=True, transform=ToTensor()) 21 | 22 | def main(): 23 | # Prepare data loader 24 | train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE) 25 | 26 | # Fix the random number generator seeds for reproducibility 27 | torch.manual_seed(0) 28 | 29 | # XLA: Specify XLA device (defaults to a NeuronCore on Trn1 instance) 30 | device = 'xla' 31 | 32 | # Move model to device and declare optimizer and loss function 33 | model = MLP().to(device) 34 | optimizer = torch.optim.SGD(model.parameters(), lr=0.01) 35 | loss_fn = torch.nn.NLLLoss() 36 | 37 | # Run the training 
loop 38 | print('----------Training ---------------') 39 | model.train() 40 | for epoch in range(EPOCHS): 41 | start = time.time() 42 | for idx, (train_x, train_label) in enumerate(train_loader): 43 | optimizer.zero_grad() 44 | train_x = train_x.view(train_x.size(0), -1) 45 | train_x = train_x.to(device) 46 | train_label = train_label.to(device) 47 | output = model(train_x) 48 | loss = loss_fn(output, train_label) 49 | loss.backward() 50 | optimizer.step() 51 | xm.mark_step() # XLA: collect ops and run them in XLA runtime 52 | if idx < WARMUP_STEPS: # skip warmup iterations 53 | start = time.time() 54 | 55 | # Compute statistics for the last epoch 56 | interval = idx - WARMUP_STEPS # skip warmup iterations 57 | throughput = interval / (time.time() - start) 58 | print("Train throughput (iter/sec): {}".format(throughput)) 59 | print("Final loss is {:0.4f}".format(loss.detach().to('cpu'))) 60 | 61 | # Save checkpoint for evaluation 62 | os.makedirs("checkpoints", exist_ok=True) 63 | checkpoint = {'state_dict': model.state_dict()} 64 | # XLA: use xm.save instead of torch.save to ensure states are moved back to cpu 65 | # This can prevent "XRT memory handle not found" at end of eval.py execution 66 | xm.save(checkpoint,'checkpoints/checkpoint.pt') 67 | 68 | print('----------End Training ---------------') 69 | 70 | if __name__ == '__main__': 71 | main() 72 | 73 | -------------------------------------------------------------------------------- /torch-neuronx/training/mnist_mlp/train_cpu.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import torch 4 | from model import MLP 5 | 6 | from torchvision.datasets import mnist 7 | from torch.utils.data import DataLoader 8 | from torchvision.transforms import ToTensor 9 | 10 | # Global constants 11 | EPOCHS = 4 12 | WARMUP_STEPS = 2 13 | BATCH_SIZE = 32 14 | 15 | # Load MNIST train dataset 16 | train_dataset = mnist.MNIST(root='./MNIST_DATA_train', 17 | train=True, download=True, transform=ToTensor()) 18 | 19 | def main(): 20 | # Prepare data loader 21 | train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE) 22 | 23 | # Fix the random number generator seeds for reproducibility 24 | torch.manual_seed(0) 25 | 26 | # Move model to device and declare optimizer and loss function 27 | device = 'cpu' 28 | model = MLP().to(device) 29 | optimizer = torch.optim.SGD(model.parameters(), lr=0.01) 30 | loss_fn = torch.nn.NLLLoss() 31 | 32 | # Run the training loop 33 | print('----------Training ---------------') 34 | model.train() 35 | for epoch in range(EPOCHS): 36 | start = time.time() 37 | for idx, (train_x, train_label) in enumerate(train_loader): 38 | optimizer.zero_grad() 39 | train_x = train_x.view(train_x.size(0), -1) 40 | train_x = train_x.to(device) 41 | train_label = train_label.to(device) 42 | output = model(train_x) 43 | loss = loss_fn(output, train_label) 44 | loss.backward() 45 | optimizer.step() 46 | if idx < WARMUP_STEPS: # skip warmup iterations 47 | start = time.time() 48 | 49 | # Compute statistics for the last epoch 50 | interval = idx - WARMUP_STEPS # skip warmup iterations 51 | throughput = interval / (time.time() - start) 52 | print("Train throughput (iter/sec): {}".format(throughput)) 53 | print("Final loss is {:0.4f}".format(loss.detach().to('cpu'))) 54 | 55 | # Save checkpoint for evaluation 56 | os.makedirs("checkpoints", exist_ok=True) 57 | checkpoint = {'state_dict': model.state_dict()} 58 | torch.save(checkpoint,'checkpoints/checkpoint.pt') 59 |
print('----------End Training ---------------') 60 | 61 | if __name__ == '__main__': 62 | main() 63 | -------------------------------------------------------------------------------- /torch-neuronx/training/mnist_mlp/train_torchrun.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import torch 4 | from model import MLP 5 | 6 | from torchvision.datasets import mnist 7 | from torch.utils.data import DataLoader 8 | from torchvision.transforms import ToTensor 9 | 10 | # XLA imports 11 | import torch_xla.core.xla_model as xm 12 | import torch_xla.runtime as xr 13 | 14 | # XLA imports for parallel loader and multi-processing 15 | import torch_xla.distributed.parallel_loader as pl 16 | from torch.utils.data.distributed import DistributedSampler 17 | 18 | # Initialize XLA process group for torchrun 19 | import torch_xla.distributed.xla_backend 20 | torch.distributed.init_process_group('xla') 21 | 22 | # Global constants 23 | EPOCHS = 4 24 | WARMUP_STEPS = 2 25 | BATCH_SIZE = 32 26 | 27 | # Load MNIST train dataset 28 | if not xm.is_master_ordinal(): xm.rendezvous('dataset_download') 29 | train_dataset = mnist.MNIST(root='/tmp/MNIST_DATA_train', 30 | train=True, download=True, transform=ToTensor()) 31 | if xm.is_master_ordinal(): xm.rendezvous('dataset_download') 32 | 33 | def main(): 34 | # XLA MP: get world size 35 | world_size = xr.world_size() 36 | # multi-processing: ensure each worker has same initial weights 37 | torch.manual_seed(0) 38 | 39 | # Move model to device and declare optimizer and loss function 40 | device = 'xla' 41 | model = MLP().to(device) 42 | # For multiprocessing, scale up learning rate 43 | optimizer = torch.optim.SGD(model.parameters(), lr=0.01 * world_size) 44 | loss_fn = torch.nn.NLLLoss() 45 | 46 | # Prepare data loader 47 | train_sampler = None 48 | if world_size > 1: 49 | train_sampler = DistributedSampler(train_dataset, 50 | num_replicas=world_size, 51 | rank=xr.global_ordinal(), 52 | shuffle=True) 53 | train_loader = DataLoader(train_dataset, 54 | batch_size=BATCH_SIZE, 55 | sampler=train_sampler, 56 | shuffle=False if train_sampler else True) 57 | # XLA MP: use MpDeviceLoader from torch_xla.distributed 58 | train_device_loader = pl.MpDeviceLoader(train_loader, device) 59 | 60 | # Run the training loop 61 | print('----------Training ---------------') 62 | model.train() 63 | for epoch in range(EPOCHS): 64 | start = time.time() 65 | for idx, (train_x, train_label) in enumerate(train_device_loader): 66 | optimizer.zero_grad() 67 | train_x = train_x.view(train_x.size(0), -1) 68 | output = model(train_x) 69 | loss = loss_fn(output, train_label) 70 | loss.backward() 71 | xm.optimizer_step(optimizer) # XLA MP: performs grad allreduce and optimizer step 72 | if idx < WARMUP_STEPS: # skip warmup iterations 73 | start = time.time() 74 | 75 | # Compute statistics for the last epoch 76 | interval = idx - WARMUP_STEPS # skip warmup iterations 77 | throughput = interval / (time.time() - start) 78 | print("Train throughput (iter/sec): {}".format(throughput)) 79 | print("Final loss is {:0.4f}".format(loss.detach().to('cpu'))) 80 | 81 | # Save checkpoint for evaluation (xm.save ensures only one process save) 82 | os.makedirs("checkpoints", exist_ok=True) 83 | checkpoint = {'state_dict': model.state_dict()} 84 | xm.save(checkpoint,'checkpoints/checkpoint.pt') 85 | 86 | print('----------End Training ---------------') 87 | 88 | if __name__ == '__main__': 89 | main() 90 | 91 | 
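train_torchrun.py above, launched with e.g. `torchrun --nproc_per_node=2 train_torchrun.py`, also shows a barrier idiom worth calling out: non-master workers park at an xm.rendezvous while the master downloads the dataset, then the master joins the rendezvous to release them to read the now-populated cache. A generalized sketch of that pattern (the `master_first` helper is hypothetical, not part of the sample):

```python
import contextlib
import torch_xla.core.xla_model as xm

@contextlib.contextmanager
def master_first(tag="master_first"):
    # Non-master workers block here until the master has finished the body.
    if not xm.is_master_ordinal():
        xm.rendezvous(tag)
    yield
    # The master arrives last, completing the barrier and releasing everyone;
    # the other workers then run the body against the cached result.
    if xm.is_master_ordinal():
        xm.rendezvous(tag)

# Usage, equivalent to the inline download guard in train_torchrun.py:
# with master_first("dataset_download"):
#     train_dataset = mnist.MNIST(root="/tmp/MNIST_DATA_train", train=True,
#                                 download=True, transform=ToTensor())
```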
-------------------------------------------------------------------------------- /torch-neuronx/training/mnist_mlp/train_xmp.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import torch 4 | from model import MLP 5 | 6 | from torchvision.datasets import mnist 7 | from torch.utils.data import DataLoader 8 | from torchvision.transforms import ToTensor 9 | 10 | # XLA imports 11 | import torch_xla.core.xla_model as xm 12 | # XLA imports for parallel loader and multi-processing 13 | import torch_xla.distributed.parallel_loader as pl 14 | import torch_xla.distributed.xla_multiprocessing as xmp 15 | import torch_xla.runtime as xr 16 | from torch.utils.data.distributed import DistributedSampler 17 | 18 | # Global constants 19 | EPOCHS = 4 20 | WARMUP_STEPS = 2 21 | BATCH_SIZE = 32 22 | 23 | # Load MNIST train dataset 24 | train_dataset = mnist.MNIST(root='./MNIST_DATA_train', 25 | train=True, download=True, transform=ToTensor()) 26 | 27 | def main(index): 28 | # XLA MP: get world size 29 | world_size = xr.world_size() 30 | # multi-processing: ensure each worker has same initial weights 31 | torch.manual_seed(0) 32 | # Move model to device and declare optimizer and loss function 33 | device = 'xla' 34 | model = MLP().to(device) 35 | # For multiprocessing, scale up learning rate 36 | optimizer = torch.optim.SGD(model.parameters(), lr=0.01 * world_size) 37 | loss_fn = torch.nn.NLLLoss() 38 | 39 | # Prepare data loader 40 | train_sampler = None 41 | if world_size > 1: 42 | train_sampler = DistributedSampler(train_dataset, 43 | num_replicas=world_size, 44 | rank=xr.global_ordinal(), 45 | shuffle=True) 46 | train_loader = DataLoader(train_dataset, 47 | batch_size=BATCH_SIZE, 48 | sampler=train_sampler, 49 | shuffle=False if train_sampler else True) 50 | # XLA MP: use MpDeviceLoader from torch_xla.distributed 51 | train_device_loader = pl.MpDeviceLoader(train_loader, device) 52 | 53 | # Run the training loop 54 | print('----------Training ---------------') 55 | model.train() 56 | for epoch in range(EPOCHS): 57 | start = time.time() 58 | for idx, (train_x, train_label) in enumerate(train_device_loader): 59 | optimizer.zero_grad() 60 | train_x = train_x.view(train_x.size(0), -1) 61 | output = model(train_x) 62 | loss = loss_fn(output, train_label) 63 | loss.backward() 64 | xm.optimizer_step(optimizer) # XLA MP: performs grad allreduce and optimizer step 65 | if idx < WARMUP_STEPS: # skip warmup iterations 66 | start = time.time() 67 | 68 | # Compute statistics for the last epoch 69 | interval = idx - WARMUP_STEPS # skip warmup iterations 70 | throughput = interval / (time.time() - start) 71 | print("Train throughput (iter/sec): {}".format(throughput)) 72 | print("Final loss is {:0.4f}".format(loss.detach().to('cpu'))) 73 | 74 | # Save checkpoint for evaluation (xm.save ensures only one process save) 75 | os.makedirs("checkpoints", exist_ok=True) 76 | checkpoint = {'state_dict': model.state_dict()} 77 | xm.save(checkpoint,'checkpoints/checkpoint.pt') 78 | 79 | print('----------End Training ---------------') 80 | 81 | if __name__ == '__main__': 82 | xmp.spawn(main) 83 | 84 | -------------------------------------------------------------------------------- /torch-neuronx/training/stable_diffusion/requirements.txt: -------------------------------------------------------------------------------- 1 | torchvision 2 | diffusers==0.19.3 # Intentionally pin to 0.19.3. More recent versions have problems on Neuron. 
3 | transformers==4.31.0 4 | datasets==2.14.2 5 | fsspec==2023.9.2 -------------------------------------------------------------------------------- /torch-neuronx/training/tp_dp_bert_hf_pretrain/requirements.txt: -------------------------------------------------------------------------------- 1 | graphviz 2 | tensorboard==2.6 3 | transformers==4.26.0 4 | evaluate 5 | pillow 6 | pytest 7 | accelerate 8 | datasets >= 1.8.0 9 | sentencepiece != 0.1.92 10 | h5py 11 | requests 12 | -------------------------------------------------------------------------------- /torch-neuronx/training/tp_dp_gpt_neox_hf_pretrain/common/get_dataset.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | from transformers import AutoTokenizer 3 | from itertools import chain 4 | import os 5 | 6 | dataset_name = "wikicorpus" 7 | dataset_config_name = "raw_en" 8 | save_path = "~/examples_datasets/wikicorpus_gpt_neox_tokenized_2k" 9 | 10 | save_path = os.path.expanduser(save_path) 11 | if not os.path.exists(save_path): 12 | os.makedirs(save_path) 13 | 14 | block_size = 2048 15 | 16 | raw_datasets = load_dataset(dataset_name, dataset_config_name) 17 | 18 | model_name = "EleutherAI/gpt-neox-20b" 19 | tokenizer = AutoTokenizer.from_pretrained(model_name) 20 | 21 | column_names = raw_datasets["train"].column_names 22 | text_column_name = "text" if "text" in column_names else column_names[0] 23 | 24 | def tokenize_function(examples): 25 | return tokenizer(examples[text_column_name]) 26 | 27 | tokenized_datasets = raw_datasets.map( 28 | tokenize_function, 29 | batched=True, 30 | remove_columns=column_names, 31 | load_from_cache_file=True, 32 | desc="Running tokenizer on dataset", 33 | ) 34 | 35 | if block_size > tokenizer.model_max_length: 36 | print("block_size > tokenizer.model_max_length") 37 | block_size = min(block_size, tokenizer.model_max_length) 38 | 39 | # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size. 40 | def group_texts(examples): 41 | # Concatenate all texts. 42 | concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()} 43 | total_length = len(concatenated_examples[list(examples.keys())[0]]) 44 | # We drop the small remainder, and if the total_length < block_size we exclude this batch and return an empty dict. 45 | # We could add padding if the model supported it instead of this drop; you can customize this part to your needs. 46 | total_length = (total_length // block_size) * block_size 47 | # Split by chunks of max_len.
48 | result = { 49 | k: [t[i : i + block_size] for i in range(0, total_length, block_size)] 50 | for k, t in concatenated_examples.items() 51 | } 52 | result["labels"] = result["input_ids"].copy() 53 | return result 54 | 55 | lm_datasets = tokenized_datasets.map( 56 | group_texts, 57 | batched=True, 58 | load_from_cache_file=True, 59 | desc=f"Grouping texts in chunks of {block_size}", 60 | ) 61 | 62 | train_dataset = lm_datasets["train"] 63 | print(len(train_dataset)) 64 | 65 | train_dataset.save_to_disk(save_path) 66 | -------------------------------------------------------------------------------- /torch-neuronx/training/tp_dp_gpt_neox_hf_pretrain/common/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers==4.26.0 2 | regex 3 | tensorboard 4 | datasets 5 | sentencepiece 6 | -------------------------------------------------------------------------------- /torch-neuronx/training/unet_image_segmentation/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | # milesial, U-Net: Semantic segmentation with PyTorch, GitHub repository 6 | # https://github.com/milesial/Pytorch-UNet 7 | 8 | class DoubleConv(nn.Module): 9 | def __init__(self, in_channels, out_channels, mid_channels=None): 10 | super().__init__() 11 | if not mid_channels: 12 | mid_channels = out_channels 13 | self.double_conv = nn.Sequential( 14 | nn.Conv2d(in_channels, mid_channels, kernel_size=3, padding=1, bias=False), 15 | nn.BatchNorm2d(mid_channels), 16 | nn.ReLU(inplace=True), 17 | nn.Conv2d(mid_channels, out_channels, kernel_size=3, padding=1, bias=False), 18 | nn.BatchNorm2d(out_channels), 19 | nn.ReLU(inplace=True) 20 | ) 21 | 22 | def forward(self, x): 23 | return self.double_conv(x) 24 | 25 | 26 | class Down(nn.Module): 27 | def __init__(self, in_channels, out_channels): 28 | super().__init__() 29 | self.maxpool_conv = nn.Sequential( 30 | nn.MaxPool2d(2), 31 | DoubleConv(in_channels, out_channels) 32 | ) 33 | 34 | def forward(self, x): 35 | return self.maxpool_conv(x) 36 | 37 | 38 | class Up(nn.Module): 39 | def __init__(self, in_channels, out_channels, bilinear=True): 40 | super().__init__() 41 | 42 | if bilinear: 43 | self.up = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True) 44 | self.conv = DoubleConv(in_channels, out_channels, in_channels // 2) 45 | else: 46 | self.up = nn.ConvTranspose2d(in_channels, in_channels // 2, kernel_size=2, stride=2) 47 | self.conv = DoubleConv(in_channels, out_channels) 48 | 49 | def forward(self, x1, x2): 50 | x1 = self.up(x1) 51 | # input is CHW 52 | diffY = x2.size()[2] - x1.size()[2] 53 | diffX = x2.size()[3] - x1.size()[3] 54 | 55 | x1 = F.pad(x1, [diffX // 2, diffX - diffX // 2, 56 | diffY // 2, diffY - diffY // 2]) 57 | 58 | x = torch.cat([x2, x1], dim=1) 59 | return self.conv(x) 60 | 61 | 62 | class OutConv(nn.Module): 63 | def __init__(self, in_channels, out_channels): 64 | super(OutConv, self).__init__() 65 | self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=1) 66 | 67 | def forward(self, x): 68 | return self.conv(x) 69 | 70 | class UNet(nn.Module): 71 | def __init__(self, n_channels, n_classes, bilinear=False): 72 | super(UNet, self).__init__() 73 | self.n_channels = n_channels 74 | self.n_classes = n_classes 75 | self.bilinear = bilinear 76 | 77 | self.inc = (DoubleConv(n_channels, 64)) 78 | self.down1 = (Down(64, 128)) 79 | self.down2 = (Down(128, 256)) 80 | 
self.down3 = (Down(256, 512)) 81 | factor = 2 if bilinear else 1 82 | self.down4 = (Down(512, 1024 // factor)) 83 | self.up1 = (Up(1024, 512 // factor, bilinear)) 84 | self.up2 = (Up(512, 256 // factor, bilinear)) 85 | self.up3 = (Up(256, 128 // factor, bilinear)) 86 | self.up4 = (Up(128, 64, bilinear)) 87 | self.outc = (OutConv(64, n_classes)) 88 | 89 | def forward(self, x): 90 | x1 = self.inc(x) 91 | x2 = self.down1(x1) 92 | x3 = self.down2(x2) 93 | x4 = self.down3(x3) 94 | x5 = self.down4(x4) 95 | x = self.up1(x5, x4) 96 | x = self.up2(x, x3) 97 | x = self.up3(x, x2) 98 | x = self.up4(x, x1) 99 | logits = self.outc(x) 100 | return logits 101 | -------------------------------------------------------------------------------- /torch-neuronx/training/zero1_gpt2/config_1p5B_gpt2.json: -------------------------------------------------------------------------------- 1 | { 2 | "activation_function": "gelu_new", 3 | "architectures": [ 4 | "GPT2LMHeadModel" 5 | ], 6 | "attn_pdrop": 0.1, 7 | "bos_token_id": 50256, 8 | "embd_pdrop": 0.1, 9 | "eos_token_id": 50256, 10 | "initializer_range": 0.02, 11 | "layer_norm_epsilon": 1e-05, 12 | "model_type": "gpt2", 13 | "n_ctx": 1024, 14 | "n_embd": 1600, 15 | "n_head": 25, 16 | "n_layer": 48, 17 | "n_positions": 1024, 18 | "output_past": true, 19 | "resid_pdrop": 0.1, 20 | "summary_activation": null, 21 | "summary_first_dropout": 0.1, 22 | "summary_proj_to_labels": true, 23 | "summary_type": "cls_index", 24 | "summary_use_proj": true, 25 | "task_specific_params": { 26 | "text-generation": { 27 | "do_sample": true, 28 | "max_length": 50 29 | } 30 | }, 31 | "vocab_size": 50257 32 | } 33 | -------------------------------------------------------------------------------- /torch-neuronx/training/zero1_gpt2/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers==4.27.3 2 | accelerate==0.17 3 | datasets 4 | tensorboard==2.12.2 5 | huggingface-hub<0.23 6 | -------------------------------------------------------------------------------- /torch-neuronx/training/zero1_gpt2/run_clm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -o pipefail 3 | 4 | sudo rmmod neuron; sudo modprobe neuron 5 | sudo sysctl -w net.ipv4.ip_local_reserved_ports=44000,48620 6 | sudo sysctl -w kernel.threads-max=10000000 7 | ulimit -c unlimited 8 | 9 | NUM_NEURONCORES=32 10 | DISTRIBUTED_ARGS="--nproc_per_node $NUM_NEURONCORES" 11 | 12 | LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc.so.4" 13 | MALLOC_ARENA_MAX=64 14 | echo "MALLOC_ARENA_MAX" $MALLOC_ARENA_MAX 15 | echo "LD_PRELOAD" $LD_PRELOAD 16 | 17 | if [ ! 
-z "$SLURM_NTASKS" ]; then 18 | # if running inside slurm, handle here 19 | MASTER_ADDR=(`scontrol show hostnames $SLURM_JOB_NODELIST`) 20 | MASTER_PORT=2022 21 | WORLD_SIZE_JOB=$SLURM_NTASKS 22 | RANK_NODE=$SLURM_NODEID 23 | JOB_ID_TAG=job-"$SLURM_JOB_ID" 24 | DISTRIBUTED_ARGS="--nproc_per_node $NUM_NEURONCORES --nnodes $WORLD_SIZE_JOB --node_rank $RANK_NODE --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 25 | echo $DISTRIBUTED_ARGS 26 | export NEURON_RT_ROOT_COMM_ID=$MASTER_ADDR:46820 27 | export FI_EFA_FORK_SAFE=1 28 | export FI_EFA_USE_DEVICE_RDMA=1 29 | export FI_PROVIDER=efa 30 | echo "WORLD_SIZE_JOB=$WORLD_SIZE_JOB, RANK_NODE=$RANK_NODE, MASTER_ADDR_JOB=$MASTER_ADDR_JOB, NODE_LIST=$NODE_LIST" 31 | export TRANSFORMERS_CACHE=$HOME/hf_cache/`hostname`/hub 32 | export HF_DATASETS_CACHE=$HOME/hf_cache/`hostname`/datasets 33 | fi 34 | 35 | #Print Slurm Config 36 | date;hostname; 37 | 38 | export TRAINING_PRECISION=$1 #options FP32, BF16, MIXED 39 | export NEURON_RT_STOCHASTIC_ROUNDING_EN=1 40 | 41 | if [[ "BF16" == $TRAINING_PRECISION ]]; then 42 | echo "USING BF16 ONLY" 43 | export XLA_USE_BF16=1 44 | export NEURON_CC_FLAGS="--retry_failed_compilation --distribution-strategy llm-training --model-type transformer" 45 | elif [[ "MIXED" == $TRAINING_PRECISION ]]; then 46 | echo "USING MIXED PRECISION BF16 and FP32" 47 | export NEURON_CC_FLAGS="--retry_failed_compilation --enable-mixed-precision-accumulation --distribution-strategy llm-training --model-type transformer" 48 | else 49 | echo "USING FP32 as default" 50 | export NEURON_CC_FLAGS="--retry_failed_compilation --distribution-strategy llm-training --model-type transformer" 51 | fi 52 | 53 | NEURON_CC_FLAGS+=" --cache_dir=$HOME/neuron_cache/gpt_1p5B/`hostname`" 54 | 55 | export DISABLE_NUMERIC_CC_TOKEN=1 56 | export NEURON_RT_HIERARCHICAL_CC=1 57 | 58 | export NEURON_RT_EXEC_TIMEOUT=600 59 | export TF_NUM_INTEROP_THREADS=8192 60 | 61 | export NEURON_ENABLE_NOSEED_DROPOUT=1 62 | 63 | GRAD_ACCUM_STEP=1 64 | BATCH_SIZE=1 65 | MODEL_CONFIG="config_1p5B_gpt2.json" 66 | MODEL_SIZE=$(echo $CONFIG | grep -m 1 -Eo '[0-9MBp]+' | head -n1 | tr -d '\n') 67 | DATASET_CONFIG=$2 68 | 69 | if [ $GRAD_ACCUM_STEP -gt 1 ]; then 70 | echo "need to uncomment accelerator.py code to run" 71 | ./uncomment_gradaccum.sh 72 | fi 73 | 74 | MAX_STEPS=100000 75 | LOG_FILE_NAME="run_log_hf_gpt2_param_"$MODEL_SIZE"_nodes"$WORLD_SIZE_JOB"_grad_accum"$GRAD_ACCUM_STEP"_bs"$BATCH_SIZE_$(date +"%m-%d-%Y")_$(date +"%H:%M:%S") 76 | if [[ "$NEURON_EXTRACT_GRAPHS_ONLY" == "1" ]]; then 77 | MAX_STEPS=10 78 | LOG_FILE_NAME="compile_log_hf_gpt2_param_"$MODEL_SIZE"_grad_accum"$GRAD_ACCUM_STEP"_bs"$BATCH_SIZE_$(date +"%m-%d-%Y")_$(date +"%H:%M:%S") 79 | fi 80 | 81 | torchrun $DISTRIBUTED_ARGS run_clm_no_trainer.py \ 82 | --model_name_or_path gpt2 \ 83 | --dataset_name wikitext \ 84 | --dataset_config_name $DATASET_CONFIG \ 85 | --config_name $MODEL_CONFIG \ 86 | --per_device_train_batch_size $BATCH_SIZE \ 87 | --gradient_accumulation_steps $GRAD_ACCUM_STEP \ 88 | --max_train_steps $MAX_STEPS \ 89 | --weight_decay 0.01 \ 90 | --learning_rate 0.00015 \ 91 | --lr_scheduler_type cosine \ 92 | --use_zero1 \ 93 | --gradient_checkpointing \ 94 | --seed 1234 \ 95 | --num_warmup_steps 75 \ 96 | --use_grad_clipping \ 97 | --validation_split_percentage 0 \ 98 | --output_dir gpt_1p5B \ 99 | |& tee $LOG_FILE_NAME 100 | -------------------------------------------------------------------------------- /torch-neuronx/training/zero1_gpt2/run_clm.slurm: 
-------------------------------------------------------------------------------- /torch-neuronx/training/zero1_gpt2/run_clm.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --nodes=4 3 | #SBATCH --exclusive 4 | #SBATCH --output=slurm-%x-%j.out 5 | 6 | srun ./run_clm.sh MIXED wikitext-103-raw-v1 7 | -------------------------------------------------------------------------------- /torch-neuronx/training/zero1_gpt2/run_clm_compile.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --nodes=4 3 | #SBATCH --exclusive 4 | #SBATCH --output=slurm-%x-%j.out 5 | 6 | srun neuron_parallel_compile ./run_clm.sh MIXED wikitext-103-raw-v1 7 | -------------------------------------------------------------------------------- /torch-neuronx/training/zero1_gpt2/uncomment_gradaccum.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | script_output=$(python3 -c 'import sysconfig; print(sysconfig.get_paths()["purelib"])') 3 | script_output+="/accelerate/accelerator.py" 4 | 5 | experiment_grad_accum() { 6 | echo 'uncommenting the assertion to run grad_accum steps > 1' 7 | # look for "Gradient accumulation on TPU is not supported. Pass in `gradient_accumulation_steps=1`" 8 | ln=$(grep -wn "NotImplementedError" $script_output | cut -d: -f1) 9 | let start=$ln-2 10 | let end=$ln+3 11 | let tagln=$start-1 12 | sed -i "${tagln}a \\ #ExperimentalHackOn" $script_output 13 | while [[ start -le $end ]] 14 | do 15 | sed -i "$start s/./#&/" $script_output 16 | ((start = start + 1)) 17 | done 18 | } 19 | 20 | if grep -r 'ExperimentalHackOn' $script_output; then 21 | echo Already edited the accelerator code 22 | else 23 | echo Editing accelerator code 24 | experiment_grad_accum 25 | fi 26 | -------------------------------------------------------------------------------- /torch-neuronx/transformers-neuronx/README.md: -------------------------------------------------------------------------------- 1 | # PyTorch Neuron (transformers-neuronx) Samples for AWS Inf2 & Trn1 2 | 3 | This directory contains sample Jupyter Notebooks demonstrating tensor parallel inference for various PyTorch large language models (LLMs) on [AWS Inferentia](https://aws.amazon.com/ec2/instance-types/inf2/) (Inf2) instances and [AWS Trainium](https://aws.amazon.com/machine-learning/trainium/) (Trn1) instances. 4 | 5 | For additional information on these samples, please refer to the tutorials found in the official Inferentia and Trainium documentation.
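Note that `inference/gpt-j-dp.py` below loads its weights from a local split-checkpoint directory (`./gptj-6b-split`) rather than directly from the Hugging Face Hub. A sketch of the usual one-time preparation step, assuming the `save_pretrained_split` utility from transformers-neuronx:

```python
from transformers import AutoModelForCausalLM
from transformers_neuronx.module import save_pretrained_split

# Download the weights once on the host, then write them out as split
# state-dict files that the Neuron sampling classes can load with lower
# host memory pressure.
model = AutoModelForCausalLM.from_pretrained('EleutherAI/gpt-j-6B', low_cpu_mem_usage=True)
save_pretrained_split(model, './gptj-6b-split')
```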
6 | 7 | ## Inference 8 | 9 | The following samples are available for LLM tensor parallel inference: 10 | 11 | | Name | Instance type | 12 | |-------------------------------------------------------------| --------------- | 13 | | [facebook/opt-13b](inference/facebook-opt-13b-sampling.ipynb) | Inf2 & Trn1 | 14 | | [facebook/opt-30b](inference/facebook-opt-30b-sampling.ipynb) | Inf2 & Trn1 | 15 | | [facebook/opt-66b](inference/facebook-opt-66b-sampling.ipynb) | Inf2 | 16 | | [meta-llama/Llama-2-13b](inference/meta-llama-2-13b-sampling.ipynb) | Inf2 & Trn1 | 17 | -------------------------------------------------------------------------------- /torch-neuronx/transformers-neuronx/inference/gpt-j-dp.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import torch 4 | from transformers import AutoTokenizer 5 | from transformers_neuronx.gptj.model import GPTJForSampling 6 | from multiprocessing import Process 7 | 8 | def load_model_infer(): 9 | # load model to NeuronCores with 8-way tensor parallel and DP 10 | load_compile_time = time.time() 11 | neuron_model = GPTJForSampling.from_pretrained('./gptj-6b-split', n_positions=1024, batch_size=64, tp_degree=8, amp='f16') 12 | neuron_model.to_neuron() 13 | load_compile_elapsed = time.time() - load_compile_time 14 | print(f'Model load & compile time in a single process: {load_compile_elapsed} seconds') 15 | 16 | # construct a tokenizer and encode prompt text 17 | tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-j-6B') 18 | 19 | batch_prompts = [ 20 | "I am specialized at sentence generation language models,", 21 | ] 22 | batch_prompts = batch_prompts * 64 23 | 24 | input_ids = torch.as_tensor([tokenizer.encode(text) for text in batch_prompts]) 25 | 26 | 27 | with torch.inference_mode(): 28 | # warmup 29 | generated_sequences = neuron_model.sample(input_ids, sequence_length=1024) 30 | 31 | start = time.time() 32 | for i in range(2): 33 | generated_sequences = neuron_model.sample(input_ids, sequence_length=1024) 34 | elapsed = (time.time() - start) / 2 35 | 36 | generated_sequences = [tokenizer.decode(seq) for seq in generated_sequences] 37 | print(f'Average latency for one inference: {elapsed} seconds') 38 | 39 | if __name__ == '__main__': 40 | os.environ['NEURON_RT_NUM_CORES']='8' 41 | total_start = time.time() 42 | p1 = Process(target=load_model_infer) 43 | p2 = Process(target=load_model_infer) 44 | p3 = Process(target=load_model_infer) 45 | p1.start() 46 | p2.start() 47 | p3.start() 48 | p1.join() 49 | p2.join() 50 | p3.join() 51 | total_elapsed = time.time() - total_start 52 | print(f'All processes, including compilation, finished in {total_elapsed} seconds') 53 | print(f'TPS {(30/total_elapsed)*64} ') 54 | p1.terminate() 55 | p2.terminate() 56 | p3.terminate() --------------------------------------------------------------------------------
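A closing note on how the data-parallel GPT-J script above budgets NeuronCores; the numbers below are illustrative, derived from the constants in the script (three worker processes, `tp_degree=8`, `NEURON_RT_NUM_CORES=8`):

```python
# Core budget for the data-parallel GPT-J example (illustrative).
tp_degree = 8   # NeuronCores per model replica (tensor parallelism)
replicas = 3    # one process per replica: p1, p2, p3

cores_needed = tp_degree * replicas
print(cores_needed)  # 24: NEURON_RT_NUM_CORES=8 caps each process at an
                     # 8-core slice, so three replicas fit within the 32
                     # NeuronCores of a trn1.32xlarge.
```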