├── .github ├── pull_request_template.md └── workflows │ └── aggregate-prs.yml ├── .gitignore ├── CODEOWNERS ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── NOTICE ├── README.md ├── inference-benchmarking ├── Readme.md ├── accuracy.py ├── clients │ ├── __init__.py │ ├── base.py │ ├── lm_eval │ │ ├── __init__.py │ │ ├── client.py │ │ └── scripts │ │ │ ├── __init__.py │ │ │ ├── run_lm_eval.sh │ │ │ └── setup_lm_eval.sh │ └── long_bench │ │ ├── __init__.py │ │ ├── client.py │ │ └── prompts │ │ ├── 0shot.txt │ │ ├── 0shot_cot.txt │ │ ├── 0shot_cot_ans.txt │ │ ├── 0shot_no_context.txt │ │ ├── 0shot_rag.txt │ │ └── __init__.py ├── config.yaml ├── requirements.txt ├── server │ ├── __init__.py │ ├── scripts │ │ ├── __init__.py │ │ └── start_server.sh │ └── vllm.py ├── server_config.py └── utils │ ├── __init__.py │ ├── artifacts.py │ ├── parser.py │ ├── process.py │ └── s3.py ├── releasenotes.md ├── tensorflow-neuron ├── README.md └── inference │ └── unet │ └── UnetTF2.ipynb ├── torch-neuron ├── README.md └── inference │ ├── beit │ └── BEiT.ipynb │ ├── bertbasecased │ └── BertBaseCased.ipynb │ ├── bertlargeuncased │ └── BertLargeUncased.ipynb │ ├── clip │ └── CLIP_Model_HF.ipynb │ ├── common │ ├── processing.py │ └── wrapper.py │ ├── craft │ └── Craft.ipynb │ ├── efficientnet │ └── EfficientNet.ipynb │ ├── fairseq │ └── Fairseq.ipynb │ ├── gfl_mmdet │ └── GFL.ipynb │ ├── hrnet │ └── HRnet.ipynb │ ├── marianmt │ └── MarianMT.ipynb │ ├── rcnn │ └── Rcnn.ipynb │ ├── resnet │ └── Resnet.ipynb │ ├── resnext │ └── Resnext.ipynb │ ├── robertabase │ └── RobertaBase.ipynb │ ├── ssd │ └── SSD300VGG16.ipynb │ ├── trocr │ └── TrOCR.ipynb │ ├── vgg │ └── VGG.ipynb │ ├── vit │ └── ViT.ipynb │ ├── yolof_detectron2 │ └── YoloF.ipynb │ ├── yolov5 │ └── Yolov5.ipynb │ ├── yolov6 │ └── Yolov6.ipynb │ └── yolov7 │ └── Yolov7.ipynb └── torch-neuronx ├── README.md ├── inference ├── customop_mlp │ ├── README.md │ ├── neuron-multicore │ │ ├── build.py │ │ ├── inference.py │ │ ├── model.py │ │ ├── my_ops.py │ │ ├── relu.cpp │ │ └── shape.cpp │ ├── neuron-tcm │ │ ├── build.py │ │ ├── inference.py │ │ ├── model.py │ │ ├── my_ops.py │ │ ├── relu.cpp │ │ └── shape.cpp │ └── neuron │ │ ├── build.py │ │ ├── inference.py │ │ ├── model.py │ │ ├── my_ops.py │ │ ├── relu.cpp │ │ └── shape.cpp ├── hf_pretrained_bert_inference_on_trn1.ipynb ├── hf_pretrained_clip_base_inference_on_inf2.ipynb ├── hf_pretrained_clip_large_inference_on_inf2.ipynb ├── hf_pretrained_distilbert_Inference_on_trn1.ipynb ├── hf_pretrained_gpt2_feature_extraction_on_trn1.ipynb ├── hf_pretrained_perceiver_language_inference.ipynb ├── hf_pretrained_perceiver_multimodal_inference.ipynb ├── hf_pretrained_perceiver_vision_inference.ipynb ├── hf_pretrained_pixart_alpha_inference_on_inf2.ipynb ├── hf_pretrained_pixart_sigma_1k │ ├── compile_latency_optimized.sh │ ├── compile_throughput_optimized.sh │ ├── hf_pretrained_pixart_sigma_1k_latency_optimized.ipynb │ ├── hf_pretrained_pixart_sigma_1k_throughput_optimized.ipynb │ ├── neuron_pixart_sigma │ │ ├── cache_hf_model.py │ │ ├── compile_decoder.py │ │ ├── compile_text_encoder.py │ │ ├── compile_transformer_latency_optimized.py │ │ ├── compile_transformer_throughput_optimized.py │ │ ├── neuron_commons.py │ │ └── neuron_parallel_utils.py │ └── requirements.txt ├── hf_pretrained_pixart_sigma_inference_on_inf2.ipynb ├── hf_pretrained_roberta_inference_on_frn1.ipynb ├── hf_pretrained_sd15_512_inference.ipynb ├── hf_pretrained_sd2_512_inference.ipynb ├── hf_pretrained_sd2_768_inference.ipynb ├── 
hf_pretrained_sd2_inpainting_936_624_inference.ipynb ├── hf_pretrained_sd_x4_upscaler_inference.ipynb ├── hf_pretrained_sdxl_base_1024_inference.ipynb ├── hf_pretrained_sdxl_base_and_refiner_1024_inference.ipynb ├── hf_pretrained_vit_inference_on_inf2.ipynb ├── hf_pretrained_wav2vec2_conformer_relpos_inference_on_inf2.ipynb ├── hf_pretrained_wav2vec2_conformer_rope_inference_on_inf2.ipynb ├── pretrained_unet_inference_on_trn1.ipynb ├── sd2_inpainting_mask.png ├── sd2_inpainting_photo.png ├── tv_pretrained_resnet50_inference_on_trn1.ipynb └── tv_pretrained_vgg_inference_on_trn1.ipynb ├── microbenchmark ├── matmult_linear.py ├── microbenchmark.ipynb └── ubench_utils.py ├── training ├── aws-batch │ ├── all-reduce │ │ ├── README.md │ │ ├── build_configs_and_setup.sh │ │ ├── docker │ │ │ ├── Dockerfile │ │ │ ├── allreduce.py │ │ │ └── allreduce.sh │ │ ├── submit_job.sh │ │ └── templates │ │ │ ├── build_docker_image.sh │ │ │ ├── compute_env.json │ │ │ ├── create_resources.sh │ │ │ ├── job_def.json │ │ │ ├── job_queue.json │ │ │ └── launch_template.json │ └── llama2 │ │ ├── README.md │ │ ├── config.txt │ │ ├── docker │ │ ├── Dockerfile │ │ └── llama_batch_training.sh │ │ ├── images │ │ └── aws-batch.png │ │ ├── scripts │ │ ├── build_and_push_docker_image.sh │ │ ├── cleanup.sh │ │ ├── create_resources.sh │ │ ├── download_and_tokenize_data.sh │ │ └── submit_batch_job.sh │ │ ├── setup.sh │ │ └── templates │ │ ├── compute_env.json │ │ ├── job_def.json │ │ ├── job_queue.json │ │ └── launch_template.json ├── common │ ├── hf_utils.py │ └── vision_utils.py ├── customop_mlp │ ├── README.md │ ├── neuron │ │ ├── build.py │ │ ├── model.py │ │ ├── my_ops.py │ │ ├── relu.cpp │ │ ├── shape.cpp │ │ └── train.py │ └── pytorch │ │ ├── build.py │ │ ├── model.py │ │ ├── my_ops.py │ │ ├── relu.cpp │ │ └── train_cpu.py ├── dp_bert_hf_pretrain │ ├── adamw_fp32_optim_params.py │ ├── adamw_fp32_params_copy.py │ ├── dp_bert_large_hf_pretrain_hdf5.py │ ├── dp_bert_large_hf_pretrain_hdf5_THIRD-PARTY-LICENSES.txt │ ├── lamb.py │ ├── requirements.txt │ ├── run_dp_bert_large_hf_pretrain_bf16_s128.sh │ ├── run_dp_bert_large_hf_pretrain_bf16_s128_lamb.sh │ ├── run_dp_bert_large_hf_pretrain_bf16_s512_lamb_phase2.sh │ └── run_dp_bert_large_hf_pretrain_bf16_s512_phase2.sh ├── hf_bert_jp │ └── bert-jp-tutorial.ipynb ├── hf_contrastive_image_text │ ├── CLIPBase.ipynb │ ├── CLIPLarge.ipynb │ └── run_clip.py ├── hf_image_classification │ ├── VisionPerceiverConv.ipynb │ ├── run_image_classification.py │ └── vit.ipynb ├── hf_language_modeling │ └── gpt2 │ │ ├── gpt2.ipynb │ │ └── run_clm.patch ├── hf_sentiment_analysis │ ├── .gitignore │ ├── 01-hf-single-neuron.ipynb │ ├── 02-hf-distributed-training.ipynb │ ├── README.md │ ├── code │ │ ├── 01-trainium-single-core │ │ │ └── train.py │ │ └── 02-trainium-distributed-training │ │ │ └── train.py │ └── data │ │ ├── data.csv │ │ ├── test.csv │ │ └── train.csv ├── hf_summarization │ ├── BartLarge.ipynb │ ├── T5Large.ipynb │ └── run_summarization.py ├── hf_text_classification │ ├── AlbertBase.ipynb │ ├── BertBaseCased.ipynb │ ├── BertBaseUncased.ipynb │ ├── BertLargeCased.ipynb │ ├── BertLargeUncased.ipynb │ ├── CamembertBase.ipynb │ ├── DistilbertBaseUncased.ipynb │ ├── ElectraSmall.ipynb │ ├── LanguagePerceiver.ipynb │ ├── README.md │ ├── RobertaBase.ipynb │ ├── RobertaLarge.ipynb │ ├── XlmRobertaBase.ipynb │ └── run_glue.py ├── llama2 │ ├── adamw_fp32_optim_params.py │ ├── convert_checkpoints.py │ ├── get_dataset.py │ ├── modeling_llama_nxd.py │ └── requirements.txt ├── mnist_mlp │ ├── eval.py 
│ ├── eval_using_trace.py │ ├── model.py │ ├── train.py │ ├── train_cpu.py │ ├── train_torchrun.py │ └── train_xmp.py ├── resnet50 │ ├── resnet50.ipynb │ └── run_image_classification.py ├── stable_diffusion │ ├── requirements.txt │ ├── run.py │ └── sd_training_neuron.py ├── tp_dp_bert_hf_pretrain │ ├── requirements.txt │ └── tp_dp_bert_large_hf_pretrain_hdf5.py ├── tp_dp_gpt_neox_hf_pretrain │ └── common │ │ ├── adamw_fp32_optim_params.py │ │ ├── get_dataset.py │ │ └── requirements.txt ├── unet_image_segmentation │ ├── model.py │ ├── train.py │ └── unet.ipynb └── zero1_gpt2 │ ├── config_1p5B_gpt2.json │ ├── neuron_utils.py │ ├── requirements.txt │ ├── run_clm.sh │ ├── run_clm.slurm │ ├── run_clm_compile.slurm │ ├── run_clm_no_trainer.py │ └── uncomment_gradaccum.sh └── transformers-neuronx ├── README.md └── inference ├── codellama-13b-16k-sampling.ipynb ├── facebook-opt-13b-sampling.ipynb ├── facebook-opt-30b-sampling.ipynb ├── facebook-opt-66b-sampling.ipynb ├── gpt-j-6b-sampling-dp.ipynb ├── gpt-j-6b-sampling.ipynb ├── gpt-j-dp.py ├── llama-3.1-405b-multinode-16k-sampling.ipynb ├── llama-3.1-70b-64k-sampling.ipynb ├── llama-3.1-70b-eagle-speculative-decoding.ipynb ├── llama-3.1-70b-speculative-decoding.ipynb ├── llama-3.1-8b-128k-sampling.ipynb ├── llama-3.1-8b-32k-sampling.ipynb ├── llama-70b-sampling.ipynb ├── meta-llama-2-13b-sampling.ipynb ├── meta-llama-3-70b-sampling.ipynb ├── meta-llama-3-8b-sampling.ipynb ├── meta-llama-3.1-70b-sampling.ipynb ├── meta-llama-3.1-8b-sampling.ipynb ├── mistralai-Mistral-7b-Instruct-v0.2.ipynb ├── mixtral-8x7b-sampling.ipynb └── speculative_sampling.ipynb /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | 2 | *Description:* 3 | 4 | *Issue #, sim, or t.corp if available:* 5 | 6 | * Link to RTD for my changes: https://github.com/aws-neuron/aws-neuron-samples-staging/YOUR_BRANCH_NAME/ 7 | 8 | * Submitter Checklist 9 | * Tested on: Neuron SDK release version, instance type. 10 | * I've completely filled out the form above! 11 | **(MANDATORY)** The PR needs test run output: 12 | 13 | * I have provided the output with expected metrics in a metrics.json file 14 | 15 | * I have attached metrics.json in the PR 16 | 17 | * I have attached golden_step_loss.txt 18 | 19 | * I have added a screenshot of the plotted loss curve 20 | 21 | * (If applicable) I've automated a test to safeguard my changes from regression. 22 | * (If applicable) I've posted test collateral to prove my change was effective and not harmful. 23 | * (If applicable) I've added someone from QA to the list of reviewers. Do this if you didn't make an automated test or feel it's appropriate for another reason. 24 | * (If applicable) I've reviewed the licenses of updated and new binaries and their dependencies to make sure all licenses are on the pre-approved Amazon license list.
25 | * Reviewer Checklist 26 | * I've verified the changes render correctly on RTD (link above) 27 | * I've ensured the submitter completed the form 28 | * (If appropriate) I've verified the metrics.json file provided by the submitter 29 | 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /.github/workflows/aggregate-prs.yml: -------------------------------------------------------------------------------- 1 | name: Merge PR into Dynamic Branch on Label 2 | 3 | on: 4 | pull_request_target: 5 | types: [labeled, synchronize] 6 | branches: 7 | - master 8 | 9 | jobs: 10 | merge-to-dynamic-branch: 11 | if: github.event.label.name != 'do-not-merge' # Excludes PRs labeled with do-not-merge 12 | runs-on: ubuntu-latest 13 | steps: 14 | - name: Checkout Repository 15 | uses: actions/checkout@v2 16 | with: 17 | ref: ${{ github.event.pull_request.head.ref }} 18 | fetch-depth: 0 19 | 20 | - name: Configure Git 21 | run: | 22 | git config user.name "GitHub Actions" 23 | git config user.email "actions@github.com" 24 | 25 | - name: Check PR Labels and Merge for New Commit Events 26 | if: github.event.action == 'synchronize' 27 | run: | 28 | LABELS_JSON=$(gh pr view ${{ github.event.pull_request.number }} --json labels) 29 | LABELS=$(echo "$LABELS_JSON" | jq -r '.labels[].name') 30 | for LABEL_BRANCH in $LABELS; do 31 | # Check if the branch exists 32 | if git show-ref --verify --quiet refs/heads/$LABEL_BRANCH; then 33 | echo "Branch $LABEL_BRANCH already exists." 34 | else 35 | echo "Branch $LABEL_BRANCH does not exist, creating it." 36 | git branch $LABEL_BRANCH origin/master 37 | fi 38 | git checkout $LABEL_BRANCH 39 | 40 | # Merge PR changes into dynamic branch 41 | git merge ${{ github.event.pull_request.head.sha }} --no-ff --no-commit 42 | git commit -m "Merged PR #${{ github.event.pull_request.number }} due to new commits on labeled PR" 43 | git push origin $LABEL_BRANCH 44 | done 45 | env: 46 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 47 | 48 | - name: Merge for Labeled Event 49 | if: github.event.action == 'labeled' 50 | run: | 51 | LABEL_BRANCH=${{ github.event.label.name }} 52 | # Check if the branch exists 53 | if git show-ref --verify --quiet refs/heads/$LABEL_BRANCH; then 54 | echo "Branch $LABEL_BRANCH already exists." 55 | else 56 | echo "Branch $LABEL_BRANCH does not exist, creating it." 57 | git branch $LABEL_BRANCH origin/master 58 | fi 59 | git checkout $LABEL_BRANCH 60 | 61 | # Merge PR changes into dynamic branch 62 | git merge ${{ github.event.pull_request.head.sha }} --no-ff --no-commit 63 | git commit -m "Merged PR #${{ github.event.pull_request.number }} due to label '$LABEL_BRANCH'" 64 | git push origin $LABEL_BRANCH 65 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .ipynb_checkpoints 3 | **/__pycache__ 4 | -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | # This file defines code owners for the repository. It allows setting code reviewers for all pull requests that merge to the master branch 2 | # Each line is a file pattern followed by one or more owners.
3 | 4 | # Reference guide - https://docs.github.com/en/github/creating-cloning-and-archiving-repositories/creating-a-repository-on-github/about-code-owners#example-[…]ners-file 5 | # Example - These owners will be the default owners for everything in 6 | # the repo. Unless a later match takes precedence, 7 | # @global-owner1 and @global-owner2 will be requested for 8 | # review when someone opens a pull request. 9 | # * @global-owner1 @global-owner2 10 | 11 | * @aws-maens @natemail-aws @rgrandhiamzn 12 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 5 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *main* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional documentation on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/).
41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to contribute to. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project, we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public GitHub issue. 55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 60 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Amazon Software License 1.0 2 | 3 | This Amazon Software License ("License") governs your use, reproduction, and 4 | distribution of the accompanying software as specified below. 5 | 6 | 1. Definitions 7 | 8 | "Licensor" means any person or entity that distributes its Work. 9 | 10 | "Software" means the original work of authorship made available under this 11 | License. 12 | 13 | "Work" means the Software and any additions to or derivative works of the 14 | Software that are made available under this License. 15 | 16 | The terms "reproduce," "reproduction," "derivative works," and 17 | "distribution" have the meaning as provided under U.S. copyright law; 18 | provided, however, that for the purposes of this License, derivative works 19 | shall not include works that remain separable from, or merely link (or bind 20 | by name) to the interfaces of, the Work. 21 | 22 | Works, including the Software, are "made available" under this License by 23 | including in or with the Work either (a) a copyright notice referencing the 24 | applicability of this License to the Work, or (b) a copy of this License. 25 | 26 | 2. License Grants 27 | 28 | 2.1 Copyright Grant. Subject to the terms and conditions of this License, 29 | each Licensor grants to you a perpetual, worldwide, non-exclusive, 30 | royalty-free, copyright license to reproduce, prepare derivative works of, 31 | publicly display, publicly perform, sublicense and distribute its Work and 32 | any resulting derivative works in any form. 33 | 34 | 2.2 Patent Grant. Subject to the terms and conditions of this License, each 35 | Licensor grants to you a perpetual, worldwide, non-exclusive, royalty-free 36 | patent license to make, have made, use, sell, offer for sale, import, and 37 | otherwise transfer its Work, in whole or in part. The foregoing license 38 | applies only to the patent claims licensable by Licensor that would be 39 | infringed by Licensor's Work (or portion thereof) individually and 40 | excluding any combinations with any other materials or technology. 41 | 42 | 3. Limitations 43 | 44 | 3.1 Redistribution.
You may reproduce or distribute the Work only if 45 | (a) you do so under this License, (b) you include a complete copy of this 46 | License with your distribution, and (c) you retain without modification 47 | any copyright, patent, trademark, or attribution notices that are present 48 | in the Work. 49 | 50 | 3.2 Derivative Works. You may specify that additional or different terms 51 | apply to the use, reproduction, and distribution of your derivative works 52 | of the Work ("Your Terms") only if (a) Your Terms provide that the use 53 | limitation in Section 3.3 applies to your derivative works, and (b) you 54 | identify the specific derivative works that are subject to Your Terms. 55 | Notwithstanding Your Terms, this License (including the redistribution 56 | requirements in Section 3.1) will continue to apply to the Work itself. 57 | 58 | 3.3 Use Limitation. The Work and any derivative works thereof only may be 59 | used or intended for use with the web services, computing platforms or 60 | applications provided by Amazon.com, Inc. or its affiliates, including 61 | Amazon Web Services, Inc. 62 | 63 | 3.4 Patent Claims. If you bring or threaten to bring a patent claim against 64 | any Licensor (including any claim, cross-claim or counterclaim in a 65 | lawsuit) to enforce any patents that you allege are infringed by any Work, 66 | then your rights under this License from such Licensor (including the 67 | grants in Sections 2.1 and 2.2) will terminate immediately. 68 | 69 | 3.5 Trademarks. This License does not grant any rights to use any 70 | Licensor's or its affiliates' names, logos, or trademarks, except as 71 | necessary to reproduce the notices described in this License. 72 | 73 | 3.6 Termination. If you violate any term of this License, then your rights 74 | under this License (including the grants in Sections 2.1 and 2.2) will 75 | terminate immediately. 76 | 77 | 4. Disclaimer of Warranty. 78 | 79 | THE WORK IS PROVIDED "AS IS" WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 80 | EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF 81 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR 82 | NON-INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER 83 | THIS LICENSE. SOME STATES' CONSUMER LAWS DO NOT ALLOW EXCLUSION OF AN 84 | IMPLIED WARRANTY, SO THIS DISCLAIMER MAY NOT APPLY TO YOU. 85 | 86 | 5. Limitation of Liability. 87 | 88 | EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL 89 | THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE 90 | SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT, 91 | INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF OR 92 | RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK (INCLUDING 93 | BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION, LOST PROFITS 94 | OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER COMMERCIAL DAMAGES 95 | OR LOSSES), EVEN IF THE LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF 96 | SUCH DAMAGES. 97 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # AWS Neuron Samples 2 | 3 | This repository contains samples for [AWS Neuron](https://aws.amazon.com/machine-learning/neuron/), the software development kit (SDK) that enables machine learning (ML) inference and training workloads on the AWS ML accelerator chips [Inferentia](https://aws.amazon.com/machine-learning/inferentia/) and [Trainium](https://aws.amazon.com/machine-learning/trainium/). 4 | 5 | The samples in this repository provide an indication of the types of deep learning models that can be used with Trainium and Inferentia, but do not represent an exhaustive list of supported models. If you have additional model samples that you would like to contribute to this repository, please submit a pull request following the repository's contribution [guidelines](CONTRIBUTING.md). 6 | 7 | Samples are organized by use case (training, inference) and deep learning framework (PyTorch, TensorFlow) below: 8 | 9 | ## Training 10 | 11 | | Framework | Description | Instance Type | 12 | | --- | --- | --- | 13 | | [PyTorch NeuronX (torch-neuronx)](torch-neuronx/README.md#training) | Sample scripts for training various PyTorch models on AWS Trainium | Trn1, Trn1n & Inf2 | 14 | 15 | | Usage | Description | Instance Type | 16 | | --- | --- | --- | 17 | | [Nemo Megatron for Neuron](https://github.com/aws-neuron/neuronx-nemo-megatron) | A library that enables large-scale distributed training of language models such as Llama and is adapted from Nemo Megatron. | Trn1, Trn1n | 18 | | [AWS Neuron samples for ParallelCluster](https://github.com/aws-neuron/aws-neuron-parallelcluster-samples) | How to use AWS ParallelCluster to build an HPC compute cluster that uses Trn1 compute nodes to run your distributed ML training job. | Trn1, Trn1n | 19 | | [AWS Neuron samples for EKS](https://github.com/aws-neuron/aws-neuron-eks-samples) | The samples in this repository demonstrate the types of patterns that can be used to deliver inference and distributed training on EKS using Inferentia and Trainium. | Trn1, Trn1n | 20 | | [AWS Neuron samples for SageMaker](https://github.com/aws-neuron/aws-neuron-sagemaker-samples) | SageMaker samples using ml.trn1 instances for machine learning (ML) training workloads on the AWS ML accelerator chip Trainium.
| Trn1, Trn1n | 21 | 22 | 23 | ## Inference 24 | 25 | | Framework | Description | Instance Type | 26 | | --- | --- | --- | 27 | | [PyTorch NeuronX (torch-neuronx)](torch-neuronx/README.md#inference) | Sample Jupyter notebooks demonstrating model compilation and inference for various PyTorch models on AWS Inferentia2 and Trainium | Inf2 & Trn1 | 28 | | [PyTorch NeuronX (transformers-neuronx)](transformers-neuronx) | Sample Jupyter notebooks demonstrating tensor parallel inference for various PyTorch large language models (LLMs) on AWS Inferentia2 and Trainium | Inf2 & Trn1 | 29 | | [PyTorch Neuron (torch-neuron)](torch-neuron) | Sample Jupyter notebooks demonstrating model compilation and inference for various PyTorch models on AWS Inferentia | Inf1 | 30 | | [TensorFlow Neuron (tensorflow-neuron)](tensorflow-neuron) | Sample Jupyter notebooks demonstrating model compilation and inference for various TensorFlow models on AWS Inferentia | Inf1 | 31 | 32 | | Usage | Description | Instance Type | 33 | | --- | --- | --- | 34 | | [AWS Neuron samples for SageMaker](https://github.com/aws-neuron/aws-neuron-sagemaker-samples) | SageMaker samples using ml.inf2 and ml.trn1 instances for machine learning (ML) inference workloads on the AWS ML accelerator chips Inferentia2 and Trainium. | Inf2 & Trn1 | 35 | 36 | 37 | ## Getting Help 38 | 39 | If you encounter issues with any of the samples in this repository, please open an issue via the GitHub Issues feature. 40 | 41 | ## Contributing 42 | 43 | Please refer to the [CONTRIBUTING](CONTRIBUTING.md) document for details on contributing additional samples to this repository. 44 | 45 | 46 | ## Release Notes 47 | 48 | Please refer to the [Change Log](releasenotes.md). 49 | 50 | ## Known Issues 51 | 52 | | Model | Framework | Training/Inference | Instance Type | Status | 53 | | --- | --- | --- | --- | --- | 54 | | Fairseq | PyTorch | Inference | Inf1 | RuntimeError: No operations were successfully partitioned and compiled to neuron for this model - aborting trace! | 55 | | Yolof | PyTorch | Inference | Inf1 | RuntimeError: No operations were successfully partitioned and compiled to neuron for this model - aborting trace! | 56 | -------------------------------------------------------------------------------- /inference-benchmarking/Readme.md: -------------------------------------------------------------------------------- 1 | # Inference benchmarking 2 | 3 | This folder contains scripts to evaluate the accuracy of LLM inference with open-source datasets. Please refer to the [Accuracy Eval Developer Guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/developer_guides/accuracy-eval-with-datasets.html) or the [Accuracy Evaluation Tutorial](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/tutorials/trn1-llama3.1-70b-instruct-accuracy-eval.html) for details on how to use these scripts. In the future, we will expand this folder with performance benchmarking scripts based on tools such as LLMPerf, as well as additional accuracy evaluation scripts.
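The snippet below is a minimal sketch (assumed usage, run from this folder) of how the `config.yaml` file in this directory can be loaded programmatically with the `ConfigParser` helper from `utils/parser.py`; the printed fields come from the `ServerConfig` and `AccuracyScenario` dataclasses used by these scripts.

```python
# Minimal sketch: load the benchmark configuration into typed dataclasses.
# Assumes it is run from the inference-benchmarking folder so that the
# local `utils` package and `accuracy.py` module are importable.
from utils.parser import ConfigParser

server_config, test_config = ConfigParser.parse_config("config.yaml")
print(f"Server: {server_config.name} (port {server_config.server_port})")
for name, scenario in test_config.accuracy.items():
    # `client` is one of the evaluation clients under clients/, e.g. "lm_eval"
    print(f"Accuracy scenario '{name}' uses client '{scenario.client}'")
```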
-------------------------------------------------------------------------------- /inference-benchmarking/clients/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-neuron/aws-neuron-samples/15be8c363a3cbcf7d0795f782085a3b0b919e599/inference-benchmarking/clients/__init__.py -------------------------------------------------------------------------------- /inference-benchmarking/clients/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from pathlib import Path 3 | from typing import Any, Dict 4 | 5 | 6 | class EvalClient(ABC): 7 | """Base class for evaluation clients""" 8 | 9 | def __init__(self): 10 | self.scripts_dir = Path(__file__).parent 11 | 12 | @abstractmethod 13 | def setup(self) -> None: 14 | """Setup the client (install dependencies, etc.)""" 15 | pass 16 | 17 | @abstractmethod 18 | def run(self, server_port: int, scenario_config: Dict[str, Any]) -> Dict[str, Any]: 19 | """ 20 | Run evaluation and return standardized results 21 | 22 | Returns: 23 | Dict with standardized format: 24 | { 25 | "metrics": { 26 | "metric_name": value, 27 | ... 28 | }, 29 | "metadata": { 30 | "scenario": str, 31 | "client": str, 32 | "timestamp": str, 33 | ... 34 | }, 35 | "raw_results": Dict # Original client output 36 | } 37 | """ 38 | pass 39 | 40 | def _get_script_path(self, script_name: str) -> str: 41 | return str(self.scripts_dir / script_name) 42 | -------------------------------------------------------------------------------- /inference-benchmarking/clients/lm_eval/__init__.py: -------------------------------------------------------------------------------- 1 | from .client import LMEvalClient 2 | -------------------------------------------------------------------------------- /inference-benchmarking/clients/lm_eval/scripts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-neuron/aws-neuron-samples/15be8c363a3cbcf7d0795f782085a3b0b919e599/inference-benchmarking/clients/lm_eval/scripts/__init__.py -------------------------------------------------------------------------------- /inference-benchmarking/clients/lm_eval/scripts/run_lm_eval.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Define default values 4 | model=${1} 5 | model_path=${2} 6 | max_concurrent_req=${3:-1} 7 | port=${4:-8000} 8 | task_name=${5:-"gsm8k_cot"} 9 | results_dir=${6} 10 | timeout=${7:-7200} 11 | limit=${8:-200} 12 | use_chat=${9:-true} 13 | 14 | source ~/lm_eval_venv/bin/activate 15 | 16 | echo "Running LM Eval Client for model: ${model}, model_path: ${model_path}, max_concurrent_req: ${max_concurrent_req}, port: ${port}, task_name: ${task_name}, results_dir: ${results_dir}, timeout: ${timeout}, limit: ${limit}, use_chat: ${use_chat}" 17 | 18 | set -x 19 | 20 | export OPENAI_API_KEY=EMPTY 21 | export OPENAI_API_BASE="http://localhost:${port}/v1" 22 | 23 | # Set the endpoint based on use_chat 24 | if [ "$use_chat" = true ] ; then 25 | endpoint="chat/completions" 26 | model_type="local-chat-completions" 27 | additional_args="--apply_chat_template" 28 | echo "Starting lm_eval with chat completions" 29 | else 30 | endpoint="completions" 31 | model_type="local-completions" 32 | additional_args="" 33 | echo "Starting lm_eval without chat completions" 34 | fi 35 | 36 | # Common arguments with dynamic endpoint 37 | 
common_args=( 38 | "--tasks ${task_name}" 39 | "--model_args model=${model_path},base_url=http://localhost:${port}/v1/${endpoint},tokenized_requests=False,tokenizer_backend=None,num_concurrent=${max_concurrent_req},timeout=${timeout}" 40 | "--log_samples" 41 | "--output_path ${results_dir}" 42 | "--limit ${limit}" 43 | ) 44 | 45 | # Execute the command 46 | python -m lm_eval \ 47 | --model ${model_type} \ 48 | ${common_args[@]} \ 49 | ${additional_args} -------------------------------------------------------------------------------- /inference-benchmarking/clients/lm_eval/scripts/setup_lm_eval.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Install lm_eval into a dedicated virtual environment 3 | cd ~ 4 | python3 -m venv ~/lm_eval_venv 5 | source ~/lm_eval_venv/bin/activate 6 | pip install -U pip 7 | pip install lm_eval[api]==0.4.7 -------------------------------------------------------------------------------- /inference-benchmarking/clients/long_bench/__init__.py: -------------------------------------------------------------------------------- 1 | from .client import LongBenchClient 2 | -------------------------------------------------------------------------------- /inference-benchmarking/clients/long_bench/prompts/0shot.txt: -------------------------------------------------------------------------------- 1 | Please read the following text and answer the question below. 2 | 3 | 4 | $DOC$ 5 | 6 | 7 | What is the correct answer to this question: $Q$ 8 | Choices: 9 | (A) $C_A$ 10 | (B) $C_B$ 11 | (C) $C_C$ 12 | (D) $C_D$ 13 | 14 | Format your response as follows: "The correct answer is (insert answer here)". -------------------------------------------------------------------------------- /inference-benchmarking/clients/long_bench/prompts/0shot_cot.txt: -------------------------------------------------------------------------------- 1 | Please read the following text and answer the questions below. 2 | 3 | 4 | $DOC$ 5 | 6 | 7 | What is the correct answer to this question: $Q$ 8 | Choices: 9 | (A) $C_A$ 10 | (B) $C_B$ 11 | (C) $C_C$ 12 | (D) $C_D$ 13 | 14 | Let’s think step by step: -------------------------------------------------------------------------------- /inference-benchmarking/clients/long_bench/prompts/0shot_cot_ans.txt: -------------------------------------------------------------------------------- 1 | Please read the following text and answer the questions below. 2 | 3 | The text is too long and omitted here. 4 | 5 | What is the correct answer to this question: $Q$ 6 | Choices: 7 | (A) $C_A$ 8 | (B) $C_B$ 9 | (C) $C_C$ 10 | (D) $C_D$ 11 | 12 | Let’s think step by step: $COT$ 13 | 14 | Based on the above, what is the single, most likely answer choice? Format your response as follows: "The correct answer is (insert answer here)". -------------------------------------------------------------------------------- /inference-benchmarking/clients/long_bench/prompts/0shot_no_context.txt: -------------------------------------------------------------------------------- 1 | What is the correct answer to this question: $Q$ 2 | Choices: 3 | (A) $C_A$ 4 | (B) $C_B$ 5 | (C) $C_C$ 6 | (D) $C_D$ 7 | 8 | What is the single, most likely answer choice? Format your response as follows: "The correct answer is (insert answer here)".
-------------------------------------------------------------------------------- /inference-benchmarking/clients/long_bench/prompts/0shot_rag.txt: -------------------------------------------------------------------------------- 1 | Please read the following retrieved text chunks and answer the question below. 2 | 3 | 4 | $DOC$ 5 | 6 | 7 | What is the correct answer to this question: $Q$ 8 | Choices: 9 | (A) $C_A$ 10 | (B) $C_B$ 11 | (C) $C_C$ 12 | (D) $C_D$ 13 | 14 | Format your response as follows: "The correct answer is (insert answer here)". -------------------------------------------------------------------------------- /inference-benchmarking/clients/long_bench/prompts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-neuron/aws-neuron-samples/15be8c363a3cbcf7d0795f782085a3b0b919e599/inference-benchmarking/clients/long_bench/prompts/__init__.py -------------------------------------------------------------------------------- /inference-benchmarking/config.yaml: -------------------------------------------------------------------------------- 1 | server: 2 | name: "Meta-llama3.1-8B-Instruct" 3 | model_path: "/home/ubuntu/models/Meta-llama3.1-8B-Instruct/" 4 | model_s3_path: null 5 | compiled_model_path: "/home/ubuntu/traced_models/Meta-llama3.1-8B-Instruct/" 6 | max_seq_len: 16384 7 | context_encoding_len: 16384 8 | tp_degree: 32 9 | n_vllm_threads: 32 10 | server_port: 8000 11 | continuous_batch_size: 1 12 | 13 | test: 14 | accuracy: 15 | mytest: 16 | client: "lm_eval" 17 | datasets: ["gsm8k_cot", "mmlu_flan_n_shot_generative_computer_security"] 18 | max_concurrent_requests: 1 19 | timeout: 3600 20 | client_params: 21 | limit: 200 22 | use_chat: True -------------------------------------------------------------------------------- /inference-benchmarking/requirements.txt: -------------------------------------------------------------------------------- 1 | datasets 2 | tiktoken 3 | torch 4 | openai 5 | transformers 6 | psutil 7 | botocore -------------------------------------------------------------------------------- /inference-benchmarking/server/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-neuron/aws-neuron-samples/15be8c363a3cbcf7d0795f782085a3b0b919e599/inference-benchmarking/server/__init__.py -------------------------------------------------------------------------------- /inference-benchmarking/server/scripts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-neuron/aws-neuron-samples/15be8c363a3cbcf7d0795f782085a3b0b919e599/inference-benchmarking/server/scripts/__init__.py -------------------------------------------------------------------------------- /inference-benchmarking/server/scripts/start_server.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | model_id=${1} 3 | port=${2:-8000} 4 | cores=${3:-0-31} 5 | max_seq_len=${4:-2048} 6 | cont_batch_size=${5:-32} 7 | tp_size=${6:-32} 8 | n_threads=${7:-32} 9 | file_path="${8:-/home/ubuntu/vllmlogs.log}" 10 | 11 | # Shift positional arguments out of the way before parsing named arguments 12 | shift 8 13 | set -x 14 | 15 | # Default value for override_neuron_config 16 | override_neuron_config="{}" 17 | 18 | # Parse named arguments 19 | while [[ "$#" -gt 0 ]]; do 20 | case $1 in 21 | --speculative-model) draft_model_id="$2"; shift ;; 
22 | --num-speculative-tokens) num_speculative_tokens="$2"; shift ;; 23 | --chat-template) chat_template="$2"; shift ;; 24 | --enable-chunked-prefill) enable_chunked_prefill="$2"; shift ;; 25 | --max-num-batched-tokens) max_num_batched_tokens="$2"; shift ;; 26 | --block-size) block_size="$2"; shift ;; 27 | --num-gpu-blocks-override) num_gpu_blocks_override="$2"; shift ;; 28 | --override-neuron-config) override_neuron_config="$2"; shift ;; 29 | *) echo "Unknown parameter: $1"; exit 1 ;; # Handle unknown parameters 30 | esac 31 | shift # Move to the next argument 32 | done 33 | 34 | # Build base command arguments 35 | cmd_args=( 36 | --model "${model_id}" 37 | --tensor-parallel-size "${tp_size}" 38 | --max-num-seqs "${cont_batch_size}" 39 | --max-model-len "${max_seq_len}" 40 | --port "${port}" 41 | --device "neuron" 42 | --use-v2-block-manager 43 | --disable-log-requests 44 | ) 45 | 46 | # Conditionally add speculative decoding settings when a draft model is given 47 | [ -n "$draft_model_id" ] && { 48 | echo "Setting draft model to: ${draft_model_id}" 49 | cmd_args+=(--speculative-max-model-len "${max_seq_len}") 50 | cmd_args+=(--speculative-model "${draft_model_id}") 51 | cmd_args+=(--num-speculative-tokens "${num_speculative_tokens}") 52 | } 53 | 54 | # Conditionally add chunked prefill settings 55 | [ -n "$enable_chunked_prefill" ] && { 56 | echo "Setting chunked prefill args" 57 | cmd_args+=(--enable-chunked-prefill "${enable_chunked_prefill}") 58 | cmd_args+=(--max-num-batched-tokens "${max_num_batched_tokens}") 59 | cmd_args+=(--block-size "${block_size}") 60 | cmd_args+=(--num-gpu-blocks-override "${num_gpu_blocks_override}") 61 | } 62 | 63 | # Conditionally add override config args 64 | if [[ "${override_neuron_config}" != "{}" ]]; then 65 | cmd_args+=(--override-neuron-config "${override_neuron_config}") 66 | fi 67 | 68 | [ -n "$chat_template" ] && cmd_args+=(--chat-template "${chat_template}") 69 | 70 | echo "Starting VLLM Server for model: ${model_id}" 71 | 72 | export NEURON_RT_DBG_RDH_CC=0 73 | export NEURON_RT_INSPECT_ENABLE=0 74 | export XLA_HANDLE_SPECIAL_SCALAR=1 75 | export UNSAFE_FP8FNCAST=1 76 | export VLLM_NEURON_FRAMEWORK="neuronx-distributed-inference" 77 | 78 | # Execute the command with all arguments 79 | python3 -m vllm.entrypoints.openai.api_server "${cmd_args[@]}" 2>&1 | tee "${file_path}" -------------------------------------------------------------------------------- /inference-benchmarking/server_config.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import Any, Dict, List, Optional 3 | 4 | 5 | @dataclass 6 | class ServerConfig: 7 | name: str 8 | model_path: str 9 | model_s3_path: str 10 | max_seq_len: int 11 | context_encoding_len: int 12 | tp_degree: int 13 | n_vllm_threads: int 14 | server_port: int 15 | continuous_batch_size: int = 1 16 | 17 | # Optional configurations 18 | draft_model_path: Optional[str] = None 19 | draft_model_s3_path: Optional[str] = None 20 | sharded_weights_path: Optional[str] = None 21 | sharded_weights_s3_path: Optional[str] = None 22 | spec_len: Optional[int] = None 23 | speculation_type: Optional[str] = None 24 | compiled_model_path: Optional[str] = None 25 | inference_demo_script: Optional[str] = None 26 | inference_demo_args: Optional[str] = None 27 | scratchpad_page_size: Optional[int] = None 28 | enable_scratchpad_single_core_debugging: Optional[bool] = False 29 | custom_chat_template_path: Optional[str]
= None 30 | 31 | def __post_init__(self): 32 | if self.max_seq_len <= 0: 33 | raise ValueError("max_seq_len must be positive") 34 | if self.context_encoding_len <= 0: 35 | raise ValueError("context_encoding_len must be positive") 36 | if self.tp_degree <= 0: 37 | raise ValueError("tp_degree must be positive") 38 | if self.n_vllm_threads <= 0: 39 | raise ValueError("n_vllm_threads must be positive") 40 | if self.continuous_batch_size <= 0: 41 | raise ValueError("continuous_batch_size must be positive") 42 | if self.server_port < 0 or self.server_port > 65535: 43 | raise ValueError("server_port must be between 0 and 65535") 44 | 45 | # Validate optional configurations 46 | if self.spec_len is not None and self.spec_len <= 0: 47 | raise ValueError("spec_len must be positive if specified") 48 | if self.speculation_type and self.speculation_type not in ["eagle"]: 49 | raise ValueError("speculation_type must be 'eagle' if specified") 50 | -------------------------------------------------------------------------------- /inference-benchmarking/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .process import ( 2 | check_server_terminated, 3 | find_free_port, 4 | is_port_available, 5 | kill_process_and_children, 6 | ) 7 | from .s3 import S3Utils, download_from_s3, get_instance_region 8 | 9 | __all__ = [ 10 | # S3 utilities 11 | "download_from_s3", 12 | "get_instance_region", 13 | "S3Utils", 14 | # System utilities 15 | "kill_process_and_children", 16 | "is_port_available", 17 | "find_free_port", 18 | "check_server_terminated", 19 | ] 20 | -------------------------------------------------------------------------------- /inference-benchmarking/utils/artifacts.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import subprocess 3 | from datetime import datetime 4 | from pathlib import Path 5 | from typing import Any, Dict, List, Optional 6 | 7 | 8 | 9 | from .s3 import download_from_s3 10 | 11 | 12 | class ArtifactManager: 13 | """Manages model artifacts and test artifacts""" 14 | 15 | def __init__(self, base_dir: Optional[Path] = None): 16 | self.base_dir = base_dir or Path("artifacts") 17 | self.base_dir.mkdir(parents=True, exist_ok=True) 18 | 19 | def download_model_artifacts(self, model_config: Dict[str, Any]) -> None: 20 | """Download model and related artifacts""" 21 | print(model_config) 22 | # Download main model 23 | if model_config.get("model_s3_path"): 24 | download_from_s3(model_config["model_s3_path"], model_config["model_path"]) 25 | 26 | # Download draft model if specified 27 | if model_config.get("draft_model_s3_path"): 28 | download_from_s3(model_config["draft_model_s3_path"], model_config["draft_model_path"]) 29 | 30 | # Download sharded weights if specified 31 | if model_config.get("sharded_weights_s3_path"): 32 | download_from_s3( 33 | model_config["sharded_weights_s3_path"], model_config["sharded_weights_path"] 34 | ) 35 | 36 | def save_artifacts(self, artifacts: Dict[str, Path], destination: str) -> None: 37 | """Save artifacts to specified destination""" 38 | for name, path in artifacts.items(): 39 | if path.is_file(): 40 | shutil.copy2(path, self.base_dir / destination / name) 41 | elif path.is_dir(): 42 | shutil.copytree(path, self.base_dir / destination / name) 43 | 44 | def upload_to_s3(self, local_path: Path, s3_path: str, recursive: bool = False) -> bool: 45 | """Upload artifacts to S3""" 46 | cmd = ["aws", "s3"] 47 | cmd.extend(["sync" if recursive else 
"cp"]) 48 | cmd.extend([str(local_path), s3_path]) 49 | 50 | try: 51 | subprocess.run(cmd, check=True) 52 | return True 53 | except subprocess.CalledProcessError as e: 54 | print(f"Failed to upload to S3: {e}") 55 | return False 56 | 57 | def cleanup(self, paths: List[Path]) -> None: 58 | """Cleanup artifact paths""" 59 | for path in paths: 60 | if path.is_file(): 61 | path.unlink() 62 | elif path.is_dir(): 63 | shutil.rmtree(path) 64 | -------------------------------------------------------------------------------- /inference-benchmarking/utils/parser.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import Any, Dict 3 | 4 | import yaml 5 | 6 | import sys 7 | sys.path.append("../") 8 | 9 | from accuracy import AccuracyScenario 10 | from server_config import ServerConfig 11 | 12 | 13 | @dataclass 14 | class TestConfig: 15 | accuracy: Dict[str, AccuracyScenario] = field(default_factory=dict) 16 | upload_artifacts: bool = False 17 | 18 | def __post_init__(self): 19 | # Ensure at least one type of test is configured 20 | if not self.accuracy and not self.performance: 21 | raise ValueError("At least one test type (accuracy or performance) must be configured") 22 | 23 | 24 | class ConfigParser: 25 | @staticmethod 26 | def parse_config(config_path: str) -> tuple[ServerConfig, TestConfig]: 27 | with open(config_path) as f: 28 | config = yaml.safe_load(f) 29 | 30 | # Validation happens during dataclass instantiation 31 | server_config = ServerConfig(**config["server"]) 32 | test_config = TestConfig( 33 | accuracy={ 34 | name: AccuracyScenario(**scenario_config) 35 | for name, scenario_config in config["test"].get("accuracy", {}).items() 36 | }, 37 | ) 38 | 39 | return server_config, test_config 40 | -------------------------------------------------------------------------------- /inference-benchmarking/utils/process.py: -------------------------------------------------------------------------------- 1 | import errno 2 | import os 3 | import signal 4 | import socket 5 | import time 6 | 7 | import psutil 8 | import requests 9 | 10 | 11 | def kill_process_and_children(pid): 12 | try: 13 | print(f"Terminating process with pid {pid}") 14 | parent = psutil.Process(pid) 15 | children = parent.children(recursive=True) 16 | 17 | # Send SIGTERM to parent and children 18 | for process in [parent] + children: 19 | print(f"Sending SIGTERM to process with PID: {process.pid}") 20 | process.send_signal(signal.SIGTERM) 21 | 22 | # Wait for processes to terminate 23 | gone, alive = psutil.wait_procs([parent] + children, timeout=30) 24 | 25 | # If any processes are still alive, send SIGKILL 26 | for process in alive: 27 | print(f"Process with PID: {process.pid} did not terminate, sending SIGKILL") 28 | process.send_signal(signal.SIGKILL) 29 | 30 | print( 31 | f"Successfully terminated process with PID: {pid} and its children: {[child.pid for child in children]}" 32 | ) 33 | except Exception as e: 34 | print(f"Failed to terminate process with PID: {pid}. Exception {e}") 35 | 36 | 37 | def check_server_terminated(url, retries=2, delay=30): 38 | print("Checking if server is in terminated state") 39 | for i in range(retries): 40 | try: 41 | response = requests.get(url) 42 | if response.status_code == 200: 43 | print( 44 | f"Attempt {i + 1}/{retries}: Server is not terminated yet. Re-checking in {delay} seconds..." 
45 | ) 46 | except requests.ConnectionError: 47 | print("Server is in terminated state.") 48 | return True 49 | time.sleep(delay) 50 | 51 | print("Server did not respond within the retry limit.") 52 | return False 53 | 54 | 55 | def is_port_available(port): 56 | sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 57 | try: 58 | sock.bind(("localhost", port)) 59 | return True 60 | except socket.error as e: 61 | if e.errno == errno.EADDRINUSE: 62 | return False 63 | else: 64 | # Handle other potential errors 65 | print(f"Unexpected error checking port {port}: {e}") 66 | return False 67 | finally: 68 | sock.close() 69 | 70 | 71 | def find_free_port(start_port=8000, max_port=65535): 72 | for port in range(start_port, max_port): 73 | if is_port_available(port): 74 | return port 75 | raise RuntimeError("Unable to find a free port") 76 | -------------------------------------------------------------------------------- /releasenotes.md: -------------------------------------------------------------------------------- 1 | # Change Log 2 | 3 | ## September, 15th 2023 4 | * Added notebook script to fine-tune ``deepmind/language-perceiver`` model using ``torch-neuronx``. 5 | * Added notebook script to fine-tune ``clip-large`` model using ``torch-neuronx``. 6 | * Added ``SD XL Base+Refiner`` inference sample script using ``torch-neuronx``. 7 | * Upgraded default ``diffusers`` library from 0.14.0 to latest 0.20.2 in ``Stable Diffusion 1.5`` and ``Stable Diffusion 2.1`` inference scripts. 8 | * Removed the deprecated ``--model-type=transformer-inference`` flag from ``Llama-2-13B`` model inference sample using ``transformers-neuronx`` 9 | 10 | 11 | 12 | ## August, 28th 2023 13 | * Added sample script for LLaMA V2 13B model inference using transformers-neuronx 14 | * Added samples for training GPT-NEOX 20B and 6.9B models using neuronx-distributed 15 | * Added sample scripts for CLIP and Stable Diffusion XL inference using torch-neuronx 16 | * Added sample scripts for vision and language Perceiver models inference using torch-neuronx 17 | * Added camembert training/finetuning example for Trn1 under hf_text_classification in torch-neuronx 18 | * Updated Fine-tuning Hugging Face BERT Japanese model sample in torch-neuronx 19 | * Updated OPT and GPT-J transformers-neuronx inference samples to install transformers-neuronx from whl instead of using github repo 20 | * Upgraded numpy package to 1.21.6 in GPT-2 and several training samples under hf_text_classification in torch-neuronx 21 | * Removed pinning of torch-neuron and tensorflow-neuron libraries and other minor changes in several of torch-neuron and tensorflow-neuron Inf1 inference samples. 22 | 23 | 24 | ## February, 23rd 2023 25 | * Added OPT-13B, OPT-30B, OPT-66B inference examples under transformers-neuronx 26 | * Added distilbert-base-uncased training/finetuning example for Trn1 under torch-neuronx 27 | 28 | ## November, 7th 2022 29 | 30 | * Added Fine-tuning Hugging Face BERT Japanese model sample 31 | 32 | ## November, 4th 2022 33 | * Added HuggingFace Vision Transformer (ViT) training examples for Trn1 under torch-neuronx. 34 | 35 | ## October, 27th 2022 36 | * Added HuggingFace GPT2 training examples for Trn1 under torch-neuronx. 37 | * Added 7 Pytorch training examples for Trn1 under torch-neuronx. 38 | 39 | ## October, 10th 2022 40 | 41 | * Added 20 Pytorch inference examples for Inf1 under torch-neuron. 42 | * Added 1 TensorFlow inference example for Inf1 under tensorflow-neuron.
43 | * Added 2 Pytorch inference examples for Inf1 under torch-neuronx. 44 | 45 | # Known Issues 46 | 47 | * NA 48 | 49 | -------------------------------------------------------------------------------- /tensorflow-neuron/README.md: -------------------------------------------------------------------------------- 1 | # TensorFlow Neuron (tensorflow-neuron) Samples for AWS Inf1 2 | 3 | This directory contains Jupyter notebooks that demonstrate model compilation and inference using TensorFlow Neuron for a variety of popular deep learning models. These samples can be run on [AWS Inferentia](https://aws.amazon.com/machine-learning/inferentia/) (inf1 instances) using [Amazon SageMaker](https://aws.amazon.com/sagemaker) or [Amazon EC2](https://aws.amazon.com/ec2/). 4 | 5 | For each sample you will also find additional information such as the model type, configuration used to compile the model, framework version, and a link to the original model implementation. 6 | 7 | The following samples are available: 8 | 9 | |Model Name |Model Type |Input Shape |NeuronSDK Version |Framework / Version |Original Implementation | 10 | |--- |--- |--- |--- |--- |--- | 11 | |[U-Net](inference/unet) |CV - Semantic Segmentation |1,3,224,224 |2.5.2.2.1.14.0 |Tensorflow 2.5.2 |[link](https://github.com/jakeret/unet)| 12 | 13 | 14 | ### Configuring the environment 15 | 16 | In order to run the samples, you first need to [set up a TensorFlow Neuron development environment](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-intro/get-started.html). 17 | 18 | -------------------------------------------------------------------------------- /torch-neuron/README.md: -------------------------------------------------------------------------------- 1 | # PyTorch Neuron (torch-neuron) Samples for AWS Inf1 2 | 3 | This directory contains Jupyter notebooks that demonstrate model compilation and inference using PyTorch Neuron for a variety of popular deep learning models. These samples can be run on [AWS Inferentia](https://aws.amazon.com/machine-learning/inferentia/) (inf1 instances) using [Amazon SageMaker](https://aws.amazon.com/sagemaker) or [Amazon EC2](https://aws.amazon.com/ec2/). 4 | 5 | For each sample you will also find additional information such as the model type, configuration used to compile the model, framework version, and a link to the original model implementation. 
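At a high level (an illustrative sketch, not code from any one notebook here), these samples follow the same compile-then-run pattern: trace the model with `torch.neuron.trace`, save the compiled artifact, and reload it for inference. The model, input shape, and file name below are placeholders.

```python
# Illustrative torch-neuron flow for Inf1: trace a torchvision ResNet-50
# with an example input, save it, and run inference with the traced model.
import torch
import torch_neuron  # registers the torch.neuron.trace API
from torchvision import models

model = models.resnet50(pretrained=True).eval()
example = torch.rand(1, 3, 224, 224)  # matches the ResNet input shape in the table below

# Compile: supported operators are partitioned onto the NeuronCore.
model_neuron = torch.neuron.trace(model, example_inputs=[example])
model_neuron.save("resnet50_neuron.pt")

# Inference: the saved artifact loads like any TorchScript module.
output = torch.jit.load("resnet50_neuron.pt")(example)
```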
6 | 7 | The following samples are available: 8 | 9 | |Model Name |Model Type |Input Shape |NeuronSDK Version |Framework / Version |Original Implementation | 10 | |--- |--- |--- |--- |--- |--- | 11 | |[BERT-base](inference/bertbasecased) |NLP |max_length=128 |1.10.1.2.2.0.0 |Pytorch 1.10.2 |[link](https://huggingface.co/bert-base-cased)| 12 | |[BERT-large](inference/bertlargeuncased) |NLP |max_length=128 |1.10.1.2.2.0.0 |Pytorch 1.10.2 |[link](https://huggingface.co/bert-large-uncased)| 13 | |[CRAFT](inference/craft) |CV - Text Detection |1,3,800,800 - max_length=32|1.10.2.2.3.0.0 |Pytorch 1.10.2 |[link](https://github.com/clovaai/CRAFT-pytorch)| 14 | |[EfficientNet](inference/efficientnet) |CV - Image Classification |1,3,224,224 |1.10.1.2.2.0.0 |Pytorch 1.10.1 |[link](https://pytorch.org/vision/stable/models/efficientnet.html)| 15 | |[GFL](inference/gfl_mmdet) |CV - Object Detection |1,3,800,1216 |1.10.2.2.3.0.0 |Pytorch 1.10.2 |[link](https://github.com/open-mmlab/mmdetection/blob/master/configs/gfl/README.md)| 16 | |[HRNet](inference/hrnet) |CV - Pose Estimation |1,3,384,288 |1.10.2.2.3.0.0 |Pytorch 1.10.2 |[link](https://github.com/leoxiaobin/deep-high-resolution-net.pytorch.git)| 17 | |[MarianMT](inference/marianmt) |NLP |max_length=32 |1.7.\*|Pytorch 1.7|[link](https://huggingface.co/Helsinki-NLP/opus-mt-en-de)| 18 | |[R-CNN](inference/rcnn) |CV - Image Classification, Detection, and Segmentation |1,3,800,800 |1.11.0.2.5.2.0 |Pytorch 1.11.0 |[link](https://github.com/facebookresearch/detectron2)| 19 | |[ResNet (18,34,50,101,152)](inference/resnet)|CV - Image Classification |1,3,224,224 |1.10.1.2.2.0.0 |Pytorch 1.10.1 |[link](https://pytorch.org/vision/stable/models/resnet.html)| 20 | |[ResNeXt](inference/resnext) |CV - Image Classification |1,3,224,224 |1.10.1.2.2.0.0 |Pytorch 1.10.1 |[link](https://pytorch.org/vision/stable/models/resnext.html)| 21 | |[Roberta-base](inference/robertabase) |NLP |max_length=128|1.10.1.2.2.0.0 |Pytorch 1.10.2|[link](https://huggingface.co/roberta-base)| 22 | |[SSD (SSD300-VGG16)](inference/ssd) |CV - Object detection |1,3,300,300 |1.10.2.2.3.0.0 |Pytorch 1.10.2 |[link](https://pytorch.org/vision/stable/models/ssd.html)| 23 | |[TrOCR](inference/trocr) |CV - OCR |1,3,384,384 |1.10.2.2.3.0.0 |Pytorch 1.10.2 |[link](https://huggingface.co/docs/transformers/en/model_doc/trocr)| 24 | |[VGG16](inference/vgg) |CV - Image Classification |1,3,224,224 |1.10.1.2.2.0.0 |Pytorch 1.10.1 |[link](https://pytorch.org/vision/stable/models/vgg.html)| 25 | |[ViT](inference/vit) |CV - Image Classification |1,3,224,224 |1.10.2.2.3.0.0 |Pytorch 1.10.2 |[link](https://huggingface.co/docs/transformers/model_doc/vit)| 26 | |[YOLOv5](inference/yolov5) |CV - Object Detection |1,3,640,640 |1.10.1.2.2.0.0 |Pytorch 1.10.1 |[link](https://github.com/ultralytics/yolov5/releases/tag/v5.0)| 27 | |[YOLOv6](inference/yolov6) |CV - Object Detection |1,3,640,640 |1.11.0.2.3.0.0 |Pytorch 1.11.0 |[link](https://github.com/meituan/YOLOv6.git)| 28 | |[YOLOv7](inference/yolov7) |CV - Object Detection+Pose Estimation |1,3,960,960 |1.10.1.2.2.0.0 |Pytorch 1.10.1 |[link](https://github.com/WongKinYiu/yolov7)| 29 | |[YOLOF](inference/yolof_detectron2) |CV - Object Detection |1,3,300,300 |1.10.1.2.2.0.0 |Pytorch 1.10.1 |[link](https://github.com/chensnathan/YOLOF)| 30 | |[Fairseq](inference/fairseq) |NLP|max_length=32|1.10.1.*|Pytorch 1.10.1 |[link](https://github.com/facebookresearch/fairseq)| 31 | 32 | ### Configuring the environment 33 | 34 | In order to run the samples, you first need to [set up a
-------------------------------------------------------------------------------- /torch-neuronx/inference/customop_mlp/README.md: -------------------------------------------------------------------------------- 1 | # Torch Neuron CustomOp MLP 2 | 3 | This folder contains inference examples of Torch custom operators for a multi-layer perceptron (MLP) model. 4 | 5 | - The `neuron` folder contains an MLP model with ReLU implemented as a CustomOp using the element-wise accessor. 6 | - The `neuron-tcm` folder contains the same model, but ReLU is implemented using the TCM accessor. 7 | - The `neuron-multicore` folder contains the same model, but ReLU is implemented using the TCM accessor and the multicore capability. --------------------------------------------------------------------------------
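Each variant follows the same two-step flow: running `build.py` once compiles `relu.cpp`/`shape.cpp` into `librelu.so`, and `inference.py` then loads the op through `my_ops.py`. A minimal sketch of that flow, assuming the folder layout above and an instance with an attached NeuronCore (illustrative only, not an additional sample):

```python
# Sketch: build the CustomOp once, then call it on an XLA (Neuron) device.
import subprocess

# Step 1: compile relu.cpp/shape.cpp into librelu.so in the current directory.
subprocess.run(["python", "build.py"], check=True)

# Step 2: my_ops.py loads librelu.so and wraps the op in an autograd.Function.
import torch
import my_ops

x = torch.randn(4, 28 * 28).to('xla')  # move input to a NeuronCore
y = my_ops.Relu.apply(x)               # dispatches to torch.ops.my_ops.relu_forward
print(y.cpu())
```

--------------------------------------------------------------------------------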
/torch-neuronx/inference/customop_mlp/neuron-multicore/build.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch_neuronx 3 | from torch_neuronx.xla_impl import custom_op 4 | 5 | custom_op.load( 6 | name='relu', 7 | compute_srcs=['relu.cpp'], 8 | shape_srcs=['shape.cpp'], 9 | build_directory=os.getcwd(), 10 | multicore=True, 11 | verbose=True 12 | ) 13 | -------------------------------------------------------------------------------- /torch-neuronx/inference/customop_mlp/neuron-multicore/inference.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import torch 4 | from model import MLP 5 | 6 | from torchvision.datasets import mnist 7 | from torch.utils.data import DataLoader 8 | from torchvision.transforms import ToTensor 9 | 10 | # XLA imports 11 | import torch_xla.core.xla_model as xm 12 | 13 | # Global constants 14 | EPOCHS = 4 15 | WARMUP_STEPS = 2 16 | BATCH_SIZE = 32 17 | 18 | # Load MNIST inference dataset 19 | inf_dataset = mnist.MNIST(root='./MNIST_DATA_inf', 20 | train=False, download=True, transform=ToTensor()) 21 | 22 | def main(): 23 | # Prepare data loader 24 | inf_loader = DataLoader(inf_dataset, batch_size=BATCH_SIZE) 25 | 26 | # Fix the random number generator seeds for reproducibility 27 | torch.manual_seed(0) 28 | 29 | # XLA: Specify XLA device (defaults to a NeuronCore on Trn1 instance) 30 | device = 'xla' 31 | 32 | # Initialize with random weights and move model to device 33 | model = MLP() 34 | torch.nn.init.xavier_normal_(model.fc1.weight) 35 | torch.nn.init.xavier_normal_(model.fc2.weight) 36 | torch.nn.init.xavier_normal_(model.fc3.weight) 37 | model = model.to(device) 38 | 39 | # Run the inference loop 40 | print('---------- Inference ---------------') 41 | model.eval() 42 | for _ in range(EPOCHS): 43 | start = time.time() 44 | for idx, (inf_x, _) in enumerate(inf_loader): 45 | inf_x = inf_x.view(inf_x.size(0), -1) 46 | inf_x = inf_x.to(device) 47 | output = model(inf_x) 48 | xm.mark_step() # XLA: collect ops and run them in XLA runtime 49 | if idx < WARMUP_STEPS: # skip warmup iterations 50 | start = time.time() 51 | # Compute statistics for this epoch 52 | interval = idx - WARMUP_STEPS # skip warmup iterations 53 | throughput = interval / (time.time() - start) 54 | print("Inf throughput (iter/sec): {}".format(throughput)) 55 | 56 | print('----------End Inference ---------------') 57 | 58 | if __name__ == '__main__': 59 | main() 60 | 61 | -------------------------------------------------------------------------------- /torch-neuronx/inference/customop_mlp/neuron-multicore/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.nn import functional as F 4 | import my_ops 5 | 6 | # Declare 3-layer MLP for MNIST dataset 7 | class MLP(nn.Module): 8 | def __init__(self, input_size = 28 * 28, output_size = 10, layers = [4096, 2048]): 9 | super(MLP, self).__init__() 10 | self.fc1 = nn.Linear(input_size, layers[0]) 11 | self.fc2 = nn.Linear(layers[0], layers[1]) 12 | self.fc3 = nn.Linear(layers[1], output_size) 13 | 14 | def forward(self, x): 15 | f1 = self.fc1(x) 16 | r1 = my_ops.Relu.apply(f1) 17 | f2 = self.fc2(r1) 18 | r2 = my_ops.Relu.apply(f2) 19 | f3 = self.fc3(r2) 20 | return torch.log_softmax(f3, dim=1) 21 | -------------------------------------------------------------------------------- /torch-neuronx/inference/customop_mlp/neuron-multicore/my_ops.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch_neuronx 3 | from torch_neuronx.xla_impl import custom_op 4 | 5 | custom_op.load_library('librelu.so') 6 | 7 | class Relu(torch.autograd.Function): 8 | @staticmethod 9 | def forward(ctx, input): 10 | ctx.save_for_backward(input) 11 | return torch.ops.my_ops.relu_forward(input) 12 | -------------------------------------------------------------------------------- /torch-neuronx/inference/customop_mlp/neuron-multicore/relu.cpp: -------------------------------------------------------------------------------- 1 | #include <stdint.h> 2 | #include <stdlib.h> 3 | #include <torch/torch.h> 4 | #include <neuron/neuron-utils.hpp> 5 | 6 | torch::Tensor relu_forward(const torch::Tensor& t_in) { 7 | size_t num_elem = t_in.numel(); 8 | torch::Tensor t_out = get_dst_tensor(); 9 | 10 | uint32_t cpu_id = get_cpu_id(); 11 | uint32_t cpu_count = get_cpu_count(); 12 | uint32_t partition = num_elem / cpu_count; 13 | size_t start_offset = (size_t)partition * cpu_id; // compute this core's offset before the last core's partition is enlarged 14 | if (cpu_id == cpu_count - 1) { 15 | partition = num_elem - partition * (cpu_count - 1); 16 | } 17 | 18 | static constexpr size_t buffer_size = 1024; 19 | float *tcm_buffer = (float*)torch::neuron::tcm_malloc(sizeof(float) * buffer_size); 20 | 21 | if (tcm_buffer != nullptr) { 22 | auto t_in_tcm_acc = t_in.tcm_accessor(); 23 | auto t_out_tcm_acc = t_out.tcm_accessor(); 24 | 25 | for (size_t i = 0; i < partition; i += buffer_size) { 26 | size_t remaining_elem = partition - i; 27 | size_t copy_size = (remaining_elem > buffer_size) ? buffer_size : remaining_elem; 28 | 29 | t_in_tcm_acc.tensor_to_tcm(tcm_buffer, start_offset + i, copy_size); 30 | for (size_t j = 0; j < copy_size; j++) { 31 | tcm_buffer[j] = tcm_buffer[j] > 0.0 ? tcm_buffer[j] : 0.0; 32 | } 33 | t_out_tcm_acc.tcm_to_tensor(tcm_buffer, start_offset + i, copy_size); 34 | } 35 | } 36 | torch::neuron::tcm_free(tcm_buffer); 37 | return t_out; 38 | } 39 | -------------------------------------------------------------------------------- /torch-neuronx/inference/customop_mlp/neuron-multicore/shape.cpp: -------------------------------------------------------------------------------- 1 | #include <stdint.h> 2 | #include <stdlib.h> 3 | #include <torch/torch.h> 4 | #include "torchneuron/register.h" 5 | 6 | torch::Tensor relu_fwd_shape(torch::Tensor t_in) { 7 | torch::Tensor t_out = torch::zeros(t_in.sizes(), torch::kFloat); 8 | return t_out; 9 | } 10 | 11 | NEURON_LIBRARY(my_ops, m) { 12 | m.def("relu_forward", &relu_fwd_shape, "relu_forward"); 13 | } 14 | --------------------------------------------------------------------------------
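In the multicore variant, `relu_forward` runs on every core: `get_cpu_id()`/`get_cpu_count()` select each core's slice of the flattened tensor, and the last core absorbs the remainder. The slicing arithmetic, checked in plain Python with hypothetical sizes:

```python
# Sketch: per-core partitioning used by the multicore relu.cpp (illustrative sizes).
num_elem, cpu_count = 32 * 4096 + 5, 8          # deliberately not divisible
base = num_elem // cpu_count                    # slice size for cores 0..N-2
for cpu_id in range(cpu_count):
    start = base * cpu_id                       # each core's fixed starting offset
    size = base if cpu_id < cpu_count - 1 else num_elem - base * (cpu_count - 1)
    assert start + size <= num_elem
print(base, num_elem - base * (cpu_count - 1))  # 16384 elements/core, 16389 on the last
```

--------------------------------------------------------------------------------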
/torch-neuronx/inference/customop_mlp/neuron-tcm/build.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch_neuronx 3 | from torch_neuronx.xla_impl import custom_op 4 | 5 | custom_op.load( 6 | name='relu', 7 | compute_srcs=['relu.cpp'], 8 | shape_srcs=['shape.cpp'], 9 | build_directory=os.getcwd(), 10 | verbose=True 11 | ) 12 | -------------------------------------------------------------------------------- /torch-neuronx/inference/customop_mlp/neuron-tcm/inference.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import torch 4 | from model import MLP 5 | 6 | from torchvision.datasets import mnist 7 | from torch.utils.data import DataLoader 8 | from torchvision.transforms import ToTensor 9 | 10 | # XLA imports 11 | import torch_xla.core.xla_model as xm 12 | 13 | # Global constants 14 | EPOCHS = 4 15 | WARMUP_STEPS = 2 16 | BATCH_SIZE = 32 17 | 18 | # Load MNIST inference dataset 19 | inf_dataset = mnist.MNIST(root='./MNIST_DATA_inf', 20 | train=False, download=True, transform=ToTensor()) 21 | 22 | def main(): 23 | # Prepare data loader 24 | inf_loader = DataLoader(inf_dataset, batch_size=BATCH_SIZE) 25 | 26 | # Fix the random number generator seeds for reproducibility 27 | torch.manual_seed(0) 28 | 29 | # XLA: Specify XLA device (defaults to a NeuronCore on Trn1 instance) 30 | device = 'xla' 31 | 32 | # Initialize with random weights and move model to device 33 | model = MLP() 34 | torch.nn.init.xavier_normal_(model.fc1.weight) 35 | torch.nn.init.xavier_normal_(model.fc2.weight) 36 | torch.nn.init.xavier_normal_(model.fc3.weight) 37 | model = model.to(device) 38 | 39 | # Run the inference loop 40 | print('---------- Inference ---------------') 41 | model.eval() 42 | for _ in range(EPOCHS): 43 | start = time.time() 44 | for idx, (inf_x, _) in enumerate(inf_loader): 45 | inf_x = inf_x.view(inf_x.size(0), -1) 46 | inf_x = inf_x.to(device) 47 | output = model(inf_x) 48 | xm.mark_step() # XLA: collect ops and run them in XLA runtime 49 | if idx < WARMUP_STEPS: # skip warmup iterations 50 | start = time.time() 51 | # Compute statistics for this epoch 52 | interval = idx - WARMUP_STEPS # skip warmup iterations 53 | throughput = interval / (time.time() - start) 54 | print("Inf throughput (iter/sec): {}".format(throughput)) 55 | 56 | print('----------End Inference ---------------') 57 | 58 | if __name__ == '__main__': 59 | main() 60 | 61 | -------------------------------------------------------------------------------- /torch-neuronx/inference/customop_mlp/neuron-tcm/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.nn import functional as F 4 | import my_ops 5 | 6 | # Declare 3-layer MLP for MNIST dataset 7 | class MLP(nn.Module): 8 | def __init__(self, input_size = 28 * 28, output_size = 10, layers = [4096, 2048]): 9 | super(MLP, self).__init__() 10 | self.fc1 = nn.Linear(input_size, layers[0]) 11 | self.fc2 = nn.Linear(layers[0], layers[1]) 12 | self.fc3 = nn.Linear(layers[1], output_size) 13 | 14 | def forward(self, x): 15 | f1 = self.fc1(x) 16 | r1 = my_ops.Relu.apply(f1) 17 | f2 = self.fc2(r1) 18 | r2 = my_ops.Relu.apply(f2) 19 | f3 = self.fc3(r2) 20 | return torch.log_softmax(f3, dim=1) 21 | -------------------------------------------------------------------------------- /torch-neuronx/inference/customop_mlp/neuron-tcm/my_ops.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch_neuronx 3 | from torch_neuronx.xla_impl import custom_op 4 | 5 | custom_op.load_library('librelu.so') 6 | 7 | class Relu(torch.autograd.Function): 8 | @staticmethod 9 | def forward(ctx, input): 10 | ctx.save_for_backward(input) 11 | return torch.ops.my_ops.relu_forward(input) 12 | -------------------------------------------------------------------------------- /torch-neuronx/inference/customop_mlp/neuron-tcm/relu.cpp: -------------------------------------------------------------------------------- 1 | #include <stdint.h> 2 | #include <stdlib.h> 3 | #include <torch/torch.h> 4 | #include <neuron/neuron-utils.hpp> 5 | 6 | torch::Tensor relu_forward(const torch::Tensor& t_in) { 7 | size_t num_elem = t_in.numel(); 8 | torch::Tensor t_out = torch::zeros(t_in.sizes(), torch::kFloat); 9 | 10 | static constexpr size_t buffer_size = 1024; 11 | float *tcm_buffer = (float*)torch::neuron::tcm_malloc(sizeof(float) * buffer_size); 12 | 13 | if (tcm_buffer != nullptr) { 14 | auto t_in_tcm_acc = t_in.tcm_accessor(); 15 | auto t_out_tcm_acc = t_out.tcm_accessor(); 16 | 17 | for (size_t i = 0; i < num_elem; i += buffer_size) { 18 | size_t remaining_elem = num_elem - i; 19 | size_t copy_size = (remaining_elem > buffer_size) ? buffer_size : remaining_elem; 20 | 21 | t_in_tcm_acc.tensor_to_tcm(tcm_buffer, i, copy_size); 22 | for (size_t j = 0; j < copy_size; j++) { 23 | tcm_buffer[j] = tcm_buffer[j] > 0.0 ? tcm_buffer[j] : 0.0; 24 | } 25 | t_out_tcm_acc.tcm_to_tensor(tcm_buffer, i, copy_size); 26 | } 27 | } 28 | torch::neuron::tcm_free(tcm_buffer); 29 | return t_out; 30 | } 31 | -------------------------------------------------------------------------------- /torch-neuronx/inference/customop_mlp/neuron-tcm/shape.cpp: -------------------------------------------------------------------------------- 1 | #include <stdint.h> 2 | #include <stdlib.h> 3 | #include <torch/torch.h> 4 | #include "torchneuron/register.h" 5 | 6 | torch::Tensor relu_fwd_shape(torch::Tensor t_in) { 7 | torch::Tensor t_out = torch::zeros(t_in.sizes(), torch::kFloat); 8 | return t_out; 9 | } 10 | 11 | NEURON_LIBRARY(my_ops, m) { 12 | m.def("relu_forward", &relu_fwd_shape, "relu_forward"); 13 | } 14 | --------------------------------------------------------------------------------
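Both TCM variants stream the tensor through a fixed 1024-float scratch buffer: copy a chunk into TCM, apply ReLU in place, copy it back. The chunking logic, sketched in plain Python with a NumPy stand-in for the TCM round-trip (the buffer size is taken from relu.cpp; everything else is illustrative):

```python
# Sketch: the tensor->TCM->tensor chunk loop from relu.cpp, in NumPy.
import numpy as np

buffer_size = 1024                        # floats per TCM round-trip, as in relu.cpp
x = np.random.randn(3000).astype(np.float32)
out = np.empty_like(x)
for i in range(0, x.size, buffer_size):
    chunk = x[i:i + buffer_size].copy()   # tensor_to_tcm
    np.maximum(chunk, 0.0, out=chunk)     # in-place ReLU on the TCM buffer
    out[i:i + buffer_size] = chunk        # tcm_to_tensor
assert (out == np.maximum(x, 0.0)).all()
```

--------------------------------------------------------------------------------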
/torch-neuronx/inference/customop_mlp/neuron/build.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch_neuronx 3 | from torch_neuronx.xla_impl import custom_op 4 | 5 | custom_op.load( 6 | name='relu', 7 | compute_srcs=['relu.cpp'], 8 | shape_srcs=['shape.cpp'], 9 | build_directory=os.getcwd() 10 | ) 11 | -------------------------------------------------------------------------------- /torch-neuronx/inference/customop_mlp/neuron/inference.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import torch 4 | from model import MLP 5 | 6 | from torchvision.datasets import mnist 7 | from torch.utils.data import DataLoader 8 | from torchvision.transforms import ToTensor 9 | 10 | # XLA imports 11 | import torch_xla.core.xla_model as xm 12 | 13 | # Global constants 14 | EPOCHS = 4 15 | WARMUP_STEPS = 2 16 | BATCH_SIZE = 32 17 | 18 | # Load MNIST inference dataset 19 | inf_dataset = mnist.MNIST(root='./MNIST_DATA_inf', 20 | train=False, download=True, transform=ToTensor()) 21 | 22 | def main(): 23 | # Prepare data loader 24 | inf_loader = DataLoader(inf_dataset, batch_size=BATCH_SIZE) 25 | 26 | # Fix the random number generator seeds for reproducibility 27 | torch.manual_seed(0) 28 | 29 | # XLA: Specify XLA device (defaults to a NeuronCore on Trn1 instance) 30 | device = 'xla' 31 | 32 | # Initialize with random weights and move model to device 33 | model = MLP() 34 | torch.nn.init.xavier_normal_(model.fc1.weight) 35 | torch.nn.init.xavier_normal_(model.fc2.weight) 36 | torch.nn.init.xavier_normal_(model.fc3.weight) 37 | model = model.to(device) 38 | 39 | # Run the inference loop 40 | print('---------- Inference ---------------') 41 | model.eval() 42 | for _ in range(EPOCHS): 43 | start = time.time() 44 | for idx, (inf_x, _) in enumerate(inf_loader): 45 | inf_x = inf_x.view(inf_x.size(0), -1) 46 | inf_x = inf_x.to(device) 47 | output = model(inf_x) 48 | xm.mark_step() # XLA: collect ops and run them in XLA runtime 49 | if idx < WARMUP_STEPS: # skip warmup iterations 50 | start = time.time() 51 | # Compute statistics for this epoch 52 | interval = idx - WARMUP_STEPS # skip warmup iterations 53 | throughput = interval / (time.time() - start) 54 | print("Inf throughput (iter/sec): {}".format(throughput)) 55 | 56 | print('----------End Inference ---------------') 57 | 58 | if __name__ == '__main__': 59 | main() 60 | 61 | -------------------------------------------------------------------------------- /torch-neuronx/inference/customop_mlp/neuron/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.nn import functional as F 4 | import my_ops 5 | 6 | # Declare 3-layer MLP for MNIST dataset 7 | class MLP(nn.Module): 8 | def __init__(self, input_size = 28 * 28, output_size = 10, layers = [4096, 2048]): 9 | super(MLP, self).__init__() 10 | self.fc1 = nn.Linear(input_size, layers[0]) 11 | self.fc2 = nn.Linear(layers[0], layers[1]) 12 | self.fc3 = nn.Linear(layers[1], output_size) 13 | 14 | def forward(self, x): 15 | f1 = self.fc1(x) 16 | r1 = my_ops.Relu.apply(f1) 17 | f2 = self.fc2(r1) 18 | r2 = my_ops.Relu.apply(f2) 19 | f3 = self.fc3(r2) 20 | return torch.log_softmax(f3, dim=1) 21 | -------------------------------------------------------------------------------- /torch-neuronx/inference/customop_mlp/neuron/my_ops.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch_neuronx 3 | from torch_neuronx.xla_impl import custom_op 4 | 5 | custom_op.load_library('librelu.so') 6 | 7 | class Relu(torch.autograd.Function): 8 | @staticmethod 9 | def forward(ctx, input): 10 | ctx.save_for_backward(input) 11 | return torch.ops.my_ops.relu_forward(input) 12 | 13 | @staticmethod 14 | def backward(ctx, grad): 15 | input, = ctx.saved_tensors 16 | return torch.ops.my_ops.relu_backward(grad, input) # one gradient per forward input 17 | 18 | 19 | --------------------------------------------------------------------------------
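Because `my_ops.Relu` is a standard `torch.autograd.Function`, its forward/backward contract can be prototyped entirely on CPU before involving the Neuron build. A pure-PyTorch stand-in (illustrative; the real op dispatches to `torch.ops.my_ops.*`), verified with `gradcheck`:

```python
# Sketch: CPU stand-in for the CustomOp, verified with gradcheck.
import torch

class ReluCPU(torch.autograd.Function):
    @staticmethod
    def forward(ctx, input):
        ctx.save_for_backward(input)
        return input.clamp(min=0.0)   # mirrors relu_forward

    @staticmethod
    def backward(ctx, grad):
        input, = ctx.saved_tensors
        return grad * (input > 0)     # mirrors relu_backward

x = torch.randn(4, 6, dtype=torch.double, requires_grad=True)
assert torch.autograd.gradcheck(ReluCPU.apply, (x,))
```

--------------------------------------------------------------------------------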
/torch-neuronx/inference/customop_mlp/neuron/relu.cpp: -------------------------------------------------------------------------------- 1 | #include <stdint.h> 2 | #include <stdlib.h> 3 | #include <torch/torch.h> 4 | 5 | torch::Tensor relu_forward(const torch::Tensor& t_in) { 6 | torch::Tensor t_out = torch::zeros(t_in.sizes(), torch::kFloat); 7 | auto t_in_acc = t_in.accessor<float, 2>(); 8 | auto t_out_acc = t_out.accessor<float, 2>(); 9 | auto shape = t_in.sizes(); 10 | for (int i = 0; i < shape[0]; i++) { 11 | for (int j = 0; j < shape[1]; j++) { 12 | t_out_acc[i][j] = t_in_acc[i][j] > 0.0 ? t_in_acc[i][j] : 0.0; 13 | } 14 | } 15 | return t_out; 16 | } 17 | 18 | torch::Tensor relu_backward(const torch::Tensor& t_grad, const torch::Tensor& t_in) { 19 | torch::Tensor t_out = torch::zeros(t_in.sizes(), torch::kFloat); 20 | auto t_in_acc = t_in.accessor<float, 2>(); 21 | auto t_grad_acc = t_grad.accessor<float, 2>(); 22 | auto t_out_acc = t_out.accessor<float, 2>(); 23 | auto shape = t_in.sizes(); 24 | for (int i = 0; i < shape[0]; i++) { 25 | for (int j = 0; j < shape[1]; j++) { 26 | t_out_acc[i][j] = t_in_acc[i][j] > 0.0 ? t_grad_acc[i][j] : 0.0; 27 | } 28 | } 29 | return t_out; 30 | } 31 | -------------------------------------------------------------------------------- /torch-neuronx/inference/customop_mlp/neuron/shape.cpp: -------------------------------------------------------------------------------- 1 | #include <stdint.h> 2 | #include <stdlib.h> 3 | #include <torch/torch.h> 4 | #include "torchneuron/register.h" 5 | 6 | torch::Tensor relu_fwd_shape(torch::Tensor t_in) { 7 | torch::Tensor t_out = torch::zeros(t_in.sizes(), torch::kFloat); 8 | return t_out; 9 | } 10 | 11 | torch::Tensor relu_bwd_shape(torch::Tensor t_grad, torch::Tensor t_in) { 12 | torch::Tensor t_out = torch::zeros(t_in.sizes(), torch::kFloat); 13 | return t_out; 14 | } 15 | 16 | NEURON_LIBRARY(my_ops, m) { 17 | m.def("relu_forward", &relu_fwd_shape, "relu_forward"); 18 | m.def("relu_backward", &relu_bwd_shape, "relu_backward"); 19 | } 20 | --------------------------------------------------------------------------------
/torch-neuronx/inference/hf_pretrained_pixart_sigma_1k/compile_latency_optimized.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export PYTHONPATH=$(pwd):$PYTHONPATH 4 | 5 | echo "compiling text encoder" 6 | python neuron_pixart_sigma/compile_text_encoder.py \ 7 | --compiled_models_dir "compile_workdir_latency_optimized" 8 | 9 | echo "compiling transformer" 10 | python neuron_pixart_sigma/compile_transformer_latency_optimized.py \ 11 | --compiled_models_dir "compile_workdir_latency_optimized" 12 | 13 | echo "compiling decoder" 14 | python neuron_pixart_sigma/compile_decoder.py \ 15 | --compiled_models_dir "compile_workdir_latency_optimized" -------------------------------------------------------------------------------- /torch-neuronx/inference/hf_pretrained_pixart_sigma_1k/compile_throughput_optimized.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export PYTHONPATH=$(pwd):$PYTHONPATH 4 | 5 | echo "compiling text encoder" 6 | python neuron_pixart_sigma/compile_text_encoder.py \ 7 | --compiled_models_dir "compile_workdir_throughput_optimized" 8 | 9 | echo "compiling transformer" 10 | python neuron_pixart_sigma/compile_transformer_throughput_optimized.py \ 11 | --compiled_models_dir "compile_workdir_throughput_optimized" 12 | 13 | echo "compiling decoder" 14 | python neuron_pixart_sigma/compile_decoder.py \ 15 | --compiled_models_dir "compile_workdir_throughput_optimized" -------------------------------------------------------------------------------- /torch-neuronx/inference/hf_pretrained_pixart_sigma_1k/neuron_pixart_sigma/cache_hf_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from diffusers import PixArtSigmaPipeline 3 | 4 | pipe: PixArtSigmaPipeline = PixArtSigmaPipeline.from_pretrained( 5 | "PixArt-alpha/PixArt-Sigma-XL-2-1024-MS", 6 | torch_dtype=torch.bfloat16, 7 | cache_dir="pixart_sigma_hf_cache_dir_1024") --------------------------------------------------------------------------------
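The compile scripts produce three sets of artifacts: a traced decoder and post_quant_conv saved with `torch.jit.save`, and tensor-parallel text encoder and transformer saved with `neuronx_distributed.trace.parallel_model_save`. A minimal loading sketch, assuming the default `compiled_models` directory and that `parallel_model_load` is available as the counterpart of `parallel_model_save` (the notebooks in this folder show the full pipeline wiring):

```python
# Sketch: reloading the compiled PixArt pieces; paths match the defaults above.
import torch
import neuronx_distributed

base = "compiled_models"
decoder = torch.jit.load(f"{base}/decoder/model.pt")
post_quant_conv = torch.jit.load(f"{base}/post_quant_conv/model.pt")
# Tensor-parallel artifacts saved with parallel_model_save load as a unit:
text_encoder = neuronx_distributed.trace.parallel_model_load(f"{base}/text_encoder")
transformer = neuronx_distributed.trace.parallel_model_load(f"{base}/transformer")
```

--------------------------------------------------------------------------------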
/torch-neuronx/inference/hf_pretrained_pixart_sigma_1k/neuron_pixart_sigma/compile_decoder.py: --------------------------------------------------------------------------------
1 | import os 2 | os.environ["NEURON_FUSE_SOFTMAX"] = "1" 3 | os.environ["NEURON_CUSTOM_SILU"] = "1" 4 | os.environ["NEURON_RT_VIRTUAL_CORE_SIZE"] = "2" # Comment this line out if using trn1/inf2 5 | os.environ["NEURON_LOGICAL_NC_CONFIG"] = "2" # Comment this line out if using trn1/inf2 6 | compiler_flags = """ --verbose=INFO --target=trn2 --lnc=2 --model-type=unet-inference --enable-fast-loading-neuron-binaries """ # Use these compiler flags for trn2 7 | # compiler_flags = """ --verbose=INFO --target=trn1 --model-type=unet-inference --enable-fast-loading-neuron-binaries """ # Use these compiler flags for trn1/inf2 8 | os.environ["NEURON_CC_FLAGS"] = os.environ.get("NEURON_CC_FLAGS", "") + compiler_flags 9 | 10 | from diffusers import PixArtSigmaPipeline 11 | import torch 12 | import argparse 13 | import torch_neuronx 14 | from diffusers.models.autoencoders.vae import Decoder 15 | from neuron_commons import attention_wrapper, f32Wrapper 16 | 17 | torch.nn.functional.scaled_dot_product_attention = attention_wrapper 18 | 19 | def upcast_norms_to_f32(decoder: Decoder): 20 | for upblock in decoder.up_blocks: 21 | for resnet in upblock.resnets: 22 | orig_resnet_norm1 = resnet.norm1 23 | orig_resnet_norm2 = resnet.norm2 24 | resnet.norm1 = f32Wrapper(orig_resnet_norm1) 25 | resnet.norm2 = f32Wrapper(orig_resnet_norm2) 26 | for attn in decoder.mid_block.attentions: 27 | orig_group_norm = attn.group_norm 28 | attn.group_norm = f32Wrapper(orig_group_norm) 29 | for resnet in decoder.mid_block.resnets: 30 | orig_resnet_norm1 = resnet.norm1 31 | orig_resnet_norm2 = resnet.norm2 32 | resnet.norm1 = f32Wrapper(orig_resnet_norm1) 33 | resnet.norm2 = f32Wrapper(orig_resnet_norm2) 34 | orig_conv_norm_out = decoder.conv_norm_out 35 | decoder.conv_norm_out = f32Wrapper(orig_conv_norm_out) 36 | 37 | def compile_decoder(args): 38 | latent_height = args.height//8 39 | latent_width = args.width//8 40 | compiler_workdir = args.compiler_workdir 41 | compiled_models_dir = args.compiled_models_dir 42 | 43 | batch_size = 1 44 | dtype = torch.bfloat16 45 | pipe: PixArtSigmaPipeline = PixArtSigmaPipeline.from_pretrained( 46 | "PixArt-alpha/PixArt-Sigma-XL-2-1024-MS", 47 | cache_dir="pixart_sigma_hf_cache_dir_1024", 48 | local_files_only=True, 49 | torch_dtype=dtype) 50 | 51 | decoder: Decoder = pipe.vae.decoder 52 | decoder.eval() 53 | upcast_norms_to_f32(decoder) 54 | 55 | with torch.no_grad(): 56 | sample_inputs = torch.rand((batch_size, 4, latent_height, latent_width), dtype=dtype) 57 | compiled_decoder = torch_neuronx.trace( 58 | decoder, 59 | sample_inputs, 60 | compiler_workdir=f"{compiler_workdir}/decoder", 61 | compiler_args=compiler_flags, 62 | inline_weights_to_neff=False) 63 | 64 | compiled_model_dir = f"{compiled_models_dir}/decoder" 65 | if not os.path.exists(compiled_model_dir): 66 | os.makedirs(compiled_model_dir) 67 | torch.jit.save(compiled_decoder, f"{compiled_model_dir}/model.pt") 68 | 69 | compiled_post_quant_conv = torch_neuronx.trace( 70 | pipe.vae.post_quant_conv, 71 | sample_inputs, 72 | compiler_workdir=f"{compiler_workdir}/post_quant_conv", 73 | compiler_args=compiler_flags, 74 | inline_weights_to_neff=False) 75 | 76 | compiled_model_dir = f"{compiled_models_dir}/post_quant_conv" 77 | if not os.path.exists(compiled_model_dir): 78 | os.makedirs(compiled_model_dir) 79 | torch.jit.save(compiled_post_quant_conv, f"{compiled_model_dir}/model.pt") 80 | 81 | if __name__ == "__main__": 82 | parser = argparse.ArgumentParser() 83 | parser.add_argument("--height", help="height of generated image.", type=int, default=1024) 84 | parser.add_argument("--width", help="width of generated image.", type=int, default=1024) 85 | parser.add_argument("--num_images_per_prompt", help="number of images per prompt.", type=int, default=1) 86 | parser.add_argument("--compiler_workdir", help="dir for compiler artifacts.",
type=str, default="compiler_workdir") 87 | parser.add_argument("--compiled_models_dir", help="dir for compiled artifacts.", type=str, default="compiled_models") 88 | args = parser.parse_args() 89 | compile_decoder(args) -------------------------------------------------------------------------------- /torch-neuronx/inference/hf_pretrained_pixart_sigma_1k/neuron_pixart_sigma/compile_text_encoder.py: -------------------------------------------------------------------------------- 1 | import os 2 | os.environ["NEURON_FUSE_SOFTMAX"] = "1" 3 | os.environ["NEURON_CUSTOM_SILU"] = "1" 4 | os.environ["NEURON_RT_VIRTUAL_CORE_SIZE"] = "2" # Comment this line out if using trn1/inf2 5 | os.environ["NEURON_LOGICAL_NC_CONFIG"] = "2" # Comment this line out if using trn1/inf2 6 | compiler_flags = """ --verbose=INFO --target=trn2 --lnc=2 --model-type=unet-inference --enable-fast-loading-neuron-binaries """ # Use these compiler flags for trn2 7 | # compiler_flags = """ --verbose=INFO --target=trn1 --model-type=unet-inference --enable-fast-loading-neuron-binaries """ # Use these compiler flags for trn1/inf2 8 | os.environ["NEURON_CC_FLAGS"] = os.environ.get("NEURON_CC_FLAGS", "") + compiler_flags 9 | 10 | from diffusers import PixArtSigmaPipeline 11 | import torch 12 | import argparse 13 | import torch_neuronx 14 | import neuronx_distributed 15 | from transformers.models.t5 import T5EncoderModel 16 | from torch import nn 17 | from functools import partial 18 | 19 | from transformers.models.t5.modeling_t5 import T5EncoderModel, T5Block, T5LayerSelfAttention, T5LayerFF 20 | 21 | from neuron_commons import attention_wrapper, f32Wrapper 22 | from neuron_parallel_utils import get_sharded_data, shard_t5_self_attention, shard_t5_ff 23 | 24 | torch.nn.functional.scaled_dot_product_attention = attention_wrapper 25 | 26 | 27 | class TracingT5WrapperTP(nn.Module): 28 | def __init__(self, t: T5EncoderModel, seqlen: int): 29 | super().__init__() 30 | self.t = t 31 | self.device = t.device 32 | precomputed_bias = self.t.encoder.block[0].layer[0].SelfAttention.compute_bias(seqlen, seqlen) 33 | precomputed_bias_tp = get_sharded_data(precomputed_bias, 1) 34 | self.t.encoder.block[0].layer[0].SelfAttention.compute_bias = lambda *args, **kwargs: precomputed_bias_tp 35 | 36 | def forward(self, text_input_ids, prompt_attention_mask): 37 | return self.t( 38 | text_input_ids, 39 | attention_mask=prompt_attention_mask 40 | ) 41 | 42 | def get_text_encoder(tp_degree: int, sequence_length: int): 43 | pipe: PixArtSigmaPipeline = PixArtSigmaPipeline.from_pretrained( 44 | "PixArt-alpha/PixArt-Sigma-XL-2-1024-MS", 45 | cache_dir="pixart_sigma_hf_cache_dir_1024", 46 | local_files_only=True, 47 | torch_dtype=torch.bfloat16) 48 | text_encoder: T5EncoderModel = pipe.text_encoder 49 | text_encoder.eval() 50 | for idx, block in enumerate(text_encoder.encoder.block): 51 | block: T5Block = block 52 | block.layer[1].DenseReluDense.act = torch.nn.GELU(approximate="tanh") 53 | selfAttention: T5LayerSelfAttention = block.layer[0].SelfAttention 54 | ff: T5LayerFF = block.layer[1] 55 | layer_norm_0 = block.layer[0].layer_norm.to(torch.float32) 56 | layer_norm_1 = block.layer[1].layer_norm.to(torch.float32) 57 | block.layer[1] = shard_t5_ff(ff) 58 | block.layer[0].SelfAttention = shard_t5_self_attention(tp_degree, selfAttention) 59 | block.layer[0].layer_norm = f32Wrapper(layer_norm_0) 60 | block.layer[1].layer_norm = f32Wrapper(layer_norm_1) 61 | final_layer_norm = pipe.text_encoder.encoder.final_layer_norm.to(torch.float32) 62 | 
pipe.text_encoder.encoder.final_layer_norm = f32Wrapper(final_layer_norm) 63 | return TracingT5WrapperTP(text_encoder, sequence_length), {} 64 | 65 | def compile_text_encoder(args): 66 | batch_size = 1 # batch_size = args.num_prompts 67 | sequence_length = args.max_sequence_length 68 | tp_degree = 4 # Use tensor parallel degree as 4 for trn2 69 | # tp_degree = 8 # Use tensor parallel degree as 8 for trn1/inf2 70 | os.environ["LOCAL_WORLD_SIZE"] = "4" 71 | get_text_encoder_f = partial(get_text_encoder, tp_degree, sequence_length) 72 | 73 | compiler_workdir = args.compiler_workdir 74 | compiled_models_dir = args.compiled_models_dir 75 | 76 | with torch.no_grad(): 77 | sample_inputs = torch.ones((batch_size, sequence_length), dtype=torch.int64), \ 78 | torch.ones((batch_size, sequence_length), dtype=torch.int64) 79 | compiled_text_encoder = neuronx_distributed.trace.parallel_model_trace( 80 | get_text_encoder_f, 81 | sample_inputs, 82 | compiler_workdir=f"{compiler_workdir}/text_encoder", 83 | compiler_args=compiler_flags, 84 | tp_degree=tp_degree, 85 | ) 86 | compiled_model_dir = f"{compiled_models_dir}/text_encoder" 87 | if not os.path.exists(compiled_model_dir): 88 | os.makedirs(compiled_model_dir) 89 | neuronx_distributed.trace.parallel_model_save( 90 | compiled_text_encoder, f"{compiled_model_dir}") 91 | 92 | if __name__ == "__main__": 93 | parser = argparse.ArgumentParser() 94 | parser.add_argument("--num_prompts", help="number of prompts", type=int, default=1) 95 | parser.add_argument("--max_sequence_length", help="max sequence length.", type=int, default=300) 96 | parser.add_argument("--compiler_workdir", help="dir for compiler artifacts.", type=str, default="compiler_workdir") 97 | parser.add_argument("--compiled_models_dir", help="dir for compiled artifacts.", type=str, default="compiled_models") 98 | args = parser.parse_args() 99 | compile_text_encoder(args) -------------------------------------------------------------------------------- /torch-neuronx/inference/hf_pretrained_pixart_sigma_1k/neuron_pixart_sigma/compile_transformer_latency_optimized.py: -------------------------------------------------------------------------------- 1 | import os 2 | os.environ["NEURON_FUSE_SOFTMAX"] = "1" 3 | os.environ["NEURON_CUSTOM_SILU"] = "1" 4 | os.environ["NEURON_RT_VIRTUAL_CORE_SIZE"] = "2" # Comment this line out if using trn1/inf2 5 | os.environ["NEURON_LOGICAL_NC_CONFIG"] = "2" # Comment this line out if using trn1/inf2 6 | compiler_flags = """ --verbose=INFO --target=trn2 --lnc=2 --internal-hlo2tensorizer-options='--fuse-dot-logistic=false' --model-type=unet-inference --enable-fast-loading-neuron-binaries """ # Use these compiler flags for trn2 7 | # compiler_flags = """ --verbose=INFO --target=trn1 --model-type=unet-inference --enable-fast-loading-neuron-binaries """ # Use these compiler flags for trn1/inf2 8 | os.environ["NEURON_CC_FLAGS"] = os.environ.get("NEURON_CC_FLAGS", "") + compiler_flags 9 | 10 | from diffusers import PixArtSigmaPipeline 11 | import torch 12 | import argparse 13 | import neuronx_distributed 14 | 15 | from torch import nn 16 | from functools import partial 17 | 18 | from neuron_commons import attention_wrapper_for_transformer 19 | from neuron_parallel_utils import shard_transformer_attn, shard_transformer_feedforward 20 | 21 | from diffusers.models.transformers.pixart_transformer_2d import PixArtTransformer2DModel 22 | torch.nn.functional.scaled_dot_product_attention = attention_wrapper_for_transformer 23 | 24 | class TracingTransformerWrapper(nn.Module): 25 | 
def __init__(self, transformer): 26 | super().__init__() 27 | self.transformer = transformer 28 | self.config = transformer.config 29 | self.dtype = transformer.dtype 30 | self.device = transformer.device 31 | 32 | def forward(self, hidden_states=None, encoder_hidden_states=None, timestep=None, encoder_attention_mask=None, **kwargs): 33 | return self.transformer( 34 | hidden_states=hidden_states, 35 | encoder_hidden_states=encoder_hidden_states, 36 | timestep=timestep, 37 | encoder_attention_mask=encoder_attention_mask, 38 | added_cond_kwargs={"resolution": None, "aspect_ratio": None}, 39 | return_dict=False) 40 | 41 | def get_transformer_model(tp_degree: int): 42 | pipe: PixArtSigmaPipeline = PixArtSigmaPipeline.from_pretrained( 43 | "PixArt-alpha/PixArt-Sigma-XL-2-1024-MS", 44 | torch_dtype=torch.bfloat16, 45 | local_files_only=True, 46 | cache_dir="pixart_sigma_hf_cache_dir_1024") 47 | # 28 of these. 48 | for block_idx, block in enumerate(pipe.transformer.transformer_blocks): 49 | block.attn1 = shard_transformer_attn(tp_degree, block.attn1) 50 | block.attn2 = shard_transformer_attn(tp_degree, block.attn2) 51 | block.ff = shard_transformer_feedforward(block.ff) 52 | 53 | mod_pipe_transformer_f = TracingTransformerWrapper(pipe.transformer) 54 | return mod_pipe_transformer_f, {} 55 | 56 | def compile_transformer(args): 57 | tp_degree = 4 58 | # tp_degree = 8 # Use tensor parallel degree as 8 for trn1/inf2 59 | os.environ["LOCAL_WORLD_SIZE"] = "4" # Use tensor parallel degree as 4 for trn2 60 | latent_height = args.height//8 61 | latent_width = args.width//8 62 | num_prompts = 1 63 | num_images_per_prompt = args.num_images_per_prompt 64 | max_sequence_length = args.max_sequence_length 65 | hidden_size = 4096 66 | compiler_workdir = args.compiler_workdir 67 | compiled_models_dir = args.compiled_models_dir 68 | batch_size = 2 69 | sample_hidden_states = torch.ones((batch_size, 4, latent_height, latent_width), dtype=torch.bfloat16) 70 | sample_encoder_hidden_states = torch.ones((batch_size, max_sequence_length, hidden_size), dtype=torch.bfloat16) 71 | sample_timestep = torch.ones((batch_size), dtype=torch.int64) 72 | sample_encoder_attention_mask = torch.ones((batch_size, max_sequence_length), dtype=torch.int64) 73 | 74 | get_transformer_model_f = partial(get_transformer_model, tp_degree) 75 | with torch.no_grad(): 76 | sample_inputs = sample_hidden_states, sample_encoder_hidden_states, sample_timestep, sample_encoder_attention_mask 77 | compiled_transformer = neuronx_distributed.trace.parallel_model_trace( 78 | get_transformer_model_f, 79 | sample_inputs, 80 | compiler_workdir=f"{compiler_workdir}/transformer", 81 | compiler_args=compiler_flags, 82 | tp_degree=tp_degree, 83 | inline_weights_to_neff=False, 84 | ) 85 | compiled_model_dir = f"{compiled_models_dir}/transformer" 86 | if not os.path.exists(compiled_model_dir): 87 | os.makedirs(compiled_model_dir) 88 | neuronx_distributed.trace.parallel_model_save( 89 | compiled_transformer, f"{compiled_model_dir}") 90 | 91 | if __name__ == "__main__": 92 | parser = argparse.ArgumentParser() 93 | parser.add_argument("--height", help="height of generated image.", type=int, default=1024) 94 | parser.add_argument("--width", help="width of generated image.", type=int, default=1024) 95 | parser.add_argument("--num_images_per_prompt", help="number of images per prompt.", type=int, default=1) 96 | parser.add_argument("--max_sequence_length", help="max sequence length.", type=int, default=300) 97 | parser.add_argument("--compiler_workdir", help="dir for 
compiler artifacts.", type=str, default="compiler_workdir") 98 | parser.add_argument("--compiled_models_dir", help="dir for compiled artifacts.", type=str, default="compiled_models") 99 | args = parser.parse_args() 100 | compile_transformer(args) -------------------------------------------------------------------------------- /torch-neuronx/inference/hf_pretrained_pixart_sigma_1k/neuron_pixart_sigma/compile_transformer_throughput_optimized.py: -------------------------------------------------------------------------------- 1 | import os 2 | os.environ["NEURON_FUSE_SOFTMAX"] = "1" 3 | os.environ["NEURON_CUSTOM_SILU"] = "1" 4 | os.environ["NEURON_RT_VIRTUAL_CORE_SIZE"] = "2" # Comment this line out if using trn1/inf2 5 | os.environ["NEURON_LOGICAL_NC_CONFIG"] = "2" # Comment this line out if using trn1/inf2 6 | compiler_flags = """ --verbose=INFO --target=trn2 --lnc=2 --model-type=unet-inference --enable-fast-loading-neuron-binaries """ # Use these compiler flags for trn2 7 | # compiler_flags = """ --verbose=INFO --target=trn1 --model-type=unet-inference --enable-fast-loading-neuron-binaries """ # Use these compiler flags for trn1/inf2 8 | os.environ["NEURON_CC_FLAGS"] = os.environ.get("NEURON_CC_FLAGS", "") + compiler_flags 9 | 10 | from diffusers import PixArtSigmaPipeline 11 | from diffusers.models.transformers.pixart_transformer_2d import PixArtTransformer2DModel 12 | import torch 13 | import argparse 14 | import torch_neuronx 15 | from torch import nn 16 | from functools import partial 17 | 18 | from neuron_commons import attention_wrapper, attention_wrapper_for_transformer 19 | from neuron_parallel_utils import shard_transformer_attn, shard_transformer_feedforward, get_sharded_data 20 | 21 | torch.nn.functional.scaled_dot_product_attention = attention_wrapper_for_transformer 22 | 23 | class TracingTransformerWrapper(nn.Module): 24 | def __init__(self, transformer): 25 | super().__init__() 26 | self.transformer = transformer 27 | self.config = transformer.config 28 | self.dtype = transformer.dtype 29 | self.device = transformer.device 30 | 31 | def forward(self, hidden_states=None, encoder_hidden_states=None, timestep=None, encoder_attention_mask=None, **kwargs): 32 | return self.transformer( 33 | hidden_states=hidden_states, 34 | encoder_hidden_states=encoder_hidden_states, 35 | timestep=timestep, 36 | encoder_attention_mask=encoder_attention_mask, 37 | added_cond_kwargs={"resolution": None, "aspect_ratio": None}, 38 | return_dict=False) 39 | 40 | def get_transformer_model(): 41 | pipe: PixArtSigmaPipeline = PixArtSigmaPipeline.from_pretrained( 42 | "PixArt-alpha/PixArt-Sigma-XL-2-1024-MS", 43 | torch_dtype=torch.bfloat16, 44 | local_files_only=True, 45 | cache_dir="pixart_sigma_hf_cache_dir_1024") 46 | mod_pipe_transformer_f = TracingTransformerWrapper(pipe.transformer) 47 | return mod_pipe_transformer_f 48 | 49 | def compile_transformer(args): 50 | latent_height = args.height//8 51 | latent_width = args.width//8 52 | num_prompts = 1 53 | num_images_per_prompt = args.num_images_per_prompt 54 | max_sequence_length = args.max_sequence_length 55 | hidden_size = 4096 56 | compiler_workdir = args.compiler_workdir 57 | compiled_models_dir = args.compiled_models_dir 58 | batch_size = 2 59 | sample_hidden_states = torch.ones((batch_size, 4, latent_height, latent_width), dtype=torch.bfloat16) 60 | sample_encoder_hidden_states = torch.ones((batch_size, max_sequence_length, hidden_size), dtype=torch.bfloat16) 61 | sample_timestep = torch.ones((batch_size), dtype=torch.int64) 62 | 
sample_encoder_attention_mask = torch.ones((batch_size, max_sequence_length), dtype=torch.int64) 63 | get_transformer_model_f = get_transformer_model() #, tp_degree) 64 | with torch.no_grad(): 65 | sample_inputs = sample_hidden_states, sample_encoder_hidden_states, sample_timestep, sample_encoder_attention_mask 66 | compiled_transformer = torch_neuronx.trace( 67 | get_transformer_model_f, 68 | sample_inputs, 69 | compiler_workdir=f"{compiler_workdir}/transformer", 70 | compiler_args=compiler_flags, 71 | inline_weights_to_neff=False) 72 | 73 | compiled_model_dir = f"{compiled_models_dir}/transformer" 74 | if not os.path.exists(compiled_model_dir): 75 | os.makedirs(compiled_model_dir) 76 | torch.jit.save(compiled_transformer, f"{compiled_model_dir}/model.pt") 77 | 78 | if __name__ == "__main__": 79 | parser = argparse.ArgumentParser() 80 | parser.add_argument("--height", help="height of generated image.", type=int, default=1024) 81 | parser.add_argument("--width", help="width of generated image.", type=int, default=1024) 82 | parser.add_argument("--num_images_per_prompt", help="number of images per prompt.", type=int, default=1) 83 | parser.add_argument("--max_sequence_length", help="max sequence length.", type=int, default=300) 84 | parser.add_argument("--compiler_workdir", help="dir for compiler artifacts.", type=str, default="compiler_workdir") 85 | parser.add_argument("--compiled_models_dir", help="dir for compiled artifacts.", type=str, default="compiled_models") 86 | args = parser.parse_args() 87 | compile_transformer(args) -------------------------------------------------------------------------------- /torch-neuronx/inference/hf_pretrained_pixart_sigma_1k/neuron_pixart_sigma/neuron_commons.py: -------------------------------------------------------------------------------- 1 | from diffusers import PixArtSigmaPipeline, Transformer2DModel 2 | from transformers.models.t5.modeling_t5 import T5EncoderModel 3 | from torch import nn 4 | 5 | class InferenceTextEncoderWrapper(nn.Module): 6 | def __init__(self, dtype, t: T5EncoderModel, seqlen: int): 7 | super().__init__() 8 | self.dtype = dtype 9 | self.device = t.device 10 | self.t = t 11 | def forward(self, text_input_ids, attention_mask=None): 12 | return [self.t(text_input_ids, attention_mask)['last_hidden_state'].to(self.dtype)] 13 | 14 | class InferenceTransformerWrapper(nn.Module): 15 | def __init__(self, transformer: Transformer2DModel): 16 | super().__init__() 17 | self.transformer = transformer 18 | self.config = transformer.config 19 | self.dtype = transformer.dtype 20 | self.device = transformer.device 21 | def forward(self, hidden_states, encoder_hidden_states=None, timestep=None, 22 | encoder_attention_mask=None, added_cond_kwargs=None, 23 | return_dict=False): 24 | output = self.transformer( 25 | hidden_states, 26 | encoder_hidden_states, 27 | timestep, 28 | encoder_attention_mask) 29 | return output 30 | 31 | class SimpleWrapper(nn.Module): 32 | def __init__(self, model): 33 | super().__init__() 34 | self.model = model 35 | def forward(self, x): 36 | output = self.model(x) 37 | return output 38 | 39 | import torch 40 | import math 41 | from torch import nn 42 | 43 | from neuronxcc.starfish.penguin.targets.nki.private_api import vnc 44 | from torch_neuronx.xla_impl.ops import nki_jit 45 | from neuronxcc.nki._private_kernels.attention import attention_isa_kernel 46 | _flash_fwd_call = nki_jit()(attention_isa_kernel) 47 | 48 | 49 | def neuron_scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=None, 
is_causal=None): 50 | orig_shape = None 51 | if len(query.shape) == 4: 52 | orig_shape = query.shape 53 | def to3d(x): 54 | return x.reshape(-1, x.shape[2], x.shape[3]) 55 | query, key, value = map(to3d, [query, key, value]) 56 | if query.size() == key.size(): 57 | attention_scores = torch.bmm(key, query.transpose(-1, -2)) * ( 58 | 1 / math.sqrt(query.size(-1)) 59 | ) 60 | attention_probs = attention_scores.softmax(dim=1).permute(0, 2, 1) 61 | else: 62 | attention_scores = torch.bmm(query, key.transpose(-1, -2)) * ( 63 | 1 / math.sqrt(query.size(-1)) 64 | ) 65 | attention_probs = attention_scores.softmax(dim=-1) 66 | attn_out = torch.bmm(attention_probs, value) 67 | if orig_shape: 68 | attn_out = attn_out.reshape( 69 | orig_shape[0], orig_shape[1], attn_out.shape[1], attn_out.shape[2] 70 | ) 71 | return attn_out 72 | 73 | 74 | def attention_wrapper_sharded_without_swap(query, key, value): 75 | bs, n_head, q_len, d_head = query.shape 76 | q = query.clone().permute(0, 1, 3, 2).reshape((bs*n_head, d_head, q_len)) 77 | k = key.clone().permute(0, 1, 3, 2).reshape((bs*n_head, d_head, q_len)) 78 | v = value.clone().reshape((bs*n_head, q_len, d_head)) 79 | attn_output = torch.zeros((bs*n_head, q_len, d_head), dtype=torch.bfloat16, device=q.device) 80 | use_sharded_attention_kernel = True # set True for trn2 81 | # use_sharded_attention_kernel = False # trn1/inf2 do not need the sharded kernel, so set this to False there 82 | if use_sharded_attention_kernel: 83 | grid = (vnc(2),) 84 | _flash_fwd_call[grid](q, k, v, 0.117, attn_output, kernel_name="AttentionMMSoftmaxMMWithoutSwap") # 0.117 is the softmax scale, approximately 1/sqrt(d_head=72) 85 | else: 86 | _flash_fwd_call(q, k, v, 0.117, attn_output, kernel_name="AttentionMMSoftmaxMMWithoutSwap") 87 | attn_output = attn_output.reshape((bs, n_head, q_len, d_head)) 88 | return attn_output 89 | 90 | 91 | sdpa_original = torch.nn.functional.scaled_dot_product_attention 92 | def attention_wrapper(query, key, value, attn_mask=None, dropout_p=None, is_causal=None): 93 | if attn_mask is not None: 94 | return sdpa_original(query, key, value, attn_mask=attn_mask, dropout_p=dropout_p, is_causal=is_causal) 95 | else: 96 | return neuron_scaled_dot_product_attention(query, key, value, attn_mask=attn_mask, dropout_p=dropout_p, is_causal=is_causal) 97 | 98 | def attention_wrapper_for_transformer(query, key, value, attn_mask=None, dropout_p=None, is_causal=None): 99 | if attn_mask is not None: 100 | return sdpa_original(query, key, value, attn_mask=attn_mask, dropout_p=dropout_p, is_causal=is_causal) 101 | else: 102 | return attention_wrapper_sharded_without_swap(query, key, value) 103 | 104 | class f32Wrapper(nn.Module): 105 | def __init__(self, original): 106 | super().__init__() 107 | self.original = original 108 | def forward(self, x): 109 | t = x.dtype 110 | y = x.to(torch.float32) 111 | output = self.original(y) 112 | return output.type(t) 113 | 114 | --------------------------------------------------------------------------------
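`neuron_scaled_dot_product_attention` is a plain bmm/softmax/bmm re-expression of SDPA; the `query.size() == key.size()` branch just reorders the same computation for self-attention. For the standard branch, equivalence with `torch.nn.functional.scaled_dot_product_attention` can be checked on CPU (an illustrative sanity check, not part of the sample):

```python
# Sketch: check the bmm/softmax/bmm formulation against torch's SDPA on CPU.
import math
import torch
import torch.nn.functional as F

torch.manual_seed(0)
q = torch.randn(2, 8, 16, 64)   # (batch, heads, seq, d_head)
k, v = torch.randn_like(q), torch.randn_like(q)

def bmm_attention(q, k, v):
    b, h, s, d = q.shape
    q3, k3, v3 = (t.reshape(b * h, s, d) for t in (q, k, v))
    probs = (torch.bmm(q3, k3.transpose(-1, -2)) / math.sqrt(d)).softmax(dim=-1)
    return torch.bmm(probs, v3).reshape(b, h, s, d)

ref = F.scaled_dot_product_attention(q, k, v)
assert torch.allclose(bmm_attention(q, k, v), ref, atol=1e-5)
```

--------------------------------------------------------------------------------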
/torch-neuronx/inference/hf_pretrained_pixart_sigma_1k/requirements.txt: -------------------------------------------------------------------------------- 1 | diffusers==0.31.0 2 | transformers==4.36.2 -------------------------------------------------------------------------------- /torch-neuronx/inference/sd2_inpainting_mask.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-neuron/aws-neuron-samples/15be8c363a3cbcf7d0795f782085a3b0b919e599/torch-neuronx/inference/sd2_inpainting_mask.png -------------------------------------------------------------------------------- /torch-neuronx/inference/sd2_inpainting_photo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-neuron/aws-neuron-samples/15be8c363a3cbcf7d0795f782085a3b0b919e599/torch-neuronx/inference/sd2_inpainting_photo.png -------------------------------------------------------------------------------- /torch-neuronx/microbenchmark/ubench_utils.py: -------------------------------------------------------------------------------- 1 | from numbers import Number 2 | import time 3 | 4 | 5 | class Timer: 6 | """ 7 | A simple Timer with high-enough granularity for performance 8 | measurements. The timer is designed to be used as a context manager. 9 | 10 | Example usage: 11 | with ubench_utils.Timer() as benchmark_timer: 12 | time.sleep(1) 13 | time.sleep(4) 14 | 15 | act_time = benchmark_timer() 16 | print("Sleeping for 5 seconds actually took {:.2g} seconds".format(act_time)) 17 | """ 18 | 19 | def __enter__(self): 20 | self.start = time.perf_counter() 21 | self.end = 0.0 22 | return lambda: self.end - self.start 23 | 24 | def __exit__(self, *args): 25 | self.end = time.perf_counter() 26 | 27 | --------------------------------------------------------------------------------
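Note that the callable returned by `Timer.__enter__` only yields a valid elapsed time after the `with` block exits (before that, `self.end` is still 0.0). A small throughput-style usage sketch, pure Python with `time.sleep` standing in for a compiled-model invocation:

```python
# Sketch: using Timer to derive throughput for a batch of iterations.
import time
import ubench_utils

iters = 5
with ubench_utils.Timer() as elapsed:
    for _ in range(iters):
        time.sleep(0.1)                 # stand-in for a compiled-model invocation

print(f"{iters / elapsed():.1f} iter/s")  # roughly 10 iter/s in this toy example
```

--------------------------------------------------------------------------------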
/torch-neuronx/training/aws-batch/all-reduce/README.md: -------------------------------------------------------------------------------- 1 | # AWS Batch / trn1 allreduce example 2 | 3 | This package shows how to run a multi-node allreduce test using trn1.32xlarge instances in AWS Batch. A successful allreduce test indicates that the Neuron driver, Neuron SDK, and EFA driver are installed properly, and the required EFA device configuration + connectivity is in place to support multi-node training. 4 | 5 | It is expected that these scripts will be run from an x86_64-based Linux instance. 6 | 7 | Note: to use trn1n.32xlarge instances, the launch template and job definition will need to be adjusted to use 16 EFA devices (currently using 8 EFA devices for trn1.32xlarge). 8 | 9 | Prereqs: 10 | * Existing VPC with subnet and appropriate [EFA security group](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa-start.html#efa-start-security). The subnet must be private, and the VPC must have a NAT gateway to provide internet connectivity for the private subnet. 11 | * ECR repo 12 | * AWS CLI installed and configured with permissions for Batch and ECR 13 | * Docker installed 14 | * jq installed 15 | 16 | Steps: 17 | * Modify `build_configs_and_setup.sh` with your account/region/etc. 18 | * Run `./build_configs_and_setup.sh` to create the configs/scripts using your config details 19 | * Run `./create_resources.sh` to create the various AWS Batch resources (job definition, compute environment, ...) 20 | * Run `./build_docker_image.sh` to build a training container using the latest Neuron Deep Learning Container (DLC) and push the image to ECR 21 | * Run `./submit_job.sh` to submit a basic 4-node allreduce job in the provisioned Batch environment -------------------------------------------------------------------------------- /torch-neuronx/training/aws-batch/all-reduce/build_configs_and_setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eu 3 | 4 | # ECR repo and image details. You can locate the correct Neuron DLC image for 'training' on AWS DLC github page - https://github.com/aws/deep-learning-containers/blob/master/available_images.md#neuron-containers 5 | export BASE_IMAGE_REPO=763104351884.dkr.ecr.us-west-2.amazonaws.com 6 | export BASE_IMAGE_NAME=pytorch-training-neuronx 7 | export BASE_IMAGE_TAG=1.13.1-neuronx-py310-sdk2.15.0-ubuntu20.04 8 | 9 | # Configure your account specific settings below 10 | export REGION= 11 | export ACCOUNT= 12 | export INSTANCE_ROLE= 13 | export SUBNET= 14 | export SG= 15 | export ECR_REPO= 16 | 17 | ECS_AMI_NAME=/aws/service/ecs/optimized-ami/amazon-linux-2/recommended/image_id 18 | export ECS_AMI=$(aws ssm get-parameter --region $REGION --name $ECS_AMI_NAME | jq -r .Parameter.Value) 19 | export USERDATA=$(cat << EOF | base64 -w0 20 | "MIME-Version: 1.0 21 | Content-Type: multipart/mixed; boundary="==MYBOUNDARY==" 22 | 23 | --==MYBOUNDARY== 24 | Content-Type: text/cloud-boothook; charset="us-ascii" 25 | 26 | cloud-init-per once yum_wget yum install -y wget 27 | cloud-init-per once wget_efa wget -q --timeout=20 https://s3-us-west-2.amazonaws.com/aws-efa-installer/aws-efa-installer-latest.tar.gz -O /tmp/aws-efa-installer-latest.tar.gz 28 | cloud-init-per once tar_efa tar -xf /tmp/aws-efa-installer-latest.tar.gz -C /tmp 29 | pushd /tmp/aws-efa-installer 30 | cloud-init-per once install_efa ./efa_installer.sh -y 31 | popd 32 | 33 | cloud-init-per once efa_info /opt/amazon/efa/bin/fi_info -p efa 34 | 35 | cloud-init-per once neuron_driver1 echo -e "[neuron]\nname=Neuron YUM Repository\nbaseurl=https://yum.repos.neuron.amazonaws.com\nenabled=1\nmetadata_expire=0" | tee /etc/yum.repos.d/neuron.repo > /dev/null 36 | cloud-init-per once neuron_driver2 rpm --import https://yum.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB 37 | cloud-init-per once neuron_driver3 yum update -y 38 | cloud-init-per once neuron_driver4 yum install kernel-devel-$(uname -r) kernel-headers-$(uname -r) -y 39 | cloud-init-per once neuron_driver5 yum erase aws-neuronx-dkms -y 40 | cloud-init-per once neuron_driver6 yum install aws-neuronx-dkms-2.* -y 41 | 42 | --==MYBOUNDARY==--" 43 | EOF 44 | ) 45 | 46 | # Apply variable substitutions to template files and resource creation script 47 | mkdir -p ./build 48 | 49 | for i in ./templates/*.json; do 50 | echo $i -\> ./build/`basename $i`; 51 | envsubst < $i > ./build/`basename $i`; 52 | done 53 | 54 | envsubst < ./templates/create_resources.sh > ./create_resources.sh \ 55 | && chmod u+x ./create_resources.sh \ 56 | && echo ./templates/create_resources.sh -\> ./create_resources.sh 57 | envsubst < ./templates/build_docker_image.sh > ./build_docker_image.sh \ 58 | && chmod u+x ./build_docker_image.sh \ 59 | && echo ./templates/build_docker_image.sh -\> ./build_docker_image.sh -------------------------------------------------------------------------------- /torch-neuronx/training/aws-batch/all-reduce/docker/Dockerfile:
-------------------------------------------------------------------------------- 1 | ARG BASE_IMAGE_REPO 2 | ARG BASE_IMAGE_NAME 3 | ARG BASE_IMAGE_TAG 4 | ARG DEBIAN_FRONTEND=noninteractive 5 | 6 | FROM ${BASE_IMAGE_REPO}/${BASE_IMAGE_NAME}:${BASE_IMAGE_TAG} 7 | 8 | COPY ./allreduce* / 9 | 10 | WORKDIR / 11 | RUN chmod +x allreduce.sh 12 | CMD ["/allreduce.sh"] -------------------------------------------------------------------------------- /torch-neuronx/training/aws-batch/all-reduce/docker/allreduce.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch_xla.core.xla_model as xm 3 | import torch.distributed as dist 4 | import torch_xla.distributed.xla_backend 5 | import os 6 | 7 | def rprint(txt): 8 | rank = os.environ.get("LOCAL_RANK", "unk") 9 | if rank == "0": # compare as a string: LOCAL_RANK may be unset ("unk") 10 | print(f"{rank}: {txt}", flush=True) 11 | 12 | dist.init_process_group('xla') 13 | rprint("Before 1st rendezvous") 14 | xm.rendezvous('first') 15 | 16 | device = xm.xla_device() 17 | for c in range(1000000): 18 | ones = torch.ones((2, 3)) 19 | xones = ones.to(device) 20 | result = xm.all_reduce('sum', xones) 21 | xm.mark_step() 22 | result_cpu = result.cpu() 23 | expected = torch.ones((2, 3)) * int(os.environ.get("WORLD_SIZE", 0)) 24 | assert torch.all(result_cpu == expected), f'ERROR: {result_cpu} != {expected}' 25 | if c % 100 == 0: 26 | rprint(f"result OK step {c}: {result}") 27 | 28 | rprint("Before final rendezvous") 29 | xm.rendezvous('final') -------------------------------------------------------------------------------- /torch-neuronx/training/aws-batch/all-reduce/docker/allreduce.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -o pipefail 3 | ulimit -n 65535 4 | sysctl -w net.ipv4.ip_local_reserved_ports=41000 5 | 6 | export FI_EFA_USE_DEVICE_RDMA=1 7 | export FI_PROVIDER=efa 8 | export FI_EFA_FORK_SAFE=1 9 | export CCOM_SOCKET_IFNAME=eth0 10 | 11 | export PROCESSES_PER_NODE=32 12 | if [ -v AWS_BATCH_JOB_MAIN_NODE_PRIVATE_IPV4_ADDRESS ] 13 | then 14 | export MASTER_ADDR=$AWS_BATCH_JOB_MAIN_NODE_PRIVATE_IPV4_ADDRESS 15 | else 16 | export MASTER_ADDR=`ip -f inet addr show eth0 | grep -Po 'inet \K[\d.]+'` 17 | fi 18 | export MASTER_PORT=41000 19 | export NODEID=$AWS_BATCH_JOB_NODE_INDEX 20 | export NTASKS=$AWS_BATCH_JOB_NUM_NODES 21 | 22 | DISTRIBUTED_ARGS="--nproc_per_node $PROCESSES_PER_NODE --nnodes $NTASKS --node_rank $NODEID --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 23 | echo $DISTRIBUTED_ARGS 24 | 25 | export MALLOC_ARENA_MAX=128 26 | export XLA_USE_BF16=1 27 | export TF_NUM_INTEROP_THREADS=8192 28 | 29 | set 30 | echo "Starting the job..."
31 | torchrun $DISTRIBUTED_ARGS allreduce.py -------------------------------------------------------------------------------- /torch-neuronx/training/aws-batch/all-reduce/submit_job.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eu 3 | 4 | # submitting aws batch job 5 | aws batch submit-job \ 6 | --job-name aws-batch-trn1-job \ 7 | --job-queue aws-batch-job-queue \ 8 | --job-definition aws-batch-job-definition \ 9 | --node-overrides numNodes=4 -------------------------------------------------------------------------------- /torch-neuronx/training/aws-batch/all-reduce/templates/build_docker_image.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eu 3 | # Build a Neuron container image for running all-reduce test on AWS Batch 4 | # and push the image to ECR 5 | export DOCKER_BUILDKIT=1 6 | 7 | # Authenticate with ECR, build & push the image 8 | pushd ./docker 9 | aws ecr get-login-password --region $REGION | docker login --username AWS \ 10 | --password-stdin $BASE_IMAGE_REPO \ 11 | && docker build . -t aws-batch:latest \ 12 | --build-arg BASE_IMAGE_REPO=$BASE_IMAGE_REPO \ 13 | --build-arg BASE_IMAGE_NAME=$BASE_IMAGE_NAME \ 14 | --build-arg BASE_IMAGE_TAG=$BASE_IMAGE_TAG 15 | 16 | aws ecr get-login-password --region $REGION | docker login --username AWS \ 17 | --password-stdin $ECR_REPO \ 18 | && docker tag aws-batch:latest $ECR_REPO:latest \ 19 | && docker push $ECR_REPO:latest 20 | popd -------------------------------------------------------------------------------- /torch-neuronx/training/aws-batch/all-reduce/templates/compute_env.json: -------------------------------------------------------------------------------- 1 | { 2 | "computeEnvironmentName" : "aws-batch-compute-environment", 3 | "computeResources" : { 4 | "desiredvCpus" : 0, 5 | "instanceRole" : "$INSTANCE_ROLE", 6 | "instanceTypes" : [ 7 | "trn1.32xlarge" 8 | ], 9 | "launchTemplate" : { 10 | "launchTemplateName" : "aws-batch-launch-template", 11 | "version" : "$Latest" 12 | }, 13 | "maxvCpus" : 2088, 14 | "minvCpus" : 0, 15 | "subnets" : [ 16 | "$SUBNET" 17 | ], 18 | "type" : "EC2" 19 | }, 20 | "state" : "ENABLED", 21 | "type" : "MANAGED" 22 | } 23 | -------------------------------------------------------------------------------- /torch-neuronx/training/aws-batch/all-reduce/templates/create_resources.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eu 3 | 4 | if [ ! `which jq` ] 5 | then 6 | echo "Please install jq and re-run this script" && exit 7 | fi 8 | 9 | aws ec2 create-placement-group --group-name "aws-batch-placement-group" --strategy "cluster" --region $REGION # creating the aws placement group 10 | aws ec2 create-launch-template --cli-input-json file://build/launch_template.json # creating the aws launch template 11 | aws batch create-compute-environment --cli-input-json file://build/compute_env.json # creating the aws batch compute environment 12 | aws batch register-job-definition --cli-input-json file://build/job_def.json # creating the aws batch job definition 13 | while [[ ! $(aws batch describe-compute-environments --compute-environments aws-batch-compute-environment | jq -r ".computeEnvironments[].status") =~ VALID ]] 14 | do 15 | echo -n "." 
16 | sleep 2 17 | done 18 | aws batch create-job-queue --cli-input-json file://build/job_queue.json # creating the aws batch job queue -------------------------------------------------------------------------------- /torch-neuronx/training/aws-batch/all-reduce/templates/job_def.json: -------------------------------------------------------------------------------- 1 | { 2 | "jobDefinitionName" : "aws-batch-job-definition", 3 | "nodeProperties" : { 4 | "mainNode" : 0, 5 | "nodeRangeProperties" : [ 6 | { 7 | "container" : { 8 | "image" : "$ECR_REPO:latest", 9 | "linuxParameters" : { 10 | "devices" : [ 11 | { 12 | "hostPath" : "/dev/infiniband/uverbs0" 13 | }, 14 | { 15 | "hostPath" : "/dev/infiniband/uverbs1" 16 | }, 17 | { 18 | "hostPath" : "/dev/infiniband/uverbs2" 19 | }, 20 | { 21 | "hostPath" : "/dev/infiniband/uverbs3" 22 | }, 23 | { 24 | "hostPath" : "/dev/infiniband/uverbs4" 25 | }, 26 | { 27 | "hostPath" : "/dev/infiniband/uverbs5" 28 | }, 29 | { 30 | "hostPath" : "/dev/infiniband/uverbs6" 31 | }, 32 | { 33 | "hostPath" : "/dev/infiniband/uverbs7" 34 | }, 35 | { 36 | "hostPath": "/dev/neuron0" 37 | }, 38 | { 39 | "hostPath": "/dev/neuron1" 40 | }, 41 | { 42 | "hostPath": "/dev/neuron2" 43 | }, 44 | { 45 | "hostPath": "/dev/neuron3" 46 | }, 47 | { 48 | "hostPath": "/dev/neuron4" 49 | }, 50 | { 51 | "hostPath": "/dev/neuron5" 52 | }, 53 | { 54 | "hostPath": "/dev/neuron6" 55 | }, 56 | { 57 | "hostPath": "/dev/neuron7" 58 | }, 59 | { 60 | "hostPath": "/dev/neuron8" 61 | }, 62 | { 63 | "hostPath": "/dev/neuron9" 64 | }, 65 | { 66 | "hostPath": "/dev/neuron10" 67 | }, 68 | { 69 | "hostPath": "/dev/neuron11" 70 | }, 71 | { 72 | "hostPath": "/dev/neuron12" 73 | }, 74 | { 75 | "hostPath": "/dev/neuron13" 76 | }, 77 | { 78 | "hostPath": "/dev/neuron14" 79 | }, 80 | { 81 | "hostPath": "/dev/neuron15" 82 | } 83 | ] 84 | }, 85 | "memory" : 500000, 86 | "ulimits" : [ 87 | { 88 | "hardLimit" : -1, 89 | "name" : "memlock", 90 | "softLimit" : -1 91 | } 92 | ], 93 | "user" : "root", 94 | "vcpus" : 96, 95 | "instanceType" : "trn1.32xlarge" 96 | }, 97 | "targetNodes" : "0:" 98 | } 99 | ], 100 | "numNodes" : 4 101 | }, 102 | "type" : "multinode" 103 | } -------------------------------------------------------------------------------- /torch-neuronx/training/aws-batch/all-reduce/templates/job_queue.json: -------------------------------------------------------------------------------- 1 | { 2 | "computeEnvironmentOrder" : [ 3 | { 4 | "computeEnvironment" : "aws-batch-compute-environment", 5 | "order" : 1 6 | } 7 | ], 8 | "jobQueueName" : "aws-batch-job-queue", 9 | "priority" : 10, 10 | "state" : "ENABLED" 11 | } 12 | 13 | -------------------------------------------------------------------------------- /torch-neuronx/training/aws-batch/all-reduce/templates/launch_template.json: -------------------------------------------------------------------------------- 1 | { 2 | "LaunchTemplateName" : "aws-batch-launch-template", 3 | "LaunchTemplateData" : { 4 | "IamInstanceProfile" : { 5 | "Arn" : "$INSTANCE_ROLE" 6 | }, 7 | "InstanceType" : "trn1.32xlarge", 8 | "ImageId" : "$ECS_AMI", 9 | "NetworkInterfaces" : [ 10 | { 11 | "Description" : "NetworkInterfaces Configuration For EFA and Batch", 12 | "DeviceIndex" : 0, 13 | "Groups" : [ 14 | "$SG" 15 | ], 16 | "InterfaceType" : "efa", 17 | "NetworkCardIndex" : 0, 18 | "SubnetId" : "$SUBNET" 19 | }, 20 | { 21 | "Description" : "NetworkInterfaces Configuration For EFA and Batch", 22 | "DeviceIndex" : 1, 23 | "Groups" : [ 24 | "$SG" 25 | ], 26 | "InterfaceType" : "efa", 27 
| "NetworkCardIndex" : 1, 28 | "SubnetId" : "$SUBNET" 29 | }, 30 | { 31 | "Description" : "NetworkInterfaces Configuration For EFA and Batch", 32 | "DeviceIndex" : 1, 33 | "Groups" : [ 34 | "$SG" 35 | ], 36 | "InterfaceType" : "efa", 37 | "NetworkCardIndex" : 2, 38 | "SubnetId" : "$SUBNET" 39 | }, 40 | { 41 | "Description" : "NetworkInterfaces Configuration For EFA and Batch", 42 | "DeviceIndex" : 1, 43 | "Groups" : [ 44 | "$SG" 45 | ], 46 | "InterfaceType" : "efa", 47 | "NetworkCardIndex" : 3, 48 | "SubnetId" : "$SUBNET" 49 | }, 50 | { 51 | "Description" : "NetworkInterfaces Configuration For EFA and Batch", 52 | "DeviceIndex" : 1, 53 | "Groups" : [ 54 | "$SG" 55 | ], 56 | "InterfaceType" : "efa", 57 | "NetworkCardIndex" : 4, 58 | "SubnetId" : "$SUBNET" 59 | }, 60 | { 61 | "Description" : "NetworkInterfaces Configuration For EFA and Batch", 62 | "DeviceIndex" : 1, 63 | "Groups" : [ 64 | "$SG" 65 | ], 66 | "InterfaceType" : "efa", 67 | "NetworkCardIndex" : 5, 68 | "SubnetId" : "$SUBNET" 69 | }, 70 | { 71 | "Description" : "NetworkInterfaces Configuration For EFA and Batch", 72 | "DeviceIndex" : 1, 73 | "Groups" : [ 74 | "$SG" 75 | ], 76 | "InterfaceType" : "efa", 77 | "NetworkCardIndex" : 6, 78 | "SubnetId" : "$SUBNET" 79 | }, 80 | { 81 | "Description" : "NetworkInterfaces Configuration For EFA and Batch", 82 | "DeviceIndex" : 1, 83 | "Groups" : [ 84 | "$SG" 85 | ], 86 | "InterfaceType" : "efa", 87 | "NetworkCardIndex" : 7, 88 | "SubnetId" : "$SUBNET" 89 | } 90 | ], 91 | "Placement" : { 92 | "GroupName" : "aws-batch-placement-group" 93 | }, 94 | "TagSpecifications" : [ 95 | { 96 | "ResourceType" : "instance", 97 | "Tags" : [ 98 | { 99 | "Key" : "from-lt", 100 | "Value" : "networkInterfacesConfig-EFA-Batch" 101 | } 102 | ] 103 | } 104 | ], 105 | "UserData" : "$USERDATA" 106 | } 107 | } -------------------------------------------------------------------------------- /torch-neuronx/training/aws-batch/llama2/config.txt: -------------------------------------------------------------------------------- 1 | REGION=us-east-1 2 | SUBNET=subnet-012345abcd5689 3 | SG=sg-012345abcd5689 4 | ECR_REPO=1010101010.dkr.ecr.us-east-1.amazonaws.com/your-docker-repo 5 | INSTANCE_ROLE=arn:aws:iam::1010101010:instance-profile/your-instance-role 6 | DO_PRE_COMPILATION=true 7 | TOKENIZED_DATASET_URI=s3://your/s3/location/to/store/tokenized/dataset/ 8 | NEURON_COMPILE_CACHE_URI=s3://your/s3/location/to/store/compile-cache/ 9 | CHECKPOINT_SAVE_URI=s3://your/s3/location/to/store/tokenized/checkpoints/ -------------------------------------------------------------------------------- /torch-neuronx/training/aws-batch/llama2/docker/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG BASE_IMAGE_REPO 2 | ARG BASE_IMAGE_NAME 3 | ARG BASE_IMAGE_TAG 4 | ARG DEBIAN_FRONTEND=noninteractive 5 | 6 | FROM ${BASE_IMAGE_REPO}/${BASE_IMAGE_NAME}:${BASE_IMAGE_TAG} 7 | 8 | COPY ./llama2 / 9 | 10 | WORKDIR / 11 | RUN chmod +x /llama_batch_training.sh 12 | CMD ["/llama_batch_training.sh"] -------------------------------------------------------------------------------- /torch-neuronx/training/aws-batch/llama2/docker/llama_batch_training.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -euo pipefail 3 | 4 | python -m pip install neuronx_distributed --extra-index-url https://pip.repos.neuron.amazonaws.com 5 | SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) 6 | ulimit -n 65535 7 | sysctl -w 
net.ipv4.ip_local_reserved_ports=41000 8 | 9 | export FI_EFA_USE_DEVICE_RDMA=1 10 | export FI_PROVIDER=efa 11 | export FI_EFA_FORK_SAFE=1 12 | export CCOM_SOCKET_IFNAME=eth0 13 | 14 | export MASTER_PORT=41000 15 | export NODEID=$AWS_BATCH_JOB_NODE_INDEX 16 | export NTASKS=$AWS_BATCH_JOB_NUM_NODES 17 | 18 | export MALLOC_ARENA_MAX=64 19 | export XLA_USE_BF16=1 20 | export TF_NUM_INTEROP_THREADS=8192 21 | export PROCESSES_PER_NODE=32 22 | export NEURON_CC_FLAGS="--model-type transformer --distribution-strategy=llm-training --cache_dir=$NEURON_COMPILE_CACHE_URI" 23 | export NEURON_FUSE_SOFTMAX=1 24 | export NEURON_RT_ASYNC_EXEC_MAX_INFLIGHT_REQUESTS=3 25 | export NUM_NEURONCORES=32 26 | 27 | export NEURON_RT_NUM_CORES=32 28 | export NUM_NEURONCORES=$NEURON_RT_NUM_CORES 29 | export TPU_NUM_DEVICES=$NEURON_RT_NUM_CORES 30 | export TPU_CHIPS_PER_HOST_BOUNDS=$NEURON_RT_NUM_CORES 31 | export NEURON_RT_ROOT_COMM_ID=localhost:48620 32 | 33 | # TP degree 34 | TP_DEGREE=8 35 | # 0: bf16; 1: mixed precision 36 | USE_MIX_PRECISION=1 37 | # 0: use pure DP; 1: use ZeRO-1 38 | USE_ZERO_1=1 39 | # global batch size 40 | GBS=1024 41 | # micro batch size 42 | MBS=1 43 | # number of steps to run 44 | TOTAL_STEPS=10000 45 | # warmup steps 46 | WARMUP_STEPS=100 47 | # learning rate 48 | LR=3.0e-4 49 | # model path 50 | MODEL_PATH=$SCRIPT_DIR 51 | # data path 52 | DATA_PATH="$HOME/examples_datasets/wikicorpus_llama2_7B_tokenized_4k" 53 | # sequence length 54 | SEQ_LEN=4096 55 | # pre-compilation steps 56 | PRE_COMPILATION_STEPS_COUNT=2 57 | # training job steps 58 | STEPS_THIS_RUN=-1 59 | # output directory 60 | OUTPUT_DIR="/llama_checkpoints" 61 | # S3 checkpoint directory 62 | CURRENT_BATCH_JOB_ID=$(echo "$AWS_BATCH_JOB_ID" | sed 's/#.*//') 63 | CHECKPOINT_PATH="$CHECKPOINT_SAVE_URI$CURRENT_BATCH_JOB_ID" 64 | 65 | if [ -v AWS_BATCH_JOB_MAIN_NODE_PRIVATE_IPV4_ADDRESS ] 66 | then 67 | export MASTER_ADDR=$AWS_BATCH_JOB_MAIN_NODE_PRIVATE_IPV4_ADDRESS 68 | else 69 | export MASTER_ADDR=`ip -f inet addr show eth0 | grep -Po 'inet \K[\d.]+'` 70 | fi 71 | 72 | DP=$(($NEURON_RT_NUM_CORES * $NTASKS / $TP_DEGREE)) 73 | ACC_STEPS=$(($GBS / $MBS / $DP)) 74 | 75 | EXTRA_ARGS=" " 76 | if [ $USE_MIX_PRECISION -gt 0 ]; then 77 | EXTRA_ARGS+=" --use_mix_precision" 78 | fi 79 | if [ $USE_ZERO_1 -gt 0 ]; then 80 | EXTRA_ARGS+=" --use_zero_1" 81 | fi 82 | 83 | DISTRIBUTED_ARGS="--nproc_per_node $PROCESSES_PER_NODE --nnodes $NTASKS --node_rank $NODEID --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 84 | 85 | TRAINING_ARGS="--model_path $MODEL_PATH --data_dir $DATA_PATH --tensor_parallel_size $TP_DEGREE --batch_size $MBS \ 86 | --max_steps $TOTAL_STEPS --warmup_steps $WARMUP_STEPS --lr $LR --grad_accum_usteps $ACC_STEPS --seq_len $SEQ_LEN --sequence_parallel_enabled \ 87 | --selective_checkpoint_enabled --logging_interval 10 --output_dir $OUTPUT_DIR $EXTRA_ARGS" 88 | 89 | TORCH_RUN_COMMAND="torchrun $DISTRIBUTED_ARGS tp_zero1_llama2_7b_hf_pretrain.py $TRAINING_ARGS" 90 | 91 | set 92 | echo "Installing all dependencies..." 93 | python3 -m pip install -r requirements.txt 94 | 95 | # Downloading the pre-tokenized dataset from s3 96 | echo "Downloading tokenized dataset..." 97 | aws s3 cp $TOKENIZED_DATASET_URI $DATA_PATH --recursive --only-show-errors 98 | 99 | # Running Pre-Compilation 100 | if [ "$DO_PRE_COMPILATION" = true ]; then 101 | echo "Starting neuron parallel compilation..." 
102 | neuron_parallel_compile $TORCH_RUN_COMMAND --steps_this_run $PRE_COMPILATION_STEPS_COUNT 103 | fi 104 | 105 | # Running Training Job 106 | echo "Starting the training job..." 107 | $TORCH_RUN_COMMAND --steps_this_run $STEPS_THIS_RUN 108 | 109 | # Uploading checkpoints to S3 110 | aws s3 cp $OUTPUT_DIR $CHECKPOINT_PATH --recursive --only-show-errors 111 | echo "Saved the checkpoints to $CHECKPOINT_PATH" -------------------------------------------------------------------------------- /torch-neuronx/training/aws-batch/llama2/images/aws-batch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-neuron/aws-neuron-samples/15be8c363a3cbcf7d0795f782085a3b0b919e599/torch-neuronx/training/aws-batch/llama2/images/aws-batch.png -------------------------------------------------------------------------------- /torch-neuronx/training/aws-batch/llama2/scripts/build_and_push_docker_image.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | export DOCKER_BUILDKIT=1 5 | 6 | pushd ./docker 7 | # Build a Neuron container image for running the Llama2 training job on AWS Batch and push the image to ECR 8 | # Authenticate with ECR, build & push the image 9 | aws ecr get-login-password --region $REGION | docker login --username AWS \ 10 | --password-stdin $BASE_IMAGE_REPO \ 11 | && docker build . -t aws-batch:latest \ 12 | --build-arg BASE_IMAGE_REPO=$BASE_IMAGE_REPO \ 13 | --build-arg BASE_IMAGE_NAME=$BASE_IMAGE_NAME \ 14 | --build-arg BASE_IMAGE_TAG=$BASE_IMAGE_TAG 15 | 16 | aws ecr get-login-password --region $REGION | docker login --username AWS \ 17 | --password-stdin $ECR_REPO \ 18 | && docker tag aws-batch:latest $ECR_REPO:latest \ 19 | && docker push $ECR_REPO:latest 20 | popd -------------------------------------------------------------------------------- /torch-neuronx/training/aws-batch/llama2/scripts/cleanup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | aws ec2 delete-placement-group --group-name $PLACEMENT_GROUP_NAME # deleting the placement group 5 | aws ec2 delete-launch-template --launch-template-name $LAUNCH_TEMPLATE_NAME # deleting the launch template 6 | 7 | aws batch update-job-queue --job-queue $JOB_QUEUE_NAME --state DISABLED # disabling the job queue 8 | while [[ ! $( aws batch describe-job-queues --job-queues $JOB_QUEUE_NAME | jq -r ".jobQueues[].state") =~ DISABLED ]] 9 | do 10 | echo -n "." 11 | sleep 2 12 | done 13 | aws batch delete-job-queue --job-queue $JOB_QUEUE_NAME # deleting the job queue 14 | while [[ $(aws batch describe-job-queues --job-queues $JOB_QUEUE_NAME | jq -r '.jobQueues | length') -ne 0 ]]; do 15 | echo -n "." 16 | sleep 5 17 | done 18 | 19 | aws batch update-compute-environment --compute-environment $COMPUTE_ENV_NAME --state DISABLED # disabling the compute environment 20 | while [[ ! $(aws batch describe-compute-environments --compute-environments $COMPUTE_ENV_NAME | jq -r ".computeEnvironments[].status") =~ VALID ]] 21 | do 22 | echo -n "." 23 | sleep 5 24 | done 25 | aws batch delete-compute-environment --compute-environment $COMPUTE_ENV_NAME # deleting the compute environment 26 | aws batch deregister-job-definition --job-definition $JOB_DEF_NAME # deregistering the aws batch job definition 27 | echo -e "\nCleaned up all the resources."
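Note: `cleanup.sh` above and `create_resources.sh` below repeat the same describe/jq/sleep polling pattern for every AWS Batch resource. A small shell helper could factor that pattern out; the sketch below is illustrative only (the `wait_for` function is hypothetical and not part of this repo):

```
# Hypothetical helper: poll a describe command until the jq-extracted field
# matches the expected value, printing a dot between attempts.
wait_for () {
  local describe="$1" filter="$2" expected="$3"
  while [[ ! $($describe | jq -r "$filter") =~ $expected ]]; do
    echo -n "."
    sleep 2
  done
}

# Example: block until the compute environment becomes VALID
wait_for "aws batch describe-compute-environments --compute-environments $COMPUTE_ENV_NAME" \
         '.computeEnvironments[].status' VALID
```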
-------------------------------------------------------------------------------- /torch-neuronx/training/aws-batch/llama2/scripts/create_resources.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | if [ ! `which jq` ] 5 | then 6 | echo "Please install jq and re-run this script" && exit 1 7 | fi 8 | 9 | aws ec2 create-placement-group --group-name $PLACEMENT_GROUP_NAME --strategy cluster # creating the placement group 10 | aws ec2 create-launch-template --cli-input-json file://build/launch_template.json # creating the aws launch template 11 | aws batch create-compute-environment --cli-input-json file://build/compute_env.json # creating the aws batch compute environment 12 | 13 | aws batch register-job-definition --cli-input-json file://build/job_def.json # creating the aws batch job definition 14 | while [[ ! $(aws batch describe-compute-environments --compute-environments $COMPUTE_ENV_NAME | jq -r ".computeEnvironments[].status") =~ VALID ]] 15 | do 16 | echo -n "." 17 | sleep 2 18 | done 19 | 20 | aws batch create-job-queue --cli-input-json file://build/job_queue.json # creating the aws batch job queue 21 | while [[ ! $(aws batch describe-job-queues --job-queues $JOB_QUEUE_NAME | jq -r ".jobQueues[].status") =~ VALID ]] 22 | do 23 | echo -n "." 24 | sleep 2 25 | done -------------------------------------------------------------------------------- /torch-neuronx/training/aws-batch/llama2/scripts/download_and_tokenize_data.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -eu 3 | 4 | # installing the requirements 5 | python3 -m pip install transformers regex datasets sentencepiece protobuf==3.20.* 6 | 7 | # downloading and tokenizing the dataset 8 | cd ./data 9 | python3 get_dataset.py 10 | 11 | # pushing the tokenized dataset to predefined S3 location 12 | aws s3 cp ~/examples_datasets/wikicorpus_llama2_7B_tokenized_4k/ $TOKENIZED_DATASET_URI --recursive --only-show-errors 13 | echo "Dataset has been processed and tokenized data has been uploaded to $TOKENIZED_DATASET_URI successfully." 14 | -------------------------------------------------------------------------------- /torch-neuronx/training/aws-batch/llama2/scripts/submit_batch_job.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eu 3 | 4 | # submitting aws batch job 5 | aws batch submit-job \ 6 | --job-name $JOB_NAME \ 7 | --job-queue $JOB_QUEUE_NAME \ 8 | --job-definition $JOB_DEF_NAME \ 9 | --node-overrides numNodes=4 -------------------------------------------------------------------------------- /torch-neuronx/training/aws-batch/llama2/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | # Read variables from config file 5 | source config.txt 6 | 7 | export REGION 8 | export SUBNET 9 | export SG 10 | export ECR_REPO 11 | export INSTANCE_ROLE 12 | export DO_PRE_COMPILATION 13 | export TOKENIZED_DATASET_URI 14 | export NEURON_COMPILE_CACHE_URI 15 | export CHECKPOINT_SAVE_URI 16 | 17 | # ECR repo and image details.
You can locate the correct Neuron DLC image for 'training' on the AWS DLC GitHub page - https://github.com/aws/deep-learning-containers/blob/master/available_images.md#neuron-containers 18 | export BASE_IMAGE_REPO=763104351884.dkr.ecr.$REGION.amazonaws.com 19 | export BASE_IMAGE_NAME=pytorch-training-neuronx 20 | export BASE_IMAGE_TAG=1.13.1-neuronx-py310-sdk2.18.0-ubuntu20.04 21 | export ECS_AMI_NAME=/aws/service/ecs/optimized-ami/amazon-linux-2023/recommended/image_id 22 | export ECS_AMI=$(aws ssm get-parameter --region $REGION --name $ECS_AMI_NAME | jq -r .Parameter.Value) 23 | 24 | export PLACEMENT_GROUP_NAME=aws-batch-placement-group 25 | export LAUNCH_TEMPLATE_NAME=aws-batch-launch-template 26 | export COMPUTE_ENV_NAME=aws-batch-compute-environment 27 | export JOB_QUEUE_NAME=aws-batch-job-queue 28 | export JOB_DEF_NAME=aws-batch-job-definition 29 | export JOB_NAME=aws-batch-job 30 | 31 | export USER_DATA=$(cat << EOF | base64 -w0 32 | MIME-Version: 1.0 33 | Content-Type: multipart/mixed; boundary="==MYBOUNDARY==" 34 | 35 | --==MYBOUNDARY== 36 | Content-Type: text/cloud-boothook; charset="us-ascii" 37 | 38 | #!/bin/bash 39 | sudo yum install -y libibverbs-utils rdma-core-devel ibacm infiniband-diags-compat librdmacm-utils 40 | cloud-init-per once yum_wget yum install -y wget 41 | cloud-init-per once wget_efa wget -q --timeout=20 https://s3-us-west-2.amazonaws.com/aws-efa-installer/aws-efa-installer-latest.tar.gz -O /tmp/aws-efa-installer-latest.tar.gz 42 | cloud-init-per once tar_efa tar -xf /tmp/aws-efa-installer-latest.tar.gz -C /tmp 43 | pushd /tmp/aws-efa-installer 44 | cloud-init-per once install_efa ./efa_installer.sh -y 45 | popd 46 | 47 | cloud-init-per once efa_info /opt/amazon/efa/bin/fi_info -p efa 48 | 49 | cloud-init-per once neuron_driver1 echo -e "[neuron]\nname=Neuron YUM Repository\nbaseurl=https://yum.repos.neuron.amazonaws.com\nenabled=1\nmetadata_expire=0" | tee /etc/yum.repos.d/neuron.repo > /dev/null 50 | cloud-init-per once neuron_driver2 rpm --import https://yum.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB 51 | cloud-init-per once neuron_driver3 yum update -y 52 | cloud-init-per once neuron_driver4 yum install kernel-devel-\$(uname -r) kernel-headers-\$(uname -r) -y 53 | cloud-init-per once neuron_driver5 yum erase aws-neuronx-dkms -y 54 | cloud-init-per once neuron_driver6 yum install aws-neuronx-dkms-2.* -y 55 | 56 | --==MYBOUNDARY==-- 57 | EOF 58 | ) 59 | 60 | # Creating directories required for setup 61 | mkdir -p ./data 62 | mkdir -p ./build 63 | mkdir -p ./docker/llama2 64 | 65 | # Locating and moving the tokenizer to the required directory 66 | if [[ ! -e "tokenizer.model" ]]; then 67 | echo "Tokenizer file does not exist.
Please ensure the tokenizer file is placed in the root directory and named 'tokenizer.model'." 68 | exit 1 69 | fi 70 | mv tokenizer.model ./data/ 71 | 72 | # Downloading the sample files required for data pre-processing 73 | wget -q -P ./data/ https://raw.githubusercontent.com/aws-neuron/neuronx-distributed/master/examples/training/llama2/get_dataset.py 74 | wget -q -P ./data/ https://raw.githubusercontent.com/aws-neuron/neuronx-distributed/master/examples/training/llama2/tp_zero1_llama2_7b_hf_pretrain/config.json 75 | 76 | # Substituting environment variables in the required files 77 | for template in ./templates/*.json; do envsubst < $template > ./build/`basename $template`; done 78 | for script in ./scripts/*.sh; do envsubst < $script > ./`basename $script`; chmod u+x ./`basename $script`; done 79 | envsubst '$DO_PRE_COMPILATION $NEURON_COMPILE_CACHE_URI $CHECKPOINT_SAVE_URI $TOKENIZED_DATASET_URI' < ./docker/llama_batch_training.sh > ./docker/llama2/llama_batch_training.sh 80 | 81 | # Downloading the sample files required for Llama training 82 | pushd . > /dev/null 83 | cd ./docker/llama2 84 | wget -q https://raw.githubusercontent.com/aws-neuron/neuronx-distributed/master/examples/training/llama2/tp_zero1_llama2_7b_hf_pretrain/tp_zero1_llama2_7b_hf_pretrain.py 85 | wget -q https://raw.githubusercontent.com/aws-neuron/neuronx-distributed/master/examples/training/llama2/tp_zero1_llama2_7b_hf_pretrain/logger.py 86 | wget -q https://raw.githubusercontent.com/aws-neuron/neuronx-distributed/master/examples/training/llama2/modeling_llama_nxd.py 87 | wget -q https://raw.githubusercontent.com/aws-neuron/neuronx-distributed/master/examples/training/llama2/requirements.txt 88 | wget -q https://raw.githubusercontent.com/aws-neuron/neuronx-distributed/master/examples/training/llama2/tp_zero1_llama2_7b_hf_pretrain/config.json 89 | wget -q https://raw.githubusercontent.com/aws-neuron/neuronx-distributed/master/examples/training/llama2/training_utils.py 90 | popd > /dev/null 91 | echo "Setup has been completed successfully."
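Note: the Llama2 AWS Batch scripts are meant to be run from the `llama2` directory in sequence. The order below is an assumption inferred from the dependencies between the scripts (setup.sh renders the envsubst-ed copies of the other scripts into the current directory with values from config.txt baked in):

```
bash setup.sh                      # export config, render templates into ./build and scripts into ./
./build_and_push_docker_image.sh   # build the training image and push it to ECR
./create_resources.sh              # placement group, launch template, compute env, job def, job queue
./download_and_tokenize_data.sh    # tokenize the dataset and upload it to S3
./submit_batch_job.sh              # submit the 4-node training job
./cleanup.sh                       # tear down all AWS Batch resources when finished
```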
-------------------------------------------------------------------------------- /torch-neuronx/training/aws-batch/llama2/templates/compute_env.json: -------------------------------------------------------------------------------- 1 | { 2 | "computeEnvironmentName" : "$COMPUTE_ENV_NAME", 3 | "computeResources" : { 4 | "desiredvCpus" : 0, 5 | "maxvCpus" : 2088, 6 | "minvCpus" : 0, 7 | "instanceRole" : "$INSTANCE_ROLE", 8 | "instanceTypes" : [ 9 | "trn1.32xlarge" 10 | ], 11 | "launchTemplate" : { 12 | "launchTemplateName" : "$LAUNCH_TEMPLATE_NAME", 13 | "version" : "$Latest" 14 | }, 15 | "subnets" : [ 16 | "$SUBNET" 17 | ], 18 | "type" : "EC2" 19 | }, 20 | "state" : "ENABLED", 21 | "type" : "MANAGED" 22 | } -------------------------------------------------------------------------------- /torch-neuronx/training/aws-batch/llama2/templates/job_def.json: -------------------------------------------------------------------------------- 1 | { 2 | "jobDefinitionName" : "$JOB_DEF_NAME", 3 | "nodeProperties" : { 4 | "mainNode" : 0, 5 | "nodeRangeProperties" : [ 6 | { 7 | "container" : { 8 | "image" : "$ECR_REPO:latest", 9 | "linuxParameters" : { 10 | "devices" : [ 11 | { 12 | "hostPath" : "/dev/infiniband/uverbs0" 13 | }, 14 | { 15 | "hostPath" : "/dev/infiniband/uverbs1" 16 | }, 17 | { 18 | "hostPath" : "/dev/infiniband/uverbs2" 19 | }, 20 | { 21 | "hostPath" : "/dev/infiniband/uverbs3" 22 | }, 23 | { 24 | "hostPath" : "/dev/infiniband/uverbs4" 25 | }, 26 | { 27 | "hostPath" : "/dev/infiniband/uverbs5" 28 | }, 29 | { 30 | "hostPath" : "/dev/infiniband/uverbs6" 31 | }, 32 | { 33 | "hostPath" : "/dev/infiniband/uverbs7" 34 | }, 35 | { 36 | "hostPath": "/dev/neuron0" 37 | }, 38 | { 39 | "hostPath": "/dev/neuron1" 40 | }, 41 | { 42 | "hostPath": "/dev/neuron2" 43 | }, 44 | { 45 | "hostPath": "/dev/neuron3" 46 | }, 47 | { 48 | "hostPath": "/dev/neuron4" 49 | }, 50 | { 51 | "hostPath": "/dev/neuron5" 52 | }, 53 | { 54 | "hostPath": "/dev/neuron6" 55 | }, 56 | { 57 | "hostPath": "/dev/neuron7" 58 | }, 59 | { 60 | "hostPath": "/dev/neuron8" 61 | }, 62 | { 63 | "hostPath": "/dev/neuron9" 64 | }, 65 | { 66 | "hostPath": "/dev/neuron10" 67 | }, 68 | { 69 | "hostPath": "/dev/neuron11" 70 | }, 71 | { 72 | "hostPath": "/dev/neuron12" 73 | }, 74 | { 75 | "hostPath": "/dev/neuron13" 76 | }, 77 | { 78 | "hostPath": "/dev/neuron14" 79 | }, 80 | { 81 | "hostPath": "/dev/neuron15" 82 | } 83 | ] 84 | }, 85 | "memory" : 500000, 86 | "ulimits" : [ 87 | { 88 | "hardLimit" : -1, 89 | "name" : "memlock", 90 | "softLimit" : -1 91 | } 92 | ], 93 | "user" : "root", 94 | "vcpus" : 96, 95 | "instanceType" : "trn1.32xlarge" 96 | }, 97 | "targetNodes" : "0:" 98 | } 99 | ], 100 | "numNodes" : 4 101 | }, 102 | "type" : "multinode" 103 | } -------------------------------------------------------------------------------- /torch-neuronx/training/aws-batch/llama2/templates/job_queue.json: -------------------------------------------------------------------------------- 1 | { 2 | "computeEnvironmentOrder" : [ 3 | { 4 | "computeEnvironment" : "$COMPUTE_ENV_NAME", 5 | "order" : 1 6 | } 7 | ], 8 | "jobQueueName" : "$JOB_QUEUE_NAME", 9 | "priority" : 1, 10 | "state" : "ENABLED" 11 | } -------------------------------------------------------------------------------- /torch-neuronx/training/aws-batch/llama2/templates/launch_template.json: -------------------------------------------------------------------------------- 1 | { 2 | "DryRun": false, 3 | "LaunchTemplateName": "$LAUNCH_TEMPLATE_NAME", 4 | "VersionDescription": "Override Template", 5 | 
"LaunchTemplateData": { 6 | "IamInstanceProfile": { 7 | "Arn": "$INSTANCE_ROLE" 8 | }, 9 | "InstanceType" : "trn1.32xlarge", 10 | "Placement" : { 11 | "GroupName" : "$PLACEMENT_GROUP_NAME" 12 | }, 13 | "BlockDeviceMappings": [ 14 | { 15 | "DeviceName": "/dev/xvda", 16 | "Ebs": { 17 | "VolumeSize": 200, 18 | "DeleteOnTermination": true 19 | } 20 | } 21 | ], 22 | "ImageId": "$ECS_AMI", 23 | "Monitoring": { 24 | "Enabled": true 25 | }, 26 | "DisableApiTermination": false, 27 | "InstanceInitiatedShutdownBehavior": "stop", 28 | "UserData": "$USER_DATA", 29 | "TagSpecifications": [ 30 | { 31 | "ResourceType": "instance", 32 | "Tags": [ 33 | { 34 | "Key": "purpose", 35 | "Value": "batch multinode training" 36 | } 37 | ] 38 | } 39 | ], 40 | "MetadataOptions": { 41 | "HttpTokens": "required", 42 | "HttpPutResponseHopLimit": 5, 43 | "HttpEndpoint": "enabled" 44 | }, 45 | "NetworkInterfaces" : [ 46 | { 47 | "Description" : "NetworkInterfaces Configuration For EFA and Batch", 48 | "DeviceIndex" : 0, 49 | "Groups" : [ 50 | "$SG" 51 | ], 52 | "InterfaceType" : "efa", 53 | "NetworkCardIndex" : 0, 54 | "SubnetId" : "$SUBNET" 55 | }, 56 | { 57 | "Description" : "NetworkInterfaces Configuration For EFA and Batch", 58 | "DeviceIndex" : 1, 59 | "Groups" : [ 60 | "$SG" 61 | ], 62 | "InterfaceType" : "efa", 63 | "NetworkCardIndex" : 1, 64 | "SubnetId" : "$SUBNET" 65 | }, 66 | { 67 | "Description" : "NetworkInterfaces Configuration For EFA and Batch", 68 | "DeviceIndex" : 1, 69 | "Groups" : [ 70 | "$SG" 71 | ], 72 | "InterfaceType" : "efa", 73 | "NetworkCardIndex" : 2, 74 | "SubnetId" : "$SUBNET" 75 | }, 76 | { 77 | "Description" : "NetworkInterfaces Configuration For EFA and Batch", 78 | "DeviceIndex" : 1, 79 | "Groups" : [ 80 | "$SG" 81 | ], 82 | "InterfaceType" : "efa", 83 | "NetworkCardIndex" : 3, 84 | "SubnetId" : "$SUBNET" 85 | }, 86 | { 87 | "Description" : "NetworkInterfaces Configuration For EFA and Batch", 88 | "DeviceIndex" : 1, 89 | "Groups" : [ 90 | "$SG" 91 | ], 92 | "InterfaceType" : "efa", 93 | "NetworkCardIndex" : 4, 94 | "SubnetId" : "$SUBNET" 95 | }, 96 | { 97 | "Description" : "NetworkInterfaces Configuration For EFA and Batch", 98 | "DeviceIndex" : 1, 99 | "Groups" : [ 100 | "$SG" 101 | ], 102 | "InterfaceType" : "efa", 103 | "NetworkCardIndex" : 5, 104 | "SubnetId" : "$SUBNET" 105 | }, 106 | { 107 | "Description" : "NetworkInterfaces Configuration For EFA and Batch", 108 | "DeviceIndex" : 1, 109 | "Groups" : [ 110 | "$SG" 111 | ], 112 | "InterfaceType" : "efa", 113 | "NetworkCardIndex" : 6, 114 | "SubnetId" : "$SUBNET" 115 | }, 116 | { 117 | "Description" : "NetworkInterfaces Configuration For EFA and Batch", 118 | "DeviceIndex" : 1, 119 | "Groups" : [ 120 | "$SG" 121 | ], 122 | "InterfaceType" : "efa", 123 | "NetworkCardIndex" : 7, 124 | "SubnetId" : "$SUBNET" 125 | } 126 | ] 127 | }, 128 | "TagSpecifications": [ 129 | { 130 | "ResourceType": "launch-template", 131 | "Tags": [ 132 | { 133 | "Key": "purpose", 134 | "Value": "batch training" 135 | } 136 | ] 137 | } 138 | ] 139 | } 140 | -------------------------------------------------------------------------------- /torch-neuronx/training/common/hf_utils.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import Optional 3 | 4 | from torch.utils.data import DataLoader, Dataset 5 | import torch_xla.distributed.parallel_loader as xpl 6 | from transformers import Trainer, TrainingArguments 7 | 8 | 9 | @dataclass 10 | class 
TrnTrainingArguments(TrainingArguments): 11 | loader_prefetch_size: Optional[int] = field( 12 | default=8, 13 | metadata={"help": "The max capacity of the queue used by the thread which is reading samples from the loader."}, 14 | ) 15 | device_prefetch_size: Optional[int] = field( 16 | default=4, 17 | metadata={"help": "The max size of the per-device queues, where the worker threads deposit tensors which have already been sent to devices."}, 18 | ) 19 | host_to_device_transfer_threads: Optional[int] = field( 20 | default=1, 21 | metadata={"help": "The number of threads that work in parallel to transfer data from loader queue to device queue."}, 22 | ) 23 | @property 24 | def _no_sync_in_gradient_accumulation(self): 25 | return False 26 | 27 | 28 | class TrnTrainer(Trainer): 29 | def get_train_dataloader(self) -> DataLoader: 30 | train_loader = super().get_train_dataloader() 31 | kwargs = { 32 | "loader_prefetch_size": self.args.loader_prefetch_size, 33 | "device_prefetch_size": self.args.device_prefetch_size, 34 | "host_to_device_transfer_threads": self.args.host_to_device_transfer_threads 35 | } 36 | if isinstance(train_loader, xpl.MpDeviceLoader): 37 | train_loader._parallel_loader_kwargs = kwargs 38 | return train_loader 39 | 40 | def get_eval_dataloader(self, eval_dataset: Optional[Dataset] = None) -> DataLoader: 41 | eval_loader = super().get_eval_dataloader(eval_dataset) 42 | kwargs = { 43 | "loader_prefetch_size": self.args.loader_prefetch_size, 44 | "device_prefetch_size": self.args.device_prefetch_size, 45 | "host_to_device_transfer_threads": self.args.host_to_device_transfer_threads 46 | } 47 | if isinstance(eval_loader, xpl.MpDeviceLoader): 48 | eval_loader._parallel_loader_kwargs = kwargs 49 | return eval_loader 50 | 51 | def get_test_dataloader(self, test_dataset: Dataset) -> DataLoader: 52 | test_loader = super().get_test_dataloader(test_dataset) 53 | kwargs = { 54 | "loader_prefetch_size": self.args.loader_prefetch_size, 55 | "device_prefetch_size": self.args.device_prefetch_size, 56 | "host_to_device_transfer_threads": self.args.host_to_device_transfer_threads 57 | } 58 | if isinstance(test_loader, xpl.MpDeviceLoader): 59 | test_loader._parallel_loader_kwargs = kwargs 60 | return test_loader -------------------------------------------------------------------------------- /torch-neuronx/training/customop_mlp/README.md: -------------------------------------------------------------------------------- 1 | # Torch Neuron CustomOp MLP 2 | 3 | This folder contains examples of Torch custom operators for a multi-layer perceptron (MLP) model. 4 | 5 | - The `pytorch` folder contains a basic PyTorch (non-neuron) CPU-based MLP model with a custom Relu operator and training script. 6 | - The `neuron` folder contains the same model converted to Neuron, with an XLA-based training script for trn1-based instances.
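A typical run of the `neuron` variant (a sketch; the file names come from this folder) first builds the shared library and then launches training on a trn1 instance:

```
cd neuron
python build.py    # compiles relu.cpp/shape.cpp into librelu.so via torch_neuronx custom_op
python train.py    # trains the MLP on MNIST using the custom Relu op on a NeuronCore
```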
-------------------------------------------------------------------------------- /torch-neuronx/training/customop_mlp/neuron/build.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch_neuronx 3 | from torch_neuronx.xla_impl import custom_op 4 | 5 | custom_op.load( 6 | name='relu', 7 | compute_srcs=['relu.cpp'], 8 | shape_srcs=['shape.cpp'], 9 | build_directory=os.getcwd() 10 | ) 11 | -------------------------------------------------------------------------------- /torch-neuronx/training/customop_mlp/neuron/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.nn import functional as F 4 | import my_ops 5 | 6 | # Declare 3-layer MLP for MNIST dataset 7 | class MLP(nn.Module): 8 | def __init__(self, input_size = 28 * 28, output_size = 10, layers = [120, 84]): 9 | super(MLP, self).__init__() 10 | self.fc1 = nn.Linear(input_size, layers[0]) 11 | self.fc2 = nn.Linear(layers[0], layers[1]) 12 | self.fc3 = nn.Linear(layers[1], output_size) 13 | 14 | def forward(self, x): 15 | f1 = self.fc1(x) 16 | r1 = my_ops.Relu.apply(f1) 17 | f2 = self.fc2(r1) 18 | r2 = my_ops.Relu.apply(f2) 19 | f3 = self.fc3(r2) 20 | return torch.log_softmax(f3, dim=1) 21 | -------------------------------------------------------------------------------- /torch-neuronx/training/customop_mlp/neuron/my_ops.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch_neuronx 3 | from torch_neuronx.xla_impl import custom_op 4 | 5 | custom_op.load_library('librelu.so') 6 | 7 | class Relu(torch.autograd.Function): 8 | @staticmethod 9 | def forward(ctx, input): 10 | ctx.save_for_backward(input) 11 | return torch.ops.my_ops.relu_forward(input) 12 | 13 | @staticmethod 14 | def backward(ctx, grad): 15 | input, = ctx.saved_tensors 16 | return torch.ops.my_ops.relu_backward(grad, input), None 17 | 18 | 19 | -------------------------------------------------------------------------------- /torch-neuronx/training/customop_mlp/neuron/relu.cpp: -------------------------------------------------------------------------------- 1 | #include <stdint.h> 2 | #include <stdlib.h> 3 | #include <torch/torch.h> 4 | 5 | torch::Tensor relu_forward(const torch::Tensor& t_in) { 6 | torch::Tensor t_out = torch::zeros(t_in.sizes(), torch::kFloat); 7 | auto t_in_acc = t_in.accessor<float, 2>(); 8 | auto t_out_acc = t_out.accessor<float, 2>(); 9 | auto shape = t_in.sizes(); 10 | for (int i = 0; i < shape[0]; i++) { 11 | for (int j = 0; j < shape[1]; j++) { 12 | t_out_acc[i][j] = t_in_acc[i][j] > 0.0 ? t_in_acc[i][j] : 0.0; 13 | } 14 | } 15 | return t_out; 16 | } 17 | 18 | torch::Tensor relu_backward(const torch::Tensor& t_grad, const torch::Tensor& t_in) { 19 | torch::Tensor t_out = torch::zeros(t_in.sizes(), torch::kFloat); 20 | auto t_in_acc = t_in.accessor<float, 2>(); 21 | auto t_grad_acc = t_grad.accessor<float, 2>(); 22 | auto t_out_acc = t_out.accessor<float, 2>(); 23 | auto shape = t_in.sizes(); 24 | for (int i = 0; i < shape[0]; i++) { 25 | for (int j = 0; j < shape[1]; j++) { 26 | t_out_acc[i][j] = t_in_acc[i][j] > 0.0 ?
t_grad_acc[i][j] : 0.0; 27 | } 28 | } 29 | return t_out; 30 | } 31 | -------------------------------------------------------------------------------- /torch-neuronx/training/customop_mlp/neuron/shape.cpp: -------------------------------------------------------------------------------- 1 | #include <stdint.h> 2 | #include <stdlib.h> 3 | #include <torch/torch.h> 4 | #include "torchneuron/register.h" 5 | 6 | torch::Tensor relu_fwd_shape(torch::Tensor t_in) { 7 | torch::Tensor t_out = torch::zeros(t_in.sizes(), torch::kFloat); 8 | return t_out; 9 | } 10 | 11 | torch::Tensor relu_bwd_shape(torch::Tensor t_grad, torch::Tensor t_in) { 12 | torch::Tensor t_out = torch::zeros(t_in.sizes(), torch::kFloat); 13 | return t_out; 14 | } 15 | 16 | NEURON_LIBRARY(my_ops, m) { 17 | m.def("relu_forward", &relu_fwd_shape, "relu_forward"); 18 | m.def("relu_backward", &relu_bwd_shape, "relu_backward"); 19 | } 20 | -------------------------------------------------------------------------------- /torch-neuronx/training/customop_mlp/neuron/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import torch 4 | from model import MLP 5 | 6 | from torchvision.datasets import mnist 7 | from torch.utils.data import DataLoader 8 | from torchvision.transforms import ToTensor 9 | 10 | # XLA imports 11 | import torch_xla.core.xla_model as xm 12 | 13 | # Global constants 14 | EPOCHS = 4 15 | WARMUP_STEPS = 2 16 | BATCH_SIZE = 32 17 | 18 | # Load MNIST train dataset 19 | train_dataset = mnist.MNIST(root='./MNIST_DATA_train', 20 | train=True, download=True, transform=ToTensor()) 21 | 22 | def main(): 23 | # Prepare data loader 24 | train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE) 25 | 26 | # Fix the random number generator seeds for reproducibility 27 | torch.manual_seed(0) 28 | 29 | # XLA: Specify XLA device (defaults to a NeuronCore on Trn1 instance) 30 | device = 'xla' 31 | 32 | # Move model to device and declare optimizer and loss function 33 | model = MLP().to(device) 34 | optimizer = torch.optim.SGD(model.parameters(), lr=0.01) 35 | loss_fn = torch.nn.NLLLoss() 36 | 37 | # Run the training loop 38 | print('----------Training ---------------') 39 | model.train() 40 | for epoch in range(EPOCHS): 41 | start = time.time() 42 | for idx, (train_x, train_label) in enumerate(train_loader): 43 | optimizer.zero_grad() 44 | train_x = train_x.view(train_x.size(0), -1) 45 | train_x = train_x.to(device) 46 | train_label = train_label.to(device) 47 | output = model(train_x) 48 | loss = loss_fn(output, train_label) 49 | loss.backward() 50 | optimizer.step() 51 | xm.mark_step() # XLA: collect ops and run them in XLA runtime 52 | if idx < WARMUP_STEPS: # skip warmup iterations 53 | start = time.time() 54 | # Compute statistics for the last epoch 55 | interval = idx - WARMUP_STEPS # skip warmup iterations 56 | throughput = interval / (time.time() - start) 57 | print("Train throughput (iter/sec): {}".format(throughput)) 58 | print("Final loss is {:0.4f}".format(loss.detach().to('cpu'))) 59 | 60 | # Save checkpoint for evaluation 61 | os.makedirs("checkpoints", exist_ok=True) 62 | checkpoint = {'state_dict': model.state_dict()} 63 | # XLA: use xm.save instead of torch.save to ensure states are moved back to cpu 64 | # This can prevent "XRT memory handle not found" at end of test.py execution 65 | xm.save(checkpoint,'checkpoints/checkpoint.pt') 66 | 67 | print('----------End Training ---------------') 68 | 69 | if __name__ == '__main__': 70 | main() 71 | 72 |
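Before a full training run, the custom op can be sanity-checked against `torch.relu`. The snippet below is a minimal sketch (not part of the repo); it assumes `build.py` has already produced `librelu.so` and that a NeuronCore is available:

```
# Minimal sketch (assumption): compare the custom Relu with torch.relu on device.
import torch
import torch_xla.core.xla_model as xm
import my_ops  # loads librelu.so built by build.py

device = xm.xla_device()
x = torch.randn(4, 8)
y = my_ops.Relu.apply(x.to(device))
xm.mark_step()  # execute the pending XLA graph
assert torch.allclose(y.cpu(), torch.relu(x)), "custom Relu does not match torch.relu"
print("custom Relu matches torch.relu")
```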
-------------------------------------------------------------------------------- /torch-neuronx/training/customop_mlp/pytorch/build.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from torch.utils import cpp_extension 4 | 5 | cpp_extension.load( 6 | name='librelu', 7 | sources=['relu.cpp'], 8 | is_python_module=False, 9 | build_directory=os.getcwd() 10 | ) 11 | -------------------------------------------------------------------------------- /torch-neuronx/training/customop_mlp/pytorch/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.nn import functional as F 4 | import my_ops 5 | 6 | # Declare 3-layer MLP for MNIST dataset 7 | class MLP(nn.Module): 8 | def __init__(self, input_size = 28 * 28, output_size = 10, layers = [120, 84]): 9 | super(MLP, self).__init__() 10 | self.fc1 = nn.Linear(input_size, layers[0]) 11 | self.fc2 = nn.Linear(layers[0], layers[1]) 12 | self.fc3 = nn.Linear(layers[1], output_size) 13 | 14 | def forward(self, x): 15 | f1 = self.fc1(x) 16 | r1 = my_ops.Relu.apply(f1) 17 | f2 = self.fc2(r1) 18 | r2 = my_ops.Relu.apply(f2) 19 | f3 = self.fc3(r2) 20 | return torch.log_softmax(f3, dim=1) 21 | -------------------------------------------------------------------------------- /torch-neuronx/training/customop_mlp/pytorch/my_ops.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | torch.ops.load_library('librelu.so') 4 | 5 | class Relu(torch.autograd.Function): 6 | @staticmethod 7 | def forward(ctx, input): 8 | ctx.save_for_backward(input) 9 | return torch.ops.my_ops.relu_forward(input) 10 | 11 | @staticmethod 12 | def backward(ctx, grad): 13 | input, = ctx.saved_tensors 14 | return torch.ops.my_ops.relu_backward(grad, input), None 15 | 16 | 17 | -------------------------------------------------------------------------------- /torch-neuronx/training/customop_mlp/pytorch/relu.cpp: -------------------------------------------------------------------------------- 1 | #include <stdint.h> 2 | #include <stdlib.h> 3 | #include <torch/torch.h> 4 | 5 | torch::Tensor relu_forward(const torch::Tensor& t_in) { 6 | torch::Tensor t_out = torch::zeros(t_in.sizes(), torch::kFloat); 7 | auto t_in_acc = t_in.accessor<float, 2>(); 8 | auto t_out_acc = t_out.accessor<float, 2>(); 9 | auto shape = t_in.sizes(); 10 | for (int i = 0; i < shape[0]; i++) { 11 | for (int j = 0; j < shape[1]; j++) { 12 | t_out_acc[i][j] = t_in_acc[i][j] > 0.0 ? t_in_acc[i][j] : 0.0; 13 | } 14 | } 15 | return t_out; 16 | } 17 | 18 | torch::Tensor relu_backward(const torch::Tensor& t_grad, const torch::Tensor& t_in) { 19 | torch::Tensor t_out = torch::zeros(t_in.sizes(), torch::kFloat); 20 | auto t_in_acc = t_in.accessor<float, 2>(); 21 | auto t_grad_acc = t_grad.accessor<float, 2>(); 22 | auto t_out_acc = t_out.accessor<float, 2>(); 23 | auto shape = t_in.sizes(); 24 | for (int i = 0; i < shape[0]; i++) { 25 | for (int j = 0; j < shape[1]; j++) { 26 | t_out_acc[i][j] = t_in_acc[i][j] > 0.0 ?
t_grad_acc[i][j] : 0.0; 27 | } 28 | } 29 | return t_out; 30 | } 31 | 32 | TORCH_LIBRARY(my_ops, m) { 33 | m.def("relu_forward", &relu_forward); 34 | m.def("relu_backward", &relu_backward); 35 | } 36 | -------------------------------------------------------------------------------- /torch-neuronx/training/customop_mlp/pytorch/train_cpu.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import torch 4 | from model import MLP 5 | 6 | from torchvision.datasets import mnist 7 | from torch.utils.data import DataLoader 8 | from torchvision.transforms import ToTensor 9 | 10 | # Global constants 11 | EPOCHS = 4 12 | WARMUP_STEPS = 2 13 | BATCH_SIZE = 32 14 | 15 | # Load MNIST train dataset 16 | train_dataset = mnist.MNIST(root='./MNIST_DATA_train', 17 | train=True, download=True, transform=ToTensor()) 18 | 19 | def main(): 20 | # Prepare data loader 21 | train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE) 22 | 23 | # Fix the random number generator seeds for reproducibility 24 | torch.manual_seed(0) 25 | 26 | # Move model to device and declare optimizer and loss function 27 | device = 'cpu' 28 | model = MLP().to(device) 29 | optimizer = torch.optim.SGD(model.parameters(), lr=0.01) 30 | loss_fn = torch.nn.NLLLoss() 31 | 32 | # Run the training loop 33 | print('----------Training ---------------') 34 | model.train() 35 | for epoch in range(EPOCHS): 36 | start = time.time() 37 | for idx, (train_x, train_label) in enumerate(train_loader): 38 | optimizer.zero_grad() 39 | train_x = train_x.view(train_x.size(0), -1) 40 | train_x = train_x.to(device) 41 | train_label = train_label.to(device) 42 | output = model(train_x) 43 | loss = loss_fn(output, train_label) 44 | loss.backward() 45 | optimizer.step() 46 | if idx < WARMUP_STEPS: # skip warmup iterations 47 | start = time.time() 48 | 49 | # Compute statistics for the last epoch 50 | interval = idx - WARMUP_STEPS # skip warmup iterations 51 | throughput = interval / (time.time() - start) 52 | print("Train throughput (iter/sec): {}".format(throughput)) 53 | print("Final loss is {:0.4f}".format(loss.detach().to('cpu'))) 54 | 55 | # Save checkpoint for evaluation 56 | os.makedirs("checkpoints", exist_ok=True) 57 | checkpoint = {'state_dict': model.state_dict()} 58 | torch.save(checkpoint,'checkpoints/checkpoint.pt') 59 | print('----------End Training ---------------') 60 | 61 | if __name__ == '__main__': 62 | main() 63 | -------------------------------------------------------------------------------- /torch-neuronx/training/dp_bert_hf_pretrain/dp_bert_large_hf_pretrain_hdf5_THIRD-PARTY-LICENSES.txt: -------------------------------------------------------------------------------- 1 | ** PyTorch-XLA MNIST data parallel training script; version 8151971 -- https://github.com/pytorch/xla/blob/master/test/test_train_mp_mnist.py 2 | 3 | Copyright (c) 2018 Google Inc. 4 | All rights reserved. 5 | 6 | 7 | Redistribution and use in source and binary forms, with or without 8 | modification, are permitted provided that the following conditions are met: 9 | 10 | 1. Redistributions of source code must retain the above copyright 11 | notice, this list of conditions and the following disclaimer. 12 | 13 | 2. Redistributions in binary form must reproduce the above copyright 14 | notice, this list of conditions and the following disclaimer in the 15 | documentation and/or other materials provided with the distribution. 16 | 17 | 3. 
Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories 18 | America 19 | and IDIAP Research Institute nor the names of its contributors may be 20 | used to endorse or promote products derived from this software without 21 | specific prior written permission. 22 | 23 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 24 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 26 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 27 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 28 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 29 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 30 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 31 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 32 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 33 | POSSIBILITY OF SUCH DAMAGE. 34 | 35 | ------ 36 | 37 | ** NVidia DeepLearningExamples BERT pretraining script; version 7a4c425 -- https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/LanguageModeling/BERT/run_pretraining.py 38 | 39 | 40 | # Licensed under the Apache License, Version 2.0 (the "License"); 41 | # you may not use this file except in compliance with the License. 42 | # You may obtain a copy of the License at 43 | # 44 | # http://www.apache.org/licenses/LICENSE-2.0 45 | # 46 | # Unless required by applicable law or agreed to in writing, software 47 | # distributed under the License is distributed on an "AS IS" BASIS, 48 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 49 | # See the License for the specific language governing permissions and 50 | # limitations under the License. 51 | * For NVidia DeepLearningExamples BERT pretraining script see also this required 52 | NOTICE: 53 | # coding=utf-8 54 | # Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. 55 | # Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. 56 | team. 
57 | 58 | -------------------------------------------------------------------------------- /torch-neuronx/training/dp_bert_hf_pretrain/requirements.txt: -------------------------------------------------------------------------------- 1 | graphviz 2 | tensorboard==2.14 3 | transformers==4.44.0 4 | evaluate 5 | pillow 6 | pytest 7 | accelerate 8 | datasets==2.19.1 9 | sentencepiece==0.2.0 10 | h5py 11 | requests==2.31.0 12 | huggingface-hub==0.24.5 13 | -------------------------------------------------------------------------------- /torch-neuronx/training/dp_bert_hf_pretrain/run_dp_bert_large_hf_pretrain_bf16_s128.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -o pipefail 3 | 4 | pip3 list | grep -e neuron > run_installed_neuron_pkgs.txt 5 | #apt list | grep neuron >> run_installed_neuron_pkgs.txt 6 | 7 | export NEURON_RT_EXEC_TIMEOUT=600 8 | export NEURON_RT_STOCHASTIC_ROUNDING_SEED=0 9 | export TF_GRPC_DEFAULT_OPTIONS=grpc.keepalive_time_ms=60000,grpc.keepalive_timeout_ms=14400000,grpc.http2.max_pings_without_data=0,grpc.http2.min_ping_interval_without_data_ms=600000 10 | 11 | IMDS_TOKEN=`curl -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600"` 12 | INSTANCEID=`curl -H "X-aws-ec2-metadata-token: $IMDS_TOKEN" -v http://169.254.169.254/latest/meta-data/instance-id` 13 | WORLD_SIZE_JOB=1 14 | RANK_NODE=0 15 | MAX_STEPS=28125 16 | 17 | if [ "$1" == "amp" ]; then 18 | echo "Enable PyTorch Autocast (AMP)" 19 | BATCH_SIZE=16 20 | GRAD_ACCUM_USTEPS=32 21 | ADD_ARGS="--enable_pt_autocast" 22 | elif [ "$1" == "fp32paramscopy" ]; then 23 | echo "Enable BF16 with FP32 copy of weights" 24 | BATCH_SIZE=16 25 | GRAD_ACCUM_USTEPS=32 26 | ADD_ARGS="--optimizer=AdamW_FP32ParamsCopy" 27 | elif [ "$1" == "fp32" ]; then 28 | echo "Enable Full FP32" 29 | BATCH_SIZE=8 30 | GRAD_ACCUM_USTEPS=64 31 | ADD_ARGS="--optimizer=AdamW --enable_fp32" 32 | # XLA_DOWNCAST_BF16 is deprecated in torch-xla 2.4+ 33 | # Switched to using model.to(torch.bfloat16) 34 | else 35 | echo "Enable Full BF16 (model.to(torch.bfloat16)) and FP32 optimizer parameters" 36 | BATCH_SIZE=16 37 | GRAD_ACCUM_USTEPS=32 38 | ADD_ARGS="" 39 | # XLA_DOWNCAST_BF16 is deprecated in torch-xla 2.4+ 40 | # Switched to using model.to(torch.bfloat16) 41 | fi 42 | 43 | if [ -e /opt/aws/neuron/bin/neuron-ls ]; then 44 | NUM_DEVICES=`/opt/aws/neuron/bin/neuron-ls -j | jq '. | length'` 45 | NC_PER_DEVICE=`/opt/aws/neuron/bin/neuron-ls -j | jq '.[0].nc_count'` 46 | echo "Found $NC_PER_DEVICE NeuronCores per device" 47 | 48 | if [ -z "$NUM_DEVICES" ] || [ "$NUM_DEVICES" == "0" ] || [ -z "$NC_PER_DEVICE" ] || [ "$NC_PER_DEVICE" == "null" ]; then 49 | NUM_NEURONCORES=32 50 | echo "Unable to extract device count and nc_count from neuron-ls json output; using default $NUM_NEURONCORES NeuronCores" 51 | else 52 | let NUM_NEURONCORES=$NUM_DEVICES*$NC_PER_DEVICE 53 | echo "Found $NUM_NEURONCORES NeuronCores" 54 | fi 55 | else 56 | NUM_NEURONCORES=32 57 | echo "neuron-ls not installed (aws-neuronx-tools); using default $NUM_NEURONCORES NeuronCores" 58 | fi 59 | DISTRIBUTED_ARGS="--nproc_per_node $NUM_NEURONCORES" 60 | OUTPUT_DIR=output 61 | LOG_FILE=log_ph1_bf16 62 | if [ ! -z "$NEURON_EXTRACT_GRAPHS_ONLY" ]; then 63 | LOG_FILE=${LOG_FILE}_compile 64 | fi 65 | 66 | if [ ! 
-z "$SLURM_NTASKS" ]; then 67 | export FI_EFA_USE_DEVICE_RDMA=1 68 | export FI_PROVIDER=efa 69 | export FI_EFA_FORK_SAFE=1 70 | export BUCKET_CAP_MB=512 71 | export XLA_TRANSFER_SEED_ASYNC=1 72 | WORLD_SIZE_JOB=$SLURM_NTASKS 73 | RANK_NODE=$SLURM_NODEID 74 | MASTER_ADDR=(`scontrol show hostnames $SLURM_JOB_NODELIST`) 75 | MASTER_PORT=2022 76 | GRAD_ACCUM_USTEPS=$(($GRAD_ACCUM_USTEPS/$WORLD_SIZE_JOB)) 77 | DISTRIBUTED_ARGS="--nproc_per_node $NUM_NEURONCORES --nnodes $WORLD_SIZE_JOB --node_rank $RANK_NODE --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 78 | echo $DISTRIBUTED_ARGS 79 | OUTPUT_DIR=output_$SLURM_JOB_ID 80 | LOG_FILE=${LOG_FILE}_${RANK_NODE}_${WORLD_SIZE_JOB} 81 | if [ -z "$NEURON_COMPILE_CACHE_URL" ]; then 82 | CACHE_DIR=$HOME/neuron_cache/bert/`hostname` 83 | export NEURON_CC_FLAGS="--cache_dir=$CACHE_DIR" 84 | fi 85 | export HF_HOME=/tmp/hf_cache/ 86 | mkdir -p $HF_HOME 87 | if [ -e $HOME/.cache/huggingface ]; then 88 | rsync -av $HOME/.cache/huggingface/ $HF_HOME 89 | fi 90 | # HF ver > 4.22: Move cache ahead of time to prevent multiple workers moving at the same time 91 | python -c "import transformers.utils as utils; utils.move_cache()" 92 | fi 93 | 94 | HOST=`hostname` 95 | echo "Hostname: $HOST (instance ID: $INSTANCEID)" 96 | 97 | steps_this_run=$MAX_STEPS 98 | if [ ! -z "$NEURON_EXTRACT_GRAPHS_ONLY" ]; then 99 | steps_this_run=5 100 | fi 101 | 102 | update_test_variables=../../load_test_variables.sh 103 | if [ -e $update_test_variables ]; then 104 | . ./$update_test_variables $@ || echo "Unable to find test env." 105 | fi 106 | mkdir -p $OUTPUT_DIR 107 | if [ -z "$json" ]; then json="$OUTPUT_DIR/results.json" && rm -f $json; fi 108 | 109 | sudo sysctl -w net.ipv4.ip_local_reserved_ports=48620 || exit 1 110 | torchrun $DISTRIBUTED_ARGS dp_bert_large_hf_pretrain_hdf5.py $ADD_ARGS --output_dir $OUTPUT_DIR --steps_this_run $steps_this_run --metrics_file $json --batch_size=$BATCH_SIZE --grad_accum_usteps=$GRAD_ACCUM_USTEPS |& tee $OUTPUT_DIR/$LOG_FILE 111 | 112 | ret_val=${PIPESTATUS[0]} 113 | echo $ret_val 114 | if [ $ret_val -eq 0 ]; then 115 | success=1 116 | else 117 | success=0 118 | fi 119 | 120 | if [ -z "$NEURON_EXTRACT_GRAPHS_ONLY" ]; then 121 | dump_to_s3_update_json_scr=../../dump_to_s3_update_test_json.sh 122 | if [ -e $dump_to_s3_update_json_scr ]; then 123 | $dump_to_s3_update_json_scr $@ --key=inference_success --value=$success || echo "Unable to update test result JSON." 124 | else 125 | echo "WARNING: Script $dump_to_s3_update_json_scr not found. Not updating test result JSON." 
126 | fi 127 | fi 128 | 129 | # copy final checkpoint for ph2 130 | if [ -e $OUTPUT_DIR/ckpt_28125.pt ]; then cp -f $OUTPUT_DIR/ckpt_28125.pt ../; fi 131 | 132 | exit $ret_val 133 | -------------------------------------------------------------------------------- /torch-neuronx/training/dp_bert_hf_pretrain/run_dp_bert_large_hf_pretrain_bf16_s128_lamb.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -o pipefail 3 | 4 | pip3 list | grep -e neuron > run_installed_neuron_pkgs.txt 5 | #apt list | grep neuron >> run_installed_neuron_pkgs.txt 6 | 7 | export NEURON_RT_EXEC_TIMEOUT=600 8 | export NEURON_RT_STOCHASTIC_ROUNDING_SEED=0 9 | export TF_GRPC_DEFAULT_OPTIONS=grpc.keepalive_time_ms=60000,grpc.keepalive_timeout_ms=14400000,grpc.http2.max_pings_without_data=0,grpc.http2.min_ping_interval_without_data_ms=600000 10 | 11 | IMDS_TOKEN=`curl -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600"` 12 | INSTANCEID=`curl -H "X-aws-ec2-metadata-token: $IMDS_TOKEN" -v http://169.254.169.254/latest/meta-data/instance-id` 13 | WORLD_SIZE_JOB=1 14 | RANK_NODE=0 15 | MAX_STEPS=7032 16 | 17 | #keep the GBS=64k to benchmark over GPU 18 | if [ "$1" == "amp" ]; then 19 | echo "Enable PyTorch Autocast (AMP)" 20 | BATCH_SIZE=16 21 | GRAD_ACCUM_USTEPS=128 22 | ADD_ARGS="--enable_pt_autocast" 23 | elif [ "$1" == "fp32" ]; then 24 | echo "Enable Full FP32" 25 | BATCH_SIZE=8 26 | GRAD_ACCUM_USTEPS=256 27 | ADD_ARGS="--optimizer=AdamW --enable_fp32" 28 | # XLA_DOWNCAST_BF16 is deprecated in torch-xla 2.4+ 29 | # Switched to using model.to(torch.bfloat16) 30 | else 31 | echo "Enable Full BF16 (model.to(torch.bfloat16)) and FP32 optimizer parameters" 32 | BATCH_SIZE=16 33 | GRAD_ACCUM_USTEPS=128 34 | ADD_ARGS="" 35 | # XLA_DOWNCAST_BF16 is deprecated in torch-xla 2.4+ 36 | # Switched to using model.to(torch.bfloat16) 37 | fi 38 | 39 | if [ -e /opt/aws/neuron/bin/neuron-ls ]; then 40 | NUM_DEVICES=`/opt/aws/neuron/bin/neuron-ls -j | jq '. | length'` 41 | NC_PER_DEVICE=`/opt/aws/neuron/bin/neuron-ls -j | jq '.[0].nc_count'` 42 | echo "Found $NC_PER_DEVICE NeuronCores per device" 43 | 44 | if [ -z "$NUM_DEVICES" ] || [ "$NUM_DEVICES" == "0" ] || [ -z "$NC_PER_DEVICE" ] || [ "$NC_PER_DEVICE" == "null" ]; then 45 | NUM_NEURONCORES=32 46 | echo "Unable to extract device count and nc_count from neuron-ls json output; using default $NUM_NEURONCORES NeuronCores" 47 | else 48 | let NUM_NEURONCORES=$NUM_DEVICES*$NC_PER_DEVICE 49 | echo "Found $NUM_NEURONCORES NeuronCores" 50 | fi 51 | else 52 | NUM_NEURONCORES=32 53 | echo "neuron-ls not installed (aws-neuronx-tools); using default $NUM_NEURONCORES NeuronCores" 54 | fi 55 | 56 | DISTRIBUTED_ARGS="--nproc_per_node $NUM_NEURONCORES" 57 | OUTPUT_DIR=output 58 | OPT=LAMB 59 | LOG_FILE=log_ph1_bf16 60 | if [ ! -z "$NEURON_EXTRACT_GRAPHS_ONLY" ]; then 61 | LOG_FILE=${LOG_FILE}_compile 62 | fi 63 | 64 | if [ ! 
-z "$SLURM_NTASKS" ]; then 65 | export FI_EFA_USE_DEVICE_RDMA=1 66 | export FI_PROVIDER=efa 67 | export FI_EFA_FORK_SAFE=1 68 | export BUCKET_CAP_MB=512 69 | export XLA_TRANSFER_SEED_ASYNC=1 70 | WORLD_SIZE_JOB=$SLURM_NTASKS 71 | RANK_NODE=$SLURM_NODEID 72 | MASTER_ADDR=(`scontrol show hostnames $SLURM_JOB_NODELIST`) 73 | MASTER_PORT=2022 74 | GRAD_ACCUM_USTEPS=$(($GRAD_ACCUM_USTEPS/$WORLD_SIZE_JOB)) 75 | DISTRIBUTED_ARGS="--nproc_per_node $NUM_NEURONCORES --nnodes $WORLD_SIZE_JOB --node_rank $RANK_NODE --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 76 | echo $DISTRIBUTED_ARGS 77 | OUTPUT_DIR=output_$SLURM_JOB_ID 78 | LOG_FILE=${LOG_FILE}_${RANK_NODE}_${WORLD_SIZE_JOB} 79 | if [ -z "$NEURON_COMPILE_CACHE_URL" ]; then 80 | CACHE_DIR=$HOME/neuron_cache/bert/`hostname` 81 | export NEURON_CC_FLAGS="--cache_dir=$CACHE_DIR" 82 | fi 83 | export HF_HOME=/tmp/hf_cache/ 84 | mkdir -p $HF_HOME 85 | if [ -e $HOME/.cache/huggingface ]; then 86 | rsync -av $HOME/.cache/huggingface/ $HF_HOME 87 | fi 88 | # HF ver > 4.22: Move cache ahead of time to prevent multiple workers moving at the same time 89 | python -c "import transformers.utils as utils; utils.move_cache()" 90 | fi 91 | 92 | HOST=`hostname` 93 | echo "Hostname: $HOST (instance ID: $INSTANCEID)" 94 | 95 | steps_this_run=$MAX_STEPS 96 | if [ ! -z "$NEURON_EXTRACT_GRAPHS_ONLY" ]; then 97 | steps_this_run=5 98 | fi 99 | 100 | update_test_variables=../../load_test_variables.sh 101 | if [ -e $update_test_variables ]; then 102 | . ./$update_test_variables $@ || echo "Unable to find test env." 103 | fi 104 | mkdir -p $OUTPUT_DIR 105 | if [ -z "$json" ]; then json="$OUTPUT_DIR/results.json" && rm -f $json; fi 106 | 107 | sudo sysctl -w net.ipv4.ip_local_reserved_ports=48620 || exit 1 108 | torchrun $DISTRIBUTED_ARGS dp_bert_large_hf_pretrain_hdf5.py $ADD_ARGS --optimizer $OPT --lr 6e-3 --output_dir $OUTPUT_DIR --max_steps $MAX_STEPS --steps_this_run $steps_this_run --metrics_file $json --batch_size=$BATCH_SIZE --grad_accum_usteps=$GRAD_ACCUM_USTEPS |& tee $OUTPUT_DIR/$LOG_FILE 109 | 110 | ret_val=${PIPESTATUS[0]} 111 | echo $ret_val 112 | if [ $ret_val -eq 0 ]; then 113 | success=1 114 | else 115 | success=0 116 | fi 117 | 118 | if [ -z "$NEURON_EXTRACT_GRAPHS_ONLY" ]; then 119 | dump_to_s3_update_json_scr=../../dump_to_s3_update_test_json.sh 120 | if [ -e $dump_to_s3_update_json_scr ]; then 121 | $dump_to_s3_update_json_scr $@ --key=inference_success --value=$success || echo "Unable to update test result JSON." 122 | else 123 | echo "WARNING: Script $dump_to_s3_update_json_scr not found. Not updating test result JSON." 
124 | fi 125 | fi 126 | 127 | # copy final checkpoint for ph2 128 | if [ -e $OUTPUT_DIR/ckpt_$MAX_STEPS.pt ]; then cp -f $OUTPUT_DIR/ckpt_$MAX_STEPS.pt ../; fi 129 | 130 | exit $ret_val 131 | -------------------------------------------------------------------------------- /torch-neuronx/training/hf_language_modeling/gpt2/run_clm.patch: -------------------------------------------------------------------------------- 1 | diff --git a/examples/pytorch/language-modeling/run_clm.py b/examples/pytorch/language-modeling/run_clm.py 2 | index 3cf36ec9d..960b3a169 100755 3 | --- a/examples/pytorch/language-modeling/run_clm.py 4 | +++ b/examples/pytorch/language-modeling/run_clm.py 5 | @@ -53,6 +53,22 @@ from transformers.trainer_utils import get_last_checkpoint 6 | from transformers.utils import check_min_version, send_example_telemetry 7 | from transformers.utils.versions import require_version 8 | 9 | +from importlib.metadata import version 10 | + 11 | +if version("torch") >= "2.0": 12 | + import copy 13 | + import torch_xla.core.xla_model as xm 14 | + def mesh_reduce(tag, data, reduce_fn): 15 | + xm.rendezvous(tag) 16 | + xdatain = copy.deepcopy(data) 17 | + xdatain = xdatain.to("xla") 18 | + xdata = xm.all_gather(xdatain, pin_layout=False) 19 | + cpu_xdata = xdata.detach().to("cpu") 20 | + cpu_xdata_split = torch.split(cpu_xdata, xdatain.shape[0]) 21 | + xldata = [x for x in cpu_xdata_split] 22 | + return reduce_fn(xldata) 23 | + xm.mesh_reduce = mesh_reduce 24 | + 25 | 26 | # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 27 | check_min_version("4.27.0") 28 | -------------------------------------------------------------------------------- /torch-neuronx/training/hf_sentiment_analysis/.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | ### Python template 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | #*.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | db.sqlite3-journal 62 | 63 | # Flask stuff: 64 | instance/ 65 | .webassets-cache 66 | 67 | # Scrapy stuff: 68 | .scrapy 69 | 70 | # Sphinx documentation 71 | docs/_build/ 72 | 73 | # PyBuilder 74 | target/ 75 | 76 | # Jupyter Notebook 77 | .ipynb_checkpoints 78 | 79 | # IPython 80 | profile_default/ 81 | ipython_config.py 82 | 83 | # pyenv 84 | .python-version 85 | 86 | # pipenv 87 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
88 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 89 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 90 | # install all needed dependencies. 91 | #Pipfile.lock 92 | 93 | # celery beat schedule file 94 | celerybeat-schedule 95 | 96 | # SageMath parsed files 97 | *.sage.py 98 | 99 | # Environments 100 | .env 101 | .venv 102 | env/ 103 | venv/ 104 | ENV/ 105 | env.bak/ 106 | venv.bak/ 107 | 108 | # Spyder project settings 109 | .spyderproject 110 | .spyproject 111 | 112 | # Rope project settings 113 | .ropeproject 114 | 115 | # mkdocs documentation 116 | /site 117 | 118 | # mypy 119 | .mypy_cache/ 120 | .dmypy.json 121 | dmypy.json 122 | 123 | # Pyre type checker 124 | .pyre/ 125 | 126 | .idea/ 127 | .vscode/ 128 | deploy/ 129 | test/ 130 | **/.DS_Store 131 | cdk.out/ 132 | 133 | */**/models 134 | *.pem -------------------------------------------------------------------------------- /torch-neuronx/training/hf_sentiment_analysis/README.md: -------------------------------------------------------------------------------- 1 | # Hugging Face BERT Sentiment Analysis - AWS Trainium 2 | 3 | ## Introduction 4 | 5 | In this example, we will go through the steps required to easily adapt your PyTorch code for training a Machine Learning 6 | (ML) model by using [Hugging Face](https://huggingface.co/blog/the-partnership-amazon-sagemaker-and-hugging-face) with BERT as the 7 | model type, on an Amazon EC2 instance powered by the AWS Trainium chip. 8 | 9 | In this repository, we are sharing code examples for: 10 | 1. Training a BERT ML model by using PyTorch and Hugging Face 11 | 1. Code: [single Neuron Core](code/01-trainium-single-core/train.py) 12 | 2. Notebook: [notebook single Neuron Core](./01-hf-single-neuron.ipynb) 13 | 2. Distributed training of a BERT ML model by using PyTorch and Hugging Face 14 | 1. Code: [distributed training on Neuron Cores](code/02-trainium-distributed-training/train.py) 15 | 2. Notebook: [notebook distributed training on Neuron Cores](./02-hf-distributed-training.ipynb) 16 | 17 | ## Infrastructure Setup for AWS Trainium 18 | 19 | ### Prerequisites 20 | 21 | * Instance Image: [Deep Learning AMI Neuron PyTorch 1.11](https://aws.amazon.com/releasenotes/aws-deep-learning-ami-neuron-pytorch-1-11-amazon-linux-2/) 22 | * Instance Type: trn1.32xlarge 23 | * Git installed on the EC2 instance 24 | 25 | ``` 26 | git --version 27 | ``` 28 | 29 | ### Activate pre-built PyTorch environment 30 | 31 | ``` 32 | source /opt/aws_neuron_venv_pytorch/bin/activate 33 | ``` 34 | 35 | ### Check AWS Neuron SDK installation 36 | 37 | ``` 38 | neuron-ls 39 | 40 | neuron-top 41 | ``` 42 | 43 | ## ML Training on a single Neuron Core 44 | 45 | Activate the [pre-built PyTorch environment](#activate-pre-built-pytorch-environment) 46 | 47 | Test the code execution by using the provided [notebook](./01-hf-single-neuron.ipynb) 48 | 49 | ### Command line execution example 50 | 51 | ``` 52 | cd code/01-trainium-single-core 53 | 54 | python3 train.py 55 | ``` 56 | 57 | ## Distributed Training on all available Neuron Cores 58 | 59 | Activate the [pre-built PyTorch environment](#activate-pre-built-pytorch-environment) 60 | 61 | Test the code execution by using the provided [notebook](./02-hf-distributed-training.ipynb) 62 | 63 | ### Command line execution example 64 | 65 | ``` 66 | cd code/02-trainium-distributed-training 67 | 68 | export TOKENIZERS_PARALLELISM=false 69 | 70 | torchrun --nproc_per_node=32 train.py 71 | ``` 72 | 73 | ## Errors 74 | 75 | 1. Flush Neuron Cores 76 | 77 | ``` 78 | sudo rmmod neuron; sudo modprobe neuron 79 | ```
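Before looking at the code, it is worth seeing how the per-worker settings combine in the distributed run. The arithmetic below is illustrative only, using values that appear in the training scripts that follow (per-worker batch size 8, base learning rate 1.45e-4, 32 workers from `torchrun --nproc_per_node=32`):

```python
# Illustrative arithmetic for the distributed run (values from the scripts below).
per_worker_batch = 8       # batch_size in train.py
workers = 32               # torchrun --nproc_per_node=32 on a trn1.32xlarge
base_lr = 1.45e-4          # learning rate used by the single-core script

global_batch = per_worker_batch * workers  # 256 sequences per optimizer step
scaled_lr = base_lr * workers              # linear LR scaling, as in the distributed script

print(global_batch, scaled_lr)             # 256 0.00464
```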
-------------------------------------------------------------------------------- /torch-neuronx/training/hf_sentiment_analysis/code/01-trainium-single-core/train.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from datasets import Dataset, DatasetDict 3 | import logging 4 | import os 5 | import pandas as pd 6 | from time import gmtime, strftime 7 | from tqdm.auto import tqdm 8 | import torch 9 | import torch_xla.core.xla_model as xm 10 | import torch_xla.runtime as xr 11 | from torch.optim import AdamW 12 | from torch.utils.data import DataLoader 13 | from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_scheduler 14 | 15 | logging.basicConfig(level=logging.INFO) 16 | logger = logging.getLogger(__name__) 17 | 18 | model_name = "bert-base-cased" 19 | ## define xla as device for using AWS Trainium Neuron Cores 20 | device = "xla" 21 | 22 | batch_size = 8 23 | num_epochs = 6 24 | 25 | logger.info("Device: {}".format(device)) 26 | 27 | ## tokenize_and_encode 28 | # params: 29 | # data: DatasetDict 30 | # This method returns a dictionary of input_ids, token_type_ids, attention_mask 31 | def tokenize_and_encode(data): 32 | results = tokenizer(data["text"], padding="max_length", truncation=True) 33 | return results 34 | 35 | if __name__ == '__main__': 36 | path = os.path.abspath("data") 37 | csv_path = path + "/train.csv" 38 | 39 | train = pd.read_csv( 40 | csv_path, 41 | sep=',', 42 | quotechar='"', 43 | quoting=csv.QUOTE_ALL, 44 | escapechar='\\', 45 | encoding='utf-8' 46 | ) 47 | 48 | train_dataset = Dataset.from_dict(train) 49 | 50 | hg_dataset = DatasetDict({"train": train_dataset}) 51 | 52 | ## Loading Hugging Face AutoTokenizer for the defined model 53 | tokenizer = AutoTokenizer.from_pretrained(model_name) 54 | 55 | ds_encoded = hg_dataset.map(tokenize_and_encode, batched=True, remove_columns=["text"]) 56 | 57 | ds_encoded.set_format("torch") 58 | 59 | ## Creating a DataLoader object for iterating over it during the training epochs 60 | train_dl = DataLoader(ds_encoded["train"], shuffle=True, batch_size=batch_size) 61 | 62 | ## Loading Hugging Face pre-trained model for sequence classification for the defined model 63 | model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3) 64 | model.to(device) 65 | 66 | current_timestamp = strftime("%Y-%m-%d-%H-%M", gmtime()) 67 | 68 | optimizer = AdamW(model.parameters(), lr=1.45e-4) 69 | 70 | num_training_steps = num_epochs * len(train_dl) 71 | progress_bar = tqdm(range(num_training_steps)) 72 | lr_scheduler = get_scheduler( 73 | name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps 74 | ) 75 | 76 | logger.info("Start training: {}".format(strftime("%Y-%m-%d %H:%M:%S", gmtime()))) 77 | 78 | ## Start model training and defining the training loop 79 | model.train() 80 | for epoch in range(num_epochs): 81 | for batch in train_dl: 82 | batch = {k: v.to(device) for k, v in batch.items()} 83 | outputs = model(**batch) 84 | loss = outputs.loss 85 | loss.backward() 86 | optimizer.step() 87 | lr_scheduler.step() 88 | ## xm.mark_step executes the current graph, updates the model params, and notifies the Neuron Core that the step has ended 89 | xm.mark_step() 90 | optimizer.zero_grad() 91 | progress_bar.update(1) 92 | 93 | logger.info("Epoch {}, rank {}, Loss {:0.4f}".format(epoch, xr.global_ordinal(), loss.detach().to("cpu"))) 94 | 95 | logger.info("End training: {}".format(strftime("%Y-%m-%d %H:%M:%S", gmtime()))) 96 | 97 | ## Save with xm.save to be sure only one copy of the model is saved 98 | os.makedirs("./../../models/checkpoints/{}".format(current_timestamp), exist_ok=True) 99 | checkpoint = {"state_dict": model.state_dict()} 100 | xm.save(checkpoint, "./../../models/checkpoints/{}/checkpoint.pt".format(current_timestamp)) 101 |
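The checkpoint written above contains only a `state_dict`, so it can be reloaded outside the XLA device for evaluation or export. A minimal sketch (the timestamped directory is whatever the training run created; `num_labels=3` matches the script):

```python
import torch
from transformers import AutoModelForSequenceClassification

# Rebuild the same architecture, then restore the trained weights on CPU.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=3)
checkpoint = torch.load("models/checkpoints/<timestamp>/checkpoint.pt", map_location="cpu")
model.load_state_dict(checkpoint["state_dict"])
model.eval()
```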
logger.info("End training: {}".format(strftime("%Y-%m-%d %H:%M:%S", gmtime()))) 96 | 97 | ## Using XLA for saving model after training for being sure only one copy of the model is saved 98 | os.makedirs("./../../models/checkpoints/{}".format(current_timestamp), exist_ok=True) 99 | checkpoint = {"state_dict": model.state_dict()} 100 | xm.save(checkpoint, "./../../models/checkpoints/{}/checkpoint.pt".format(current_timestamp)) 101 | -------------------------------------------------------------------------------- /torch-neuronx/training/hf_sentiment_analysis/code/02-trainium-distributed-training/train.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from datasets import Dataset, DatasetDict 3 | import logging 4 | import os 5 | import pandas as pd 6 | from time import gmtime, strftime 7 | from tqdm.auto import tqdm 8 | import torch 9 | import torch_xla.core.xla_model as xm 10 | import torch_xla.distributed.parallel_loader as pl 11 | import torch_xla.distributed.xla_backend 12 | import torch_xla.runtime as xr 13 | from torch.optim import AdamW 14 | from torch.utils.data import DataLoader 15 | from torch.utils.data.distributed import DistributedSampler 16 | from transformers import AutoTokenizer, AutoModelForSequenceClassification 17 | 18 | logging.basicConfig(level=logging.INFO) 19 | logger = logging.getLogger(__name__) 20 | 21 | torch.manual_seed(0) 22 | 23 | model_name = "bert-base-cased" 24 | ## define xla as device for using AWS Trainium Neuron Cores 25 | device = "xla" 26 | 27 | torch.distributed.init_process_group(device) 28 | 29 | # Get the global number of workes. 30 | world_size = xr.world_size() 31 | logger.info("Workers: {}".format(world_size)) 32 | 33 | batch_size = 8 34 | num_epochs = 6 35 | 36 | logger.info("Device: {}".format(device)) 37 | 38 | ## tokenize_and_encode 39 | # params: 40 | # data: DatasetDict 41 | # This method returns a dictionary of input_ids, token_type_ids, attention_mask 42 | def tokenize_and_encode(data): 43 | results = tokenizer(data["text"], padding="max_length", truncation=True) 44 | return results 45 | 46 | if __name__ == '__main__': 47 | path = os.path.abspath("data") 48 | csv_path = path + "/train.csv" 49 | 50 | train = pd.read_csv( 51 | csv_path, 52 | sep=',', 53 | quotechar='"', 54 | quoting=csv.QUOTE_ALL, 55 | escapechar='\\', 56 | encoding='utf-8' 57 | ) 58 | 59 | train_dataset = Dataset.from_dict(train) 60 | 61 | hg_dataset = DatasetDict({"train": train_dataset}) 62 | 63 | ## Loading Hugging Face AutoTokenizer for the defined model 64 | tokenizer = AutoTokenizer.from_pretrained(model_name, force_download=True) 65 | 66 | ds_encoded = hg_dataset.map(tokenize_and_encode, batched=True, remove_columns=["text"]) 67 | 68 | ds_encoded.set_format("torch") 69 | 70 | ## Create a subsed of data sampler, for parallelizing the training across multiple cores 71 | if world_size > 1: 72 | train_sampler = DistributedSampler( 73 | ds_encoded["train"], 74 | num_replicas=world_size, 75 | rank=xr.global_ordinal(), 76 | shuffle=True, 77 | ) 78 | 79 | ## Creating a DataLoader object for iterating over it during the training epochs 80 | train_dl = DataLoader( 81 | ds_encoded["train"], 82 | batch_size=batch_size, 83 | sampler=train_sampler, 84 | shuffle=False if train_sampler else True) 85 | 86 | ## Loading a subset of the data in the different Neuron Cores provided as input 87 | train_device_loader = pl.MpDeviceLoader(train_dl, device) 88 | 89 | ## Loading Hugging Face pre-trained model for sequence classification for 
90 | model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3, force_download=True).to(device) 91 | 92 | current_timestamp = strftime("%Y-%m-%d-%H-%M", gmtime()) 93 | 94 | optimizer = AdamW(model.parameters(), lr=1.45e-4 * world_size) 95 | 96 | num_training_steps = num_epochs * len(train_dl) 97 | progress_bar = tqdm(range(num_training_steps)) 98 | 99 | logger.info("Start training: {}".format(strftime("%Y-%m-%d %H:%M:%S", gmtime()))) 100 | 101 | ## Start model training and defining the training loop 102 | model.train() 103 | for epoch in range(num_epochs): 104 | for batch in train_device_loader: 105 | batch = {k: v.to(device) for k, v in batch.items()} 106 | outputs = model(**batch) 107 | optimizer.zero_grad() 108 | loss = outputs.loss 109 | loss.backward() 110 | ## xm.optimizer_step performs an allreduce of the gradients computed on the different cores before applying the update 111 | xm.optimizer_step(optimizer) 112 | progress_bar.update(1) 113 | 114 | logger.info("Epoch {}, rank {}, Loss {:0.4f}".format(epoch, xr.global_ordinal(), loss.detach().to("cpu"))) 115 | 116 | logger.info("End training: {}".format(strftime("%Y-%m-%d %H:%M:%S", gmtime()))) 117 | 118 | ## Save with xm.save to be sure only one copy of the model is saved 119 | os.makedirs("./../../models/checkpoints/{}".format(current_timestamp), exist_ok=True) 120 | checkpoint = {"state_dict": model.state_dict()} 121 | xm.save(checkpoint, "./../../models/checkpoints/{}/checkpoint.pt".format(current_timestamp)) 122 | -------------------------------------------------------------------------------- /torch-neuronx/training/hf_text_classification/README.md: -------------------------------------------------------------------------------- 1 | # Hugging Face Text Classification 2 | 3 | This folder contains various examples of Hugging Face models that can be trained with AWS Trainium for a Text Classification task. Each Jupyter notebook contains a specific example of training a model using the Hugging Face Trainer API and uses a slightly modified script called [run_glue.py](run_glue.py) to fine-tune the pretrained model. 4 | 5 | The following models are currently supported and tested with AWS Trainium: 6 | - [BERT base cased](BertBaseCased.ipynb) 7 | - [BERT base uncased](BertBaseUncased.ipynb) 8 | - [BERT large cased](BertLargeCased.ipynb) 9 | - [BERT large uncased](BertLargeUncased.ipynb) 10 | - [RoBERTa base](RobertaBase.ipynb) 11 | - [RoBERTa large](RobertaLarge.ipynb) 12 | - [XLM RoBERTa base](XlmRobertaBase.ipynb) 13 | - [ALBERT base](AlbertBase.ipynb) 14 | - [DistilBERT base uncased](DistilbertBaseUncased.ipynb) 15 |
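If you have not used run_glue.py before, the core of what these notebooks do reduces to the standard Hugging Face Trainer loop. A minimal sketch (the SST-2 dataset, model choice, and hyperparameters here are placeholders, not the notebooks' exact settings):

```python
from datasets import load_dataset
from transformers import (AutoModelForSequenceClassification, AutoTokenizer,
                          Trainer, TrainingArguments)

raw = load_dataset("glue", "sst2")
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def tokenize(batch):
    # SST-2 has a single "sentence" field; pairwise tasks pass two fields.
    return tokenizer(batch["sentence"], truncation=True, padding="max_length", max_length=128)

encoded = raw.map(tokenize, batched=True)
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)

args = TrainingArguments(output_dir="out", per_device_train_batch_size=8, num_train_epochs=1)
trainer = Trainer(model=model, args=args,
                  train_dataset=encoded["train"], eval_dataset=encoded["validation"])
trainer.train()
```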
4 | -------------------------------------------------------------------------------- /torch-neuronx/training/llama2/get_dataset.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | from transformers import AutoTokenizer 3 | from itertools import chain 4 | import os 5 | 6 | dataset_name = "wikicorpus" 7 | dataset_config_name = "raw_en" 8 | save_path = "~/examples_datasets/wikicorpus_llama2_7B_tokenized_4k" 9 | tokenizer_path = os.getcwd() 10 | 11 | save_path = os.path.expanduser(save_path) 12 | tokenizer_path = os.path.expanduser(tokenizer_path) 13 | if not os.path.exists(save_path): 14 | os.makedirs(save_path) 15 | 16 | block_size = 4096 17 | 18 | raw_datasets = load_dataset(dataset_name, dataset_config_name) 19 | 20 | tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) 21 | 22 | column_names = raw_datasets["train"].column_names 23 | text_column_name = "text" if "text" in column_names else column_names[0] 24 | 25 | def tokenize_function(examples): 26 | return tokenizer(examples[text_column_name]) 27 | 28 | tokenized_datasets = raw_datasets.map( 29 | tokenize_function, 30 | batched=True, 31 | remove_columns=column_names, 32 | load_from_cache_file=True, 33 | desc="Running tokenizer on dataset", 34 | ) 35 | 36 | if block_size > tokenizer.model_max_length: 37 | print("block_size > tokenizer.model_max_length") 38 | block_size = min(block_size, tokenizer.model_max_length) 39 | 40 | # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size. 41 | def group_texts(examples): 42 | # Concatenate all texts. 43 | concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()} 44 | total_length = len(concatenated_examples[list(examples.keys())[0]]) 45 | # We drop the small remainder, and if the total_length < block_size we exclude this batch and return an empty dict. 46 | # We could add padding if the model supported it instead of this drop; you can customize this part to your needs. 47 | total_length = (total_length // block_size) * block_size 48 | # Split by chunks of max_len.
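# (Illustrative note, not part of the original script: with block_size = 4096,
#  a concatenated batch of 10,000 tokens is truncated to 8,192 tokens and split
#  into two 4,096-token chunks. The labels built below are a plain copy of
#  input_ids; causal-LM models shift them internally when computing the loss.)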
49 | result = { 50 | k: [t[i : i + block_size] for i in range(0, total_length, block_size)] 51 | for k, t in concatenated_examples.items() 52 | } 53 | result["labels"] = result["input_ids"].copy() 54 | return result 55 | 56 | lm_datasets = tokenized_datasets.map( 57 | group_texts, 58 | batched=True, 59 | load_from_cache_file=True, 60 | desc=f"Grouping texts in chunks of {block_size}", 61 | ) 62 | 63 | train_dataset = lm_datasets["train"] 64 | print(len(train_dataset)) 65 | 66 | train_dataset.save_to_disk(save_path) 67 | 68 | -------------------------------------------------------------------------------- /torch-neuronx/training/llama2/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers==4.31.0 2 | regex 3 | tensorboard 4 | datasets 5 | sentencepiece 6 | -------------------------------------------------------------------------------- /torch-neuronx/training/mnist_mlp/eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import torch 4 | from model import MLP 5 | 6 | from torchvision.datasets import mnist 7 | from torch.utils.data import DataLoader 8 | from torchvision.transforms import ToTensor 9 | 10 | # XLA imports 11 | import torch_xla.core.xla_model as xm 12 | 13 | # Load MNIST test dataset 14 | test_dataset = mnist.MNIST(root='./MNIST_DATA_test', \ 15 | train=False, download=True, transform=ToTensor()) 16 | 17 | def main(): 18 | # Fix the random number generator seeds for reproducibility 19 | torch.manual_seed(0) 20 | 21 | # XLA: Specify XLA device (defaults to a NeuronCore on Trn1 instance) 22 | device = xm.xla_device() 23 | # Move model to device 24 | model = MLP().to(device) 25 | 26 | # Load check point 27 | checkpoint = torch.load('checkpoints/checkpoint.pt', map_location='cpu') 28 | model.load_state_dict(checkpoint['state_dict']) 29 | 30 | # Prepare data loader 31 | test_loader = DataLoader(test_dataset, batch_size=32) 32 | 33 | # Run the evaluation loop 34 | print('----------Evaluating---------------') 35 | match_count = 0 36 | model.eval() 37 | start = time.time() 38 | for idx, (test_x, test_label) in enumerate(test_loader): 39 | test_x = test_x.view(test_x.size(0), -1) 40 | test_x = test_x.to(device) 41 | test_pred = model(test_x) 42 | pred_label = torch.argmax(test_pred, dim=1) 43 | match_count += sum(pred_label == test_label.to(device)) 44 | xm.mark_step() # XLA: collect ops and run them in XLA runtime 45 | if idx < 2: # skip warmup iterations 46 | start = time.time() 47 | 48 | # Compute statistics 49 | interval = idx - 2 # skip warmup iterations 50 | throughput = interval / (time.time() - start) 51 | print("Test throughput (iter/sec): {}".format(throughput)) 52 | 53 | accuracy = match_count / (idx * 32) 54 | print("Accuracy: {}".format(accuracy)) 55 | assert(accuracy > 0.92) 56 | print('----------Done Evaluating---------------') 57 | 58 | if __name__ == '__main__': 59 | main() 60 | -------------------------------------------------------------------------------- /torch-neuronx/training/mnist_mlp/eval_using_trace.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import torch 4 | from model import MLP 5 | 6 | from torchvision.datasets import mnist 7 | from torch.utils.data import DataLoader 8 | from torchvision.transforms import ToTensor 9 | 10 | # Load MNIST test dataset 11 | test_dataset = mnist.MNIST(root='./MNIST_DATA_test', \ 12 | train=False, download=True, 
transform=ToTensor()) 13 | 14 | def main(): 15 | # Fix the random number generator seeds for reproducibility 16 | torch.manual_seed(0) 17 | 18 | # Use cpu device for trace API 19 | device = "cpu" 20 | # Move model to device 21 | model = MLP().to(device) 22 | 23 | # Load check point 24 | checkpoint = torch.load('checkpoints/checkpoint.pt', map_location='cpu') 25 | model.load_state_dict(checkpoint['state_dict']) 26 | 27 | # Prepare data loader 28 | test_loader = DataLoader(test_dataset, batch_size=32, drop_last=True) 29 | 30 | # Run the evaluation loop 31 | print('----------Evaluating---------------') 32 | match_count = 0 33 | model.eval() 34 | start = time.time() 35 | for idx, (test_x, test_label) in enumerate(test_loader): 36 | test_x = test_x.view(test_x.size(0), -1) 37 | test_x = test_x.to(device) 38 | if idx == 0: 39 | import torch_neuronx 40 | model = torch_neuronx.trace(model, test_x) 41 | test_pred = model(test_x) 42 | pred_label = torch.argmax(test_pred, dim=1) 43 | match_count += sum(pred_label == test_label.to(device)) 44 | if idx < 2: # skip warmup iterations 45 | start = time.time() 46 | 47 | # Compute statistics 48 | interval = idx - 2 # skip warmup iterations 49 | throughput = interval / (time.time() - start) 50 | print("Test throughput (iter/sec): {}".format(throughput)) 51 | 52 | accuracy = match_count / (idx * 32) 53 | print("Accuracy: {}".format(accuracy)) 54 | assert(accuracy > 0.92) 55 | print('----------Done Evaluating---------------') 56 | 57 | if __name__ == '__main__': 58 | main() 59 | -------------------------------------------------------------------------------- /torch-neuronx/training/mnist_mlp/model.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | 4 | # Declare 3-layer MLP for MNIST dataset 5 | class MLP(nn.Module): 6 | def __init__(self, input_size = 28 * 28, output_size = 10, layers = [120, 84]): 7 | super(MLP, self).__init__() 8 | self.fc1 = nn.Linear(input_size, layers[0]) 9 | self.fc2 = nn.Linear(layers[0], layers[1]) 10 | self.fc3 = nn.Linear(layers[1], output_size) 11 | 12 | def forward(self, x): 13 | x = F.relu(self.fc1(x)) 14 | x = F.relu(self.fc2(x)) 15 | x = self.fc3(x) 16 | return F.log_softmax(x, dim=1) 17 | -------------------------------------------------------------------------------- /torch-neuronx/training/mnist_mlp/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import torch 4 | from model import MLP 5 | 6 | from torchvision.datasets import mnist 7 | from torch.utils.data import DataLoader 8 | from torchvision.transforms import ToTensor 9 | 10 | # XLA imports 11 | import torch_xla.core.xla_model as xm 12 | 13 | # Global constants 14 | EPOCHS = 4 15 | WARMUP_STEPS = 2 16 | BATCH_SIZE = 32 17 | 18 | # Load MNIST train dataset 19 | train_dataset = mnist.MNIST(root='./MNIST_DATA_train', 20 | train=True, download=True, transform=ToTensor()) 21 | 22 | def main(): 23 | # Prepare data loader 24 | train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE) 25 | 26 | # Fix the random number generator seeds for reproducibility 27 | torch.manual_seed(0) 28 | 29 | # XLA: Specify XLA device (defaults to a NeuronCore on Trn1 instance) 30 | device = 'xla' 31 | 32 | # Move model to device and declare optimizer and loss function 33 | model = MLP().to(device) 34 | optimizer = torch.optim.SGD(model.parameters(), lr=0.01) 35 | loss_fn = torch.nn.NLLLoss() 36 | 37 | # Run the training 
loop 38 | print('----------Training ---------------') 39 | model.train() 40 | for epoch in range(EPOCHS): 41 | start = time.time() 42 | for idx, (train_x, train_label) in enumerate(train_loader): 43 | optimizer.zero_grad() 44 | train_x = train_x.view(train_x.size(0), -1) 45 | train_x = train_x.to(device) 46 | train_label = train_label.to(device) 47 | output = model(train_x) 48 | loss = loss_fn(output, train_label) 49 | loss.backward() 50 | optimizer.step() 51 | xm.mark_step() # XLA: collect ops and run them in XLA runtime 52 | if idx < WARMUP_STEPS: # skip warmup iterations 53 | start = time.time() 54 | 55 | # Compute statistics for the last epoch 56 | interval = idx - WARMUP_STEPS # skip warmup iterations 57 | throughput = interval / (time.time() - start) 58 | print("Train throughput (iter/sec): {}".format(throughput)) 59 | print("Final loss is {:0.4f}".format(loss.detach().to('cpu'))) 60 | 61 | # Save checkpoint for evaluation 62 | os.makedirs("checkpoints", exist_ok=True) 63 | checkpoint = {'state_dict': model.state_dict()} 64 | # XLA: use xm.save instead of torch.save to ensure states are moved back to cpu 65 | # This can prevent "XRT memory handle not found" at end of eval.py execution 66 | xm.save(checkpoint,'checkpoints/checkpoint.pt') 67 | 68 | print('----------End Training ---------------') 69 | 70 | if __name__ == '__main__': 71 | main() 72 | 73 | -------------------------------------------------------------------------------- /torch-neuronx/training/mnist_mlp/train_cpu.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import torch 4 | from model import MLP 5 | 6 | from torchvision.datasets import mnist 7 | from torch.utils.data import DataLoader 8 | from torchvision.transforms import ToTensor 9 | 10 | # Global constants 11 | EPOCHS = 4 12 | WARMUP_STEPS = 2 13 | BATCH_SIZE = 32 14 | 15 | # Load MNIST train dataset 16 | train_dataset = mnist.MNIST(root='./MNIST_DATA_train', 17 | train=True, download=True, transform=ToTensor()) 18 | 19 | def main(): 20 | # Prepare data loader 21 | train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE) 22 | 23 | # Fix the random number generator seeds for reproducibility 24 | torch.manual_seed(0) 25 | 26 | # Move model to device and declare optimizer and loss function 27 | device = 'cpu' 28 | model = MLP().to(device) 29 | optimizer = torch.optim.SGD(model.parameters(), lr=0.01) 30 | loss_fn = torch.nn.NLLLoss() 31 | 32 | # Run the training loop 33 | print('----------Training ---------------') 34 | model.train() 35 | for epoch in range(EPOCHS): 36 | start = time.time() 37 | for idx, (train_x, train_label) in enumerate(train_loader): 38 | optimizer.zero_grad() 39 | train_x = train_x.view(train_x.size(0), -1) 40 | train_x = train_x.to(device) 41 | train_label = train_label.to(device) 42 | output = model(train_x) 43 | loss = loss_fn(output, train_label) 44 | loss.backward() 45 | optimizer.step() 46 | if idx < WARMUP_STEPS: # skip warmup iterations 47 | start = time.time() 48 | 49 | # Compute statistics for the last epoch 50 | interval = idx - WARMUP_STEPS # skip warmup iterations 51 | throughput = interval / (time.time() - start) 52 | print("Train throughput (iter/sec): {}".format(throughput)) 53 | print("Final loss is {:0.4f}".format(loss.detach().to('cpu'))) 54 | 55 | # Save checkpoint for evaluation 56 | os.makedirs("checkpoints", exist_ok=True) 57 | checkpoint = {'state_dict': model.state_dict()} 58 | torch.save(checkpoint,'checkpoints/checkpoint.pt') 59 |
print('----------End Training ---------------') 60 | 61 | if __name__ == '__main__': 62 | main() 63 | -------------------------------------------------------------------------------- /torch-neuronx/training/mnist_mlp/train_torchrun.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import torch 4 | from model import MLP 5 | 6 | from torchvision.datasets import mnist 7 | from torch.utils.data import DataLoader 8 | from torchvision.transforms import ToTensor 9 | 10 | # XLA imports 11 | import torch_xla.core.xla_model as xm 12 | import torch_xla.runtime as xr 13 | 14 | # XLA imports for parallel loader and multi-processing 15 | import torch_xla.distributed.parallel_loader as pl 16 | from torch.utils.data.distributed import DistributedSampler 17 | 18 | # Initialize XLA process group for torchrun 19 | import torch_xla.distributed.xla_backend 20 | torch.distributed.init_process_group('xla') 21 | 22 | # Global constants 23 | EPOCHS = 4 24 | WARMUP_STEPS = 2 25 | BATCH_SIZE = 32 26 | 27 | # Load MNIST train dataset 28 | if not xm.is_master_ordinal(): xm.rendezvous('dataset_download') 29 | train_dataset = mnist.MNIST(root='/tmp/MNIST_DATA_train', 30 | train=True, download=True, transform=ToTensor()) 31 | if xm.is_master_ordinal(): xm.rendezvous('dataset_download') 32 | 33 | def main(): 34 | # XLA MP: get world size 35 | world_size = xr.world_size() 36 | # multi-processing: ensure each worker has same initial weights 37 | torch.manual_seed(0) 38 | 39 | # Move model to device and declare optimizer and loss function 40 | device = 'xla' 41 | model = MLP().to(device) 42 | # For multiprocessing, scale up learning rate 43 | optimizer = torch.optim.SGD(model.parameters(), lr=0.01 * world_size) 44 | loss_fn = torch.nn.NLLLoss() 45 | 46 | # Prepare data loader 47 | train_sampler = None 48 | if world_size > 1: 49 | train_sampler = DistributedSampler(train_dataset, 50 | num_replicas=world_size, 51 | rank=xr.global_ordinal(), 52 | shuffle=True) 53 | train_loader = DataLoader(train_dataset, 54 | batch_size=BATCH_SIZE, 55 | sampler=train_sampler, 56 | shuffle=False if train_sampler else True) 57 | # XLA MP: use MpDeviceLoader from torch_xla.distributed 58 | train_device_loader = pl.MpDeviceLoader(train_loader, device) 59 | 60 | # Run the training loop 61 | print('----------Training ---------------') 62 | model.train() 63 | for epoch in range(EPOCHS): 64 | start = time.time() 65 | for idx, (train_x, train_label) in enumerate(train_device_loader): 66 | optimizer.zero_grad() 67 | train_x = train_x.view(train_x.size(0), -1) 68 | output = model(train_x) 69 | loss = loss_fn(output, train_label) 70 | loss.backward() 71 | xm.optimizer_step(optimizer) # XLA MP: performs grad allreduce and optimizer step 72 | if idx < WARMUP_STEPS: # skip warmup iterations 73 | start = time.time() 74 | 75 | # Compute statistics for the last epoch 76 | interval = idx - WARMUP_STEPS # skip warmup iterations 77 | throughput = interval / (time.time() - start) 78 | print("Train throughput (iter/sec): {}".format(throughput)) 79 | print("Final loss is {:0.4f}".format(loss.detach().to('cpu'))) 80 | 81 | # Save checkpoint for evaluation (xm.save ensures only one process save) 82 | os.makedirs("checkpoints", exist_ok=True) 83 | checkpoint = {'state_dict': model.state_dict()} 84 | xm.save(checkpoint,'checkpoints/checkpoint.pt') 85 | 86 | print('----------End Training ---------------') 87 | 88 | if __name__ == '__main__': 89 | main() 90 | 91 | 
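train_torchrun.py above, launched with e.g. `torchrun --nproc_per_node=2 train_torchrun.py`, also shows a barrier idiom worth calling out: non-master workers park at an xm.rendezvous while the master downloads the dataset, then the master joins the rendezvous to release them to read the now-populated cache. A generalized sketch of that pattern (the `master_first` helper is hypothetical, not part of the sample):

```python
import contextlib
import torch_xla.core.xla_model as xm

@contextlib.contextmanager
def master_first(tag="master_first"):
    # Non-master workers block here until the master has finished the body.
    if not xm.is_master_ordinal():
        xm.rendezvous(tag)
    yield
    # The master arrives last, completing the barrier and releasing everyone;
    # the other workers then run the body against the cached result.
    if xm.is_master_ordinal():
        xm.rendezvous(tag)

# Usage, equivalent to the inline download guard in train_torchrun.py:
# with master_first("dataset_download"):
#     train_dataset = mnist.MNIST(root="/tmp/MNIST_DATA_train", train=True,
#                                 download=True, transform=ToTensor())
```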
-------------------------------------------------------------------------------- /torch-neuronx/training/mnist_mlp/train_xmp.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import torch 4 | from model import MLP 5 | 6 | from torchvision.datasets import mnist 7 | from torch.utils.data import DataLoader 8 | from torchvision.transforms import ToTensor 9 | 10 | # XLA imports 11 | import torch_xla.core.xla_model as xm 12 | # XLA imports for parallel loader and multi-processing 13 | import torch_xla.distributed.parallel_loader as pl 14 | import torch_xla.distributed.xla_multiprocessing as xmp 15 | import torch_xla.runtime as xr 16 | from torch.utils.data.distributed import DistributedSampler 17 | 18 | # Global constants 19 | EPOCHS = 4 20 | WARMUP_STEPS = 2 21 | BATCH_SIZE = 32 22 | 23 | # Load MNIST train dataset 24 | train_dataset = mnist.MNIST(root='./MNIST_DATA_train', 25 | train=True, download=True, transform=ToTensor()) 26 | 27 | def main(index): 28 | # XLA MP: get world size 29 | world_size = xr.world_size() 30 | # multi-processing: ensure each worker has same initial weights 31 | torch.manual_seed(0) 32 | # Move model to device and declare optimizer and loss function 33 | device = 'xla' 34 | model = MLP().to(device) 35 | # For multiprocessing, scale up learning rate 36 | optimizer = torch.optim.SGD(model.parameters(), lr=0.01 * world_size) 37 | loss_fn = torch.nn.NLLLoss() 38 | 39 | # Prepare data loader 40 | train_sampler = None 41 | if world_size > 1: 42 | train_sampler = DistributedSampler(train_dataset, 43 | num_replicas=world_size, 44 | rank=xr.global_ordinal(), 45 | shuffle=True) 46 | train_loader = DataLoader(train_dataset, 47 | batch_size=BATCH_SIZE, 48 | sampler=train_sampler, 49 | shuffle=False if train_sampler else True) 50 | # XLA MP: use MpDeviceLoader from torch_xla.distributed 51 | train_device_loader = pl.MpDeviceLoader(train_loader, device) 52 | 53 | # Run the training loop 54 | print('----------Training ---------------') 55 | model.train() 56 | for epoch in range(EPOCHS): 57 | start = time.time() 58 | for idx, (train_x, train_label) in enumerate(train_device_loader): 59 | optimizer.zero_grad() 60 | train_x = train_x.view(train_x.size(0), -1) 61 | output = model(train_x) 62 | loss = loss_fn(output, train_label) 63 | loss.backward() 64 | xm.optimizer_step(optimizer) # XLA MP: performs grad allreduce and optimizer step 65 | if idx < WARMUP_STEPS: # skip warmup iterations 66 | start = time.time() 67 | 68 | # Compute statistics for the last epoch 69 | interval = idx - WARMUP_STEPS # skip warmup iterations 70 | throughput = interval / (time.time() - start) 71 | print("Train throughput (iter/sec): {}".format(throughput)) 72 | print("Final loss is {:0.4f}".format(loss.detach().to('cpu'))) 73 | 74 | # Save checkpoint for evaluation (xm.save ensures only one process save) 75 | os.makedirs("checkpoints", exist_ok=True) 76 | checkpoint = {'state_dict': model.state_dict()} 77 | xm.save(checkpoint,'checkpoints/checkpoint.pt') 78 | 79 | print('----------End Training ---------------') 80 | 81 | if __name__ == '__main__': 82 | xmp.spawn(main) 83 | 84 | -------------------------------------------------------------------------------- /torch-neuronx/training/stable_diffusion/requirements.txt: -------------------------------------------------------------------------------- 1 | torchvision 2 | diffusers==0.19.3 # Intentionally pin to 0.19.3. More recent versions have problems on Neuron. 
3 | transformers==4.31.0 4 | datasets==2.14.2 5 | fsspec==2023.9.2 -------------------------------------------------------------------------------- /torch-neuronx/training/tp_dp_bert_hf_pretrain/requirements.txt: -------------------------------------------------------------------------------- 1 | graphviz 2 | tensorboard==2.6 3 | transformers==4.26.0 4 | evaluate 5 | pillow 6 | pytest 7 | accelerate 8 | datasets >= 1.8.0 9 | sentencepiece != 0.1.92 10 | h5py 11 | requests 12 | -------------------------------------------------------------------------------- /torch-neuronx/training/tp_dp_gpt_neox_hf_pretrain/common/get_dataset.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | from transformers import AutoTokenizer 3 | from itertools import chain 4 | import os 5 | 6 | dataset_name = "wikicorpus" 7 | dataset_config_name = "raw_en" 8 | save_path = "~/examples_datasets/wikicorpus_gpt_neox_tokenized_2k" 9 | 10 | save_path = os.path.expanduser(save_path) 11 | if not os.path.exists(save_path): 12 | os.makedirs(save_path) 13 | 14 | block_size = 2048 15 | 16 | raw_datasets = load_dataset(dataset_name, dataset_config_name) 17 | 18 | model_name = "EleutherAI/gpt-neox-20b" 19 | tokenizer = AutoTokenizer.from_pretrained(model_name) 20 | 21 | column_names = raw_datasets["train"].column_names 22 | text_column_name = "text" if "text" in column_names else column_names[0] 23 | 24 | def tokenize_function(examples): 25 | return tokenizer(examples[text_column_name]) 26 | 27 | tokenized_datasets = raw_datasets.map( 28 | tokenize_function, 29 | batched=True, 30 | remove_columns=column_names, 31 | load_from_cache_file=True, 32 | desc="Running tokenizer on dataset", 33 | ) 34 | 35 | if block_size > tokenizer.model_max_length: 36 | print("block_size > tokenizer.model_max_length") 37 | block_size = min(block_size, tokenizer.model_max_length) 38 | 39 | # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size. 40 | def group_texts(examples): 41 | # Concatenate all texts. 42 | concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()} 43 | total_length = len(concatenated_examples[list(examples.keys())[0]]) 44 | # We drop the small remainder, and if the total_length < block_size we exclude this batch and return an empty dict. 45 | # We could add padding if the model supported it instead of this drop; you can customize this part to your needs. 46 | total_length = (total_length // block_size) * block_size 47 | # Split by chunks of max_len.
48 | result = { 49 | k: [t[i : i + block_size] for i in range(0, total_length, block_size)] 50 | for k, t in concatenated_examples.items() 51 | } 52 | result["labels"] = result["input_ids"].copy() 53 | return result 54 | 55 | lm_datasets = tokenized_datasets.map( 56 | group_texts, 57 | batched=True, 58 | load_from_cache_file=True, 59 | desc=f"Grouping texts in chunks of {block_size}", 60 | ) 61 | 62 | train_dataset = lm_datasets["train"] 63 | print(len(train_dataset)) 64 | 65 | train_dataset.save_to_disk(save_path) 66 | -------------------------------------------------------------------------------- /torch-neuronx/training/tp_dp_gpt_neox_hf_pretrain/common/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers==4.26.0 2 | regex 3 | tensorboard 4 | datasets 5 | sentencepiece 6 | -------------------------------------------------------------------------------- /torch-neuronx/training/unet_image_segmentation/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | # milesial, U-Net: Semantic segmentation with PyTorch, GitHub repository 6 | # https://github.com/milesial/Pytorch-UNet 7 | 8 | class DoubleConv(nn.Module): 9 | def __init__(self, in_channels, out_channels, mid_channels=None): 10 | super().__init__() 11 | if not mid_channels: 12 | mid_channels = out_channels 13 | self.double_conv = nn.Sequential( 14 | nn.Conv2d(in_channels, mid_channels, kernel_size=3, padding=1, bias=False), 15 | nn.BatchNorm2d(mid_channels), 16 | nn.ReLU(inplace=True), 17 | nn.Conv2d(mid_channels, out_channels, kernel_size=3, padding=1, bias=False), 18 | nn.BatchNorm2d(out_channels), 19 | nn.ReLU(inplace=True) 20 | ) 21 | 22 | def forward(self, x): 23 | return self.double_conv(x) 24 | 25 | 26 | class Down(nn.Module): 27 | def __init__(self, in_channels, out_channels): 28 | super().__init__() 29 | self.maxpool_conv = nn.Sequential( 30 | nn.MaxPool2d(2), 31 | DoubleConv(in_channels, out_channels) 32 | ) 33 | 34 | def forward(self, x): 35 | return self.maxpool_conv(x) 36 | 37 | 38 | class Up(nn.Module): 39 | def __init__(self, in_channels, out_channels, bilinear=True): 40 | super().__init__() 41 | 42 | if bilinear: 43 | self.up = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True) 44 | self.conv = DoubleConv(in_channels, out_channels, in_channels // 2) 45 | else: 46 | self.up = nn.ConvTranspose2d(in_channels, in_channels // 2, kernel_size=2, stride=2) 47 | self.conv = DoubleConv(in_channels, out_channels) 48 | 49 | def forward(self, x1, x2): 50 | x1 = self.up(x1) 51 | # input is CHW 52 | diffY = x2.size()[2] - x1.size()[2] 53 | diffX = x2.size()[3] - x1.size()[3] 54 | 55 | x1 = F.pad(x1, [diffX // 2, diffX - diffX // 2, 56 | diffY // 2, diffY - diffY // 2]) 57 | 58 | x = torch.cat([x2, x1], dim=1) 59 | return self.conv(x) 60 | 61 | 62 | class OutConv(nn.Module): 63 | def __init__(self, in_channels, out_channels): 64 | super(OutConv, self).__init__() 65 | self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=1) 66 | 67 | def forward(self, x): 68 | return self.conv(x) 69 | 70 | class UNet(nn.Module): 71 | def __init__(self, n_channels, n_classes, bilinear=False): 72 | super(UNet, self).__init__() 73 | self.n_channels = n_channels 74 | self.n_classes = n_classes 75 | self.bilinear = bilinear 76 | 77 | self.inc = (DoubleConv(n_channels, 64)) 78 | self.down1 = (Down(64, 128)) 79 | self.down2 = (Down(128, 256)) 80 | 
self.down3 = (Down(256, 512)) 81 | factor = 2 if bilinear else 1 82 | self.down4 = (Down(512, 1024 // factor)) 83 | self.up1 = (Up(1024, 512 // factor, bilinear)) 84 | self.up2 = (Up(512, 256 // factor, bilinear)) 85 | self.up3 = (Up(256, 128 // factor, bilinear)) 86 | self.up4 = (Up(128, 64, bilinear)) 87 | self.outc = (OutConv(64, n_classes)) 88 | 89 | def forward(self, x): 90 | x1 = self.inc(x) 91 | x2 = self.down1(x1) 92 | x3 = self.down2(x2) 93 | x4 = self.down3(x3) 94 | x5 = self.down4(x4) 95 | x = self.up1(x5, x4) 96 | x = self.up2(x, x3) 97 | x = self.up3(x, x2) 98 | x = self.up4(x, x1) 99 | logits = self.outc(x) 100 | return logits 101 | -------------------------------------------------------------------------------- /torch-neuronx/training/zero1_gpt2/config_1p5B_gpt2.json: -------------------------------------------------------------------------------- 1 | { 2 | "activation_function": "gelu_new", 3 | "architectures": [ 4 | "GPT2LMHeadModel" 5 | ], 6 | "attn_pdrop": 0.1, 7 | "bos_token_id": 50256, 8 | "embd_pdrop": 0.1, 9 | "eos_token_id": 50256, 10 | "initializer_range": 0.02, 11 | "layer_norm_epsilon": 1e-05, 12 | "model_type": "gpt2", 13 | "n_ctx": 1024, 14 | "n_embd": 1600, 15 | "n_head": 25, 16 | "n_layer": 48, 17 | "n_positions": 1024, 18 | "output_past": true, 19 | "resid_pdrop": 0.1, 20 | "summary_activation": null, 21 | "summary_first_dropout": 0.1, 22 | "summary_proj_to_labels": true, 23 | "summary_type": "cls_index", 24 | "summary_use_proj": true, 25 | "task_specific_params": { 26 | "text-generation": { 27 | "do_sample": true, 28 | "max_length": 50 29 | } 30 | }, 31 | "vocab_size": 50257 32 | } 33 | -------------------------------------------------------------------------------- /torch-neuronx/training/zero1_gpt2/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers==4.27.3 2 | accelerate==0.17 3 | datasets 4 | tensorboard==2.12.2 5 | huggingface-hub<0.23 6 | -------------------------------------------------------------------------------- /torch-neuronx/training/zero1_gpt2/run_clm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -o pipefail 3 | 4 | sudo rmmod neuron; sudo modprobe neuron 5 | sudo sysctl -w net.ipv4.ip_local_reserved_ports=44000,48620 6 | sudo sysctl -w kernel.threads-max=10000000 7 | ulimit -c unlimited 8 | 9 | NUM_NEURONCORES=32 10 | DISTRIBUTED_ARGS="--nproc_per_node $NUM_NEURONCORES" 11 | 12 | LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc.so.4" 13 | MALLOC_ARENA_MAX=64 14 | echo "MALLOC_ARENA_MAX" $MALLOC_ARENA_MAX 15 | echo "LD_PRELOAD" $LD_PRELOAD 16 | 17 | if [ ! 
-z "$SLURM_NTASKS" ]; then 18 | # if running inside slurm, handle here 19 | MASTER_ADDR=(`scontrol show hostnames $SLURM_JOB_NODELIST`) 20 | MASTER_PORT=2022 21 | WORLD_SIZE_JOB=$SLURM_NTASKS 22 | RANK_NODE=$SLURM_NODEID 23 | JOB_ID_TAG=job-"$SLURM_JOB_ID" 24 | DISTRIBUTED_ARGS="--nproc_per_node $NUM_NEURONCORES --nnodes $WORLD_SIZE_JOB --node_rank $RANK_NODE --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 25 | echo $DISTRIBUTED_ARGS 26 | export NEURON_RT_ROOT_COMM_ID=$MASTER_ADDR:46820 27 | export FI_EFA_FORK_SAFE=1 28 | export FI_EFA_USE_DEVICE_RDMA=1 29 | export FI_PROVIDER=efa 30 | echo "WORLD_SIZE_JOB=$WORLD_SIZE_JOB, RANK_NODE=$RANK_NODE, MASTER_ADDR_JOB=$MASTER_ADDR_JOB, NODE_LIST=$NODE_LIST" 31 | export TRANSFORMERS_CACHE=$HOME/hf_cache/`hostname`/hub 32 | export HF_DATASETS_CACHE=$HOME/hf_cache/`hostname`/datasets 33 | fi 34 | 35 | #Print Slurm Config 36 | date;hostname; 37 | 38 | export TRAINING_PRECISION=$1 #options FP32, BF16, MIXED 39 | export NEURON_RT_STOCHASTIC_ROUNDING_EN=1 40 | 41 | if [[ "BF16" == $TRAINING_PRECISION ]]; then 42 | echo "USING BF16 ONLY" 43 | export XLA_USE_BF16=1 44 | export NEURON_CC_FLAGS="--retry_failed_compilation --distribution-strategy llm-training --model-type transformer" 45 | elif [[ "MIXED" == $TRAINING_PRECISION ]]; then 46 | echo "USING MIXED PRECISION BF16 and FP32" 47 | export NEURON_CC_FLAGS="--retry_failed_compilation --enable-mixed-precision-accumulation --distribution-strategy llm-training --model-type transformer" 48 | else 49 | echo "USING FP32 as default" 50 | export NEURON_CC_FLAGS="--retry_failed_compilation --distribution-strategy llm-training --model-type transformer" 51 | fi 52 | 53 | NEURON_CC_FLAGS+=" --cache_dir=$HOME/neuron_cache/gpt_1p5B/`hostname`" 54 | 55 | export DISABLE_NUMERIC_CC_TOKEN=1 56 | export NEURON_RT_HIERARCHICAL_CC=1 57 | 58 | export NEURON_RT_EXEC_TIMEOUT=600 59 | export TF_NUM_INTEROP_THREADS=8192 60 | 61 | export NEURON_ENABLE_NOSEED_DROPOUT=1 62 | 63 | GRAD_ACCUM_STEP=1 64 | BATCH_SIZE=1 65 | MODEL_CONFIG="config_1p5B_gpt2.json" 66 | MODEL_SIZE=$(echo $CONFIG | grep -m 1 -Eo '[0-9MBp]+' | head -n1 | tr -d '\n') 67 | DATASET_CONFIG=$2 68 | 69 | if [ $GRAD_ACCUM_STEP -gt 1 ]; then 70 | echo "need to uncomment accelerator.py code to run" 71 | ./uncomment_gradaccum.sh 72 | fi 73 | 74 | MAX_STEPS=100000 75 | LOG_FILE_NAME="run_log_hf_gpt2_param_"$MODEL_SIZE"_nodes"$WORLD_SIZE_JOB"_grad_accum"$GRAD_ACCUM_STEP"_bs"$BATCH_SIZE_$(date +"%m-%d-%Y")_$(date +"%H:%M:%S") 76 | if [[ "$NEURON_EXTRACT_GRAPHS_ONLY" == "1" ]]; then 77 | MAX_STEPS=10 78 | LOG_FILE_NAME="compile_log_hf_gpt2_param_"$MODEL_SIZE"_grad_accum"$GRAD_ACCUM_STEP"_bs"$BATCH_SIZE_$(date +"%m-%d-%Y")_$(date +"%H:%M:%S") 79 | fi 80 | 81 | torchrun $DISTRIBUTED_ARGS run_clm_no_trainer.py \ 82 | --model_name_or_path gpt2 \ 83 | --dataset_name wikitext \ 84 | --dataset_config_name $DATASET_CONFIG \ 85 | --config_name $MODEL_CONFIG \ 86 | --per_device_train_batch_size $BATCH_SIZE \ 87 | --gradient_accumulation_steps $GRAD_ACCUM_STEP \ 88 | --max_train_steps $MAX_STEPS \ 89 | --weight_decay 0.01 \ 90 | --learning_rate 0.00015 \ 91 | --lr_scheduler_type cosine \ 92 | --use_zero1 \ 93 | --gradient_checkpointing \ 94 | --seed 1234 \ 95 | --num_warmup_steps 75 \ 96 | --use_grad_clipping \ 97 | --validation_split_percentage 0 \ 98 | --output_dir gpt_1p5B \ 99 | |& tee $LOG_FILE_NAME 100 | -------------------------------------------------------------------------------- /torch-neuronx/training/zero1_gpt2/run_clm.slurm: 
-------------------------------------------------------------------------------- /torch-neuronx/training/zero1_gpt2/run_clm.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --nodes=4 3 | #SBATCH --exclusive 4 | #SBATCH --output=slurm-%x-%j.out 5 | 6 | srun ./run_clm.sh MIXED wikitext-103-raw-v1 7 | -------------------------------------------------------------------------------- /torch-neuronx/training/zero1_gpt2/run_clm_compile.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --nodes=4 3 | #SBATCH --exclusive 4 | #SBATCH --output=slurm-%x-%j.out 5 | 6 | srun neuron_parallel_compile ./run_clm.sh MIXED wikitext-103-raw-v1 7 | -------------------------------------------------------------------------------- /torch-neuronx/training/zero1_gpt2/uncomment_gradaccum.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | script_output=$(python3 -c 'import sysconfig; print(sysconfig.get_paths()["purelib"])') 3 | script_output+="/accelerate/accelerator.py" 4 | 5 | experiment_grad_accum() { 6 | echo 'uncommenting the assertion to run grad_accum steps > 1' 7 | # look for "Gradient accumulation on TPU is not supported. Pass in `gradient_accumulation_steps=1`" 8 | ln=$(grep -wn "NotImplementedError" $script_output | cut -d: -f1) 9 | let start=$ln-2 10 | let end=$ln+3 11 | let tagln=$start-1 12 | sed -i "${tagln}a \\ #ExperimentalHackOn" $script_output 13 | while [[ start -le $end ]] 14 | do 15 | sed -i "$start s/./#&/" $script_output 16 | ((start = start + 1)) 17 | done 18 | } 19 | 20 | if grep -r 'ExperimentalHackOn' $script_output; then 21 | echo Already edited the accelerator code 22 | else 23 | echo Editing accelerator code 24 | experiment_grad_accum 25 | fi 26 | -------------------------------------------------------------------------------- /torch-neuronx/transformers-neuronx/README.md: -------------------------------------------------------------------------------- 1 | # PyTorch Neuron (transformers-neuronx) Samples for AWS Inf2 & Trn1 2 | 3 | This directory contains sample Jupyter Notebooks demonstrating tensor parallel inference for various PyTorch large language models (LLMs) on [AWS Inferentia](https://aws.amazon.com/ec2/instance-types/inf2/) (Inf2) instances and [AWS Trainium](https://aws.amazon.com/machine-learning/trainium/) (Trn1) instances. 4 | 5 | For additional information on these samples, please refer to the tutorials found in the official Inferentia and Trainium documentation.
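Note that `inference/gpt-j-dp.py` below loads its weights from a local split-checkpoint directory (`./gptj-6b-split`) rather than directly from the Hugging Face Hub. A sketch of the usual one-time preparation step, assuming the `save_pretrained_split` utility from transformers-neuronx:

```python
from transformers import AutoModelForCausalLM
from transformers_neuronx.module import save_pretrained_split

# Download the weights once on the host, then write them out as split
# state-dict files that the Neuron sampling classes can load with lower
# host memory pressure.
model = AutoModelForCausalLM.from_pretrained('EleutherAI/gpt-j-6B', low_cpu_mem_usage=True)
save_pretrained_split(model, './gptj-6b-split')
```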
6 | 7 | ## Inference 8 | 9 | The following samples are available for LLM tensor parallel inference: 10 | 11 | | Name | Instance type | 12 | |-------------------------------------------------------------| --------------- | 13 | | [facebook/opt-13b](inference/facebook-opt-13b-sampling.ipynb) | Inf2 & Trn1 | 14 | | [facebook/opt-30b](inference/facebook-opt-30b-sampling.ipynb) | Inf2 & Trn1 | 15 | | [facebook/opt-66b](inference/facebook-opt-66b-sampling.ipynb) | Inf2 | 16 | | [meta-llama/Llama-2-13b](inference/meta-llama-2-13b-sampling.ipynb) | Inf2 & Trn1 | 17 | -------------------------------------------------------------------------------- /torch-neuronx/transformers-neuronx/inference/gpt-j-dp.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import torch 4 | from transformers import AutoTokenizer 5 | from transformers_neuronx.gptj.model import GPTJForSampling 6 | from multiprocessing import Process 7 | 8 | def load_model_infer(): 9 | # load model to NeuronCores with 8-way tensor parallel and DP 10 | load_compile_time = time.time() 11 | neuron_model = GPTJForSampling.from_pretrained('./gptj-6b-split', n_positions=1024, batch_size=64, tp_degree=8, amp='f16') 12 | neuron_model.to_neuron() 13 | load_compile_elapsed = time.time() - load_compile_time 14 | print(f'Model load & compile time in a single process: {load_compile_elapsed} seconds') 15 | 16 | # construct a tokenizer and encode prompt text 17 | tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-j-6B') 18 | 19 | batch_prompts = [ 20 | "I am specialized at sentence generation language models,", 21 | ] 22 | batch_prompts = batch_prompts * 64 23 | 24 | input_ids = torch.as_tensor([tokenizer.encode(text) for text in batch_prompts]) 25 | 26 | 27 | with torch.inference_mode(): 28 | # warmup 29 | generated_sequences = neuron_model.sample(input_ids, sequence_length=1024) 30 | 31 | start = time.time() 32 | for i in range(2): 33 | generated_sequences = neuron_model.sample(input_ids, sequence_length=1024) 34 | elapsed = (time.time() - start) / 2 35 | 36 | generated_sequences = [tokenizer.decode(seq) for seq in generated_sequences] 37 | print(f'Average latency for one inference: {elapsed} seconds') 38 | 39 | if __name__ == '__main__': 40 | os.environ['NEURON_RT_NUM_CORES']='8' 41 | total_start = time.time() 42 | p1 = Process(target=load_model_infer) 43 | p2 = Process(target=load_model_infer) 44 | p3 = Process(target=load_model_infer) 45 | p1.start() 46 | p2.start() 47 | p3.start() 48 | p1.join() 49 | p2.join() 50 | p3.join() 51 | total_elapsed = time.time() - total_start 52 | print(f'All processes, including compilation, finished in {total_elapsed} seconds') 53 | print(f'TPS {(30/total_elapsed)*64} ') 54 | p1.terminate() 55 | p2.terminate() 56 | p3.terminate() --------------------------------------------------------------------------------
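A closing note on how the data-parallel GPT-J script above budgets NeuronCores; the numbers below are illustrative, derived from the constants in the script (three worker processes, `tp_degree=8`, `NEURON_RT_NUM_CORES=8`):

```python
# Core budget for the data-parallel GPT-J example (illustrative).
tp_degree = 8   # NeuronCores per model replica (tensor parallelism)
replicas = 3    # one process per replica: p1, p2, p3

cores_needed = tp_degree * replicas
print(cores_needed)  # 24: NEURON_RT_NUM_CORES=8 caps each process at an
                     # 8-core slice, so three replicas fit within the 32
                     # NeuronCores of a trn1.32xlarge.
```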