├── .github ├── ISSUE_TEMPLATE │ ├── bug_template.md │ ├── doc_template.md │ └── feature_template.md ├── PULL_REQUEST_TEMPLATE │ └── pull_request_template.md └── workflows │ ├── doc-build.yml │ ├── main_cpp.yml │ ├── main_distributed.yaml │ └── main_python.yml ├── .gitignore ├── CODEOWNERS ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── cpp ├── .clang-format ├── autograd │ ├── CMakeLists.txt │ ├── README.md │ └── autograd.cpp ├── custom-dataset │ ├── CMakeLists.txt │ ├── README.md │ ├── custom-dataset.cpp │ └── info.txt ├── dcgan │ ├── CMakeLists.txt │ ├── README.md │ ├── dcgan.cpp │ └── display_samples.py ├── distributed │ ├── CMakeLists.txt │ ├── README.md │ └── dist-mnist.cpp ├── mnist │ ├── CMakeLists.txt │ ├── README.md │ └── mnist.cpp ├── regression │ ├── CMakeLists.txt │ ├── README.md │ └── regression.cpp ├── tools │ ├── InstallingOpenCV.md │ └── download_mnist.py └── transfer-learning │ ├── CMakeLists.txt │ ├── README.md │ ├── classify.cpp │ ├── convert.py │ ├── main.cpp │ └── main.h ├── dcgan ├── .gitignore ├── README.md ├── main.py └── requirements.txt ├── distributed ├── FSDP │ ├── .gitignore │ ├── README.md │ ├── T5_training.py │ ├── configs │ │ ├── __init__.py │ │ ├── fsdp.py │ │ └── training.py │ ├── download_dataset.sh │ ├── model_checkpointing │ │ ├── __init__.py │ │ └── checkpoint_handler.py │ ├── policies │ │ ├── __init__.py │ │ ├── activation_checkpointing_functions.py │ │ ├── mixed_precision.py │ │ └── wrapping.py │ ├── requirements.txt │ ├── summarization_dataset.py │ └── utils │ │ ├── __init__.py │ │ ├── environment.py │ │ └── train_utils.py ├── ddp-tutorial-series │ ├── README.md │ ├── datautils.py │ ├── multigpu.py │ ├── multigpu_torchrun.py │ ├── multinode.py │ ├── requirements.txt │ ├── single_gpu.py │ └── slurm │ │ ├── config.yaml.template │ │ ├── sbatch_run.sh │ │ └── setup_pcluster_slurm.md ├── ddp │ ├── README.md │ ├── example.py │ ├── main.py │ └── requirements.txt ├── minGPT-ddp │ ├── README.md │ ├── mingpt │ │ ├── char_dataset.py │ │ ├── data │ │ │ └── input.txt │ │ ├── gpt2_train_cfg.yaml │ │ ├── main.py │ │ ├── model.py │ │ ├── slurm │ │ │ ├── config.yaml.template │ │ │ ├── sbatch_run.sh │ │ │ └── setup_pcluster_slurm.md │ │ └── trainer.py │ └── requirements.txt ├── rpc │ ├── batch │ │ ├── README.md │ │ ├── parameter_server.py │ │ ├── reinforce.py │ │ └── requirements.txt │ ├── ddp_rpc │ │ ├── README.md │ │ ├── main.py │ │ └── requirements.txt │ ├── parameter_server │ │ ├── README.md │ │ └── rpc_parameter_server.py │ ├── pipeline │ │ ├── README.md │ │ ├── main.py │ │ └── requirements.txt │ ├── rl │ │ ├── README.md │ │ ├── main.py │ │ └── requirements.txt │ └── rnn │ │ ├── README.md │ │ ├── main.py │ │ ├── requirements.txt │ │ └── rnn.py └── tensor_parallelism │ ├── README.md │ ├── fsdp_tp_example.py │ ├── llama2_model.py │ ├── log_utils.py │ ├── requirements.txt │ ├── run_example.sh │ ├── sequence_parallel_example.py │ └── tensor_parallel_example.py ├── docs ├── Makefile ├── make.bat ├── requirements.txt └── source │ ├── conf.py │ └── index.rst ├── fast_neural_style ├── README.md ├── download_saved_models.py ├── images │ ├── content-images │ │ └── amber.jpg │ ├── output-images │ │ ├── amber-candy.jpg │ │ ├── amber-mosaic.jpg │ │ ├── amber-rain-princess.jpg │ │ └── amber-udnie.jpg │ └── style-images │ │ ├── candy.jpg │ │ ├── mosaic.jpg │ │ ├── rain-princess-cropped.jpg │ │ ├── rain-princess.jpg │ │ └── udnie.jpg └── neural_style │ ├── __init__.py │ ├── neural_style.py │ ├── transformer_net.py │ ├── utils.py │ └── vgg.py ├── fx ├── 
README.md ├── custom_tracer.py ├── inline_function.py ├── invert.py ├── module_tracer.py ├── native_interpreter │ ├── CMakeLists.txt │ ├── README.md │ ├── interpreter.cpp │ └── use_interpreter.py ├── primitive_library.py ├── profiling_tracer.py ├── proxy_based_graph_creation.py ├── replace_op.py ├── subgraph_rewriter_basic_use.py └── wrap_output_dynamically.py ├── gat ├── README.md ├── main.py └── requirements.txt ├── gcn ├── README.md ├── main.py └── requirements.txt ├── imagenet ├── README.md ├── extract_ILSVRC.sh ├── main.py └── requirements.txt ├── language_translation ├── README.md ├── main.py ├── requirements.txt └── src │ ├── data.py │ └── model.py ├── legacy └── snli │ ├── README.md │ ├── model.py │ ├── requirements.txt │ ├── train.py │ └── util.py ├── mnist ├── README.md ├── main.py └── requirements.txt ├── mnist_forward_forward ├── README.md ├── main.py └── requirements.txt ├── mnist_hogwild ├── README.md ├── main.py ├── requirements.txt └── train.py ├── mnist_rnn ├── README.md ├── main.py └── requirements.txt ├── regression ├── README.md └── main.py ├── reinforcement_learning ├── README.md ├── actor_critic.py ├── reinforce.py └── requirements.txt ├── run_cpp_examples.sh ├── run_distributed_examples.sh ├── run_python_examples.sh ├── runtime.txt ├── siamese_network ├── README.md ├── main.py └── requirements.txt ├── super_resolution ├── README.md ├── data.py ├── dataset.py ├── main.py ├── model.py └── super_resolve.py ├── time_sequence_prediction ├── README.md ├── generate_sine_wave.py ├── requirements.txt └── train.py ├── utils.sh ├── vae ├── README.md ├── main.py ├── requirements.txt └── results │ └── .gitignore └── word_language_model ├── README.md ├── data.py ├── data └── wikitext-2 │ ├── README │ ├── test.txt │ ├── train.txt │ └── valid.txt ├── generate.py ├── main.py ├── model.py └── requirements.txt /.github/ISSUE_TEMPLATE/bug_template.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: "\U0001F41B Bug report" 3 | about: Create a report to help us improve 4 | 5 | --- 6 | 7 | Your issue may already be reported! 8 | Please search on the [issue tracker](https://github.com/pytorch/examples/issues) before creating one. 9 | 10 | ## Context 11 | 12 | 13 | * Pytorch version: 14 | * Operating System and version: 15 | 16 | ## Your Environment 17 | 18 | * Installed using source? [yes/no]: 19 | * Are you planning to deploy it using docker container? [yes/no]: 20 | * Is it a CPU or GPU environment?: 21 | * Which example are you using: 22 | * Link to code or data to repro [if any]: 23 | 24 | ## Expected Behavior 25 | 26 | 27 | ## Current Behavior 28 | 29 | 30 | ## Possible Solution 31 | 32 | 33 | ## Steps to Reproduce 34 | 35 | 36 | 1. 37 | 2. 38 | ... 39 | 40 | ## Failure Logs [if any] 41 | 42 | 43 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/doc_template.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: "\U0001F4DA Documentation" 3 | about: Report a documentation related issue 4 | 5 | --- 6 | 7 | ## 📚 Documentation 8 | 9 | 12 | 13 | ## Is your feature request related to a problem? Please describe. 
14 | 15 | 16 | ## Describe the solution 17 | 18 | 19 | ## Describe alternatives solution 20 | 21 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE/pull_request_template.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: "\U0001F41B Pull Request" 3 | about: Fix a bug or create new example 4 | 5 | --- 6 | 7 | ## Description 8 | 9 | Please include a summary of the newly proposed example or issue being fixed. Please also include relevant motivation, context. 10 | 11 | If this is a new example, how is your example different enough from the remaining examples in the repo. 12 | 13 | If this is a bug fix please link the issue you are fixing. Fixes #(issue) 14 | 15 | ## Type of change 16 | 17 | Please delete options that are not relevant. 18 | 19 | - [ ] Bug fix (non-breaking change which fixes an issue) 20 | - [ ] New Example (new example contribution) 21 | - [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected) 22 | - [ ] This change requires a documentation update 23 | 24 | ## Feature/Issue validation/testing 25 | 26 | Please describe the tests [UT/IT] that you ran to verify your changes and relevant result summary. If this is a bug fix please run `run_python_examples.sh` before and after your change locally to make sure it works and add the logs here. 27 | 28 | - [ ] Logs before change 29 | - [ ] Logs after change 30 | 31 | - Logs 32 | 33 | If this is a new example please add a corresponding test in `run_python_examples.sh` 34 | 35 | - [ ] Test Added 36 | 37 | ## Checklist: 38 | 39 | - [ ] Have you added tests that prove your fix is effective or that this example works? 40 | - [ ] Has code been commented, particularly in hard-to-understand areas? 41 | - [ ] Have you made corresponding changes to the documentation? 42 | -------------------------------------------------------------------------------- /.github/workflows/doc-build.yml: -------------------------------------------------------------------------------- 1 | name: Doc Build 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | workflow_dispatch: 8 | 9 | jobs: 10 | build_docs_job: 11 | runs-on: ubuntu-latest 12 | # Grant write permission here so that the doc can be pushed to gh-pages branch 13 | permissions: 14 | contents: write 15 | strategy: 16 | matrix: 17 | python-version: [3.9] 18 | steps: 19 | - name: Checkout 20 | uses: actions/checkout@v2 21 | - name: Dependencies 22 | run: | 23 | echo `python3 --version` 24 | sudo apt-get install -y python-setuptools 25 | sudo apt-get install -y python3-sphinx 26 | python3 -m pip install --upgrade pip 27 | python3 -m pip install setuptools 28 | id: build 29 | - name: Build the docset 30 | run: | 31 | cd docs 32 | pip install -r requirements.txt 33 | make html 34 | - name: Get output time 35 | run: echo "The time was ${{ steps.build.outputs.time }}" 36 | - name: Deploy 37 | uses: JamesIves/github-pages-deploy-action@releases/v3 38 | with: 39 | ACCESS_TOKEN: ${{ secrets.GITHUB_TOKEN }} 40 | BRANCH: gh-pages # The branch the action should deploy to. 41 | FOLDER: ./docs/build/html # The folder the action should deploy. 
42 | -------------------------------------------------------------------------------- /.github/workflows/main_cpp.yml: -------------------------------------------------------------------------------- 1 | name: Run CPP Examples 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | schedule: 9 | # Every day at 3:00am 10 | - cron: '0 3 * * *' 11 | 12 | 13 | jobs: 14 | test: 15 | 16 | runs-on: ubuntu-latest 17 | 18 | steps: 19 | - uses: actions/checkout@v2 20 | - name: Set up Python 3.11 21 | uses: actions/setup-python@v2 22 | with: 23 | python-version: 3.11 24 | 25 | - name: Install Cmake, Make, g++, MKL 26 | run: | 27 | sudo apt update && sudo apt upgrade 28 | sudo apt install cmake g++ make 29 | sudo apt-get -y install intel-mkl 30 | - name: Install OpenCV 31 | run: | 32 | sudo apt -y install libtbb-dev 33 | sudo apt install libopencv-dev 34 | - name: Install argparse 35 | run: | 36 | git clone https://github.com/p-ranav/argparse 37 | cd argparse 38 | mkdir build 39 | cd build 40 | cmake -DARGPARSE_BUILD_SAMPLES=off -DARGPARSE_BUILD_TESTS=off .. 41 | sudo make install 42 | # Alternatively, you can install OpenCV from source 43 | # - name: Install OpenCV from source 44 | # run: | 45 | # wget -O opencv.zip https://github.com/opencv/opencv/archive/4.x.zip 46 | # unzip opencv.zip 47 | # mkdir -p build && cd build 48 | # cmake ../opencv-4.x 49 | # cmake --build . 50 | # sudo make install 51 | 52 | - name: Run Cpp Tests 53 | run: | 54 | chmod +x ./run_cpp_examples.sh 55 | ./run_cpp_examples.sh "get_libtorch,run_all,clean" 56 | -------------------------------------------------------------------------------- /.github/workflows/main_distributed.yaml: -------------------------------------------------------------------------------- 1 | name: Run Distributed Examples 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | schedule: 9 | # Every day at 3:00am 10 | - cron: '0 3 * * *' 11 | 12 | 13 | jobs: 14 | test: 15 | 16 | runs-on: 4-core-ubuntu-gpu-t4 17 | 18 | steps: 19 | - uses: actions/checkout@v2 20 | - name: Set up Python 3.8 21 | uses: actions/setup-python@v2 22 | with: 23 | python-version: 3.8 24 | - name: Install PyTorch 25 | run: | 26 | python -m pip install --upgrade pip 27 | pip install --pre torch -f https://download.pytorch.org/whl/nightly/cu118/torch_nightly.html 28 | - name: Run Tests 29 | run: | 30 | ./run_distributed_examples.sh "run_all,clean" 31 | - name: Open issue on failure 32 | if: ${{ failure() && github.event_name == 'schedule' }} 33 | uses: rishabhgupta/git-action-issue@v2 34 | with: 35 | token: ${{ secrets.GITHUB_TOKEN }} 36 | title: Daily CI failed 37 | body: Commit ${{ github.sha }} daily scheduled [CI run](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}) failed, please check why 38 | assignees: '' 39 | -------------------------------------------------------------------------------- /.github/workflows/main_python.yml: -------------------------------------------------------------------------------- 1 | name: Run Python Examples 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | schedule: 9 | # Every day at 3:00am 10 | - cron: '0 3 * * *' 11 | 12 | 13 | jobs: 14 | test: 15 | 16 | runs-on: ubuntu-latest 17 | 18 | steps: 19 | - uses: actions/checkout@v4 20 | - name: Set up Python 3.10 21 | uses: actions/setup-python@v5 22 | with: 23 | python-version: '3.10' 24 | - name: Install PyTorch 25 | run: | 26 | python -m pip install --upgrade pip 27 
| # Install CPU-based pytorch 28 | pip install --pre torch torchvision -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html 29 | # Maybe use the CUDA 10.2 version instead? 30 | # pip install --pre torch torchvision -f https://download.pytorch.org/whl/nightly/cu102/torch_nightly.html 31 | - name: Run Tests 32 | run: | 33 | ./run_python_examples.sh "install_deps,run_all,clean" 34 | - name: Open issue on failure 35 | if: ${{ failure() && github.event_name == 'schedule' }} 36 | uses: rishabhgupta/git-action-issue@v2 37 | with: 38 | token: ${{ secrets.GITHUB_TOKEN }} 39 | title: Daily CI failed 40 | body: Commit ${{ github.sha }} daily scheduled [CI run](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}) failed, please check why 41 | assignees: '' 42 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | dcgan/data 2 | data 3 | *.pyc 4 | OpenNMT/data 5 | cpp/mnist/build 6 | cpp/dcgan/build 7 | dcgan/*.png 8 | dcgan/*.pth 9 | snli/.data 10 | snli/.vector_cache 11 | snli/results 12 | word_language_model/model.pt 13 | fast_neural_style/saved_models 14 | fast_neural_style/saved_models.zip 15 | gcn/cora/ 16 | gat/cora/ 17 | docs/build 18 | docs/venv 19 | 20 | # vi backups 21 | *~ 22 | .*.swp 23 | 24 | # development 25 | .vscode 26 | **/.DS_Store 27 | -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | # This is a comment. 2 | # Each line is a file pattern followed by one or more owners. 3 | 4 | # Github Actions, tests and CI 5 | ./github/ @msaroufim 6 | run_python_examples.sh @msaroufim 7 | 8 | # Distributed examples 9 | # Can also add the distributed oncall 10 | ./distributed/ @mrshenli @pritamdamania87 @rohan-varma @H-Huang 11 | ./mnist_hogwild/ @mrshenli @pritamdamania87 @rohan-varma @H-Huang 12 | 13 | # FX examples 14 | ./fx/ @jamesr66a @Chillee 15 | 16 | # Domain Examples 17 | ./reinforcement_learning/ @msaroufim 18 | ./word_language_model/ @msaroufim 19 | 20 | # Need an owner 21 | ./regression/ 22 | ./mnist/ 23 | ./imagenet/ 24 | ./super_resolution/ 25 | ./time_sequence_prediction/ 26 | ./vae/ 27 | 28 | # Legacy examples 29 | ./cpp/ 30 | ./legacy/snli/ 31 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to make participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 
11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | - Using welcoming and inclusive language 18 | - Being respectful of differing viewpoints and experiences 19 | - Gracefully accepting constructive criticism 20 | - Focusing on what is best for the community 21 | - Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | - The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | - Trolling, insulting/derogatory comments, and personal or political attacks 28 | - Public or private harassment 29 | - Publishing other's private information, such as physical or electronic 30 | address, without explicit permission 31 | - Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies within all project spaces, and it also applies when 49 | an individual is representing the project or its community in public spaces. 50 | Examples of representing a project or community include using an official 51 | project e-mail address, posting via an official social media account, or acting 52 | as an appointed representative at an online or offline event. Representation of 53 | a project may be further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at . All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 72 | 73 | [homepage]: https://www.contributor-covenant.org 74 | 75 | For answers to common questions about this code of conduct, see 76 | https://www.contributor-covenant.org/faq 77 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to examples 2 | 3 | We want to make contributing to this project as easy and transparent as 4 | possible. 5 | 6 | ## Pull Requests 7 | 8 | We actively welcome your pull requests. 
9 | 10 | If you're new, we encourage you to take a look at issues tagged with [good first issue](https://github.com/pytorch/examples/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) 11 | 12 | ### For new examples 13 | 14 | 0. Create a GitHub issue proposing a new example and make sure it's substantially different from an existing one. 15 | 1. Fork the repo and create your branch from `main`. 16 | 2. If you've added code that should be tested, add tests to `run_python_examples.sh`. 17 | 3. Create a `README.md`. 18 | 4. Add a card with a brief description of your example and link to the repo to 19 | the `docs/source/index.rst` file and build the docs by running: 20 | 21 | ``` 22 | cd docs 23 | virtualenv venv 24 | source venv/bin/activate 25 | pip install -r requirements.txt 26 | make html 27 | ``` 28 | 29 | When done working with `virtualenv`, run `deactivate`. 30 | 31 | 5. Verify that there are no issues in your doc build. You can check the preview locally 32 | by installing [sphinx-serve](https://pypi.org/project/sphinx-serve/) 33 | then running `sphinx-serve -b build`. 34 | 6. Ensure your test passes locally. 35 | 7. If you haven't already, complete the Contributor License Agreement ("CLA"). 36 | 8. Address any feedback in code review promptly. 37 | 38 | ## For bug fixes 39 | 40 | 1. Fork the repo and create your branch from `main`. 41 | 2. Make sure you have a GPU-enabled machine, either locally or in the cloud. `g4dn.4xlarge` is a good starting point on AWS. 42 | 3. Make your code change. 43 | 4. First, install all dependencies with `./run_python_examples.sh "install_deps"`. 44 | 5. Then, make sure that `./run_python_examples.sh` passes locally by running the script end to end. 45 | 6. If you haven't already, complete the Contributor License Agreement ("CLA"). 46 | 7. Address any feedback in code review promptly. 47 | 48 | ## Contributor License Agreement ("CLA") 49 | 50 | To accept your pull request, we need you to submit a CLA. You only need 51 | to do this once to work on any of Facebook's open source projects. 52 | 53 | Complete your CLA here: 54 | 55 | ## Issues 56 | 57 | We use GitHub issues to track public bugs. Please ensure your description is 58 | clear and has sufficient instructions to be able to reproduce the issue. 59 | 60 | ## License 61 | 62 | By contributing to examples, you agree that your contributions will be licensed 63 | under the LICENSE file in the root directory of this source tree. 64 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2017, Pytorch contributors 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 
19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PyTorch Examples 2 | 3 | ![Run Examples](https://github.com/pytorch/examples/workflows/Run%20Examples/badge.svg) 4 | 5 | https://pytorch.org/examples/ 6 | 7 | `pytorch/examples` is a repository showcasing examples of using [PyTorch](https://github.com/pytorch/pytorch). The goal is to have curated, short, few/no dependencies _high quality_ examples that are substantially different from each other that can be emulated in your existing work. 8 | 9 | - For tutorials: https://github.com/pytorch/tutorials 10 | - For changes to pytorch.org: https://github.com/pytorch/pytorch.github.io 11 | - For a general model hub: https://pytorch.org/hub/ or https://huggingface.co/models 12 | - For recipes on how to run PyTorch in production: https://github.com/facebookresearch/recipes 13 | - For general Q&A and support: https://discuss.pytorch.org/ 14 | 15 | ## Available models 16 | 17 | - [Image classification (MNIST) using Convnets](./mnist/README.md) 18 | - [Word-level Language Modeling using RNN and Transformer](./word_language_model/README.md) 19 | - [Training Imagenet Classifiers with Popular Networks](./imagenet/README.md) 20 | - [Generative Adversarial Networks (DCGAN)](./dcgan/README.md) 21 | - [Variational Auto-Encoders](./vae/README.md) 22 | - [Superresolution using an efficient sub-pixel convolutional neural network](./super_resolution/README.md) 23 | - [Hogwild training of shared ConvNets across multiple processes on MNIST](mnist_hogwild) 24 | - [Training a CartPole to balance in OpenAI Gym with actor-critic](./reinforcement_learning/README.md) 25 | - [Natural Language Inference (SNLI) with GloVe vectors, LSTMs, and torchtext](snli) 26 | - [Time sequence prediction - use an LSTM to learn Sine waves](./time_sequence_prediction/README.md) 27 | - [Implement the Neural Style Transfer algorithm on images](./fast_neural_style/README.md) 28 | - [Reinforcement Learning with Actor Critic and REINFORCE algorithms on OpenAI gym](./reinforcement_learning/README.md) 29 | - [PyTorch Module Transformations using fx](./fx/README.md) 30 | - Distributed PyTorch examples with [Distributed Data Parallel](./distributed/ddp/README.md) and [RPC](./distributed/rpc) 31 | - [Several examples illustrating the C++ Frontend](cpp) 32 | - [Image Classification Using Forward-Forward](./mnist_forward_forward/README.md) 33 | - [Language Translation using Transformers](./language_translation/README.md) 34 | 35 | 36 | 37 | Additionally, a list of good examples hosted in their own repositories: 38 | 39 | - [Neural Machine Translation using 
sequence-to-sequence RNN with attention (OpenNMT)](https://github.com/OpenNMT/OpenNMT-py) 40 | 41 | ## Contributing 42 | 43 | If you'd like to contribute your own example or fix a bug please make sure to take a look at [CONTRIBUTING.md](CONTRIBUTING.md). 44 | -------------------------------------------------------------------------------- /cpp/.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | AccessModifierOffset: -1 3 | AlignAfterOpenBracket: AlwaysBreak 4 | AlignConsecutiveAssignments: false 5 | AlignConsecutiveDeclarations: false 6 | AlignEscapedNewlinesLeft: true 7 | AlignOperands: false 8 | AlignTrailingComments: false 9 | AllowAllParametersOfDeclarationOnNextLine: false 10 | AllowShortBlocksOnASingleLine: false 11 | AllowShortCaseLabelsOnASingleLine: false 12 | AllowShortFunctionsOnASingleLine: Empty 13 | AllowShortIfStatementsOnASingleLine: false 14 | AllowShortLoopsOnASingleLine: false 15 | AlwaysBreakAfterReturnType: None 16 | AlwaysBreakBeforeMultilineStrings: true 17 | AlwaysBreakTemplateDeclarations: true 18 | BinPackArguments: false 19 | BinPackParameters: false 20 | BraceWrapping: 21 | AfterClass: false 22 | AfterControlStatement: false 23 | AfterEnum: false 24 | AfterFunction: false 25 | AfterNamespace: false 26 | AfterObjCDeclaration: false 27 | AfterStruct: false 28 | AfterUnion: false 29 | BeforeCatch: false 30 | BeforeElse: false 31 | IndentBraces: false 32 | BreakBeforeBinaryOperators: None 33 | BreakBeforeBraces: Attach 34 | BreakBeforeTernaryOperators: true 35 | BreakConstructorInitializersBeforeComma: false 36 | BreakAfterJavaFieldAnnotations: false 37 | BreakStringLiterals: false 38 | ColumnLimit: 80 39 | CommentPragmas: '^ IWYU pragma:' 40 | CompactNamespaces: false 41 | ConstructorInitializerAllOnOneLineOrOnePerLine: true 42 | ConstructorInitializerIndentWidth: 4 43 | ContinuationIndentWidth: 4 44 | Cpp11BracedListStyle: true 45 | DerivePointerAlignment: false 46 | DisableFormat: false 47 | ForEachMacros: [ FOR_EACH_RANGE, FOR_EACH, ] 48 | IncludeCategories: 49 | - Regex: '^<.*\.h(pp)?>' 50 | Priority: 1 51 | - Regex: '^<.*' 52 | Priority: 2 53 | - Regex: '.*' 54 | Priority: 3 55 | IndentCaseLabels: true 56 | IndentWidth: 2 57 | IndentWrappedFunctionNames: false 58 | KeepEmptyLinesAtTheStartOfBlocks: false 59 | MacroBlockBegin: '' 60 | MacroBlockEnd: '' 61 | MaxEmptyLinesToKeep: 1 62 | NamespaceIndentation: None 63 | ObjCBlockIndentWidth: 2 64 | ObjCSpaceAfterProperty: false 65 | ObjCSpaceBeforeProtocolList: false 66 | PenaltyBreakBeforeFirstCallParameter: 1 67 | PenaltyBreakComment: 300 68 | PenaltyBreakFirstLessLess: 120 69 | PenaltyBreakString: 1000 70 | PenaltyExcessCharacter: 1000000 71 | PenaltyReturnTypeOnItsOwnLine: 2000000 72 | PointerAlignment: Left 73 | ReflowComments: true 74 | SortIncludes: true 75 | SpaceAfterCStyleCast: false 76 | SpaceBeforeAssignmentOperators: true 77 | SpaceBeforeParens: ControlStatements 78 | SpaceInEmptyParentheses: false 79 | SpacesBeforeTrailingComments: 1 80 | SpacesInAngles: false 81 | SpacesInContainerLiterals: true 82 | SpacesInCStyleCastParentheses: false 83 | SpacesInParentheses: false 84 | SpacesInSquareBrackets: false 85 | Standard: Cpp11 86 | TabWidth: 8 87 | UseTab: Never 88 | ... 
89 | -------------------------------------------------------------------------------- /cpp/autograd/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.5) 2 | 3 | project(autograd) 4 | set(CMAKE_CXX_STANDARD 17) 5 | 6 | find_package(Torch REQUIRED) 7 | 8 | add_executable(${PROJECT_NAME} "autograd.cpp") 9 | target_link_libraries(${PROJECT_NAME} "${TORCH_LIBRARIES}") 10 | 11 | # The following code block is suggested to be used on Windows. 12 | # According to https://github.com/pytorch/pytorch/issues/25457, 13 | # the DLLs need to be copied to avoid memory errors. 14 | if (MSVC) 15 | file(GLOB TORCH_DLLS "${TORCH_INSTALL_PREFIX}/lib/*.dll") 16 | add_custom_command(TARGET ${PROJECT_NAME} 17 | POST_BUILD 18 | COMMAND ${CMAKE_COMMAND} -E copy_if_different 19 | ${TORCH_DLLS} 20 | $) 21 | endif (MSVC) 22 | -------------------------------------------------------------------------------- /cpp/autograd/README.md: -------------------------------------------------------------------------------- 1 | # C++ autograd example 2 | 3 | `autograd.cpp` contains several examples of doing autograd in PyTorch C++ frontend. 4 | 5 | To build the code, run the following commands from your terminal: 6 | 7 | ```shell 8 | $ cd autograd 9 | $ mkdir build 10 | $ cd build 11 | $ cmake -DCMAKE_PREFIX_PATH=/path/to/libtorch .. 12 | $ make 13 | ``` 14 | 15 | where `/path/to/libtorch` should be the path to the unzipped _LibTorch_ 16 | distribution, which you can get from the [PyTorch 17 | homepage](https://pytorch.org/get-started/locally/). 18 | 19 | Execute the compiled binary to run: 20 | 21 | ```shell 22 | $ ./autograd 23 | ====== Running: "Basic autograd operations" ====== 24 | 1 1 25 | 1 1 26 | [ CPUFloatType{2,2} ] 27 | 3 3 28 | 3 3 29 | [ CPUFloatType{2,2} ] 30 | AddBackward1 31 | 27 27 32 | 27 27 33 | [ CPUFloatType{2,2} ] 34 | MulBackward1 35 | 27 36 | [ CPUFloatType{} ] 37 | MeanBackward0 38 | false 39 | true 40 | SumBackward0 41 | 4.5000 4.5000 42 | 4.5000 4.5000 43 | [ CPUFloatType{2,2} ] 44 | 813.6625 45 | 1015.0142 46 | -664.8849 47 | [ CPUFloatType{3} ] 48 | MulBackward1 49 | 204.8000 50 | 2048.0000 51 | 0.2048 52 | [ CPUFloatType{3} ] 53 | true 54 | true 55 | false 56 | true 57 | false 58 | true 59 | 60 | ====== Running "Computing higher-order gradients in C++" ====== 61 | 0.0025 0.0946 0.1474 0.1387 62 | 0.0238 -0.0018 0.0259 0.0094 63 | 0.0513 -0.0549 -0.0604 0.0210 64 | [ CPUFloatType{3,4} ] 65 | 66 | ====== Running "Using custom autograd function in C++" ====== 67 | -3.5513 3.7160 3.6477 68 | -3.5513 3.7160 3.6477 69 | [ CPUFloatType{2,3} ] 70 | 0.3095 1.4035 -0.0349 71 | 0.3095 1.4035 -0.0349 72 | 0.3095 1.4035 -0.0349 73 | 0.3095 1.4035 -0.0349 74 | [ CPUFloatType{4,3} ] 75 | 5.5000 76 | 5.5000 77 | [ CPUFloatType{2} ] 78 | ``` 79 | -------------------------------------------------------------------------------- /cpp/custom-dataset/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.5) 2 | 3 | project(custom-dataset) 4 | set(CMAKE_CXX_STANDARD 17) 5 | 6 | find_package(Torch REQUIRED) 7 | find_package(OpenCV REQUIRED COMPONENTS core imgproc imgcodecs) 8 | 9 | message(STATUS "OpenCV include dirs: ${OpenCV_INCLUDE_DIRS}") 10 | message(STATUS "OpenCV libraries: ${OpenCV_LIBS}") 11 | 12 | 13 | include_directories(${OpenCV_INCLUDE_DIRS}) 14 | add_executable(${PROJECT_NAME} "custom-dataset.cpp") 15 | target_link_libraries(${PROJECT_NAME} 
"${OpenCV_LIBS}") 16 | target_link_libraries(${PROJECT_NAME} "${TORCH_LIBRARIES}") 17 | 18 | configure_file("info.txt" "info.txt" COPYONLY) 19 | 20 | # The following code block is suggested to be used on Windows. 21 | # According to https://github.com/pytorch/pytorch/issues/25457, 22 | # the DLLs need to be copied to avoid memory errors. 23 | if (MSVC) 24 | file(GLOB TORCH_DLLS "${TORCH_INSTALL_PREFIX}/lib/*.dll") 25 | add_custom_command(TARGET ${PROJECT_NAME} 26 | POST_BUILD 27 | COMMAND ${CMAKE_COMMAND} -E copy_if_different 28 | ${TORCH_DLLS} 29 | $) 30 | endif (MSVC) 31 | -------------------------------------------------------------------------------- /cpp/custom-dataset/README.md: -------------------------------------------------------------------------------- 1 | # Custom Dataset Example with the PyTorch C++ Frontend 2 | 3 | This folder contains an example of loading a custom image dataset with OpenCV and training a model to label images, using the PyTorch C++ frontend. 4 | 5 | The dataset used here is [Caltech 101](https://data.caltech.edu/records/mzrjq-6wc02) dataset. 6 | 7 | The entire training code is contained in custom-data.cpp. 8 | 9 | You can find instructions on how to install OpenCV [here](../tools/InstallingOpenCV.md). 10 | 11 | To build the code, run the following commands from your terminal: 12 | 13 | ```shell 14 | $ cd custom-dataset 15 | $ mkdir build 16 | $ cd build 17 | $ cmake -DCMAKE_PREFIX_PATH=/path/to/libtorch .. 18 | $ make 19 | ``` 20 | 21 | where /path/to/libtorch should be the path to the unzipped LibTorch distribution, which you can get from the [PyTorch homepage](https://pytorch.org/get-started/locally/). 22 | 23 | If you see an error like `undefined reference to cv::imread(std::string const&, int)` when running the `make` command, you should build LibTorch from source using the instructions [here](https://github.com/pytorch/pytorch#from-source), and then set `CMAKE_PREFIX_PATH` to that PyTorch source directory. An alternative solution is to use `libtorch-cxx11-abi-shared-with-deps` instead of `libtorch-shared-with-deps` as the latter is not compatible with openCV (reported [here](https://discuss.pytorch.org/t/library-conflict-between-libtorch-and-opencv/64489)). 24 | 25 | The build directory should look like this: 26 | 27 | ``` 28 | . 29 | ├── custom-dataset 30 | ├── dataset 31 | │   ├── accordion 32 | │   │   ├── image_0001.jpg 33 | │   │   ├── ... 34 | │   ├── airplanes 35 | │   │   ├── ... 36 | │   ├── ... 37 | ├── info.txt 38 | └── Makefile 39 | └── ... 40 | ``` 41 | 42 | `info.txt` file gets copied from source directory during build. 43 | 44 | Execute the compiled binary to train the model: 45 | 46 | ```shell 47 | ./custom-dataset 48 | Running on: CUDA 49 | Train Epoch: 1 16/7281 Loss: 0.314655 Acc: 0 50 | Train Epoch: 1 176/7281 Loss: 0.532111 Acc: 0.0681818 51 | Train Epoch: 1 336/7281 Loss: 0.538482 Acc: 0.0714286 52 | Train Epoch: 1 496/7281 Loss: 0.535302 Acc: 0.0705645 53 | Train Epoch: 1 656/7281 Loss: 0.536113 Acc: 0.0716463 54 | Train Epoch: 1 816/7281 Loss: 0.537626 Acc: 0.0784314 55 | Train Epoch: 1 976/7281 Loss: 0.537055 Acc: 0.079918 56 | ... 
57 | 58 | ``` -------------------------------------------------------------------------------- /cpp/dcgan/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.5) 2 | project(dcgan) 3 | 4 | find_package(Torch REQUIRED) 5 | 6 | option(DOWNLOAD_MNIST "Download the MNIST dataset from the internet" ON) 7 | if (DOWNLOAD_MNIST) 8 | message(STATUS "Downloading MNIST dataset") 9 | execute_process( 10 | COMMAND python ${CMAKE_CURRENT_LIST_DIR}/../tools/download_mnist.py 11 | -d ${CMAKE_BINARY_DIR}/data 12 | ERROR_VARIABLE DOWNLOAD_ERROR) 13 | if (DOWNLOAD_ERROR) 14 | message(FATAL_ERROR "Error downloading MNIST dataset: ${DOWNLOAD_ERROR}") 15 | endif() 16 | endif() 17 | 18 | add_executable(dcgan dcgan.cpp) 19 | target_link_libraries(dcgan "${TORCH_LIBRARIES}") 20 | set_property(TARGET dcgan PROPERTY CXX_STANDARD 17) 21 | 22 | if (MSVC) 23 | file(GLOB TORCH_DLLS "${TORCH_INSTALL_PREFIX}/lib/*.dll") 24 | add_custom_command(TARGET dcgan 25 | POST_BUILD 26 | COMMAND ${CMAKE_COMMAND} -E copy_if_different 27 | ${TORCH_DLLS} 28 | $) 29 | endif (MSVC) 30 | -------------------------------------------------------------------------------- /cpp/dcgan/README.md: -------------------------------------------------------------------------------- 1 | # DCGAN Example with the PyTorch C++ Frontend 2 | 3 | This folder contains an example of training a DCGAN to generate MNIST digits 4 | with the PyTorch C++ frontend. 5 | 6 | The entire training code is contained in `dcgan.cpp`. 7 | 8 | You can find the commands to install argparse [here](https://github.com/pytorch/examples/blob/main/.github/workflows/main_cpp.yml#L34). 9 | 10 | To build the code, run the following commands from your terminal: 11 | 12 | ```shell 13 | $ cd dcgan 14 | $ mkdir build 15 | $ cd build 16 | $ cmake -DCMAKE_PREFIX_PATH=/path/to/libtorch .. 17 | $ make 18 | ``` 19 | 20 | where `/path/to/libtorch` should be the path to the unzipped _LibTorch_ 21 | distribution, which you can get from the [PyTorch 22 | homepage](https://pytorch.org/get-started/locally/). 23 | 24 | Execute the compiled binary to train the model: 25 | 26 | ```shell 27 | $ ./dcgan 28 | [ 1/30][200/938] D_loss: 0.4953 | G_loss: 4.0195 29 | -> checkpoint 1 30 | [ 1/30][400/938] D_loss: 0.3610 | G_loss: 4.8148 31 | -> checkpoint 2 32 | [ 1/30][600/938] D_loss: 0.4072 | G_loss: 4.36760 33 | -> checkpoint 3 34 | [ 1/30][800/938] D_loss: 0.4444 | G_loss: 4.0250 35 | -> checkpoint 4 36 | [ 2/30][200/938] D_loss: 0.3761 | G_loss: 3.8790 37 | -> checkpoint 5 38 | [ 2/30][400/938] D_loss: 0.3977 | G_loss: 3.3315 39 | -> checkpoint 6 40 | [ 2/30][600/938] D_loss: 0.3815 | G_loss: 3.5696 41 | -> checkpoint 7 42 | [ 2/30][800/938] D_loss: 0.4039 | G_loss: 3.2759 43 | -> checkpoint 8 44 | [ 3/30][200/938] D_loss: 0.4236 | G_loss: 4.5132 45 | -> checkpoint 9 46 | [ 3/30][400/938] D_loss: 0.3645 | G_loss: 3.9759 47 | -> checkpoint 10 48 | ... 49 | ``` 50 | 51 | We can also specify the `--epochs` to change the number of epochs to train as follows: 52 | 53 | ```shell 54 | $ ./dcgan --epochs 10 55 | ``` 56 | Without specifying the `--epochs` flag, the default number of epochs to train is 30. 57 | 58 | 59 | The training script periodically generates image samples. Use the 60 | `display_samples.py` script situated in this folder to generate a plot image. 
61 | For example: 62 | 63 | ```shell 64 | $ python display_samples.py -i dcgan-sample-10.pt 65 | Saved out.png 66 | ``` 67 | -------------------------------------------------------------------------------- /cpp/dcgan/display_samples.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import unicode_literals 3 | 4 | import argparse 5 | 6 | import matplotlib.pyplot as plt 7 | import torch 8 | 9 | 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument("-i", "--sample-file", required=True) 12 | parser.add_argument("-o", "--out-file", default="out.png") 13 | parser.add_argument("-d", "--dimension", type=int, default=3) 14 | options = parser.parse_args() 15 | 16 | module = torch.jit.load(options.sample_file) 17 | images = list(module.parameters())[0] 18 | 19 | for index in range(options.dimension * options.dimension): 20 | image = images[index].detach().cpu().reshape(28, 28).mul(255).to(torch.uint8) 21 | array = image.numpy() 22 | axis = plt.subplot(options.dimension, options.dimension, 1 + index) 23 | plt.imshow(array, cmap="gray") 24 | axis.get_xaxis().set_visible(False) 25 | axis.get_yaxis().set_visible(False) 26 | 27 | plt.savefig(options.out_file) 28 | print("Saved ", options.out_file) 29 | -------------------------------------------------------------------------------- /cpp/distributed/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.5) 2 | project(dist-mnist) 3 | 4 | find_package(Torch REQUIRED) 5 | 6 | find_package(MPI REQUIRED) 7 | 8 | include_directories(SYSTEM ${MPI_C_INCLUDE_PATH} ${MPI_CXX_INCLUDE_PATH}) 9 | 10 | add_executable(dist-mnist dist-mnist.cpp) 11 | target_link_libraries(dist-mnist ${TORCH_LIBRARIES}) 12 | target_link_libraries(dist-mnist ${MPI_LIBRARIES}) 13 | target_link_libraries(dist-mnist ${CMAKE_PREFIX_PATH}/lib/libc10d.a) 14 | 15 | if(MPI_COMPILE_FLAGS) 16 | set_target_properties(dist-mnist PROPERTIES 17 | COMPILE_FLAGS "${MPI_COMPILE_FLAGS}") 18 | endif() 19 | 20 | if(MPI_LINK_FLAGS) 21 | set_target_properties(dist-mnist PROPERTIES 22 | LINK_FLAGS "${MPI_LINK_FLAGS}") 23 | endif() 24 | -------------------------------------------------------------------------------- /cpp/distributed/README.md: -------------------------------------------------------------------------------- 1 | # Distributed Training on MNIST using PyTorch C++ Frontend (Libtorch) 2 | 3 | This folder contains an example of data-parallel training of a convolutional neural network on the MNIST dataset. For parallelization, Message Passing Interface (MPI) is used. 4 | 5 | The entire code is contained in `dist-mnist.cpp`. 6 | 7 | You can find instructions on how to install MPI [here](https://www.open-mpi.org/faq/?category=building). This code was tested with Open MPI, but it should also run on other MPI implementations such as MPICH and MVAPICH. 8 | 9 | To build the code, run the following commands from the terminal: 10 | 11 | ```shell 12 | $ cd distributed 13 | $ mkdir build 14 | $ cd build 15 | $ cmake -DCMAKE_PREFIX_PATH=/path/to/libtorch .. 16 | $ make 17 | ``` 18 | 19 | where /path/to/libtorch should be the path to the unzipped LibTorch distribution. Note that the LibTorch from the [PyTorch homepage](https://pytorch.org/get-started/locally/) does not include MPI headers and cannot be used for this example.
You have to compile LibTorch manually; a set of guidelines is provided [here](https://gist.github.com/lasagnaphil/3e0099816837318e8e8bcab7edcfd5d9), though the exact steps may vary between systems. 20 | 21 | To run the code: 22 | 23 | ```shell 24 | mpirun -np {NUM-PROCS} ./dist-mnist 25 | ``` 26 | -------------------------------------------------------------------------------- /cpp/mnist/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.5) 2 | project(mnist) 3 | set(CMAKE_CXX_STANDARD 17) 4 | 5 | find_package(Torch REQUIRED) 6 | 7 | option(DOWNLOAD_MNIST "Download the MNIST dataset from the internet" ON) 8 | if (DOWNLOAD_MNIST) 9 | message(STATUS "Downloading MNIST dataset") 10 | execute_process( 11 | COMMAND python ${CMAKE_CURRENT_LIST_DIR}/../tools/download_mnist.py 12 | -d ${CMAKE_BINARY_DIR}/data 13 | ERROR_VARIABLE DOWNLOAD_ERROR) 14 | if (DOWNLOAD_ERROR) 15 | message(FATAL_ERROR "Error downloading MNIST dataset: ${DOWNLOAD_ERROR}") 16 | endif() 17 | endif() 18 | 19 | add_executable(mnist mnist.cpp) 20 | target_compile_features(mnist PUBLIC cxx_range_for) 21 | target_link_libraries(mnist ${TORCH_LIBRARIES}) 22 | 23 | if (MSVC) 24 | file(GLOB TORCH_DLLS "${TORCH_INSTALL_PREFIX}/lib/*.dll") 25 | add_custom_command(TARGET mnist 26 | POST_BUILD 27 | COMMAND ${CMAKE_COMMAND} -E copy_if_different 28 | ${TORCH_DLLS} 29 | $<TARGET_FILE_DIR:mnist>) 30 | endif (MSVC) 31 | -------------------------------------------------------------------------------- /cpp/mnist/README.md: -------------------------------------------------------------------------------- 1 | # MNIST Example with the PyTorch C++ Frontend 2 | 3 | This folder contains an example of training a computer vision model to recognize 4 | digits in images from the MNIST dataset, using the PyTorch C++ frontend. 5 | 6 | The entire training code is contained in `mnist.cpp`. 7 | 8 | To build the code, run the following commands from your terminal: 9 | 10 | ```shell 11 | $ cd mnist 12 | $ mkdir build 13 | $ cd build 14 | $ cmake -DCMAKE_PREFIX_PATH=/path/to/libtorch .. 15 | $ make 16 | ``` 17 | 18 | where `/path/to/libtorch` should be the path to the unzipped _LibTorch_ 19 | distribution, which you can get from the [PyTorch 20 | homepage](https://pytorch.org/get-started/locally/). 21 | 22 | Execute the compiled binary to train the model: 23 | 24 | ```shell 25 | $ ./mnist 26 | Train Epoch: 1 [59584/60000] Loss: 0.4232 27 | Test set: Average loss: 0.1989 | Accuracy: 0.940 28 | Train Epoch: 2 [59584/60000] Loss: 0.1926 29 | Test set: Average loss: 0.1338 | Accuracy: 0.959 30 | Train Epoch: 3 [59584/60000] Loss: 0.1390 31 | Test set: Average loss: 0.0997 | Accuracy: 0.969 32 | Train Epoch: 4 [59584/60000] Loss: 0.1239 33 | Test set: Average loss: 0.0875 | Accuracy: 0.972 34 | ... 35 | ``` 36 | -------------------------------------------------------------------------------- /cpp/regression/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.5) 2 | 3 | project(regression) 4 | set(CMAKE_CXX_STANDARD 17) 5 | 6 | find_package(Torch REQUIRED) 7 | 8 | add_executable(${PROJECT_NAME} "regression.cpp") 9 | target_link_libraries(${PROJECT_NAME} "${TORCH_LIBRARIES}") 10 | 11 | # The following code block is suggested to be used on Windows. 12 | # According to https://github.com/pytorch/pytorch/issues/25457, 13 | # the DLLs need to be copied to avoid memory errors.
14 | if (MSVC) 15 | file(GLOB TORCH_DLLS "${TORCH_INSTALL_PREFIX}/lib/*.dll") 16 | add_custom_command(TARGET ${PROJECT_NAME} 17 | POST_BUILD 18 | COMMAND ${CMAKE_COMMAND} -E copy_if_different 19 | ${TORCH_DLLS} 20 | $<TARGET_FILE_DIR:${PROJECT_NAME}>) 21 | endif (MSVC) 22 | -------------------------------------------------------------------------------- /cpp/regression/README.md: -------------------------------------------------------------------------------- 1 | # Linear regression example 2 | 3 | Trains a single fully-connected layer to fit a 4th degree polynomial. 4 | 5 | To build the code, run the following commands from your terminal: 6 | 7 | ```shell 8 | $ cd regression 9 | $ mkdir build 10 | $ cd build 11 | $ cmake -DCMAKE_PREFIX_PATH=/path/to/libtorch .. 12 | $ make 13 | ``` 14 | 15 | where `/path/to/libtorch` should be the path to the unzipped _LibTorch_ 16 | distribution, which you can get from the [PyTorch 17 | homepage](https://pytorch.org/get-started/locally/). 18 | 19 | Execute the compiled binary to run: 20 | 21 | ```shell 22 | $ ./regression 23 | Loss: 0.000301158 after 584 batches 24 | ==> Learned function: y = 11.6441 x^4 -3.10164 x^3 2.19786 x^2 -3.83606 x^1 + 4.37066 25 | ==> Actual function: y = 11.669 x^4 -3.16023 x^3 2.19182 x^2 -3.81505 x^1 + 4.38219 26 | ... 27 | ``` 28 | -------------------------------------------------------------------------------- /cpp/regression/regression.cpp: -------------------------------------------------------------------------------- 1 | #include <torch/torch.h> 2 | #include <iostream> 3 | #include <sstream> 4 | #include <string> 5 | #include <vector> 6 | 7 | #define POLY_DEGREE 4 8 | 9 | // Builds features i.e. a matrix with columns [x, x^2, x^3, x^4]. 10 | torch::Tensor make_features(torch::Tensor x) { 11 | x = x.unsqueeze(1); 12 | std::vector<torch::Tensor> xs; 13 | for (int64_t i = 0; i < POLY_DEGREE; ++i) 14 | xs.push_back(x.pow(i + 1)); 15 | return torch::cat(xs, 1); 16 | } 17 | 18 | // Approximated function. 19 | torch::Tensor f( 20 | torch::Tensor x, 21 | torch::Tensor W_target, 22 | torch::Tensor b_target) { 23 | return x.mm(W_target) + b_target.item(); 24 | } 25 | 26 | // Creates a string description of a polynomial. 27 | std::string poly_desc(torch::Tensor W, torch::Tensor b) { 28 | auto size = W.size(0); 29 | std::ostringstream stream; 30 | 31 | stream << "y = "; 32 | for (int64_t i = 0; i < size; ++i) 33 | stream << W[i].item<float>() << " x^" << size - i << " "; 34 | stream << "+ " << b[0].item<float>(); 35 | return stream.str(); 36 | } 37 | 38 | // Builds a batch i.e. (x, f(x)) pair.
39 | std::pair<torch::Tensor, torch::Tensor> get_batch( 40 | torch::Tensor W_target, 41 | torch::Tensor b_target, 42 | int64_t batch_size = 32) { 43 | auto random = torch::randn({batch_size}); 44 | auto x = make_features(random); 45 | auto y = f(x, W_target, b_target); 46 | return std::make_pair(x, y); 47 | } 48 | 49 | int main() { 50 | auto W_target = torch::randn({POLY_DEGREE, 1}) * 5; 51 | auto b_target = torch::randn({1}) * 5; 52 | 53 | // Define the model and optimizer 54 | auto fc = torch::nn::Linear(W_target.size(0), 1); 55 | torch::optim::SGD optim(fc->parameters(), .1); 56 | 57 | float loss = 0; 58 | int64_t batch_idx = 0; 59 | 60 | while (++batch_idx) { 61 | // Get data 62 | torch::Tensor batch_x, batch_y; 63 | std::tie(batch_x, batch_y) = get_batch(W_target, b_target); 64 | 65 | // Reset gradients 66 | optim.zero_grad(); 67 | 68 | // Forward pass 69 | auto output = torch::smooth_l1_loss(fc(batch_x), batch_y); 70 | loss = output.item<float>(); 71 | 72 | // Backward pass 73 | output.backward(); 74 | 75 | // Apply gradients 76 | optim.step(); 77 | 78 | // Stop criterion 79 | if (loss < 1e-3f) 80 | break; 81 | } 82 | 83 | std::cout << "Loss: " << loss << " after " << batch_idx << " batches" 84 | << std::endl; 85 | std::cout << "==> Learned function:\t" 86 | << poly_desc(fc->weight.view({-1}), fc->bias) << std::endl; 87 | std::cout << "==> Actual function:\t" 88 | << poly_desc(W_target.view({-1}), b_target) << std::endl; 89 | 90 | return 0; 91 | } 92 | -------------------------------------------------------------------------------- /cpp/tools/InstallingOpenCV.md: -------------------------------------------------------------------------------- 1 | # Installing OpenCV 2 | 3 | ## Linux with Package Manager 4 | 5 | ### Arch Linux 6 | 7 | ```shell 8 | pacman -Syu base-devel opencv 9 | ``` 10 | 11 | ### Fedora 12 | 13 | ```shell 14 | sudo dnf install opencv opencv-devel 15 | ``` 16 | 17 | ## Linux From Source 18 | 19 | Required Packages: 20 | 21 | ```shell 22 | sudo apt-get install build-essential cmake git libgtk2.0-dev pkg-config libavcodec-dev libavformat-dev libswscale-dev 23 | ``` 24 | 25 | Optional Packages: 26 | 27 | ```shell 28 | sudo apt-get install python-dev python-numpy libtbb2 libtbb-dev libjpeg-dev libpng-dev libtiff-dev libjasper-dev libdc1394-22-dev 29 | ``` 30 | 31 | Building from Source: 32 | 33 | ```shell 34 | git clone https://github.com/opencv/opencv.git 35 | git clone https://github.com/opencv/opencv_contrib.git 36 | 37 | cd opencv && mkdir build && cd build 38 | cmake -D CMAKE_BUILD_TYPE=Release -D CMAKE_INSTALL_PREFIX=/usr/local .. 39 | make -j8 # runs 8 jobs in parallel 40 | sudo make install 41 | ``` 42 | 43 | ## Windows 44 | 45 | You can download the pre-built libraries from [OpenCV releases](https://github.com/opencv/opencv/releases) and install them easily.
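Whichever installation route you take, it is worth confirming that the OpenCV development files are discoverable before configuring the C++ examples that depend on them. A minimal sanity check, assuming the install produced the `opencv4.pc` pkg-config file (package-manager installs normally ship it; source builds need `-DOPENCV_GENERATE_PKGCONFIG=ON` added to the `cmake` line above):

```shell
# Prints the installed OpenCV version, e.g. 4.x.y.
# A "not found" error means the development headers/metadata are not on the search path.
pkg-config --modversion opencv4
```

If the check fails even though OpenCV is installed, you can instead point CMake directly at the install by passing `-DOpenCV_DIR=/path/to/dir/containing/OpenCVConfig.cmake` when configuring the example.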
46 | -------------------------------------------------------------------------------- /cpp/tools/download_mnist.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | 4 | import argparse 5 | import gzip 6 | import os 7 | import sys 8 | import urllib 9 | 10 | try: 11 | from urllib.error import URLError 12 | from urllib.request import urlretrieve 13 | except ImportError: 14 | from urllib2 import URLError 15 | from urllib import urlretrieve 16 | 17 | RESOURCES = [ 18 | 'train-images-idx3-ubyte.gz', 19 | 'train-labels-idx1-ubyte.gz', 20 | 't10k-images-idx3-ubyte.gz', 21 | 't10k-labels-idx1-ubyte.gz', 22 | ] 23 | 24 | 25 | def report_download_progress(chunk_number, chunk_size, file_size): 26 | if file_size != -1: 27 | percent = min(1, (chunk_number * chunk_size) / file_size) 28 | bar = '#' * int(64 * percent) 29 | sys.stdout.write('\r0% |{:<64}| {}%'.format(bar, int(percent * 100))) 30 | 31 | 32 | def download(destination_path, url, quiet): 33 | if os.path.exists(destination_path): 34 | if not quiet: 35 | print('{} already exists, skipping ...'.format(destination_path)) 36 | else: 37 | print('Downloading {} ...'.format(url)) 38 | try: 39 | hook = None if quiet else report_download_progress 40 | urlretrieve(url, destination_path, reporthook=hook) 41 | except URLError: 42 | raise RuntimeError('Error downloading resource!') 43 | finally: 44 | if not quiet: 45 | # Just a newline. 46 | print() 47 | 48 | 49 | def unzip(zipped_path, quiet): 50 | unzipped_path = os.path.splitext(zipped_path)[0] 51 | if os.path.exists(unzipped_path): 52 | if not quiet: 53 | print('{} already exists, skipping ... '.format(unzipped_path)) 54 | return 55 | with gzip.open(zipped_path, 'rb') as zipped_file: 56 | with open(unzipped_path, 'wb') as unzipped_file: 57 | unzipped_file.write(zipped_file.read()) 58 | if not quiet: 59 | print('Unzipped {} ...'.format(zipped_path)) 60 | 61 | 62 | def main(): 63 | parser = argparse.ArgumentParser( 64 | description='Download the MNIST dataset from the internet') 65 | parser.add_argument( 66 | '-d', '--destination', default='.', help='Destination directory') 67 | parser.add_argument( 68 | '-q', 69 | '--quiet', 70 | action='store_true', 71 | help="Don't report about progress") 72 | options = parser.parse_args() 73 | 74 | if not os.path.exists(options.destination): 75 | os.makedirs(options.destination) 76 | 77 | try: 78 | for resource in RESOURCES: 79 | path = os.path.join(options.destination, resource) 80 | # url = 'http://yann.lecun.com/exdb/mnist/{}'.format(resource) 81 | url = 'https://ossci-datasets.s3.amazonaws.com/mnist/{}'.format(resource) 82 | download(path, url, options.quiet) 83 | unzip(path, options.quiet) 84 | except KeyboardInterrupt: 85 | print('Interrupted') 86 | 87 | 88 | if __name__ == '__main__': 89 | main() 90 | -------------------------------------------------------------------------------- /cpp/transfer-learning/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.5) 2 | project(example) 3 | 4 | find_package(Torch REQUIRED) 5 | find_package(OpenCV 4.1.0 REQUIRED) 6 | 7 | include_directories(${OpenCV_INCLUDE_DIRS}) 8 | 9 | add_executable(example main.cpp main.h) 10 | add_executable(classify classify.cpp) 11 | 12 | target_link_libraries(example ${OpenCV_LIBS}) 13 | target_link_libraries(example "${TORCH_LIBRARIES}") 14 | target_link_libraries(classify ${OpenCV_LIBS}) 15 | 
target_link_libraries(classify "${TORCH_LIBRARIES}") 16 | 17 | set_property(TARGET classify PROPERTY CXX_STANDARD 17) 18 | set_property(TARGET example PROPERTY CXX_STANDARD 17) 19 | -------------------------------------------------------------------------------- /cpp/transfer-learning/README.md: -------------------------------------------------------------------------------- 1 | # Transfer Learning on Dogs vs Cats Dataset using Libtorch and OpenCV 2 | 3 | Transfer Learning on Dogs vs Cats dataset using PyTorch C++ API. 4 | 5 | ## Usage 6 | 7 | For **training**: 8 | 9 | 1. Remove final layer of `ResNet18` pre-trained model and convert to `torch.jit` module: `python3 convert.py`. 10 | 2. Create build directory: `mkdir build && cd build` 11 | 3. `cmake -DCMAKE_PREFIX_PATH=/absolute/path/to/libtorch ..` 12 | 4. `make` 13 | 5. Run training code: `./example ` 14 | 15 | For **prediction**: 16 | 17 | 1. `cd build` 18 | 2. `./classify ` : `./classify ../resnet18_without_last_layer.pt model_linear.pt` 19 | 20 | Detailed blog on applying Transfer Learning using Libtorch: https://krshrimali.github.io/Applying-Transfer-Learning-Dogs-Cats/. 21 | -------------------------------------------------------------------------------- /cpp/transfer-learning/classify.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // classify.cpp 3 | // transfer-learning 4 | // 5 | // Created by Kushashwa Ravi Shrimali on 15/08/19. 6 | // 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | // Utility function to load image from given folder 15 | // File type accepted: .jpg 16 | std::vector load_images(std::string folder_name) { 17 | std::vector list_images; 18 | std::string base_name = folder_name; 19 | DIR* dir; 20 | struct dirent *ent; 21 | if((dir = opendir(base_name.c_str())) != NULL) { 22 | while((ent = readdir(dir)) != NULL) { 23 | std::string filename = ent->d_name; 24 | if(filename.length() > 4 && filename.substr(filename.length() - 3) == "jpg") { 25 | std::string newf = base_name + filename; 26 | list_images.push_back(newf); 27 | } 28 | } 29 | } 30 | return list_images; 31 | } 32 | 33 | void print_probabilities(std::string loc, std::string model_path, std::string model_path_linear) { 34 | // Load image with OpenCV. 35 | cv::Mat img = cv::imread(loc); 36 | cv::resize(img, img, cv::Size(224, 224), cv::INTER_CUBIC); 37 | // Convert the image and label to a tensor. 38 | torch::Tensor img_tensor = torch::from_blob(img.data, {1, img.rows, img.cols, 3}, torch::kByte); 39 | img_tensor = img_tensor.permute({0, 3, 1, 2}); // convert to CxHxW 40 | img_tensor = img_tensor.to(torch::kF32); 41 | 42 | // Load the model. 43 | torch::jit::script::Module model; 44 | model = torch::jit::load(model_path); 45 | 46 | torch::nn::Linear model_linear(512, 2); 47 | torch::load(model_linear, model_path_linear); 48 | 49 | // Predict the probabilities for the classes. 50 | std::vector input; 51 | input.push_back(img_tensor); 52 | torch::Tensor prob = model.forward(input).toTensor(); 53 | prob = prob.view({prob.size(0), -1}); 54 | prob = model_linear(prob); 55 | 56 | std::cout << "Printing for location: " << loc << std::endl; 57 | std::cout << "Cat prob: " << *(prob.data())*100. << std::endl; 58 | std::cout << "Dog prob: " << *(prob.data()+1)*100. 
<< std::endl; 59 | } 60 | 61 | int main(int argc, char** argv) 62 | { 63 | // argv[1] is the path to the test image 64 | std::string location = argv[1]; 65 | 66 | // argv[2] contains the pre-trained model without the last layer 67 | // argv[3] contains the trained last FC layer 68 | std::string model_path = argv[2]; 69 | std::string model_path_linear = argv[3]; 70 | 71 | // Load the model. 72 | // You can also use: auto model = torch::jit::load(model_path); 73 | torch::jit::script::Module model = torch::jit::load(model_path); 74 | 75 | // Print probabilities for dog and cat classes 76 | print_probabilities(location, model_path, model_path_linear); 77 | return 0; 78 | } 79 | -------------------------------------------------------------------------------- /cpp/transfer-learning/convert.py: -------------------------------------------------------------------------------- 1 | """ 2 | This Python script converts the pre-trained network into a TorchScript module. 3 | """ 4 | import torch 5 | from torchvision import models 6 | 7 | # Download and load the pre-trained model 8 | model = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1) 9 | 10 | # Freeze the pre-trained parameters (no gradient updates needed) 11 | for param in model.parameters(): 12 | param.requires_grad = False 13 | 14 | # Save the model except the final FC Layer 15 | resnet18 = torch.nn.Sequential(*list(model.children())[:-1]) 16 | 17 | example_input = torch.rand(1, 3, 224, 224) 18 | script_module = torch.jit.trace(resnet18, example_input) 19 | script_module.save('resnet18_without_last_layer.pt') 20 | -------------------------------------------------------------------------------- /cpp/transfer-learning/main.h: -------------------------------------------------------------------------------- 1 | // 2 | // main.h 3 | // transfer-learning 4 | // 5 | // Created by Kushashwa Ravi Shrimali on 15/08/19.
6 | // 7 | 8 | #ifndef main_h 9 | #define main_h 10 | 11 | #include <torch/torch.h> 12 | #include <torch/script.h> 13 | #include <opencv2/opencv.hpp> 14 | #include <dirent.h> 15 | #include <iostream> 16 | 17 | // Function to return image read at location given as type torch::Tensor 18 | // Resizes image to (224, 224, 3) 19 | torch::Tensor read_data(std::string location); 20 | 21 | // Function to return label from int (0, 1 for binary and 0, 1, ..., n-1 for n-class classification) as type torch::Tensor 22 | torch::Tensor read_label(int label); 23 | 24 | // Function returns vector of tensors (images) read from the list of images in a folder 25 | std::vector<torch::Tensor> process_images(std::vector<std::string> list_images); 26 | 27 | // Function returns vector of tensors (labels) read from the list of labels 28 | std::vector<torch::Tensor> process_labels(std::vector<int> list_labels); 29 | 30 | // Function to load data from given folder(s) name(s) (folders_name) 31 | // Returns pair of vectors of string (image locations) and int (respective labels) 32 | std::pair<std::vector<std::string>, std::vector<int>> load_data_from_folder(std::vector<std::string> folders_name); 33 | 34 | // Function to train the network on train data 35 | template<typename Dataloader> 36 | void train(torch::jit::script::Module net, torch::nn::Linear lin, Dataloader& data_loader, torch::optim::Optimizer& optimizer, size_t dataset_size); 37 | 38 | // Function to test the network on test data 39 | template<typename Dataloader> 40 | void test(torch::jit::script::Module network, torch::nn::Linear lin, Dataloader& loader, size_t data_size); 41 | 42 | // Custom Dataset class 43 | class CustomDataset : public torch::data::Dataset<CustomDataset> { 44 | private: 45 | /* data */ 46 | // Should be 2 tensors 47 | std::vector<torch::Tensor> states, labels; 48 | size_t ds_size; 49 | public: 50 | CustomDataset(std::vector<std::string> list_images, std::vector<int> list_labels) { 51 | states = process_images(list_images); 52 | labels = process_labels(list_labels); 53 | ds_size = states.size(); 54 | }; 55 | 56 | torch::data::Example<> get(size_t index) override { 57 | /* This should return {torch::Tensor, torch::Tensor} */ 58 | torch::Tensor sample_img = states.at(index); 59 | torch::Tensor sample_label = labels.at(index); 60 | return {sample_img.clone(), sample_label.clone()}; 61 | }; 62 | 63 | torch::optional<size_t> size() const override { 64 | return ds_size; 65 | }; 66 | }; 67 | 68 | #endif /* main_h */ 69 | -------------------------------------------------------------------------------- /dcgan/.gitignore: -------------------------------------------------------------------------------- 1 | lsun 2 | -------------------------------------------------------------------------------- /dcgan/README.md: -------------------------------------------------------------------------------- 1 | # Deep Convolutional Generative Adversarial Networks 2 | 3 | This example implements the paper [Unsupervised Representation Learning with Deep Convolutional Generative Adversarial Networks](http://arxiv.org/abs/1511.06434) 4 | 5 | The implementation is very close to the Torch implementation [dcgan.torch](https://github.com/soumith/dcgan.torch) 6 | 7 | After every 100 training iterations, the files `real_samples.png` and `fake_samples.png` are written to disk 8 | with the samples from the generative model.
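A minimal sketch (not the exact code in `main.py`; `netG` below is only a stand-in generator) of how such sample grids can be written to disk with `torchvision.utils.save_image`:

```python
import torch
import torch.nn as nn
import torchvision.utils as vutils

# stand-in generator; the real netG is built in main.py from the --nz/--ngf options
netG = nn.Sequential(nn.ConvTranspose2d(100, 3, 64), nn.Tanh())

fixed_noise = torch.randn(64, 100, 1, 1)
fake = netG(fixed_noise)
vutils.save_image(fake.detach(), 'fake_samples.png', normalize=True)
```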
9 | 10 | After every epoch, models are saved to: `netG_epoch_%d.pth` and `netD_epoch_%d.pth` 11 | 12 | ## Downloading the dataset 13 | 14 | You can download the LSUN dataset by cloning [this repo](https://github.com/fyu/lsun) and running 15 | 16 | ``` 17 | python download.py -c bedroom 18 | ``` 19 | 20 | ## Usage 21 | 22 | ``` 23 | usage: main.py [-h] --dataset DATASET --dataroot DATAROOT [--workers WORKERS] 24 | [--batchSize BATCHSIZE] [--imageSize IMAGESIZE] [--nz NZ] 25 | [--ngf NGF] [--ndf NDF] [--niter NITER] [--lr LR] 26 | [--beta1 BETA1] [--cuda] [--ngpu NGPU] [--netG NETG] 27 | [--netD NETD] [--mps] 28 | 29 | optional arguments: 30 | -h, --help show this help message and exit 31 | --dataset DATASET cifar10 | lsun | mnist |imagenet | folder | lfw | fake 32 | --dataroot DATAROOT path to dataset 33 | --workers WORKERS number of data loading workers 34 | --batchSize BATCHSIZE input batch size 35 | --imageSize IMAGESIZE the height / width of the input image to network 36 | --nz NZ size of the latent z vector 37 | --ngf NGF number of filters in the generator 38 | --ndf NDF number of filters in the discriminator 39 | --niter NITER number of epochs to train for 40 | --lr LR learning rate, default=0.0002 41 | --beta1 BETA1 beta1 for adam. default=0.5 42 | --cuda enables cuda 43 | --mps enables macOS GPU 44 | --ngpu NGPU number of GPUs to use 45 | --netG NETG path to netG (to continue training) 46 | --netD NETD path to netD (to continue training) 47 | --outf OUTF folder to output images and model checkpoints 48 | --manualSeed SEED manual seed 49 | --classes CLASSES comma separated list of classes for the lsun data set 50 | ``` 51 | -------------------------------------------------------------------------------- /dcgan/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | torchvision==0.20.0 3 | lmdb 4 | -------------------------------------------------------------------------------- /distributed/FSDP/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | *.pt 3 | *.csv -------------------------------------------------------------------------------- /distributed/FSDP/README.md: -------------------------------------------------------------------------------- 1 | ## FSDP T5 2 | 3 | To run the T5 example with FSDP for text summarization: 4 | 5 | ## Get the wikihow dataset 6 | ```bash 7 | 8 | sh download_dataset.sh 9 | 10 | ``` 11 | 12 | ## Install the requirements: 13 | ~~~ 14 | pip install -r requirements.txt 15 | ~~~ 16 | ## Ensure you are running a recent version of PyTorch: 17 | see https://pytorch.org to install at least 1.12 and ideally a current nightly build. 
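For orientation, a minimal sketch (assuming a `torchrun` launch so the process-group environment variables are set) of the core FSDP wrapping this example performs; the real setup in `T5_training.py` additionally applies the mixed-precision and activation-checkpointing policies defined under `policies/`:

```python
import functools
import os

import torch
import torch.distributed as dist
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy
from transformers import T5ForConditionalGeneration
from transformers.models.t5.modeling_t5 import T5Block

dist.init_process_group("nccl")
torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))

# shard each T5Block into its own FSDP unit (see policies/wrapping.py)
wrap_policy = functools.partial(transformer_auto_wrap_policy, transformer_layer_cls={T5Block})
model = T5ForConditionalGeneration.from_pretrained("t5-base").cuda()
model = FSDP(model, auto_wrap_policy=wrap_policy)
```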
18 | 19 | Start the training with Torchrun (adjust nproc_per_node to your GPU count): 20 | 21 | ``` 22 | torchrun --nnodes 1 --nproc_per_node 4 T5_training.py 23 | 24 | ``` 25 | -------------------------------------------------------------------------------- /distributed/FSDP/configs/__init__.py: -------------------------------------------------------------------------------- 1 | from .fsdp import fsdp_config 2 | from .training import train_config 3 | -------------------------------------------------------------------------------- /distributed/FSDP/configs/fsdp.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import ClassVar 3 | from torch.distributed.fsdp import ShardingStrategy 4 | from torch.distributed.fsdp.fully_sharded_data_parallel import StateDictType 5 | 6 | @dataclass 7 | class fsdp_config: 8 | mixed_precision: bool=True 9 | use_fp16: bool=False 10 | seed: int=42 11 | fsdp_activation_checkpointing: bool=False 12 | limit_all_gathers: bool=True 13 | sharding_strategy: ShardingStrategy = ShardingStrategy.FULL_SHARD #HYBRID_SHARD, SHARD_GRAD_OP 14 | checkpoint_type: StateDictType = StateDictType.FULL_STATE_DICT # alternatively can use SHARDED_STATE_DICT to avoid OOMs 15 | save_optimizer: bool=False 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /distributed/FSDP/configs/training.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import ClassVar 3 | 4 | 5 | @dataclass 6 | class train_config: 7 | model_name: str="t5-base" 8 | run_validation: bool=True 9 | batch_size_training: int=4 10 | num_workers_dataloader: int=2 11 | lr: float=0.002 12 | weight_decay: float=0.0 13 | gamma: float= 0.85 14 | use_fp16: bool=False 15 | mixed_precision: bool=True 16 | save_model: bool=False 17 | 18 | 19 | -------------------------------------------------------------------------------- /distributed/FSDP/download_dataset.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Create the "data" folder if it doesn't exist 4 | mkdir -p data 5 | 6 | # Download the files into the "data" folder 7 | wget -P data https://public-nlp-datasets.s3.us-west-2.amazonaws.com/wikihowAll.csv 8 | wget -P data https://public-nlp-datasets.s3.us-west-2.amazonaws.com/wikihowSep.csv 9 | -------------------------------------------------------------------------------- /distributed/FSDP/model_checkpointing/__init__.py: -------------------------------------------------------------------------------- 1 | from .checkpoint_handler import ( 2 | load_model_checkpoint, 3 | save_model_checkpoint, 4 | save_distributed_model_checkpoint, 5 | load_distributed_model_checkpoint, 6 | load_optimizer_checkpoint, 7 | save_optimizer_checkpoint, 8 | save_model_and_optimizer_sharded, 9 | load_model_sharded, 10 | ) 11 | -------------------------------------------------------------------------------- /distributed/FSDP/policies/__init__.py: -------------------------------------------------------------------------------- 1 | from .mixed_precision import * 2 | from .wrapping import * 3 | from .activation_checkpointing_functions import apply_fsdp_checkpointing 4 | -------------------------------------------------------------------------------- /distributed/FSDP/policies/activation_checkpointing_functions.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | import os 3 | import torch.distributed as dist 4 | from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import ( 5 | checkpoint_wrapper, 6 | CheckpointImpl, 7 | apply_activation_checkpointing, 8 | ) 9 | 10 | from transformers.models.t5.modeling_t5 import T5Block 11 | 12 | from functools import partial 13 | 14 | non_reentrant_wrapper = partial( 15 | checkpoint_wrapper, 16 | offload_to_cpu=False, 17 | checkpoint_impl=CheckpointImpl.NO_REENTRANT, 18 | ) 19 | 20 | check_fn = lambda submodule: isinstance(submodule, T5Block) 21 | 22 | 23 | def apply_fsdp_checkpointing(model): 24 | """apply activation checkpointing to model 25 | returns None as model is updated directly 26 | """ 27 | print(f"--> applying fdsp activation checkpointing...") 28 | 29 | apply_activation_checkpointing( 30 | model, checkpoint_wrapper_fn=non_reentrant_wrapper, check_fn=check_fn 31 | ) 32 | -------------------------------------------------------------------------------- /distributed/FSDP/policies/mixed_precision.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from torch.distributed.fsdp import ( 4 | # FullyShardedDataParallel as FSDP, 5 | # CPUOffload, 6 | MixedPrecision, 7 | # BackwardPrefetch, 8 | # ShardingStrategy, 9 | ) 10 | 11 | # requires grad scaler in main loop 12 | fpSixteen = MixedPrecision( 13 | param_dtype=torch.float16, 14 | # Gradient communication precision. 15 | reduce_dtype=torch.float16, 16 | # Buffer precision. 17 | buffer_dtype=torch.float16, 18 | ) 19 | 20 | bfSixteen = MixedPrecision( 21 | param_dtype=torch.bfloat16, 22 | # Gradient communication precision. 23 | reduce_dtype=torch.bfloat16, 24 | # Buffer precision. 
25 | buffer_dtype=torch.bfloat16, 26 | ) 27 | 28 | bfSixteen_working = MixedPrecision( 29 | param_dtype=torch.float32, 30 | reduce_dtype=torch.bfloat16, 31 | buffer_dtype=torch.bfloat16, 32 | ) 33 | 34 | fp32_policy = MixedPrecision( 35 | param_dtype=torch.float32, 36 | reduce_dtype=torch.float32, 37 | buffer_dtype=torch.float32, 38 | ) 39 | -------------------------------------------------------------------------------- /distributed/FSDP/policies/wrapping.py: -------------------------------------------------------------------------------- 1 | # holds various wrapping policies for fsdp 2 | 3 | 4 | import torch.distributed as dist 5 | import torch.nn as nn 6 | import torch 7 | 8 | from transformers.models.t5.modeling_t5 import T5Block 9 | 10 | from torch.distributed.fsdp.fully_sharded_data_parallel import ( 11 | FullyShardedDataParallel as FSDP, 12 | CPUOffload, 13 | BackwardPrefetch, 14 | MixedPrecision, 15 | ) 16 | from torch.distributed.fsdp.wrap import ( 17 | transformer_auto_wrap_policy, 18 | size_based_auto_wrap_policy, 19 | enable_wrap, 20 | wrap, 21 | ) 22 | 23 | import functools 24 | from typing import Type 25 | 26 | 27 | def get_size_policy(min_params=1e8): 28 | num_wrap_policy = functools.partial( 29 | size_based_auto_wrap_policy, min_num_params=min_params 30 | ) 31 | return num_wrap_policy 32 | 33 | 34 | def get_t5_wrapper(): 35 | """we register our main layer class and use the fsdp transformer wrapping policy 36 | ensures embedding layers are in the root fsdp unit for shared access and that fsdp units map to transformer layers 37 | """ 38 | # ==== use new transformer wrapper 39 | 40 | t5_auto_wrap_policy = functools.partial( 41 | transformer_auto_wrap_policy, 42 | transformer_layer_cls={ 43 | T5Block, 44 | }, 45 | ) 46 | 47 | return t5_auto_wrap_policy 48 | -------------------------------------------------------------------------------- /distributed/FSDP/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers 2 | datasets 3 | tqdm 4 | protobuf 5 | SentencePiece 6 | nlp 7 | -------------------------------------------------------------------------------- /distributed/FSDP/summarization_dataset.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import glob 3 | import os 4 | import json 5 | import time 6 | import logging 7 | import random 8 | import re 9 | from itertools import chain 10 | from string import punctuation 11 | 12 | import pandas as pd 13 | import numpy as np 14 | import torch 15 | from torch.utils.data import Dataset, DataLoader 16 | 17 | from nlp import load_dataset 18 | 19 | from transformers import ( 20 | AdamW, 21 | T5ForConditionalGeneration, 22 | T5Tokenizer, 23 | get_linear_schedule_with_warmup 24 | ) 25 | 26 | class wikihow(Dataset): 27 | def __init__(self, tokenizer, type_path, num_samples, input_length, output_length, print_text=False): 28 | self.dataset = load_dataset('wikihow', 'all', data_dir='data/', split=type_path) 29 | if num_samples: 30 | self.dataset = self.dataset.select(list(range(0, num_samples))) 31 | self.input_length = input_length 32 | self.tokenizer = tokenizer 33 | self.output_length = output_length 34 | self.print_text = print_text 35 | 36 | def __len__(self): 37 | return self.dataset.shape[0] 38 | 39 | def clean_text(self, text): 40 | text = text.replace('Example of text:', '') 41 | text = text.replace('Example of Summary:', '') 42 | text = text.replace('\n','') 43 | text = text.replace('``', '') 44 | text = 
text.replace('"', '') 45 | 46 | return text 47 | 48 | 49 | def convert_to_features(self, example_batch): 50 | # Tokenize contexts and questions (as pairs of inputs) 51 | 52 | if self.print_text: 53 | print("Input Text: ", self.clean_text(example_batch['text'])) 54 | # input_ = self.clean_text(example_batch['text']) + " " 55 | # target_ = self.clean_text(example_batch['headline']) + " " 56 | 57 | input_ = self.clean_text(example_batch['text']) 58 | target_ = self.clean_text(example_batch['headline']) 59 | 60 | source = self.tokenizer.batch_encode_plus([input_], max_length=self.input_length, 61 | padding='max_length', truncation=True, return_tensors="pt") 62 | 63 | targets = self.tokenizer.batch_encode_plus([target_], max_length=self.output_length, 64 | padding='max_length', truncation=True, return_tensors="pt") 65 | 66 | 67 | return source, targets 68 | 69 | def __getitem__(self, index): 70 | source, targets = self.convert_to_features(self.dataset[index]) 71 | 72 | source_ids = source["input_ids"].squeeze() 73 | target_ids = targets["input_ids"].squeeze() 74 | 75 | src_mask = source["attention_mask"].squeeze() 76 | target_mask = targets["attention_mask"].squeeze() 77 | 78 | return {"source_ids": source_ids, "source_mask": src_mask, "target_ids": target_ids, "target_mask": target_mask} 79 | 80 | def get_dataset(tokenizer, type_path, num_samples, args): 81 | return wikihow(tokenizer=tokenizer, type_path=type_path, num_samples=num_samples, input_length=max_input_length, 82 | output_length=max_output_length) 83 | -------------------------------------------------------------------------------- /distributed/FSDP/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .environment import bfloat_support 2 | from .train_utils import setup, cleanup, get_date_of_run, format_metrics_to_gb, train, validation,setup_model 3 | 4 | -------------------------------------------------------------------------------- /distributed/FSDP/utils/environment.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 Meta Platforms, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the Apache-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # This is a simple check to confirm that your current server has full bfloat support - 8 | # both GPU native support, and Network communication support. 9 | 10 | # Be warned that if you run on V100 without a check like this, you will be running without native Bfloat16 11 | # support and will find significant performance degradation (but it will not complain via an error). 12 | # Hence the reason for a checker! 
13 | 14 | from pkg_resources import packaging 15 | import torch 16 | import torch.cuda.nccl as nccl 17 | import torch.distributed as dist 18 | 19 | # global flag that confirms ampere architecture, cuda version and 20 | # nccl version to verify bfloat16 native support is ready 21 | 22 | def bfloat_support(): 23 | return ( 24 | torch.version.cuda 25 | and torch.cuda.is_bf16_supported() 26 | and packaging.version.parse(torch.version.cuda).release >= (11, 0) 27 | and dist.is_nccl_available() 28 | and nccl.version() >= (2, 10) 29 | ) 30 | -------------------------------------------------------------------------------- /distributed/FSDP/utils/train_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.distributed as dist 4 | from datetime import datetime 5 | import tqdm 6 | from transformers import AutoTokenizer, GPT2TokenizerFast 7 | from transformers import T5Tokenizer, T5ForConditionalGeneration 8 | 9 | g_gigabyte = 1024**3 10 | 11 | def setup(): 12 | # initialize the process group 13 | dist.init_process_group("nccl") 14 | 15 | 16 | def cleanup(): 17 | dist.destroy_process_group() 18 | 19 | def get_date_of_run(): 20 | """create date and time for file save uniqueness 21 | example: 2022-05-07-08:31:12_PM' 22 | """ 23 | date_of_run = datetime.now().strftime("%Y-%m-%d-%I:%M:%S_%p") 24 | print(f"--> current date and time of run = {date_of_run}") 25 | return date_of_run 26 | 27 | 28 | 29 | def format_metrics_to_gb(item): 30 | """quick function to format numbers to gigabyte and round to 4 digit precision""" 31 | metric_num = item / g_gigabyte 32 | metric_num = round(metric_num, ndigits=4) 33 | return metric_num 34 | 35 | def train(args, model, rank, world_size, train_loader, optimizer, epoch, sampler=None): 36 | model.train() 37 | local_rank = int(os.environ['LOCAL_RANK']) 38 | fsdp_loss = torch.zeros(2).to(local_rank) 39 | 40 | if sampler: 41 | sampler.set_epoch(epoch) 42 | if rank==0: 43 | inner_pbar = tqdm.tqdm( 44 | range(len(train_loader)), colour="blue", desc="r0 Training Epoch" 45 | ) 46 | for batch in train_loader: 47 | for key in batch.keys(): 48 | batch[key] = batch[key].to(local_rank) 49 | optimizer.zero_grad() 50 | output = model(input_ids=batch["source_ids"],attention_mask=batch["source_mask"],labels=batch["target_ids"] ) 51 | loss = output["loss"] 52 | loss.backward() 53 | optimizer.step() 54 | fsdp_loss[0] += loss.item() 55 | fsdp_loss[1] += len(batch) 56 | if rank==0: 57 | inner_pbar.update(1) 58 | 59 | dist.all_reduce(fsdp_loss, op=dist.ReduceOp.SUM) 60 | train_accuracy = fsdp_loss[0] / fsdp_loss[1] 61 | 62 | 63 | if rank == 0: 64 | inner_pbar.close() 65 | print( 66 | f"Train Epoch: \t{epoch}, Loss: \t{train_accuracy:.4f}" 67 | ) 68 | return train_accuracy 69 | 70 | 71 | def validation(model, rank, world_size, val_loader): 72 | model.eval() 73 | correct = 0 74 | local_rank = int(os.environ['LOCAL_RANK']) 75 | fsdp_loss = torch.zeros(2).to(local_rank) 76 | if rank == 0: 77 | inner_pbar = tqdm.tqdm( 78 | range(len(val_loader)), colour="green", desc="Validation Epoch" 79 | ) 80 | with torch.no_grad(): 81 | for batch in val_loader: 82 | for key in batch.keys(): 83 | batch[key] = batch[key].to(local_rank) 84 | output = model(input_ids=batch["source_ids"],attention_mask=batch["source_mask"],labels=batch["target_ids"]) 85 | fsdp_loss[0] += output["loss"].item() # sum up batch loss 86 | fsdp_loss[1] += len(batch) 87 | 88 | if rank==0: 89 | inner_pbar.update(1) 90 | 91 | dist.all_reduce(fsdp_loss, 
op=dist.ReduceOp.SUM) 92 | val_loss = fsdp_loss[0] / fsdp_loss[1] 93 | if rank == 0: 94 | inner_pbar.close() 95 | print(f"Validation Loss: {val_loss:.4f}") 96 | return val_loss 97 | 98 | 99 | def setup_model(model_name): 100 | model = T5ForConditionalGeneration.from_pretrained(model_name) 101 | tokenizer = T5Tokenizer.from_pretrained(model_name, legacy=False) 102 | return model, tokenizer 103 | -------------------------------------------------------------------------------- /distributed/ddp-tutorial-series/README.md: -------------------------------------------------------------------------------- 1 | # distributed-pytorch 2 | 3 | Code for the DDP tutorial series at https://pytorch.org/tutorials/beginner/ddp_series_intro.html 4 | 5 | Each code file extends upon the previous one. The series starts with a non-distributed script that runs on a single GPU and incrementally updates to end with multinode training on a Slurm cluster. 6 | 7 | ## Files 8 | * [single_gpu.py](single_gpu.py): Non-distributed training script 9 | 10 | * [multigpu.py](multigpu.py): DDP on a single node 11 | 12 | * [multigpu_torchrun.py](multigpu_torchrun.py): DDP on a single node using Torchrun 13 | 14 | * [multinode.py](multinode.py): DDP on multiple nodes using Torchrun (and optionally Slurm) 15 | * [slurm/setup_pcluster_slurm.md](slurm/setup_pcluster_slurm.md): instructions to set up an AWS cluster 16 | * [slurm/config.yaml.template](slurm/config.yaml.template): configuration to set up an AWS cluster 17 | * [slurm/sbatch_run.sh](slurm/sbatch_run.sh): slurm script to launch the training job 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /distributed/ddp-tutorial-series/datautils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.data import Dataset 3 | 4 | class MyTrainDataset(Dataset): 5 | def __init__(self, size): 6 | self.size = size 7 | self.data = [(torch.rand(20), torch.rand(1)) for _ in range(size)] 8 | 9 | def __len__(self): 10 | return self.size 11 | 12 | def __getitem__(self, index): 13 | return self.data[index] -------------------------------------------------------------------------------- /distributed/ddp-tutorial-series/multigpu.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch.utils.data import Dataset, DataLoader 4 | from datautils import MyTrainDataset 5 | 6 | import torch.multiprocessing as mp 7 | from torch.utils.data.distributed import DistributedSampler 8 | from torch.nn.parallel import DistributedDataParallel as DDP 9 | from torch.distributed import init_process_group, destroy_process_group 10 | import os 11 | 12 | 13 | def ddp_setup(rank, world_size): 14 | """ 15 | Args: 16 | rank: Unique identifier of each process 17 | world_size: Total number of processes 18 | """ 19 | os.environ["MASTER_ADDR"] = "localhost" 20 | os.environ["MASTER_PORT"] = "12355" 21 | torch.cuda.set_device(rank) 22 | init_process_group(backend="nccl", rank=rank, world_size=world_size) 23 | 24 | class Trainer: 25 | def __init__( 26 | self, 27 | model: torch.nn.Module, 28 | train_data: DataLoader, 29 | optimizer: torch.optim.Optimizer, 30 | gpu_id: int, 31 | save_every: int, 32 | ) -> None: 33 | self.gpu_id = gpu_id 34 | self.model = model.to(gpu_id) 35 | self.train_data = train_data 36 | self.optimizer = optimizer 37 | self.save_every = save_every 38 | self.model = DDP(model, device_ids=[gpu_id]) 
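        # Wrapping with DDP broadcasts the model's parameters from rank 0 to every rank
        # and registers hooks that all-reduce gradients during backward(), so all ranks
        # step with identical weights.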
39 | 40 | def _run_batch(self, source, targets): 41 | self.optimizer.zero_grad() 42 | output = self.model(source) 43 | loss = F.cross_entropy(output, targets) 44 | loss.backward() 45 | self.optimizer.step() 46 | 47 | def _run_epoch(self, epoch): 48 | b_sz = len(next(iter(self.train_data))[0]) 49 | print(f"[GPU{self.gpu_id}] Epoch {epoch} | Batchsize: {b_sz} | Steps: {len(self.train_data)}") 50 | self.train_data.sampler.set_epoch(epoch) 51 | for source, targets in self.train_data: 52 | source = source.to(self.gpu_id) 53 | targets = targets.to(self.gpu_id) 54 | self._run_batch(source, targets) 55 | 56 | def _save_checkpoint(self, epoch): 57 | ckp = self.model.module.state_dict() 58 | PATH = "checkpoint.pt" 59 | torch.save(ckp, PATH) 60 | print(f"Epoch {epoch} | Training checkpoint saved at {PATH}") 61 | 62 | def train(self, max_epochs: int): 63 | for epoch in range(max_epochs): 64 | self._run_epoch(epoch) 65 | if self.gpu_id == 0 and epoch % self.save_every == 0: 66 | self._save_checkpoint(epoch) 67 | 68 | 69 | def load_train_objs(): 70 | train_set = MyTrainDataset(2048) # load your dataset 71 | model = torch.nn.Linear(20, 1) # load your model 72 | optimizer = torch.optim.SGD(model.parameters(), lr=1e-3) 73 | return train_set, model, optimizer 74 | 75 | 76 | def prepare_dataloader(dataset: Dataset, batch_size: int): 77 | return DataLoader( 78 | dataset, 79 | batch_size=batch_size, 80 | pin_memory=True, 81 | shuffle=False, 82 | sampler=DistributedSampler(dataset) 83 | ) 84 | 85 | 86 | def main(rank: int, world_size: int, save_every: int, total_epochs: int, batch_size: int): 87 | ddp_setup(rank, world_size) 88 | dataset, model, optimizer = load_train_objs() 89 | train_data = prepare_dataloader(dataset, batch_size) 90 | trainer = Trainer(model, train_data, optimizer, rank, save_every) 91 | trainer.train(total_epochs) 92 | destroy_process_group() 93 | 94 | 95 | if __name__ == "__main__": 96 | import argparse 97 | parser = argparse.ArgumentParser(description='simple distributed training job') 98 | parser.add_argument('total_epochs', type=int, help='Total epochs to train the model') 99 | parser.add_argument('save_every', type=int, help='How often to save a snapshot') 100 | parser.add_argument('--batch_size', default=32, type=int, help='Input batch size on each device (default: 32)') 101 | args = parser.parse_args() 102 | 103 | world_size = torch.cuda.device_count() 104 | mp.spawn(main, args=(world_size, args.save_every, args.total_epochs, args.batch_size), nprocs=world_size) 105 | -------------------------------------------------------------------------------- /distributed/ddp-tutorial-series/requirements.txt: -------------------------------------------------------------------------------- 1 | torch>=1.11.0 -------------------------------------------------------------------------------- /distributed/ddp-tutorial-series/single_gpu.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch.utils.data import Dataset, DataLoader 4 | from datautils import MyTrainDataset 5 | 6 | 7 | class Trainer: 8 | def __init__( 9 | self, 10 | model: torch.nn.Module, 11 | train_data: DataLoader, 12 | optimizer: torch.optim.Optimizer, 13 | gpu_id: int, 14 | save_every: int, 15 | ) -> None: 16 | self.gpu_id = gpu_id 17 | self.model = model.to(gpu_id) 18 | self.train_data = train_data 19 | self.optimizer = optimizer 20 | self.save_every = save_every 21 | 22 | def _run_batch(self, source, targets): 23 | 
self.optimizer.zero_grad() 24 | output = self.model(source) 25 | loss = F.cross_entropy(output, targets) 26 | loss.backward() 27 | self.optimizer.step() 28 | 29 | def _run_epoch(self, epoch): 30 | b_sz = len(next(iter(self.train_data))[0]) 31 | print(f"[GPU{self.gpu_id}] Epoch {epoch} | Batchsize: {b_sz} | Steps: {len(self.train_data)}") 32 | for source, targets in self.train_data: 33 | source = source.to(self.gpu_id) 34 | targets = targets.to(self.gpu_id) 35 | self._run_batch(source, targets) 36 | 37 | def _save_checkpoint(self, epoch): 38 | ckp = self.model.state_dict() 39 | PATH = "checkpoint.pt" 40 | torch.save(ckp, PATH) 41 | print(f"Epoch {epoch} | Training checkpoint saved at {PATH}") 42 | 43 | def train(self, max_epochs: int): 44 | for epoch in range(max_epochs): 45 | self._run_epoch(epoch) 46 | if epoch % self.save_every == 0: 47 | self._save_checkpoint(epoch) 48 | 49 | 50 | def load_train_objs(): 51 | train_set = MyTrainDataset(2048) # load your dataset 52 | model = torch.nn.Linear(20, 1) # load your model 53 | optimizer = torch.optim.SGD(model.parameters(), lr=1e-3) 54 | return train_set, model, optimizer 55 | 56 | 57 | def prepare_dataloader(dataset: Dataset, batch_size: int): 58 | return DataLoader( 59 | dataset, 60 | batch_size=batch_size, 61 | pin_memory=True, 62 | shuffle=True 63 | ) 64 | 65 | 66 | def main(device, total_epochs, save_every, batch_size): 67 | dataset, model, optimizer = load_train_objs() 68 | train_data = prepare_dataloader(dataset, batch_size) 69 | trainer = Trainer(model, train_data, optimizer, device, save_every) 70 | trainer.train(total_epochs) 71 | 72 | 73 | if __name__ == "__main__": 74 | import argparse 75 | parser = argparse.ArgumentParser(description='simple distributed training job') 76 | parser.add_argument('total_epochs', type=int, help='Total epochs to train the model') 77 | parser.add_argument('save_every', type=int, help='How often to save a snapshot') 78 | parser.add_argument('--batch_size', default=32, type=int, help='Input batch size on each device (default: 32)') 79 | args = parser.parse_args() 80 | 81 | device = 0 # shorthand for cuda:0 82 | main(device, args.total_epochs, args.save_every, args.batch_size) 83 | -------------------------------------------------------------------------------- /distributed/ddp-tutorial-series/slurm/config.yaml.template: -------------------------------------------------------------------------------- 1 | Region: us-east-1 2 | 3 | Image: 4 | Os: ubuntu1804 5 | 6 | SharedStorage: 7 | - MountDir: /shared 8 | Name: shared-fs 9 | StorageType: FsxLustre 10 | FsxLustreSettings: 11 | StorageCapacity: 1200 12 | DeploymentType: SCRATCH_1 13 | StorageType: SSD 14 | 15 | HeadNode: 16 | InstanceType: c5.xlarge 17 | Networking: 18 | SubnetId: subnet-xxxxxxx 19 | Ssh: 20 | KeyName: your-keyname-file 21 | 22 | Scheduling: 23 | Scheduler: slurm 24 | SlurmQueues: 25 | - Name: train 26 | ComputeResources: 27 | - Name: p32xlarge 28 | InstanceType: p3.2xlarge 29 | MinCount: 0 30 | MaxCount: 5 31 | Networking: 32 | SubnetIds: 33 | - subnet-xxxxxxx 34 | -------------------------------------------------------------------------------- /distributed/ddp-tutorial-series/slurm/sbatch_run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH --job-name=multinode-example 4 | #SBATCH --nodes=4 5 | #SBATCH --ntasks=4 6 | #SBATCH --gpus-per-task=1 7 | #SBATCH --cpus-per-task=4 8 | 9 | nodes=( $( scontrol show hostnames $SLURM_JOB_NODELIST ) ) 10 | nodes_array=($nodes) 11 | 
head_node=${nodes_array[0]} 12 | head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address) 13 | 14 | echo Node IP: $head_node_ip 15 | export LOGLEVEL=INFO 16 | 17 | srun torchrun \ 18 | --nnodes 4 \ 19 | --nproc_per_node 1 \ 20 | --rdzv_id $RANDOM \ 21 | --rdzv_backend c10d \ 22 | --rdzv_endpoint $head_node_ip:29500 \ 23 | /shared/examples/multinode_torchrun.py 50 10 -------------------------------------------------------------------------------- /distributed/ddp-tutorial-series/slurm/setup_pcluster_slurm.md: -------------------------------------------------------------------------------- 1 | # Setup AWS cluster with pcluster 2 | 3 | ## 1. Sign in to an AWS instance 4 | 5 | ## 2. Install pcluster 6 | ``` 7 | pip3 install awscli -U --user 8 | pip3 install "aws-parallelcluster" --upgrade --user 9 | ``` 10 | 11 | ## 3. Create a cluster config file 12 | ``` 13 | pcluster configure --config config.yaml 14 | ``` 15 | See config.yaml.template for an example 16 | 17 | 18 | ## 4. Create the cluster 19 | ``` 20 | pcluster create-cluster --cluster-name dist-ml --cluster-configuration config.yaml 21 | ``` 22 | 23 | ### 4a. Track progress 24 | ``` 25 | pcluster list-clusters 26 | ``` 27 | 28 | ## 5. Login to cluster headnode 29 | ``` 30 | pcluster ssh --cluster-name dist-ml -i your-keyname-file 31 | ``` 32 | 33 | ## 6. Install dependencies 34 | ``` 35 | sudo apt-get update 36 | sudo apt-get install -y python3-venv 37 | python3 -m venv /shared/venv/ 38 | source /shared/venv/bin/activate 39 | pip install wheel 40 | echo 'source /shared/venv/bin/activate' >> ~/.bashrc 41 | ``` 42 | 43 | ## 7. Download training code and install requirements 44 | ``` 45 | cd /shared 46 | git clone --depth 1 https://github.com/pytorch/examples; 47 | cd /shared/examples 48 | git filter-branch --prune-empty --subdirectory-filter distributed/ddp-tutorial-series 49 | python3 -m pip install setuptools==59.5.0 50 | pip install -r requirements.txt 51 | ``` -------------------------------------------------------------------------------- /distributed/ddp/example.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import sys 4 | import tempfile 5 | from urllib.parse import urlparse 6 | 7 | import torch 8 | import torch.distributed as dist 9 | import torch.nn as nn 10 | import torch.optim as optim 11 | 12 | from torch.nn.parallel import DistributedDataParallel as DDP 13 | 14 | class ToyModel(nn.Module): 15 | def __init__(self): 16 | super(ToyModel, self).__init__() 17 | self.net1 = nn.Linear(10, 10) 18 | self.relu = nn.ReLU() 19 | self.net2 = nn.Linear(10, 5) 20 | 21 | def forward(self, x): 22 | return self.net2(self.relu(self.net1(x))) 23 | 24 | 25 | def demo_basic(local_world_size, local_rank): 26 | 27 | # setup devices for this process. For local_world_size = 2, num_gpus = 8, 28 | # rank 0 uses GPUs [0, 1, 2, 3] and 29 | # rank 1 uses GPUs [4, 5, 6, 7]. 
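    # Each process therefore drives n of the node's GPUs, and DDP replicates the
    # model across the devices listed in device_ids for that process.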
30 | n = torch.cuda.device_count() // local_world_size 31 | device_ids = list(range(local_rank * n, (local_rank + 1) * n)) 32 | 33 | print( 34 | f"[{os.getpid()}] rank = {dist.get_rank()}, " 35 | + f"world_size = {dist.get_world_size()}, n = {n}, device_ids = {device_ids} \n", end='' 36 | ) 37 | 38 | model = ToyModel().cuda(device_ids[0]) 39 | ddp_model = DDP(model, device_ids) 40 | 41 | loss_fn = nn.MSELoss() 42 | optimizer = optim.SGD(ddp_model.parameters(), lr=0.001) 43 | 44 | optimizer.zero_grad() 45 | outputs = ddp_model(torch.randn(20, 10)) 46 | labels = torch.randn(20, 5).to(device_ids[0]) 47 | loss_fn(outputs, labels).backward() 48 | optimizer.step() 49 | 50 | 51 | def spmd_main(local_world_size, local_rank): 52 | # These are the parameters used to initialize the process group 53 | env_dict = { 54 | key: os.environ[key] 55 | for key in ("MASTER_ADDR", "MASTER_PORT", "RANK", "WORLD_SIZE") 56 | } 57 | 58 | if sys.platform == "win32": 59 | # Distributed package only covers collective communications with Gloo 60 | # backend and FileStore on Windows platform. Set init_method parameter 61 | # in init_process_group to a local file. 62 | if "INIT_METHOD" in os.environ.keys(): 63 | print(f"init_method is {os.environ['INIT_METHOD']}") 64 | url_obj = urlparse(os.environ["INIT_METHOD"]) 65 | if url_obj.scheme.lower() != "file": 66 | raise ValueError("Windows only supports FileStore") 67 | else: 68 | init_method = os.environ["INIT_METHOD"] 69 | else: 70 | # It is a example application, For convience, we create a file in temp dir. 71 | temp_dir = tempfile.gettempdir() 72 | init_method = f"file:///{os.path.join(temp_dir, 'ddp_example')}" 73 | dist.init_process_group(backend="gloo", init_method=init_method, rank=int(env_dict["RANK"]), world_size=int(env_dict["WORLD_SIZE"])) 74 | else: 75 | print(f"[{os.getpid()}] Initializing process group with: {env_dict}") 76 | dist.init_process_group(backend="nccl") 77 | 78 | print( 79 | f"[{os.getpid()}]: world_size = {dist.get_world_size()}, " 80 | + f"rank = {dist.get_rank()}, backend={dist.get_backend()} \n", end='' 81 | ) 82 | 83 | demo_basic(local_world_size, local_rank) 84 | 85 | # Tear down the process group 86 | dist.destroy_process_group() 87 | 88 | 89 | if __name__ == "__main__": 90 | parser = argparse.ArgumentParser() 91 | # This is passed in via launch.py 92 | parser.add_argument("--local_rank", type=int, default=0) 93 | # This needs to be explicitly passed in 94 | parser.add_argument("--local_world_size", type=int, default=1) 95 | args = parser.parse_args() 96 | # The main entry point is called directly without using subprocess 97 | spmd_main(args.local_world_size, args.local_rank) 98 | -------------------------------------------------------------------------------- /distributed/ddp/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | -------------------------------------------------------------------------------- /distributed/minGPT-ddp/README.md: -------------------------------------------------------------------------------- 1 | # minGPT-DDP 2 | 3 | Code accompanying the tutorial at https://pytorch.org/tutorials/intermediate/ddp_series_minGPT.html for training a GPT-like model with Distributed Data Parallel (DDP) in PyTorch. 4 | 5 | Files marked with an asterisk (*) are adapted from the minGPT repo (https://github.com/karpathy/minGPT). 
6 | 7 | - [trainer.py](mingpt/trainer.py) includes the Trainer class that runs the distributed training iterations on the model with the provided dataset. 8 | - [model.py *](mingpt/model.py) defines the model architecture. 9 | - [char_dataset.py *](mingpt/char_dataset.py) contains the `Dataset`class for a character-level dataset. 10 | - [gpt2_train_cfg.yaml](mingpt/gpt2_train_cfg.yaml) contains the configurations for data, model, optimizer and training run. 11 | - [main.py](mingpt/main.py) is the entry point to the trainig job. It sets up the DDP process group, reads all the configurations and runs the training job. 12 | - [slurm/](mingpt/slurm) contains files for setting up an AWS cluster and the slurm script to run multinode training. -------------------------------------------------------------------------------- /distributed/minGPT-ddp/mingpt/char_dataset.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.data import Dataset 3 | import fsspec 4 | from dataclasses import dataclass 5 | 6 | """ 7 | Adapted from https://github.com/karpathy/minGPT/blob/master/projects/chargpt/chargpt.py 8 | """ 9 | 10 | @dataclass 11 | class DataConfig: 12 | path: str = None 13 | block_size: int = None 14 | train_split: float = None 15 | truncate: float = 1.0 16 | 17 | class CharDataset(Dataset): 18 | 19 | def __init__(self, data_cfg: DataConfig): #data_path: str, block_size): 20 | data = fsspec.open(data_cfg.path).open().read().decode('utf-8') 21 | data = data[ : int(len(data) * data_cfg.truncate)] 22 | 23 | chars = sorted(list(set(data))) 24 | data_size, vocab_size = len(data), len(chars) 25 | print('Data has %d characters, %d unique.' % (data_size, vocab_size)) 26 | 27 | self.stoi = {ch: i for i, ch in enumerate(chars)} 28 | self.itos = {i: ch for i, ch in enumerate(chars)} 29 | self.block_size = data_cfg.block_size 30 | self.vocab_size = vocab_size 31 | self.data = data 32 | 33 | def __len__(self): 34 | return len(self.data) - self.block_size 35 | 36 | def __getitem__(self, idx): 37 | # grab a chunk of (block_size + 1) characters from the data 38 | chunk = self.data[idx:idx + self.block_size + 1] 39 | # encode every character to an integer 40 | dix = [self.stoi[s] for s in chunk] 41 | x = torch.tensor(dix[:-1], dtype=torch.long) 42 | y = torch.tensor(dix[1:], dtype=torch.long) 43 | return x, y 44 | -------------------------------------------------------------------------------- /distributed/minGPT-ddp/mingpt/gpt2_train_cfg.yaml: -------------------------------------------------------------------------------- 1 | data_config: 2 | path: https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt 3 | block_size: 128 4 | train_split: 0.9 5 | truncate: 0.05 6 | gpt_config: 7 | n_layer: 8 8 | n_head: 8 9 | n_embd: 512 10 | trainer_config: 11 | max_epochs: 10 12 | batch_size: 216 13 | data_loader_workers: 4 14 | grad_norm_clip: 1.0 15 | snapshot_path: gpt_snapshot.pt 16 | save_every: 3 17 | use_amp: True 18 | optimizer_config: 19 | weight_decay: 0.1 20 | learning_rate: 0.0003 21 | 22 | hydra: 23 | run: 24 | dir: ./ 25 | -------------------------------------------------------------------------------- /distributed/minGPT-ddp/mingpt/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from torch.utils.data import random_split 4 | from torch.distributed import init_process_group, destroy_process_group 5 | from model import GPT, GPTConfig, 
OptimizerConfig, create_optimizer 6 | from trainer import Trainer, TrainerConfig 7 | from char_dataset import CharDataset, DataConfig 8 | from omegaconf import DictConfig 9 | import hydra 10 | 11 | 12 | def ddp_setup(): 13 | init_process_group(backend="nccl") 14 | torch.cuda.set_device(int(os.environ["LOCAL_RANK"])) 15 | 16 | def get_train_objs(gpt_cfg: GPTConfig, opt_cfg: OptimizerConfig, data_cfg: DataConfig): 17 | dataset = CharDataset(data_cfg) 18 | train_len = int(len(dataset) * data_cfg.train_split) 19 | train_set, test_set = random_split(dataset, [train_len, len(dataset) - train_len]) 20 | 21 | gpt_cfg.vocab_size = dataset.vocab_size 22 | gpt_cfg.block_size = dataset.block_size 23 | model = GPT(gpt_cfg) 24 | optimizer = create_optimizer(model, opt_cfg) 25 | 26 | return model, optimizer, train_set, test_set 27 | 28 | @hydra.main(version_base=None, config_path=".", config_name="gpt2_train_cfg") 29 | def main(cfg: DictConfig): 30 | ddp_setup() 31 | 32 | gpt_cfg = GPTConfig(**cfg['gpt_config']) 33 | opt_cfg = OptimizerConfig(**cfg['optimizer_config']) 34 | data_cfg = DataConfig(**cfg['data_config']) 35 | trainer_cfg = TrainerConfig(**cfg['trainer_config']) 36 | 37 | model, optimizer, train_data, test_data = get_train_objs(gpt_cfg, opt_cfg, data_cfg) 38 | trainer = Trainer(trainer_cfg, model, optimizer, train_data, test_data) 39 | trainer.train() 40 | 41 | destroy_process_group() 42 | 43 | 44 | if __name__ == "__main__": 45 | main() 46 | -------------------------------------------------------------------------------- /distributed/minGPT-ddp/mingpt/slurm/config.yaml.template: -------------------------------------------------------------------------------- 1 | Region: us-east-1 2 | 3 | Image: 4 | Os: ubuntu1804 5 | 6 | SharedStorage: 7 | - MountDir: /shared 8 | Name: shared-fs 9 | StorageType: FsxLustre 10 | FsxLustreSettings: 11 | StorageCapacity: 1200 12 | DeploymentType: SCRATCH_1 13 | StorageType: SSD 14 | 15 | HeadNode: 16 | InstanceType: c5.xlarge 17 | Networking: 18 | SubnetId: subnet-xxxxxxx 19 | Ssh: 20 | KeyName: your-keyname-file 21 | 22 | Scheduling: 23 | Scheduler: slurm 24 | SlurmQueues: 25 | - Name: train 26 | ComputeResources: 27 | - Name: p32xlarge 28 | InstanceType: p3.2xlarge 29 | MinCount: 0 30 | MaxCount: 5 31 | Networking: 32 | SubnetIds: 33 | - subnet-xxxxxxx 34 | -------------------------------------------------------------------------------- /distributed/minGPT-ddp/mingpt/slurm/sbatch_run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH --job-name=multinode-example 4 | #SBATCH --nodes=2 5 | #SBATCH --ntasks=2 6 | #SBATCH --gpus-per-task=1 7 | #SBATCH --cpus-per-task=4 8 | 9 | nodes=( $( scontrol show hostnames $SLURM_JOB_NODELIST ) ) 10 | nodes_array=($nodes) 11 | head_node=${nodes_array[0]} 12 | head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address) 13 | 14 | echo Node IP: $head_node_ip 15 | export LOGLEVEL=INFO 16 | 17 | srun torchrun \ 18 | --nnodes 2 \ 19 | --nproc_per_node 1 \ 20 | --rdzv_id $RANDOM \ 21 | --rdzv_backend c10d \ 22 | --rdzv_endpoint $head_node_ip:29500 \ 23 | /shared/examples/mingpt/main.py 24 | 25 | 26 | -------------------------------------------------------------------------------- /distributed/minGPT-ddp/mingpt/slurm/setup_pcluster_slurm.md: -------------------------------------------------------------------------------- 1 | # Setup AWS cluster with pcluster 2 | Refer https://www.hpcworkshops.com/04-pcluster-cli.html 3 | 4 | ## 1. 
Sign in to an AWS instance 5 | 6 | ## 2. Install pcluster 7 | ``` 8 | pip3 install awscli -U --user 9 | pip3 install "aws-parallelcluster" --upgrade --user 10 | ``` 11 | 12 | ## 3. Create a cluster config file 13 | ``` 14 | pcluster configure --config config.yaml 15 | ``` 16 | See config.yaml.template for an example. Ensure you have a valid EC2 key-pair file 17 | 18 | 19 | ## 4. Create the cluster 20 | ``` 21 | pcluster create-cluster --cluster-name dist-ml --cluster-configuration config.yaml 22 | ``` 23 | 24 | ### 4a. Track progress 25 | ``` 26 | pcluster list-clusters 27 | ``` 28 | 29 | ## 5. Login to cluster headnode 30 | ``` 31 | pcluster ssh --cluster-name dist-ml -i your-keypair-file 32 | ``` 33 | 34 | ## 6. Install dependencies 35 | ``` 36 | sudo apt-get update 37 | sudo apt-get install -y python3-venv 38 | python3 -m venv /shared/venv/ 39 | source /shared/venv/bin/activate 40 | pip install wheel 41 | echo 'source /shared/venv/bin/activate' >> ~/.bashrc 42 | ``` 43 | 44 | ## 7. Download training code and install requirements 45 | ``` 46 | cd /shared 47 | git clone --depth 1 https://github.com/pytorch/examples; 48 | cd /shared/examples 49 | git filter-branch --prune-empty --subdirectory-filter distributed/minGPT-ddp 50 | python3 -m pip install setuptools==59.5.0 51 | pip install -r requirements.txt 52 | ``` 53 | -------------------------------------------------------------------------------- /distributed/minGPT-ddp/requirements.txt: -------------------------------------------------------------------------------- 1 | torch>=1.11.0 2 | fsspec 3 | boto3 4 | hydra-core 5 | requests 6 | aiohttp 7 | -------------------------------------------------------------------------------- /distributed/rpc/batch/README.md: -------------------------------------------------------------------------------- 1 | # Examples For Asynchronous RPC User Functions 2 | 3 | This folder contains two examples for [`@rpc.functions.async_execution`](https://pytorch.org/docs/master/rpc.html#torch.distributed.rpc.functions.async_execution): 4 | 5 | 1. Synchronized Batch Update Parameter Server: uses `@rpc.functions.async_execution` 6 | for parameter update and retrieving. This serves as a simple starter example 7 | for batch RPC. 8 | ``` 9 | pip install -r requirements.txt 10 | python parameter_server.py 11 | ``` 12 | 2. Multi-Observer with Batch-Processing Agent: uses `@rpc.functions.async_execution` 13 | to run multiple observed states through the policy to get actions. 14 | ``` 15 | pip install -r requirements.txt 16 | python reinforce.py 17 | ``` 18 | -------------------------------------------------------------------------------- /distributed/rpc/batch/requirements.txt: -------------------------------------------------------------------------------- 1 | torch==1.9.0 2 | torchvision==0.7.0 3 | numpy 4 | gym 5 | -------------------------------------------------------------------------------- /distributed/rpc/ddp_rpc/README.md: -------------------------------------------------------------------------------- 1 | Distributed DataParallel + Distributed RPC Framework Example 2 | 3 | The example shows how to combine Distributed DataParallel with the Distributed 4 | RPC Framework. There are two trainer nodes, 1 master node and 1 parameter 5 | server in the example. 6 | 7 | The master node creates an embedding table on the parameter server and drives 8 | the training loop on the trainers. 
The model consists of a dense part 9 | (nn.Linear) replicated on the trainers via Distributed DataParallel and a 10 | sparse part (nn.EmbeddingBag) which resides on the parameter server. Each 11 | trainer performs an embedding lookup on the parameter server (using the 12 | Distributed RPC Framework) and then executes its local nn.Linear module. 13 | During the backward pass, the gradients for the dense part are aggregated via 14 | allreduce by DDP and the distributed backward pass updates the parameters for 15 | the embedding table on the parameter server. 16 | 17 | 18 | ``` 19 | pip install -r requirements.txt 20 | python main.py 21 | ``` 22 | -------------------------------------------------------------------------------- /distributed/rpc/ddp_rpc/requirements.txt: -------------------------------------------------------------------------------- 1 | torch>=1.6.0 2 | -------------------------------------------------------------------------------- /distributed/rpc/parameter_server/README.md: -------------------------------------------------------------------------------- 1 | ### RPC-based distributed training 2 | 3 | This is a basic example of RPC-based training that uses several trainers to remotely train a model hosted on a server. 4 | 5 | To run the example locally, run the following command for the server and for each worker you wish to spawn, in separate terminal windows: 6 | `python rpc_parameter_server.py --world_size=WORLD_SIZE --rank=RANK`. For example, for a master node with world size of 2, the command would be `python rpc_parameter_server.py --world_size=2 --rank=0`. The trainer can then be launched with the command `python rpc_parameter_server.py --world_size=2 --rank=1` in a separate window, and this will begin training with one server and a single trainer. 7 | 8 | Note that for demonstration purposes, this example supports only between 0 and 2 GPUs, although the pattern can be extended to make use of additional GPUs. To configure the number of GPUs, pass in `--num_gpus=N` to your training command. 9 | 10 | You can pass in the command line arguments `--master_addr=
` and `master_port=PORT` to indicate the address:port that the master worker is listening on. All workers will contact the master for rendezvous during worker discovery. By default, `master_addr` will be `localhost` and `master_port` will be 29500. 11 | -------------------------------------------------------------------------------- /distributed/rpc/pipeline/README.md: -------------------------------------------------------------------------------- 1 | Distributed Pipeline Parallel Example 2 | 3 | This example shows how to distribute a ResNet50 model on two RPC workers and 4 | then implement distributed pipeline parallelism using RPC. With pipeline 5 | parallelism, every input batch is divided into micro-batches and these 6 | micro-batches are fed into the model in a pipelined fashion to increase the 7 | amortized device utilization. Note that this example only parallelizes the 8 | forward pass, which can be viewed as the distributed counterpart of the 9 | [single machine pipeline parallel](https://pytorch.org/tutorials/intermediate/model_parallel_tutorial.html#speed-up-by-pipelining-inputs) 10 | example. 11 | 12 | ``` 13 | pip install -r requirements.txt 14 | python main.py 15 | ``` 16 | -------------------------------------------------------------------------------- /distributed/rpc/pipeline/requirements.txt: -------------------------------------------------------------------------------- 1 | torch==1.9.0 2 | torchvision==0.7.0 -------------------------------------------------------------------------------- /distributed/rpc/rl/README.md: -------------------------------------------------------------------------------- 1 | Distributed Multi-Observer Single-Agent Reinforcement Learning Example 2 | 3 | This example demonstrates the `torch.distributed.rpc` API using a CartPole 4 | reinforcement learning example. Please note that the goal is to present the RPC 5 | API instead of building the best CartPole solver. 6 | 7 | ``` 8 | pip install -r requirements.txt 9 | python main.py 10 | ``` 11 | -------------------------------------------------------------------------------- /distributed/rpc/rnn/README.md: -------------------------------------------------------------------------------- 1 | Distributed RNN Model Parallel Example 2 | 3 | This example shows how to build an RNN model using RPC where different 4 | components of the RNN model can be placed on different workers. 5 | 6 | ``` 7 | pip install -r requirements.txt 8 | python main.py 9 | ``` 10 | -------------------------------------------------------------------------------- /distributed/rpc/rnn/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | import torch.distributed.autograd as dist_autograd 5 | import torch.distributed.rpc as rpc 6 | import torch.multiprocessing as mp 7 | import torch.optim as optim 8 | from torch.distributed.optim import DistributedOptimizer 9 | 10 | import rnn 11 | 12 | 13 | def _run_trainer(): 14 | r""" 15 | The trainer creates a distributed RNNModel and a DistributedOptimizer. Then, 16 | it performs training using random input data.
17 | """ 18 | batch = 5 19 | ntoken = 7 20 | ninp = 2 21 | 22 | nhid = 3 23 | nindices = 6 24 | nlayers = 4 25 | hidden = ( 26 | torch.randn(nlayers, nindices, nhid), 27 | torch.randn(nlayers, nindices, nhid) 28 | ) 29 | 30 | model = rnn.RNNModel('ps', ntoken, ninp, nhid, nlayers) 31 | 32 | # setup distributed optimizer 33 | opt = DistributedOptimizer( 34 | optim.SGD, 35 | model.parameter_rrefs(), 36 | lr=0.05, 37 | ) 38 | 39 | criterion = torch.nn.CrossEntropyLoss() 40 | 41 | def get_next_batch(): 42 | for _ in range(5): 43 | data = torch.LongTensor(batch, nindices) % ntoken 44 | target = torch.LongTensor(batch, ntoken) % nindices 45 | yield data, target 46 | 47 | # train for 10 iterations 48 | for epoch in range(10): 49 | # create distributed autograd context 50 | for data, target in get_next_batch(): 51 | with dist_autograd.context() as context_id: 52 | hidden[0].detach_() 53 | hidden[1].detach_() 54 | output, hidden = model(data, hidden) 55 | loss = criterion(output, target) 56 | # run distributed backward pass 57 | dist_autograd.backward(context_id, [loss]) 58 | # run distributed optimizer 59 | opt.step(context_id) 60 | # not necessary to zero grads as each iteration creates a different 61 | # distributed autograd context which hosts different grads 62 | print("Training epoch {}".format(epoch)) 63 | 64 | 65 | def run_worker(rank, world_size): 66 | r""" 67 | A wrapper function that initializes RPC, calls the function, and shuts down 68 | RPC. 69 | """ 70 | os.environ['MASTER_ADDR'] = 'localhost' 71 | os.environ['MASTER_PORT'] = '29500' 72 | if rank == 1: 73 | rpc.init_rpc("trainer", rank=rank, world_size=world_size) 74 | _run_trainer() 75 | else: 76 | rpc.init_rpc("ps", rank=rank, world_size=world_size) 77 | # parameter server does nothing 78 | pass 79 | 80 | # block until all rpcs finish 81 | rpc.shutdown() 82 | 83 | 84 | if __name__ == "__main__": 85 | world_size = 2 86 | mp.spawn(run_worker, args=(world_size, ), nprocs=world_size, join=True) 87 | -------------------------------------------------------------------------------- /distributed/rpc/rnn/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | -------------------------------------------------------------------------------- /distributed/rpc/rnn/rnn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.distributed.rpc as rpc 4 | from torch.distributed.rpc import RRef 5 | 6 | 7 | def _call_method(method, rref, *args, **kwargs): 8 | r""" 9 | a helper function to call a method on the given RRef 10 | """ 11 | return method(rref.local_value(), *args, **kwargs) 12 | 13 | 14 | def _remote_method(method, rref, *args, **kwargs): 15 | r""" 16 | a helper function to run method on the owner of rref and fetch back the 17 | result using RPC 18 | """ 19 | return rpc.rpc_sync( 20 | rref.owner(), 21 | _call_method, 22 | args=[method, rref] + list(args), 23 | kwargs=kwargs 24 | ) 25 | 26 | 27 | def _parameter_rrefs(module): 28 | r""" 29 | Create one RRef for each parameter in the given local module, and return a 30 | list of RRefs. 
31 | """ 32 | param_rrefs = [] 33 | for param in module.parameters(): 34 | param_rrefs.append(RRef(param)) 35 | return param_rrefs 36 | 37 | 38 | class EmbeddingTable(nn.Module): 39 | r""" 40 | Encoding layers of the RNNModel 41 | """ 42 | def __init__(self, ntoken, ninp, dropout): 43 | super(EmbeddingTable, self).__init__() 44 | self.drop = nn.Dropout(dropout) 45 | self.encoder = nn.Embedding(ntoken, ninp) 46 | if torch.cuda.is_available(): 47 | self.encoder = self.encoder.cuda() 48 | nn.init.uniform_(self.encoder.weight, -0.1, 0.1) 49 | 50 | def forward(self, input): 51 | if torch.cuda.is_available(): 52 | input = input.cuda() 53 | return self.drop(self.encoder(input)).cpu() 54 | 55 | 56 | class Decoder(nn.Module): 57 | r""" 58 | Decoding layers of the RNNModel 59 | """ 60 | def __init__(self, ntoken, nhid, dropout): 61 | super(Decoder, self).__init__() 62 | self.drop = nn.Dropout(dropout) 63 | self.decoder = nn.Linear(nhid, ntoken) 64 | nn.init.zeros_(self.decoder.bias) 65 | nn.init.uniform_(self.decoder.weight, -0.1, 0.1) 66 | 67 | def forward(self, output): 68 | return self.decoder(self.drop(output)) 69 | 70 | 71 | class RNNModel(nn.Module): 72 | r""" 73 | A distributed RNN model which puts embedding table and decoder parameters on 74 | a remote parameter server, and locally holds parameters for the LSTM module. 75 | The structure of the RNN model is borrowed from the word language model 76 | example. See https://github.com/pytorch/examples/blob/main/word_language_model/model.py 77 | """ 78 | def __init__(self, ps, ntoken, ninp, nhid, nlayers, dropout=0.5): 79 | super(RNNModel, self).__init__() 80 | 81 | # setup embedding table remotely 82 | self.emb_table_rref = rpc.remote(ps, EmbeddingTable, args=(ntoken, ninp, dropout)) 83 | # setup LSTM locally 84 | self.rnn = nn.LSTM(ninp, nhid, nlayers, dropout=dropout) 85 | # setup decoder remotely 86 | self.decoder_rref = rpc.remote(ps, Decoder, args=(ntoken, nhid, dropout)) 87 | 88 | def forward(self, input, hidden): 89 | # pass input to the remote embedding table and fetch emb tensor back 90 | emb = _remote_method(EmbeddingTable.forward, self.emb_table_rref, input) 91 | output, hidden = self.rnn(emb, hidden) 92 | # pass output to the remote decoder and get the decoded output back 93 | decoded = _remote_method(Decoder.forward, self.decoder_rref, output) 94 | return decoded, hidden 95 | 96 | def parameter_rrefs(self): 97 | remote_params = [] 98 | # get RRefs of embedding table 99 | remote_params.extend(_remote_method(_parameter_rrefs, self.emb_table_rref)) 100 | # create RRefs for local parameters 101 | remote_params.extend(_parameter_rrefs(self.rnn)) 102 | # get RRefs of decoder 103 | remote_params.extend(_remote_method(_parameter_rrefs, self.decoder_rref)) 104 | return remote_params 105 | -------------------------------------------------------------------------------- /distributed/tensor_parallelism/README.md: -------------------------------------------------------------------------------- 1 | # PyTorch native Tensor Parallel for distributed training 2 | 3 | This example demonstrates SPMD Megatron-LM style Tensor Parallel by using 4 | PyTorch native Tensor Parallel APIs, which include: 5 | 6 | 1. Simple module-level Tensor Parallelism on a dummy MLP model. 7 | 2. Simple module-level Tensor Parallelism with Sequence Parallel inputs/outputs on a dummy MLP model. 8 | 3. A E2E demo of Fully Sharded Data Parallel + Tensor Parallel (with Sequence Parallel) on a example Llama2 model. 
9 | 10 | More details about the PyTorch native Tensor Parallel APIs, please see PyTorch docs: 11 | https://pytorch.org/docs/stable/distributed.tensor.parallel.html 12 | 13 | ``` 14 | pip install -r requirements.txt 15 | python example.py 16 | ``` 17 | -------------------------------------------------------------------------------- /distributed/tensor_parallelism/log_utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import torch 3 | 4 | logging.basicConfig( 5 | format="%(asctime)s %(message)s", datefmt="%m/%d/%Y %I:%M:%S %p", level=logging.INFO 6 | ) 7 | 8 | def get_logger(): 9 | return logging.getLogger(__name__) 10 | 11 | 12 | def rank_log(_rank, logger, msg): 13 | """helper function to log only on global rank 0""" 14 | if _rank == 0: 15 | logger.info(f" {msg}") 16 | 17 | 18 | def verify_min_gpu_count(min_gpus: int = 2) -> bool: 19 | """ verification that we have at least 2 gpus to run dist examples """ 20 | has_cuda = torch.cuda.is_available() 21 | gpu_count = torch.cuda.device_count() 22 | return has_cuda and gpu_count >= min_gpus 23 | -------------------------------------------------------------------------------- /distributed/tensor_parallelism/requirements.txt: -------------------------------------------------------------------------------- 1 | # Python dependencies required for running the example 2 | 3 | --pre 4 | --extra-index-url https://download.pytorch.org/whl/nightly/cu118 5 | --extra-index-url https://download.pytorch.org/whl/nightly/cu121 6 | torch >= 2.3.0.dev0; sys_platform == "linux" 7 | -------------------------------------------------------------------------------- /distributed/tensor_parallelism/run_example.sh: -------------------------------------------------------------------------------- 1 | 2 | # To run samples: 3 | # bash run_example.sh {file_to_run.py} {num_gpus} 4 | # where file_to_run = example to launch. Default = 'fsdp_tp_example.py' 5 | # num_gpus = num local gpus to use (must be at least 2). Default = 4 6 | 7 | # samples to run include: 8 | # sequence_parallel_example.py 9 | # tensor_parallel_example.py 10 | # fsdp_tp_example.py 11 | 12 | echo "Launching ${1:-fsdp_tp_example.py} with ${2:-4} gpus" 13 | torchrun --nnodes=1 --nproc_per_node=${2:-4} --rdzv_id=101 --rdzv_endpoint="localhost:5972" ${1:-fsdp_tp_example.py} 14 | -------------------------------------------------------------------------------- /distributed/tensor_parallelism/sequence_parallel_example.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import torch 4 | import torch.nn as nn 5 | 6 | from torch.distributed._tensor import Shard 7 | 8 | from torch.distributed.tensor.parallel import ( 9 | parallelize_module, 10 | ColwiseParallel, 11 | RowwiseParallel, 12 | ) 13 | 14 | from log_utils import rank_log, get_logger, verify_min_gpu_count 15 | 16 | 17 | # ---- GPU check ------------ 18 | _min_gpu_count = 2 19 | 20 | if not verify_min_gpu_count(min_gpus=_min_gpu_count): 21 | print(f"Unable to locate sufficient {_min_gpu_count} gpus to run this example. Exiting.") 22 | sys.exit() 23 | # --------------------------- 24 | 25 | 26 | from torch.distributed._tensor.device_mesh import init_device_mesh 27 | 28 | 29 | 30 | """ 31 | This is the script to test Sequence Parallel(SP) on a toy model in a 32 | Megetron-LM SPMD style. We show an E2E working flow from forward, 33 | backward and optimization. 
34 | 35 | We use the example of two `nn.Linear` layers with an element-wise `nn.RELU` 36 | in between to show an example of sequence parallel, which was proposed in paper: 37 | 38 | https://arxiv.org/pdf/2205.05198.pdf. 39 | 40 | Like tensor parallel, we parallelize the first linear layer by column 41 | and also parallelize the second linear layer by row. But the input in each rank 42 | now is different so that we need one all-gather for input and one reduce-scatter 43 | in the end of the second linear layer. 44 | """ 45 | 46 | 47 | class ToyModel(nn.Module): 48 | """MLP based model""" 49 | 50 | def __init__(self): 51 | super().__init__() 52 | self.in_proj = nn.Linear(10, 32) 53 | self.relu = nn.ReLU() 54 | self.out_proj = nn.Linear(32, 5) 55 | 56 | def forward(self, x): 57 | return self.out_proj(self.relu(self.in_proj(x))) 58 | 59 | 60 | """ 61 | Main body of the demo of a basic version of sequence parallel by using 62 | PyTorch native APIs. 63 | """ 64 | logger = get_logger() 65 | 66 | # create a device mesh based on the given world_size. 67 | device_mesh = init_device_mesh( 68 | device_type="cuda", mesh_shape=(int(os.environ["WORLD_SIZE"]),) 69 | ) 70 | 71 | _rank = device_mesh.get_rank() 72 | 73 | print(f"Starting PyTorch Sequence Parallel example on rank {_rank}.") 74 | 75 | rank_log(_rank, logger, f"Device Mesh created: {device_mesh=}") 76 | 77 | # create model and move it to GPU. Init_device_mesh has already assigned gpu ids... 78 | model = ToyModel().to("cuda") 79 | 80 | # Custom parallelization plan for the model 81 | sp_model = parallelize_module( 82 | module=model, 83 | device_mesh=device_mesh, 84 | parallelize_plan={ 85 | "in_proj": ColwiseParallel(input_layouts=Shard(0)), 86 | "out_proj": RowwiseParallel(output_layouts=Shard(0)), 87 | }, 88 | ) 89 | 90 | 91 | # Create a optimizer for the parallelized module. 92 | lr = 0.25 93 | optimizer = torch.optim.AdamW(sp_model.parameters(), lr=lr, foreach=True) 94 | 95 | 96 | # Perform a num of iterations of forward/backward 97 | # and optimizations for the sharded module. 98 | num_iters = 10 99 | rank_log(_rank, logger, "Sequence Parallel training starting...") 100 | 101 | for i in range(num_iters): 102 | # For SP, input can be different across all ranks. 103 | inp = torch.rand(20, 10, device="cuda") 104 | output = sp_model(inp) 105 | output.sum().backward() 106 | optimizer.step() 107 | rank_log(_rank, logger, f"Sequence Parallel iter {i} completed") 108 | 109 | rank_log(_rank, logger, "Sequence Parallel training completed!") 110 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.https://www.sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx 2 | # torch 3 | # PyTorch Theme 4 | -e git+https://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme 5 | sphinx-panels 6 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Meta Platforms, Inc. and affiliates. 3 | # All rights reserved. 4 | # 5 | # This source code is licensed under the BSD-style license found in the 6 | # LICENSE file in the root directory of this source tree. 7 | 8 | # Configuration file for the Sphinx documentation builder. 9 | # 10 | # This file only contains a selection of the most common options. For a full 11 | # list see the documentation: 12 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 13 | 14 | # -- Path setup -------------------------------------------------------------- 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | # 20 | import os 21 | import sys 22 | 23 | import pytorch_sphinx_theme 24 | 25 | current_dir = os.path.dirname(__file__) 26 | target_dir = os.path.abspath(os.path.join(current_dir, "../..")) 27 | sys.path.insert(0, target_dir) 28 | print(target_dir) 29 | 30 | # -- Project information ----------------------------------------------------- 31 | 32 | project = "PyTorchExamples" 33 | copyright = "2022, Meta" 34 | author = "Meta" 35 | 36 | # The full version, including alpha/beta/rc tags 37 | release = "1.11" 38 | 39 | # -- General configuration --------------------------------------------------- 40 | 41 | # Add any Sphinx extension module names here, as strings. They can be 42 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 43 | # ones. 44 | extensions = ["sphinx.ext.napoleon", "sphinx.ext.autodoc", 'sphinx_panels'] 45 | panels_add_bootstrap_css = False 46 | 47 | # Add any paths that contain templates here, relative to this directory. 
48 | templates_path = ["_templates"] 49 | 50 | # List of patterns, relative to source directory, that match files and 51 | # directories to ignore when looking for source files. 52 | # This pattern also affects html_static_path and html_extra_path. 53 | exclude_patterns = [] 54 | 55 | # -- Options for HTML output ------------------------------------------------- 56 | 57 | # The theme to use for HTML and HTML Help pages. See the documentation for 58 | # a list of builtin themes. 59 | # 60 | # html_theme = 'alabaster' 61 | html_theme = "pytorch_sphinx_theme" 62 | html_theme_path = [pytorch_sphinx_theme.get_html_theme_path()] 63 | 64 | # Add any paths that contain custom static files (such as style sheets) here, 65 | # relative to this directory. They are copied after the builtin static files, 66 | # so a file named "default.css" will overwrite the builtin "default.css". 67 | 68 | html_static_path = ["_static"] 69 | panels_add_fontawesome_latex = True 70 | 71 | html_theme_options = { 72 | 'pytorch_project': 'examples', 73 | 'collapse_navigation': False, 74 | 'display_version': True, 75 | 'logo_only': False, 76 | 'analytics_id': 'UA-117752657-2', 77 | } 78 | -------------------------------------------------------------------------------- /fast_neural_style/README.md: -------------------------------------------------------------------------------- 1 | # fast-neural-style :city_sunrise: :rocket: 2 | 3 | This repository contains a pytorch implementation of an algorithm for artistic style transfer. The algorithm can be used to mix the content of an image with the style of another image. For example, here is a photograph of a door arch rendered in the style of a stained glass painting. 4 | 5 | The model uses the method described in [Perceptual Losses for Real-Time Style Transfer and Super-Resolution](https://arxiv.org/abs/1603.08155) along with [Instance Normalization](https://arxiv.org/pdf/1607.08022.pdf). The saved-models for examples shown in the README can be downloaded from [here](https://www.dropbox.com/s/lrvwfehqdcxoza8/saved_models.zip?dl=0). 6 | 7 |

8 | 9 | 10 | 11 |

12 | 13 | ## Requirements 14 | 15 | The program is written in Python, and uses [pytorch](http://pytorch.org/), [scipy](https://www.scipy.org). A GPU is not necessary, but can provide a significant speed up especially for training a new model. Regular sized images can be styled on a laptop or desktop using saved models. 16 | 17 | ## Usage 18 | 19 | Stylize image 20 | 21 | ``` 22 | python neural_style/neural_style.py eval --content-image --model --output-image --cuda 0 23 | ``` 24 | 25 | - `--content-image`: path to content image you want to stylize. 26 | - `--model`: saved model to be used for stylizing the image (eg: `mosaic.pth`) 27 | - `--output-image`: path for saving the output image. 28 | - `--content-scale`: factor for scaling down the content image if memory is an issue (eg: value of 2 will halve the height and width of content-image) 29 | - `--cuda`: set it to 1 for running on GPU, 0 for CPU. 30 | - `--mps`: set it to 1 for running on macOS GPU 31 | 32 | Train model 33 | 34 | ```bash 35 | python neural_style/neural_style.py train --dataset --style-image --save-model-dir --epochs 2 --cuda 1 36 | ``` 37 | 38 | There are several command line arguments, the important ones are listed below 39 | 40 | - `--dataset`: path to training dataset, the path should point to a folder containing another folder with all the training images. I used COCO 2014 Training images dataset [80K/13GB] [(download)](https://cocodataset.org/#download). 41 | - `--style-image`: path to style-image. 42 | - `--save-model-dir`: path to folder where trained model will be saved. 43 | - `--cuda`: set it to 1 for running on GPU, 0 for CPU. 44 | - `--mps`: set it to 1 for running on macOS GPU 45 | 46 | Refer to `neural_style/neural_style.py` for other command line arguments. For training new models you might have to tune the values of `--content-weight` and `--style-weight`. The mosaic style model shown above was trained with `--content-weight 1e5` and `--style-weight 1e10`. The remaining 3 models were also trained with similar order of weight parameters with slight variation in the `--style-weight` (`5e10` or `1e11`). 47 | 48 | ## Models 49 | 50 | Models for the examples shown below can be downloaded from [here](https://www.dropbox.com/s/lrvwfehqdcxoza8/saved_models.zip?dl=0) or by running the script `download_saved_models.py`. 51 | 52 |
53 | 54 |
55 | 56 |
57 | 58 | 59 | 60 | 61 |
62 | 63 | 64 | 65 | 66 |
67 | -------------------------------------------------------------------------------- /fast_neural_style/download_saved_models.py: -------------------------------------------------------------------------------- 1 | import os 2 | import zipfile 3 | 4 | # PyTorch 1.1 moves _download_url_to_file 5 | # from torch.utils.model_zoo to torch.hub 6 | # PyTorch 1.0 exists another _download_url_to_file 7 | # 2 argument 8 | # TODO: If you remove support PyTorch 1.0 or older, 9 | # You should remove torch.utils.model_zoo 10 | # Ref. PyTorch #18758 11 | # https://github.com/pytorch/pytorch/pull/18758/commits 12 | try: 13 | from torch.utils.model_zoo import _download_url_to_file 14 | except ImportError: 15 | try: 16 | from torch.hub import download_url_to_file as _download_url_to_file 17 | except ImportError: 18 | from torch.hub import _download_url_to_file 19 | 20 | 21 | def unzip(source_filename, dest_dir): 22 | with zipfile.ZipFile(source_filename) as zf: 23 | zf.extractall(path=dest_dir) 24 | 25 | 26 | if __name__ == '__main__': 27 | _download_url_to_file('https://www.dropbox.com/s/lrvwfehqdcxoza8/saved_models.zip?dl=1', 'saved_models.zip', None, True) 28 | unzip('saved_models.zip', '.') 29 | -------------------------------------------------------------------------------- /fast_neural_style/images/content-images/amber.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geohot/examples/07267b7138142b2979589a69a4d64470e29022d5/fast_neural_style/images/content-images/amber.jpg -------------------------------------------------------------------------------- /fast_neural_style/images/output-images/amber-candy.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geohot/examples/07267b7138142b2979589a69a4d64470e29022d5/fast_neural_style/images/output-images/amber-candy.jpg -------------------------------------------------------------------------------- /fast_neural_style/images/output-images/amber-mosaic.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geohot/examples/07267b7138142b2979589a69a4d64470e29022d5/fast_neural_style/images/output-images/amber-mosaic.jpg -------------------------------------------------------------------------------- /fast_neural_style/images/output-images/amber-rain-princess.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geohot/examples/07267b7138142b2979589a69a4d64470e29022d5/fast_neural_style/images/output-images/amber-rain-princess.jpg -------------------------------------------------------------------------------- /fast_neural_style/images/output-images/amber-udnie.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geohot/examples/07267b7138142b2979589a69a4d64470e29022d5/fast_neural_style/images/output-images/amber-udnie.jpg -------------------------------------------------------------------------------- /fast_neural_style/images/style-images/candy.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geohot/examples/07267b7138142b2979589a69a4d64470e29022d5/fast_neural_style/images/style-images/candy.jpg -------------------------------------------------------------------------------- /fast_neural_style/images/style-images/mosaic.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/geohot/examples/07267b7138142b2979589a69a4d64470e29022d5/fast_neural_style/images/style-images/mosaic.jpg -------------------------------------------------------------------------------- /fast_neural_style/images/style-images/rain-princess-cropped.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geohot/examples/07267b7138142b2979589a69a4d64470e29022d5/fast_neural_style/images/style-images/rain-princess-cropped.jpg -------------------------------------------------------------------------------- /fast_neural_style/images/style-images/rain-princess.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geohot/examples/07267b7138142b2979589a69a4d64470e29022d5/fast_neural_style/images/style-images/rain-princess.jpg -------------------------------------------------------------------------------- /fast_neural_style/images/style-images/udnie.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geohot/examples/07267b7138142b2979589a69a4d64470e29022d5/fast_neural_style/images/style-images/udnie.jpg -------------------------------------------------------------------------------- /fast_neural_style/neural_style/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geohot/examples/07267b7138142b2979589a69a4d64470e29022d5/fast_neural_style/neural_style/__init__.py -------------------------------------------------------------------------------- /fast_neural_style/neural_style/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from PIL import Image 3 | 4 | 5 | def load_image(filename, size=None, scale=None): 6 | img = Image.open(filename).convert('RGB') 7 | if size is not None: 8 | img = img.resize((size, size), Image.ANTIALIAS) 9 | elif scale is not None: 10 | img = img.resize((int(img.size[0] / scale), int(img.size[1] / scale)), Image.ANTIALIAS) 11 | return img 12 | 13 | 14 | def save_image(filename, data): 15 | img = data.clone().clamp(0, 255).numpy() 16 | img = img.transpose(1, 2, 0).astype("uint8") 17 | img = Image.fromarray(img) 18 | img.save(filename) 19 | 20 | 21 | def gram_matrix(y): 22 | (b, ch, h, w) = y.size() 23 | features = y.view(b, ch, w * h) 24 | features_t = features.transpose(1, 2) 25 | gram = features.bmm(features_t) / (ch * h * w) 26 | return gram 27 | 28 | 29 | def normalize_batch(batch): 30 | # normalize using imagenet mean and std 31 | mean = batch.new_tensor([0.485, 0.456, 0.406]).view(-1, 1, 1) 32 | std = batch.new_tensor([0.229, 0.224, 0.225]).view(-1, 1, 1) 33 | batch = batch.div_(255.0) 34 | return (batch - mean) / std 35 | -------------------------------------------------------------------------------- /fast_neural_style/neural_style/vgg.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | 3 | import torch 4 | from torchvision import models 5 | 6 | 7 | class Vgg16(torch.nn.Module): 8 | def __init__(self, requires_grad=False): 9 | super(Vgg16, self).__init__() 10 | vgg_pretrained_features = models.vgg16(weights=models.VGG16_Weights.IMAGENET1K_V1).features 11 | self.slice1 = torch.nn.Sequential() 12 | self.slice2 = torch.nn.Sequential() 13 | self.slice3 = 
torch.nn.Sequential() 14 | self.slice4 = torch.nn.Sequential() 15 | for x in range(4): 16 | self.slice1.add_module(str(x), vgg_pretrained_features[x]) 17 | for x in range(4, 9): 18 | self.slice2.add_module(str(x), vgg_pretrained_features[x]) 19 | for x in range(9, 16): 20 | self.slice3.add_module(str(x), vgg_pretrained_features[x]) 21 | for x in range(16, 23): 22 | self.slice4.add_module(str(x), vgg_pretrained_features[x]) 23 | if not requires_grad: 24 | for param in self.parameters(): 25 | param.requires_grad = False 26 | 27 | def forward(self, X): 28 | h = self.slice1(X) 29 | h_relu1_2 = h 30 | h = self.slice2(h) 31 | h_relu2_2 = h 32 | h = self.slice3(h) 33 | h_relu3_3 = h 34 | h = self.slice4(h) 35 | h_relu4_3 = h 36 | vgg_outputs = namedtuple("VggOutputs", ['relu1_2', 'relu2_2', 'relu3_3', 'relu4_3']) 37 | out = vgg_outputs(h_relu1_2, h_relu2_2, h_relu3_3, h_relu4_3) 38 | return out 39 | -------------------------------------------------------------------------------- /fx/README.md: -------------------------------------------------------------------------------- 1 | # FX Examples 2 | 3 | This folder contains several examples of program transformations implemented using `torch.fx`. More information about FX can be found in the [documentation](https://pytorch.org/docs/master/fx.html). 4 | 5 | Note that all examples should be runnable as standalone Python files. In the case of an exception, the example will appear in a subfolder with a `README.md` file explaining how to run the example. 6 | 7 | As FX is currently in a Beta release, the API or these examples are subject to change. 8 | -------------------------------------------------------------------------------- /fx/inline_function.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.fx import Proxy, symbolic_trace 3 | from torch.fx.node import map_arg 4 | 5 | 6 | ''' 7 | How to Inline a Function Into an Existing Graph 8 | 9 | One reason you might want to inline a function is to get around FX's 10 | default tracing behavior. For example, unless you've defined a custom 11 | Tracer, the out-of-the-box implementation of ``symbolic_trace`` causes 12 | references to ``torch.nn`` module instances to appear as 13 | ``call_module`` calls rather than being traced through. Let's say this 14 | behavior is almost what you need; the only problem is that there's a 15 | single module call that you want to replace with an inlined trace of the 16 | function. Creating a custom Tracer would be too much. Instead, you can 17 | accomplish this using Proxies. 18 | 19 | The following code demonstrates how to trace a module and inline it 20 | into an existing Graph using Proxy. We'll trace our Graph, then iterate 21 | through its Nodes until we find the right place to swap out the 22 | ``call_module`` Node with an inlined trace. At that point, we'll create 23 | Proxies from the Node's args and kwargs. Finally, we'll call the 24 | function we want to replace with those Proxies--which will, in essence, 25 | "trace" that function. Finally, we'll insert the result of that call 26 | into our Graph. (This last step will automatically inline the function.) 27 | ''' 28 | 29 | 30 | # Sample module 31 | class M(torch.nn.Module): 32 | def __init__(self): 33 | super().__init__() 34 | self.relu = torch.nn.ReLU() 35 | 36 | def forward(self, x): 37 | return self.relu(x) + 1.0 38 | 39 | # Symbolically trace an instance of `M`. After tracing, `self.relu` is 40 | # represented as a `call_module` Node. 
The full operation in the 41 | # generated `forward` function's code will appear as `self.relu(x)` 42 | m = symbolic_trace(M()) 43 | 44 | # Insert nodes from the ReLU graph in place of the original call to 45 | # `self.relu` 46 | # create a graph-appending tracer pointing to the original graph 47 | tracer = torch.fx.proxy.GraphAppendingTracer(m.graph) 48 | for node in m.graph.nodes: 49 | # Find `call_module` Node in `m` that corresponds to `self.relu`. 50 | # This is the Node we want to swap out for an inlined version of the 51 | # same call 52 | if (node.op, node.target) == ("call_module", "relu"): 53 | with m.graph.inserting_before(node): 54 | # Create a Proxy from each Node in the current Node's 55 | # args/kwargs 56 | proxy_args = map_arg(node.args, lambda n: Proxy(n, tracer)) 57 | proxy_kwargs = map_arg(node.kwargs, lambda n: Proxy(n, tracer)) 58 | # Call `m.relu` with the newly-created Proxy arguments. 59 | # `m.relu` is the generic version of the function; by 60 | # calling it with Proxies created from Nodes in `m`, we're 61 | # emitting Nodes that reference exiting values in the IR. 62 | # The result of this call is another Proxy, which we can 63 | # hook into our existing Graph to complete the function 64 | # inlining. 65 | proxy_output = m.relu(*proxy_args, **proxy_kwargs) 66 | # Replace the relu `call_module` node with the inlined 67 | # version of the function 68 | node.replace_all_uses_with(proxy_output.node) 69 | # Make sure that the old relu Node is erased 70 | m.graph.erase_node(node) 71 | -------------------------------------------------------------------------------- /fx/invert.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.fx as fx 3 | 4 | # An inverse mapping is one that takes a function f(x) and returns a function g 5 | # such that f(g(x)) == x. For example,since log(exp(x)) == x, exp and log are 6 | # inverses. 7 | 8 | invert_mapping = {} 9 | def add_inverse(a, b): 10 | invert_mapping[a] = b 11 | invert_mapping[b] = a 12 | inverses = [ 13 | (torch.sin, torch.arcsin), 14 | (torch.cos, torch.arccos), 15 | (torch.tan, torch.arctan), 16 | (torch.exp, torch.log), 17 | ] 18 | for a, b in inverses: 19 | add_inverse(a, b) 20 | 21 | # The general strategy is that we walk the graph backwards, transforming each 22 | # node into its inverse. To do so, we swap the outputs and inputs of the 23 | # functions, and then we look up its inverse in `invert_mapping`. Note that 24 | # this transform assumes that all operations take in only one input and return 25 | # one output. 26 | def invert(model: torch.nn.Module) -> torch.nn.Module: 27 | fx_model = fx.symbolic_trace(model) 28 | new_graph = fx.Graph() # As we're building up a new graph 29 | env = {} 30 | for node in reversed(fx_model.graph.nodes): 31 | if node.op == 'call_function': 32 | # This creates a node in the new graph with the inverse function, 33 | # and passes `env[node.name]` (i.e. the previous output node) as 34 | # input. 
35 | new_node = new_graph.call_function(invert_mapping[node.target], (env[node.name],)) 36 | env[node.args[0].name] = new_node 37 | elif node.op == 'output': 38 | # We turn the output into an input placeholder 39 | new_node = new_graph.placeholder(node.name) 40 | env[node.args[0].name] = new_node 41 | elif node.op == 'placeholder': 42 | # We turn the input placeholder into an output 43 | new_graph.output(env[node.name]) 44 | else: 45 | raise RuntimeError("Not implemented") 46 | 47 | new_graph.lint() 48 | return fx.GraphModule(fx_model, new_graph) 49 | 50 | 51 | def f(x): 52 | return torch.exp(torch.tan(x)) 53 | 54 | res = invert(f) 55 | print(res.code) 56 | """ 57 | def forward(self, output): 58 | log_1 = torch.log(output); output = None 59 | arctan_1 = torch.arctan(log_1); log_1 = None 60 | return arctan_1 61 | """ 62 | print(f(res((torch.arange(5) + 1)))) # [1., 2., 3., 4, 5.] 63 | -------------------------------------------------------------------------------- /fx/native_interpreter/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.1 FATAL_ERROR) 2 | project(interpreter) 3 | 4 | find_package(Torch REQUIRED) 5 | 6 | # Define our library target 7 | add_library(interpreter SHARED interpreter.cpp) 8 | set(CMAKE_CXX_STANDARD 17) 9 | # Link against LibTorch 10 | target_link_libraries(interpreter "${TORCH_LIBRARIES}") 11 | -------------------------------------------------------------------------------- /fx/native_interpreter/README.md: -------------------------------------------------------------------------------- 1 | # Converting PyTorch Code to a Native Runtime With FX and TorchScript Custom Classes 2 | 3 | In this example, we are going to build a pipeline that does the following things: 4 | 5 | 1. Converts (or “lowers”) code in a PyTorch module into another representation (we will define the representation within the example) 6 | 2. Registers an interpreter for that code representation that can be used in TorchScript or Python 7 | 3. Wrap the converted code into a format that can still be used in TorchScript compilation. 8 | 9 | We are going to build up a trivial interpreter for this example, but you can imagine extending the same process to work with more sophisticated backends, ones which may do code optimization or offloading to an accelerator. 10 | 11 | We will be using [TorchScript custom classes](https://pytorch.org/tutorials/advanced/torch_script_custom_classes.html) to expose this Interpreter to Python and TorchScript. You may want to review that tutorial and documentation before reading this example project. 12 | 13 | ### Defining the Interpreter 14 | 15 | We define the interpreter in `interpreter.cpp`. This interpreter is very limited: it only supports two element-wise operations (`add` and `mul`) and it only supports `Tensor` values. When this interpreter runs code, it iterates through the list of instructions and simply calls the appropriate PyTorch operator from C++. 16 | 17 | To build the interpreter into a shared-object file to be loaded in for use, use the following commands from this example’s root: 18 | 19 | 20 | ``` 21 | $ mkdir build 22 | $ cd build 23 | $ cmake -DCMAKE_PREFIX_PATH="$(python -c 'import torch.utils; print(torch.utils.cmake_prefix_path)')" .. 24 | $ make -j 25 | ``` 26 | 27 | After the build finishes, you should see `build/libinterpreter.so` (or with a different extension depending on your OS). 
We will use this dynamic library next when we load it up into a process to be used in execution. 28 | 29 | ### Defining the Transformation 30 | 31 | We define the code that transforms a `PyTorch` module to the format the interpreter understands in `use_interpreter.py`. Note that that file loads in the shared object we built in the previous step via a `torch.classes.load_library` call. `use_interpreter.py` contains driver code and the end that can be directly run to test the lowering transformation. 32 | 33 | ### Questions, Comments, Feedback 34 | 35 | Please direct questions and discussion to the [PyTorch forums](https://discuss.pytorch.org/). To report any issues with PyTorch (including FX and custom classes), please use the [issue tracker](https://github.com/pytorch/pytorch/issues). 36 | -------------------------------------------------------------------------------- /fx/proxy_based_graph_creation.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.fx import Proxy, Graph, GraphModule 3 | 4 | 5 | ''' 6 | How to Create a Graph Using Proxy Objects Instead of Tracing 7 | 8 | It's possible to directly create a Proxy object around a raw Node. This 9 | can be used to create a Graph independently of symbolic tracing. 10 | 11 | The following code demonstrates how to use Proxy with a raw Node to 12 | append operations to a fresh Graph. We'll create two parameters (``x`` 13 | and ``y``), perform some operations on those parameters, then add 14 | everything we created to the new Graph. We'll then wrap that Graph in 15 | a GraphModule. Doing so creates a runnable instance of ``nn.Module`` 16 | where previously-created operations are represented in the Module's 17 | ``forward`` function. 18 | 19 | By the end of the tutorial, we'll have added the following method to an 20 | empty ``nn.Module`` class. 21 | 22 | .. code-block:: python 23 | 24 | def forward(self, x, y): 25 | cat_1 = torch.cat([x, y]); x = y = None 26 | tanh_1 = torch.tanh(cat_1); cat_1 = None 27 | neg_1 = torch.neg(tanh_1); tanh_1 = None 28 | return neg_1 29 | 30 | ''' 31 | 32 | 33 | # Create a graph independently of symbolic tracing 34 | graph = Graph() 35 | tracer = torch.fx.proxy.GraphAppendingTracer(graph) 36 | 37 | # Create raw Nodes 38 | raw1 = graph.placeholder('x') 39 | raw2 = graph.placeholder('y') 40 | 41 | # Initialize Proxies using the raw Nodes and graph's default tracer 42 | y = Proxy(raw1, tracer) 43 | z = Proxy(raw2, tracer) 44 | # y = Proxy(raw1) 45 | # z = Proxy(raw2) 46 | 47 | # Create other operations using the Proxies `y` and `z` 48 | a = torch.cat([y, z]) 49 | b = torch.tanh(a) 50 | c = torch.neg(b) 51 | # By using the graph's own appending tracer to create Proxies, 52 | # notice we can now use n-ary operators on operations without 53 | # multiple tracers being created at run-time (line 52) which leads 54 | # to errors # To try this out for yourself, replace lines 42, 43 55 | # with 44, 45 56 | z = torch.add(b, c) 57 | 58 | # Create a new output Node and add it to the Graph. 
By doing this, the 59 | # Graph will contain all the Nodes we just created (since they're all 60 | # linked to the output Node) 61 | graph.output(c.node) 62 | 63 | # Wrap our created Graph in a GraphModule to get a final, runnable 64 | # `nn.Module` instance 65 | mod = GraphModule(torch.nn.Module(), graph) 66 | -------------------------------------------------------------------------------- /fx/replace_op.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.fx import symbolic_trace 3 | import operator 4 | 5 | """ 6 | How to Replace One Op With Another 7 | 8 | 1. Iterate through all Nodes in your GraphModule's Graph. 9 | 2. Determine if the current Node should be replaced. (Suggested: match 10 | on the Node's ``target`` attribute). 11 | 3. Create a replacement Node and add it to the Graph. 12 | 4. Use the FX built-in ``replace_all_uses_with`` to replace all uses of 13 | the current Node with the replacement. 14 | 5. Delete the old Node from the graph. 15 | 6. Call ``recompile`` on the GraphModule. This updates the generated 16 | Python code to reflect the new Graph state. 17 | 18 | Currently, FX does not provide any way to guarantee that replaced 19 | operators are syntactically valid. It's up to the user to confirm that 20 | any new operators will work with the existing operands. 21 | 22 | The following code demonstrates an example of replacing any instance of 23 | addition with a bitwise AND. 24 | 25 | To examine how the Graph evolves during op replacement, add the 26 | statement `print(traced.graph)` after the line you want to inspect. 27 | Alternatively, call `traced.graph.print_tabular()` to see the IR in a 28 | tabular format. 29 | """ 30 | 31 | # Sample module 32 | class M(torch.nn.Module): 33 | def forward(self, x, y): 34 | return x + y, torch.add(x, y), x.add(y) 35 | 36 | # Symbolically trace an instance of the module 37 | traced = symbolic_trace(M()) 38 | 39 | # As demonstrated in the above example, there are several different ways 40 | # to denote addition. The possible cases are: 41 | # 1. `x + y` - A `call_function` Node with target `operator.add`. 42 | # We can match for equality on that `operator.add` directly. 43 | # 2. `torch.add(x, y)` - A `call_function` Node with target 44 | # `torch.add`. Similarly, we can match this function directly. 45 | # 3. `x.add(y)` - The Tensor method call, whose target we can match 46 | # as a string. 47 | 48 | patterns = set([operator.add, torch.add, "add"]) 49 | 50 | # Go through all the nodes in the Graph 51 | for n in traced.graph.nodes: 52 | # If the target matches one of the patterns 53 | if any(n.target == pattern for pattern in patterns): 54 | # Set the insert point, add the new node, and replace all uses 55 | # of `n` with the new node 56 | with traced.graph.inserting_after(n): 57 | new_node = traced.graph.call_function(torch.bitwise_and, n.args, n.kwargs) 58 | n.replace_all_uses_with(new_node) 59 | # Remove the old node from the graph 60 | traced.graph.erase_node(n) 61 | 62 | # Don't forget to recompile! 
63 | traced.recompile() 64 | -------------------------------------------------------------------------------- /fx/subgraph_rewriter_basic_use.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.fx import symbolic_trace, replace_pattern 3 | 4 | 5 | ''' 6 | How to Use the FX Subgraph Rewriter 7 | 8 | For easy subgraph rewriting, FX exposes the utility function: 9 | 10 | replace_pattern(gm : GraphModule, 11 | pattern : Callable, 12 | replacement : Callable) 13 | -> None 14 | 15 | `replace_pattern` matches all possible non-overlapping sets of operators 16 | and their data dependencies (`pattern`) in the Graph of a GraphModule 17 | (`gm`), then replaces each of these matched subgraphs with another 18 | subgraph (`replacement). 19 | 20 | The docstring for `replace_pattern` (located in `subgraph_rewriter.py`) 21 | gives an in-depth explanation as to how `pattern` and `replacement` 22 | should be specified, what happens during pattern matching, and other 23 | important technical details. This tutorial, therefore, is only meant to 24 | give an overview as to the FX Subgraph Rewriter's basic functionality. 25 | Let's go rewrite a Graph! 26 | ''' 27 | 28 | # Sample module 29 | class M(torch.nn.Module): 30 | def __init__(self): 31 | super().__init__() 32 | 33 | def forward(self, x, w1, w2): 34 | val1 = torch.neg(w1) 35 | m1 = torch.cat([val1, w2]).sum() 36 | val2 = torch.neg(w1) 37 | m2 = torch.cat([val2, w2]).sum() 38 | return x + torch.max(m1) + torch.max(m2) 39 | 40 | # Symbolically trace an instance of `M` 41 | traced = symbolic_trace(M()) 42 | 43 | # Define the pattern. The FX Subgraph Rewriter will match all 44 | # non-overlapping instances of the pattern in the larger graph. 45 | # Note that Pattern-matching is done based on data dependencies, 46 | # not Node names. Even though we're operating on Nodes named `a1` and 47 | # `a2` instead of `w1` and `w2`, the pattern is still a valid match 48 | # for the two instances of `torch.cat([w1, w2]).sum()` above. Only 49 | # operations that contribute to the single output value of the pattern 50 | # are considered 51 | def pattern(a1, a2): 52 | val1 = torch.neg(a1) 53 | return torch.cat([val1, a2]).sum() 54 | 55 | # Define the replacement (same rules as the pattern) 56 | def replacement(w1, w2): 57 | return torch.stack([w1, w2]) 58 | 59 | # Replace `pattern` with `replacement` in `traced` 60 | replace_pattern(traced, pattern, replacement) 61 | 62 | # After calling `replace_pattern`, the generated code is: 63 | ''' 64 | def forward(self, x, w1, w2): 65 | stack = torch.stack([w1, w2]) 66 | max_1 = torch.max(stack); stack = None 67 | add = x + max_1; x = max_1 = None 68 | stack_1 = torch.stack([w1, w2]); w1 = w2 = None 69 | max_2 = torch.max(stack_1); stack_1 = None 70 | add_1 = add + max_2; add = max_2 = None 71 | return add_1 72 | ''' 73 | -------------------------------------------------------------------------------- /fx/wrap_output_dynamically.py: -------------------------------------------------------------------------------- 1 | 2 | from enum import Enum, auto 3 | 4 | import torch 5 | from torch.fx import GraphModule, Node, Proxy, symbolic_trace 6 | 7 | ''' 8 | Wrap Graph Output Dynamically 9 | 10 | The following code demonstrates how change an existing Graph based on 11 | parameters specified at runtime. We'll let the user specify an 12 | activation function from a predefined Enum list, then we'll symbolically 13 | trace it. 
Next, we'll create a Proxy from the last operation in the 14 | Graph. We'll call our traced activation function with this Proxy and 15 | insert the ``output`` Node from that call into our Graph. (This final 16 | step will automatically inline the entire traced function.) 17 | ''' 18 | 19 | 20 | # Sample module 21 | class M(torch.nn.Module): 22 | def __init__(self): 23 | super().__init__() 24 | 25 | def forward(self, x, y): 26 | y = torch.cat([x, y]) 27 | return y 28 | 29 | # Symbolically trace an instance of `M` 30 | traced = symbolic_trace(M()) 31 | 32 | # Selected activation functions 33 | class ActivationFunction(Enum): 34 | RELU = auto() 35 | LEAKY_RELU = auto() 36 | PRELU = auto() 37 | 38 | # Map activation function names to their implementation 39 | activation_functions = { 40 | ActivationFunction.RELU: torch.nn.ReLU(), 41 | ActivationFunction.LEAKY_RELU: torch.nn.LeakyReLU(), 42 | ActivationFunction.PRELU: torch.nn.PReLU(), 43 | } 44 | 45 | def wrap_in_activation_function(m: GraphModule, fn: ActivationFunction) -> GraphModule: 46 | # Get output node 47 | output_node: Optional[Node] = None 48 | for n in reversed(m.graph.nodes): 49 | if n.op == "output": 50 | output_node = n 51 | break 52 | assert output_node 53 | 54 | # Get the actual output (the "input" of the output node). This is 55 | # the Node we want to wrap in a user-specified activation function 56 | assert len(output_node.all_input_nodes) == 1 57 | wrap_node = output_node.all_input_nodes[0] 58 | 59 | # Wrap the actual output in a Proxy 60 | wrap_proxy = Proxy(wrap_node) 61 | 62 | # Get the implementation of the specified activation function and 63 | # symbolically trace it 64 | fn_impl = activation_functions[fn] 65 | fn_impl_traced = symbolic_trace(fn_impl) 66 | 67 | # Call the specified activation function using the Proxy wrapper for 68 | # `output_op`. The result of this call is another Proxy, which we 69 | # can hook into our existing Graph. 70 | with traced.graph.inserting_after(wrap_node): 71 | fn_impl_output_node = fn_impl_traced(wrap_proxy) 72 | new_args = (fn_impl_output_node.node,) 73 | output_node.args = new_args 74 | 75 | m.recompile() 76 | 77 | 78 | # Example call 79 | x, y = torch.randn(5, 3), torch.randn(5, 3) 80 | orig_output = traced(x, y) 81 | 82 | wrap_in_activation_function(traced, ActivationFunction.LEAKY_RELU) 83 | new_output = traced(x, y) 84 | 85 | torch.testing.assert_close(new_output, torch.nn.LeakyReLU()(orig_output)) 86 | -------------------------------------------------------------------------------- /gat/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | requests 3 | numpy<2 4 | -------------------------------------------------------------------------------- /gcn/README.md: -------------------------------------------------------------------------------- 1 | # Graph Convolutional Network 2 | 3 | This repository contains an implementation of Graph Convolutional Networks (GCN) based on the paper "Semi-Supervised Classification with Graph Convolutional Networks" by Thomas N. Kipf and Max Welling. 4 | 5 | ## Overview 6 | This project implements the GCN model proposed in the paper for semi-supervised node classification on graph-structured data. GCN leverages graph convolutions to aggregate information from neighboring nodes and learn node representations for downstream tasks. The implementation provides a flexible and efficient GCN model for graph-based machine learning tasks. 
7 | 8 | # Requirements 9 | - Python 3.7 or higher 10 | - PyTorch 2.0 or higher 11 | - Requests 2.31 or higher 12 | - NumPy 1.24 or higher 13 | 14 | 15 | # Installation 16 | ```bash 17 | pip install -r requirements.txt 18 | python main.py 19 | ``` 20 | 21 | # Dataset 22 | The implementation includes support for the Cora dataset, a standard benchmark dataset for graph-based machine learning tasks. The Cora dataset consists of scientific publications, where nodes represent papers and edges represent citation relationships. Each paper is associated with a binary label indicating one of seven classes. The dataset is downloaded, preprocessed and ready to use. 23 | 24 | ## Model Architecture 25 | The GCN model architecture follows the details provided in the paper. It consists of multiple graph convolutional layers with ReLU activation, followed by a final softmax layer for classification. The implementation supports customizable hyperparameters such as the number of hidden units, the number of layers, and dropout rate. 26 | 27 | ## Usage 28 | To train and evaluate the GCN model on the Cora dataset, use the following command: 29 | ```bash 30 | python train.py --epochs 200 --lr 0.01 --l2 5e-4 --dropout-p 0.5 --hidden-dim 16 --val-every 20 --include-bias False --no-cuda False 31 | ``` 32 | 33 | # Results 34 | The model achieves a classification accuracy of 82.5% on the test set of the Cora dataset after 200 epochs of training. This result is comparable to the performance reported in the original paper. However, the results can vary due to the randomness of the train/val/test split. 35 | 36 | References 37 | Thomas N. Kipf and Max Welling. "Semi-Supervised Classification with Graph Convolutional Networks." Link to the paper 38 | 39 | Original paper repository: [https://github.com/tkipf/gcn](https://github.com/tkipf/gcn) -------------------------------------------------------------------------------- /gcn/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | torchvision==0.20.0 3 | requests 4 | numpy<2 5 | -------------------------------------------------------------------------------- /imagenet/extract_ILSVRC.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # script to extract ImageNet dataset 4 | # ILSVRC2012_img_train.tar (about 138 GB) 5 | # ILSVRC2012_img_val.tar (about 6.3 GB) 6 | # make sure ILSVRC2012_img_train.tar & ILSVRC2012_img_val.tar in your current directory 7 | # 8 | # Adapted from: 9 | # https://github.com/facebook/fb.resnet.torch/blob/master/INSTALL.md 10 | # https://gist.github.com/BIGBALLON/8a71d225eff18d88e469e6ea9b39cef4 11 | # 12 | # imagenet/train/ 13 | # ├── n01440764 14 | # │ ├── n01440764_10026.JPEG 15 | # │ ├── n01440764_10027.JPEG 16 | # │ ├── ...... 17 | # ├── ...... 18 | # imagenet/val/ 19 | # ├── n01440764 20 | # │ ├── ILSVRC2012_val_00000293.JPEG 21 | # │ ├── ILSVRC2012_val_00002138.JPEG 22 | # │ ├── ...... 23 | # ├── ...... 
24 | # 25 | # 26 | # Make imagnet directory 27 | # 28 | mkdir imagenet 29 | # 30 | # Extract the training data: 31 | # 32 | # Create train directory; move .tar file; change directory 33 | mkdir imagenet/train && mv ILSVRC2012_img_train.tar imagenet/train/ && cd imagenet/train 34 | # Extract training set; remove compressed file 35 | tar -xvf ILSVRC2012_img_train.tar && rm -f ILSVRC2012_img_train.tar 36 | # 37 | # At this stage imagenet/train will contain 1000 compressed .tar files, one for each category 38 | # 39 | # For each .tar file: 40 | # 1. create directory with same name as .tar file 41 | # 2. extract and copy contents of .tar file into directory 42 | # 3. remove .tar file 43 | find . -name "*.tar" | while read NAME ; do mkdir -p "${NAME%.tar}"; tar -xvf "${NAME}" -C "${NAME%.tar}"; rm -f "${NAME}"; done 44 | # 45 | # This results in a training directory like so: 46 | # 47 | # imagenet/train/ 48 | # ├── n01440764 49 | # │ ├── n01440764_10026.JPEG 50 | # │ ├── n01440764_10027.JPEG 51 | # │ ├── ...... 52 | # ├── ...... 53 | # 54 | # Change back to original directory 55 | cd ../.. 56 | # 57 | # Extract the validation data and move images to subfolders: 58 | # 59 | # Create validation directory; move .tar file; change directory; extract validation .tar; remove compressed file 60 | mkdir imagenet/val && mv ILSVRC2012_img_val.tar imagenet/val/ && cd imagenet/val && tar -xvf ILSVRC2012_img_val.tar && rm -f ILSVRC2012_img_val.tar 61 | # get script from soumith and run; this script creates all class directories and moves images into corresponding directories 62 | wget -qO- https://raw.githubusercontent.com/soumith/imagenetloader.torch/master/valprep.sh | bash 63 | # 64 | # This results in a validation directory like so: 65 | # 66 | # imagenet/val/ 67 | # ├── n01440764 68 | # │ ├── ILSVRC2012_val_00000293.JPEG 69 | # │ ├── ILSVRC2012_val_00002138.JPEG 70 | # │ ├── ...... 71 | # ├── ...... 72 | # 73 | # 74 | # Check total files after extract 75 | # 76 | # $ find train/ -name "*.JPEG" | wc -l 77 | # 1281167 78 | # $ find val/ -name "*.JPEG" | wc -l 79 | # 50000 80 | # 81 | -------------------------------------------------------------------------------- /imagenet/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | torchvision==0.20.0 3 | -------------------------------------------------------------------------------- /language_translation/README.md: -------------------------------------------------------------------------------- 1 | # Language Translation 2 | 3 | This example shows how one might use transformers for language translation. In particular, this implementation is loosely based on the [Attention is All You Need paper](https://arxiv.org/abs/1706.03762). 4 | 5 | ## Requirements 6 | 7 | We will need a tokenizer for our languages. Torchtext does include a tokenizer for English, but unfortunately, we will need more languages then that. We can get these tokenizers via ```spacy``` 8 | 9 | ```bash 10 | python3 -m spacy download 11 | python3 -m spacy download en 12 | python3 -m spacy download de 13 | ``` 14 | 15 | Spacy supports many languages. For a full accounting of supported languages, please look [here](https://spacy.io/usage/models). This example will default from German to English. 
16 | 17 | Torchtext is also required: 18 | ```bash 19 | pip install torchtext 20 | ``` 21 | 22 | Just running these commands will get you started: 23 | ```bash 24 | pip install -r requirements.txt 25 | python3 -m spacy download 26 | ``` 27 | 28 | ## Usage 29 | 30 | This example contains a lot of flags that you can set to change the behavior / training of the module. You can see all of them by running: 31 | 32 | ```bash 33 | python3 main.py -h 34 | ``` 35 | 36 | But in general, all of the settings have "sensible" defaults; however, the default translation is to translate from German to English. To *train* the model, you only need to run the following command, but there is also an example for how to use any language you want: 37 | 38 | ```bash 39 | python3 main.py 40 | python3 main.py --src en --tgt fr # For english to french translation 41 | ``` 42 | 43 | For model inference, you can use this command: 44 | 45 | ```bash 46 | python3 main.py --inference --model_path 47 | ``` 48 | 49 | After some loading time, this will open an interactive interface where you can type in whatever sentence you are interested in translating. 50 | -------------------------------------------------------------------------------- /language_translation/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | torchtext 3 | torchdata==0.9.0 4 | spacy 5 | portalocker 6 | -------------------------------------------------------------------------------- /language_translation/src/model.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | from torch.nn import functional as F 5 | from torch import nn 6 | 7 | class PositionalEncoding(nn.Module): 8 | def __init__( 9 | self, 10 | emb_size, 11 | dropout, 12 | maxlen=5000 13 | ): 14 | super(PositionalEncoding, self).__init__() 15 | den = torch.exp(- torch.arange(0, emb_size, 2)* math.log(10000) / emb_size) 16 | pos = torch.arange(0, maxlen).reshape(maxlen, 1) 17 | pos_embedding = torch.zeros((maxlen, emb_size)) 18 | pos_embedding[:, 0::2] = torch.sin(pos * den) 19 | pos_embedding[:, 1::2] = torch.cos(pos * den) 20 | pos_embedding = pos_embedding.unsqueeze(-2) 21 | 22 | self.dropout = nn.Dropout(dropout) 23 | self.register_buffer('pos_embedding', pos_embedding) 24 | 25 | def forward(self, token_embedding): 26 | return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :]) 27 | 28 | class Translator(nn.Module): 29 | def __init__( 30 | self, 31 | num_encoder_layers, 32 | num_decoder_layers, 33 | embed_size, 34 | num_heads, 35 | src_vocab_size, 36 | tgt_vocab_size, 37 | dim_feedforward, 38 | dropout 39 | ): 40 | super(Translator, self).__init__() 41 | 42 | # Output of embedding must be equal (embed_size) 43 | self.src_embedding = nn.Embedding(src_vocab_size, embed_size) 44 | self.tgt_embedding = nn.Embedding(tgt_vocab_size, embed_size) 45 | 46 | self.pos_enc = PositionalEncoding(embed_size, dropout) 47 | 48 | self.transformer = nn.Transformer( 49 | d_model=embed_size, 50 | nhead=num_heads, 51 | num_encoder_layers=num_encoder_layers, 52 | num_decoder_layers=num_decoder_layers, 53 | dim_feedforward=dim_feedforward, 54 | dropout=dropout 55 | ) 56 | 57 | self.ff = nn.Linear(embed_size, tgt_vocab_size) 58 | 59 | self._init_weights() 60 | 61 | def _init_weights(self): 62 | for p in self.parameters(): 63 | if p.dim() > 1: 64 | nn.init.xavier_uniform_(p) 65 | 66 | def forward(self, src, trg, src_mask, tgt_mask, src_padding_mask, 
tgt_padding_mask, memory_key_padding_mask): 67 | 68 | src_emb = self.pos_enc(self.src_embedding(src)) 69 | tgt_emb = self.pos_enc(self.tgt_embedding(trg)) 70 | 71 | outs = self.transformer( 72 | src_emb, 73 | tgt_emb, 74 | src_mask, 75 | tgt_mask, 76 | None, 77 | src_padding_mask, 78 | tgt_padding_mask, 79 | memory_key_padding_mask 80 | ) 81 | 82 | return self.ff(outs) 83 | 84 | def encode(self, src, src_mask): 85 | 86 | embed = self.src_embedding(src) 87 | 88 | pos_enc = self.pos_enc(embed) 89 | 90 | return self.transformer.encoder(pos_enc, src_mask) 91 | 92 | def decode(self, tgt, memory, tgt_mask): 93 | 94 | embed = self.tgt_embedding(tgt) 95 | 96 | pos_enc = self.pos_enc(embed) 97 | 98 | return self.transformer.decoder(pos_enc, memory, tgt_mask) 99 | -------------------------------------------------------------------------------- /legacy/snli/README.md: -------------------------------------------------------------------------------- 1 | # PyTorch-based NLI Training with SNLI 2 | 3 | ## 📝 Overview 4 | 5 | This repository contains Python scripts to train a Natural Language Inference (NLI) model, specifically the `SNLIClassifier`, using the Stanford Natural Language Inference (SNLI) corpus. The trained model predicts textual entailment, identifying if a statement is entailed, contradicted, or neither by another statement. 6 | 7 | ## ⚙️ Dependencies 8 | 9 | Install the necessary Python libraries with: 10 | 11 | ```bash 12 | pip install -r requirements.txt 13 | ``` 14 | 15 | The `requirements.txt` file includes: 16 | 17 | ``` 18 | torch 19 | torchtext 20 | spacy 21 | ``` 22 | 23 | ## 💻 Usage 24 | 25 | Start the training process with: 26 | 27 | ```bash 28 | python train.py --lower --word-vectors [PATH_TO_WORD_VECTORS] --vector-cache [PATH_TO_VECTOR_CACHE] --epochs [NUMBER_OF_EPOCHS] --batch-size [BATCH_SIZE] --save-path [PATH_TO_SAVE_MODEL] --gpu [GPU_NUMBER] 29 | ``` 30 | 31 | ## 🏋️‍♀️ Training 32 | 33 | The script trains the model on mini-batches of data across a specified number of epochs. It saves the best-performing model on the validation set as a `.pt` file in the specified directory. 34 | 35 | ## 📚 Scripts 36 | 37 | - `model.py`: Defines the `SNLIClassifier` model and auxiliary classes. 38 | - `util.py`: Contains utility functions for directory creation and command-line argument parsing. 39 | 40 | ## 📣 Note 41 | 42 | Ensure the `model.py` and `util.py` scripts are available in your working directory. 
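As a quick smoke test, the classifier can also be instantiated directly with a hand-built configuration object. The sketch below is illustrative only and is not part of the original scripts; the hyperparameter values are assumptions, and the real `train.py` builds this configuration from the command-line arguments and the torchtext vocabulary.

```python
from types import SimpleNamespace

import torch

from model import SNLIClassifier

# Hand-built stand-in for the argparse namespace; all values are illustrative.
config = SimpleNamespace(
    n_embed=100,   # vocabulary size
    d_embed=100,   # embedding dimension
    d_proj=300,    # projection dimension
    projection=True,
    d_hidden=300,
    n_layers=1,
    n_cells=2,     # n_layers * number of LSTM directions (2 because birnn=True)
    birnn=True,
    dp_ratio=0.2,
    fix_emb=True,
    d_out=3,       # entailment / contradiction / neutral
)

model = SNLIClassifier(config)

# Random token ids shaped (sequence_length, batch_size), as torchtext would supply them.
batch = SimpleNamespace(
    premise=torch.randint(0, config.n_embed, (12, 4)),
    hypothesis=torch.randint(0, config.n_embed, (10, 4)),
)
print(model(batch).shape)  # torch.Size([4, 3])
```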
-------------------------------------------------------------------------------- /legacy/snli/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class Bottle(nn.Module): 6 | 7 | def forward(self, input): 8 | if len(input.size()) <= 2: 9 | return super(Bottle, self).forward(input) 10 | size = input.size()[:2] 11 | out = super(Bottle, self).forward(input.view(size[0]*size[1], -1)) 12 | return out.view(size[0], size[1], -1) 13 | 14 | 15 | class Linear(Bottle, nn.Linear): 16 | pass 17 | 18 | 19 | class Encoder(nn.Module): 20 | 21 | def __init__(self, config): 22 | super(Encoder, self).__init__() 23 | self.config = config 24 | input_size = config.d_proj if config.projection else config.d_embed 25 | dropout = 0 if config.n_layers == 1 else config.dp_ratio 26 | self.rnn = nn.LSTM(input_size=input_size, hidden_size=config.d_hidden, 27 | num_layers=config.n_layers, dropout=dropout, 28 | bidirectional=config.birnn) 29 | 30 | def forward(self, inputs): 31 | batch_size = inputs.size()[1] 32 | state_shape = self.config.n_cells, batch_size, self.config.d_hidden 33 | h0 = c0 = inputs.new_zeros(state_shape) 34 | outputs, (ht, ct) = self.rnn(inputs, (h0, c0)) 35 | return ht[-1] if not self.config.birnn else ht[-2:].transpose(0, 1).contiguous().view(batch_size, -1) 36 | 37 | 38 | class SNLIClassifier(nn.Module): 39 | 40 | def __init__(self, config): 41 | super(SNLIClassifier, self).__init__() 42 | self.config = config 43 | self.embed = nn.Embedding(config.n_embed, config.d_embed) 44 | self.projection = Linear(config.d_embed, config.d_proj) 45 | self.encoder = Encoder(config) 46 | self.dropout = nn.Dropout(p=config.dp_ratio) 47 | self.relu = nn.ReLU() 48 | seq_in_size = 2*config.d_hidden 49 | if self.config.birnn: 50 | seq_in_size *= 2 51 | lin_config = [seq_in_size]*2 52 | self.out = nn.Sequential( 53 | Linear(*lin_config), 54 | self.relu, 55 | self.dropout, 56 | Linear(*lin_config), 57 | self.relu, 58 | self.dropout, 59 | Linear(*lin_config), 60 | self.relu, 61 | self.dropout, 62 | Linear(seq_in_size, config.d_out)) 63 | 64 | def forward(self, batch): 65 | prem_embed = self.embed(batch.premise) 66 | hypo_embed = self.embed(batch.hypothesis) 67 | if self.config.fix_emb: 68 | prem_embed = prem_embed.detach() 69 | hypo_embed = hypo_embed.detach() 70 | if self.config.projection: 71 | prem_embed = self.relu(self.projection(prem_embed)) 72 | hypo_embed = self.relu(self.projection(hypo_embed)) 73 | premise = self.encoder(prem_embed) 74 | hypothesis = self.encoder(hypo_embed) 75 | scores = self.out(torch.cat([premise, hypothesis], 1)) 76 | return scores 77 | -------------------------------------------------------------------------------- /legacy/snli/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | torchtext 3 | spacy 4 | -------------------------------------------------------------------------------- /legacy/snli/util.py: -------------------------------------------------------------------------------- 1 | import os 2 | from argparse import ArgumentParser 3 | 4 | def makedirs(name): 5 | """helper function for python 2 and 3 to call os.makedirs() 6 | avoiding an error if the directory to be created already exists""" 7 | 8 | import os, errno 9 | 10 | try: 11 | os.makedirs(name) 12 | except OSError as ex: 13 | if ex.errno == errno.EEXIST and os.path.isdir(name): 14 | # ignore existing directory 15 | pass 16 | else: 17 | # a different error happened 18 | raise 
19 | 20 | 21 | def get_args(): 22 | parser = ArgumentParser(description='PyTorch/torchtext SNLI example') 23 | parser.add_argument('--epochs', type=int, default=50, 24 | help='the number of total epochs to run.') 25 | parser.add_argument('--batch_size', type=int, default=128, 26 | help='batch size. (default: 128)') 27 | parser.add_argument('--d_embed', type=int, default=100, 28 | help='the size of each embedding vector.') 29 | parser.add_argument('--d_proj', type=int, default=300, 30 | help='the size of each projection layer.') 31 | parser.add_argument('--d_hidden', type=int, default=300, 32 | help='the number of features in the hidden state.') 33 | parser.add_argument('--n_layers', type=int, default=1, 34 | help='the number of recurrent layers. (default: 1)') 35 | parser.add_argument('--log_every', type=int, default=50, 36 | help='iteration period to output log.') 37 | parser.add_argument('--lr', type=float, default=.001, 38 | help='initial learning rate.') 39 | parser.add_argument('--dev_every', type=int, default=1000, 40 | help='log period of validation results.') 41 | parser.add_argument('--save_every', type=int, default=1000, 42 | help='model checkpoint period.') 43 | parser.add_argument('--dp_ratio', type=float, default=0.2, 44 | help='probability of an element to be zeroed.') 45 | parser.add_argument('--no-bidirectional', action='store_false', dest='birnn', 46 | help='disable bidirectional LSTM.') 47 | parser.add_argument('--preserve-case', action='store_false', dest='lower', 48 | help='case-sensitivity.') 49 | parser.add_argument('--no-projection', action='store_false', dest='projection', 50 | help='disable projection layer.') 51 | parser.add_argument('--train_embed', action='store_false', dest='fix_emb', 52 | help='enable embedding word training.') 53 | parser.add_argument('--gpu', type=int, default=0, 54 | help='gpu id to use. (default: 0)') 55 | parser.add_argument('--save_path', type=str, default='results', 56 | help='save path of results.') 57 | parser.add_argument('--vector_cache', type=str, default=os.path.join(os.getcwd(), '.vector_cache/input_vectors.pt'), 58 | help='name of vector cache directory, which saved input word-vectors.') 59 | parser.add_argument('--word_vectors', type=str, default='glove.6B.100d', 60 | help='one of or a list containing instantiations of the GloVe, CharNGram, or Vectors classes. ' 61 | 'Alternatively, one of or a list of available pretrained vectors: ' 62 | 'charngram.100d fasttext.en.300d fasttext.simple.300d ' 63 | 'glove.42B.300d glove.840B.300d glove.twitter.27B.25d ' 64 | 'glove.twitter.27B.50d glove.twitter.27B.100d glove.twitter.27B.200d ' 65 | 'glove.6B.50d glove.6B.100d glove.6B.200d glove.6B.300d') 66 | parser.add_argument('--resume_snapshot', type=str, default='', 67 | help='model snapshot to resume.') 68 | parser.add_argument('--dry-run', action='store_true', 69 | help='run only a few iterations') 70 | args = parser.parse_args() 71 | return args 72 | -------------------------------------------------------------------------------- /mnist/README.md: -------------------------------------------------------------------------------- 1 | # Basic MNIST Example 2 | 3 | ```bash 4 | pip install -r requirements.txt 5 | python main.py 6 | # CUDA_VISIBLE_DEVICES=2 python main.py # to specify GPU id to ex.
2 7 | ``` 8 | -------------------------------------------------------------------------------- /mnist/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | torchvision==0.20.0 3 | -------------------------------------------------------------------------------- /mnist_forward_forward/README.md: -------------------------------------------------------------------------------- 1 | # Basic Forward-Forward Example 2 | 3 | This example implements the paper [The Forward-Forward Algorithm: Some Preliminary Investigations](https://arxiv.org/abs/2212.13345) by Geoffrey Hinton. 4 | 5 | The aim of this paper is to introduce a new learning procedure for neural networks, one that replaces the forward and backward passes of backpropagation with two forward passes. 6 | 7 | ```bash 8 | pip install -r requirements.txt 9 | python main.py 10 | ``` 11 | 12 | The main.py script accepts the following arguments: 13 | 14 | ```bash 15 | optional arguments: 16 | -h, --help show this help message and exit 17 | --epochs EPOCHS number of epochs to train (default: 1000) 18 | --lr LR learning rate (default: 0.03) 19 | --no_cuda disables CUDA training 20 | --no_mps disables MPS training 21 | --seed SEED random seed (default: 1) 22 | --save_model For saving the current Model 23 | --train_size TRAIN_SIZE 24 | size of training set 25 | --threshold THRESHOLD 26 | threshold for training 27 | --test_size TEST_SIZE 28 | size of test set 29 | --save-model For Saving the current Model 30 | --log-interval LOG_INTERVAL 31 | logging training status interval 32 | ``` 33 | -------------------------------------------------------------------------------- /mnist_forward_forward/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | torchvision==0.20.0 3 | -------------------------------------------------------------------------------- /mnist_hogwild/README.md: -------------------------------------------------------------------------------- 1 | # MNIST Hogwild Example 2 | 3 | ```bash 4 | pip install -r requirements.txt 5 | python main.py 6 | ``` 7 | 8 | The main.py script accepts the following arguments: 9 | 10 | ```bash 11 | optional arguments: 12 | -h, --help show this help message and exit 13 | --batch_size input batch size for training (default: 64) 14 | --testing_batch_size input batch size for testing (default: 1000) 15 | --epochs EPOCHS number of epochs to train (default: 1000) 16 | --lr LR learning rate (default: 0.03) 17 | --momentum SGD momentum (default: 0.5) 18 | --seed SEED random seed (default: 1) 19 | --mps enables macOS GPU training 20 | --save_model For saving the current Model 21 | --log_interval how many batches to wait before logging training status 22 | --num_process how many training processes to use (default: 2) 23 | --cuda enables CUDA training 24 | --dry-run quickly check a single pass 25 | --save-model For Saving the current Model 26 | ``` 27 | -------------------------------------------------------------------------------- /mnist_hogwild/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | torchvision==0.20.0 3 | -------------------------------------------------------------------------------- /mnist_hogwild/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.optim as optim 4 | import torch.nn.functional as F 5 | 6 | 7 | def train(rank, args, model, device, dataset,
dataloader_kwargs): 8 | torch.manual_seed(args.seed + rank) 9 | 10 | train_loader = torch.utils.data.DataLoader(dataset, **dataloader_kwargs) 11 | 12 | optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum) 13 | for epoch in range(1, args.epochs + 1): 14 | train_epoch(epoch, args, model, device, train_loader, optimizer) 15 | 16 | 17 | def test(args, model, device, dataset, dataloader_kwargs): 18 | torch.manual_seed(args.seed) 19 | 20 | test_loader = torch.utils.data.DataLoader(dataset, **dataloader_kwargs) 21 | 22 | test_epoch(model, device, test_loader) 23 | 24 | 25 | def train_epoch(epoch, args, model, device, data_loader, optimizer): 26 | model.train() 27 | pid = os.getpid() 28 | for batch_idx, (data, target) in enumerate(data_loader): 29 | optimizer.zero_grad() 30 | output = model(data.to(device)) 31 | loss = F.nll_loss(output, target.to(device)) 32 | loss.backward() 33 | optimizer.step() 34 | if batch_idx % args.log_interval == 0: 35 | print('{}\tTrain Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( 36 | pid, epoch, batch_idx * len(data), len(data_loader.dataset), 37 | 100. * batch_idx / len(data_loader), loss.item())) 38 | if args.dry_run: 39 | break 40 | 41 | 42 | def test_epoch(model, device, data_loader): 43 | model.eval() 44 | test_loss = 0 45 | correct = 0 46 | with torch.no_grad(): 47 | for data, target in data_loader: 48 | output = model(data.to(device)) 49 | test_loss += F.nll_loss(output, target.to(device), reduction='sum').item() # sum up batch loss 50 | pred = output.max(1)[1] # get the index of the max log-probability 51 | correct += pred.eq(target.to(device)).sum().item() 52 | 53 | test_loss /= len(data_loader.dataset) 54 | print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format( 55 | test_loss, correct, len(data_loader.dataset), 56 | 100. * correct / len(data_loader.dataset))) 57 | -------------------------------------------------------------------------------- /mnist_rnn/README.md: -------------------------------------------------------------------------------- 1 | # Example of MNIST using RNN 2 | 3 | ## Motivation 4 | Create pytorch example similar to Official Tensorflow Keras RNN example using MNIST [here](https://www.tensorflow.org/guide/keras/rnn) 5 | 6 | ```bash 7 | pip install -r requirements.txt 8 | python main.py 9 | # CUDA_VISIBLE_DEVICES=2 python main.py # to specify GPU id to ex. 2 10 | ``` 11 | -------------------------------------------------------------------------------- /mnist_rnn/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | torchvision==0.20.0 3 | -------------------------------------------------------------------------------- /regression/README.md: -------------------------------------------------------------------------------- 1 | # Linear regression example 2 | 3 | Trains a single fully-connected layer to fit a 4th degree polynomial. 4 | -------------------------------------------------------------------------------- /regression/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import print_function 3 | from itertools import count 4 | 5 | import torch 6 | import torch.nn.functional as F 7 | 8 | POLY_DEGREE = 4 9 | W_target = torch.randn(POLY_DEGREE, 1) * 5 10 | b_target = torch.randn(1) * 5 11 | 12 | 13 | def make_features(x): 14 | """Builds features i.e. 
a matrix with columns [x, x^2, x^3, x^4].""" 15 | x = x.unsqueeze(1) 16 | return torch.cat([x ** i for i in range(1, POLY_DEGREE+1)], 1) 17 | 18 | 19 | def f(x): 20 | """Approximated function.""" 21 | return x.mm(W_target) + b_target.item() 22 | 23 | 24 | def poly_desc(W, b): 25 | """Creates a string description of a polynomial.""" 26 | result = 'y = ' 27 | for i, w in enumerate(W): 28 | result += '{:+.2f} x^{} '.format(w, i + 1) 29 | result += '{:+.2f}'.format(b[0]) 30 | return result 31 | 32 | 33 | def get_batch(batch_size=32): 34 | """Builds a batch i.e. (x, f(x)) pair.""" 35 | random = torch.randn(batch_size) 36 | x = make_features(random) 37 | y = f(x) 38 | return x, y 39 | 40 | 41 | # Define model 42 | fc = torch.nn.Linear(W_target.size(0), 1) 43 | 44 | for batch_idx in count(1): 45 | # Get data 46 | batch_x, batch_y = get_batch() 47 | 48 | # Reset gradients 49 | fc.zero_grad() 50 | 51 | # Forward pass 52 | output = F.smooth_l1_loss(fc(batch_x), batch_y) 53 | loss = output.item() 54 | 55 | # Backward pass 56 | output.backward() 57 | 58 | # Apply gradients 59 | for param in fc.parameters(): 60 | param.data.add_(-0.1 * param.grad) 61 | 62 | # Stop criterion 63 | if loss < 1e-3: 64 | break 65 | 66 | print('Loss: {:.6f} after {} batches'.format(loss, batch_idx)) 67 | print('==> Learned function:\t' + poly_desc(fc.weight.view(-1), fc.bias)) 68 | print('==> Actual function:\t' + poly_desc(W_target.view(-1), b_target)) 69 | -------------------------------------------------------------------------------- /reinforcement_learning/README.md: -------------------------------------------------------------------------------- 1 | # Reinforcement learning training example 2 | 3 | ```bash 4 | pip install -r requirements.txt 5 | # For REINFORCE: 6 | python reinforce.py 7 | # For actor critic: 8 | python actor_critic.py 9 | ``` 10 | -------------------------------------------------------------------------------- /reinforcement_learning/reinforce.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gym 3 | import numpy as np 4 | from itertools import count 5 | from collections import deque 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | import torch.optim as optim 10 | from torch.distributions import Categorical 11 | 12 | 13 | parser = argparse.ArgumentParser(description='PyTorch REINFORCE example') 14 | parser.add_argument('--gamma', type=float, default=0.99, metavar='G', 15 | help='discount factor (default: 0.99)') 16 | parser.add_argument('--seed', type=int, default=543, metavar='N', 17 | help='random seed (default: 543)') 18 | parser.add_argument('--render', action='store_true', 19 | help='render the environment') 20 | parser.add_argument('--log-interval', type=int, default=10, metavar='N', 21 | help='interval between training status logs (default: 10)') 22 | args = parser.parse_args() 23 | 24 | 25 | env = gym.make('CartPole-v1') 26 | env.reset(seed=args.seed) 27 | torch.manual_seed(args.seed) 28 | 29 | 30 | class Policy(nn.Module): 31 | def __init__(self): 32 | super(Policy, self).__init__() 33 | self.affine1 = nn.Linear(4, 128) 34 | self.dropout = nn.Dropout(p=0.6) 35 | self.affine2 = nn.Linear(128, 2) 36 | 37 | self.saved_log_probs = [] 38 | self.rewards = [] 39 | 40 | def forward(self, x): 41 | x = self.affine1(x) 42 | x = self.dropout(x) 43 | x = F.relu(x) 44 | action_scores = self.affine2(x) 45 | return F.softmax(action_scores, dim=1) 46 | 47 | 48 | policy = Policy() 49 | optimizer = 
optim.Adam(policy.parameters(), lr=1e-2) 50 | eps = np.finfo(np.float32).eps.item() 51 | 52 | 53 | def select_action(state): 54 | state = torch.from_numpy(state).float().unsqueeze(0) 55 | probs = policy(state) 56 | m = Categorical(probs) 57 | action = m.sample() 58 | policy.saved_log_probs.append(m.log_prob(action)) 59 | return action.item() 60 | 61 | 62 | def finish_episode(): 63 | R = 0 64 | policy_loss = [] 65 | returns = deque() 66 | for r in policy.rewards[::-1]: 67 | R = r + args.gamma * R 68 | returns.appendleft(R) 69 | returns = torch.tensor(returns) 70 | returns = (returns - returns.mean()) / (returns.std() + eps) 71 | for log_prob, R in zip(policy.saved_log_probs, returns): 72 | policy_loss.append(-log_prob * R) 73 | optimizer.zero_grad() 74 | policy_loss = torch.cat(policy_loss).sum() 75 | policy_loss.backward() 76 | optimizer.step() 77 | del policy.rewards[:] 78 | del policy.saved_log_probs[:] 79 | 80 | 81 | def main(): 82 | running_reward = 10 83 | for i_episode in count(1): 84 | state, _ = env.reset() 85 | ep_reward = 0 86 | for t in range(1, 10000): # Don't infinite loop while learning 87 | action = select_action(state) 88 | state, reward, done, _, _ = env.step(action) 89 | if args.render: 90 | env.render() 91 | policy.rewards.append(reward) 92 | ep_reward += reward 93 | if done: 94 | break 95 | 96 | running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward 97 | finish_episode() 98 | if i_episode % args.log_interval == 0: 99 | print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.format( 100 | i_episode, ep_reward, running_reward)) 101 | if running_reward > env.spec.reward_threshold: 102 | print("Solved! Running reward is now {} and " 103 | "the last episode runs to {} time steps!".format(running_reward, t)) 104 | break 105 | 106 | 107 | if __name__ == '__main__': 108 | main() 109 | -------------------------------------------------------------------------------- /reinforcement_learning/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | numpy 3 | gym 4 | pygame 5 | -------------------------------------------------------------------------------- /run_distributed_examples.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # This script runs through the code in each of the python examples. 4 | # The purpose is just as an integration test, not to actually train models in any meaningful way. 5 | # For that reason, most of these set epochs = 1 and --dry-run. 6 | # 7 | # Optionally specify a comma separated list of examples to run. 8 | # can be run as: 9 | # ./run_python_examples.sh "install_deps,run_all,clean" 10 | # to pip install dependencies (other than pytorch), run all examples, and remove temporary/changed data files. 11 | # Expects pytorch, torchvision to be installed. 
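# As a concrete (purely illustrative) invocation of this distributed runner:
#   ./run_distributed_examples.sh "distributed,clean"
# where "distributed" and "clean" are functions defined further down in this file.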
12 | 13 | BASE_DIR="$(pwd)/$(dirname $0)" 14 | source $BASE_DIR/utils.sh 15 | 16 | USE_CUDA=$(python -c "import torch; print(torch.cuda.is_available())") 17 | case $USE_CUDA in 18 | "True") 19 | echo "using cuda" 20 | CUDA=1 21 | CUDA_FLAG="--cuda" 22 | ;; 23 | "False") 24 | echo "not using cuda" 25 | CUDA=0 26 | CUDA_FLAG="" 27 | ;; 28 | "") 29 | exit 1; 30 | ;; 31 | esac 32 | 33 | function distributed() { 34 | start 35 | bash tensor_parallelism/run_example.sh tensor_parallelism/tensor_parallel_example.py || error "tensor parallel example failed" 36 | bash tensor_parallelism/run_example.sh tensor_parallelism/sequence_parallel_example.py || error "sequence parallel example failed" 37 | bash tensor_parallelism/run_example.sh tensor_parallelism/fsdp_tp_example.py || error "2D parallel example failed" 38 | python ddp/main.py || error "ddp example failed" 39 | } 40 | 41 | function clean() { 42 | cd $BASE_DIR 43 | echo "running clean to remove cruft" 44 | } 45 | 46 | function run_all() { 47 | distributed 48 | } 49 | 50 | # by default, run all examples 51 | if [ "" == "$EXAMPLES" ]; then 52 | run_all 53 | else 54 | for i in $(echo $EXAMPLES | sed "s/,/ /g") 55 | do 56 | echo "Starting $i" 57 | $i 58 | echo "Finished $i, status $?" 59 | done 60 | fi 61 | 62 | if [ "" == "$ERRORS" ]; then 63 | echo "Completed successfully with status $?" 64 | else 65 | echo "Some distributed examples failed:" 66 | printf "$ERRORS\n" 67 | #Exit with error (0-255) in case of failure in one of the tests. 68 | exit 1 69 | 70 | fi 71 | -------------------------------------------------------------------------------- /runtime.txt: -------------------------------------------------------------------------------- 1 | 3.8 2 | -------------------------------------------------------------------------------- /siamese_network/README.md: -------------------------------------------------------------------------------- 1 | # Siamese Network Example 2 | 3 | ```bash 4 | pip install -r requirements.txt 5 | python main.py 6 | # CUDA_VISIBLE_DEVICES=2 python main.py # to specify GPU id to ex. 2 7 | ``` 8 | -------------------------------------------------------------------------------- /siamese_network/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | torchvision==0.20.0 3 | -------------------------------------------------------------------------------- /super_resolution/README.md: -------------------------------------------------------------------------------- 1 | # Superresolution using an efficient sub-pixel convolutional neural network 2 | 3 | This example illustrates how to use the efficient sub-pixel convolution layer described in ["Real-Time Single Image and Video Super-Resolution Using an Efficient Sub-Pixel Convolutional Neural Network" - Shi et al.](https://arxiv.org/abs/1609.05158) for increasing spatial resolution within your network for tasks such as superresolution. 4 | 5 | ``` 6 | usage: main.py [-h] --upscale_factor UPSCALE_FACTOR [--batchSize BATCHSIZE] 7 | [--testBatchSize TESTBATCHSIZE] [--nEpochs NEPOCHS] [--lr LR] 8 | [--cuda] [--threads THREADS] [--seed SEED] 9 | 10 | PyTorch Super Res Example 11 | 12 | optional arguments: 13 | -h, --help show this help message and exit 14 | --upscale_factor super resolution upscale factor 15 | --batchSize training batch size 16 | --testBatchSize testing batch size 17 | --nEpochs number of epochs to train for 18 | --lr Learning Rate. 
Default=0.01 19 | --cuda use cuda 20 | --mps enable GPU on macOS 21 | --threads number of threads for data loader to use Default=4 22 | --seed random seed to use. Default=123 23 | ``` 24 | 25 | This example trains a super-resolution network on the [BSD300 dataset](https://www2.eecs.berkeley.edu/Research/Projects/CS/vision/bsds/), using crops from the 200 training images, and evaluating on crops of the 100 test images. A snapshot of the model is saved after every epoch with the filename `model_epoch_<epoch>.pth`. 26 | 27 | ## Example Usage: 28 | 29 | ### Train 30 | 31 | `python main.py --upscale_factor 3 --batchSize 4 --testBatchSize 100 --nEpochs 30 --lr 0.001` 32 | 33 | ### Super Resolve 34 | 35 | `python super_resolve.py --input_image dataset/BSDS300/images/test/16077.jpg --model model_epoch_500.pth --output_filename out.png` 36 | -------------------------------------------------------------------------------- /super_resolution/data.py: -------------------------------------------------------------------------------- 1 | from os.path import exists, join, basename 2 | from os import makedirs, remove 3 | from six.moves import urllib 4 | import tarfile 5 | from torchvision.transforms import Compose, CenterCrop, ToTensor, Resize 6 | 7 | from dataset import DatasetFromFolder 8 | 9 | 10 | def download_bsd300(dest="dataset"): 11 | output_image_dir = join(dest, "BSDS300/images") 12 | 13 | if not exists(output_image_dir): 14 | makedirs(dest) 15 | url = "http://www2.eecs.berkeley.edu/Research/Projects/CS/vision/bsds/BSDS300-images.tgz" 16 | print("downloading url ", url) 17 | 18 | data = urllib.request.urlopen(url) 19 | 20 | file_path = join(dest, basename(url)) 21 | with open(file_path, 'wb') as f: 22 | f.write(data.read()) 23 | 24 | print("Extracting data") 25 | with tarfile.open(file_path) as tar: 26 | for item in tar: 27 | tar.extract(item, dest) 28 | 29 | remove(file_path) 30 | 31 | return output_image_dir 32 | 33 | 34 | def calculate_valid_crop_size(crop_size, upscale_factor): 35 | return crop_size - (crop_size % upscale_factor) 36 | 37 | 38 | def input_transform(crop_size, upscale_factor): 39 | return Compose([ 40 | CenterCrop(crop_size), 41 | Resize(crop_size // upscale_factor), 42 | ToTensor(), 43 | ]) 44 | 45 | 46 | def target_transform(crop_size): 47 | return Compose([ 48 | CenterCrop(crop_size), 49 | ToTensor(), 50 | ]) 51 | 52 | 53 | def get_training_set(upscale_factor): 54 | root_dir = download_bsd300() 55 | train_dir = join(root_dir, "train") 56 | crop_size = calculate_valid_crop_size(256, upscale_factor) 57 | 58 | return DatasetFromFolder(train_dir, 59 | input_transform=input_transform(crop_size, upscale_factor), 60 | target_transform=target_transform(crop_size)) 61 | 62 | 63 | def get_test_set(upscale_factor): 64 | root_dir = download_bsd300() 65 | test_dir = join(root_dir, "test") 66 | crop_size = calculate_valid_crop_size(256, upscale_factor) 67 | 68 | return DatasetFromFolder(test_dir, 69 | input_transform=input_transform(crop_size, upscale_factor), 70 | target_transform=target_transform(crop_size)) 71 | -------------------------------------------------------------------------------- /super_resolution/dataset.py: -------------------------------------------------------------------------------- 1 | import torch.utils.data as data 2 | 3 | from os import listdir 4 | from os.path import join 5 | from PIL import Image 6 | 7 | 8 | def is_image_file(filename): 9 | return any(filename.endswith(extension) for extension in [".png", ".jpg", ".jpeg"]) 10 | 11 | 12 | def load_img(filepath): 13 | img =
Image.open(filepath).convert('YCbCr') 14 | y, _, _ = img.split() 15 | return y 16 | 17 | 18 | class DatasetFromFolder(data.Dataset): 19 | def __init__(self, image_dir, input_transform=None, target_transform=None): 20 | super(DatasetFromFolder, self).__init__() 21 | self.image_filenames = [join(image_dir, x) for x in listdir(image_dir) if is_image_file(x)] 22 | 23 | self.input_transform = input_transform 24 | self.target_transform = target_transform 25 | 26 | def __getitem__(self, index): 27 | input = load_img(self.image_filenames[index]) 28 | target = input.copy() 29 | if self.input_transform: 30 | input = self.input_transform(input) 31 | if self.target_transform: 32 | target = self.target_transform(target) 33 | 34 | return input, target 35 | 36 | def __len__(self): 37 | return len(self.image_filenames) 38 | -------------------------------------------------------------------------------- /super_resolution/main.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import argparse 3 | from math import log10 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.optim as optim 8 | from torch.utils.data import DataLoader 9 | from model import Net 10 | from data import get_training_set, get_test_set 11 | 12 | # Training settings 13 | parser = argparse.ArgumentParser(description='PyTorch Super Res Example') 14 | parser.add_argument('--upscale_factor', type=int, required=True, help="super resolution upscale factor") 15 | parser.add_argument('--batchSize', type=int, default=64, help='training batch size') 16 | parser.add_argument('--testBatchSize', type=int, default=10, help='testing batch size') 17 | parser.add_argument('--nEpochs', type=int, default=2, help='number of epochs to train for') 18 | parser.add_argument('--lr', type=float, default=0.01, help='Learning Rate. Default=0.01') 19 | parser.add_argument('--cuda', action='store_true', help='use cuda?') 20 | parser.add_argument('--mps', action='store_true', default=False, help='enables macOS GPU training') 21 | parser.add_argument('--threads', type=int, default=4, help='number of threads for data loader to use') 22 | parser.add_argument('--seed', type=int, default=123, help='random seed to use. 
Default=123') 23 | opt = parser.parse_args() 24 | 25 | print(opt) 26 | 27 | if opt.cuda and not torch.cuda.is_available(): 28 | raise Exception("No GPU found, please run without --cuda") 29 | if not opt.mps and torch.backends.mps.is_available(): 30 | raise Exception("Found mps device, please run with --mps to enable macOS GPU") 31 | 32 | torch.manual_seed(opt.seed) 33 | use_mps = opt.mps and torch.backends.mps.is_available() 34 | 35 | if opt.cuda: 36 | device = torch.device("cuda") 37 | elif use_mps: 38 | device = torch.device("mps") 39 | else: 40 | device = torch.device("cpu") 41 | 42 | print('===> Loading datasets') 43 | train_set = get_training_set(opt.upscale_factor) 44 | test_set = get_test_set(opt.upscale_factor) 45 | training_data_loader = DataLoader(dataset=train_set, num_workers=opt.threads, batch_size=opt.batchSize, shuffle=True) 46 | testing_data_loader = DataLoader(dataset=test_set, num_workers=opt.threads, batch_size=opt.testBatchSize, shuffle=False) 47 | 48 | print('===> Building model') 49 | model = Net(upscale_factor=opt.upscale_factor).to(device) 50 | criterion = nn.MSELoss() 51 | 52 | optimizer = optim.Adam(model.parameters(), lr=opt.lr) 53 | 54 | 55 | def train(epoch): 56 | epoch_loss = 0 57 | for iteration, batch in enumerate(training_data_loader, 1): 58 | input, target = batch[0].to(device), batch[1].to(device) 59 | 60 | optimizer.zero_grad() 61 | loss = criterion(model(input), target) 62 | epoch_loss += loss.item() 63 | loss.backward() 64 | optimizer.step() 65 | 66 | print("===> Epoch[{}]({}/{}): Loss: {:.4f}".format(epoch, iteration, len(training_data_loader), loss.item())) 67 | 68 | print("===> Epoch {} Complete: Avg. Loss: {:.4f}".format(epoch, epoch_loss / len(training_data_loader))) 69 | 70 | 71 | def test(): 72 | avg_psnr = 0 73 | with torch.no_grad(): 74 | for batch in testing_data_loader: 75 | input, target = batch[0].to(device), batch[1].to(device) 76 | 77 | prediction = model(input) 78 | mse = criterion(prediction, target) 79 | psnr = 10 * log10(1 / mse.item()) 80 | avg_psnr += psnr 81 | print("===> Avg. 
PSNR: {:.4f} dB".format(avg_psnr / len(testing_data_loader))) 82 | 83 | 84 | def checkpoint(epoch): 85 | model_out_path = "model_epoch_{}.pth".format(epoch) 86 | torch.save(model, model_out_path) 87 | print("Checkpoint saved to {}".format(model_out_path)) 88 | 89 | for epoch in range(1, opt.nEpochs + 1): 90 | train(epoch) 91 | test() 92 | checkpoint(epoch) 93 | -------------------------------------------------------------------------------- /super_resolution/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.init as init 4 | 5 | 6 | class Net(nn.Module): 7 | def __init__(self, upscale_factor): 8 | super(Net, self).__init__() 9 | 10 | self.relu = nn.ReLU() 11 | self.conv1 = nn.Conv2d(1, 64, (5, 5), (1, 1), (2, 2)) 12 | self.conv2 = nn.Conv2d(64, 64, (3, 3), (1, 1), (1, 1)) 13 | self.conv3 = nn.Conv2d(64, 32, (3, 3), (1, 1), (1, 1)) 14 | self.conv4 = nn.Conv2d(32, upscale_factor ** 2, (3, 3), (1, 1), (1, 1)) 15 | self.pixel_shuffle = nn.PixelShuffle(upscale_factor) 16 | 17 | self._initialize_weights() 18 | 19 | def forward(self, x): 20 | x = self.relu(self.conv1(x)) 21 | x = self.relu(self.conv2(x)) 22 | x = self.relu(self.conv3(x)) 23 | x = self.pixel_shuffle(self.conv4(x)) 24 | return x 25 | 26 | def _initialize_weights(self): 27 | init.orthogonal_(self.conv1.weight, init.calculate_gain('relu')) 28 | init.orthogonal_(self.conv2.weight, init.calculate_gain('relu')) 29 | init.orthogonal_(self.conv3.weight, init.calculate_gain('relu')) 30 | init.orthogonal_(self.conv4.weight) 31 | -------------------------------------------------------------------------------- /super_resolution/super_resolve.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import argparse 3 | import torch 4 | from PIL import Image 5 | from torchvision.transforms import ToTensor 6 | 7 | import numpy as np 8 | 9 | # Training settings 10 | parser = argparse.ArgumentParser(description='PyTorch Super Res Example') 11 | parser.add_argument('--input_image', type=str, required=True, help='input image to use') 12 | parser.add_argument('--model', type=str, required=True, help='model file to use') 13 | parser.add_argument('--output_filename', type=str, help='where to save the output image') 14 | parser.add_argument('--cuda', action='store_true', help='use cuda') 15 | opt = parser.parse_args() 16 | 17 | print(opt) 18 | img = Image.open(opt.input_image).convert('YCbCr') 19 | y, cb, cr = img.split() 20 | 21 | model = torch.load(opt.model) 22 | img_to_tensor = ToTensor() 23 | input = img_to_tensor(y).view(1, -1, y.size[1], y.size[0]) 24 | 25 | if opt.cuda: 26 | model = model.cuda() 27 | input = input.cuda() 28 | 29 | out = model(input) 30 | out = out.cpu() 31 | out_img_y = out[0].detach().numpy() 32 | out_img_y *= 255.0 33 | out_img_y = out_img_y.clip(0, 255) 34 | out_img_y = Image.fromarray(np.uint8(out_img_y[0]), mode='L') 35 | 36 | out_img_cb = cb.resize(out_img_y.size, Image.BICUBIC) 37 | out_img_cr = cr.resize(out_img_y.size, Image.BICUBIC) 38 | out_img = Image.merge('YCbCr', [out_img_y, out_img_cb, out_img_cr]).convert('RGB') 39 | 40 | out_img.save(opt.output_filename) 41 | print('output image saved to ', opt.output_filename) 42 | -------------------------------------------------------------------------------- /time_sequence_prediction/README.md: -------------------------------------------------------------------------------- 1 | # Time Sequence 
Prediction 2 | 3 | This is a toy example for beginners to start with. It helps learn both PyTorch and time sequence prediction. Two LSTMCell units are used in this example to learn some sine wave signals starting at different phases. After learning the sine waves, the network tries to predict the signal values in the future. The results are shown in the picture below. 4 | 5 | ## Usage 6 | 7 | ``` 8 | python generate_sine_wave.py 9 | python train.py 10 | ``` 11 | 12 | ## Result 13 | 14 | The initial signal and the predicted results are shown in the image. We first give some initial signals (full line). The network will subsequently give some predicted results (dash line). It can be concluded that the network can generate new sine waves. 15 | ![image](https://cloud.githubusercontent.com/assets/1419566/24184438/e24f5280-0f08-11e7-8f8b-4d972b527a81.png) 16 | -------------------------------------------------------------------------------- /time_sequence_prediction/generate_sine_wave.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | np.random.seed(2) 5 | 6 | T = 20 7 | L = 1000 8 | N = 100 9 | 10 | x = np.empty((N, L), 'int64') 11 | x[:] = np.array(range(L)) + np.random.randint(-4 * T, 4 * T, N).reshape(N, 1) 12 | data = np.sin(x / 1.0 / T).astype('float64') 13 | torch.save(data, open('traindata.pt', 'wb')) 14 | -------------------------------------------------------------------------------- /time_sequence_prediction/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | matplotlib 3 | -------------------------------------------------------------------------------- /time_sequence_prediction/train.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import argparse 3 | import torch 4 | import torch.nn as nn 5 | import torch.optim as optim 6 | import numpy as np 7 | import matplotlib 8 | matplotlib.use('Agg') 9 | import matplotlib.pyplot as plt 10 | 11 | class Sequence(nn.Module): 12 | def __init__(self): 13 | super(Sequence, self).__init__() 14 | self.lstm1 = nn.LSTMCell(1, 51) 15 | self.lstm2 = nn.LSTMCell(51, 51) 16 | self.linear = nn.Linear(51, 1) 17 | 18 | def forward(self, input, future = 0): 19 | outputs = [] 20 | h_t = torch.zeros(input.size(0), 51, dtype=torch.double) 21 | c_t = torch.zeros(input.size(0), 51, dtype=torch.double) 22 | h_t2 = torch.zeros(input.size(0), 51, dtype=torch.double) 23 | c_t2 = torch.zeros(input.size(0), 51, dtype=torch.double) 24 | 25 | for input_t in input.split(1, dim=1): 26 | h_t, c_t = self.lstm1(input_t, (h_t, c_t)) 27 | h_t2, c_t2 = self.lstm2(h_t, (h_t2, c_t2)) 28 | output = self.linear(h_t2) 29 | outputs += [output] 30 | for i in range(future):# if we should predict the future 31 | h_t, c_t = self.lstm1(output, (h_t, c_t)) 32 | h_t2, c_t2 = self.lstm2(h_t, (h_t2, c_t2)) 33 | output = self.linear(h_t2) 34 | outputs += [output] 35 | outputs = torch.cat(outputs, dim=1) 36 | return outputs 37 | 38 | 39 | if __name__ == '__main__': 40 | parser = argparse.ArgumentParser() 41 | parser.add_argument('--steps', type=int, default=15, help='steps to run') 42 | opt = parser.parse_args() 43 | # set random seed to 0 44 | np.random.seed(0) 45 | torch.manual_seed(0) 46 | # load data and make training set 47 | data = torch.load('traindata.pt') 48 | input = torch.from_numpy(data[3:, :-1]) 49 | target = torch.from_numpy(data[3:, 1:]) 50 | test_input = 
torch.from_numpy(data[:3, :-1]) 51 | test_target = torch.from_numpy(data[:3, 1:]) 52 | # build the model 53 | seq = Sequence() 54 | seq.double() 55 | criterion = nn.MSELoss() 56 | # use LBFGS as optimizer since we can load the whole data to train 57 | optimizer = optim.LBFGS(seq.parameters(), lr=0.8) 58 | # begin to train 59 | for i in range(opt.steps): 60 | print('STEP: ', i) 61 | def closure(): 62 | optimizer.zero_grad() 63 | out = seq(input) 64 | loss = criterion(out, target) 65 | print('loss:', loss.item()) 66 | loss.backward() 67 | return loss 68 | optimizer.step(closure) 69 | # begin to predict, no need to track gradient here 70 | with torch.no_grad(): 71 | future = 1000 72 | pred = seq(test_input, future=future) 73 | loss = criterion(pred[:, :-future], test_target) 74 | print('test loss:', loss.item()) 75 | y = pred.detach().numpy() 76 | # draw the result 77 | plt.figure(figsize=(30,10)) 78 | plt.title('Predict future values for time sequences\n(Dashlines are predicted values)', fontsize=30) 79 | plt.xlabel('x', fontsize=20) 80 | plt.ylabel('y', fontsize=20) 81 | plt.xticks(fontsize=20) 82 | plt.yticks(fontsize=20) 83 | def draw(yi, color): 84 | plt.plot(np.arange(input.size(1)), yi[:input.size(1)], color, linewidth = 2.0) 85 | plt.plot(np.arange(input.size(1), input.size(1) + future), yi[input.size(1):], color + ':', linewidth = 2.0) 86 | draw(y[0], 'r') 87 | draw(y[1], 'g') 88 | draw(y[2], 'b') 89 | plt.savefig('predict%d.pdf'%i) 90 | plt.close() 91 | -------------------------------------------------------------------------------- /utils.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # This script contains utility functions and initialization shared by the example runner scripts, 3 | # e.g. run_python_examples.sh and run_distributed_examples.sh 4 | 5 | BASE_DIR="$(pwd)/$(dirname $0)" 6 | EXAMPLES=$(echo $1 | sed -e 's/ //g') 7 | 8 | # Redirect 'python' calls to 'python3' 9 | python() { 10 | command python3 "$@" 11 | } 12 | 13 | ERRORS=${ERRORS-""} 14 | 15 | function error() { 16 | ERR=$1 17 | if [ "" == "$ERRORS" ]; then 18 | ERRORS="$ERR" 19 | else 20 | ERRORS="$ERRORS\n$ERR" 21 | fi 22 | } 23 | 24 | function install_deps() { 25 | echo "installing requirements" 26 | cat $BASE_DIR/*/requirements.txt | \ 27 | sort -u | \ 28 | # testing the installed version of torch, so don't pip install it. 29 | grep -vE '^torch$' | \ 30 | pip install -r /dev/stdin || \ 31 | { error "failed to install dependencies"; exit 1; } 32 | } 33 | 34 | function start() { 35 | EXAMPLE=${FUNCNAME[1]} 36 | cd $BASE_DIR/$EXAMPLE 37 | echo "Running example: $EXAMPLE" 38 | } 39 | -------------------------------------------------------------------------------- /vae/README.md: -------------------------------------------------------------------------------- 1 | # Basic VAE Example 2 | 3 | This is an improved implementation of the paper [Auto-Encoding Variational Bayes](http://arxiv.org/abs/1312.6114) by Kingma and Welling. 4 | It uses ReLUs and the Adam optimizer, instead of sigmoids and Adagrad. These changes make the network converge much faster.
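For readers who want the training objective spelled out, the loss optimized for a VAE combines a reconstruction term with a KL-divergence regularizer. The sketch below states the standard formulation for MNIST-sized inputs; it is an illustration and is not copied from this example's `main.py` (not reproduced here).

```python
import torch
import torch.nn.functional as F

def vae_loss(recon_x, x, mu, logvar):
    # Reconstruction term: how well the decoder reproduces the input pixels
    # (784 = 28 * 28 flattened MNIST pixels).
    bce = F.binary_cross_entropy(recon_x, x.view(-1, 784), reduction='sum')
    # KL divergence between the approximate posterior N(mu, sigma^2) and N(0, I):
    # -0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2)
    kld = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
    return bce + kld
```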
5 | 6 | ```bash 7 | pip install -r requirements.txt 8 | python main.py 9 | ``` 10 | 11 | The main.py script accepts the following arguments: 12 | 13 | ```bash 14 | optional arguments: 15 | --batch-size input batch size for training (default: 128) 16 | --epochs number of epochs to train (default: 10) 17 | --no-cuda disables CUDA training 18 | --mps enables GPU on macOS 19 | --seed random seed (default: 1) 20 | --log-interval how many batches to wait before logging training status 21 | ``` -------------------------------------------------------------------------------- /vae/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | torchvision==0.20.0 3 | tqdm 4 | six 5 | -------------------------------------------------------------------------------- /vae/results/.gitignore: -------------------------------------------------------------------------------- 1 | *.png 2 | -------------------------------------------------------------------------------- /word_language_model/README.md: -------------------------------------------------------------------------------- 1 | # Word-level Language Modeling using RNN and Transformer 2 | 3 | This example trains a multi-layer RNN (Elman, GRU, or LSTM) or Transformer on a language modeling task. By default, the training script uses the Wikitext-2 dataset, which is provided with the example. 4 | The trained model can then be used by the generate script to generate new text. 5 | 6 | ```bash 7 | python main.py --cuda --epochs 6 # Train an LSTM on Wikitext-2 with CUDA. 8 | python main.py --cuda --epochs 6 --tied # Train a tied LSTM on Wikitext-2 with CUDA. 9 | python main.py --cuda --tied # Train a tied LSTM on Wikitext-2 with CUDA for 40 epochs. 10 | python main.py --cuda --epochs 6 --model Transformer --lr 5 11 | # Train a Transformer model on Wikitext-2 with CUDA. 12 | 13 | python generate.py # Generate samples from the default model checkpoint. 14 | ``` 15 | 16 | The model uses the `nn.RNN` module (and its sister modules `nn.GRU` and `nn.LSTM`) or Transformer module (`nn.TransformerEncoder` and `nn.TransformerEncoderLayer`) which will automatically use the cuDNN backend if run on CUDA with cuDNN installed. 17 | 18 | During training, if a keyboard interrupt (Ctrl-C) is received, training is stopped and the current model is evaluated against the test dataset. 19 | 20 | The `main.py` script accepts the following arguments: 21 | 22 | ```bash 23 | optional arguments: 24 | -h, --help show this help message and exit 25 | --data DATA location of the data corpus 26 | --model MODEL type of network (RNN_TANH, RNN_RELU, LSTM, GRU, Transformer) 27 | --emsize EMSIZE size of word embeddings 28 | --nhid NHID number of hidden units per layer 29 | --nlayers NLAYERS number of layers 30 | --lr LR initial learning rate 31 | --clip CLIP gradient clipping 32 | --epochs EPOCHS upper epoch limit 33 | --batch_size N batch size 34 | --bptt BPTT sequence length 35 | --dropout DROPOUT dropout applied to layers (0 = no dropout) 36 | --tied tie the word embedding and softmax weights 37 | --seed SEED random seed 38 | --cuda use CUDA 39 | --mps enable GPU on macOS 40 | --log-interval N report interval 41 | --save SAVE path to save the final model 42 | --onnx-export ONNX_EXPORT 43 | path to export the final model in onnx format 44 | --nhead NHEAD the number of heads in the encoder/decoder of the transformer model 45 | --dry-run verify the code and the model 46 | ``` 47 | 48 | With these arguments, a variety of models can be tested.
49 | As an example, the following arguments produce slower but better models: 50 | 51 | ```bash 52 | python main.py --cuda --emsize 650 --nhid 650 --dropout 0.5 --epochs 40 53 | python main.py --cuda --emsize 650 --nhid 650 --dropout 0.5 --epochs 40 --tied 54 | python main.py --cuda --emsize 1500 --nhid 1500 --dropout 0.65 --epochs 40 55 | python main.py --cuda --emsize 1500 --nhid 1500 --dropout 0.65 --epochs 40 --tied 56 | ``` 57 | -------------------------------------------------------------------------------- /word_language_model/data.py: -------------------------------------------------------------------------------- 1 | import os 2 | from io import open 3 | import torch 4 | 5 | class Dictionary(object): 6 | def __init__(self): 7 | self.word2idx = {} 8 | self.idx2word = [] 9 | 10 | def add_word(self, word): 11 | if word not in self.word2idx: 12 | self.idx2word.append(word) 13 | self.word2idx[word] = len(self.idx2word) - 1 14 | return self.word2idx[word] 15 | 16 | def __len__(self): 17 | return len(self.idx2word) 18 | 19 | 20 | class Corpus(object): 21 | def __init__(self, path): 22 | self.dictionary = Dictionary() 23 | self.train = self.tokenize(os.path.join(path, 'train.txt')) 24 | self.valid = self.tokenize(os.path.join(path, 'valid.txt')) 25 | self.test = self.tokenize(os.path.join(path, 'test.txt')) 26 | 27 | def tokenize(self, path): 28 | """Tokenizes a text file.""" 29 | assert os.path.exists(path) 30 | # Add words to the dictionary 31 | with open(path, 'r', encoding="utf8") as f: 32 | for line in f: 33 | words = line.split() + ['<eos>'] 34 | for word in words: 35 | self.dictionary.add_word(word) 36 | 37 | # Tokenize file content 38 | with open(path, 'r', encoding="utf8") as f: 39 | idss = [] 40 | for line in f: 41 | words = line.split() + ['<eos>'] 42 | ids = [] 43 | for word in words: 44 | ids.append(self.dictionary.word2idx[word]) 45 | idss.append(torch.tensor(ids).type(torch.int64)) 46 | ids = torch.cat(idss) 47 | 48 | return ids 49 | -------------------------------------------------------------------------------- /word_language_model/data/wikitext-2/README: -------------------------------------------------------------------------------- 1 | This is raw data from the wikitext-2 dataset. 2 | 3 | See https://www.salesforce.com/products/einstein/ai-research/the-wikitext-dependency-language-modeling-dataset/ 4 | -------------------------------------------------------------------------------- /word_language_model/generate.py: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # Language Modeling on Wikitext-2 3 | # 4 | # This file generates new sentences sampled from the language model. 5 | # 6 | ############################################################################### 7 | import argparse 8 | import torch 9 | 10 | import data 11 | 12 | parser = argparse.ArgumentParser(description='PyTorch Wikitext-2 Language Model') 13 | # Model parameters.
14 | parser.add_argument('--data', type=str, default='./data/wikitext-2', 15 | help='location of the data corpus') 16 | parser.add_argument('--checkpoint', type=str, default='./model.pt', 17 | help='model checkpoint to use') 18 | parser.add_argument('--outf', type=str, default='generated.txt', 19 | help='output file for generated text') 20 | parser.add_argument('--words', type=int, default='1000', 21 | help='number of words to generate') 22 | parser.add_argument('--seed', type=int, default=1111, 23 | help='random seed') 24 | parser.add_argument('--cuda', action='store_true', 25 | help='use CUDA') 26 | parser.add_argument('--mps', action='store_true', default=False, 27 | help='enables macOS GPU training') 28 | parser.add_argument('--temperature', type=float, default=1.0, 29 | help='temperature - higher will increase diversity') 30 | parser.add_argument('--log-interval', type=int, default=100, 31 | help='reporting interval') 32 | args = parser.parse_args() 33 | 34 | # Set the random seed manually for reproducibility. 35 | torch.manual_seed(args.seed) 36 | if torch.cuda.is_available(): 37 | if not args.cuda: 38 | print("WARNING: You have a CUDA device, so you should probably run with --cuda.") 39 | if torch.backends.mps.is_available(): 40 | if not args.mps: 41 | print("WARNING: You have mps device, to enable macOS GPU run with --mps.") 42 | 43 | use_mps = args.mps and torch.backends.mps.is_available() 44 | if args.cuda: 45 | device = torch.device("cuda") 46 | elif use_mps: 47 | device = torch.device("mps") 48 | else: 49 | device = torch.device("cpu") 50 | 51 | if args.temperature < 1e-3: 52 | parser.error("--temperature has to be greater or equal 1e-3.") 53 | 54 | with open(args.checkpoint, 'rb') as f: 55 | model = torch.load(f, map_location=device) 56 | model.eval() 57 | 58 | corpus = data.Corpus(args.data) 59 | ntokens = len(corpus.dictionary) 60 | 61 | is_transformer_model = hasattr(model, 'model_type') and model.model_type == 'Transformer' 62 | if not is_transformer_model: 63 | hidden = model.init_hidden(1) 64 | input = torch.randint(ntokens, (1, 1), dtype=torch.long).to(device) 65 | 66 | with open(args.outf, 'w') as outf: 67 | with torch.no_grad(): # no tracking history 68 | for i in range(args.words): 69 | if is_transformer_model: 70 | output = model(input, False) 71 | word_weights = output[-1].squeeze().div(args.temperature).exp().cpu() 72 | word_idx = torch.multinomial(word_weights, 1)[0] 73 | word_tensor = torch.Tensor([[word_idx]]).long().to(device) 74 | input = torch.cat([input, word_tensor], 0) 75 | else: 76 | output, hidden = model(input, hidden) 77 | word_weights = output.squeeze().div(args.temperature).exp().cpu() 78 | word_idx = torch.multinomial(word_weights, 1)[0] 79 | input.fill_(word_idx) 80 | 81 | word = corpus.dictionary.idx2word[word_idx] 82 | 83 | outf.write(word + ('\n' if i % 20 == 19 else ' ')) 84 | 85 | if i % args.log_interval == 0: 86 | print('| Generated {}/{} words'.format(i, args.words)) 87 | -------------------------------------------------------------------------------- /word_language_model/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | --------------------------------------------------------------------------------
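As a closing note on `generate.py` above: the per-step sampling it performs reduces to dividing the model's output scores by a temperature, exponentiating, and drawing from the resulting weights. A standalone sketch (illustrative only, not a file from this repository):

```python
import torch

def sample_next_word(scores: torch.Tensor, temperature: float = 1.0) -> int:
    # Higher temperatures flatten the distribution and increase diversity;
    # torch.multinomial accepts unnormalized non-negative weights.
    weights = scores.squeeze().div(temperature).exp()
    return torch.multinomial(weights, num_samples=1).item()

scores = torch.log_softmax(torch.tensor([2.0, 1.0, 0.1]), dim=0)
print(sample_next_word(scores, temperature=0.7))
print(sample_next_word(scores, temperature=2.0))
```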