├── .github └── workflows │ ├── markdown-ci.yml │ ├── pr.yml │ └── pub.yml ├── .gitignore ├── CNAME ├── README.md ├── cn ├── docs │ ├── assets │ │ ├── index-mdx.css │ │ ├── install-selector.css │ │ ├── install-selector.js │ │ └── product-layer.png │ ├── basics │ │ ├── 01_quickstart.md │ │ ├── 02_tensor.md │ │ ├── 03_dataset_dataloader.md │ │ ├── 04_build_network.md │ │ ├── 05_autograd.md │ │ ├── 06_optimization.md │ │ ├── 07_model_load_save.md │ │ ├── 08_nn_graph.md │ │ └── imgs │ │ │ ├── compute_graph.png │ │ │ ├── dataloader_item.png │ │ │ ├── dynamic_graph.gif │ │ │ ├── fashionMNIST.png │ │ │ ├── neural-network-layers.png │ │ │ ├── qq_group.png │ │ │ └── static_graph.png │ ├── code │ │ └── parallelism │ │ │ └── ddp_train.py │ ├── cookies │ │ ├── activation_checkpointing.md │ │ ├── amp.md │ │ ├── global_tensor.md │ │ ├── global_tensor_distributed.md │ │ ├── imgs │ │ │ ├── Three_Stages_of_ZeRO-DP_Optimizations.jpg │ │ │ ├── cat.jpg │ │ │ ├── hybrid-parallel.png │ │ │ ├── oneflow-serving-demo.png │ │ │ └── triton-oneflow-backend.png │ │ ├── lr_scale.md │ │ ├── one_embedding.md │ │ ├── oneflow2onnnx.md │ │ ├── oneflow_torch.md │ │ ├── save_load.md │ │ ├── serving.md │ │ ├── torch2flow.md │ │ ├── transfer_learning.md │ │ └── zero.md │ ├── index.md │ ├── javascripts │ │ └── config.js │ └── parallelism │ │ ├── 01_introduction.md │ │ ├── 02_sbp.md │ │ ├── 03_consistent_tensor.md │ │ ├── 04_2d-sbp.md │ │ ├── 04_launch.md │ │ ├── 05_ddp.md │ │ ├── 06_pipeline.md │ │ └── imgs │ │ ├── 2d-sbp.png │ │ ├── boxing_s2b.png │ │ ├── consistent-view.png │ │ ├── gpt3-overview.png │ │ ├── matmul_data_paralelism.png │ │ ├── matmul_logical.png │ │ ├── matmul_model_paralelism.png │ │ ├── multi-matmul.png │ │ ├── realy.png │ │ ├── sbp-example.png │ │ └── sbp_translation.png ├── mkdocs.yml └── overrides │ ├── partials │ ├── integrations │ │ └── disqus.html │ └── source.html │ └── templates │ └── home.html ├── en ├── docs │ ├── assets │ │ ├── index-mdx.css │ │ ├── install-selector.css │ │ 
├── install-selector.js │ │ └── product-layer.png │ ├── basics │ │ ├── 01_quickstart.md │ │ ├── 02_tensor.md │ │ ├── 03_dataset_dataloader.md │ │ ├── 04_build_network.md │ │ ├── 05_autograd.md │ │ ├── 06_optimization.md │ │ ├── 07_model_load_save.md │ │ ├── 08_nn_graph.md │ │ └── imgs │ │ │ ├── compute_graph.png │ │ │ ├── dataloader_item.png │ │ │ ├── dynamic_graph.gif │ │ │ ├── fashionMNIST.png │ │ │ ├── neural-network-layers.png │ │ │ ├── poly_fit.png │ │ │ ├── qq_group.png │ │ │ └── static_graph.png │ ├── cookies │ │ ├── activation_checkpointing.md │ │ ├── amp.md │ │ ├── global_tensor.md │ │ ├── global_tensor_distributed.md │ │ ├── imgs │ │ │ ├── Three_Stages_of_ZeRO-DP_Optimizations.jpg │ │ │ ├── cat.jpg │ │ │ ├── hybrid-parallel.png │ │ │ ├── oneflow-serving-demo.png │ │ │ └── triton-oneflow-backend.png │ │ ├── one_embedding.md │ │ ├── oneflow2onnnx.md │ │ ├── oneflow_torch.md │ │ ├── save_load.md │ │ ├── serving.md │ │ ├── torch2flow.md │ │ ├── transfer_learning.md │ │ └── zero.md │ ├── index.md │ ├── javascripts │ │ └── config.js │ └── parallelism │ │ ├── 01_introduction.md │ │ ├── 02_sbp.md │ │ ├── 03_consistent_tensor.md │ │ ├── 04_2d-sbp.md │ │ ├── 04_launch.md │ │ ├── 05_ddp.md │ │ ├── 06_pipeline.md │ │ └── imgs │ │ ├── 2d-sbp.png │ │ ├── boxing_s2b.png │ │ ├── consistent-view.png │ │ ├── gpt3-overview.png │ │ ├── matmul_data_paralelism.png │ │ ├── matmul_logical.png │ │ ├── matmul_model_paralelism.png │ │ ├── multi-matmul.png │ │ ├── realy.png │ │ ├── sbp-example.png │ │ └── sbp_translation.png ├── mkdocs.yml └── overrides │ ├── partials │ ├── integrations │ │ └── disqus.html │ └── source.html │ └── templates │ └── home.html ├── requirements.txt └── scripts ├── ci-requirements.txt ├── markdown_ci ├── README.md ├── __init__.py ├── configs │ ├── basics_01_quick_start.yml │ ├── basics_02_tensor.yml │ ├── basics_03_dataset.yml │ ├── basics_04_build_network.yml │ ├── basics_05_autograd.yml │ ├── basics_06_optimization.yml │ ├── basics_07_save_load.yml │ ├── 
basics_08_nn_graph.yml │ ├── guide_activation_checkpointing.yml │ ├── guide_amp.yml │ ├── guide_lr_scale.yml │ ├── guide_one_embedding.yml │ ├── guide_oneflow2onnx.yml │ ├── guide_torch2flow.yml │ ├── guide_transfer_learning.yml │ └── guide_zero.yml ├── extract_code_block.py ├── run_by_yamls.py └── run_markdown_codes.py ├── run-markdown-ci.sh └── run-mike.sh /.github/workflows/markdown-ci.yml: -------------------------------------------------------------------------------- 1 | # This is a basic workflow to help you get started with Actions 2 | 3 | name: RunMarkdownFile 4 | 5 | # Controls when the workflow will run 6 | on: 7 | # Triggers the workflow on push or pull request events but only for the master branch 8 | pull_request: 9 | branches: [master] 10 | types: [opened, labeled, unlabeled, synchronize] 11 | 12 | # Allows you to run this workflow manually from the Actions tab 13 | workflow_dispatch: 14 | 15 | # A workflow run is made up of one or more jobs that can run sequentially or in parallel 16 | jobs: 17 | # This workflow contains a single job called "run_markdown_file" 18 | run_markdown_file: 19 | if: (contains(github.event.pull_request.labels.*.name, 'ci')) 20 | # The type of runner that the job will run on 21 | runs-on: [self-hosted, linux, provision] 22 | env: 23 | TEST_CONTAINER_NAME: "pr-${{ github.event.pull_request.number }}-run-id-${{ github.run_id }}-${{ matrix.entry }}-test" 24 | TEST_IMG_TAG: registry.cn-beijing.aliyuncs.com/oneflow/test-with-pytorch-1.9.0:e7a497b41d8b7f1bce055b1f23d027f93b1557ae 25 | # Steps represent a sequence of tasks that will be executed as part of the job 26 | steps: 27 | - name: Fix permissions 28 | run: | 29 | set -x 30 | docker run --rm -v $PWD:/p -w /p busybox rm -rf * 31 | docker run --rm -v $PWD:/p -w /p busybox rm -rf .git 32 | - name: Remove container 33 | timeout-minutes: 45 34 | run: | 35 | docker rm -f ${{ env.TEST_CONTAINER_NAME }} || true 36 | - name: Cancel Previous Runs 37 | continue-on-error: true 38 | uses: 
styfle/cancel-workflow-action@0.9.1 39 | with: 40 | access_token: ${{ github.token }} 41 | all_but_latest: true 42 | # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it 43 | - uses: actions/checkout@v2 44 | with: 45 | submodules: false 46 | # Runs a set of commands using the runners shell 47 | - name: Start container 48 | timeout-minutes: 45 49 | run: | 50 | docker pull ${{ env.TEST_IMG_TAG }} 51 | docker run -d --rm --privileged --network host --shm-size=8g \ 52 | --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \ 53 | --runtime=nvidia \ 54 | -v $HOME/test-container-cache/dot-local:/root/.local \ 55 | -v $HOME/test-container-cache/dot-cache:/root/.cache \ 56 | -v $HOME/test-container-cache/dot-oneflow:/root/.oneflow \ 57 | -v $HOME/test-container-cache/dot-cache/data:$PWD/scripts/markdown_ci/data \ 58 | -v $PWD:$PWD \ 59 | -w $PWD \ 60 | --name ${TEST_CONTAINER_NAME} \ 61 | ${{ env.TEST_IMG_TAG }} \ 62 | sleep 3600 63 | - name: Install run markdown files code requirements 64 | timeout-minutes: 45 65 | run: | 66 | docker exec ${TEST_CONTAINER_NAME} python3 -m pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple 67 | docker exec ${{ env.TEST_CONTAINER_NAME }} python3 -m pip install -f https://staging.oneflow.info/branch/master/cu117 --pre oneflow 68 | docker exec ${{ env.TEST_CONTAINER_NAME }} python3 -m pip install -r requirements.txt --user 69 | docker exec ${{ env.TEST_CONTAINER_NAME }} python3 -m pip install -r scripts/ci-requirements.txt --user 70 | - name: MarkdownTest 71 | timeout-minutes: 45 72 | run: | 73 | docker exec -w $PWD ${{ env.TEST_CONTAINER_NAME }} sh scripts/run-markdown-ci.sh 74 | - name: Remove container 75 | timeout-minutes: 45 76 | if: ${{ always() }} 77 | run: | 78 | docker rm -f ${TEST_CONTAINER_NAME} || true 79 | docker run --rm -v $PWD:/p -w /p busybox chown -R $(id -u):$(id -g) . 
|| true 80 | -------------------------------------------------------------------------------- /.github/workflows/pr.yml: -------------------------------------------------------------------------------- 1 | name: Check PR 2 | 3 | on: 4 | pull_request: 5 | types: [opened, labeled, unlabeled, synchronize] 6 | 7 | jobs: 8 | check_labels: 9 | runs-on: ubuntu-latest 10 | name: Labels 11 | steps: 12 | - name: Check type labels 'cn, en, fix, enhancement' 13 | if: (contains(github.event.pull_request.labels.*.name, 'cn') || contains(github.event.pull_request.labels.*.name, 'enhancement') || contains(github.event.pull_request.labels.*.name, 'en') || contains(github.event.pull_request.labels.*.name, 'fix')) == false 14 | run: | 15 | exit 1 16 | -------------------------------------------------------------------------------- /.github/workflows/pub.yml: -------------------------------------------------------------------------------- 1 | name: Doc Build 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | 8 | jobs: 9 | deploy: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v2 13 | with: 14 | fetch-depth: 0 15 | - run: pip3 install --user -r requirements.txt 16 | - run: git config --global user.email "ci-bot@oneflow.org" 17 | - run: git config --global user.name "oneflow-ci-bot" 18 | - run: sh ./scripts/run-mike.sh 19 | - run: git checkout docs_output 20 | - run: git checkout master -- CNAME 21 | - run: mv CNAME ./_site/ 22 | 23 | - name: Deploy 24 | uses: peaceiris/actions-gh-pages@v3 25 | with: 26 | github_token: ${{ secrets.GITHUB_TOKEN }} 27 | publish_dir: ./_site 28 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | cn/site/ 2 | en/site/ 3 | site/ 4 | 5 | .idea 6 | .DS_Store 7 | 8 | log/ 9 | core.* 10 | *.pyc 11 | *.ipynb 12 | /.vscode 13 | /.idea 14 | /manylinux* 15 | wheelhouse/ 16 | wheelhouse* 17 | .clangd 18 | .cache 19 
| /tmp 20 | 21 | meta 22 | out 23 | pickled_data 24 | *model/ 25 | *.onnx 26 | *.pth 27 | data/ 28 | -------------------------------------------------------------------------------- /CNAME: -------------------------------------------------------------------------------- 1 | docs.oneflow.org -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Oneflow-Documentation 2 | 3 | The source code of website https://docs.oneflow.org 4 | 5 | Build the documentation locally: 6 | 7 | ```shell 8 | python3 -m pip install -r requirements.txt 9 | ``` 10 | 11 | And then, change the directory to `cn/` or `en/`, run commands: 12 | 13 | ```shell 14 | mkdocs build 15 | ``` 16 | 17 | The output HTML files will be generated at `site/` directory. 18 | 19 | ## Deployment 20 | 21 | Run commands: 22 | 23 | ```shell 24 | sh ./scripts/run-mike.sh 25 | ``` 26 | 27 | The multi-version website will be built and deployed at branch `docs_output`. 
28 | -------------------------------------------------------------------------------- /cn/docs/assets/index-mdx.css: -------------------------------------------------------------------------------- 1 | @-webkit-keyframes mdx-heart { 2 | 0%,40%,80%,to { 3 | transform: scale(1) 4 | } 5 | 6 | 20%,60% { 7 | transform: scale(1.15) 8 | } 9 | } 10 | 11 | @keyframes mdx-heart { 12 | 0%,40%,80%,to { 13 | transform: scale(1) 14 | } 15 | 16 | 20%,60% { 17 | transform: scale(1.15) 18 | } 19 | } 20 | 21 | .mdx-hero { 22 | color: var(--md-primary-bg-color); 23 | margin: 0 .8rem 24 | } 25 | 26 | .mdx-hero h1 { 27 | color: currentColor; 28 | font-weight: 700; 29 | margin-bottom: 1rem 30 | } 31 | 32 | @media screen and (max-width: 29.9375em) { 33 | .mdx-hero h1 { 34 | font-size:1.4rem 35 | } 36 | } 37 | 38 | .mdx-hero__content { 39 | padding-bottom: 1rem 40 | } 41 | 42 | @media screen and (min-width: 60em) { 43 | .mdx-hero { 44 | align-items:stretch; 45 | display: flex 46 | } 47 | 48 | .mdx-hero__content { 49 | margin-top: 1.5rem; 50 | max-width: 19rem; 51 | /* padding-bottom: 14vw */ 52 | } 53 | 54 | .mdx-hero__image { 55 | order: 1; 56 | transform: translateX(4rem); 57 | width: 40rem; 58 | height: 20rem 59 | } 60 | } 61 | 62 | @media screen and (min-width: 76.25em) { 63 | .mdx-hero__image { 64 | transform:translateX(8rem) 65 | } 66 | } 67 | 68 | .mdx-hero .md-button { 69 | color: var(--md-primary-bg-color); 70 | margin-right: .5rem; 71 | margin-top: .5rem 72 | } 73 | 74 | .mdx-hero .md-button:focus,.mdx-hero .md-button:hover { 75 | background-color: var(--md-accent-fg-color); 76 | border-color: var(--md-accent-fg-color); 77 | color: var(--md-accent-bg-color) 78 | } 79 | 80 | .mdx-hero .md-button--primary { 81 | background-color: var(--md-primary-bg-color); 82 | border-color: var(--md-primary-bg-color); 83 | color: #894da8 84 | } 85 | .tx-container { 86 | padding-top: .0rem; 87 | background: linear-gradient(to bottom, var(--md-primary-fg-color), #9941d4 100%,#4051b5 99%) 88 | } 89 
| 90 | .feature-item h2 svg { 91 | height: 30px; 92 | float: left; 93 | margin-right: 10px; 94 | transform: translateY(10%); 95 | } 96 | 97 | .top-hr { 98 | margin-top: 42px; 99 | } 100 | 101 | .feature-item { 102 | font-family: 'Lato', sans-serif; 103 | font-weight: 300; 104 | box-sizing: border-box; 105 | padding: 0 15px; 106 | word-break: break-word 107 | } 108 | 109 | .feature-item h2 { 110 | color: #333; 111 | font-weight: 300; 112 | font-size: 25px; 113 | white-space: nowrap; 114 | overflow: hidden; 115 | text-overflow: ellipsis; 116 | line-height: normal; 117 | margin-top: 20px; 118 | margin-bottom: 10px; 119 | font-family: inherit; 120 | } 121 | 122 | .feature-item p { 123 | font-size: 16px; 124 | line-height: 1.8em; 125 | text-rendering: optimizeLegibility; 126 | -webkit-font-smoothing: antialiased; 127 | color: #111; 128 | margin: 0 0 10px; 129 | display: block; 130 | } 131 | 132 | @media screen and (min-width:76.25em) { 133 | .md-sidebar--primary { 134 | display: none 135 | } 136 | 137 | .top-hr { 138 | width: 100%; 139 | display: flex; 140 | max-width: 61rem; 141 | margin-right: auto; 142 | margin-left: auto; 143 | padding: 0 .2rem; 144 | } 145 | 146 | .bottom-hr { 147 | margin-top: 10px; 148 | width: 100%; 149 | display: flex; 150 | max-width: 61rem; 151 | margin-right: auto; 152 | margin-left: auto; 153 | padding: 0 .2rem; 154 | } 155 | 156 | .feature-item { 157 | flex: 1; 158 | min-width: 0; 159 | } 160 | 161 | .feature-item:hover { 162 | background-color: #526cfe47; 163 | border-radius: 3px; 164 | } 165 | } 166 | 167 | .hr { 168 | border-bottom: 1px solid #eee; 169 | width: 100%; 170 | margin: 20px 0; 171 | } 172 | 173 | .md-footer-meta__inner { 174 | display: flex; 175 | flex-wrap: wrap; 176 | justify-content: space-between; 177 | margin-top: 0rem; 178 | } 179 | 180 | .md-footer-social { 181 | padding-top: 20px; 182 | } 183 | -------------------------------------------------------------------------------- /cn/docs/assets/install-selector.css: 
-------------------------------------------------------------------------------- 1 | #instruction ul, #instruction li, #instruction p{ 2 | margin: 0; 3 | padding: 0; 4 | } 5 | #instruction{ 6 | padding-top: 20px; 7 | width: 700px; 8 | } 9 | #instruction ul{ 10 | display: flex; 11 | width: 100%; 12 | margin: 10px -3px 0; 13 | } 14 | #instruction li{ 15 | border: 2px solid #FFF; 16 | height: 48px; 17 | line-height: 48px; 18 | cursor: pointer; 19 | text-align: center; 20 | list-style: none; 21 | flex: 1; 22 | margin: 0 3px; 23 | border-radius: 4px; 24 | transition: all .3s; 25 | } 26 | #instruction li.active{ 27 | border: 2px solid #FFFFFF; 28 | background: #526cfe6b; 29 | } 30 | 31 | #instruction .command{ 32 | padding-top: 10px; 33 | } 34 | #instruction .panel-code{ 35 | border-radius: 4px; 36 | background: #FFF; 37 | color: #333; 38 | padding: 20px; 39 | margin-top: 10px; 40 | } 41 | .smlVers{ 42 | overflow: hidden; 43 | } 44 | 45 | .command-copy{ 46 | margin-top: 5px; 47 | margin-bottom: 0px; 48 | width: 0%; 49 | height: 0%; 50 | float: right; 51 | } 52 | 53 | @media screen and (max-width: 29.9375em) { 54 | #instruction{ 55 | padding-top: 20px; 56 | width: 95%; 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /cn/docs/assets/install-selector.js: -------------------------------------------------------------------------------- 1 | ; (function () { 2 | window.addEventListener('load', () => { 3 | 4 | function get_commands(latest_version) { 5 | let stable_command_118 = 'python3 -m pip install -f https://oneflow-staging.oss-cn-beijing.aliyuncs.com/branch/release/v1.0.0/cu118 oneflow' 6 | let stable_command_121 = 'python3 -m pip install -f https://oneflow-staging.oss-cn-beijing.aliyuncs.com/branch/release/v1.0.0/cu121 oneflow' 7 | let stable_command_122 = 'python3 -m pip install -f https://oneflow-staging.oss-cn-beijing.aliyuncs.com/branch/release/v1.0.0/cu122 oneflow' 8 | let stable_command_cpu = 'python3 -m pip install 
-f https://oneflow-staging.oss-cn-beijing.aliyuncs.com/branch/release/v1.0.0/cpu oneflow' 9 | let commands = [ 10 | { 11 | versions: 'Stable', 12 | framework: 'CUDA', 13 | smlVers: '11.8', 14 | command: stable_command_118.replace("VERSION", latest_version) 15 | }, 16 | { 17 | versions: 'Stable', 18 | framework: 'CUDA', 19 | smlVers: '12.1', 20 | command: stable_command_121.replace("VERSION", latest_version) 21 | }, 22 | { 23 | versions: 'Stable', 24 | framework: 'CUDA', 25 | smlVers: '12.2', 26 | command: stable_command_122.replace("VERSION", latest_version) 27 | }, 28 | { 29 | versions: 'Stable', 30 | framework: 'CPU', 31 | smlVers: '', 32 | command: stable_command_cpu.replace("VERSION", latest_version) 33 | }, 34 | { 35 | versions: 'Nightly', 36 | framework: 'CUDA', 37 | smlVers: '11.8', 38 | command: 'python3 -m pip install -f https://staging.oneflow.info/branch/master/cu118 --pre oneflow' 39 | }, 40 | { 41 | versions: 'Nightly', 42 | framework: 'CUDA', 43 | smlVers: '12.1', 44 | command: 'python3 -m pip install -f https://staging.oneflow.info/branch/master/cu121 --pre oneflow' 45 | }, 46 | { 47 | versions: 'Nightly', 48 | framework: 'CUDA', 49 | smlVers: '12.2', 50 | command: 'python3 -m pip install -f https://staging.oneflow.info/branch/master/cu122 --pre oneflow' 51 | }, 52 | { 53 | versions: 'Nightly', 54 | framework: 'CPU', 55 | smlVers: '', 56 | command: 'python3 -m pip install -f https://staging.oneflow.info/branch/master/cpu --pre oneflow' 57 | }, 58 | ] 59 | return commands 60 | } 61 | 62 | function init_selector(commands) { 63 | let condition = { 64 | versions: 'Stable', 65 | framework: 'CUDA', 66 | smlVers: '11.8', 67 | } 68 | selectCommands(condition) 69 | let items = document.querySelectorAll('#instruction li') 70 | 71 | function selectCommands(conditioning) { 72 | let filter = null 73 | if (conditioning.framework == "CPU") { 74 | filter = commands.filter(e => e.versions == conditioning.versions).filter(e => e.framework == conditioning.framework) 75 
| } else { 76 | filter = commands.filter(e => e.versions == conditioning.versions).filter(e => e.framework == conditioning.framework).filter(e => e.smlVers == conditioning.smlVers) 77 | } 78 | if (filter && filter[0]) { 79 | document.querySelector('.panel-code').innerHTML = filter[0].command 80 | } 81 | } 82 | items.forEach(e => { 83 | e.addEventListener('click', function () { 84 | let attach = this.getAttribute('attach') 85 | let tempItems = document.querySelectorAll(`[attach=${attach}]`) 86 | tempItems.forEach(e => { 87 | e.className = '' 88 | }) 89 | this.className = 'active' 90 | condition[attach] = this.innerHTML 91 | if (condition['framework'] == 'CPU') { 92 | document.querySelector('.smlVers').style.height = '0px' 93 | } else { 94 | document.querySelector('.smlVers').style.height = '48px' 95 | } 96 | selectCommands(condition) 97 | }) 98 | }) 99 | } 100 | 101 | let TAGS_API_URL = 'https://api.github.com/repos/Oneflow-Inc/oneflow/tags' 102 | let xmlhttp = new XMLHttpRequest(); 103 | let latest_version_hardcode = "0.8.0" // using latest version in hard-code way if request fails 104 | xmlhttp.onreadystatechange = function () { 105 | if (xmlhttp.readyState == 4) {// 4 = "loaded" 106 | if (xmlhttp.status == 200) {// 200 = "OK" 107 | localStorage.latest_version = eval(xmlhttp.responseText)[0].name.replace("v", "").replace("0.8.1", "0.8.0") // eg: v0.x.0 => 0.x.0 108 | init_selector(get_commands(localStorage.latest_version)) 109 | } 110 | else { 111 | init_selector(get_commands(localStorage.latest_version ? 
localStorage.latest_version : latest_version_hardcode)) 112 | } 113 | } 114 | } 115 | xmlhttp.open("GET", TAGS_API_URL, true) 116 | xmlhttp.send(null) 117 | }) 118 | })(); 119 | 120 | function copyPipCommand() { 121 | var copyText = document.querySelector('.panel-code').innerHTML 122 | navigator.clipboard.writeText(copyText) 123 | } 124 | -------------------------------------------------------------------------------- /cn/docs/assets/product-layer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/oneflow-documentation/d4dc26f4de1aac396afb4d59b9de9f78f91871d5/cn/docs/assets/product-layer.png -------------------------------------------------------------------------------- /cn/docs/basics/02_tensor.md: -------------------------------------------------------------------------------- 1 | # Tensor 张量 2 | 3 | 神经网络中的数据,都存放在 Tensor 中,Tensor 类似多维数组或者数学上的矩阵。OneFlow 提供了很多用于操作 Tensor 的算子,Tensor 与算子一起构成神经网络。 4 | 5 | Tensor 有别于普通的多维数组的地方是:除了可以运行在 CPU 上外,它还可以运行在 其它 AI 芯片(如 NVIDIA GPU)上,因此可以提高运算速度。此外,OneFlow 还为张量提供了 [自动求导](./05_autograd.md) 的功能。 6 | 7 | ```python 8 | import oneflow as flow 9 | import numpy as np 10 | ``` 11 | 12 | ## 创建 Tensor 13 | 14 | 有多种方法创建 Tensor,包括: 15 | 16 | - 直接从数据创建 17 | - 通过 Numpy 数组创建 18 | - 使用算子创建 19 | 20 | ### 直接从数据创建 21 | 22 | 可以直接从数据创建 Tensor: 23 | 24 | ```python 25 | x1 = flow.tensor([[1, 2], [3, 4]]) 26 | x2 = flow.tensor([[1.0, 2.0], [3.0, 4.0]]) 27 | print(x1) 28 | print(x2) 29 | ``` 30 | 31 | 可以看到创建的 `x1`、`x2` Tensor,它们的类型分别是 `int64` 和 `float32`。 32 | 33 | ```text 34 | tensor([[1, 2], 35 | [3, 4]], dtype=oneflow.int64) 36 | tensor([[1., 2.], 37 | [3., 4.]], dtype=oneflow.float32) 38 | ``` 39 | 40 | ### 通过 Numpy 数组创建 41 | 42 | Tensor 可以通过 Numpy 数组创建,只需要在创建 Tensor 对象时,将 Numpy 数组作为参数传递即可。 43 | 44 | ```python 45 | x3 = flow.tensor(np.ones((2,3))) 46 | x4 = flow.tensor(np.random.rand(2,3)) 47 | print(x3) 48 | print(x4) 49 | ``` 50 | 51 | ```text 52 | tensor([[1., 1., 
1.], 53 | [1., 1., 1.]], dtype=oneflow.float64) 54 | tensor([[0.6213, 0.6142, 0.1592], 55 | [0.5539, 0.8453, 0.8576]], dtype=oneflow.float64) 56 | ``` 57 | 58 | ### 通过算子创建 59 | 60 | OneFlow 中还提供了一些算子,可以通过它们创建 Tensor。比如 [ones](https://oneflow.readthedocs.io/en/v0.8.1/generated/oneflow.ones.html)、 [zeros](https://oneflow.readthedocs.io/en/v0.8.1/generated/oneflow.zeros.html)、[eye](https://oneflow.readthedocs.io/en/v0.8.1/generated/oneflow.eye.html),它们分别创建全为1的张量、全为0的张量和单位张量。 61 | 62 | ```python 63 | x5 = flow.ones(2, 3) 64 | x6 = flow.zeros(2, 3) 65 | x7 = flow.eye(3) 66 | print(x5) 67 | print(x6) 68 | print(x7) 69 | ``` 70 | 71 | ```text 72 | tensor([[1., 1., 1.], 73 | [1., 1., 1.]], dtype=oneflow.float32) 74 | tensor([[0., 0., 0.], 75 | [0., 0., 0.]], dtype=oneflow.float32) 76 | tensor([[1., 0., 0.], 77 | [0., 1., 0.], 78 | [0., 0., 1.]], dtype=oneflow.float32) 79 | ``` 80 | 81 | [randn](https://oneflow.readthedocs.io/en/v0.8.1/generated/oneflow.randn.html) 方法可以创建随机化的张量: 82 | 83 | ```python 84 | x8 = flow.randn(2,3) 85 | ``` 86 | 87 | ## `Tensor` 与 `tensor` 的区别 88 | 89 | 细心的用户会发现,OneFlow 中有 [oneflow.Tensor](https://oneflow.readthedocs.io/en/v0.8.1/tensor.html) 和 [oneflow.tensor](https://oneflow.readthedocs.io/en/v0.8.1/generated/oneflow.tensor.html) 两个接口,它们都能用来创建张量。那么它们有什么区别呢? 
90 | 91 | 简单而言,大写的 `Tensor` 数据类型默认限定为 `float32`,而小写的 `tensor` 的数据类型可以随着创建时的数据改变。以下代码展示了两者这方面的区别: 92 | 93 | ```python 94 | print(flow.Tensor([1, 2, 3])) 95 | print(flow.tensor([1, 2, 3])) 96 | print(flow.tensor([1.0, 2.0, 3.0])) 97 | ``` 98 | 99 | 数据结果为: 100 | 101 | ```text 102 | tensor([1., 2., 3.], dtype=oneflow.float32) 103 | tensor([1, 2, 3], dtype=oneflow.int64) 104 | tensor([1., 2., 3.], dtype=oneflow.float32) 105 | ``` 106 | 107 | 此外,大写的 `Tensor` 可以在创建时不指定具体数据: 108 | 109 | ```python 110 | x9 = flow.Tensor(2, 3) 111 | print(x9.shape) 112 | ``` 113 | 114 | ```text 115 | flow.Size([2, 3]) 116 | ``` 117 | 118 | 因此,如果在创建张量的同时不想指定数据,那么常常用 `oneflow.Tensor`,否则,应该使用 `oneflow.tensor`。 119 | 120 | ## Tensor 的属性 121 | 122 | Tensor 的 `shape`、`dtype`、`device` 属性分别描述了 Tensor 的形状、数据类型和所在的设备类型。 123 | 124 | ```python 125 | x9 = flow.randn(1,4) 126 | print(x9.shape) 127 | print(x9.dtype) 128 | print(x9.device) 129 | ``` 130 | 131 | 输出结果分别展示了张量的形状、数据类型和所处的设备(第0号 CPU 上,之所以有编号,是因为 OneFlow 很方便自然地支持分布式,可参考 [Global Tensor](../parallelism/03_consistent_tensor.md)) 132 | 133 | ```text 134 | flow.Size([1, 4]) 135 | oneflow.float32 136 | cpu:0 137 | ``` 138 | 139 | 可以通过 [reshape](https://oneflow.readthedocs.io/en/v0.8.1/generated/oneflow.reshape.html) 方法改变 Tensor 的形状,用 [Tensor.to](https://oneflow.readthedocs.io/en/v0.8.1/generated/oneflow.Tensor.to.html) 方法改变 Tensor 的数据类型和所处设备: 140 | 141 | ``` 142 | x10 = x9.reshape(2, 2) 143 | x11 = x10.to(dtype=flow.int32, device=flow.device("cuda")) 144 | print(x10.shape) 145 | print(x11.dtype, x11.device) 146 | ``` 147 | 148 | ```text 149 | flow.Size([2, 2]) 150 | oneflow.int32 cuda:0 151 | ``` 152 | 153 | ## 操作 Tensor 的常见算子 154 | 155 | OneFlow 中提供了大量的算子,对 Tensor 进行操作,它们大多在 
[oneflow](https://oneflow.readthedocs.io/en/v0.8.1/oneflow.html)、[oneflow.Tensor](https://oneflow.readthedocs.io/en/v0.8.1/tensor.html)、[oneflow.nn](https://oneflow.readthedocs.io/en/v0.8.1/nn.html)、[oneflow.nn.functional](https://oneflow.readthedocs.io/en/v0.8.1/nn.functional.html)这几个名称空间下。 156 | 157 | OneFlow 中的 Tensor,与 Numpy 数组一样易用。比如,支持与 Numpy 类似的切片操作: 158 | 159 | ```python 160 | tensor = flow.ones(4, 4) 161 | print('First row: ',tensor[0]) 162 | print('First column: ', tensor[:, 0]) 163 | print('Last column:', tensor[..., -1]) 164 | tensor[:,1] = 0 165 | print(tensor) 166 | ``` 167 | 168 | ```text 169 | First row: tensor([1., 1., 1., 1.], dtype=oneflow.float32) 170 | First column: tensor([1., 1., 1., 1.], dtype=oneflow.float32) 171 | Last column: tensor([1., 1., 1., 1.], dtype=oneflow.float32) 172 | tensor([[1., 0., 1., 1.], 173 | [1., 0., 1., 1.], 174 | [1., 0., 1., 1.], 175 | [1., 0., 1., 1.]], dtype=oneflow.float32) 176 | ``` 177 | 178 | 此外,OneFlow 中还有很多其它操作,如算数相关操作的 [add](https://oneflow.readthedocs.io/en/v0.8.1/generated/oneflow.add.html)、[sub](https://oneflow.readthedocs.io/en/v0.8.1/generated/oneflow.sub.html)、[mul](https://oneflow.readthedocs.io/en/v0.8.1/generated/oneflow.mul.html)、[div](https://oneflow.readthedocs.io/en/v0.8.1/generated/oneflow.div.html)等;位置相关操作的 [scatter](https://oneflow.readthedocs.io/en/v0.8.1/generated/oneflow.scatter.html)、[gather](https://oneflow.readthedocs.io/en/v0.8.1/generated/oneflow.gather.html) 等;以及激活函数、卷积等([relu](https://oneflow.readthedocs.io/en/v0.8.1/generated/oneflow.nn.functional.relu.html)、[conv2d](https://oneflow.readthedocs.io/en/v0.8.1/generated/oneflow.nn.functional.conv2d.html)),点击它们的链接可以查看更详细的 API 说明,并找到更多的其它算子。 179 | -------------------------------------------------------------------------------- /cn/docs/basics/03_dataset_dataloader.md: -------------------------------------------------------------------------------- 1 | # Dataset 与 DataLoader 2 | 3 | OneFlow 的 `Dataset` 与 `DataLoader` 的行为与 
[PyTorch](https://pytorch.org/tutorials/beginner/basics/data_tutorial.html) 的是一致的,都是为了让数据集管理与模型训练解耦。 4 | 5 | `Dataset` 类用于定义如何读取数据。对于常见的计算机视觉数据集(如 FashionMNIST),可以直接使用 [FlowVision](https://github.com/Oneflow-Inc/vision) 库的 `datasets` 模块提供的数据集类,可以帮助我们自动下载并加载一些流行的数据集,这些类都间接继承了 `Dataset` 类。对于其他数据集,可以通过继承 `Dataset` 类来自定义数据集类。 6 | 7 | `DataLoader` 将 `Dataset` 封装为迭代器,方便训练时遍历并操作数据。 8 | 9 | ```python 10 | import matplotlib.pyplot as plt 11 | 12 | import oneflow as flow 13 | import oneflow.nn as nn 14 | from oneflow.utils.data import Dataset 15 | from flowvision import datasets 16 | from flowvision import transforms 17 | ``` 18 | 上面导入的 [flowvision.transforms](https://flowvision.readthedocs.io/en/stable/flowvision.transforms.html) 提供了一些对图像数据进行变换的操作(如 `ToTensor` 可以将 PIL 图像或 NumPy 数组转换为张量),可以在数据集类中直接使用。 19 | 20 | ## 使用 FlowVision 加载数据集 21 | 22 | 以下的例子展示了如何使用 `flowvision.datasets` 加载 FashionMNIST 数据集。 23 | 24 | 我们向 `FashionMNIST` 类传入以下参数: 25 | - `root`:数据集存放的路径 26 | - `train`: `True` 代表下载训练集、`False` 代表下载测试集 27 | - `download=True`: 如果 `root` 路径下数据集不存在,则从网络下载 28 | - `transforms`:指定的数据转换方式 29 | 30 | ```python 31 | training_data = datasets.FashionMNIST( 32 | root="data", 33 | train=True, 34 | download=True, 35 | transform=transforms.ToTensor(), 36 | source_url="https://oneflow-public.oss-cn-beijing.aliyuncs.com/datasets/mnist/Fashion-MNIST/", 37 | ) 38 | 39 | test_data = datasets.FashionMNIST( 40 | root="data", 41 | train=False, 42 | download=True, 43 | transform=transforms.ToTensor(), 44 | source_url="https://oneflow-public.oss-cn-beijing.aliyuncs.com/datasets/mnist/Fashion-MNIST/", 45 | ) 46 | ``` 47 | 48 | 第一次运行,会下载数据集,输出: 49 | 50 | ```text 51 | Downloading https://oneflow-public.oss-cn-beijing.aliyuncs.com/datasets/mnist/Fashion-MNIST/train-images-idx3-ubyte.gz 52 | Downloading https://oneflow-public.oss-cn-beijing.aliyuncs.com/datasets/mnist/Fashion-MNIST/train-images-idx3-ubyte.gz to data/FashionMNIST/raw/train-images-idx3-ubyte.gz 53 | 26422272/? 
[00:02<00:00, 8090800.72it/s] 54 | Extracting data/FashionMNIST/raw/train-images-idx3-ubyte.gz to data/FashionMNIST/raw 55 | 56 | Downloading https://oneflow-public.oss-cn-beijing.aliyuncs.com/datasets/mnist/Fashion-MNIST/train-labels-idx1-ubyte.gz 57 | Downloading https://oneflow-public.oss-cn-beijing.aliyuncs.com/datasets/mnist/Fashion-MNIST/train-labels-idx1-ubyte.gz to data/FashionMNIST/raw/train-labels-idx1-ubyte.gz 58 | 29696/? [00:00<00:00, 806948.09it/s] 59 | Extracting data/FashionMNIST/raw/train-labels-idx1-ubyte.gz to data/FashionMNIST/raw 60 | 61 | Downloading https://oneflow-public.oss-cn-beijing.aliyuncs.com/datasets/mnist/Fashion-MNIST/t10k-images-idx3-ubyte.gz 62 | Downloading https://oneflow-public.oss-cn-beijing.aliyuncs.com/datasets/mnist/Fashion-MNIST/t10k-images-idx3-ubyte.gz to data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz 63 | 4422656/? [00:00<00:00, 19237994.98it/s] 64 | Extracting data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz to data/FashionMNIST/raw 65 | 66 | Downloading https://oneflow-public.oss-cn-beijing.aliyuncs.com/datasets/mnist/Fashion-MNIST/t10k-labels-idx1-ubyte.gz 67 | Downloading https://oneflow-public.oss-cn-beijing.aliyuncs.com/datasets/mnist/Fashion-MNIST/t10k-labels-idx1-ubyte.gz to data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz 68 | 6144/? 
[00:00<00:00, 152710.85it/s] 69 | Extracting data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz to data/FashionMNIST/raw 70 | ``` 71 | 72 | ## 遍历数据 73 | 74 | `Dataset` 对象,可以像 `list` 一样,用下标索引,比如 `training_data[index]`。 75 | 以下的例子,随机访问 `training_data` 中的9个图片,并显示。 76 | 77 | ```python 78 | labels_map = { 79 | 0: "T-Shirt", 80 | 1: "Trouser", 81 | 2: "Pullover", 82 | 3: "Dress", 83 | 4: "Coat", 84 | 5: "Sandal", 85 | 6: "Shirt", 86 | 7: "Sneaker", 87 | 8: "Bag", 88 | 9: "Ankle Boot", 89 | } 90 | figure = plt.figure(figsize=(8, 8)) 91 | cols, rows = 3, 3 92 | from random import randint 93 | for i in range(1, cols * rows + 1): 94 | sample_idx = randint(0, len(training_data) - 1) 95 | img, label = training_data[sample_idx] 96 | figure.add_subplot(rows, cols, i) 97 | plt.title(labels_map[label]) 98 | plt.axis("off") 99 | plt.imshow(img.squeeze().numpy(), cmap="gray") 100 | plt.show() 101 | ``` 102 | 103 | ![fashionMNIST](./imgs/fashionMNIST.png) 104 | 105 | ## 自定义 Dataset 106 | 107 | 通过继承 [oneflow.utils.data.Dataset](https://oneflow.readthedocs.io/en/v0.8.1/utils.data.html#oneflow.utils.data.Dataset) 可以实现自定义 `Dataset`,自定义 `Dataset` 同样可以配合下一节介绍的 `Dataloader` 使用,简化数据处理的流程。 108 | 109 | 以下的例子展示了如何实现一个自定义 `Dataset`,它的关键步骤是: 110 | 111 | - 继承 `oneflow.utils.data.Dataset` 112 | - 实现类的 `__len__` 方法,返回结果通常为该数据集中的样本数量 113 | - 实现类的 `__getitem__` 方法,它的返回值对应了用户(或框架)调用 `dataset_obj[idx]` 时得到的结果 114 | 115 | ```python 116 | import numpy as np 117 | class CustomDataset(Dataset): 118 | raw_data_x = np.array([[1, 2], [2, 3], [4, 6], [3, 1]], dtype=np.float32) 119 | raw_label = np.array([[8], [13], [26], [9]], dtype=np.float32) 120 | 121 | def __init__(self, transform=None, target_transform=None): 122 | self.transform = transform 123 | self.target_transform = target_transform 124 | 125 | def __len__(self): 126 | return len(CustomDataset.raw_label) 127 | 128 | def __getitem__(self, idx): 129 | x = CustomDataset.raw_data_x[idx] 130 | label = CustomDataset.raw_label[idx] 131 | if self.transform: 132 | x = 
self.transform(x) 133 | if self.target_transform: 134 | label = self.target_transform(label) 135 | return x, label 136 | 137 | custom_dataset = CustomDataset() 138 | print(custom_dataset[0]) 139 | print(custom_dataset[1]) 140 | ``` 141 | 142 | 输出: 143 | 144 | ```text 145 | (array([1., 2.], dtype=float32), array([8.], dtype=float32)) 146 | (array([2., 3.], dtype=float32), array([13.], dtype=float32)) 147 | ``` 148 | 149 | ## 使用 DataLoader 150 | 151 | 利用 Dataset 可以一次获取一条样本数据。但是在训练中,往往有其它的需求,如:一次读取 batch size 份数据;1轮 epoch 训练后,数据重新打乱(reshuffle)等。 152 | 153 | 这时候,使用 `DataLoader` 即可。 `DataLoader` 可以将 `Dataset` 封装为迭代器,方便训练循环中获取数据。如以下例子: 154 | 155 | - `batch_size=64` : 指定一次迭代返回的数据 batch size 156 | - `shuffle` :是否要随机打乱数据的顺序 157 | 158 | ```python 159 | from oneflow.utils.data import DataLoader 160 | 161 | train_dataloader = DataLoader(training_data, batch_size=64, shuffle=True) 162 | x, label = next(iter(train_dataloader)) 163 | print(f"shape of x:{x.shape}, shape of label: {label.shape}") 164 | ``` 165 | 166 | 输出: 167 | 168 | ```text 169 | shape of x:flow.Size([64, 1, 28, 28]), shape of label: flow.Size([64]) 170 | ``` 171 | 172 | ```python 173 | img = x[0].squeeze().numpy() 174 | label = label[0] 175 | plt.imshow(img, cmap="gray") 176 | plt.show() 177 | print(label) 178 | ``` 179 | 180 | 输出:(随机输出一张图片) 181 | 182 | ![dataloader item](./imgs/dataloader_item.png) 183 | 184 | ```text 185 | tensor(9, dtype=oneflow.int64) 186 | ``` 187 | 188 | 自然我们也可以在训练的循环中,使用 `DataLoader` 迭代器: 189 | 190 | ```python 191 | for x, label in train_dataloader: 192 | print(x.shape, label.shape) 193 | # training... 
194 | ``` 195 | -------------------------------------------------------------------------------- /cn/docs/basics/04_build_network.md: -------------------------------------------------------------------------------- 1 | # 搭建神经网络 2 | 3 | ​神经网络的各层,可以使用 [oneflow.nn](https://oneflow.readthedocs.io/en/v0.8.1/nn.html) 名称空间下的 API 搭建,它提供了构建神经网络所需的常见 Module(如 [oneflow.nn.Conv2d](https://oneflow.readthedocs.io/en/v0.8.1/generated/oneflow.nn.Conv2d.html),[oneflow.nn.ReLU](https://oneflow.readthedocs.io/en/v0.8.1/generated/oneflow.nn.ReLU.html) 等等)。 用于搭建网络的所有 Module 类都继承自 [oneflow.nn.Module](https://oneflow.readthedocs.io/en/v0.8.1/generated/oneflow.nn.Module.html),多个简单的 Module 可以组合在一起构成更复杂的 Module,用这种方式,用户可以轻松地搭建和管理复杂的神经网络。 4 | 5 | ```python 6 | import oneflow as flow 7 | import oneflow.nn as nn 8 | ``` 9 | 10 | ## 定义 Module 类 11 | 12 | `oneflow.nn` 下提供了常见的 Module 类,我们可以直接使用它们,或者在它们的基础上,通过自定义 Module 类搭建神经网络。搭建过程包括: 13 | 14 | - 写一个继承自 `oneflow.nn.Module` 的类 15 | - 实现类的 `__init__` 方法,在其中构建神经网络的结构 16 | - 实现类的 `forward` 方法,这个方法针对 Module 的输入进行计算 17 | 18 | ```python 19 | class NeuralNetwork(nn.Module): 20 | def __init__(self): 21 | super(NeuralNetwork, self).__init__() 22 | self.flatten = nn.Flatten() 23 | self.linear_relu_stack = nn.Sequential( 24 | nn.Linear(28*28, 512), 25 | nn.ReLU(), 26 | nn.Linear(512, 512), 27 | nn.ReLU(), 28 | nn.Linear(512, 10), 29 | nn.ReLU() 30 | ) 31 | 32 | def forward(self, x): 33 | x = self.flatten(x) 34 | logits = self.linear_relu_stack(x) 35 | return logits 36 | net = NeuralNetwork() 37 | print(net) 38 | ``` 39 | 40 | 以上代码,会输出刚刚搭建的 `NeuralNetwork` 网络的结构: 41 | 42 | ```text 43 | NeuralNetwork( 44 | (flatten): Flatten(start_dim=1, end_dim=-1) 45 | (linear_relu_stack): Sequential( 46 | (0): Linear(in_features=784, out_features=512, bias=True) 47 | (1): ReLU() 48 | (2): Linear(in_features=512, out_features=512, bias=True) 49 | (3): ReLU() 50 | (4): Linear(in_features=512, out_features=10, bias=True) 51 | (5): ReLU() 52 | ) 53 | ) 54 | ``` 55 | 56 | 接着,调用 
`net` (注意:不推荐显式调用 `forward`)即可完成前向传播: 57 | 58 | ```python 59 | X = flow.ones(1, 28, 28) 60 | logits = net(X) 61 | pred_probab = nn.Softmax(dim=1)(logits) 62 | y_pred = pred_probab.argmax(1) 63 | print(f"Predicted class: {y_pred}") 64 | ``` 65 | 66 | 会得到类似以下的输出结果: 67 | 68 | ```text 69 | Predicted class: tensor([1], dtype=oneflow.int32) 70 | ``` 71 | 72 | 以上从数据输入、到网络计算,最终推理输出的流程,如下图所示: 73 | 74 | ![todo](./imgs/neural-network-layers.png) 75 | 76 | ## `flow.nn.functional` 77 | 78 | 除了 `oneflow.nn` 外,[oneflow.nn.functional](https://oneflow.readthedocs.io/en/v0.8.1/nn.functional.html) 名称空间下也提供了不少 API。它与 `oneflow.nn` 在功能上有一定的重叠。比如 [nn.functional.relu](https://oneflow.readthedocs.io/en/v0.8.1/generated/oneflow.nn.functional.relu.html) 与 [nn.ReLU](https://oneflow.readthedocs.io/en/v0.8.1/generated/oneflow.nn.ReLU.html) 都可用于神经网络做 activation 操作。 79 | 80 | 两者的区别主要有: 81 | 82 | - `nn` 下的 API 是类,需要先构造实例化对象,再调用;`nn.functional` 下的 API 是作为函数直接调用 83 | - `nn` 下的类内部自己管理了网络参数;而 `nn.functional` 下的函数,需要我们自己定义参数,每次调用时手动传入 84 | 85 | 实际上,OneFlow 提供的大部分 Module 是通过封装 `nn.functional` 下的方法得到的。`nn.functional` 提供了更加细粒度管理网络的可能。 86 | 87 | 以下的例子,使用 `nn.functional` 中的方法,构建与上文中 `NeuralNetwork` 类等价的 Module `FunctionalNeuralNetwork`,读者可以体会两者的异同: 88 | 89 | ```python 90 | class FunctionalNeuralNetwork(nn.Module): 91 | def __init__(self): 92 | super(FunctionalNeuralNetwork, self).__init__() 93 | 94 | self.weight1 = nn.Parameter(flow.randn(28*28, 512)) 95 | self.bias1 = nn.Parameter(flow.randn(512)) 96 | 97 | self.weight2 = nn.Parameter(flow.randn(512, 512)) 98 | self.bias2 = nn.Parameter(flow.randn(512)) 99 | 100 | self.weight3 = nn.Parameter(flow.randn(512, 10)) 101 | self.bias3 = nn.Parameter(flow.randn(10)) 102 | 103 | def forward(self, x): 104 | x = x.reshape(1, 28*28) 105 | out = flow.matmul(x, self.weight1) 106 | out = out + self.bias1 107 | out = nn.functional.relu(out) 108 | 109 | out = flow.matmul(out, self.weight2) 110 | out = out + self.bias2 111 | out = nn.functional.relu(out) 112 | 113 | out = 
flow.matmul(out, self.weight3) 114 | out = out + self.bias3 115 | out = nn.functional.relu(out) 116 | 117 | return out 118 | 119 | net = FunctionalNeuralNetwork() 120 | X = flow.ones(1, 28, 28) 121 | logits = net(X) 122 | pred_probab = nn.Softmax(dim=1)(logits) 123 | y_pred = pred_probab.argmax(1) 124 | print(f"Predicted class: {y_pred}") 125 | ``` 126 | 127 | ## Module 容器 128 | 129 | 比较以上 `NeuralNetwork` 与 `FunctionalNeuralNetwork` 实现的异同,可以发现 [nn.Sequential](https://oneflow.readthedocs.io/en/v0.8.1/generated/oneflow.nn.Sequential.html) 对于简化代码起到了重要作用。 130 | 131 | `nn.Sequential` 是一种特殊容器,只要是继承自 `nn.Module` 的类都可以放置放置到其中。 132 | 133 | 它的特殊之处在于:当 Sequential 进行前向传播时,Sequential 会自动地将容器中包含的各层“串联”起来。具体来说,会按照各层加入 Sequential 的顺序,自动地将上一层的输出,作为下一层的输入传递,直到得到整个 Module 的最后一层的输出。 134 | 135 | 以下是不使用 Sequential 构建网络的例子(不推荐): 136 | 137 | ```python 138 | class MyModel(nn.Module): 139 | def __init__(self): 140 | super(MyModel, self).__init__() 141 | self.conv1 = nn.Conv2d(1,20,5) 142 | self.relu1 = nn.ReLU() 143 | self.conv2 = nn.Conv2d(20,64,5) 144 | self.relu2 = nn.ReLU() 145 | 146 | def forward(self, x): 147 | out = self.conv1(x) 148 | out = self.relu1(out) 149 | out = self.conv2(out) 150 | out = self.relu2(out) 151 | return out 152 | ``` 153 | 154 | 如果使用 Sequential,则看起来是这样,会显得更简洁。 155 | 156 | ```python 157 | class MySeqModel(nn.Module): 158 | def __init__(self): 159 | super(MySeqModel, self).__init__() 160 | self.seq = nn.Sequential( 161 | nn.Conv2d(1,20,5), 162 | nn.ReLU(), 163 | nn.Conv2d(20,64,5), 164 | nn.ReLU() 165 | ) 166 | 167 | def forward(self, x): 168 | return self.seq(x) 169 | ``` 170 | 171 | 除了 Sequential 外,还有 `nn.ModuleList` 及 `nn.ModuleDict`,除了会自动注册参数到整个网络外,他们的其它行为类似 Python list、Python dict,只是常用简单的容器,不会自动进行前后层的前向传播,需要自己手工遍历完成各层的计算。 -------------------------------------------------------------------------------- /cn/docs/basics/05_autograd.md: -------------------------------------------------------------------------------- 1 | # Autograd 2 | 3 | 神经网络的训练过程离不开 
**反向传播算法**,在反向传播过程中,需要获取 loss 函数对模型参数的梯度,用于更新参数。 4 | 5 | OneFlow 提供了自动求导机制,可自动计算神经网络中参数的梯度。 6 | 7 | 本文将先介绍计算图的基本概念,它有利于理解 OneFlow 自动求导的常见设置及限制,再介绍 OneFlow 中与自动求导有关的常见接口。 8 | 9 | ## 计算图 10 | 11 | 张量与算子,共同组成计算图,如以下代码: 12 | 13 | ```python 14 | import oneflow as flow 15 | 16 | def loss(y_pred, y): 17 | return flow.sum(1/2*(y_pred-y)**2) 18 | 19 | x = flow.ones(1, 5) # 输入 20 | w = flow.randn(5, 3, requires_grad=True) 21 | b = flow.randn(1, 3, requires_grad=True) 22 | z = flow.matmul(x, w) + b 23 | 24 | y = flow.zeros(1, 3) # label 25 | l = loss(z,y) 26 | ``` 27 | 28 | 它对应的计算图如下: 29 | 30 | ![todo](./imgs/compute_graph.png) 31 | 32 | 计算图中,像 `x`、`w`、`b`、`y` 这种只有输出,没有输入的节点称为 **叶子节点**;像 `loss` 这种只有输入没有输出的节点,称为 **根节点**。 33 | 34 | 反向传播过程中,需要求得 `l` 对 `w`、`b` 的梯度,以更新这两个模型参数。因此,我们在创建它们时,设置 `requires_grad` 为 `True`。 35 | 36 | ## 自动求梯度 37 | 38 | ### backward 与梯度 39 | 40 | 在反向传播的过程中,需要得到 `l` 分别对 `w`、`b` 的梯度 $\frac{\partial l}{\partial w}$ 和 $\frac{\partial l}{\partial b}$。我们只需要对 `l` 调用 `backward()` 方法,然后 OneFlow 就会自动计算梯度,并且存放到 `w` 与 `b` 的 `grad` 成员中。 41 | 42 | ```python 43 | l.backward() 44 | print(w.grad) 45 | print(b.grad) 46 | ``` 47 | 48 | ```text 49 | tensor([[0.9397, 2.5428, 2.5377], 50 | [0.9397, 2.5428, 2.5377], 51 | [0.9397, 2.5428, 2.5377], 52 | [0.9397, 2.5428, 2.5377], 53 | [0.9397, 2.5428, 2.5377]], dtype=oneflow.float32) 54 | tensor([[0.9397, 2.5428, 2.5377]], dtype=oneflow.float32) 55 | ``` 56 | 57 | ### 对非叶子节点求梯度 58 | 59 | 默认情况下,只有 `requires_grad=True` 的叶子节点的梯度会被保留。非叶子节点的 `grad` 属性默认在 `backward` 执行过程中,会自动释放,不能查看。 60 | 61 | 如果想保留并查看非叶子节点的梯度,可以调用 `Tensor.retain_grad` 方法: 62 | 63 | ```python 64 | from math import pi 65 | n1 = flow.tensor(pi/2, requires_grad=True) 66 | n2 = flow.sin(n1) 67 | n2.retain_grad() 68 | n3 = flow.pow(n2, 2) 69 | 70 | n3.backward() 71 | print(n1.grad) 72 | print(n2.grad) 73 | ``` 74 | 75 | 以上代码,既求 $\frac{\partial n_3}{\partial n_1}$,也求 $\frac{\partial n_3}{\partial n_2}$ 76 | 77 | 输出: 78 | 79 | ``` 80 | tensor(-8.7423e-08, dtype=oneflow.float32) 
81 | tensor(2., dtype=oneflow.float32) 82 | ``` 83 | 84 | ### 对一个计算图多次 `backward()` 85 | 86 | 默认情况下,对于给定的计算图,只能调用 `backward()` 一次。比如,以下代码会报错: 87 | 88 | ```python 89 | n1 = flow.tensor(10., requires_grad=True) 90 | n2 = flow.pow(n1, 2) 91 | n2.backward() 92 | n2.backward() 93 | ``` 94 | 95 | 报错信息: 96 | 97 | > Maybe you try to backward through the node a second time. Specify retain_graph=True when calling .backward() or autograd.grad() the first time. 98 | 99 | 如果想要在同一个计算图上调用多次 `backward()`,需要在调用时设置 `retain_graph=True`。 100 | 101 | ```python 102 | n1 = flow.tensor(10., requires_grad=True) 103 | n2 = flow.pow(n1, 2) 104 | 105 | n2.backward(retain_graph=True) 106 | print(n1.grad) 107 | n2.backward() 108 | print(n1.grad) 109 | ``` 110 | 111 | 输出: 112 | 113 | ```text 114 | tensor(20., dtype=oneflow.float32) 115 | tensor(40., dtype=oneflow.float32) 116 | ``` 117 | 118 | 以上输出可知,OneFlow 会 **累加** 多次 `backward()` 计算得到的梯度。 119 | 如果想清空梯度,可以调用 `zero_` 方法: 120 | 121 | ```python 122 | n1 = flow.tensor(10., requires_grad=True) 123 | n2 = flow.pow(n1, 2) 124 | 125 | n2.backward(retain_graph=True) 126 | print(n1.grad) 127 | n1.grad.zero_() 128 | n2.backward() 129 | print(n1.grad) 130 | ``` 131 | 132 | 输出: 133 | 134 | ```text 135 | tensor(20., dtype=oneflow.float32) 136 | tensor(20., dtype=oneflow.float32) 137 | ``` 138 | 139 | ### 不记录某个 Tensor 的梯度 140 | 141 | 默认情况下,OneFlow 会 tracing `requires_grad` 为 `True` 的 Tensor,自动求梯度。 142 | 不过有些情况可能并不需要 OneFlow 这样做,比如只是想试一试前向推理。那么可以使用 [oneflow.no_grad](https://oneflow.readthedocs.io/en/v0.8.1/generated/oneflow.no_grad.html) 或 [oneflow.Tensor.detach](https://oneflow.readthedocs.io/en/master/generated/oneflow.Tensor.detach.html#oneflow.Tensor.detach) 方法设置。 143 | 144 | ```python 145 | z = flow.matmul(x, w)+b 146 | print(z.requires_grad) 147 | 148 | with flow.no_grad(): 149 | z = flow.matmul(x, w)+b 150 | print(z.requires_grad) 151 | ``` 152 | 153 | 输出: 154 | 155 | ```text 156 | True 157 | False 158 | ``` 159 | 160 | ```python 161 | z_det = 
z.detach() 162 | print(z_det.requires_grad) 163 | ``` 164 | 165 | 输出: 166 | 167 | ```text 168 | False 169 | ``` 170 | 171 | ### 输出不是标量时如何求梯度 172 | 173 | 通常,调用 `backward()` 方法的是神经网络的 loss,是一个标量。 174 | 175 | 但是,如果不是标量,对 Tensor 调用 `backward()` 时会报错。 176 | 177 | ```python 178 | x = flow.randn(1, 2, requires_grad=True) 179 | y = 3*x + 1 180 | y.backward() 181 | ``` 182 | 183 | 报错信息: 184 | 185 | > Check failed: IsScalarTensor(*outputs.at(i)) Grad can be implicitly created only for scalar outputs 186 | 187 | 而对 `y` 求 `sum` 后可以求梯度: 188 | 189 | ```python 190 | x = flow.randn(1, 2, requires_grad=True) 191 | y = 3*x + 1 192 | y = y.sum() 193 | y.backward() 194 | print(x.grad) 195 | ``` 196 | 197 | 输出: 198 | 199 | ```text 200 | tensor([[3., 3.]], dtype=oneflow.float32) 201 | ``` 202 | 203 | 错误原因及解决方法的分析请参考下文 “扩展阅读” 部分。 204 | 205 | ## 扩展阅读 206 | 207 | `x` 张量中有两个元素,记作 $x_1$ 与 $x_2$,`y` 张量中的两个元素记作 $y_1$ 与 $y_2$,并且两者的关系是: 208 | 209 | $$ 210 | \mathbf{x} = [x_1, x_2] 211 | $$ 212 | 213 | $$ 214 | \mathbf{y} = [y_1, y_2] = [3x_1+1, 3x_2+1] 215 | $$ 216 | 217 | 此时,想直接求 $\frac{\partial \mathbf{y}}{\partial \mathbf{x}}$ 218 | 219 | $$ 220 | \frac{\partial \mathbf{y}}{\partial \mathbf{x}} = 221 | \frac{[3x_1+1, 3x_2+1]}{[x_1, x_2]} 222 | $$ 223 | 224 | 在数学上是没有意义的,因此当然就报错了。 225 | 实际上,当用户调用 `y.backward()` 时,其实想要的结果通常是: 226 | 227 | $$ 228 | [\frac{\partial y_1}{\partial x_1}, \frac{\partial y_2}{\partial x_2}] 229 | $$ 230 | 231 | 当对 `y` 进行 `sum` 运算后: 232 | 233 | $$ 234 | y = y_1 + y_2 = 3x_1 + 3x_2 + 2 235 | $$ 236 | 237 | 此时,调用 `backward()` 时,对 $x_1$ 和 $x_2$ 可求梯度: 238 | 239 | $$ 240 | \frac{\partial y}{\partial x_1} = \frac{\partial 3x_1 + 3x_2 + 2}{\partial x_1} = 3 241 | $$ 242 | 243 | $$ 244 | \frac{\partial y}{\partial x_2} = \frac{\partial 3x_1 + 3x_2 + 2}{\partial x_2} = 3 245 | $$ 246 | 247 | 除了使用 `sum` 之外,还可以使用更通用方法,即 **Vector Jacobian Product(VJP)** 完成非标量的根节点的梯度计算。依然用上文的例子,在反向传播过程中,OneFlow 会根据计算图生成雅可比矩阵: 248 | 249 | $$ 250 | J = \begin{pmatrix} 251 | \frac{\partial y_1}{\partial 
x_1} & \frac{\partial y_1}{\partial x_2}\\ 252 | \frac{\partial y_2}{\partial x_1} & \frac{\partial y_2}{\partial x_2} 253 | \end{pmatrix}\\ 254 | = \begin{pmatrix} 255 | \frac{\partial y_1}{\partial x_1} & 0 \\ 256 | 0 & \frac{\partial y_2}{\partial x_2} 257 | \end{pmatrix} 258 | $$ 259 | 260 | 只需提供一个与 $\mathbf{y}$ 大小一致的向量 $\mathbf{v}$,即可计算 VJP: 261 | 262 | $$ 263 | \begin{bmatrix} 264 | v_1\\ 265 | v_2 266 | \end{bmatrix} 267 | \times 268 | \begin{pmatrix} 269 | \frac{\partial y_1}{\partial x_1} & 0 \\ 270 | 0 & \frac{\partial y_2}{\partial x_2} 271 | \end{pmatrix}= 272 | \begin{bmatrix} 273 | v_1 \frac{\partial y_1}{\partial x_1}\\ 274 | v_2 \frac{\partial y_2}{\partial x_2} 275 | \end{bmatrix} 276 | $$ 277 | 278 | 若向量 $\mathbf{v}$ 是反向传播中上一层的梯度,VJP 的结果刚好是当前层要求的梯度。 279 | 280 | `backward` 方法是可以接受一个张量做参数的,该参数就是 VJP 中的 $\mathbf{v}$,理解以上道理后,还可以使用以下的方式对张量求梯度: 281 | 282 | ```python 283 | x = flow.randn(1, 2, requires_grad=True) 284 | y = 3*x + 1 285 | y.backward(flow.ones_like(y)) 286 | print(x.grad) 287 | ``` 288 | 289 | 输出: 290 | 291 | ```text 292 | tensor([[3., 3.]], dtype=oneflow.float32) 293 | ``` 294 | 295 | **外部链接** 296 | 297 | - [Automatic Differentiation](http://www.cs.toronto.edu/~rgrosse/courses/csc421_2019/slides/lec06.pdf) 298 | -------------------------------------------------------------------------------- /cn/docs/basics/06_optimization.md: -------------------------------------------------------------------------------- 1 | # 反向传播与 optimizer 2 | 3 | 到目前为止,我们已经掌握如何使用 OneFlow [加载数据](./03_dataset_dataloader.md)、[搭建模型](./04_build_network.md)、[自动计算模型参数的梯度](./05_autograd.md),将它们组合在一起,我们就可以利用反向传播算法训练模型。 4 | 5 | 在 [oneflow.optim](https://oneflow.readthedocs.io/en/v0.8.1/optim.html) 中,有各类 `optimizer`,它们可以简化实现反向传播的代码。 6 | 7 | 本文将先介绍反向传播的基本概念,再介绍如何使用 `oneflow.optim` 类。 8 | 9 | ## numpy 手工实现反向传播 10 | 11 | 为了读者更方便理解反向传播与自动求导的关系,在这里提供了一份仅用 numpy 实现的简单模型的训练过程: 12 | 13 | ```python 14 | import numpy as np 15 | 16 | ITER_COUNT = 500 17 | LR = 0.01 18 | 19 | # 前向传播 20 | 
def forward(x, w): 21 | return np.matmul(x, w) 22 | 23 | 24 | # 损失函数 25 | def loss(y_pred, y): 26 | return ((y_pred - y) ** 2).sum() 27 | 28 | 29 | # 计算梯度 30 | def gradient(x, y, y_pred): 31 | return np.matmul(x.T, 2 * (y_pred - y)) 32 | 33 | 34 | if __name__ == "__main__": 35 | # 训练目标: Y = 2*X1 + 3*X2 36 | x = np.array([[1, 2], [2, 3], [4, 6], [3, 1]], dtype=np.float32) 37 | y = np.array([[8], [13], [26], [9]], dtype=np.float32) 38 | 39 | w = np.array([[2], [1]], dtype=np.float32) 40 | # 训练循环 41 | for i in range(0, ITER_COUNT): 42 | y_pred = forward(x, w) 43 | l = loss(y_pred, y) 44 | if (i + 1) % 50 == 0: 45 | print(f"{i+1}/{500} loss:{l}") 46 | 47 | grad = gradient(x, y, y_pred) 48 | w -= LR * grad 49 | 50 | print(f"w:{w}") 51 | ``` 52 | 53 | 输出: 54 | 55 | ```text 56 | 50/500 loss:0.0034512376878410578 57 | 100/500 loss:1.965487399502308e-06 58 | 150/500 loss:1.05524122773204e-09 59 | 200/500 loss:3.865352482534945e-12 60 | 250/500 loss:3.865352482534945e-12 61 | 300/500 loss:3.865352482534945e-12 62 | 350/500 loss:3.865352482534945e-12 63 | 400/500 loss:3.865352482534945e-12 64 | 450/500 loss:3.865352482534945e-12 65 | 500/500 loss:3.865352482534945e-12 66 | w:[[2.000001 ] 67 | [2.9999993]] 68 | ``` 69 | 70 | 注意我们选择的 loss 函数表达式为 $\sum (y_{p} - y)^2$,因此 `loss` 对参数 `w`求梯度的代码为: 71 | 72 | ```python 73 | def gradient(x, y, y_pred): 74 | return np.matmul(x.T, 2 * (y_pred - y)) 75 | ``` 76 | 77 | 更新参数采用的是 [SGD](https://en.wikipedia.org/wiki/Stochastic_gradient_descent): 78 | 79 | ```python 80 | grad = gradient(x, y, y_pred) 81 | w -= LR*grad 82 | ``` 83 | 84 | 总结而言,训练中的一次完整迭代包括以下步骤: 85 | 86 | 1. 模型根据输入、参数,计算得出预测值 (`y_pred`) 87 | 2. 计算 loss,即预测值与标签之间的误差 88 | 3. 求 loss 对参数的梯度 89 | 4. 
更新参数 90 | 91 | 其中 1~2 为前向传播过程;3~4为反向传播过程。 92 | 93 | ## 超参 Hyperparameters 94 | 95 | 超参数是有关模型训练设置的参数,可以影响到模型训练的效率和结果。如以上代码中的 `ITER_COUNT`、`LR` 就是超参数。 96 | 97 | ## 使用 `oneflow.optim` 中的优化器类 98 | 99 | 使用 `oneflow.optim` 中的优化器类进行反向传播会更简洁方便,接下来,我们展示如何使用。 100 | 101 | 首先,先准备好数据和模型,使用 Module 的一个方便之处就是,可以把超参放置在 Module 中便于管理。 102 | 103 | ```python 104 | import oneflow as flow 105 | 106 | x = flow.tensor([[1, 2], [2, 3], [4, 6], [3, 1]], dtype=flow.float32) 107 | y = flow.tensor([[8], [13], [26], [9]], dtype=flow.float32) 108 | 109 | 110 | class MyLrModule(flow.nn.Module): 111 | def __init__(self, lr, iter_count): 112 | super().__init__() 113 | self.w = flow.nn.Parameter(flow.tensor([[2], [1]], dtype=flow.float32)) 114 | self.lr = lr 115 | self.iter_count = iter_count 116 | 117 | def forward(self, x): 118 | return flow.matmul(x, self.w) 119 | 120 | 121 | model = MyLrModule(0.01, 500) 122 | ``` 123 | 124 | ### loss 函数 125 | 126 | 然后,选择好 loss 函数,OneFlow 自带了多种 loss 函数,我们在这里选择 [MSELoss](https://oneflow.readthedocs.io/en/v0.8.1/generated/oneflow.nn.MSELoss.html): 127 | 128 | ```python 129 | loss = flow.nn.MSELoss(reduction="sum") 130 | ``` 131 | 132 | ### 构造 optimizer 133 | 134 | 反向传播的逻辑,都被封装在 optimizer 中。我们在此选择 [SGD](https://oneflow.readthedocs.io/en/v0.8.1/generated/oneflow.optim.SGD.html),你可以根据需要选择其它的优化算法,如 [Adam](https://oneflow.readthedocs.io/en/v0.8.1/generated/oneflow.optim.Adam.html)、[AdamW](https://oneflow.readthedocs.io/en/v0.8.1/generated/oneflow.optim.AdamW.html) 等。 135 | 136 | ```python 137 | optimizer = flow.optim.SGD(model.parameters(), model.lr) 138 | ``` 139 | 140 | 构造 `optimizer`时,将模型参数及 learning rate 传递给 `SGD`。之后调用 `optimizer.step()`,在其内部就会自动完成对模型参数求梯度、并按照 SGD 算法更新模型参数。 141 | 142 | ### 训练 143 | 144 | 以上准备完成后,可以开始训练: 145 | 146 | ```python 147 | for i in range(0, model.iter_count): 148 | y_pred = model(x) 149 | l = loss(y_pred, y) 150 | if (i + 1) % 50 == 0: 151 | print(f"{i+1}/{model.iter_count} loss:{l.numpy()}") 152 | 153 | optimizer.zero_grad() 154 | 
l.backward() 155 | optimizer.step() 156 | 157 | print(f"\nw: {model.w}") 158 | ``` 159 | 160 | 输出: 161 | 162 | ```text 163 | 50/500 loss:0.003451163647696376 164 | 100/500 loss:1.965773662959691e-06 165 | 150/500 loss:1.103217073250562e-09 166 | 200/500 loss:3.865352482534945e-12 167 | 250/500 loss:3.865352482534945e-12 168 | 300/500 loss:3.865352482534945e-12 169 | 350/500 loss:3.865352482534945e-12 170 | 400/500 loss:3.865352482534945e-12 171 | 450/500 loss:3.865352482534945e-12 172 | 500/500 loss:3.865352482534945e-12 173 | 174 | w: tensor([[2.], 175 | [3.]], dtype=oneflow.float32, grad_fn=) 176 | ``` 177 | -------------------------------------------------------------------------------- /cn/docs/basics/07_model_load_save.md: -------------------------------------------------------------------------------- 1 | # 模型的加载与保存 2 | 3 | 对于模型的加载与保存,常用的场景有: 4 | 5 | - 将已经训练一段时间的模型保存,方便下次继续训练 6 | - 将训练好的模型保存,方便后续直接用于预测 7 | 8 | 在本文中,我们将介绍,如何使用 [save](https://oneflow.readthedocs.io/en/v0.8.1/generated/oneflow.save.html) 和 [load](https://oneflow.readthedocs.io/en/v0.8.1/generated/oneflow.load.html) API 保存模型、加载模型。 9 | 10 | 同时也会展示,如何加载预训练模型,完成预测任务。 11 | 12 | ## 模型参数的获取与加载 13 | 14 | OneFlow 预先提供的各种 `Module` 或者用户自定义的 `Module`,都提供了 `state_dict` 方法获取模型所有的参数,它是以 “参数名-参数值” 形式存放的字典。 15 | 16 | ```python 17 | import oneflow as flow 18 | m = flow.nn.Linear(2,3) 19 | print(m.state_dict()) 20 | ``` 21 | 22 | 以上代码,将显式构造好的 Linear Module 对象 m 中的参数打印出来: 23 | 24 | ```text 25 | OrderedDict([('weight', 26 | tensor([[-0.4297, -0.3571], 27 | [ 0.6797, -0.5295], 28 | [ 0.4918, -0.3039]], dtype=oneflow.float32, requires_grad=True)), 29 | ('bias', 30 | tensor([ 0.0977, 0.1219, -0.5372], dtype=oneflow.float32, requires_grad=True))]) 31 | ``` 32 | 33 | 通过调用 `Module` 的 `load_state_dict` 方法,可以加载参数,如以下代码: 34 | 35 | ```python 36 | myparams = {"weight":flow.ones(3,2), "bias":flow.zeros(3)} 37 | m.load_state_dict(myparams) 38 | print(m.state_dict()) 39 | ``` 40 | 41 | 可以看到,我们自己构造的字典中的张量,已经被加载到 m Module 中: 42 | 
43 | ```text 44 | OrderedDict([('weight', 45 | tensor([[1., 1.], 46 | [1., 1.], 47 | [1., 1.]], dtype=oneflow.float32, requires_grad=True)), 48 | ('bias', 49 | tensor([0., 0., 0.], dtype=oneflow.float32, requires_grad=True))]) 50 | ``` 51 | 52 | ## 模型保存 53 | 54 | 我们可以使用 [oneflow.save](https://oneflow.readthedocs.io/en/v0.8.1/generated/oneflow.save.html) 方法保存模型。 55 | 56 | ```python 57 | flow.save(m.state_dict(), "./model") 58 | ``` 59 | 60 | 它的第一个参数的 Module 的参数,第二个是保存路径。以上代码,将 `m` Module 对象的参数,保存到了 `model` 文件下。 61 | 62 | ## 模型加载 63 | 64 | 使用 [oneflow.load](https://oneflow.readthedocs.io/en/v0.8.1/generated/oneflow.load.html) 可以将参数从指定的磁盘路径加载参数到内存,得到存有参数的字典。 65 | 66 | ```python 67 | params = flow.load("./model") 68 | ``` 69 | 70 | 然后,再借助上文介绍的 `load_state_dict` 方法,就可以将字典加载到模型中: 71 | 72 | ```python 73 | m2 = flow.nn.Linear(2,3) 74 | m2.load_state_dict(params) 75 | print(m2.state_dict()) 76 | ``` 77 | 78 | 以上代码,新构建了一个 Linear Module 对象 `m2`,并且将从上文保存得到的的参数加载到 `m2` 上。得到输出: 79 | 80 | ```text 81 | OrderedDict([('weight', tensor([[1., 1.], 82 | [1., 1.], 83 | [1., 1.]], dtype=oneflow.float32, requires_grad=True)), ('bias', tensor([0., 0., 0.], dtype=oneflow.float32, requires_grad=True))]) 84 | ``` 85 | 86 | ### 使用预训练模型进行预测 87 | 88 | OneFlow 是可以直接加载 PyTorch 的预训练模型,用于预测的。 89 | 只要模型的作者能够确保搭建的模型的结构、参数名与 PyTorch 模型对齐。 90 | 91 | 相关的例子可以在 [OneFlow Models 仓库的这个 README](https://github.com/Oneflow-Inc/models/blob/main/README_zh-CN.md) 查看。 92 | 93 | 以下命令行,可以体验如何使用预训练好的模型,进行预测: 94 | 95 | ```bash 96 | git clone https://github.com/Oneflow-Inc/models.git 97 | cd models/Vision/classification/image/shufflenetv2/ 98 | bash infer.sh 99 | ``` 100 | -------------------------------------------------------------------------------- /cn/docs/basics/imgs/compute_graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/oneflow-documentation/d4dc26f4de1aac396afb4d59b9de9f78f91871d5/cn/docs/basics/imgs/compute_graph.png 
-------------------------------------------------------------------------------- /cn/docs/basics/imgs/dataloader_item.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/oneflow-documentation/d4dc26f4de1aac396afb4d59b9de9f78f91871d5/cn/docs/basics/imgs/dataloader_item.png -------------------------------------------------------------------------------- /cn/docs/basics/imgs/dynamic_graph.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/oneflow-documentation/d4dc26f4de1aac396afb4d59b9de9f78f91871d5/cn/docs/basics/imgs/dynamic_graph.gif -------------------------------------------------------------------------------- /cn/docs/basics/imgs/fashionMNIST.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/oneflow-documentation/d4dc26f4de1aac396afb4d59b9de9f78f91871d5/cn/docs/basics/imgs/fashionMNIST.png -------------------------------------------------------------------------------- /cn/docs/basics/imgs/neural-network-layers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/oneflow-documentation/d4dc26f4de1aac396afb4d59b9de9f78f91871d5/cn/docs/basics/imgs/neural-network-layers.png -------------------------------------------------------------------------------- /cn/docs/basics/imgs/qq_group.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/oneflow-documentation/d4dc26f4de1aac396afb4d59b9de9f78f91871d5/cn/docs/basics/imgs/qq_group.png -------------------------------------------------------------------------------- /cn/docs/basics/imgs/static_graph.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Oneflow-Inc/oneflow-documentation/d4dc26f4de1aac396afb4d59b9de9f78f91871d5/cn/docs/basics/imgs/static_graph.png -------------------------------------------------------------------------------- /cn/docs/code/parallelism/ddp_train.py: -------------------------------------------------------------------------------- 1 | import oneflow as flow 2 | from oneflow.nn.parallel import DistributedDataParallel as ddp 3 | 4 | train_x = [ 5 | flow.tensor([[1, 2], [2, 3]], dtype=flow.float32), 6 | flow.tensor([[4, 6], [3, 1]], dtype=flow.float32), 7 | ] 8 | train_y = [ 9 | flow.tensor([[8], [13]], dtype=flow.float32), 10 | flow.tensor([[26], [9]], dtype=flow.float32), 11 | ] 12 | 13 | 14 | class Model(flow.nn.Module): 15 | def __init__(self): 16 | super().__init__() 17 | self.lr = 0.01 18 | self.iter_count = 500 19 | self.w = flow.nn.Parameter(flow.tensor([[0], [0]], dtype=flow.float32)) 20 | 21 | def forward(self, x): 22 | x = flow.matmul(x, self.w) 23 | return x 24 | 25 | 26 | m = Model().to("cuda") 27 | m = ddp(m) 28 | loss = flow.nn.MSELoss(reduction="sum") 29 | optimizer = flow.optim.SGD(m.parameters(), m.lr) 30 | 31 | for i in range(0, m.iter_count): 32 | rank = flow.env.get_rank() 33 | x = train_x[rank].to("cuda") 34 | y = train_y[rank].to("cuda") 35 | 36 | y_pred = m(x) 37 | l = loss(y_pred, y) 38 | if (i + 1) % 50 == 0: 39 | print(f"{i+1}/{m.iter_count} loss:{l}") 40 | 41 | optimizer.zero_grad() 42 | l.backward() 43 | optimizer.step() 44 | 45 | print(f"\nw:{m.w}") 46 | -------------------------------------------------------------------------------- /cn/docs/cookies/activation_checkpointing.md: -------------------------------------------------------------------------------- 1 | # Activation Checkpointing 2 | 3 | ## Activation Checkpointing 简介 4 | 5 | Activation Checkpointing 是陈天奇团队于 2016 年在论文 [Training Deep Nets with Sublinear Memory Cost](https://arxiv.org/abs/1604.06174) 中提出的一种亚线性内存优化技术,旨在减少训练过程中的中间激活(activation)带来的显存占用。Activation 
Checkpointing 的基本原理是 **以时间换空间** :经过计算图分析后,前向过程中一些暂时用不到的中间激活特征将被删除以减少显存占用,后向过程中需要时再借助额外的前向计算恢复它们。 6 | 7 | OneFlow 的静态图模块 `nn.Graph` 已经支持 Activation Checkpointing,本文将介绍如何在训练中开启它。 8 | 9 | ## Activation Checkpointing 使用示例 10 | 11 | 首先,我们定义一个简单的模型(由两部分组成)、损失函数及优化器,和以往的用法完全相同。 12 | 13 | ```python 14 | import oneflow as flow 15 | import oneflow.nn as nn 16 | 17 | DEVICE = "cuda" if flow.cuda.is_available() else "cpu" 18 | print("Using {} device".format(DEVICE)) 19 | 20 | model_part1 = nn.Sequential( 21 | nn.Linear(256, 128), 22 | nn.ReLU(), 23 | nn.Linear(128, 64), 24 | nn.ReLU() 25 | ) 26 | model_part1 = model_part1.to(DEVICE) 27 | model_part1.train() 28 | 29 | model_part2 = nn.Sequential( 30 | nn.Linear(64, 32), 31 | nn.ReLU(), 32 | nn.Linear(32, 10) 33 | ) 34 | model_part2 = model_part2.to(DEVICE) 35 | model_part2.train() 36 | 37 | loss_fn = nn.CrossEntropyLoss().to(DEVICE) 38 | optimizer = flow.optim.SGD([{'params': model_part1.parameters()}, 39 | {'params': model_part2.parameters()}], 40 | lr=1e-3) 41 | ``` 42 | 43 | 如果要开启 activation checkpointing,只需在 [nn.Graph](../basics/08_nn_graph.md) 模型中的 Eager 模型成员 (即 nn.Module 对象) 利用 `.to(nn.graph.GraphModule)` 方法转换为 `nn.graph.GraphModule` 对象,并在其上指定 `.activation_checkpointing = True`。此 API 详见:[activation_checkpointing](https://oneflow.readthedocs.io/en/v0.8.1/generated/oneflow.nn.graph.block_config.BlockConfig.activation_checkpointing.html)。对于每个打开 "activation checkpointing" 的 nn.Module,其输入 activation 将会被保留,而其它中间 activation 在反向传播过程中被使用时会被重新计算。 44 | 45 | ```python 46 | class CustomGraph(flow.nn.Graph): 47 | def __init__(self): 48 | super().__init__() 49 | self.model_part1 = model_part1 50 | self.model_part2 = model_part2 51 | # 在连续的两个 nn.Module 上开启 activation checkpointing 52 | self.model_part1.to(nn.graph.GraphModule).activation_checkpointing = True 53 | self.model_part2.to(nn.graph.GraphModule).activation_checkpointing = True 54 | self.loss_fn = loss_fn 55 | self.add_optimizer(optimizer) 56 | 57 | def build(self, x, y): 58 | 
y_pred = self.model_part2(self.model_part1(x)) 59 | loss = self.loss_fn(y_pred, y) 60 | loss.backward() 61 | return y_pred, loss 62 | ``` 63 | 64 | 然后,像以往那样开始训练等操作即可。 65 | 66 | ```python 67 | graph_model = CustomGraph() 68 | 69 | for _ in range(100): 70 | x = flow.randn(128, 256).to(DEVICE) 71 | y = flow.ones(128, 1, dtype=flow.int64).to(DEVICE) 72 | graph_model(x, y) 73 | # 其他代码... 74 | ``` 75 | 76 | ## 在 BERT 模型上的对比实验 77 | 78 | 为了验证 Activation Checkpointing 的实际效果,我们可以在 [BERT](https://arxiv.org/abs/1810.04805) 模型上进行对比实验。可以直接使用 [libai](https://github.com/Oneflow-Inc/libai) 库提供的 BERT 模型,只需通过在配置文件中将 `train.activation_checkpoint.enabled` 设置为 `True` 就可以开启 Activation Checkpointing。 79 | 80 | 首先,按照 [Prepare the Data and the Vocab](https://libai.readthedocs.io/en/latest/tutorials/get_started/quick_run.html#prepare-the-data-and-the-vocab) 准备好数据。为简单起见,我们使用单卡训练(实验环境使用的 GPU 为 NVIDIA GeForce RTX 3090,显存大小为 24268 MB): 81 | 82 | ```bash 83 | time python tools/train_net.py --config-file configs/bert_large_pretrain.py 84 | ``` 85 | 86 | 在命令最开头加上 `time` 命令来计量训练过程所耗费的时间。 87 | 88 | 实验结果如下: 89 | 90 | | 是否开启 Activation Checkpointing | 平均显存占用 | 训练完成所用时间 | 91 | |:-----------------------------:|:-------:|:---------:| 92 | | 否 | 9141 MB | 25 分 16 秒 | 93 | | 是 | 5978 MB | 33 分 36 秒 | 94 | 95 | 从上表可以看出,Activation Checkpointing 显著减少了训练时的显存占用。同时,训练所用时间由于需要额外的前向计算而有所增加。总体来说,当缺乏显存时,Activation Checkpointing 不失为一种很有效的解决办法。 96 | -------------------------------------------------------------------------------- /cn/docs/cookies/amp.md: -------------------------------------------------------------------------------- 1 | # 自动混合精度训练 2 | 3 | ## AMP 简介 4 | 5 | 当我们在训练深度学习模型时,通常情况下使用的是 32 位单精度浮点数 (FP32),而 **自动混合精度 (Automatic Mixed Precision, AMP)** 是一种允许在训练模型时同时使用 FP32 和 FP16 的技术。这样可以使得训练模型时的内存占用更少、计算更快,但由于 FP16 的数值范围比 FP32 小,因此更容易出现数值溢出的问题,同时可能存在一定误差。但大量实践证明,很多深度学习模型可以用这种技术来训练,并且没有精度损失。 6 | 7 | ## AMP 使用示例 8 | 9 | 首先,我们定义一个简单的模型、损失函数及优化器,和以往的用法完全相同。 10 | 11 | ```python 12 | import oneflow as flow 13 | 
import oneflow.nn as nn 14 | 15 | DEVICE = "cuda" if flow.cuda.is_available() else "cpu" 16 | print("Using {} device".format(DEVICE)) 17 | 18 | model = nn.Sequential( 19 | nn.Linear(256, 128), 20 | nn.ReLU(), 21 | nn.Linear(128, 10) 22 | ) 23 | model = model.to(DEVICE) 24 | model.train() 25 | 26 | loss_fn = nn.CrossEntropyLoss().to(DEVICE) 27 | optimizer = flow.optim.SGD(model.parameters(), lr=1e-3) 28 | ``` 29 | 30 | 如果要开启 AMP 模式,只需在 [nn.Graph](../basics/08_nn_graph.md) 模型中添加 `self.config.enable_amp(True)`,此 API 详见: [enable_amp](https://oneflow.readthedocs.io/en/v0.8.1/generated/oneflow.nn.graph.graph_config.GraphConfig.enable_amp.html)。 31 | 32 | ```python 33 | class CustomGraph(flow.nn.Graph): 34 | def __init__(self): 35 | super().__init__() 36 | self.model = model 37 | self.loss_fn = loss_fn 38 | self.add_optimizer(optimizer) 39 | self.config.enable_amp(True) # 开启 AMP 模式 40 | 41 | def build(self, x, y): 42 | y_pred = self.model(x) 43 | loss = self.loss_fn(y_pred, y) 44 | loss.backward() 45 | return y_pred 46 | ``` 47 | 48 | 然后,像以往那样开始训练等操作即可。 49 | 50 | ```python 51 | graph_model = CustomGraph() 52 | 53 | for _ in range(100): 54 | x = flow.randn(128, 256).to(DEVICE) 55 | y = flow.ones(128, 1, dtype=flow.int64).to(DEVICE) 56 | 57 | graph_model(x, y) 58 | ``` 59 | 60 | ## Gradient Scaling 61 | 62 | **Gradient Scaling (梯度缩放)** 是一种用于解决 FP16 易导致数值溢出问题的方法,其基本原理是在反向传播的过程中使用一个 scale factor 对损失和梯度进行缩放,以改变其数值的量级,从而尽可能缓解数值溢出问题。 63 | 64 | OneFlow 提供了 `GradScaler` 来在 AMP 模式下使用 Gradient Scaling,只需要在 nn.Graph 模型的 `__init__` 方法中实例化一个`GradScaler` 对象,然后通过 [set_grad_scaler](https://oneflow.readthedocs.io/en/v0.8.1/generated/oneflow.nn.Graph.set_grad_scaler.html) 接口进行指定即可,nn.Graph 将会自动管理 Gradient Scaling 的整个过程。以上文中的 `CustomGraph` 为例,我们需要在其 `__init__` 方法中添加: 65 | 66 | ```python 67 | grad_scaler = flow.amp.GradScaler( 68 | init_scale=2**12, 69 | growth_factor=2.0, 70 | backoff_factor=0.5, 71 | growth_interval=1000, 72 | ) 73 | self.set_grad_scaler(grad_scaler) 74 | ``` 75 | 76 | 
scale factor 的计算过程以及 GradScaler 的参数的含义如下: 77 | 78 | scale factor 的大小在迭代更新中动态估计(初始值由 `init_scale` 指定),为了尽可能减少数值下溢 (underflow),scale factor 应该更大;但如果太大,FP16 又容易发生数值上溢 (overflow),导致出现 inf 或 NaN。动态估计的过程就是在不出现 inf 或 NaN 的情况下,尽可能增大 scale factor。在每次迭代中,都会检查是否有 inf 或 NaN 的梯度出现: 79 | 80 | 1. 如果有:此次权重更新将被忽略,并且 scale factor 将会减小(乘上 `backoff_factor`) 81 | 82 | 2. 如果没有:权重正常更新,当连续多次迭代中(由 `growth_interval` 指定)没有出现 inf 或 NaN,则 scale factor 将会增大(乘上 `growth_factor`) 83 | -------------------------------------------------------------------------------- /cn/docs/cookies/imgs/Three_Stages_of_ZeRO-DP_Optimizations.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/oneflow-documentation/d4dc26f4de1aac396afb4d59b9de9f78f91871d5/cn/docs/cookies/imgs/Three_Stages_of_ZeRO-DP_Optimizations.jpg -------------------------------------------------------------------------------- /cn/docs/cookies/imgs/cat.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/oneflow-documentation/d4dc26f4de1aac396afb4d59b9de9f78f91871d5/cn/docs/cookies/imgs/cat.jpg -------------------------------------------------------------------------------- /cn/docs/cookies/imgs/hybrid-parallel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/oneflow-documentation/d4dc26f4de1aac396afb4d59b9de9f78f91871d5/cn/docs/cookies/imgs/hybrid-parallel.png -------------------------------------------------------------------------------- /cn/docs/cookies/imgs/oneflow-serving-demo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/oneflow-documentation/d4dc26f4de1aac396afb4d59b9de9f78f91871d5/cn/docs/cookies/imgs/oneflow-serving-demo.png -------------------------------------------------------------------------------- 
/cn/docs/cookies/imgs/triton-oneflow-backend.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/oneflow-documentation/d4dc26f4de1aac396afb4d59b9de9f78f91871d5/cn/docs/cookies/imgs/triton-oneflow-backend.png -------------------------------------------------------------------------------- /cn/docs/cookies/lr_scale.md: -------------------------------------------------------------------------------- 1 | # 如何分层设置学习率 2 | 3 | 在训练神经网络模型时,有时候需要为不同的网络层指定不同的学习率。例如,当我们在使用预训练的模型时,常常在预训练的主干网络模型上加入一些分支网络,这个时候我们希望在进行训练过程中,主干网络只进行微调,不需要过多改变参数,因此需要设置较小的学习率。而分支网络则需要快速地收敛,所以需要设置较大的学习率。这时设置统一的学习率很难满足要求,故需要对不同的网络层设置不同的学习率提升训练表现。 4 | 5 | 这篇文章以 MobileNet_v2 为例,展示如何在 Eager 和 Graph 模式下在不同层设置不同的学习率。 6 | 7 | ## Eager模式 8 | 9 | ### 基础实现 10 | 11 | 此处使用的例子,基于 [OneFlow 的 Eager 模式](../basics/08_nn_graph.md#oneflow-eager) 修改得来。 12 | 导入库、加载数据集、搭建网络等都不变: 13 | 14 | ```python 15 | import oneflow as flow 16 | import oneflow.nn as nn 17 | import flowvision 18 | import flowvision.transforms as transforms 19 | BATCH_SIZE = 64 20 | EPOCH_NUM = 1 21 | DEVICE = "cuda" if flow.cuda.is_available() else "cpu" 22 | print("Using {} device".format(DEVICE)) 23 | training_data = flowvision.datasets.CIFAR10( 24 | root="data", 25 | train=True, 26 | transform=transforms.ToTensor(), 27 | download=True, 28 | source_url="https://oneflow-public.oss-cn-beijing.aliyuncs.com/datasets/cifar/cifar-10-python.tar.gz", 29 | ) 30 | 31 | train_dataloader = flow.utils.data.DataLoader( 32 | training_data, BATCH_SIZE, shuffle=True 33 | ) 34 | model = flowvision.models.mobilenet_v2().to(DEVICE) 35 | model.classifer = nn.Sequential(nn.Dropout(0.2), nn.Linear(model.last_channel, 10)) 36 | model.train() 37 | loss_fn = nn.CrossEntropyLoss().to(DEVICE) 38 | ``` 39 | 40 | 然后,为了使网络的不同层使用不同的学习率,需要准备一个字典,网络参数对应 `params`,学习率对应 `lr` 。 41 | 42 | ```python 43 | param_groups = [ 44 | {'params':model.features.parameters(), 'lr':1e-3}, 45 | 
{'params':model.adaptive_avg_pool2d.parameters(), 'lr':1e-4}, 46 | {'params':model.classifier.parameters(), 'lr':1e-5}, 47 | ] 48 | optimizer = flow.optim.SGD(param_groups) 49 | ``` 50 | 51 | `param_groups` 是一个 `list`,每一项是一个字典,将不同的参数分组保存在不同的字典中,字典属性 `params` 指定了参数,`lr` 属性指定了学习率大小。优化器接收 `params_groups` 这个 `list` 后,会遍历这个 `list` 中的每一项。对其中的 `params` 使用指定的学习率 `lr` 进行更新。 52 | 53 | 接下来对模型进行训练 54 | 55 | ```python 56 | for t in range(EPOCH_NUM): 57 | print(f"Epoch {t+1}\n-------------------------------") 58 | size = len(train_dataloader.dataset) 59 | for batch, (x, y) in enumerate(train_dataloader): 60 | x = x.to(DEVICE) 61 | y = y.to(DEVICE) 62 | 63 | # Compute prediction error 64 | pred = model(x) 65 | loss = loss_fn(pred, y) 66 | 67 | # Backpropagation 68 | optimizer.zero_grad() 69 | loss.backward() 70 | optimizer.step() 71 | 72 | current = batch * BATCH_SIZE 73 | if batch % 5 == 0: 74 | print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]") 75 | ``` 76 | 77 | ### 自定义分层的学习率衰减策略 78 | 79 | 在 Eager 模式下,不同层设置不同的学习率实现很简单,我们只需要直接指定不同参数 `lr` 。然而,我们经常需要配合学习率衰减策略一起使用,这时,上面的方法不能满足要求。 80 | 81 | 不过,我们依然可以通过动态调整 `param_groups` 中各个字典的 `lr` 属性达到目的。 82 | 83 | 在之前代码的基础上,我们为每个字典新增一个属性 `lr_decay_scale` 作为衰减因子。 84 | 85 | ```python 86 | param_groups = [ 87 | {'params':model.features.parameters(), 'lr':1e-3, 'lr_scale':0.9}, 88 | {'params':model.adaptive_avg_pool2d.parameters(), 'lr':1e-4, 'lr_scale':0.8}, 89 | {'params':model.classifier.parameters(), 'lr':1e-5, 'lr_scale':0.7}, 90 | ] 91 | optimizer = flow.optim.SGD(param_groups) 92 | ``` 93 | 94 | 然后自定义一个学习率调整函数。它读取字典中 `lr_decay_scale` 属性,更新 `lr` 属性。 95 | 96 | ```python 97 | def adjust_learning_rate(optimizer): 98 | for param_group in optimizer.param_groups: 99 | param_group["lr"] *= param_group["lr_scale"] 100 | ``` 101 | 102 | 这样,在训练过程中,调用 `adjust_learning_rate`,就可以分层地、动态调整学习率。 103 | 104 | ```python 105 | for t in range(EPOCH_NUM): 106 | print(f"Epoch {t+1}\n-------------------------------") 107 | size = len(train_dataloader.dataset) 108 | 
for batch, (x, y) in enumerate(train_dataloader): 109 | x = x.to(DEVICE) 110 | y = y.to(DEVICE) 111 | 112 | # Compute prediction error 113 | pred = model(x) 114 | loss = loss_fn(pred, y) 115 | 116 | # Backpropagation 117 | optimizer.zero_grad() 118 | loss.backward() 119 | optimizer.step() 120 | current = batch * BATCH_SIZE 121 | if batch % 5 == 0: 122 | print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]") 123 | 124 | # Adjust the learning rate per 10 batches 125 | if batch % 10 == 0: 126 | adjust_learning_rate(optimizer) 127 | ``` 128 | 129 | ## Graph模式 130 | 131 | 在 Graph 模式下,同样的,我们导入必要的库,设置参数和设备,准备数据集。 132 | 133 | ```python 134 | import oneflow as flow 135 | import oneflow.nn as nn 136 | import flowvision 137 | import flowvision.transforms as transforms 138 | 139 | BATCH_SIZE = 64 140 | EPOCH_NUM = 1 141 | DEVICE = "cuda" if flow.cuda.is_available() else "cpu" 142 | print("Using {} device".format(DEVICE)) 143 | 144 | training_data = flowvision.datasets.CIFAR10( 145 | root="data", 146 | train=True, 147 | transform=transforms.ToTensor(), 148 | download=True, 149 | source_url="https://oneflow-public.oss-cn-beijing.aliyuncs.com/datasets/cifar/cifar-10-python.tar.gz", 150 | ) 151 | 152 | train_dataloader = flow.utils.data.DataLoader( 153 | training_data, BATCH_SIZE, shuffle=True, drop_last=True 154 | ) 155 | ``` 156 | 157 | 搭建模型并设置损失函数。 158 | 159 | ```python 160 | model = flowvision.models.mobilenet_v2().to(DEVICE) 161 | model.classifer = nn.Sequential(nn.Dropout(0.2), nn.Linear(model.last_channel, 10)) 162 | model.train() 163 | loss_fn = nn.CrossEntropyLoss().to(DEVICE) 164 | ``` 165 | 166 | 在设置优化器时,在 Eager 模式下,我们可以直接指定 `params_groups` 中的 `lr` 属性来设置学习率,而在 Graph 模型下,我们需要对不同的参数设置 `lr_scale` 属性来达到修改 `lr` 的目的。其中的 `lr_scale` 是 Graph 模式下内置的标准参数。 167 | 168 | ```python 169 | param_groups = [ 170 | {'params':model.features.parameters(), 'lr_scale':0.9}, 171 | {'params':model.adaptive_avg_pool2d.parameters(), 'lr_scale':0.8}, 172 | {'params':model.classifier.parameters(), 
'lr_scale':0.7}, 173 | ] 174 | optimizer = flow.optim.SGD(param_groups, lr=1e-3) 175 | ``` 176 | 177 | 一旦配置了 `lr_scale` 属性,OneFlow 会在静态图编译阶段检测到,并且在运行时使用`lr=lr*lr_scale` 来更新学习率。 178 | 179 | 接下来的使用同 [使用 Graph 做训练](../basics/08_nn_graph.md#graph_2) 中一样,即: 180 | 181 | ```python 182 | class GraphMobileNetV2(flow.nn.Graph): 183 | def __init__(self): 184 | super().__init__() 185 | self.model = model 186 | self.loss_fn = loss_fn 187 | self.add_optimizer(optimizer) 188 | 189 | def build(self, x, y): 190 | y_pred = self.model(x) 191 | loss = self.loss_fn(y_pred, y) 192 | loss.backward() 193 | return loss 194 | ``` 195 | 196 | 训练静态图模型。 197 | 198 | ```python 199 | graph_mobile_net_v2 = GraphMobileNetV2() 200 | 201 | for t in range(EPOCH_NUM): 202 | print(f"Epoch {t+1}\n-------------------------------") 203 | size = len(train_dataloader.dataset) 204 | for batch, (x, y) in enumerate(train_dataloader): 205 | x = x.to(DEVICE) 206 | y = y.to(DEVICE) 207 | loss = graph_mobile_net_v2(x, y) 208 | current = batch * BATCH_SIZE 209 | if batch % 5 == 0: 210 | print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]") 211 | ``` 212 | 213 | 至此,我们了解了在 Eager 模式和 Graph 模式下如何设置分层学习率。 214 | -------------------------------------------------------------------------------- /cn/docs/cookies/oneflow_torch.md: -------------------------------------------------------------------------------- 1 | # OneFlow 模拟 PyTorch 2 | 3 | OneFlow 的 API 与 PyTorch 对齐,用户可以很方便地将 PyTorch 模型迁移到 OneFlow。本文介绍三种方法,将使用 PyTorch 运行的代码迁移到 OneFlow。 4 | 5 | ## 1. import oneflow as torch 6 | 7 | 将原有的`import torch`的代码改为 8 | 9 | ```py 10 | import oneflow as torch 11 | ``` 12 | 13 | 就可以使用 OneFlow 训练原有模型;然而,这种方法需要手动修改所有 `import torch` 的文件,如果第三方库使用了 `torch`,还需要更改第三方库的源码。 14 | 15 | ## 2. 
使用命令行工具 16 | 17 | OneFlow 提供了一个命令行工具,在 OneFlow 的 Python Package 内模拟了 PyTorch 环境,并将对该模块的引用都转发到实际的 OneFlow 模块中。具体的用法如下 18 | 19 | 开启模拟 PyTorch 20 | 21 | ```shell 22 | eval $(oneflow-mock-torch) 23 | ``` 24 | 25 | 或 26 | 27 | ```shell 28 | eval $(python3 -m oneflow.mock_torch) 29 | ``` 30 | 31 | 为了便于调试,OneFlow 为该方法提供了两个参数: 32 | 33 | 1. lazy 参数,`lazy=True` 时,对不存在的接口会返回一个假对象而不立即报错。**建议将该参数设置为 True**,这样即便您 import 的第三方库中含有 OneFlow 暂时不存在的接口,只要没有实际使用到该接口,mock torch 也能正常工作。 34 | 35 | 2. verbose 参数,如果同时设置 `verbose=True`,会打印出有哪些假对象被访问或使用,便于调试。 36 | 用法如下 37 | 38 | 开启模拟 PyTorch,并配置 lazy 和 verbose 参数 39 | 40 | ```shell 41 | eval $(oneflow-mock-torch --lazy --verbose) 42 | ``` 43 | 44 | 或 45 | 46 | ```shell 47 | eval $(python3 -m oneflow.mock_torch --lazy --verbose) 48 | ``` 49 | 50 | 51 | 运行上述命令后,通过以下示例观察效果 52 | 53 | ```py 54 | import torch 55 | print(torch.__file__) 56 | import oneflow as flow 57 | x = torch.zeros(2, 3) 58 | print(isinstance(x, flow.Tensor)) 59 | ``` 60 | 61 | 关闭模拟 PyTorch 62 | 63 | ```shell 64 | eval $(oneflow-mock-torch disable) 65 | ``` 66 | 67 | 或 68 | 69 | ```shell 70 | eval $(python3 -m oneflow.mock_torch disable) 71 | ``` 72 | 73 | ## 3. 使用 OneFlow 的内置函数 74 | 75 | 我们提供了更细粒度的 mock 功能,用户可以自行控制某段代码是否启用 mock 功能。 76 | 如下的 `with` 语句中,导入的 PyTorch 模块实际上是 OneFlow 77 | 78 | ```py 79 | import oneflow.mock_torch as mock 80 | with mock.enable(): 81 | import torch 82 | print(torch.__file__) 83 | import oneflow as flow 84 | x = torch.zeros(2, 3) 85 | print(isinstance(x, flow.Tensor)) 86 | ``` 87 | 88 | 同样 OneFlow 为 `mock.enable()` 提供了便于调试的参数 lazy 和 verbose,可以这样设置 89 | 90 | `with mock.enable(lazy=True, verbose=True)` 91 | 92 | 93 | 当你需要使用真正的 torch 模块时,可以这样关闭 mock 功能 94 | 95 | ```py 96 | with mock.disable(): 97 | import torch 98 | print(torch.__file__) 99 | ``` 100 | 101 | `mock.enable` 和 `mock.disable` 也可以作为函数使用,例如,对于一段用户想要用 OneFlow 进行训练的模型,而该模型需要 PyTorch 来加载,可以这样使用 102 | 103 | ```py 104 | mock.enable() 105 | ... 
106 | with mock.disable():
OneFlow 模型中。 51 | 52 | 通过上述简单示例,我们可以发现将 PyTorch 存储的数据(无论是模型还是变量等等)转换成 OneFlow 的思路是 **使用 Numpy 作为二者的媒介**。只要确保 PyTorch 和 OneFlow 定义的模型是一致的,那么无论多么复杂的模型都可以通过上述方式转换。 53 | 54 | 55 | ## 拓展 56 | 57 | [flowvision](https://github.com/Oneflow-Inc/vision) 与 torchvision 相同,提供了许多预训练好的模型,同时 flowvision 各个模型能够做到与 torchvision 对齐。我们使用 flowvision,以经典的 AlexNet 为例,看看如何将 PyTorch 中 **复杂的预训练模型** 转换成 OneFlow 版本。转换代码如下所示: 58 | 59 | ```python 60 | import torchvision.models as models_torch 61 | import flowvision.models as models_flow 62 | 63 | alexnet_torch = models_torch.alexnet(pretrained=True) 64 | alexnet_flow = models_flow.alexnet() 65 | 66 | parameters = alexnet_torch.state_dict() 67 | for key, value in parameters.items(): 68 | val = value.detach().cpu().numpy() 69 | parameters[key] = val 70 | 71 | alexnet_flow.load_state_dict(parameters) 72 | ``` 73 | 74 | flowvision 也配备了预训练模型,设置 `pretrained=True` 即可: 75 | 76 | ```python 77 | alexnet_flow = models_flow.alexnet(pretrained=True) 78 | ``` 79 | 80 | 关于 flowvision 的详细使用,欢迎访问 [flowvision documentation](https://flowvision.readthedocs.io/en/latest/index.html) 。 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /cn/docs/cookies/transfer_learning.md: -------------------------------------------------------------------------------- 1 | # 计算机视觉迁移学习 2 | 3 | 在本教程中,我们将介绍迁移学习的基本原理,并展示一个在计算机视觉领域中的迁移学习的使用示例。 4 | 5 | ## 原理简介 6 | 7 | **迁移学习 (Transfer Learning)** 是一种将从源数据集学到的知识迁移到目标数据集的方法。 8 | 9 | 众所周知,有监督学习是一种相当常见的深度学习模型的训练方式,但它需要大量带标注的数据才能达到较好的效果,当我们想将某个模型应用于某个特定的任务上时,通常受制于成本而无法获得大量带标注的数据,如果直接在这样的小规模数据上进行训练,很容易造成过拟合。而迁移学习是解决这一问题的方法之一。 10 | 11 | 以计算机视觉领域中常见的图像分类任务为例,一般的图像分类模型可以分为两个部分:特征提取器(或称为骨干网络)和分类器(或称为输出层)。特征提取器一般是诸如卷积神经网络的多层网络,分类器一般是诸如全连接层的单层网络。由于不同分类任务的类别一般不同,分类器通常无法复用,而特征提取器通常可以复用,虽然源数据集中的物体可能与目标数据集大相径庭,甚至完全没有交集,但在大规模数据上预训练得到的模型可能具备提取更常规的图像特征(例如边缘、形状和纹理)的能力,从而有助于有效地识别目标数据集中的物体。 12 | 13 | 假设我们已有一个预训练模型,大致有三种使用方式: 14 | 15 | 1. 
**使用预训练模型的参数对特征提取器进行初始化,然后对整个模型进行训练。** 对于深度学习模型来说,参数初始化的方法对保持数值稳定性相当重要,不当的初始化方法可能会导致在训练时出现梯度爆炸或梯度消失的问题。如果使用预训练模型进行初始化,可以在很大程度上保证模型参数初始值的合理性,让模型“赢在起跑线上”。 16 | 17 | 2. **对整个模型进行训练,但对特征提取器使用较小的学习率,对分类器使用较大的学习率。** 预训练得到的特征提取器已经得到了充分的训练,所以只需要较小的学习率;而分类器的参数通常是随机初始化的,所以需要从头开始学习,因此需要较大的学习率。 18 | 19 | 3. **固定特征提取器的参数,只训练分类器。** 如果目标数据集的类别恰好是源数据集的子集,那么这样的方式一般会很有效且快速。 20 | 21 | 22 | ## 迁移学习示例 23 | 24 | 在本节中,我们将使用 ResNet-18 作为特征提取器在 [CIFAR-10 数据集](http://www.cs.toronto.edu/~kriz/cifar.html) 上进行图像分类任务。 25 | 26 | ResNet-18 的预训练模型(在 ImageNet 数据集上训练得到)和 CIFAR-10 数据集都可以通过 [FlowVision](https://github.com/Oneflow-Inc/vision) 方便地获取。 27 | 28 | 29 | 首先导入所需的依赖: 30 | 31 | ```python 32 | import oneflow as flow 33 | from oneflow import nn 34 | from oneflow.utils.data import DataLoader 35 | 36 | from flowvision.models import resnet18 37 | from flowvision.datasets import CIFAR10 38 | import flowvision.transforms as transforms 39 | ``` 40 | 41 | 定义 epoch, batch size, 以及使用的计算设备: 42 | ```python 43 | NUM_EPOCHS = 3 44 | BATCH_SIZE = 64 45 | DEVICE = 'cuda' if flow.cuda.is_available() else 'cpu' 46 | ``` 47 | 48 | ### 数据加载及预处理 49 | 50 | 定义 Dataset 和 DataLoader: 51 | 52 | ```python 53 | train_transform = transforms.Compose([ 54 | transforms.RandomHorizontalFlip(), 55 | transforms.RandomVerticalFlip(), 56 | transforms.Resize(224), 57 | transforms.ToTensor(), 58 | transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) 59 | ]) 60 | test_transform = transforms.Compose([ 61 | transforms.Resize(224), 62 | transforms.ToTensor(), 63 | transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) 64 | ]) 65 | 66 | train_dataset = CIFAR10(root='./data', train=True, transform=train_transform, download=True) 67 | test_dataset = CIFAR10(root='./data', train=False, transform=test_transform, download=True) 68 | 69 | train_data_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4) 70 | test_data_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, 
num_workers=4) 71 | ``` 72 | 73 | ### 定义模型 74 | 75 | ```python 76 | model = resnet18(pretrained=True) 77 | ``` 78 | 在这里,我们通过将 `pretrained` 参数设置为 `True` 来获取加载了预训练权重的 ResNet-18 模型。如果输出 `model.fc`,将会得到 "Linear(in_features=512, out_features=1000, bias=True)",可以看出此分类器有 1000 个输出神经元,对应于 ImageNet 的 1000 个类别。CIFAR-10 数据集是 10 个类别,因此我们需要替换掉这个全连接层分类器: 79 | 80 | ```python 81 | model.fc = nn.Linear(model.fc.in_features, 10) 82 | ``` 83 | 84 | 然后将模型加载到计算设备: 85 | ```python 86 | model = model.to(DEVICE) 87 | ``` 88 | 89 | ### 训练模型 90 | 91 | 定义训练函数: 92 | ```python 93 | def train_model(model, train_data_loader, test_data_loader, loss_func, optimizer): 94 | dataset_size = len(train_data_loader.dataset) 95 | model.train() 96 | for epoch in range(NUM_EPOCHS): 97 | for batch, (images, labels) in enumerate(train_data_loader): 98 | images, labels = images.to(DEVICE), labels.to(DEVICE) 99 | preds = model(images) 100 | loss = loss_func(preds, labels) 101 | optimizer.zero_grad() 102 | loss.backward() 103 | optimizer.step() 104 | 105 | if batch % 100 == 0: 106 | print(f'loss: {loss:>7f} [epoch: {epoch} {batch * BATCH_SIZE:>5d}/{dataset_size:>5d}]') 107 | 108 | evaluate(model, test_data_loader) 109 | ``` 110 | 111 | 定义评估函数,使用准确率作为评估指标: 112 | ```python 113 | def evaluate(model, data_loader): 114 | dataset_size = len(data_loader.dataset) 115 | model.eval() 116 | num_corrects = 0 117 | for images, labels in data_loader: 118 | images, labels = images.to(DEVICE), labels.to(DEVICE) 119 | preds = model(images) 120 | num_corrects += flow.sum(flow.argmax(preds, dim=1) == labels) 121 | 122 | print('Accuracy: ', num_corrects.item() / dataset_size) 123 | ``` 124 | 125 | 我们可以通过给优化器传入相应的需要优化的参数,来实现上文中提到的三种方式。 126 | 127 | 第 1 种方式,对整个模型进行训练: 128 | 129 | ```python 130 | optimizer = flow.optim.SGD(model.parameters(), lr=0.001, momentum=0.9, weight_decay=5e-4) 131 | ``` 132 | 133 | 第 2 种方式,对特征提取器使用较小的学习率,对分类器使用较大的学习率: 134 | 135 | ```python 136 | fc_params = list(map(id, model.fc.parameters())) 137 | 
backbone_params = filter(lambda p: id(p) not in fc_params, model.parameters()) 138 | optimizer = flow.optim.SGD([{'params': backbone_params, 'lr': 0.0001}, 139 | {'params': model.fc.parameters(), 'lr': 0.001}], 140 | momentum=0.9, weight_decay=5e-4) 141 | ``` 142 | 143 | 第 3 种方式,固定特征提取器的参数,只训练分类器: 144 | 145 | ```python 146 | optimizer = flow.optim.SGD(model.fc.parameters(), lr=0.001, momentum=0.9, weight_decay=5e-4) 147 | ``` 148 | 149 | 开始训练: 150 | 151 | ```python 152 | loss_func = nn.CrossEntropyLoss() 153 | train_model(model, train_data_loader, test_data_loader, loss_func, optimizer) 154 | ``` 155 | 156 | ### 结果对比 157 | 158 | 在使用迁移学习的情况下(这里使用第一种方式),模型在经过 3 个 epoch 的训练后在测试集上的准确率达到了 **0.9017**; 如果从头开始训练、不使用迁移学习,同样经过 3 个 epoch 的训练后,准确率仅为 **0.4957**。这表明迁移学习确实能起到显著的作用。 159 | -------------------------------------------------------------------------------- /cn/docs/cookies/zero.md: -------------------------------------------------------------------------------- 1 | # Zero Redundancy Optimizer (ZeRO) 2 | 3 | ## ZeRO 简介 4 | 5 | **Zero Redundancy Optimizer (ZeRO)** 是论文 [ZeRO: Memory Optimization Towards Training A Trillion Parameter Models](https://arxiv.org/pdf/1910.02054.pdf) 提出的一种用于减少数据并行策略下的显存占用的方法。 6 | 7 | 在普通的数据并行策略中,每个 GPU 都独立地维护一组完整的模型参数,计算与通信效率较高,但内存效率较差。这个问题在训练大型模型时尤为突出。ZeRO 由 ZeRO-DP 和 ZeRO-R 两部分组成,可以有效地减少显存消耗量,这意味着在同样的显存下,可以训练更大的模型,也意味着对于以往只能通过模型并行策略才能训练的大模型也有可能使用数据并行。 8 | 9 | 训练深度学习模型时的显存消耗可以分为两大部分: 10 | 11 | 1. **模型状态(model states)**。对于大型模型来说,大部分显存消耗都是被模型状态占用的,主要包括三部分:优化器的状态(Optimizer States)、梯度(Gradients)、参数(Parameters)。三者简称为 **OPG**。 12 | 2. **残余状态(residual states)**。包括激活函数、临时缓冲区和不可用的内存碎片。 13 | 14 | ZeRO-DP 可以分为三个阶段,通过对 OPG 状态进行分区而不是直接复制来消除内存冗余,每个 GPU 仅保存部分 OPG。具体来说,ZeRO-DP 有三个主要的优化阶段,分别对应 O、P 和 G。三个阶段逐级递加: 15 | 16 | - 阶段1,优化器状态分区(Pos):显存消耗减少 4 倍,通信量与数据并行相同。 17 | - 阶段2,添加梯度分区优化(Pos+g):显存消耗减少 8 倍,通信量与数据并行相同。 18 | - 阶段3,添加参数分区优化(Pos+g+p):模型占用的显存被平均分配到每个 GPU 中,显存消耗量与数据并行的并行度成线性反比关系,但通信量会有些许增加。 19 | 20 | 三个阶段的显存消耗的分布情况可以参见下图(来自 ZeRO 原论文 Figure 1): 21 | 22 |
23 | Three Stages of ZeRO-DP Optimizations 24 |
25 | 26 | ## ZeRO 使用示例 27 | 28 | 首先导入 OneFlow: 29 | ```python 30 | import oneflow as flow 31 | from oneflow import nn 32 | ``` 33 | 34 | ### 定义数据并行训练流程 35 | 36 | 我们定义一个数据并行策略下的训练流程,与 [通过设置 SBP 做数据并行训练](../parallelism/05_ddp.md#通过设置-sbp-做数据并行训练) 中所介绍的是类似的。 37 | 38 | !!! Note 39 | 只要存在数据并行组,都可以使用ZeRO来做内存优化。比如2D/3D并行中,只要存在数据并行组,都可以打开ZeRO。 40 | 41 | 定义之后要使用到 placement、SBP 等: 42 | ```python 43 | P = flow.placement("cuda", ranks=[0, 1]) 44 | B = flow.sbp.broadcast 45 | S0 = flow.sbp.split(0) 46 | DEVICE = "cuda" 47 | ``` 48 | 49 | 为了便于演示,我们定义一个简单的模型,然后广播到集群上: 50 | ```python 51 | model = nn.Sequential(nn.Linear(256, 128), 52 | nn.ReLU(), 53 | nn.Linear(128, 10)) 54 | model = model.to(DEVICE) 55 | model.train() 56 | model = model.to_global(placement=P, sbp=B) 57 | 58 | loss_fn = nn.CrossEntropyLoss().to(DEVICE) 59 | optimizer = flow.optim.SGD(model.parameters(), lr=1e-3) 60 | ``` 61 | 62 | ZeRO 是在 [nn.Graph](../basics/08_nn_graph.md) 的图编译器中实现的,因此需要将动态图模型转换为 nn.Graph: 63 | 64 | ```python 65 | class CustomGraph(flow.nn.Graph): 66 | def __init__(self): 67 | super().__init__() 68 | self.model = model 69 | self.loss_fn = loss_fn 70 | self.add_optimizer(optimizer) 71 | 72 | # TODO: 设置 ZeRO 73 | 74 | def build(self, x, y): 75 | preds = self.model(x) 76 | loss = self.loss_fn(preds, y) 77 | loss.backward() 78 | return preds 79 | ``` 80 | 81 | 定义训练流程: 82 | 83 | ```python 84 | graph_model = CustomGraph() 85 | 86 | for _ in range(100): 87 | x = flow.randn(128, 256).to(DEVICE) 88 | y = flow.ones(128, 1, dtype=flow.int64).to(DEVICE) 89 | global_x = x.to_global(placement=P, sbp=S0) 90 | global_y = y.to_global(placement=P, sbp=S0) 91 | 92 | graph_model(global_x, global_y) 93 | ``` 94 | 95 | 然后通过 [launch 模块](../parallelism/04_launch.md) 启动训练即可。 96 | 97 | ### 在 nn.Graph 中开启 ZeRO 98 | 99 | 通过 [config.enable_zero](https://oneflow.readthedocs.io/en/v0.8.1/generated/oneflow.nn.graph.graph_config.GraphConfig.enable_zero.html#oneflow.nn.graph.graph_config.GraphConfig.enable_zero) 接口可以开启ZeRO优化。 
100 | 101 | #### 开启阶段1优化 102 | ```python 103 | class CustomGraph(flow.nn.Graph): 104 | def __init__(self): 105 | super().__init__() 106 | ... 107 | # 设置 ZeRO 开启 stage 1 108 | self.config.enable_zero(True, stage=1) 109 | ... 110 | ``` 111 | 112 | !!! Note 113 | 当使用模型连续进行训练和和预测时:训练执行一次后,ZeRO 会自动把模型的 SBP 参数从 Broadcast 改变为 Split;在执行预测时,将会使用 Split 自动推理,无需配置 ZeRO。 114 | 115 | #### 开启阶段 2 优化 116 | ```python 117 | class CustomGraph(flow.nn.Graph): 118 | def __init__(self): 119 | super().__init__() 120 | ... 121 | # 设置 ZeRO 开启 stage 2 122 | self.config.enable_zero(True, stage=2) 123 | ... 124 | ``` 125 | 一般阶段 2 的优化的显存优化大、速度影响小,所以推荐使用阶段 2 优化。可以简单的开始阶段 2 优化: 126 | ```python 127 | class CustomGraph(flow.nn.Graph): 128 | def __init__(self): 129 | super().__init__() 130 | ... 131 | # 设置 ZeRO 开启 stage 2 132 | self.config.enable_zero() 133 | ... 134 | ``` 135 | 136 | #### 开启阶段 3 优化 137 | ```python 138 | class CustomGraph(flow.nn.Graph): 139 | def __init__(self): 140 | super().__init__() 141 | ... 142 | # 设置 ZeRO 开启 stage 3 143 | self.config.enable_zero(True, stage=3) 144 | ... 
145 | ``` 146 | 147 | 虽然开启第三阶段可以最大限度地减少显存消耗,但这会增加通信成本,执行速度会降低。 148 | -------------------------------------------------------------------------------- /cn/docs/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: OneFlow -- 全新一代深度学习框架 3 | template: templates/home.html 4 | --- 5 | -------------------------------------------------------------------------------- /cn/docs/javascripts/config.js: -------------------------------------------------------------------------------- 1 | window.MathJax = { 2 | tex: { 3 | inlineMath: [["\\(", "\\)"]], 4 | displayMath: [["\\[", "\\]"]], 5 | processEscapes: true, 6 | processEnvironments: true 7 | }, 8 | options: { 9 | ignoreHtmlClass: ".*|", 10 | processHtmlClass: "arithmatex" 11 | } 12 | }; 13 | 14 | document$.subscribe(() => { 15 | MathJax.typesetPromise() 16 | }) 17 | -------------------------------------------------------------------------------- /cn/docs/parallelism/01_introduction.md: -------------------------------------------------------------------------------- 1 | # 常见的分布式并行策略 2 | 3 | ## 为什么分布式训练越来越流行 4 | 近年来,深度学习被广泛应用到各个领域,包括计算机视觉、语言理解、语音识别、广告推荐等。在这些不同的领域中,一个共同的特点就是模型规模越来越大,比如 GPT-3 模型的参数量达到1750亿。即使用1024张 80 GB 的 A100,那么完整训练 GPT-3 的时长都需要1个月。 5 | 6 | 模型规模的扩大,对硬件(算力、内存)的发展提出要求。然而,因为 [内存墙](https://oneflow.org/a/share/jishuboke/75.html) 的存在,单一设备的算力及容量,受限于物理定律,持续提高芯片的集成越来越困难,难以跟上模型扩大的需求。 7 | 8 | 为了解决算力增速不足的问题,人们考虑用多节点集群进行分布式训练,以提升算力,分布式训练势在必行。 9 | 10 | ## 常见的并行策略 11 | 12 | 简单的机器堆叠并不一定会带来算力的增长。因为神经网络的训练并不是单纯的“把原来一个设备做的事情,现在分给多个设备各自做”,它不仅需要多个设备进行计算,还涉及到设备之间的数据传输,只有协调好集群中的计算与通信,才能做高效的分布式训练。 13 | 14 | 我们将以矩阵乘法的例子,解释数据并行、模型并行的区别。 15 | 16 | 先了解以下逻辑上的矩阵乘法例子: 17 | 18 | 假设神经网络中某一层是做矩阵乘法,其中的输入 $x$ 的形状为 $4\times5$,模型参数 $w$ 的形状为 $5\times8$,那么,矩阵乘法输出形状为 $4\times8$。示意图如下: 19 | 20 | ![matmul](./imgs/matmul_logical.png) 21 | 22 | 单机单卡的训练中,以上矩阵乘法,先计算得到 $out$,并将 $out$ 传递给下一层,并最终计算得到 $loss$,然后在反向传播过程中,得到 $\frac{\partial loss}{\partial w}$,用于更新 $w$。 23 | 24 | 分布式训练中,依据是切分 $x$ 还是 $w$ 
的不同,分为“数据并行”和“模型并行”策略。接下来,我们介绍常见的并行策略。 25 | 26 | ### 数据并行 27 | 所谓的数据并行,就是将数据 $x$ 进行切分,而每个设备上的模型 $w$ 是完整的、一致的。如下图所示,$x$ 被按照第0维度平均切分到2个设备上,两个设备上都有完整的 $w$。 28 | 29 | 这样,在两台设备上,分别得到的输出,都只是逻辑上输出的一半(形状为 $2\times8$),将两个设备上的输出拼接到一起,才能得到逻辑上完整的输出。 30 | 31 | ![Data Paralelism](./imgs/matmul_data_paralelism.png) 32 | 33 | 注意,因为数据被分发到了2个设备上,因此反向传播过程,各自设备上得到的 $\frac{\partial loss}{\partial w}$ 会不一样,如果直接使用各个设备上的梯度更新各自的模型,会造成2个设备上的 **模型不一致**,训练就失去了意义(到底用哪个模型好呢?)。 34 | 35 | 因此,数据并行策略下,在反向传播过程中,需要对各个设备上的梯度进行 [AllReduce](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/collectives.html#allreduce),以确保各个设备上的模型始终保持一致。 36 | 37 | 当数据集较大,模型较小时,由于反向过程中为同步梯度产生的通信代价较小,此时选择数据并行一般比较有优势,常见的视觉分类模型,如 ResNet50,比较适合采用数据并行。 38 | 39 | ### 模型并行 40 | 41 | 当神经网络非常巨大,数据并行同步梯度的代价就会很大,甚至网络可能巨大到无法存放到单一计算设备中,这时候,可以采用模型并行策略解决问题。 42 | 43 | 所谓的模型并行,就是每个设备上的数据是完整的、一致的,而模型 $w$ 被切分到了各个设备上,每个设备只拥有模型的一部分,所有计算设备上的模型拼在一起,才是完整的模型。 44 | 45 | 如下图所示,$w$ 被按照第1维度平均切分到2个设备上,两个设备上都有完整的 $x$。两个设备上的输出也需要通过拼接才能得到逻辑上的输出。 46 | 47 | ![Model Paralelism](./imgs/matmul_model_paralelism.png) 48 | 49 | 模型并行的好处是,省去了多个设备之间的梯度 AllReduce;但是,由于每个设备都需要完整的数据输入,因此,数据会在多个设备之间进行广播,产生通信代价。比如,上图中的最终得到的 $out~(4\times8)$ ,如果它作为下一层网络的输入,那么它就需要被广播发送到两个设备上。 50 | 51 | 语言模型,如 BERT,常采用模型并行。 52 | 53 | ### 流水并行 54 | 当神经网络过于巨大,无法在一个设备上存放时,除了上述的模型并行的策略外,还可以选择流水并行。 55 | 流水并行指将网络切为多个阶段,并分发到不同的计算设备上,各个计算设备之间以“接力”的方式完成训练。 56 | 57 | 如下图,展示了一个逻辑上的4层网络(`T1` 至 `T4`)是如何做流水并行的。 58 | 59 | 4层网络被切分到2个计算设备上,其中 `GPU0` 上进行 `T1` 与 `T2` 的运算,`GPU1` 上进行 `T3` 与 `T4` 的计算。 60 | 61 | `GPU0` 上完成前两层的计算后,它的输出被当作 `GPU1` 的输入,继续进行后两层的计算。 62 | 63 |
64 | Relay 65 |
66 | 67 | ### 混合并行 68 | 网络的训练中,也可以将多种并行策略混用,以 GPT-3 为例,以下是它训练时的设备并行方案: 69 | 70 | 它首先被分为 64 个阶段,进行流水并行。每个阶段都运行在 6 台 DGX-A100 主机上。在6台主机之间,进行的是数据并行训练;每台主机有 8 张 GPU 显卡,同一台机器上的8张 GPU 显卡之间是进行模型并行训练。 71 | 72 | ![gpt-3](./imgs/gpt3-overview.png) 73 | 74 | 并行策略的选择影响着训练效率,框架对并行训练的接口支持程度,决定了算法工程师的开发效率。OneFlow 针对分布式训练所做的系统级设计和创新,为用户轻松上手分布式训练做足了铺垫。我们将在本专题的其它文章中看到相关示例。 -------------------------------------------------------------------------------- /cn/docs/parallelism/02_sbp.md: -------------------------------------------------------------------------------- 1 | # 集群的全局视角 2 | 3 | OneFlow 提出了 **全局视角(Global View)** 的概念,用于简化分布式训练。简单而言,在 OneFlow 的全局视角下,集群被抽象为一台“超级计算设备”。 4 | 5 | 用户不用关心集群中计算、通信的细节,只需关心逻辑上的数据与计算,依然像单机单卡那样思考、编程,就能进行分布式训练。 6 | 7 | ![global view](./imgs/consistent-view.png) 8 | 9 | OneFlow 的全局视角,依赖几个重要概念:Placement、SBP 与 SBP Signature。 10 | 11 | ## Placement 12 | 13 | OneFlow 全局视角下的 Tensor 有 `placement` 属性,通过 `placement` 属性可以指定该 Tensor 存放在哪个物理设备上。 14 | 15 | OneFlow 会自动为集群中的计算设备编号。比如,如果集群中有 4 台主机,每台主机上有 8 张显卡,一共 32 张显卡,那么 OneFlow 会将这 32 张显卡自动编号为 0~31。 16 | 17 | 如果想将 Tensor 放置在第 0 台机器的前 4 张显卡上,只需要配置:`placement("cuda", [0, 1, 2, 3])`。 18 | 19 | 如果想将 Tensor 放置在第 0 台机器的后 4 张显卡上,只需要配置:`placement("cuda", [4, 5, 6, 7])`。 20 | 21 | `placement` 使得 OneFlow 很容易支持流水并行,我们将在本专题的其它文章中看到与 `placement` 有关的实际例子。 22 | 23 | ## SBP 24 | 25 | SBP 是 OneFlow 发明的概念,描述了“超级计算设备”全局视角下的数据与集群中真实的物理设备上的数据的映射关系,它由 `split`, `broadcast`, `partial` 的首字母组合而成。 26 | 27 | 详细而言: 28 | 29 | - `split` 表示物理设备上的 Tensor,是将全局视角的 Tensor 切分得到的。切分时,需要指定切分的维度。物理设备上的 Tensor ,经过拼接,可以还原得到全局视角的 Tensor 。 30 | - `broadcast` 表示全局视角下的 Tensor,会复制并广播到所有的物理设备上。 31 | - `partial` 表示全局视角下的 Tensor 与物理设备上的 Tensor 的 **形状相同**,但是物理设备上的值,只是全局视角下 Tensor 的 **一部分**。以 `partial sum` 为例,如果我们将集群中所有设备的张量按位置相加,那么就可以还原得到全局视角的 Tensor。除了 `sum` 外,`min`、`max` 等操作也适用于 `partial`。 32 | 33 | 下图中分别展示了 SBP 的情况,分别是 `split(0)`、`split(1)`、`broadcast` 和 `partial sum`。 34 | 35 | ![SBP Example](./imgs/sbp-example.png) 36 | 37 | 在创建 Global Tensor 时,可以指定 Tensor 的 
SBP,实际的代码例子将在下一篇文章 [Global Tensor](./03_consistent_tensor.md) 中看到。 38 | 39 | ## SBP Signature 40 | 41 | SBP 描述了全局视角下的数据与物理设备上的数据的映射关系,当进行分布式训练时,OneFlow 根据数据的 SBP 属性,将数据分发到各个物理设备,进行计算,并输出结果。 42 | 43 | 对于一个孤立的 Tensor,我们可以随意设置它的 SBP 属性。 44 | 但是,对于一个有输入、输出数据的算子,我们却不可以随意设置它的输入、输出的 SBP 属性。这是因为随意设置一个算子输入输出的 SBP 属性,可能不符合全局视角下算子的运算法则。 45 | 46 | 让我们以矩阵乘法为例讨论这个问题。看看在有2个设备的分布式系统中,矩阵乘法的输入、输出的 SBP 要如何组合才合法,如何组合不合法。 47 | 48 | 假设全局视角下要,一个形状为 $(m, k)$ 的矩阵 $A$ 与形状为 $(k, n)$ 的矩阵 $B$ 相乘得到 $Y$,$Y$ 的形状必然为 $(m, n)$。 49 | 50 | 依据矩阵乘法的规律,我们可以将矩阵 $A$ 按第0维进行切分,切分为形状分别为 $(m_0, k)$、$(m_1, k)$ 的两个矩阵:$A_0$ 和 $A_1$,然后在2个设备上分别计算: 51 | 52 | 设备一: 53 | 54 | $$ 55 | \begin{matrix} 56 | A_0 \times B = Y_0 57 | \\ 58 | (m_0, k) (k, n) (m_0, n) 59 | \end{matrix} 60 | $$ 61 | 62 | 设备二: 63 | 64 | $$ 65 | \begin{matrix} 66 | A_1 \times B = Y_1 67 | \\ 68 | (m_1, k) (k, n) (m_1, n) 69 | \end{matrix} 70 | $$ 71 | 72 | 我们容易得到物理设备上的 $A_0$、$A_1$ 与全局视角 $A$ 的关系,以及 $Y_0$、$Y_1$ 与全局视角数据 $Y$ 的关系: 73 | 74 | $$ 75 | \begin{matrix} 76 | A &= concat&(A_0 ,& A_1) \\ 77 | (m,k) & & (m_0, k) & (m_1, k) 78 | \end{matrix} 79 | $$ 80 | 81 | $$ 82 | \begin{matrix} 83 | Y &= concat&(Y_0 ,& Y_1) \\ 84 | (m,n) & & (m_0, n) & (m_1, n) 85 | \end{matrix} 86 | $$ 87 | 88 | > 注意:以上的 `concat` 表示拼接操作。 89 | 90 | 可见,按照以上的方式,将全局视角的数据分发到各个物理设备上,是能够完成运算,并且最终得到全局视角上的正确结果的。以上较长的篇幅,若 **使用 SBP 来描述,会变得异常简单** : 91 | 92 | $A$ 为 `split(0)`, $B$ 为 `broadcast`,运算结果 $Y$ 为 `split(0)`。 93 | 94 | 可见,对于矩阵乘法而言,其输入输出的 SBP,按以上方式组合,是合法的。对于矩阵乘法而言,**合法的 SBP 组合不止一种**,比如还可以是: 95 | 96 | $A$ 为 `broadcast`, $B$ 为 `split(1)`,运算结果 $Y$ 为 `split(1)`。 97 | 98 | 或者: 99 | 100 | $A$ 为 `split(1)`, $B$ 为 `split(0)`,运算结果 $Y$ 为 `partial sum`。 101 | 102 | 虽然展示了多个合法的 SBP 组合,但是并不是任意的 SBP 组合都是合法的,比如对于矩阵乘法,如果 $A$、$B$ 均为 `split(0)`,那么: 103 | 104 | $$ 105 | \begin{matrix} 106 | A &= concat&(A_0 ,& A_1) \\ 107 | (m,k) & & (m_0, k) & (m_1, k) 108 | \end{matrix} 109 | $$ 110 | 111 | $$ 112 | \begin{matrix} 113 | B &= concat&(B_0 ,& B_1) \\ 114 | (k,n) & & (k_0, n) & (k_1, n) 115 | 
\end{matrix} 116 | $$ 117 | 118 | 那么在物理设备上,因为 $A_0$ 与 $B_0$ 的形状,并不满足矩阵乘法的要求,也就无法在物理设备上完成矩阵乘法。我们可以说, $A$ 为 `split(0)`, $B$ 为 `split(0)` 的 SBP 组合是不合法的。 119 | 120 | 我们将上文出现的,对于某个算子,其输入输出的一个 **特定的、合法的 SBP 组合**,称为这个算子的一个 **SBP Signature**。 121 | 122 | ## SBP Signature 自动推导 123 | 124 | 有了 SBP Signature 的概念后,我们可能会提出几个问题: 125 | 126 | - 用户是否要知道算子的所有 SBP Signature,才能用 OneFlow 做好分布式训练? 127 | - 作为算法工程师,用户是否要为每层网络都设置输入的 SBP? 128 | 129 | 对于前一个问题,用户当然不需要知晓算子所有的 SBP Signature。罗列某个算子所有可能的 SBP Signature 的工作,是 **算子作者** 的责任。算子作者根据算子的运算法则,在开发算子时,就已经罗列并预设好该算子所有可能的 SBP Signature。 130 | 131 | 这顺便就解答了第二个问题:因为有预设好的 SBP Signature,所以,某一层算子只要有输入的 SBP,OneFlow 就可以根据 SBP Signature 推导出该层算子输出的 SBP。而上游算子的输出,又是下游算子的输入,这样,就确定了下游算子输入的 SBP,然后又可以根据 SBP Signature 确定更下游输出的 SBP…… 132 | 这样不断推导、传播。因此通常情况下,用户是不需要为每层网络都设置输入的 SBP。而只有最初输入层,或者需要强制指定某层的 SBP 时,才需要显式指定。 133 | 134 | 用户还可能会有新的问题: 135 | 136 | - 一个算子的合法 SBP Signature 常常有多个,OneFlow 运行时到底会选择哪一个呢,它是依据什么做出选择的? 137 | 138 | 对于这个问题,需要了解 OneFlow 的 **SBP Signature 自动推导** 机制。所谓的 SBP Signature 自动推导,指的是:在给定所有算子的所有合法的 SBP Signature 的前提下,OneFlow 有一套算法,会基于传输代价为每种合法的 SBP Signature 进行打分,并选择传输代价最小的那个 SBP Signature。这样使得系统的吞吐效率最高。 139 | 140 | ### Boxing 机制 141 | 142 | 严格地说,OneFlow 的 Boxing 机制对于用户其实是透明的,用户使用 OneFlow 做分布式训练时,不用知晓它也感知不到它。 143 | 144 | 但是,鉴于某些深入思考的用户,可能了解 SBP Signature 自动推导后,会自然提出以下问题: 145 | 146 | - 如果 OneFlow 自动选择的 SBP Signature,上一层算子的输出与下一层算子的输入的 SBP 属性不匹配时,那怎么办呢? 
147 | 148 | 举个具体例子,比如以下代码中,上一层算子 `matmul` 的输出 SBP 本来是 `split(0)`,但是下一层算子 `matmul` 的输入,被转成了 `broadcast`。此时,上一层的输出与下一层的输入,它们的 SBP 其实就不一致了。 149 | 150 | ```python 151 | import oneflow as flow 152 | 153 | P0 = flow.placement("cuda", ranks=[0, 1]) 154 | P1 = flow.placement("cuda", ranks=[2, 3]) 155 | a0_sbp = flow.sbp.split(0) 156 | b0_sbp = flow.sbp.broadcast 157 | y0_sbp = flow.sbp.broadcast 158 | b1_sbp = flow.sbp.split(1) 159 | 160 | A0 = flow.randn(4, 5, placement=P0, sbp=a0_sbp) 161 | B0 = flow.randn(5, 8, placement=P0, sbp=b0_sbp) 162 | Y0 = flow.matmul(A0, B0) 163 | 164 | Y0 = Y0.to_global(placement=P1, sbp=y0_sbp) 165 | B1 = flow.randn(8, 6, placement=P1, sbp=b1_sbp) 166 | Y2 = flow.matmul(Y0, B1) 167 | ``` 168 | 169 | 这种情况下,OneFlow 其实会检测到这种不一致,并且在上游的输出和下游的输入间插入一个算子,做相关的转换工作。这类自动加入做转换的算子,就称为 **Boxing 算子**。 170 | 171 | 以上代码的逻辑图和物理执行图的对应关系如下: 172 | 173 | ![](./imgs/sbp_translation.png) 174 | 175 | ## 总结 176 | 177 | `placement` 与 `SBP`、`SBP Signature` 是 OneFlow 分布式全局视角的重要保证,OneFlow 的全局视角使得 OneFlow 的分布式训练与单机单卡一样简单。 178 | 179 | 通常情况下,用户只需要在起始网络层设置 `SBP`,由此可以省略传统分布式训练中手写通信操作的麻烦。更值得一提的是,除了本文介绍的 SBP Signature 自动推导机制外,OneFlow 团队正在研发一种寻求全局最优解的自动并行方法,正在内测,等它上线后,用户可以不做任何 SBP 配置就得到很好的分布式训练效果,敬请期待。 180 | 181 | 在下一篇 [Global Tensor](./03_consistent_tensor.md) 中,我们将看到全局视角的编程例子。 182 | -------------------------------------------------------------------------------- /cn/docs/parallelism/04_2d-sbp.md: -------------------------------------------------------------------------------- 1 | # 2D SBP 2 | 3 | 阅读 [集群的全局视角](./02_sbp.md) 和 [Global Tensor](./03_consistent_tensor.md) 之后,相信你已经掌握了 SBP 和 SBP Signature 的基本概念,并且能够上手相关的编程任务。实际上,以上资料中涉及的都是 **1D SBP**。 4 | 5 | 本文将在读者掌握 1D SBP 的基础上,介绍 2D SBP,它能够更灵活地应对更复杂的分布式训练场景。 6 | 7 | ## 2D 设备阵列 8 | 9 | 我们已经熟悉 1D SBP 的 placement 配置,在 1D SBP 的场景下,通过 [oneflow.placement](https://oneflow.readthedocs.io/en/v0.8.1/tensor_attributes.html?highlight=oneflow.placement#oneflow-placement) 接口配置集群,比如使用集群中的第 0~3 号 GPU 显卡: 10 | 11 | ```python 12 | >>> placement1 = 
flow.placement("cuda", ranks=[0, 1, 2, 3]) 13 | ``` 14 | 15 | 以上的 `"cuda"` 指定了设备类型,`ranks=[0, 1, 2, 3]` 指定了集群中的计算设备。其实,`ranks` 不仅可以是一维的int list,还可以是多维的int数组: 16 | 17 | ```python 18 | placement2 = flow.placement("cuda", ranks=[[0, 1], [2, 3]]) 19 | ``` 20 | 21 | 当 `ranks` 是 `ranks=[0, 1, 2, 3]` 这种一维list的形式时,集群中的所有设备组成了一个 1D 设备向量,这也是 1D SBP 名称的由来。 22 | 23 | 当 `ranks` 是多维数组的形式时,集群中的设备被分组为一个多维的设备阵列。`ranks=[[0, 1], [2, 3]]` 表示集群中的四个计算设备被划分为了 $2 \times 2$ 的设备阵列。 24 | 25 | ## 2D SBP 26 | 27 | 我们已经知道,构造 Global Tensor 时,需要同时指定 `placement` 与 `SBP`。当 `placement` 中的集群是 2 维的设备阵列时;SBP 也必须与之对应,是一个长度为 2 的 `tuple`,这个`tuple`中的第 0 个、第 1 个 元素,分别描述了 Global Tensor 张量在设备阵列第 0 维、第 1 维的分布。 28 | 29 | 比如,以下代码,配置了 $2 \times 2$ 的设备阵列,并且设置 2D SBP 为 `(broadcast, split(0))`。 30 | 31 | ```python 32 | >>> a = flow.Tensor([[1,2],[3,4]]) 33 | >>> placement = flow.placement("cuda", ranks=[[0, 1], [2, 3]]) 34 | >>> sbp = (flow.sbp.broadcast, flow.sbp.split(0)) 35 | >>> a_to_global = a.to_global(placement=placement, sbp=sbp) 36 | ``` 37 | 38 | 它意味着,逻辑上的数据,在整个设备阵列上,在第 0 维度(“竖着看”)做 `broadcast`;在第 1 维度(“横着看”)做 `split(0)`。 39 | 40 | 我们通过下图做阐述: 41 | 42 | ![](./imgs/2d-sbp.png) 43 | 44 | 此图的最左边是全局视角的数据,最右边是设备阵列上各个设备的数据。可以看到,从第 0 维的角度看,它们都是 `broadcast` 的关系: 45 | 46 | - (group0, device0) 与 (group1, device0) 中数据一致,互为 `broadcast` 关系 47 | - (group0, device1) 与 (group1, device1) 中数据一致,互为 `broadcast` 关系 48 | 49 | 而从第 1 维的角度看,它们都是 `split(0)` 的关系: 50 | 51 | - (group0, device0) 与 (group0, device1) 互为 `split(0)` 关系 52 | - (group1, device0) 与 (group1, device1) 互为 `split(0)` 关系 53 | 54 | 直接理解逻辑数据和最终的设备阵列中的物理数据对应关系可能有一定难度,大家在思考 2D SBP 时,可以假想一个中间状态(上图中灰色部分),以 `(broadcast, split(0))` 为例: 55 | 56 | - 原始逻辑张量,先经过 `broadcast`,广播到 2 个 group 上,得到中间的状态 57 | - 在中间状态的基础上,继续在各自的 group 上,做 `split(0)`,得到最终设备阵列中各个物理张量的状态 58 | 59 | ## 2D SBP Signature 60 | 61 | 类似 1D SBP 有 SBP Signature 的概念,算子也有 2D SBP Signature,在掌握了 1D SBP 及其 Signature 概念的基础上,2D SBP Signature 非常简单,只需要遵循一条原则: 62 | 63 | - 在各自的维度上独立推导即可 64 | 65 | 我们以矩阵乘法为例,先回顾 1D SBP 
的情况,假定有 $x \times w = y$ 可以有以下的 SBP Signature: 66 | 67 | $$ broadcast \times split(1) = split(1) $$ 68 | 69 | 以及 70 | 71 | $$ split(0) \times broadcast = split(0) $$ 72 | 73 | 现在,假定我们给 $x$ 设置了 2D SBP 为:$(broadcast, split(0))$, 给 $w$ 设置 2D SBP 为 $(split(1), broadcast)$,那么,在 2D SBP 的背景下, $x \times w = y$ 运算,得到 $y$ 的 SBP 属性为 $(split(1), split(0))$。 74 | 75 | 也就是说,以下几个 2D SBP,构成矩阵乘法的 2D SBP Signature: 76 | 77 | $$ (broadcast, split(0)) \times (split(1), broadcast) = (split(1), split(0)) $$ 78 | 79 | 80 | ## 2D SBP 使用示例 81 | 82 | 在本节中,我们将通过一个简单的例子演示如何使用 2D SBP 进行分布式训练。同上文中的例子,假设有一个 $2 \times 2$ 的设备阵列,鉴于读者可能目前并没有多个 GPU 设备,我们将使用 **CPU** 来模拟 $2 \times 2$ 设备阵列的情形,对输入张量采用上文图中 `(broadcast, split(0))` 的并行策略。 83 | 84 | 85 | 首先,导入依赖: 86 | ```python 87 | import oneflow as flow 88 | import oneflow.nn as nn 89 | ``` 90 | 91 | 然后,定义要使用到的 placement 和 sbp: 92 | ```python 93 | PLACEMENT = flow.placement("cpu", [[0, 1], [2, 3]]) 94 | BROADCAST = (flow.sbp.broadcast, flow.sbp.broadcast) 95 | BS0 = (flow.sbp.broadcast, flow.sbp.split(0)) 96 | ``` 97 | `PLACEMENT` 的 `ranks` 参数是一个二维 list,代表将集群中的设备划分成 $2 \times 2$ 的设备阵列。如前文所述,SBP 需要与其对应,指定为长度为 2 的 tuple。其中,`BROADCAST` 表示在设备阵列的第 0 维和第 1 维都进行广播,`BS0` 的含义与前文的描述相同。 98 | 99 | 100 | 假设我们有以下模型: 101 | ```python 102 | model = nn.Sequential(nn.Linear(8, 4), 103 | nn.ReLU(), 104 | nn.Linear(4, 2)) 105 | ``` 106 | 将模型在集群上广播: 107 | ```python 108 | model = model.to_global(placement=PLACEMENT, sbp=BROADCAST) 109 | ``` 110 | 111 | 然后构造数据并进行前向推理: 112 | ```python 113 | x = flow.randn(1, 2, 8) 114 | global_x = x.to_global(placement=PLACEMENT, sbp=BS0) 115 | pred = model(global_x) 116 | ``` 117 | 在这里,我们创建了一个形状为 `(1, 2, 8)` 的 local tensor,然后通过 [Tensor.to_global](https://oneflow.readthedocs.io/en/v0.8.1/generated/oneflow.Tensor.to_global.html) 方法获取对应的 global tensor,最后将其输入到模型中进行推理。 118 | 119 | 通过 [Tensor.to_local](https://oneflow.readthedocs.io/en/v0.8.1/generated/oneflow.Tensor.to_local.html) 方法获取当前物理设备上的 local tensor 后,我们可以通过输出其形状和值来验证数据是否被正确处理: 120 | ```python 
121 | local_x = global_x.to_local() 122 | print(f'{local_x.device}, {local_x.shape}, \n{local_x}') 123 | ``` 124 | 输出结果为: 125 | ```text 126 | cpu:2, oneflow.Size([1, 2, 8]), 127 | tensor([[[ 0.6068, 0.1986, -0.6363, -0.5572, -0.2388, 1.1607, -0.7186, 1.2161], 128 | [-0.1632, -1.5293, -0.6637, -1.0219, 0.1464, 1.1574, -0.0811, -1.6568]]], dtype=oneflow.float32) 129 | cpu:3, oneflow.Size([1, 2, 8]), 130 | tensor([[[-0.7676, 0.4519, -0.8810, 0.5648, 1.5428, 0.5752, 0.2466, -0.7708], 131 | [-1.2131, 1.4590, 0.2749, 0.8824, -0.8286, 0.9989, 0.5599, -0.5099]]], dtype=oneflow.float32) 132 | cpu:1, oneflow.Size([1, 2, 8]), 133 | tensor([[[-0.7676, 0.4519, -0.8810, 0.5648, 1.5428, 0.5752, 0.2466, -0.7708], 134 | [-1.2131, 1.4590, 0.2749, 0.8824, -0.8286, 0.9989, 0.5599, -0.5099]]], dtype=oneflow.float32) 135 | cpu:0, oneflow.Size([1, 2, 8]), 136 | tensor([[[ 0.6068, 0.1986, -0.6363, -0.5572, -0.2388, 1.1607, -0.7186, 1.2161], 137 | [-0.1632, -1.5293, -0.6637, -1.0219, 0.1464, 1.1574, -0.0811, -1.6568]]], dtype=oneflow.float32) 138 | ``` 139 | 通过比较这些不同“设备”上 local tensor 可以看到,符合上文图中描述的状态,证明数据已被正确分布到各个设备上。 140 | 141 | 142 | 需要注意的是,不能直接通过 `python xxx.py` 的方式执行上述代码,而需要通过 `oneflow.distributed.launch` 启动。此模块可以方便地启动分布式训练,在终端中执行下列命令 (假设上述代码已经保存至当前目录中的名为 "2d_sbp.py" 的文件中): 143 | ```bash 144 | python3 -m oneflow.distributed.launch --nproc_per_node=4 2d_sbp.py 145 | ``` 146 | 在此,通过将参数 `nproc_per_node` 指定为 4 来创建 4 个进程,模拟共有 4 个 GPU 的情形。关于此模块的详细用法,请参见:[用 launch 模块启动分布式训练](./04_launch.md)。 147 | 148 | 149 | 完整代码如下: 150 | ??? 
code 151 | ```python 152 | PLACEMENT = flow.placement("cpu", [[0, 1], [2, 3]]) 153 | BROADCAST = (flow.sbp.broadcast, flow.sbp.broadcast) 154 | BS0 = (flow.sbp.broadcast, flow.sbp.split(0)) 155 | 156 | model = nn.Sequential(nn.Linear(8, 4), 157 | nn.ReLU(), 158 | nn.Linear(4, 2)) 159 | model = model.to_global(placement=PLACEMENT, sbp=BROADCAST) 160 | 161 | x = flow.randn(1, 2, 8) 162 | global_x = x.to_global(placement=PLACEMENT, sbp=BS0) 163 | pred = model(global_x) 164 | 165 | local_x = global_x.to_local() 166 | print(f'{local_x.device}, {local_x.shape}, \n{local_x}') 167 | ``` 168 | -------------------------------------------------------------------------------- /cn/docs/parallelism/04_launch.md: -------------------------------------------------------------------------------- 1 | # 用 launch 模块启动分布式训练 2 | 3 | OneFlow 提供了 `oneflow.distributed.launch` 模块帮助用户更方便地启动分布式训练。 4 | 5 | 用户可以借助以下的形式,启动分布式训练: 6 | 7 | ```shell 8 | python3 -m oneflow.distributed.launch [启动选项] 训练脚本.py 9 | ``` 10 | 11 | 比如,启动单机两卡的训练: 12 | 13 | ```shell 14 | python3 -m oneflow.distributed.launch --nproc_per_node 2 ./script.py 15 | ``` 16 | 17 | 再比如,启动两台机器,每台机器有两张显卡的训练。 18 | 19 | 在0号机器上运行: 20 | 21 | ```shell 22 | python3 -m oneflow.distributed.launch \ 23 | --nnodes=2 \ 24 | --node_rank=0 \ 25 | --nproc_per_node=2 \ 26 | --master_addr="192.168.1.1" \ 27 | --master_port=7788 \ 28 | script.py 29 | ``` 30 | 31 | 在1号机器上运行: 32 | 33 | ```shell 34 | python3 -m oneflow.distributed.launch \ 35 | --nnodes=2 \ 36 | --node_rank=1 \ 37 | --nproc_per_node=2 \ 38 | --master_addr="192.168.1.1" \ 39 | --master_port=7788 \ 40 | script.py 41 | ``` 42 | 43 | ## 常见选项说明 44 | 45 | 通过 `python3 -m oneflow.distributed.launch -h` 可以查看 `launch` 模块的选项说明,以下是部分常见选项。 46 | 47 | - `--nnodes`:机器的数目(number of nodes) 48 | - `--node_rank`: 机器的编号,从0开始 49 | - `--nproc_per_node`:每台机器上要启动的进程数目(number of processes per node),推荐与 GPU 数目一致 50 | - `--logdir`:子进程日志的相对存储路径 51 | 52 | ## launch 模块与并行策略的关系 53 | 54 | 注意 `oneflow.distributed.launch` 
的主要作用,是待用户完成分布式程序后,让用户可以更方便地启动分布式训练。它省去了配置集群中[环境变量](./03_consistent_tensor.md#_5) 的繁琐。 55 | 56 | 但是 `oneflow.distributed.launch` **并不决定** [并行策略](./01_introduction.md),并行策略是由设置数据、模型的分发方式、在物理设备上的放置位置决定的。 57 | 58 | OneFlow 提供的 [全局视角](./02_sbp.md) 和 [Global Tensor](./03_consistent_tensor.md) 可以灵活地配置并行策略。并且针对数据并行,OneFlow 提供了 [DistributedDataParallel](./05_ddp.md) 模块,可以在极少修改代码的前提下,将单机单卡的脚本改为数据并行的脚本。 59 | -------------------------------------------------------------------------------- /cn/docs/parallelism/05_ddp.md: -------------------------------------------------------------------------------- 1 | # 数据并行训练 2 | 3 | 在 [常见的分布式并行策略](./01_introduction.md) 一文中介绍了数据并行的特点。 4 | 在 OneFlow 中,提供了两种做数据并行的方式。 5 | 6 | 一种是使用 OneFlow 的原生的 SBP 概念,通过设置 global 张量,进行数据并行训练,这也是用 OneFlow 做数据并行训练的 **推荐方式** 。 7 | 8 | 此外,为了方便从 PyTorch 迁移到 OneFlow 的用户,OneFlow 提供了与 `torch.nn.parallel.DistributedDataParallel` 对齐一致的接口 [oneflow.nn.parallel.DistributedDataParallel](https://oneflow.readthedocs.io/en/v0.8.1/generated/oneflow.nn.parallel.DistributedDataParallel.html),它也能让用户方便地从单机训练脚本,扩展为数据并行训练。 9 | 10 | ## 通过设置 SBP 做数据并行训练 11 | 12 | 以下代码,是通过配置设置 global 张量,完成数据并行训练。点击以下 “Code” 查看详细代码。 13 | 14 | ??? 
code 15 | ```python 16 | import oneflow as flow 17 | import oneflow.nn as nn 18 | import flowvision 19 | import flowvision.transforms as transforms 20 | 21 | BATCH_SIZE=64 22 | EPOCH_NUM = 1 23 | 24 | PLACEMENT = flow.placement("cuda", [0,1]) 25 | S0 = flow.sbp.split(0) 26 | B = flow.sbp.broadcast 27 | 28 | DEVICE = "cuda" if flow.cuda.is_available() else "cpu" 29 | print("Using {} device".format(DEVICE)) 30 | 31 | training_data = flowvision.datasets.CIFAR10( 32 | root="data", 33 | train=True, 34 | transform=transforms.ToTensor(), 35 | download=True, 36 | ) 37 | 38 | train_dataloader = flow.utils.data.DataLoader( 39 | training_data, BATCH_SIZE, shuffle=True 40 | ) 41 | 42 | model = flowvision.models.mobilenet_v2().to(DEVICE) 43 | model.classifer = nn.Sequential(nn.Dropout(0.2), nn.Linear(model.last_channel, 10)) 44 | model = model.to_global(placement=PLACEMENT, sbp=B) 45 | 46 | loss_fn = nn.CrossEntropyLoss().to(DEVICE) 47 | optimizer = flow.optim.SGD(model.parameters(), lr=1e-3) 48 | 49 | for t in range(EPOCH_NUM): 50 | print(f"Epoch {t+1}\n-------------------------------") 51 | size = len(train_dataloader.dataset) 52 | for batch, (x, y) in enumerate(train_dataloader): 53 | x = x.to_global(placement=PLACEMENT, sbp=S0) 54 | y = y.to_global(placement=PLACEMENT, sbp=S0) 55 | 56 | # Compute prediction error 57 | pred = model(x) 58 | loss = loss_fn(pred, y) 59 | 60 | # Backpropagation 61 | optimizer.zero_grad() 62 | loss.backward() 63 | optimizer.step() 64 | 65 | current = batch * BATCH_SIZE 66 | if batch % 5 == 0: 67 | print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]") 68 | ``` 69 | 70 | 可以发现,这个脚本的与单机单卡的训练脚本几乎是一样的。少数的区别在于几行与 global tensor 有关的配置代码外,它们是: 71 | 72 | - 设置 placement,让训练放置在集群第 0号、1号 GPU 上: 73 | 74 | ```python 75 | PLACEMENT = flow.placement("cuda", [0,1]) 76 | ``` 77 | 78 | - 模型在集群上做广播 79 | 80 | ```python 81 | model = model.to_global(placement=PLACEMENT, sbp=B) 82 | ``` 83 | 84 | - 数据在集群上按 `split(0)` 做切分: 85 | 86 | ```python 87 | x = 
x.to_global(placement=PLACEMENT, sbp=S0) 88 | y = y.to_global(placement=PLACEMENT, sbp=S0) 89 | ``` 90 | 91 | 这样,按照 [常见的分布式并行策略](./01_introduction.md) 中的介绍,我们就通过对数据进行 `split(0)` 切分,对模型进行广播,进行了分布式数据并行训练。 92 | 93 | ## 使用 DistributedDataParallel 做数据并行训练 94 | 95 | 可以用以下命令快速体验 `oneflow.nn.parallel.DistributedDataParallel` 做数据并行: 96 | 97 | ```shell 98 | wget https://docs.oneflow.org/master/code/parallelism/ddp_train.py #下载脚本 99 | python3 -m oneflow.distributed.launch --nproc_per_node 2 ./ddp_train.py #数据并行训练 100 | ``` 101 | 102 | 输出: 103 | 104 | ```text 105 | 50/500 loss:0.004111831542104483 106 | 50/500 loss:0.00025336415274068713 107 | ... 108 | 500/500 loss:6.184563972055912e-11 109 | 500/500 loss:4.547473508864641e-12 110 | 111 | w:tensor([[2.0000], 112 | [3.0000]], device='cuda:1', dtype=oneflow.float32, 113 | grad_fn=) 114 | 115 | w:tensor([[2.0000], 116 | [3.0000]], device='cuda:0', dtype=oneflow.float32, 117 | grad_fn=) 118 | ``` 119 | 120 | 点击以下 “Code” 可以展开以上运行脚本的代码。 121 | 122 | ??? code 123 | ```python 124 | import oneflow as flow 125 | from oneflow.nn.parallel import DistributedDataParallel as ddp 126 | 127 | train_x = [ 128 | flow.tensor([[1, 2], [2, 3]], dtype=flow.float32), 129 | flow.tensor([[4, 6], [3, 1]], dtype=flow.float32), 130 | ] 131 | train_y = [ 132 | flow.tensor([[8], [13]], dtype=flow.float32), 133 | flow.tensor([[26], [9]], dtype=flow.float32), 134 | ] 135 | 136 | 137 | class Model(flow.nn.Module): 138 | def __init__(self): 139 | super().__init__() 140 | self.lr = 0.01 141 | self.iter_count = 500 142 | self.w = flow.nn.Parameter(flow.tensor([[0], [0]], dtype=flow.float32)) 143 | 144 | def forward(self, x): 145 | x = flow.matmul(x, self.w) 146 | return x 147 | 148 | 149 | m = Model().to("cuda") 150 | m = ddp(m) 151 | loss = flow.nn.MSELoss(reduction="sum") 152 | optimizer = flow.optim.SGD(m.parameters(), m.lr) 153 | 154 | for i in range(0, m.iter_count): 155 | rank = flow.env.get_rank() 156 | x = train_x[rank].to("cuda") 157 | y = 
train_y[rank].to("cuda") 158 | 159 | y_pred = m(x) 160 | l = loss(y_pred, y) 161 | if (i + 1) % 50 == 0: 162 | print(f"{i+1}/{m.iter_count} loss:{l}") 163 | 164 | optimizer.zero_grad() 165 | l.backward() 166 | optimizer.step() 167 | 168 | print(f"\nw:{m.w}") 169 | ``` 170 | 171 | 可以发现,它与单机单卡脚本的不同只有2个: 172 | 173 | - 使用 `DistributedDataParallel` 处理一下 module 对象(`m = ddp(m)`) 174 | - 使用 [get_rank](https://oneflow.readthedocs.io/en/v0.8.1/generated/oneflow.env.get_rank.html) 获取当前设备编号,并针对设备分发数据 175 | 176 | 然后使用 `launch` 模块启动脚本,把剩下的一切都交给 OneFlow,让分布式训练,像单机单卡训练一样简单: 177 | 178 | ```shell 179 | python3 -m oneflow.distributed.launch --nproc_per_node 2 ./ddp_train.py 180 | ``` 181 | 182 | ### DistributedSampler 183 | 184 | 本文为了简化问题,突出 `DistributedDataParallel`,因此使用的数据是手工分发的。在实际应用中,可以直接使用 [DistributedSampler](https://oneflow.readthedocs.io/en/v0.8.1/utils.data.html?highlight=DistributedSampler#oneflow.utils.data.distributed.DistributedSampler) 配合数据并行使用。 185 | 186 | `DistributedSampler` 会在每个进程中实例化 Dataloader,每个 Dataloader 实例会加载完整数据的一部分,自动完成数据的分发。 187 | -------------------------------------------------------------------------------- /cn/docs/parallelism/06_pipeline.md: -------------------------------------------------------------------------------- 1 | # 流水并行训练 2 | 3 | 在 [常见的分布式并行策略](./01_introduction.md) 一文中介绍了流水并行的特点。 4 | 5 | 在 OneFlow 的 [全局视角](./03_consistent_tensor.md) 下,通过简单的设置 Tensor 的 `placement` 属性,就可以实现流水并行。 6 | 7 | 以下代码是简单的示范,它将 [快速上手](../basics/01_quickstart.md) 中的网络,以流水并行的方式运行。前几层的 Module `nn.Flatten`、`nn.Linear(28*28, 512)`、`nn.ReLU()` 在 GPU0 上运行;剩余的网络部分在 GPU1 上运行。 8 | 9 | ??? 
code 10 | ```python 11 | import oneflow as flow 12 | 13 | BATCH_SIZE = 16 14 | BROADCAST = [flow.sbp.broadcast] 15 | P0 = flow.placement("cuda", ranks=[0]) 16 | P1 = flow.placement("cuda", ranks=[1]) 17 | 18 | class Stage0Module(flow.nn.Module): 19 | def __init__(self): 20 | super().__init__() 21 | self.flatten = flow.nn.Flatten() 22 | self.linear0 = flow.nn.Linear(28*28, 512) 23 | self.relu0 = flow.nn.ReLU() 24 | 25 | def forward(self, x): 26 | out = self.flatten(x) 27 | out = self.linear0(out) 28 | out = self.relu0(out) 29 | return out 30 | 31 | class Stage1Module(flow.nn.Module): 32 | def __init__(self): 33 | super().__init__() 34 | self.linear1 = flow.nn.Linear(512, 512) 35 | self.relu1 = flow.nn.ReLU() 36 | self.linear2 = flow.nn.Linear(512, 10) 37 | self.relu2 = flow.nn.ReLU() 38 | 39 | def forward(self, x): 40 | out = self.linear1(x) 41 | out = self.relu1(out) 42 | out = self.linear2(out) 43 | out = self.relu2(out) 44 | return out 45 | 46 | class PipelineModule(flow.nn.Module): 47 | def __init__(self): 48 | super().__init__() 49 | self.m_stage0 = Stage0Module() 50 | self.m_stage1 = Stage1Module() 51 | 52 | self.m_stage0.to_global(placement=P0, sbp=BROADCAST) 53 | self.m_stage1.to_global(placement=P1, sbp=BROADCAST) 54 | 55 | def forward(self, x): 56 | out_stage0 = self.m_stage0(x) 57 | in_stage1 = out_stage0.to_global(placement=P1, sbp=BROADCAST) 58 | out_stage1 = self.m_stage1(in_stage1) 59 | return out_stage1 60 | 61 | module_pipeline = PipelineModule() 62 | sgd = flow.optim.SGD(module_pipeline.parameters(), lr=0.001) 63 | 64 | class PipelineGraph(flow.nn.Graph): 65 | def __init__(self): 66 | super().__init__() 67 | self.module_pipeline = module_pipeline 68 | self.module_pipeline.m_stage0.to(flow.nn.graph.GraphModule).set_stage(stage_id=0, placement=P0) 69 | self.module_pipeline.m_stage1.to(flow.nn.graph.GraphModule).set_stage(stage_id=1, placement=P1) 70 | self.loss_fn = flow.nn.CrossEntropyLoss() 71 | self.config.set_gradient_accumulation_steps(2) 72 | 
self.add_optimizer(sgd) 73 | 74 | def build(self, x, y): 75 | out = self.module_pipeline(x) 76 | loss = self.loss_fn(out, y) 77 | loss.backward() 78 | return loss 79 | 80 | graph_pipeline = PipelineGraph() 81 | 82 | x = flow.randn(BATCH_SIZE, 1, 28, 28) 83 | x = x.to_global(P0, BROADCAST) 84 | y = flow.randint(0, 10, (BATCH_SIZE,)) 85 | y = y.to_global(P1, BROADCAST) 86 | 87 | for i in range(20): 88 | loss = graph_pipeline(x, y) 89 | print(loss.to_local()) 90 | ``` 91 | 92 | 以上代码,保存为脚本(如 `pipeline.py`)后,使用 [launch 模块启动分布式训练](./04_launch.md): 93 | 94 | ```shell 95 | python3 -m oneflow.distributed.launch --nproc_per_node 2 ./pipeline.py 96 | ``` 97 | 98 | ## 代码解读 99 | ### 设置 placement 与 sbp 100 | 101 | 将需要使用的 placement 与 sbp 设置提前准备好: 102 | 103 | ```python 104 | BROADCAST = [flow.sbp.broadcast] 105 | P0 = flow.placement("cuda", ranks=[0]) 106 | P1 = flow.placement("cuda", ranks=[1]) 107 | ``` 108 | 109 | `P0`、`P1` 分别代表集群的第 0 个 GPU 和第 1 个 GPU。 110 | 111 | 通过调用 [nn.Module.to_global](https://oneflow.readthedocs.io/en/v0.8.1/generated/oneflow.nn.Module.html) 或 [Tensor.to_global](https://oneflow.readthedocs.io/en/v0.8.1/generated/oneflow.Tensor.to_global.html) 就可以将模型或张量分配到指定的计算设备上运行,将一个网络拆分为多个流水阶段(stage)。 112 | 113 | 在此我们定义了一个 `PipelineModule` 专门设置各阶段的流水。 114 | 115 | ```python 116 | class PipelineModule(flow.nn.Module): 117 | def __init__(self): 118 | #... 
119 | 120 | self.m_stage0.to_global(placement=P0, sbp=BROADCAST) 121 | self.m_stage1.to_global(placement=P1, sbp=BROADCAST) 122 | 123 | def forward(self, x): 124 | out_stage0 = self.m_stage0(x) 125 | in_stage1 = out_stage0.to_global(placement=P1, sbp=BROADCAST) 126 | out_stage1 = self.m_stage1(in_stage1) 127 | return out_stage1 128 | ``` 129 | 130 | ### Local Tensor 与 Global Tensor 的转换 131 | 132 | 示例中使用了随机生成的数据作为输入。 133 | 134 | ```python 135 | x = flow.randn(BATCH_SIZE, 1, 28, 28) 136 | x = x.to_global(P0, BROADCAST) 137 | ``` 138 | 139 | 当使用 `launch` 模块启动训练时,因为命令行参数为 `--nproc_per_node 2`,`launch` 会启动 2 个进程。两个进程均执行脚本中的代码。 140 | 141 | 其中 `x = flow.randn(BATCH_SIZE, 1, 28, 28)` 返回的是 Local Tensor(只在本进程中有效的本地数据),当运行 `x = x.to_global(P0, BROADCAST)` 时,OneFlow 会自动将所有进程中的 Local Tensor 整合为 Global Tensor。 142 | 143 | 在实际训练中,各个计算设备也可以加载属于各自的本地数据,然后通过 `to_global` 实现 Local Tensor 到 Global Tensor 的转化。 144 | 145 | ### Stage ID 及梯度累积设置 146 | 147 | 当 [nn.Module](https://oneflow.readthedocs.io/en/master/nn.html?highlight=nn.Module#nn-module) 的一个实例化网络层作为属性加入继承于 [nn.Graph](https://oneflow.readthedocs.io/en/master/graph.html) 的新类时,内部会将该网络层用 `ProxyModule` 进行包装,利用方法 `.to` 得到一个 `nn.graph.GraphModule` 的实例化对象,然后使用方法 `set_stage` 设置流水线 Stage ID 和 Stage 对应的 Placement,Stage ID 从 0 开始编号,依次加 1。 148 | 149 | 调用 [config.set_gradient_accumulation_steps](https://oneflow.readthedocs.io/en/v0.8.1/generated/oneflow.nn.graph.graph_config.GraphConfig.set_gradient_accumulation_steps.html#oneflow.nn.graph.graph_config.GraphConfig.set_gradient_accumulation_steps) 方法,设置梯度累积的步长。 150 | OneFlow 通过这两项配置,获取实现流水并行中的 micro batch 技术所需的信息。 151 | 152 | ```python 153 | self.module_pipeline.m_stage0.to(nn.graph.GraphModule).set_stage(stage_id=0, placement=P0) 154 | self.module_pipeline.m_stage1.to(nn.graph.GraphModule).set_stage(stage_id=1, placement=P1) 155 | self.config.set_gradient_accumulation_steps(2) 156 | ``` 157 | -------------------------------------------------------------------------------- 
/cn/docs/parallelism/imgs/2d-sbp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/oneflow-documentation/d4dc26f4de1aac396afb4d59b9de9f78f91871d5/cn/docs/parallelism/imgs/2d-sbp.png -------------------------------------------------------------------------------- /cn/docs/parallelism/imgs/boxing_s2b.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/oneflow-documentation/d4dc26f4de1aac396afb4d59b9de9f78f91871d5/cn/docs/parallelism/imgs/boxing_s2b.png -------------------------------------------------------------------------------- /cn/docs/parallelism/imgs/consistent-view.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/oneflow-documentation/d4dc26f4de1aac396afb4d59b9de9f78f91871d5/cn/docs/parallelism/imgs/consistent-view.png -------------------------------------------------------------------------------- /cn/docs/parallelism/imgs/gpt3-overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/oneflow-documentation/d4dc26f4de1aac396afb4d59b9de9f78f91871d5/cn/docs/parallelism/imgs/gpt3-overview.png -------------------------------------------------------------------------------- /cn/docs/parallelism/imgs/matmul_data_paralelism.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/oneflow-documentation/d4dc26f4de1aac396afb4d59b9de9f78f91871d5/cn/docs/parallelism/imgs/matmul_data_paralelism.png -------------------------------------------------------------------------------- /cn/docs/parallelism/imgs/matmul_logical.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Oneflow-Inc/oneflow-documentation/d4dc26f4de1aac396afb4d59b9de9f78f91871d5/cn/docs/parallelism/imgs/matmul_logical.png -------------------------------------------------------------------------------- /cn/docs/parallelism/imgs/matmul_model_paralelism.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/oneflow-documentation/d4dc26f4de1aac396afb4d59b9de9f78f91871d5/cn/docs/parallelism/imgs/matmul_model_paralelism.png -------------------------------------------------------------------------------- /cn/docs/parallelism/imgs/multi-matmul.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/oneflow-documentation/d4dc26f4de1aac396afb4d59b9de9f78f91871d5/cn/docs/parallelism/imgs/multi-matmul.png -------------------------------------------------------------------------------- /cn/docs/parallelism/imgs/realy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/oneflow-documentation/d4dc26f4de1aac396afb4d59b9de9f78f91871d5/cn/docs/parallelism/imgs/realy.png -------------------------------------------------------------------------------- /cn/docs/parallelism/imgs/sbp-example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/oneflow-documentation/d4dc26f4de1aac396afb4d59b9de9f78f91871d5/cn/docs/parallelism/imgs/sbp-example.png -------------------------------------------------------------------------------- /cn/docs/parallelism/imgs/sbp_translation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/oneflow-documentation/d4dc26f4de1aac396afb4d59b9de9f78f91871d5/cn/docs/parallelism/imgs/sbp_translation.png 
-------------------------------------------------------------------------------- /cn/mkdocs.yml: -------------------------------------------------------------------------------- 1 | 2 | site_name: OneFlow 3 | site_url: https://docs.oneflow.org/ 4 | site_description: >- 5 | OneFlow -- 极致性能的分布式机器学习框架 6 | # Repository 7 | repo_name: OneFlow 8 | repo_url: https://github.com/OneFlow-Inc/oneflow-documentation 9 | edit_uri: "blob/master/cn/docs/" 10 | 11 | # Copyright 12 | copyright: Copyright © 2017 - 2021 OneFlow 13 | 14 | #website/xxx.html style 15 | use_directory_urls: false 16 | 17 | # Configuration 18 | theme: 19 | name: material 20 | custom_dir: 'overrides/' 21 | 22 | # 404 page 23 | static_templates: 24 | - 404.html 25 | 26 | # Don't include MkDocs' JavaScript 27 | include_search_page: false 28 | search_index_only: true 29 | 30 | # Default values, taken from mkdocs_theme.yml 31 | language: zh 32 | features: 33 | - navigation.tabs 34 | - navigation.top 35 | #- instant 36 | palette: 37 | scheme: default 38 | primary: indigo 39 | accent: indigo 40 | font: 41 | text: Roboto 42 | code: Roboto Mono 43 | icon: 44 | logo: logo 45 | favicon: assets/favicon.png 46 | 47 | # Plugins 48 | plugins: 49 | - search 50 | - redirects: 51 | redirect_maps: 52 | 'parallelism/03_global_tensor.md': 'parallelism/03_consistent_tensor.md' 53 | 54 | extra: 55 | version: 56 | provider: mike 57 | alternate: 58 | #Switch to English 59 | - name: English 60 | link: https://docs.oneflow.org/en 61 | lang: en 62 | 63 | # Switch to Chinese 64 | - name: 中文 65 | link: https://docs.oneflow.org 66 | lang: zh 67 | 68 | #disqus: livere_indeed 69 | 70 | # Extensions 71 | markdown_extensions: 72 | - markdown.extensions.admonition 73 | - markdown.extensions.attr_list 74 | - markdown.extensions.codehilite: 75 | guess_lang: false 76 | - markdown.extensions.def_list 77 | - markdown.extensions.footnotes 78 | - markdown.extensions.meta 79 | - markdown.extensions.toc: 80 | permalink: true 81 | - 
pymdownx.arithmatex: 82 | generic: true 83 | - pymdownx.betterem: 84 | smart_enable: all 85 | - pymdownx.caret 86 | - pymdownx.critic 87 | - pymdownx.details 88 | - pymdownx.emoji: 89 | emoji_index: !!python/name:materialx.emoji.twemoji 90 | emoji_generator: !!python/name:materialx.emoji.to_svg 91 | # - pymdownx.highlight: 92 | # linenums_style: pymdownx-inline 93 | - pymdownx.inlinehilite 94 | - pymdownx.keys 95 | - pymdownx.magiclink: 96 | repo_url_shorthand: true 97 | user: squidfunk 98 | repo: mkdocs-material 99 | - pymdownx.mark 100 | - pymdownx.smartsymbols 101 | - pymdownx.snippets: 102 | check_paths: true 103 | - pymdownx.superfences 104 | - pymdownx.tabbed 105 | - pymdownx.tasklist: 106 | custom_checkbox: true 107 | - pymdownx.tilde 108 | 109 | extra_javascript: 110 | - javascripts/config.js 111 | - https://polyfill.io/v3/polyfill.min.js?features=es6 112 | - https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js 113 | 114 | nav: 115 | - 首页: index.md 116 | 117 | - 基础专题: 118 | - 快速上手: basics/01_quickstart.md 119 | - Tensor: basics/02_tensor.md 120 | - Dataset 与 DataLoader: basics/03_dataset_dataloader.md 121 | - 搭建神经网络: basics/04_build_network.md 122 | - Autograd: basics/05_autograd.md 123 | - 反向传播与 optimizer: basics/06_optimization.md 124 | - 模型的加载与保存: basics/07_model_load_save.md 125 | - 静态图模块 nn.Graph: basics/08_nn_graph.md 126 | 127 | - 分布式训练: 128 | - 常见的分布式并行策略: parallelism/01_introduction.md 129 | - 集群的全局视角: parallelism/02_sbp.md 130 | - Global Tensor: parallelism/03_consistent_tensor.md 131 | - 2D SBP: parallelism/04_2d-sbp.md 132 | - 用 launch 模块启动分布式训练: parallelism/04_launch.md 133 | - 数据并行训练: parallelism/05_ddp.md 134 | - 流水并行训练: parallelism/06_pipeline.md 135 | 136 | - 实践指南: 137 | - 使用 Global Tensor 进行分布式编程:基础操作: cookies/global_tensor.md 138 | - 使用 Global Tensor 进行分布式编程:分布式并行策略: cookies/global_tensor_distributed.md 139 | - OneFlow 与 ONNX 交互: cookies/oneflow2onnnx.md 140 | - 模型部署: cookies/serving.md 141 | - 自动混合精度训练: cookies/amp.md 142 | - 
Activation Checkpointing: cookies/activation_checkpointing.md 143 | - 将 PyTorch 预训练模型转为 OneFlow 格式: cookies/torch2flow.md 144 | - 计算机视觉迁移学习: cookies/transfer_learning.md 145 | - 大规模 Embedding 方案: OneEmbedding: cookies/one_embedding.md 146 | - Zero Redundancy Optimizer (ZeRO): cookies/zero.md 147 | - 大模型分片保存和加载: cookies/save_load.md 148 | - OneFlow 模拟 PyTorch: cookies/oneflow_torch.md 149 | - 如何分层设置学习率: cookies/lr_scale.md 150 | 151 | - API: 152 | - API: https://oneflow.readthedocs.io/en/master/ 153 | -------------------------------------------------------------------------------- /cn/overrides/partials/integrations/disqus.html: -------------------------------------------------------------------------------- 1 | {#- 2 | This file was automatically generated - do not edit 3 | -#} 4 | {% set disqus = config.extra.disqus %} 5 | {% if page and page.meta and page.meta.disqus is string %} 6 | {% set disqus = page.meta.disqus %} 7 | {% endif %} 8 | {% if not page.is_homepage and disqus %} 9 | 10 |
11 | 24 | 25 |
26 | 27 | {% endif %} 28 | -------------------------------------------------------------------------------- /cn/overrides/partials/source.html: -------------------------------------------------------------------------------- 1 | {#- 2 | This file was automatically generated - do not edit 3 | -#} 4 | {% import "partials/language.html" as lang with context %} 5 | 7 |
8 | {% set icon = config.theme.icon.repo or "fontawesome/brands/git-alt" %} 9 | {% include ".icons/" ~ icon ~ ".svg" %} 10 |
11 |
12 | {{ config.repo_name }} 13 |
14 |
15 | -------------------------------------------------------------------------------- /en/docs/assets/index-mdx.css: -------------------------------------------------------------------------------- 1 | @-webkit-keyframes mdx-heart { 2 | 0%,40%,80%,to { 3 | transform: scale(1) 4 | } 5 | 6 | 20%,60% { 7 | transform: scale(1.15) 8 | } 9 | } 10 | 11 | @keyframes mdx-heart { 12 | 0%,40%,80%,to { 13 | transform: scale(1) 14 | } 15 | 16 | 20%,60% { 17 | transform: scale(1.15) 18 | } 19 | } 20 | 21 | .mdx-hero { 22 | color: var(--md-primary-bg-color); 23 | margin: 0 .8rem 24 | } 25 | 26 | .mdx-hero h1 { 27 | color: currentColor; 28 | font-weight: 700; 29 | margin-bottom: 1rem 30 | } 31 | 32 | @media screen and (max-width: 29.9375em) { 33 | .mdx-hero h1 { 34 | font-size:1.4rem 35 | } 36 | } 37 | 38 | .mdx-hero__content { 39 | padding-bottom: 1rem 40 | } 41 | 42 | @media screen and (min-width: 60em) { 43 | .mdx-hero { 44 | align-items:stretch; 45 | display: flex 46 | } 47 | 48 | .mdx-hero__content { 49 | margin-top: 1.5rem; 50 | max-width: 19rem; 51 | /* padding-bottom: 14vw */ 52 | } 53 | 54 | .mdx-hero__image { 55 | order: 1; 56 | transform: translateX(4rem); 57 | width: 40rem; 58 | height: 20rem 59 | } 60 | } 61 | 62 | @media screen and (min-width: 76.25em) { 63 | .mdx-hero__image { 64 | transform:translateX(8rem) 65 | } 66 | } 67 | 68 | .mdx-hero .md-button { 69 | color: var(--md-primary-bg-color); 70 | margin-right: .5rem; 71 | margin-top: .5rem 72 | } 73 | 74 | .mdx-hero .md-button:focus,.mdx-hero .md-button:hover { 75 | background-color: var(--md-accent-fg-color); 76 | border-color: var(--md-accent-fg-color); 77 | color: var(--md-accent-bg-color) 78 | } 79 | 80 | .mdx-hero .md-button--primary { 81 | background-color: var(--md-primary-bg-color); 82 | border-color: var(--md-primary-bg-color); 83 | color: #894da8 84 | } 85 | .tx-container { 86 | padding-top: .0rem; 87 | background: linear-gradient(to bottom, var(--md-primary-fg-color), #9941d4 100%,#4051b5 99%) 88 | } 89 
| 90 | .feature-item h2 svg { 91 | height: 30px; 92 | float: left; 93 | margin-right: 10px; 94 | transform: translateY(10%); 95 | } 96 | 97 | .top-hr { 98 | margin-top: 42px; 99 | } 100 | 101 | .feature-item { 102 | font-family: 'Lato', sans-serif; 103 | font-weight: 300; 104 | box-sizing: border-box; 105 | padding: 0 15px; 106 | word-break: break-word 107 | } 108 | 109 | .feature-item h2 { 110 | color: #333; 111 | font-weight: 300; 112 | font-size: 25px; 113 | white-space: nowrap; 114 | overflow: hidden; 115 | text-overflow: ellipsis; 116 | line-height: normal; 117 | margin-top: 20px; 118 | margin-bottom: 10px; 119 | font-family: inherit; 120 | } 121 | 122 | .feature-item p { 123 | font-size: 16px; 124 | line-height: 1.8em; 125 | text-rendering: optimizeLegibility; 126 | -webkit-font-smoothing: antialiased; 127 | color: #111; 128 | margin: 0 0 10px; 129 | display: block; 130 | } 131 | 132 | @media screen and (min-width:76.25em) { 133 | .md-sidebar--primary { 134 | display: none 135 | } 136 | 137 | .top-hr { 138 | width: 100%; 139 | display: flex; 140 | max-width: 61rem; 141 | margin-right: auto; 142 | margin-left: auto; 143 | padding: 0 .2rem; 144 | } 145 | 146 | .bottom-hr { 147 | margin-top: 10px; 148 | width: 100%; 149 | display: flex; 150 | max-width: 61rem; 151 | margin-right: auto; 152 | margin-left: auto; 153 | padding: 0 .2rem; 154 | } 155 | 156 | .feature-item { 157 | flex: 1; 158 | min-width: 0; 159 | } 160 | 161 | .feature-item:hover { 162 | background-color: #526cfe47; 163 | border-radius: 3px; 164 | } 165 | } 166 | 167 | .hr { 168 | border-bottom: 1px solid #eee; 169 | width: 100%; 170 | margin: 20px 0; 171 | } 172 | 173 | .md-footer-meta__inner { 174 | display: flex; 175 | flex-wrap: wrap; 176 | justify-content: space-between; 177 | margin-top: 0rem; 178 | } 179 | 180 | .md-footer-social { 181 | padding-top: 20px; 182 | } 183 | -------------------------------------------------------------------------------- /en/docs/assets/install-selector.css: 
-------------------------------------------------------------------------------- 1 | #instruction ul, #instruction li, #instruction p{ 2 | margin: 0; 3 | padding: 0; 4 | } 5 | #instruction{ 6 | padding-top: 20px; 7 | width: 700px; 8 | } 9 | #instruction ul{ 10 | display: flex; 11 | width: 100%; 12 | margin: 10px -3px 0; 13 | } 14 | #instruction li{ 15 | border: 2px solid #FFF; 16 | height: 48px; 17 | line-height: 48px; 18 | cursor: pointer; 19 | text-align: center; 20 | list-style: none; 21 | flex: 1; 22 | margin: 0 3px; 23 | border-radius: 4px; 24 | transition: all .3s; 25 | } 26 | #instruction li.active{ 27 | border: 2px solid #FFFFFF; 28 | background: #526cfe6b; 29 | } 30 | 31 | #instruction .command{ 32 | padding-top: 10px; 33 | } 34 | #instruction .panel-code{ 35 | border-radius: 4px; 36 | background: #FFF; 37 | color: #333; 38 | padding: 20px; 39 | margin-top: 10px; 40 | } 41 | .smlVers{ 42 | overflow: hidden; 43 | } 44 | 45 | .command-copy{ 46 | margin-top: 5px; 47 | margin-bottom: 0px; 48 | width: 0%; 49 | height: 0%; 50 | float: right; 51 | } 52 | 53 | @media screen and (max-width: 29.9375em) { 54 | #instruction{ 55 | padding-top: 20px; 56 | width: 95%; 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /en/docs/assets/install-selector.js: -------------------------------------------------------------------------------- 1 | ; (function () { 2 | window.addEventListener('load', () => { 3 | 4 | function get_commands(latest_version) { 5 | let stable_command_118 = 'python3 -m pip install -f https://oneflow-staging.oss-cn-beijing.aliyuncs.com/branch/release/v1.0.0/cu118 oneflow' 6 | let stable_command_121 = 'python3 -m pip install -f https://oneflow-staging.oss-cn-beijing.aliyuncs.com/branch/release/v1.0.0/cu121 oneflow' 7 | let stable_command_122 = 'python3 -m pip install -f https://oneflow-staging.oss-cn-beijing.aliyuncs.com/branch/release/v1.0.0/cu122 oneflow' 8 | let stable_command_cpu = 'python3 -m pip install 
-f https://oneflow-staging.oss-cn-beijing.aliyuncs.com/branch/release/v1.0.0/cpu oneflow' 9 | let commands = [ 10 | { 11 | versions: 'Stable', 12 | framework: 'CUDA', 13 | smlVers: '11.8', 14 | command: stable_command_118.replace("VERSION", latest_version) 15 | }, 16 | { 17 | versions: 'Stable', 18 | framework: 'CUDA', 19 | smlVers: '12.1', 20 | command: stable_command_121.replace("VERSION", latest_version) 21 | }, 22 | { 23 | versions: 'Stable', 24 | framework: 'CUDA', 25 | smlVers: '12.2', 26 | command: stable_command_122.replace("VERSION", latest_version) 27 | }, 28 | { 29 | versions: 'Stable', 30 | framework: 'CPU', 31 | smlVers: '', 32 | command: stable_command_cpu.replace("VERSION", latest_version) 33 | }, 34 | { 35 | versions: 'Nightly', 36 | framework: 'CUDA', 37 | smlVers: '11.8', 38 | command: 'python3 -m pip install -f https://staging.oneflow.info/branch/master/cu118 --pre oneflow' 39 | }, 40 | { 41 | versions: 'Nightly', 42 | framework: 'CUDA', 43 | smlVers: '12.1', 44 | command: 'python3 -m pip install -f https://staging.oneflow.info/branch/master/cu121 --pre oneflow' 45 | }, 46 | { 47 | versions: 'Nightly', 48 | framework: 'CUDA', 49 | smlVers: '12.2', 50 | command: 'python3 -m pip install -f https://staging.oneflow.info/branch/master/cu122 --pre oneflow' 51 | }, 52 | { 53 | versions: 'Nightly', 54 | framework: 'CPU', 55 | smlVers: '', 56 | command: 'python3 -m pip install -f https://staging.oneflow.info/branch/master/cpu --pre oneflow' 57 | }, 58 | ] 59 | return commands 60 | } 61 | 62 | function init_selector(commands) { 63 | let condition = { 64 | versions: 'Stable', 65 | framework: 'CUDA', 66 | smlVers: '11.8', 67 | } 68 | selectCommands(condition) 69 | let items = document.querySelectorAll('#instruction li') 70 | 71 | function selectCommands(conditioning) { 72 | let filter = null 73 | if (conditioning.framework == "CPU") { 74 | filter = commands.filter(e => e.versions == conditioning.versions).filter(e => e.framework == conditioning.framework) 75 
| } else { 76 | filter = commands.filter(e => e.versions == conditioning.versions).filter(e => e.framework == conditioning.framework).filter(e => e.smlVers == conditioning.smlVers) 77 | } 78 | if (filter && filter[0]) { 79 | document.querySelector('.panel-code').innerHTML = filter[0].command 80 | } 81 | } 82 | items.forEach(e => { 83 | e.addEventListener('click', function () { 84 | let attach = this.getAttribute('attach') 85 | let tempItems = document.querySelectorAll(`[attach=${attach}]`) 86 | tempItems.forEach(e => { 87 | e.className = '' 88 | }) 89 | this.className = 'active' 90 | condition[attach] = this.innerHTML 91 | if (condition['framework'] == 'CPU') { 92 | document.querySelector('.smlVers').style.height = '0px' 93 | } else { 94 | document.querySelector('.smlVers').style.height = '48px' 95 | } 96 | selectCommands(condition) 97 | }) 98 | }) 99 | } 100 | 101 | let TAGS_API_URL = 'https://api.github.com/repos/Oneflow-Inc/oneflow/tags' 102 | let xmlhttp = new XMLHttpRequest(); 103 | let latest_version_hardcode = "0.8.0" // using latest version in hard-code way if request fails 104 | xmlhttp.onreadystatechange = function () { 105 | if (xmlhttp.readyState == 4) {// 4 = "loaded" 106 | if (xmlhttp.status == 200) {// 200 = "OK" 107 | localStorage.latest_version = eval(xmlhttp.responseText)[0].name.replace("v", "").replace("0.8.1", "0.8.0") // eg: v0.x.0 => 0.x.0 108 | init_selector(get_commands(localStorage.latest_version)) 109 | } 110 | else { 111 | init_selector(get_commands(localStorage.latest_version ? 
localStorage.latest_version : latest_version_hardcode)) 112 | } 113 | } 114 | } 115 | xmlhttp.open("GET", TAGS_API_URL, true) 116 | xmlhttp.send(null) 117 | }) 118 | })(); 119 | 120 | function copyPipCommand() { 121 | var copyText = document.querySelector('.panel-code').innerHTML 122 | navigator.clipboard.writeText(copyText) 123 | } 124 | -------------------------------------------------------------------------------- /en/docs/assets/product-layer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/oneflow-documentation/d4dc26f4de1aac396afb4d59b9de9f78f91871d5/en/docs/assets/product-layer.png -------------------------------------------------------------------------------- /en/docs/basics/06_optimization.md: -------------------------------------------------------------------------------- 1 | # BACKPROPAGATION AND OPTIMIZER 2 | 3 | So far, we have learned how to use OneFlow to [Dataset and DataLoader](./03_dataset_dataloader.md), [Build Models](./04_build_network.md),[Autograd](./05_autograd.md), and combine them so that we can train models by using backpropagation algorithms. 4 | 5 | In [oneflow.optim](https://oneflow.readthedocs.io/en/v0.8.1/optim.html), there are various `optimizer`s that simplify the code of back propagation. 6 | 7 | This article will first introduce the basic concepts of back propagation and then show you how to use the `oneflow.optim` class. 
8 | 9 | ## Backpropagation by Numpy Code 10 | 11 | In order to make it easier for readers to understand the relationship between backpropagation and autograd, a training process of a simple model implemented with numpy is provided here: 12 | 13 | ```python 14 | import numpy as np 15 | 16 | ITER_COUNT = 500 17 | LR = 0.01 18 | 19 | # Forward propagation 20 | def forward(x, w): 21 | return np.matmul(x, w) 22 | 23 | 24 | # Loss function 25 | def loss(y_pred, y): 26 | return ((y_pred - y) ** 2).sum() 27 | 28 | 29 | # Calculate gradient 30 | def gradient(x, y, y_pred): 31 | return np.matmul(x.T, 2 * (y_pred - y)) 32 | 33 | 34 | if __name__ == "__main__": 35 | # Train: Y = 2*X1 + 3*X2 36 | x = np.array([[1, 2], [2, 3], [4, 6], [3, 1]], dtype=np.float32) 37 | y = np.array([[8], [13], [26], [9]], dtype=np.float32) 38 | 39 | w = np.array([[2], [1]], dtype=np.float32) 40 | # Training cycle 41 | for i in range(0, ITER_COUNT): 42 | y_pred = forward(x, w) 43 | l = loss(y_pred, y) 44 | if (i + 1) % 50 == 0: 45 | print(f"{i+1}/{500} loss:{l}") 46 | 47 | grad = gradient(x, y, y_pred) 48 | w -= LR * grad 49 | 50 | print(f"w:{w}") 51 | ``` 52 | 53 | output: 54 | 55 | ```text 56 | 50/500 loss:0.0034512376878410578 57 | 100/500 loss:1.965487399502308e-06 58 | 150/500 loss:1.05524122773204e-09 59 | 200/500 loss:3.865352482534945e-12 60 | 250/500 loss:3.865352482534945e-12 61 | 300/500 loss:3.865352482534945e-12 62 | 350/500 loss:3.865352482534945e-12 63 | 400/500 loss:3.865352482534945e-12 64 | 450/500 loss:3.865352482534945e-12 65 | 500/500 loss:3.865352482534945e-12 66 | w:[[2.000001 ] 67 | [2.9999993]] 68 | ``` 69 | 70 | Note that the loss function expression we selected is $\sum (y_{p} - y)^2$, so the code for gradient of `loss` to parameter `w` is: 71 | 72 | ```python 73 | def gradient(x, y, y_pred): 74 | return np.matmul(x.T, 2 * (y_pred - y)) 75 | ``` 76 | 77 | [SGD](https://en.wikipedia.org/wiki/Stochastic_gradient_descent) is used to update parameters: 78 | 79 | ```python 80 | 
grad = gradient(x, y, y_pred) 81 | w -= LR*grad 82 | ``` 83 | 84 | In summary, a complete iteration in the training includes the following steps: 85 | 86 | 1. The model calculates the predicted value based on the input and parameters (`y_pred`) 87 | 2. Calculate loss, which is the error between the predicted value and the label 88 | 3. Calculate the gradient of loss to parameter 89 | 4. Update parameter(s) 90 | 91 | 1 and 2 are forward propagation process; 3 and 4 are back propagation process. 92 | 93 | ## Hyperparameters 94 | 95 | Hyperparameters are parameters related to model training settings, which can affect the efficiency and results of model training.As in the above code `ITER_COUNT`,`LR` are hyperparameters. 96 | 97 | ## Using the optimizer class in `oneflow.optim` 98 | 99 | Using the optimizer class in `oneflow.optim` for back propagation will be more concise. 100 | 101 | First, prepare the data and model. The convenience of using Module is that you can place the hyperparameters in Module for management. 102 | 103 | ```python 104 | import oneflow as flow 105 | 106 | x = flow.tensor([[1, 2], [2, 3], [4, 6], [3, 1]], dtype=flow.float32) 107 | y = flow.tensor([[8], [13], [26], [9]], dtype=flow.float32) 108 | 109 | 110 | class MyLrModule(flow.nn.Module): 111 | def __init__(self, lr, iter_count): 112 | super().__init__() 113 | self.w = flow.nn.Parameter(flow.tensor([[2], [1]], dtype=flow.float32)) 114 | self.lr = lr 115 | self.iter_count = iter_count 116 | 117 | def forward(self, x): 118 | return flow.matmul(x, self.w) 119 | 120 | 121 | model = MyLrModule(0.01, 500) 122 | ``` 123 | 124 | ### Loss function 125 | 126 | Then, select the loss function. OneFlow comes with a variety of loss functions. 
We choose [MSELoss](https://oneflow.readthedocs.io/en/v0.8.1/generated/oneflow.nn.MSELoss.html) here: 127 | 128 | ```python 129 | loss = flow.nn.MSELoss(reduction="sum") 130 | ``` 131 | 132 | ### Construct Optimizer 133 | 134 | The logic of back propagation is wrapped in optimizer. We choose [SGD](https://oneflow.readthedocs.io/en/v0.8.1/generated/oneflow.optim.SGD.html) here, You can choose other optimization algorithms as needed, such as [Adam](https://oneflow.readthedocs.io/en/v0.8.1/generated/oneflow.optim.Adam.html) and[AdamW](https://oneflow.readthedocs.io/en/v0.8.1/generated/oneflow.optim.AdamW.html) . 135 | 136 | ```python 137 | optimizer = flow.optim.SGD(model.parameters(), model.lr) 138 | ``` 139 | 140 | When the `optimizer` is constructed, the model parameters and learning rate are given to `SGD`. Then the `optimizer.step()` is called, and it automatically completes the gradient of the model parameters and updates the model parameters according to the SGD algorithm. 141 | 142 | ### Train 143 | 144 | When the above preparations are completed, we can start training: 145 | 146 | ```python 147 | for i in range(0, model.iter_count): 148 | y_pred = model(x) 149 | l = loss(y_pred, y) 150 | if (i + 1) % 50 == 0: 151 | print(f"{i+1}/{model.iter_count} loss:{l.numpy()}") 152 | 153 | optimizer.zero_grad() 154 | l.backward() 155 | optimizer.step() 156 | 157 | print(f"\nw: {model.w}") 158 | ``` 159 | 160 | output: 161 | 162 | ```text 163 | 50/500 loss:0.003451163647696376 164 | 100/500 loss:1.965773662959691e-06 165 | 150/500 loss:1.103217073250562e-09 166 | 200/500 loss:3.865352482534945e-12 167 | 250/500 loss:3.865352482534945e-12 168 | 300/500 loss:3.865352482534945e-12 169 | 350/500 loss:3.865352482534945e-12 170 | 400/500 loss:3.865352482534945e-12 171 | 450/500 loss:3.865352482534945e-12 172 | 500/500 loss:3.865352482534945e-12 173 | 174 | w: tensor([[2.], 175 | [3.]], dtype=oneflow.float32, grad_fn=) 176 | ``` 177 | 
-------------------------------------------------------------------------------- /en/docs/basics/07_model_load_save.md: -------------------------------------------------------------------------------- 1 | # SAVE AND LOAD THE MODEL 2 | 3 | There are two common uses for loading and saving models: 4 | 5 | - Save the model that has been trained to continue training next time. 6 | - Save the trained model for direct prediction in the future. 7 | 8 | We will introduce how to use [save](https://oneflow.readthedocs.io/en/v0.8.1/generated/oneflow.save.html) and [load](https://oneflow.readthedocs.io/en/v0.8.1/generated/oneflow.load.html) to save and load models as follows. 9 | 10 | Also, we will show how to load a pre-trained model for inference. 11 | 12 | ## Saving and Loading Model Parameters 13 | 14 | `Module` provided by OneFlow and defined by users provides the `state_dict` method to obtain all the model parameters, which is stored in a dictionary with the format "name-value". 15 | 16 | ```python 17 | import oneflow as flow 18 | m = flow.nn.Linear(2,3) 19 | print(m.state_dict()) 20 | ``` 21 | 22 | The above code first constructs a Linear object, then prints its parameters. 
23 | 24 | ```text 25 | OrderedDict([('weight', 26 | tensor([[-0.4297, -0.3571], 27 | [ 0.6797, -0.5295], 28 | [ 0.4918, -0.3039]], dtype=oneflow.float32, requires_grad=True)), 29 | ('bias', 30 | tensor([ 0.0977, 0.1219, -0.5372], dtype=oneflow.float32, requires_grad=True))]) 31 | ``` 32 | 33 | We can load parameters by calling `load_state_dict` method of `Module`, as the following code: 34 | 35 | ```python 36 | myparams = {"weight":flow.ones(3,2), "bias":flow.zeros(3)} 37 | m.load_state_dict(myparams) 38 | print(m.state_dict()) 39 | ``` 40 | 41 | The tensor in the dictionary created by us has been loaded into m Module: 42 | 43 | ```text 44 | OrderedDict([('weight', 45 | tensor([[1., 1.], 46 | [1., 1.], 47 | [1., 1.]], dtype=oneflow.float32, requires_grad=True)), 48 | ('bias', 49 | tensor([0., 0., 0.], dtype=oneflow.float32, requires_grad=True))]) 50 | ``` 51 | 52 | ## Saving Models 53 | 54 | We can use [oneflow.save](https://oneflow.readthedocs.io/en/v0.8.1/generated/oneflow.save.html) to save models. 55 | 56 | ```python 57 | flow.save(m.state_dict(), "./model") 58 | ``` 59 | 60 | The first parameter is the Module parameters, and the second is the saved path. The above code saves the parameters of the `m` Module object to the path `./model`. 61 | 62 | ## Loading Models 63 | 64 | Using [oneflow.load](https://oneflow.readthedocs.io/en/v0.8.1/generated/oneflow.load.html) to load parameters from disk to memory with the specified path, and get the dictionary of the parameters. 65 | 66 | ```python 67 | params = flow.load("./model") 68 | ``` 69 | 70 | Then use `load_state_dict` to load the dictionary into the model. 71 | 72 | ```python 73 | m2 = flow.nn.Linear(2,3) 74 | m2.load_state_dict(params) 75 | print(m2.state_dict()) 76 | ``` 77 | 78 | We have created a new Linear Module object `m2`, and loaded the parameters saved from the above to `m2`. 
Then we get the output as below: 79 | 80 | ```text 81 | OrderedDict([('weight', tensor([[1., 1.], 82 | [1., 1.], 83 | [1., 1.]], dtype=oneflow.float32, requires_grad=True)), ('bias', tensor([0., 0., 0.], dtype=oneflow.float32, requires_grad=True))]) 84 | ``` 85 | 86 | ### Using a Pre-trained Model to Make Predictions 87 | 88 | OneFlow can directly load PyTorch's pre-trained model for prediction as long as the structure and parameter names of the model are aligned with the PyTorch model. 89 | 90 | Examples can be found in [here](https://github.com/Oneflow-Inc/models/blob/main/README.md). 91 | 92 | Run commands below for trying how to use the pre-trained model to make predictions: 93 | 94 | ```bash 95 | git clone https://github.com/Oneflow-Inc/models.git 96 | cd models/Vision/classification/image/shufflenetv2/ 97 | bash infer.sh 98 | ``` 99 | -------------------------------------------------------------------------------- /en/docs/basics/imgs/compute_graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/oneflow-documentation/d4dc26f4de1aac396afb4d59b9de9f78f91871d5/en/docs/basics/imgs/compute_graph.png -------------------------------------------------------------------------------- /en/docs/basics/imgs/dataloader_item.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/oneflow-documentation/d4dc26f4de1aac396afb4d59b9de9f78f91871d5/en/docs/basics/imgs/dataloader_item.png -------------------------------------------------------------------------------- /en/docs/basics/imgs/dynamic_graph.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/oneflow-documentation/d4dc26f4de1aac396afb4d59b9de9f78f91871d5/en/docs/basics/imgs/dynamic_graph.gif -------------------------------------------------------------------------------- 
/en/docs/basics/imgs/fashionMNIST.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/oneflow-documentation/d4dc26f4de1aac396afb4d59b9de9f78f91871d5/en/docs/basics/imgs/fashionMNIST.png -------------------------------------------------------------------------------- /en/docs/basics/imgs/neural-network-layers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/oneflow-documentation/d4dc26f4de1aac396afb4d59b9de9f78f91871d5/en/docs/basics/imgs/neural-network-layers.png -------------------------------------------------------------------------------- /en/docs/basics/imgs/poly_fit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/oneflow-documentation/d4dc26f4de1aac396afb4d59b9de9f78f91871d5/en/docs/basics/imgs/poly_fit.png -------------------------------------------------------------------------------- /en/docs/basics/imgs/qq_group.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/oneflow-documentation/d4dc26f4de1aac396afb4d59b9de9f78f91871d5/en/docs/basics/imgs/qq_group.png -------------------------------------------------------------------------------- /en/docs/basics/imgs/static_graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/oneflow-documentation/d4dc26f4de1aac396afb4d59b9de9f78f91871d5/en/docs/basics/imgs/static_graph.png -------------------------------------------------------------------------------- /en/docs/cookies/activation_checkpointing.md: -------------------------------------------------------------------------------- 1 | # Activation Checkpointing 2 | 3 | ## Introduction to Activation Checkpointing 4 | 5 | Activation Checkpointing is a sub-linear 
memory optimization technique proposed in 2016, by Chen Tianqi's team in their paper [Training Deep Nets with Sublinear Memory Cost](https://arxiv.org/abs/1604.06174), aiming to reduce the memory usage during training. The basic principle of Activation Checkpointing is to **exchange time for space**: After the analysis of the computational graph, some intermediate activation features that are not used temporarily in the forward process will be deleted to reduce the memory usage, and they will be restored with additional forward computation when needed in the backward process. 6 | 7 | OneFlow's static graph module `nn.Graph` already supports Activation Checkpointing. This article will introduce how to enable it during training. 8 | 9 | ## Example of using Activation Checkpointing 10 | 11 | First, we define a simple model consisting of a loss function and an optimizer in exactly the same way as before. 12 | 13 | ```python 14 | import oneflow as flow 15 | import oneflow.nn as nn 16 | 17 | DEVICE = "cuda" if flow.cuda.is_available() else "cpu" 18 | print("Using {} device".format(DEVICE)) 19 | 20 | model_part1 = nn.Sequential( 21 | nn.Linear(256, 128), 22 | nn.ReLU(), 23 | nn.Linear(128, 64), 24 | nn.ReLU() 25 | ) 26 | model_part1 = model_part1.to(DEVICE) 27 | model_part1.train() 28 | 29 | model_part2 = nn.Sequential( 30 | nn.Linear(64, 32), 31 | nn.ReLU(), 32 | nn.Linear(32, 10) 33 | ) 34 | model_part2 = model_part2.to(DEVICE) 35 | model_part2.train() 36 | 37 | loss_fn = nn.CrossEntropyLoss().to(DEVICE) 38 | optimizer = flow.optim.SGD([{'params': model_part1.parameters()}, 39 | {'params': model_part2.parameters()}], 40 | lr=1e-3) 41 | ``` 42 | 43 | To turn on activation checkpointing, you only need to use the method `.to(nn.graph.GraphModule)` on the Eager model member (i.e. the nn.Module object) to get a `nn.graph.GraphModule` object. And then modify the corresponding attribute as `.activation_checkpointing = True` on the `nn.graph.GraphModule`.
For more details of this API, please refer to: [activation_checkpointing](https://oneflow.readthedocs.io/en/v0.8.1/generated/oneflow.nn.graph.block_config.BlockConfig.activation_checkpointing.html). For each nn.Module with "activation checkpointing" turned on, its input activations will be preserved, while other intermediate activations will be recomputed when used during backpropagation. 44 | 45 | ```python 46 | class CustomGraph(flow.nn.Graph): 47 | def __init__(self): 48 | super().__init__() 49 | self.model_part1 = model_part1 50 | self.model_part2 = model_part2 51 | # Turn on activation checkpointing on two consecutive nn.Module 52 | self.model_part1.to(nn.graph.GraphModule).activation_checkpointing = True 53 | self.model_part2.to(nn.graph.GraphModule).activation_checkpointing = True 54 | self.loss_fn = loss_fn 55 | self.add_optimizer(optimizer) 56 | 57 | def build(self, x, y): 58 | y_pred = self.model_part2(self.model_part1(x)) 59 | loss = self.loss_fn(y_pred, y) 60 | loss.backward() 61 | return y_pred, loss 62 | ``` 63 | 64 | Then, you can start training and other operations as usual. 65 | 66 | ```python 67 | graph_model = CustomGraph() 68 | 69 | for _ in range(100): 70 | x = flow.randn(128, 256).to(DEVICE) 71 | y = flow.ones(128, 1, dtype=flow.int64).to(DEVICE) 72 | graph_model(x, y) 73 | # Other codes... 74 | ``` 75 | 76 | ## Comparative Experiment on BERT Model 77 | 78 | In order to verify the actual effect of Activation Checkpointing, we can conduct comparative experiments on the model [BERT](https://arxiv.org/abs/1810.04805). We can directly use the BERT model provided by [libai](https://github.com/Oneflow-Inc/libai). To turn on Activation Checkpointing, we just need to set `train.activation_checkpoint.enabled` to `True` in the configuration file. 79 | 80 | First, get data ready according to [Prepare the Data and the Vocab](https://libai.readthedocs.io/en/latest/tutorials/get_started/quick_run.html#prepare-the-data-and-the-vocab). 
For simplicity, we use a single device for training (the GPU used in the experimental environment is NVIDIA GeForce RTX 3090, and the memory size is 24268 MB): 81 | 82 | ```bash 83 | time python tools/train_net.py --config-file configs/bert_large_pretrain.py 84 | ``` 85 | 86 | Add the `time` command at the beginning of the whole command to measure the time spent in the training process. 87 | 88 | The experimental results are as follows: 89 | 90 | | Whether to Turn on Activation Checkpointing | Average Memory Usage| Time Spent | 91 | |:-----------------------------:|:-------:|:---------:| 92 | | No | 9141 MB | 25 minutes 16 seconds | 93 | | Yes | 5978 MB | 33 minutes 36 seconds | 94 | 95 | We can see from the above table that Activation Checkpointing significantly reduces the memory usage during training. At the same time, the time spent increases due to the additional forward computation required. Overall, Activation Checkpointing is a very effective solution when GPU memory is insufficient. 96 | -------------------------------------------------------------------------------- /en/docs/cookies/amp.md: -------------------------------------------------------------------------------- 1 | # Automatic Mixed Precision Training 2 | 3 | ## Introduction to AMP 4 | 5 | When we train deep learning models, we typically use 32-bit single-precision floating point (FP32), while **AMP (Automatic Mixed Precision)** is a technique that allows both FP32 and FP16 to be used when training models. This can make the memory usage less and the computation faster when training the model. But because the numerical range of FP16 is smaller than that of FP32, it is more prone to numerical overflow problems, and there may be some errors. But lots of practice has proved that many deep learning models can be trained with this technique without loss of accuracy. 
6 | 7 | ## Example of using AMP 8 | 9 | First, we define a simple model, loss function and optimizer in exactly the same way as before. 10 | 11 | ```python 12 | import oneflow as flow 13 | import oneflow.nn as nn 14 | 15 | DEVICE = "cuda" if flow.cuda.is_available() else "cpu" 16 | print("Using {} device".format(DEVICE)) 17 | 18 | model = nn.Sequential( 19 | nn.Linear(256, 128), 20 | nn.ReLU(), 21 | nn.Linear(128, 10) 22 | ) 23 | model = model.to(DEVICE) 24 | model.train() 25 | 26 | loss_fn = nn.CrossEntropyLoss().to(DEVICE) 27 | optimizer = flow.optim.SGD(model.parameters(), lr=1e-3) 28 | ``` 29 | 30 | If you want to enable AMP mode, just add `self.config.enable_amp(True)` to the model [nn.Graph](../basics/08_nn_graph.md). The details of this API are at: [enable_amp](https://oneflow.readthedocs.io/en/v0.8.1/generated/oneflow.nn.graph.graph_config.GraphConfig.enable_amp.html). 31 | 32 | ```python 33 | class CustomGraph(flow.nn.Graph): 34 | def __init__(self): 35 | super().__init__() 36 | self.model = model 37 | self.loss_fn = loss_fn 38 | self.add_optimizer(optimizer) 39 | self.config.enable_amp(True) # Turn on AMP mode 40 | 41 | def build(self, x, y): 42 | y_pred = self.model(x) 43 | loss = self.loss_fn(y_pred, y) 44 | loss.backward() 45 | return y_pred 46 | ``` 47 | 48 | Then, you can start training and other operations as usual. 49 | 50 | ```python 51 | graph_model = CustomGraph() 52 | 53 | for _ in range(100): 54 | x = flow.randn(128, 256).to(DEVICE) 55 | y = flow.ones(128, 1, dtype=flow.int64).to(DEVICE) 56 | 57 | graph_model(x, y) 58 | ``` 59 | 60 | ## Gradient Scaling 61 | 62 | **Gradient Scaling** is a method for solving the problem that FP16 is prone to numerical overflow. The basic principle is to use a scale factor to scale the loss and gradient in the process of backpropagation to change the magnitude of its value, thereby mitigating numerical overflow problems as much as possible. 63 | 64 | OneFlow provides `GradScaler` to use Gradient Scaling in AMP mode. 
You only need to instantiate a `GradScaler` object in the `__init__` method of the nn.Graph model, and then specify it through the interface [set_grad_scaler](https://oneflow.readthedocs.io/en/v0.8.1/generated/oneflow.nn.Graph.set_grad_scaler.html). nn.Graph will automatically manage the whole process of Gradient Scaling. Taking the `CustomGraph` above as an example, you need to add the following code to its `__init__` method: 65 | 66 | ```python 67 | grad_scaler = flow.amp.GradScaler( 68 | init_scale=2**12, 69 | growth_factor=2.0, 70 | backoff_factor=0.5, 71 | growth_interval=1000, 72 | ) 73 | self.set_grad_scaler(grad_scaler) 74 | ``` 75 | 76 | The calculation process of the scale factor and the meaning of the GradScaler parameters are as follows: 77 | 78 | The size of the scale factor is dynamically estimated in the iterative update (the initial value is specified by `init_scale`). In order to reduce the numerical underflow as much as possible, the scale factor should be larger; but if it is too large, FP16 is prone to numerical overflow, resulting in an inf or NaN. The process of dynamic estimation is to increase the scale factor as much as possible without causing inf or NaN. At each iteration, it will check whether there is a gradient of inf or NaN: 79 | 80 | 1. If there is: this weight update will be ignored and the scale factor will be reduced (multiplied by the `backoff_factor`) 81 | 82 | 2. If not: weight will update normally. 
Scale factor will be increased (multiplied by `growth_factor`) when no inf or NaN occurs in successive iterations (specified by `growth_interval`) 83 | -------------------------------------------------------------------------------- /en/docs/cookies/imgs/Three_Stages_of_ZeRO-DP_Optimizations.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/oneflow-documentation/d4dc26f4de1aac396afb4d59b9de9f78f91871d5/en/docs/cookies/imgs/Three_Stages_of_ZeRO-DP_Optimizations.jpg -------------------------------------------------------------------------------- /en/docs/cookies/imgs/cat.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/oneflow-documentation/d4dc26f4de1aac396afb4d59b9de9f78f91871d5/en/docs/cookies/imgs/cat.jpg -------------------------------------------------------------------------------- /en/docs/cookies/imgs/hybrid-parallel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/oneflow-documentation/d4dc26f4de1aac396afb4d59b9de9f78f91871d5/en/docs/cookies/imgs/hybrid-parallel.png -------------------------------------------------------------------------------- /en/docs/cookies/imgs/oneflow-serving-demo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/oneflow-documentation/d4dc26f4de1aac396afb4d59b9de9f78f91871d5/en/docs/cookies/imgs/oneflow-serving-demo.png -------------------------------------------------------------------------------- /en/docs/cookies/imgs/triton-oneflow-backend.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/oneflow-documentation/d4dc26f4de1aac396afb4d59b9de9f78f91871d5/en/docs/cookies/imgs/triton-oneflow-backend.png 
-------------------------------------------------------------------------------- /en/docs/cookies/oneflow_torch.md: -------------------------------------------------------------------------------- 1 | # Mocking PyTorch with OneFlow 2 | 3 | With the alignment of OneFlow API and PyTorch, users can easily migrate the model from PyTorch to OneFlow. In this article, we will introduce three methods to port PyTorch code to OneFlow. 4 | 5 | ## 1. import oneflow as torch 6 | 7 | Change the original code ` import torch ` to: 8 | 9 | ```py 10 | import oneflow as torch 11 | ``` 12 | 13 | You can train the original model with OneFlow; however, you have to manually modify all the files that contain `import torch` by this approach. In addition, it is necessary to modify the source code if a third-party library uses `torch`. 14 | 15 | ## 2. using the command-line tool 16 | 17 | Oneflow provides a command-line tool, which helps to mock the environment of PyTorch within OneFlow’s Python Package and forward references of Pytorch to the real OneFlow module. The specific steps are as follows: 18 | 19 | 20 | Enabling the mocking of PyTorch 21 | 22 | ```shell 23 | eval $(oneflow-mock-torch) 24 | ``` 25 | 26 | or 27 | 28 | ```shell 29 | eval $(python3 -m oneflow.mock_torch) 30 | ``` 31 | 32 | To facilitate debugging, OneFlow provides two parameters to this method: 33 | 34 | 1. lazy parameter. When `lazy=True`, a mock object will be returned without an immediate error for non-existent interfaces. It is **recommended** to enable this parameter, so that even if the third-party library you import contains an interface that OneFlow does not currently exist, mock torch can still work normally as long as the interface is not actually used. 35 | 36 | 2. verbose parameter. If `verbose=True` is set simultaneously, it will print out which mock objects are accessed or used for debugging. 
37 | 38 | The usage is as follows: 39 | 40 | Enabling the mocking of PyTorch, and configuring lazy and verbose parameters. 41 | 42 | ```shell 43 | eval $(oneflow-mock-torch --lazy --verbose) 44 | ``` 45 | 46 | or 47 | 48 | ```shell 49 | eval $(python3 -m oneflow.mock_torch --lazy --verbose) 50 | ``` 51 | 52 | After running the above command, you can observe the effect in the following example. 53 | 54 | ```py 55 | import torch 56 | print(torch.__file__) 57 | import oneflow as flow 58 | x = torch.zeros(2, 3) 59 | print(isinstance(x, flow.Tensor)) 60 | ``` 61 | 62 | Disabling the mocking of PyTorch 63 | 64 | ```shell 65 | eval $(oneflow-mock-torch disable) 66 | ``` 67 | 68 | or 69 | 70 | ```shell 71 | eval $(python3 -m oneflow.mock_torch disable) 72 | ``` 73 | 74 | ## 3. using built-in functions of OneFlow 75 | 76 | We provide the mock function with fine granularity, and the users can determine whether to enable this function for a piece of code. 77 | 78 | In the following `with` statement, the PyTorch module imported is OneFlow. 79 | 80 | ```py 81 | import oneflow.mock_torch as mock 82 | with mock.enable(): 83 | import torch 84 | print(torch.__file__) 85 | import oneflow as flow 86 | x = torch.zeros(2, 3) 87 | print(isinstance(x, flow.Tensor)) 88 | ``` 89 | 90 | Similarly, OneFlow provides parameters lazy and verbose for `mock.enable()` to facilitate debugging, which can be set like this: 91 | 92 | `with mock.enable(lazy=True, verbose=True)` 93 | 94 | You can turn off the mock function like this when it’s needed to use the real torch module. 95 | 96 | ```py 97 | with mock.disable(): 98 | import torch 99 | print(torch.__file__) 100 | ``` 101 | 102 | `mock.enable` and `mock.disable` can act as functions. For example, if you want to train a model with OneFlow, but it needs to be loaded by PyTorch. Then, you can use it with the following code: 103 | 104 | ```py 105 | mock.enable() 106 | ... 107 | with mock.disable(): 108 | module = torch.load_module(...) 
109 | # train the module with oneflow 110 | ``` 111 | 112 | A dictionary with the value of module is saved separately in enable and disable mode. When you turn enable or disable on and off, the dictionary will replace `sys.modules` and the global variables that the current module belongs to. Therefore, users are required to `import` the module they need in each mode, and the code below will raise an error `name 'torch' is not defined` in `with` statement of disable. 113 | 114 | ```py 115 | with mock.enable(): 116 | import torch 117 | with mock.disable(): 118 | torch.ones(2, 3) 119 | ``` 120 | 121 | ## Conclusion 122 | 123 | With the alignment of OneFlow API and PyTorch, users can easily migrate the PyTorch code to OneFlow. As mentioned above, three methods are available to train the PyTorch model with OneFlow. This is how users can experience the ultimate performance of OneFlow. 124 | -------------------------------------------------------------------------------- /en/docs/cookies/torch2flow.md: -------------------------------------------------------------------------------- 1 | 2 | # Converting Pre-trained Model from PyTorch to OneFlow 3 | 4 | 5 | Since interfaces of OneFlow and PyTorch are compatible, we can convert a pre-trained model from PyTorch to OneFlow when it's needed to use a PyTorch pre-trained model. 6 | 7 | ## Example of Model Conversion 8 | 9 | In the following code, we define and save a PyTorch model and then convert it to a OneFlow model. 10 | 11 | ```python 12 | import torch 13 | import torch.nn as nn 14 | save_file = 'model.pth' 15 | model_torch = nn.Sequential( 16 | nn.Linear(128, 2), 17 | nn.Softmax() 18 | ) 19 | torch.save(model_torch, save_file) 20 | ``` 21 | 22 | After running the above code, we get a `model.pth` file of PyTorch model. 
Then, the following two steps enable us to convert a PyTorch model to a OneFlow model: 23 | 24 | 25 | - Defining a OneFlow model with **the same structure** 26 | - Loading the `model.pth` file and initializing model parameters into OneFlow model 27 | 28 | Code is shown below: 29 | 30 | ```python 31 | import oneflow as flow 32 | import oneflow.nn as nn 33 | import torch 34 | model_flow = nn.Sequential( 35 | nn.Linear(128, 2), 36 | nn.Softmax() 37 | ) 38 | parameters = torch.load(save_file).state_dict() 39 | for key, value in parameters.items(): 40 | val = value.detach().cpu().numpy() 41 | parameters[key] = val 42 | model_flow.load_state_dict(parameters) 43 | ``` 44 | 45 | 46 | `.state_dict()` enables us to obtain model parameters defined by `key-value`. Then, we use `.detach().cpu().numpy()` to convert parameter whose gradients are blocked into Numpy. Lastly, `.load_state_dict(parameters)` allows to pass model parameters to OneFlow model. 47 | 48 | 49 | With the simple example described above, we can find that the approach to convert PyTorch model into OneFlow is to **use Numpy as a bridge**. Therefore, provided the models defined by PyTorch and by OneFlow whose structures are consistent, even complicated models can still be smoothly converted. 50 | 51 | 52 | ## More Information about FlowVision 53 | 54 | 55 | Same as torchvision, [flowvision](https://github.com/Oneflow-Inc/vision) also provides many pre-trained models, and the models in flowvision are compatible with those in torchvision. 
Taking AlexNet in flowvision for example, we will show how to convert **complicated PyTorch pre-trained models** into OneFlow by running the following code: 56 | 57 | ```python 58 | import torchvision.models as models_torch 59 | import flowvision.models as models_flow 60 | alexnet_torch = models_torch.alexnet(pretrained=True) 61 | alexnet_flow = models_flow.alexnet() 62 | parameters = alexnet_torch.state_dict() 63 | for key, value in parameters.items(): 64 | val = value.detach().cpu().numpy() 65 | parameters[key] = val 66 | alexnet_flow.load_state_dict(parameters) 67 | ``` 68 | 69 | 70 | We can also use pre-trained models provided in flowvision by importing the following code: 71 | 72 | ```python 73 | alexnet_flow = models_flow.alexnet(pretrained=True) 74 | ``` 75 | 76 | 77 | 78 | For more information about flowvision, please visit [flowvision documentation](https://flowvision.readthedocs.io/en/latest/index.html). 79 | -------------------------------------------------------------------------------- /en/docs/cookies/zero.md: -------------------------------------------------------------------------------- 1 | # Zero Redundancy Optimizer (ZeRO) 2 | 3 | ## Introduction to ZeRO 4 | 5 | **Zero Redundancy Optimizer (ZeRO)** is a method proposed in paper [ZeRO: Memory Optimization Towards Training A Trillion Parameter Models](https://arxiv.org/pdf/1910.02054.pdf), aiming to reduce the memory usage under the data parallelism strategy. 6 | 7 | In common data parallelism strategy, each GPU independently maintains a complete set of model parameters, which is efficient in computation and communication, but inefficient in memory. This problem is especially acute when training large models. ZeRO consists of ZeRO-DP and ZeRO-R, which can effectively reduce the consumption of video memory. This means that larger models can be trained with the same amount of memory. 
It also means that it is possible to use data parallelism for large models that could only be trained with model parallelism strategies in the past. 8 | 9 | The memory consumption when training a deep learning model can be divided into two parts: 10 | 11 | 1. **Model States**. For large models, most of the memory consumption is occupied by the model state, which mainly includes three parts: Optimizer States, Gradients, and Parameters. The three parts are abbreviated as **OPG**. 12 | 13 | 2. **Residual States**. It includes activation functions, temporary buffers, and unusable memory fragments. 14 | 15 | ZeRO-DP can be divided into three stages, eliminating memory redundancy by partitioning the OPG state rather than copying it directly, and each GPU only saves part of the OPG. Specifically, ZeRO-DP has three main optimization stages, corresponding to O, P, and G respectively. The three stages increase step by step: 16 | 17 | 1. Optimizer states partition(Pos): This state is 4x less memory consumption and the same amount of traffic as data parallel. 18 | 2. Add gradients partition optimizer (Pos+g): At this stage, the memory consumption is reduced by 8 times, and the traffic is the same as the data parallel. 19 | 3. Add parameter partition optimizer (Pos+g+p): At this stage, the memory occupied by the model is evenly distributed among each GPU. Memory consumption is linearly inversely proportional to the degree of data parallel, but there will be a slight increase in traffic. 20 | 21 | The distribution of the memory consumption of the three stages can be seen in the following figure (from the original ZeRO paper Figure 1): 22 | 23 |
24 | Three Stages of ZeRO-DP Optimizations 26 |
27 | 28 | ## ZeRO Usage Example 29 | 30 | First, import OneFlow: 31 | ```python 32 | import oneflow as flow 33 | from oneflow import nn 34 | ``` 35 | 36 | ### Define the Training Process of Data Parallelism 37 | 38 | We define a training process under a data parallelism strategy, similar to that described in [Conduct data parallel training by setting SBP](../parallelism/05_ddp.md#通过设置-sbp-做数据并行训练). 39 | 40 | !!! Note 41 | ZeRO can be applied for all the cases where data parallel groups exist. For example, in 2D/3D parallel, ZeRO can be turned on as long as there is a data parallel group. 42 | 43 | After the definition, we will use placement, SBP, etc: 44 | ```python 45 | P = flow.placement("cuda", ranks=[0, 1]) 46 | B = flow.sbp.broadcast 47 | S0 = flow.sbp.split(0) 48 | DEVICE = "cuda" 49 | ``` 50 | 51 | For demonstration purposes, we define a simple model and broadcast it to the cluster: 52 | ```python 53 | model = nn.Sequential(nn.Linear(256, 128), 54 | nn.ReLU(), 55 | nn.Linear(128, 10)) 56 | model = model.to(DEVICE) 57 | model.train() 58 | model = model.to_global(placement=P, sbp=B) 59 | 60 | loss_fn = nn.CrossEntropyLoss().to(DEVICE) 61 | optimizer = flow.optim.SGD(model.parameters(), lr=1e-3) 62 | ``` 63 | 64 | ZeRO is set in the graph compiler of [nn.Graph](../basics/08_nn_graph.md), so the dynamic graph model needs to be converted to nn.Graph: 65 | 66 | ```python 67 | class CustomGraph(flow.nn.Graph): 68 | def __init__(self): 69 | super().__init__() 70 | self.model = model 71 | self.loss_fn = loss_fn 72 | self.add_optimizer(optimizer) 73 | 74 | # TODO: Set ZeRO 75 | 76 | def build(self, x, y): 77 | preds = self.model(x) 78 | loss = self.loss_fn(preds, y) 79 | loss.backward() 80 | return preds 81 | ``` 82 | 83 | Define the Training Process 84 | 85 | ```python 86 | graph_model = CustomGraph() 87 | 88 | for _ in range(100): 89 | x = flow.randn(128, 256).to(DEVICE) 90 | y = flow.ones(128, 1, dtype=flow.int64).to(DEVICE) 91 | global_x = 
x.to_global(placement=P, sbp=S0) 92 | global_y = y.to_global(placement=P, sbp=S0) 93 | 94 | graph_model(global_x, global_y) 95 | ``` 96 | 97 | Then start training through [launch Module](../parallelism/04_launch.md) 98 | 99 | ### Enable ZeRO in nn.Graph 100 | 101 | ZeRO can be enabled through the interface [config.set_zero_redundancy_optimizer_mode](https://oneflow.readthedocs.io/en/v0.8.1/generated/oneflow.nn.graph.graph_config.GraphConfig.enable_zero.html#oneflow.nn.graph.graph_config.GraphConfig.enable_zero) . 102 | 103 | #### Enable Stage 1 of ZeRO 104 | 105 | ```python 106 | class CustomGraph(flow.nn.Graph): 107 | def __init__(self): 108 | super().__init__() 109 | ... 110 | # Enable stage 1 of ZeRO 111 | self.config.enable_zero(True, stage=1) 112 | ... 113 | ``` 114 | 115 | !!! Note 116 | When using the model for continuous training and prediction: After the training is performed once, ZeRO will automatically change the SBP parameter of the model from Broadcast to Split; when performing prediction, Split will be used for automatic inference without configuring ZeRO. 117 | 118 | #### Enable Stage 2 of ZeRO 119 | 120 | ```python 121 | class CustomGraph(flow.nn.Graph): 122 | def __init__(self): 123 | super().__init__() 124 | ... 125 | # Enable stage 2 of ZeRO 126 | self.config.enable_zero(True, stage=2) 127 | ... 128 | ``` 129 | 130 | Generally speaking, the optimization of stage 2 has large optimization of memory and small speed impact, so it is recommended to use stage 2 optimization. It can be enabled in a simpler way: 131 | 132 | ```python 133 | class CustomGraph(flow.nn.Graph): 134 | def __init__(self): 135 | super().__init__() 136 | ... 137 | # Enable stage 2 of ZeRO 138 | self.config.enable_zero() 139 | ... 140 | ``` 141 | 142 | #### Enable Stage 3 of ZeRO 143 | 144 | ```python 145 | class CustomGraph(flow.nn.Graph): 146 | def __init__(self): 147 | super().__init__() 148 | ... 
149 | # Enable stage 3 of ZeRO 150 | self.config.enable_zero(True, stage=3) 151 | ... 152 | ``` 153 | 154 | Although enabling the third stage can minimize the memory consumption, it will increase the communication cost which will bring lower speed. -------------------------------------------------------------------------------- /en/docs/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: OneFlow -- A Whole New Deep Learning Framework 3 | template: templates/home.html 4 | --- 5 | -------------------------------------------------------------------------------- /en/docs/javascripts/config.js: -------------------------------------------------------------------------------- 1 | window.MathJax = { 2 | tex: { 3 | inlineMath: [["\\(", "\\)"]], 4 | displayMath: [["\\[", "\\]"]], 5 | processEscapes: true, 6 | processEnvironments: true 7 | }, 8 | options: { 9 | ignoreHtmlClass: ".*|", 10 | processHtmlClass: "arithmatex" 11 | } 12 | }; 13 | 14 | document$.subscribe(() => { 15 | MathJax.typesetPromise() 16 | }) 17 | -------------------------------------------------------------------------------- /en/docs/parallelism/01_introduction.md: -------------------------------------------------------------------------------- 1 | # COMMON DISTRIBUTED PARALLEL STRATEGY 2 | 3 | ## Why Distributed Training is Prevailing 4 | 5 | In recent years, deep learning has been widely used in various fields, including computer vision, language understanding, speech recognition, advertising recommendation and so on. A common feature in these different areas is the growing size of models, such as the GPT-3 model, which has 175 billion parameters. Even with 1,024 of 80 GB A100 cards, the full GPT-3 training time would take a month. 6 | 7 | The enlargement of model scale requires the development of hardware (computing power, memory). 
However, because of the existence of memory walls, the computational power and capacity of a single device, limited by the laws of physics, it is increasingly difficult to continuously improve the integration of chips and to keep up with the demands of model expansion. 8 | 9 | In order to solve the problem of insufficient increase speed of computing power, it is necessary to use multi-node cluster for distributed training in order to improve computing speed. 10 | 11 | ## Common Parallel Strategies 12 | 13 | Simply stacking machines does not increase computing power necessarily. Because the training of neural networks can not be simply "divide the work done by one device to multiple ones". It requires not only multiple devices to perform computing, but also data transmission between devices, only by coordinating the computing and communication in the cluster, can we do efficient distributed training. 14 | 15 | We will explain the difference between data parallelism and model parallelism with an example of matrix multiplication. 16 | 17 | Let's look at the following logical matrix multiplication examples: 18 | 19 | If a layer in the neural network is doing matrix multiplication, where the shape of the input $x$ is $4\times5$ and the shape of the model parameter $w$ is $5\times8$, then the matrix multiplication output shape is $4\times8$. The schematic diagram is as follows: 20 | 21 | ![matmul](./imgs/matmul_logical.png) 22 | 23 | In the single machine single card training situaiton, the above matrix multiplication first calculates $out$, passes $out$ to the next layer, and finally calculates $loss$, and then in the backpropagation process, gets $\frac{\partial loss}{\partial w}$, which then be used to update $w$. 24 | 25 | In distributed training, there are "**Data Parallelism**" and "**Model Parallelism**" strategies depending on whether $x$ or $w$ is partitioned. In the next section, we will introduce common strategies for parallelism. 
26 | 27 | 28 | ### Data Parallelism 29 | 30 | Data parallelism slices $x$, while the model parameter $w$ on each device is complete and consistent. As shown in the figure below, $x$ is split evenly into two devices by dimension 0, each with a full $w$. 31 | 32 | In this way, the output on each device is only half the logical output, which shape is $2\times8$. The output on both devices combind together to produce the logically complete output. 33 | 34 | ![Data Paralelism](./imgs/matmul_data_paralelism.png) 35 | 36 | Note that because the data is distributed to two devices, the backpropagation process will get different values for $\frac{\partial loss}{\partial w}$, if the models are updated directly using the gradients on each device, it would cause the models on the two devices to be inconsistent, and the training would be meaningless(Which model should be used?). 37 | 38 | Therefore, in the process of backpropagation under data parallelism strategy, the gradients on each device should do [AllReduce](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/collectives.html#allreduce) before use, which ensures the model on each device is always consistent. 39 | 40 | When the dataset is large and the model is small, and the communication cost for the gradients synchronization is small in the backpropagation process, so it is more advantageous to choose data parallelism in this situation. For example, the common vision classification model, such as ResNet50, is more suitable to use data parallelism strategy. 41 | 42 | 43 | ### Model Parallelism 44 | 45 | When the neural network is very large, the cost of gradients synchronization will be very high, moreover, the network may be too large to be stored in a single computing device, then the model parallelism strategy can be used. 
46 | 47 | The so-called model parallelism is that the data on each device is complete and consistent, while the model $w$ is split into different devices, each device only has a part of the model, all the parts of model on the computing device put together forms the complete model. 48 | 49 | 50 | As shown in the figure below, $w$ is split evenly into two devices by the first dimension, each with a full $x$. The output on both devices also needs to be combined together to get the logical output. 51 | 52 | ![Model Parallelism](./imgs/matmul_model_paralelism.png) 53 | 54 | 55 | The benefit of model parallelism is that it eliminates the gradient AllReduce between multiple devices. However, since each device requires complete input data, the input is broadcasted among multiple devices with some communication cost. For example, the $out~(4\times8)$ shown above needs to be broadcast to both devices if it is the input of the next layer. 56 | 57 | Language models, such as BERT, often use model parallelism. 58 | 59 | 60 | ### Pipelining Parallelism 61 | 62 | When the neural network is too large to be stored on a single device, in addition to the above parallel strategies, we can also choose pipelining parallel strategy. Pipelining parallelism divides the network into stages and places it to different computing devices, each of which completes the training in a "relay" manner. 63 | 64 | The figure below shows how to run with pipelining parallelism with a logical four-layer network (`T1` to `T4`). 65 | 66 | The four-layer network is divided into two computing devices, so that the `T1` and `T2` are placed to `GPU0` and `T3` and `T4` are placed to `GPU1`. 67 | 68 | After computing the first two layers on `GPU0`, its output is treated as the input of `GPU1` to continue computation of the next two layers. 69 | 70 | 
71 | Relay 72 |
73 | 74 | ### Hybrid Parallelism 75 | 76 | You can also mix with a variety of parallelism strategies when training a network, take GPT-3 as an example, the parallelism strategy for training could be like this: 77 | 78 | This large GPT network is partitioned into 64 stages, with each stage running on 6 DGX-A100s. The workload among the 6 machines is trained with data parallelism, while the workload among GPUs inside each machine is trained with model parallelism. The 3072 A100s in the entire cluster are divided into a matrix of $6\times8\times64$, and then train the model using data parallelism, model parallelism and pipeline parallelism simultaneously. 79 | 80 | ![gpt-3](./imgs/gpt3-overview.png) 81 | 82 | The choice of parallelism strategy affects the efficiency of training. Whether the interface of framework supports parallelism well also determines the efficiency of algorithm engineer. OneFlow's system-level design and innovation for distributed training will help users to get comfortable well with distributed training. The related examples will be shown in other articles on this topic. 83 | -------------------------------------------------------------------------------- /en/docs/parallelism/04_launch.md: -------------------------------------------------------------------------------- 1 | # DISTRIBUTED TRAINING LAUNCHER 2 | 3 | OneFlow provides the `oneflow.distributed.launch` module to help users start distributed training more conveniently. 4 | 5 | Users can start distributed training by the following commands: 6 | 7 | ```shell 8 | python3 -m oneflow.distributed.launch [Boot Option] training_script.py 9 | ``` 10 | 11 | For example, to start the training on single-node double-GPUs: 12 | 13 | ```shell 14 | python3 -m oneflow.distributed.launch --nproc_per_node 2 ./script.py 15 | ``` 16 | 17 | For another example, start two machines, and each machine has two graphics for training. 
18 | 19 | Run on machine 0: 20 | 21 | ```shell 22 | python3 -m oneflow.distributed.launch --nproc_per_node=2 \ 23 | --nnodes=2 \ 24 | --node_rank=0 \ 26 | --master_addr="192.168.1.1" \ 27 | --master_port=7788 \ 28 | script.py 29 | ``` 30 | 31 | Run on machine 1: 32 | 33 | ```shell 34 | python3 -m oneflow.distributed.launch --nproc_per_node=2 \ 35 | --nnodes=2 \ 36 | --node_rank=1 \ 38 | --master_addr="192.168.1.1" \ 39 | --master_port=7788 \ 40 | script.py 41 | ``` 42 | 43 | ## Description of Common Options 44 | 45 | We can view the description of the options of the `launch` module after running `python3 -m oneflow.distributed.launch -h`. The following are some common options: 46 | 47 | - `--nnodes`: number of nodes 48 | - `--node_rank`: the serial number of the machines, starting from 0 49 | - `--nproc_per_node`: The number of processes per node to be started on each machine, which is recommended to be equal to the number of GPUs 50 | - `--logdir`: The relative storage path of the child process log 51 | 52 | ## The Relationship between Launch Module and Parallel Strategy 53 | 54 | The main function of `oneflow.distributed.launch` is to allow users to start distributed training more conveniently after they complete the distributed program. It saves the trouble of configuring [environment variables](./03_consistent_tensor.md#_5) in the cluster. 55 | 56 | But `oneflow.distributed.launch` **does not determine** [Parallel Strategy](./01_introduction.md). The Parallel Strategy is determined by the setup of the distribution method of data and the model, and the placement of those on the physical devices. 57 | 58 | OneFlow provides [Global View](./02_sbp.md) and [Global Tensor](./03_consistent_tensor.md) to flexibly configure parallel strategies. 
And for data parallelism, OneFlow provides the [DistributedDataParallel](./05_ddp.md) module, which can change the single-node single-GPU script to the script of data parallel with minimal code modification. 59 | -------------------------------------------------------------------------------- /en/docs/parallelism/imgs/2d-sbp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/oneflow-documentation/d4dc26f4de1aac396afb4d59b9de9f78f91871d5/en/docs/parallelism/imgs/2d-sbp.png -------------------------------------------------------------------------------- /en/docs/parallelism/imgs/boxing_s2b.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/oneflow-documentation/d4dc26f4de1aac396afb4d59b9de9f78f91871d5/en/docs/parallelism/imgs/boxing_s2b.png -------------------------------------------------------------------------------- /en/docs/parallelism/imgs/consistent-view.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/oneflow-documentation/d4dc26f4de1aac396afb4d59b9de9f78f91871d5/en/docs/parallelism/imgs/consistent-view.png -------------------------------------------------------------------------------- /en/docs/parallelism/imgs/gpt3-overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/oneflow-documentation/d4dc26f4de1aac396afb4d59b9de9f78f91871d5/en/docs/parallelism/imgs/gpt3-overview.png -------------------------------------------------------------------------------- /en/docs/parallelism/imgs/matmul_data_paralelism.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Oneflow-Inc/oneflow-documentation/d4dc26f4de1aac396afb4d59b9de9f78f91871d5/en/docs/parallelism/imgs/matmul_data_paralelism.png -------------------------------------------------------------------------------- /en/docs/parallelism/imgs/matmul_logical.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/oneflow-documentation/d4dc26f4de1aac396afb4d59b9de9f78f91871d5/en/docs/parallelism/imgs/matmul_logical.png -------------------------------------------------------------------------------- /en/docs/parallelism/imgs/matmul_model_paralelism.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/oneflow-documentation/d4dc26f4de1aac396afb4d59b9de9f78f91871d5/en/docs/parallelism/imgs/matmul_model_paralelism.png -------------------------------------------------------------------------------- /en/docs/parallelism/imgs/multi-matmul.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/oneflow-documentation/d4dc26f4de1aac396afb4d59b9de9f78f91871d5/en/docs/parallelism/imgs/multi-matmul.png -------------------------------------------------------------------------------- /en/docs/parallelism/imgs/realy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/oneflow-documentation/d4dc26f4de1aac396afb4d59b9de9f78f91871d5/en/docs/parallelism/imgs/realy.png -------------------------------------------------------------------------------- /en/docs/parallelism/imgs/sbp-example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/oneflow-documentation/d4dc26f4de1aac396afb4d59b9de9f78f91871d5/en/docs/parallelism/imgs/sbp-example.png 
-------------------------------------------------------------------------------- /en/docs/parallelism/imgs/sbp_translation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/oneflow-documentation/d4dc26f4de1aac396afb4d59b9de9f78f91871d5/en/docs/parallelism/imgs/sbp_translation.png -------------------------------------------------------------------------------- /en/mkdocs.yml: -------------------------------------------------------------------------------- 1 | 2 | site_name: OneFlow 3 | site_url: https://docs.oneflow.org/ 4 | site_description: >- 5 | OneFlow: a efficient distributed deep learning framework. 6 | # Repository 7 | repo_name: OneFlow 8 | repo_url: https://github.com/OneFlow-Inc/oneflow-documentation 9 | edit_uri: "blob/master/en/docs/" 10 | 11 | # Copyright 12 | copyright: Copyright © 2017 - 2021 OneFlow 13 | 14 | #website/xxx.html style 15 | use_directory_urls: false 16 | 17 | # Configuration 18 | theme: 19 | name: material 20 | custom_dir: 'overrides/' 21 | 22 | # 404 page 23 | static_templates: 24 | - 404.html 25 | 26 | # Don't include MkDocs' JavaScript 27 | include_search_page: false 28 | search_index_only: true 29 | 30 | # Default values, taken from mkdocs_theme.yml 31 | language: en 32 | features: 33 | - navigation.tabs 34 | - navigation.top 35 | #- instant 36 | palette: 37 | scheme: default 38 | primary: indigo 39 | accent: indigo 40 | font: 41 | text: Roboto 42 | code: Roboto Mono 43 | icon: 44 | logo: logo 45 | favicon: assets/favicon.png 46 | 47 | # Plugins 48 | plugins: 49 | - search 50 | - redirects: 51 | redirect_maps: 52 | 'parallelism/03_global_tensor.md': 'parallelism/03_consistent_tensor.md' 53 | 54 | extra: 55 | version: 56 | provider: mike 57 | alternate: 58 | # Switch to Chinese 59 | - name: 中文 60 | link: https://docs.oneflow.org 61 | lang: zh 62 | 63 | # Switch to English 64 | - name: English 65 | link: https://docs.oneflow.org/en 66 | lang: en 67 | # 
disqus: livere_indeed # livere 英文版带了广告 68 | 69 | # Extensions 70 | markdown_extensions: 71 | - markdown.extensions.admonition 72 | - markdown.extensions.attr_list 73 | - markdown.extensions.codehilite: 74 | guess_lang: false 75 | - markdown.extensions.def_list 76 | - markdown.extensions.footnotes 77 | - markdown.extensions.meta 78 | - markdown.extensions.toc: 79 | permalink: true 80 | - pymdownx.arithmatex: 81 | generic: true 82 | - pymdownx.betterem: 83 | smart_enable: all 84 | - pymdownx.caret 85 | - pymdownx.critic 86 | - pymdownx.details 87 | - pymdownx.emoji: 88 | emoji_index: !!python/name:materialx.emoji.twemoji 89 | emoji_generator: !!python/name:materialx.emoji.to_svg 90 | # - pymdownx.highlight: 91 | # linenums_style: pymdownx-inline 92 | - pymdownx.inlinehilite 93 | - pymdownx.keys 94 | - pymdownx.magiclink: 95 | repo_url_shorthand: true 96 | user: squidfunk 97 | repo: mkdocs-material 98 | - pymdownx.mark 99 | - pymdownx.smartsymbols 100 | - pymdownx.snippets: 101 | check_paths: true 102 | - pymdownx.superfences 103 | - pymdownx.tabbed 104 | - pymdownx.tasklist: 105 | custom_checkbox: true 106 | - pymdownx.tilde 107 | 108 | extra_javascript: 109 | - javascripts/config.js 110 | - https://polyfill.io/v3/polyfill.min.js?features=es6 111 | - https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js 112 | 113 | nav: 114 | - Home: index.md 115 | 116 | - Basics: 117 | - Quickstart: basics/01_quickstart.md 118 | - Tensor: basics/02_tensor.md 119 | - Datesets & Dataloaders: basics/03_dataset_dataloader.md 120 | - Build Neural Network: basics/04_build_network.md 121 | - Autograd: basics/05_autograd.md 122 | - Backpropagation and Optimizer: basics/06_optimization.md 123 | - Model saving and loading: basics/07_model_load_save.md 124 | - Static Graph Interface: basics/08_nn_graph.md 125 | 126 | - Distributed Training: 127 | - Common Parallel Strategies: parallelism/01_introduction.md 128 | - Global View: parallelism/02_sbp.md 129 | - Global Tensor: 
parallelism/03_consistent_tensor.md 130 | - 2D SBP: parallelism/04_2d-sbp.md 131 | - Distributed Training Launcher: parallelism/04_launch.md 132 | - Data Parallelism Training: parallelism/05_ddp.md 133 | - Pipelining Parallelism: parallelism/06_pipeline.md 134 | 135 | - Cookbook: 136 | - Basic Operations of Distributed Programming with Global Tensor: cookies/global_tensor.md 137 | - Distributed Parallelism Strategies of Distributed Programming with Global Tensor: cookies/global_tensor_distributed.md 138 | - OneFlow with ONNX: cookies/oneflow2onnnx.md 139 | - Model Deployment: cookies/serving.md 140 | - Automatic Mixed Precision Training: cookies/amp.md 141 | - Activation Checkpointing: cookies/activation_checkpointing.md 142 | - Converting Pre-trained Model from PyTorch to OneFlow: cookies/torch2flow.md 143 | - Transfer Learning in Computer Vision: cookies/transfer_learning.md 144 | - Large-Scale Embedding Solution OneEmbedding: cookies/one_embedding.md 145 | - Zero Redundancy Optimizer (ZeRO): cookies/zero.md 146 | - OneFlow's Distributed Saving and Loading of Large Models: cookies/save_load.md 147 | - Oneflow is compatible with PyTorch: cookies/oneflow_torch.md 148 | - API: 149 | - API: https://oneflow.readthedocs.io/en/master/ 150 | -------------------------------------------------------------------------------- /en/overrides/partials/integrations/disqus.html: -------------------------------------------------------------------------------- 1 | {#- 2 | This file was automatically generated - do not edit 3 | -#} 4 | {% set disqus = config.extra.disqus %} 5 | {% if page and page.meta and page.meta.disqus is string %} 6 | {% set disqus = page.meta.disqus %} 7 | {% endif %} 8 | {% if not page.is_homepage and disqus %} 9 | 10 |
11 | 24 | 25 |
26 | 27 | {% endif %} 28 | -------------------------------------------------------------------------------- /en/overrides/partials/source.html: -------------------------------------------------------------------------------- 1 | {#- 2 | This file was automatically generated - do not edit 3 | -#} 4 | {% import "partials/language.html" as lang with context %} 5 | 7 |
8 | {% set icon = config.theme.icon.repo or "fontawesome/brands/git-alt" %} 9 | {% include ".icons/" ~ icon ~ ".svg" %} 10 |
11 |
12 | {{ config.repo_name }} 13 |
14 |
15 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | mkdocs==1.2.3 2 | mkdocs-material==7.1.11 3 | mkdocs-material-extensions==1.0.1 4 | mike==1.0.1 5 | Jinja2==3.0.2 6 | mkdocs-redirects==1.0.4 7 | Pygments==2.13.0 -------------------------------------------------------------------------------- /scripts/ci-requirements.txt: -------------------------------------------------------------------------------- 1 | Markdown==3.3.4 2 | PyYAML==6.0 3 | flowvision==0.1.0 4 | matplotlib==3.5.2 5 | oneflow-onnx==0.5.4 6 | onnx==1.9.0 7 | onnxruntime-gpu==1.11.1 8 | -------------------------------------------------------------------------------- /scripts/markdown_ci/README.md: -------------------------------------------------------------------------------- 1 | # Run Python Code of Markdown Files 2 | 3 | 本目录下的程序,可以读取 YAML 文件中的配置,提取并运行 markdown 文件中的 Python 代码。 4 | 5 | - [配置 YAML 文件](#配置-YAML-文件) 6 | - [读取 YAML 中的设置并运行](#读取-YAML-中的设置并运行) 7 | - [读取所有 YAML 中的设置并运行](#读取所有-YAML-中的设置并运行) 8 | - [测试前修正 markdown 中的代码](#测试前修正-markdown-中的代码) 9 | - [run_by_yamls.py 选项说明](#run_by_yamlspy-选项说明) 10 | - [如何查看错误信息](#如何查看错误信息) 11 | 12 | ## 配置 YAML 文件 13 | 14 | 配置文件(YAML 格式)放置在 [configs](./configs/) 目录下。 15 | 16 | 以 [basics_02_tensor.yml](./configs/basics_02_tensor.yml) 为例,查看其中的设置项: 17 | 18 | ```yaml 19 | - file_path: cn/docs/basics/02_tensor.md 20 | run: 21 | - all 22 | ``` 23 | 24 | `file_path` 指定 markdown 文件(相对于本仓库)的路径。 25 | 26 | `run` 指定需要运行的 Python 代码块。所谓 “Python 代码块”,指的是 markdown 中以 \`\`\`python 开头的代码块。“all” 表示依次运行其中所有的 Python 代码块。 27 | 28 | `run` 的值也可以是一个代码块序号组成的 list,如: 29 | 30 | ```yaml 31 | - file_path: cn/docs/basics/02_tensor.md 32 | run: 33 | - [0, 1, 2] 34 | ``` 35 | 36 | 表示只运行 `02_tensor.md` 文件中的前 3 个 Python 代码块。 37 | 38 | 通过 `run_by_yamls.py` 的 `--markdown` 选项,可以查看一个 markdown 文件中的所有 Python 代码块及其序号。 39 | 40 | ```shell 41 | python3 run_by_yamls.py 
--markdown ../../cn/docs/basics/04_build_network.md 42 | ``` 43 | 44 | 以上命令输出: 45 | 46 | ```text 47 | =============CODE 0============= 48 | import oneflow as flow 49 | import oneflow.nn as nn 50 | 51 | 52 | =============CODE 1============= 53 | class NeuralNetwork(nn.Module): 54 | def __init__(self): 55 | super(NeuralNetwork, self).__init__() 56 | 57 | ... 58 | 59 | =============CODE 5============= 60 | class MySeqModel(nn.Module): 61 | def __init__(self): 62 | super(MySeqModel, self).__init__() 63 | self.seq = nn.Sequential( 64 | nn.Conv2d(1,20,5), 65 | nn.ReLU(), 66 | nn.Conv2d(20,64,5), 67 | nn.ReLU() 68 | ) 69 | 70 | def forward(self, x): 71 | return self.seq(x) 72 | ``` 73 | 74 | 一个 markdown 文件中,在运行时可以运行多组(次)测试。如: 75 | 76 | ```yaml 77 | - file_path: cn/docs/cookies/transfer_learning.md 78 | run: 79 | - [0, 1, 2, 3, 4, 5, 6, 7, 8, 11] 80 | - [0, 1, 2, 3, 4, 5, 6, 7, 9, 11] 81 | - [0, 1, 2, 3, 4, 5, 6, 7, 10, 11] 82 | ``` 83 | 84 | 以上配置,针对 `transfer_learning.md` 文件,会运行 3 组 Python 代码。 85 | 86 | 一个 YAML 文件中,可以针对任意多个 markdown 文件进行配置,如: 87 | 88 | ```yaml 89 | - file_path: cn/docs/basics/02_tensor.md 90 | run: 91 | - all 92 | 93 | - file_path: en/docs/basics/02_tensor.md 94 | run: 95 | - all 96 | ``` 97 | 98 | 99 | ## 读取 YAML 中的设置并运行 100 | 101 | `run_by_yamls.py` 的 `--yaml` 选项,可以读取指定的 YAML,提取对应 markdown 文件中的代码,并运行。如: 102 | 103 | ```shell 104 | python3 run_by_yamls.py --yaml ./configs/basics_02_tensor.yml 105 | ``` 106 | 107 | ## 读取所有 YAML 中的设置并运行 108 | 109 | 如果 `run_by_yamls.py` 运行时不带任何选项,则表示读取本仓库 `scripts/markdown_ci/configs/` 目录下的所有 YAML 文件,并运行对应 markdown 文件中的代码。 110 | 111 | ```bash 112 | python3 run_by_yamls.py 113 | ``` 114 | 115 | 116 | ## 测试前修正 markdown 中的代码 117 | 118 | 每一个 markdown 文件,还可以配置一个 `hook` 项,如: 119 | 120 | ```yaml 121 | - file_path: cn/docs/basics/01_quickstart.md 122 | run: 123 | - all 124 | hook: | # hook(index, codetext) 125 | if index == 8: 126 | code = code.replace("epochs = 5", "epochs = 1") 127 | return code 128 | ``` 129 | 130 | `hook` 
中的值为回调函数的实现代码,该函数会接受 `index` 和 `code` 两个参数,分别指代码块的序号和代码块的内容,该函数的返回值,将被当作最终运行的代码块内容。 131 | 如以上的配置中,将 8 号代码块中的 `epochs = 5` 改成 `epochs = 1`,用以减少运行时间。 132 | 133 | ## run_by_yamls.py 选项说明 134 | 135 | ```shell 136 | python run_by_yamls.py -h 137 | ``` 138 | 139 | ```text 140 | usage: run_by_yamls.py [-h] [--markdown MARKDOWN] [--output OUTPUT] 141 | [--yaml YAML] [--configs CONFIGS] 142 | 143 | read config yaml files and run realted code 144 | 145 | optional arguments: 146 | -h, --help show this help message and exit 147 | --markdown MARKDOWN the input markdown file 148 | --output OUTPUT if not None, output will be written to the path 149 | --yaml YAML the path of yaml file. eg: ./sample.yaml 150 | --configs CONFIGS config dir where yaml files exists, markdown_ci/configs 151 | by default. 152 | ``` 153 | 154 | - `--output` 配合 `--markdown` 使用,如果指定了 `--output`,则提取的 Python 代码块内容,会重定向到文件。 155 | - `--config` 用于指定存放 YAML 文件的路径,方便测试。 156 | 157 | ## 如何查看错误信息 158 | 159 | 当运行报错时,会打印出错误信息: 160 | 161 | ```text 162 | ====RUN CODE IN MARKDOWN====: python3 run_markdown_codes.py --markdown_file /workspace/oneflow-documentation/cn/docs/basics/04_build_network.md --index all 163 | ... 164 | ****EXEC ERROR**** 165 | markdown file: /workspace/oneflow-documentation/cn/docs/basics/04_build_network.md 166 | codeblock index: 2 167 | Code:b'X = flow.ones(1, 28, 28)\nlogits = net(X)\npred_probab = nn.Softmax(dim=1)(logits)\ny_pred = pred_probab.argmax(1)\nprint(f"Predicted class: {y_pred}")\n' 168 | 169 | Traceback (most recent call last): 170 | File "run_markdown_codes.py", line 21, in run_block_item 171 | exec(code, globals(), globals()) 172 | File "", line 1, in 173 | NameError: name 'flow' is not defined 174 | 175 | During handling of the above exception, another exception occurred: 176 | 177 | ... 
178 | ``` 179 | 180 | 其中 `====RUN CODE IN MARKDOWN====` 告知了正在提取并运行哪个 markdown 文件。 181 | 182 | `****EXEC ERROR****` 告知出错代码块的序号(`codeblock index: 2`),代码块的内容 `Code: ...`。 183 | -------------------------------------------------------------------------------- /scripts/markdown_ci/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/oneflow-documentation/d4dc26f4de1aac396afb4d59b9de9f78f91871d5/scripts/markdown_ci/__init__.py -------------------------------------------------------------------------------- /scripts/markdown_ci/configs/basics_01_quick_start.yml: -------------------------------------------------------------------------------- 1 | - file_path: cn/docs/basics/01_quickstart.md 2 | run: 3 | - all 4 | hook: | # hook(index, codetext) 5 | if index == 6: 6 | code = code.replace('print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]")', 7 | 'print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]");break') 8 | if index == 8: 9 | code = code.replace("epochs = 5", "epochs = 1") 10 | return code 11 | 12 | - file_path: en/docs/basics/01_quickstart.md 13 | run: 14 | - all 15 | hook: | # hook(index, codetext) 16 | if index == 6: 17 | code = code.replace('print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]")', 18 | 'print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]");break') 19 | if index == 8: 20 | code = code.replace("epochs = 5", "epochs = 1") 21 | return code 22 | -------------------------------------------------------------------------------- /scripts/markdown_ci/configs/basics_02_tensor.yml: -------------------------------------------------------------------------------- 1 | - file_path: cn/docs/basics/02_tensor.md 2 | run: 3 | - all 4 | 5 | - file_path: en/docs/basics/02_tensor.md 6 | run: 7 | - all 8 | -------------------------------------------------------------------------------- /scripts/markdown_ci/configs/basics_03_dataset.yml: 
-------------------------------------------------------------------------------- 1 | - file_path: cn/docs/basics/03_dataset_dataloader.md 2 | run: 3 | - [0, 1, 2, 3, 4, 5] 4 | 5 | - file_path: en/docs/basics/03_dataset_dataloader.md 6 | run: 7 | - [0, 1, 2, 3, 4, 5] 8 | 9 | # REMOVED CODE BLOCKS 06, A HUGE LOOP: 10 | # for x, label in train_dataloader: 11 | # print(x.shape, label.shape) 12 | # # training... 13 | -------------------------------------------------------------------------------- /scripts/markdown_ci/configs/basics_04_build_network.yml: -------------------------------------------------------------------------------- 1 | - file_path: cn/docs/basics/04_build_network.md 2 | run: 3 | - all 4 | 5 | - file_path: en/docs/basics/04_build_network.md 6 | run: 7 | - all 8 | -------------------------------------------------------------------------------- /scripts/markdown_ci/configs/basics_05_autograd.yml: -------------------------------------------------------------------------------- 1 | - file_path: cn/docs/basics/05_autograd.md 2 | run: 3 | - [0, 1, 2, 4, 5, 6, 7, 9, 10] 4 | 5 | - file_path: en/docs/basics/05_autograd.md 6 | run: 7 | - [0, 1, 2, 4, 5, 6, 7, 9, 10] 8 | 9 | # REMOVE CODE BLOCK 3, 8. 
WRONG EXAMPLE: 10 | 11 | # n1 = flow.tensor(10., requires_grad=True) 12 | # n2 = flow.pow(n1, 2) 13 | # n2.backward() 14 | # n2.backward() 15 | 16 | # x = flow.randn(1, 2, requires_grad=True) 17 | # y = 3*x + 1 18 | # y.backward() 19 | -------------------------------------------------------------------------------- /scripts/markdown_ci/configs/basics_06_optimization.yml: -------------------------------------------------------------------------------- 1 | - file_path: cn/docs/basics/06_optimization.md 2 | run: 3 | - all 4 | 5 | - file_path: en/docs/basics/06_optimization.md 6 | run: 7 | - all 8 | -------------------------------------------------------------------------------- /scripts/markdown_ci/configs/basics_07_save_load.yml: -------------------------------------------------------------------------------- 1 | - file_path: cn/docs/basics/07_model_load_save.md 2 | run: 3 | - all 4 | 5 | - file_path: en/docs/basics/07_model_load_save.md 6 | run: 7 | - all 8 | -------------------------------------------------------------------------------- /scripts/markdown_ci/configs/basics_08_nn_graph.yml: -------------------------------------------------------------------------------- 1 | - file_path: cn/docs/basics/08_nn_graph.md 2 | run: 3 | - [0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 15] 4 | hook: | # hook(index, code) 5 | if index == 0: 6 | code = code.replace('print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]")', 7 | 'print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]");break') 8 | if index == 5: 9 | code = code.replace('print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]")', 10 | 'print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]");break') 11 | return code 12 | 13 | - file_path: en/docs/basics/08_nn_graph.md 14 | run: 15 | - [0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 13, 14, 15] 16 | hook: | # hook(index, code) 17 | if index == 0: 18 | code = code.replace('print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]")', 19 | 'print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]");break') 20 | 
if index == 5: 21 | code = code.replace('print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]")', 22 | 'print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]");break') 23 | return code 24 | 25 | # REMOVED CODE BLOCKS: 26 | 27 | # 12 28 | # state_dict = flow.load(CHECKPOINT_SAVE_DIR) 29 | # graph_mobile_net_v2.load_state_dict(state_dict) 30 | # Because: nn.Graph's state dict can only be loaded before the first call of a graph. 31 | -------------------------------------------------------------------------------- /scripts/markdown_ci/configs/guide_activation_checkpointing.yml: -------------------------------------------------------------------------------- 1 | - file_path: cn/docs/cookies/activation_checkpointing.md 2 | run: 3 | - all 4 | 5 | - file_path: en/docs/cookies/activation_checkpointing.md 6 | run: 7 | - all 8 | -------------------------------------------------------------------------------- /scripts/markdown_ci/configs/guide_amp.yml: -------------------------------------------------------------------------------- 1 | - file_path: cn/docs/cookies/amp.md 2 | run: 3 | - [0, 1, 2] 4 | 5 | - file_path: en/docs/cookies/amp.md 6 | run: 7 | - [0, 1, 2] 8 | 9 | # CODE BLOCK 3 REMOVED 10 | # It's pseudo code 11 | 12 | # grad_scaler = flow.amp.GradScaler( 13 | # init_scale=3000, 14 | # growth_factor=2.0, 15 | # backoff_factor=0.5, 16 | # growth_interval=1000, 17 | # ) 18 | # self.set_grad_scaler(grad_scaler) 19 | -------------------------------------------------------------------------------- /scripts/markdown_ci/configs/guide_lr_scale.yml: -------------------------------------------------------------------------------- 1 | - file_path: cn/docs/cookies/lr_scale.md 2 | run: 3 | - all 4 | 5 | -------------------------------------------------------------------------------- /scripts/markdown_ci/configs/guide_one_embedding.yml: -------------------------------------------------------------------------------- 1 | - file_path: cn/docs/cookies/one_embedding.md 2 | run: 3 | - [0, 
1, 2, 3, 4, 5] 4 | hook: | # hook(index, code) 5 | if index == 2: 6 | code = code.replace("capacity=40000000", "capacity=40") 7 | code = code.replace("cache_budget_mb=8142", "cache_budget_mb=32") 8 | return code 9 | 10 | - file_path: en/docs/cookies/one_embedding.md 11 | run: 12 | - [0, 1, 2, 3, 4, 5] 13 | hook: | # hook(index, code) 14 | if index == 2: 15 | code = code.replace("capacity=40000000", "capacity=40") 16 | code = code.replace("cache_budget_mb=8142", "cache_budget_mb=32") 17 | return code 18 | -------------------------------------------------------------------------------- /scripts/markdown_ci/configs/guide_oneflow2onnx.yml: -------------------------------------------------------------------------------- 1 | - file_path: cn/docs/cookies/oneflow2onnnx.md 2 | run: 3 | - [2, 3, 4, 5, 6] 4 | 5 | - file_path: en/docs/cookies/oneflow2onnnx.md 6 | run: 7 | - [2, 3, 4, 5, 6] 8 | 9 | #REMOVED BLOCKS 10 | 11 | # BLOCK 0, 1: pseduo code 12 | # =============CODE 0============= 13 | # from oneflow_onnx.oneflow2onnx.util import export_onnx_model 14 | 15 | # export_onnx_model(graph, 16 | # external_data=False, 17 | # opset=None, 18 | # flow_weight_dir=None, 19 | # onnx_model_path="/tmp", 20 | # dynamic_batch_size=False) 21 | 22 | 23 | # =============CODE 1============= 24 | # from oneflow_onnx.oneflow2onnx.util import convert_to_onnx_and_check 25 | 26 | # convert_to_onnx_and_check(...) 
27 | 28 | # BLOCK 7: some extra files are needed 29 | # =============CODE 7============= 30 | # # 从文件中读取 ImageNet 数据集的类别名称 31 | # with open('ImageNet-Class-Names.txt') as f: 32 | # CLASS_NAMES = f.readlines() 33 | 34 | # # 读取图像文件并使用 `preprocess_image` 函数进行预处理 35 | # img = cv2.imread('cat.jpg', cv2.IMREAD_COLOR) 36 | # img = preprocess_image(img) 37 | 38 | # # 创建一个 InferenceSession 对象 39 | # ort_sess = InferenceSession('model.onnx', providers=['TensorrtExecutionProvider', 40 | # 'CUDAExecutionProvider', 41 | # 'CPUExecutionProvider']) 42 | # # 调用 InferenceSession 对象的 `run` 方法进行推理 43 | # results = ort_sess.run(None, {"_ResNet34Graph_0-input_0/out": img}) 44 | 45 | # # 输出推理结果 46 | # print(CLASS_NAMES[np.argmax(results[0])]) 47 | -------------------------------------------------------------------------------- /scripts/markdown_ci/configs/guide_torch2flow.yml: -------------------------------------------------------------------------------- 1 | - file_path: cn/docs/cookies/torch2flow.md 2 | run: 3 | - all 4 | 5 | - file_path: en/docs/cookies/torch2flow.md 6 | run: 7 | - all 8 | -------------------------------------------------------------------------------- /scripts/markdown_ci/configs/guide_transfer_learning.yml: -------------------------------------------------------------------------------- 1 | - file_path: cn/docs/cookies/transfer_learning.md 2 | run: 3 | - [0, 1, 2, 3, 4, 5, 6, 7, 8, 11] 4 | - [0, 1, 2, 3, 4, 5, 6, 7, 9, 11] 5 | - [0, 1, 2, 3, 4, 5, 6, 7, 10, 11] 6 | hook: | # hook(index, code) 7 | if index == 1: 8 | code = code.replace("NUM_EPOCHS = 3", "NUM_EPOCHS = 1") 9 | if index == 6: 10 | code = code.replace("print(f'loss: {loss:>7f} [epoch: {epoch} {batch * BATCH_SIZE:>5d}/{dataset_size:>5d}]')", 11 | "print(f'loss: {loss:>7f} [epoch: {epoch} {batch * BATCH_SIZE:>5d}/{dataset_size:>5d}]');break") 12 | return code 13 | 14 | - file_path: en/docs/cookies/transfer_learning.md 15 | run: 16 | - [0, 1, 2, 3, 4, 5, 6, 7, 8, 11] 17 | - [0, 1, 2, 3, 4, 5, 6, 7, 9, 11] 
18 | - [0, 1, 2, 3, 4, 5, 6, 7, 10, 11] 19 | hook: | # hook(index, code) 20 | if index == 1: 21 | code = code.replace("NUM_EPOCHS = 3", "NUM_EPOCHS = 1") 22 | if index == 6: 23 | code = code.replace("print(f'loss: {loss:>7f} [epoch: {epoch} {batch * BATCH_SIZE:>5d}/{dataset_size:>5d}]')", 24 | "print(f'loss: {loss:>7f} [epoch: {epoch} {batch * BATCH_SIZE:>5d}/{dataset_size:>5d}]');break") 25 | return code 26 | 27 | # WHY 3 CASES FOR ONE ARTICLE 28 | # the code blocks 8, 9, 10 are three ways for transfer learning 29 | -------------------------------------------------------------------------------- /scripts/markdown_ci/configs/guide_zero.yml: -------------------------------------------------------------------------------- 1 | - file_path: cn/docs/cookies/zero.md 2 | run: 3 | - [0, 1] 4 | 5 | - file_path: en/docs/cookies/zero.md 6 | run: 7 | - [0, 1] 8 | -------------------------------------------------------------------------------- /scripts/markdown_ci/extract_code_block.py: -------------------------------------------------------------------------------- 1 | import re 2 | import markdown 3 | from collections import OrderedDict 4 | import argparse 5 | import sys 6 | 7 | __all__ = ["get_all_python_blocks", "get_all_text_blocks", "pickup_blocks"] 8 | 9 | 10 | def get_markdown_blocks(filepath, pattern, strict=True): 11 | codeblocks = [] 12 | codeblock_re = r"^```.*" 13 | codeblock_open_re = pattern + "{0}$".format("" if strict else "?") 14 | question_mark_re = r"^\?\?\? 
code$" 15 | 16 | with open(filepath, "r", encoding="utf-8") as f: 17 | block = [] 18 | python = True 19 | in_codeblock = False 20 | in_question_mark = False 21 | 22 | for line in f.readlines(): 23 | if not in_question_mark: 24 | in_question_mark = re.match(question_mark_re, line) 25 | else: 26 | line = line[4:] 27 | codeblock_delimiter = re.match(codeblock_re, line) 28 | 29 | if in_codeblock: 30 | if codeblock_delimiter: 31 | if python: 32 | codeblocks.append("".join(block)) 33 | block = [] 34 | python = True 35 | in_codeblock = False 36 | in_question_mark = False 37 | else: 38 | block.append(line) 39 | elif codeblock_delimiter: 40 | in_codeblock = True 41 | if not re.match(codeblock_open_re, line): 42 | python = False 43 | return codeblocks 44 | 45 | 46 | def get_all_python_blocks(filepath, strict=True): 47 | return get_markdown_blocks(filepath, r"^```(`*)(py|python)", strict) 48 | 49 | 50 | def get_all_text_blocks(filepath, strict=True): 51 | return get_markdown_blocks(filepath, r"^```(`*)(text)", strict) 52 | 53 | 54 | def pickup_blocks(all_blocks, index): 55 | sub_blocks = OrderedDict() 56 | 57 | if isinstance(index, list): 58 | for i in index: 59 | sub_blocks[i] = all_blocks[i] 60 | return sub_blocks 61 | elif isinstance(index, str): 62 | if index != "all": 63 | raise RuntimeError("index can only be 'all' if it is str") 64 | index = [x for x in range(0, len(all_blocks))] 65 | return OrderedDict(zip(index, all_blocks)) 66 | else: 67 | raise RuntimeError("index can be list only or literal string - 'all'") 68 | 69 | 70 | def print_all_blocks(file): 71 | all_blocks = get_all_python_blocks(file) 72 | for i in range(0, len(all_blocks)): 73 | print("=============CODE {0}=============".format(i)) 74 | print(all_blocks[i]) 75 | print("") 76 | 77 | 78 | def main(): 79 | pass 80 | 81 | 82 | if __name__ == "__main__": 83 | main() 84 | -------------------------------------------------------------------------------- /scripts/markdown_ci/run_by_yamls.py: 
-------------------------------------------------------------------------------- 1 | import yaml 2 | import subprocess 3 | import os 4 | import argparse 5 | from extract_code_block import print_all_blocks 6 | 7 | __all__ = [] 8 | 9 | BASE_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")) 10 | CONFIG_DIR = os.path.join(BASE_DIR, "scripts/markdown_ci/configs") 11 | CN_DOCS = os.path.join(BASE_DIR, "cn/docs") 12 | EN_DOCS = os.path.join(BASE_DIR, "en/docs") 13 | 14 | 15 | def read_config(yaml_file): 16 | with open(yaml_file) as f: 17 | config = yaml.load(f, Loader=yaml.Loader) 18 | return config 19 | 20 | 21 | def run_yaml_markdown_codes(yaml_path, config, all_markdown_files): 22 | file_path = os.path.join(BASE_DIR, config["file_path"]) 23 | if "hook" in config: 24 | hook_body = config["hook"] 25 | else: 26 | hook_body = "return code" 27 | 28 | if all_markdown_files: 29 | try: 30 | all_markdown_files.remove(file_path) 31 | except: 32 | pass # do nothing if remove more than once 33 | 34 | for index in config["run"]: 35 | cmd = r"python3 run_markdown_codes.py --markdown_file {0} --index {1}".format( 36 | file_path, str(index).replace(" ", "") 37 | ) 38 | cmd_list = cmd.split(" ", 5) 39 | print("====RUN CODE IN MARKDOWN====:", cmd) 40 | subprocess_ret = subprocess.run( 41 | cmd_list, input=bytes(hook_body, encoding="utf-8"), check=True 42 | ) 43 | 44 | def get_all_yaml_files(): 45 | yaml_files_list = [] 46 | for root, dirs, files in os.walk(CONFIG_DIR): 47 | for file in files: 48 | if os.path.splitext(file)[1] == ".yml": 49 | yaml_files_list.append(os.path.join(root, file)) 50 | return yaml_files_list 51 | 52 | 53 | def get_all_markdown_files(): 54 | md_files_list = [] 55 | for root, dirs, files in os.walk(CN_DOCS): 56 | for file in files: 57 | if os.path.splitext(file)[1] == ".md": 58 | md_files_list.append(os.path.join(root, file)) 59 | for root, dirs, files in os.walk(EN_DOCS): 60 | for file in files: 61 | if os.path.splitext(file)[1] == ".md": 
62 | md_files_list.append(os.path.join(root, file)) 63 | return md_files_list 64 | 65 | 66 | def run_all_yamls(all_markdown_files): 67 | print("====ALL YAMLS====:") 68 | print("\n".join(get_all_yaml_files())) 69 | for yaml_file in get_all_yaml_files(): 70 | run_configs_in_yaml(yaml_file, all_markdown_files) 71 | print("MARKDOWN FILES NOT TEST:") 72 | print("\n".join(all_markdown_files)) 73 | 74 | 75 | def run_configs_in_yaml(yaml_file, all_markdown_files=None): 76 | for config in read_config(yaml_file): 77 | run_yaml_markdown_codes(yaml_file, config, all_markdown_files) 78 | 79 | 80 | def main(): 81 | parser = argparse.ArgumentParser( 82 | description="read config yaml files and run realted code" 83 | ) 84 | parser.add_argument( 85 | "--markdown", type=str, default=None, help="the input markdown file" 86 | ) 87 | parser.add_argument( 88 | "--output", 89 | type=str, 90 | default=None, 91 | help="if not None, output will be written to the path", 92 | ) 93 | parser.add_argument( 94 | "--yaml", 95 | type=str, 96 | default=None, 97 | help="the path of yaml file. 
eg: ./sample.yaml", 98 | ) 99 | parser.add_argument( 100 | "--configs", 101 | type=str, 102 | default=None, 103 | help="config dir where yaml files exists, markdown_ci/configs by default.", 104 | ) 105 | args = parser.parse_args() 106 | 107 | if args.markdown: 108 | if args.output: 109 | with open(args.output, "w") as f: 110 | saved_std_output = sys.stdout 111 | sys.stdout = f 112 | print_all_blocks(args.markdown) 113 | sys.stdout = saved_std_output 114 | else: 115 | print_all_blocks(args.markdown) 116 | return 117 | 118 | if args.configs: 119 | CONFIG_DIR = args.configs 120 | 121 | if args.yaml: 122 | run_configs_in_yaml(args.yaml) 123 | else: 124 | markdown_files = get_all_markdown_files() 125 | run_all_yamls(markdown_files) 126 | 127 | 128 | if __name__ == "__main__": 129 | main() 130 | -------------------------------------------------------------------------------- /scripts/markdown_ci/run_markdown_codes.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | import argparse 3 | import sys 4 | from extract_code_block import * 5 | 6 | 7 | def get_hooker(): 8 | hooker = """def hook(index, code):\n""" 9 | for line in sys.stdin: 10 | hooker = hooker + " " + line 11 | exec(hooker, globals(), globals()) 12 | 13 | 14 | def run_block_item(block_dict: OrderedDict, file_path=None): 15 | exec_error_msg = """ markdown file: {0} 16 | codeblock index: {1} 17 | Code:{2}""" 18 | for index in block_dict: 19 | try: 20 | code = hook(index, block_dict[index]) 21 | exec(code, globals(), globals()) 22 | except: 23 | print(" ****EXEC ERROR****") 24 | print( 25 | exec_error_msg.format( 26 | file_path, index, bytes(block_dict[index], encoding="utf-8") 27 | ) 28 | ) 29 | print("") 30 | raise RuntimeError("markdown test fails") 31 | 32 | 33 | def run_markdown_codes(file_path, index): 34 | get_hooker() 35 | codes = get_all_python_blocks(file_path) 36 | picked_codes = pickup_blocks(codes, index) 37 | 
run_block_item(picked_codes, file_path) 38 | 39 | 40 | if __name__ == "__main__": 41 | parser = argparse.ArgumentParser(description="Run python code in markdown files") 42 | parser.add_argument( 43 | "--markdown_file", type=str, help="the path of markdown file. eg: ./sample.md" 44 | ) 45 | parser.add_argument( 46 | "--index", 47 | type=str, 48 | default="all", 49 | help='the index set of code blocks. eg: [0, 1, 2]. Default for "all"', 50 | ) 51 | 52 | args = parser.parse_args() 53 | file_path = args.markdown_file 54 | if args.index != "all": 55 | index = eval(args.index) 56 | else: 57 | index = args.index 58 | run_markdown_codes(file_path, index) 59 | -------------------------------------------------------------------------------- /scripts/run-markdown-ci.sh: -------------------------------------------------------------------------------- 1 | export PATH=$HOME/.local/bin:$PATH 2 | cd scripts/markdown_ci && python3 run_by_yamls.py -------------------------------------------------------------------------------- /scripts/run-mike.sh: -------------------------------------------------------------------------------- 1 | set -ex 2 | MIKE="mike" 3 | CN_SITE="_site" 4 | EN_SITE="_site/en" 5 | LATEST_VERSION="latest" 6 | OUTPUT_BRANCH="docs_output" 7 | 8 | CN_OPTIONS="--prefix ${CN_SITE} -b ${OUTPUT_BRANCH}" 9 | EN_OPTIONS="--prefix ${EN_SITE} -b ${OUTPUT_BRANCH}" 10 | 11 | cd cn 12 | 13 | # master 14 | ${MIKE} delete --all ${CN_OPTIONS} 15 | git checkout master 16 | ${MIKE} deploy master ${LATEST_VERSION} -u ${CN_OPTIONS} 17 | ${MIKE} set-default ${LATEST_VERSION} ${CN_OPTIONS} 18 | cd ../en 19 | ${MIKE} delete --all ${EN_OPTIONS} 20 | ${MIKE} deploy master ${LATEST_VERSION} -u ${EN_OPTIONS} 21 | ${MIKE} set-default ${LATEST_VERSION} ${EN_OPTIONS} 22 | 23 | # v0.4.0 24 | VERSION="v0.4.0" 25 | cd .. 
&& git checkout ${VERSION} 26 | cd cn 27 | ${MIKE} deploy ${VERSION} -u ${CN_OPTIONS} 28 | cd ../en 29 | ${MIKE} deploy ${VERSION} -u ${EN_OPTIONS} 30 | 31 | # v0.7.0 32 | VERSION="v0.7.0" 33 | cd .. && git checkout ${VERSION} 34 | cd cn 35 | ${MIKE} deploy ${VERSION} -u ${CN_OPTIONS} 36 | cd ../en 37 | ${MIKE} deploy ${VERSION} -u ${EN_OPTIONS} 38 | 39 | # v0.8.0 40 | VERSION="v0.8.0" 41 | cd .. && git checkout ${VERSION} 42 | cd cn 43 | ${MIKE} deploy ${VERSION} -u ${CN_OPTIONS} 44 | cd ../en 45 | ${MIKE} deploy ${VERSION} -u ${EN_OPTIONS} 46 | --------------------------------------------------------------------------------