├── .gitignore
├── LICENSE
├── README.md
├── ReadmePic
└── arch.png
├── furnace
├── __init__.py
├── apex
│ ├── LICENSE
│ ├── README.md
│ ├── apex.egg-info
│ │ ├── PKG-INFO
│ │ ├── SOURCES.txt
│ │ ├── dependency_links.txt
│ │ └── top_level.txt
│ ├── apex
│ │ ├── RNN
│ │ │ ├── README.md
│ │ │ ├── RNNBackend.py
│ │ │ ├── __init__.py
│ │ │ ├── cells.py
│ │ │ └── models.py
│ │ ├── __init__.py
│ │ ├── amp
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── __version__.py
│ │ │ ├── amp.py
│ │ │ ├── compat.py
│ │ │ ├── handle.py
│ │ │ ├── lists
│ │ │ │ ├── __init__.py
│ │ │ │ ├── functional_overrides.py
│ │ │ │ ├── tensor_overrides.py
│ │ │ │ └── torch_overrides.py
│ │ │ ├── opt.py
│ │ │ ├── rnn_compat.py
│ │ │ ├── scaler.py
│ │ │ ├── utils.py
│ │ │ └── wrap.py
│ │ ├── fp16_utils
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── fp16_optimizer.py
│ │ │ ├── fp16util.py
│ │ │ └── loss_scaler.py
│ │ ├── normalization
│ │ │ ├── __init__.py
│ │ │ ├── csrc
│ │ │ │ ├── layer_norm_cuda.cpp
│ │ │ │ └── layer_norm_cuda_kernel.cu
│ │ │ └── fused_layer_norm.py
│ │ ├── optimizers
│ │ │ ├── __init__.py
│ │ │ ├── csrc
│ │ │ │ ├── fused_adam_cuda.cpp
│ │ │ │ └── fused_adam_cuda_kernel.cu
│ │ │ ├── fp16_optimizer.py
│ │ │ └── fused_adam.py
│ │ ├── parallel
│ │ │ ├── LARC.py
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── distributed.py
│ │ │ ├── multiproc.py
│ │ │ ├── optimized_sync_batchnorm.py
│ │ │ ├── optimized_sync_batchnorm_kernel.py
│ │ │ ├── sync_batchnorm.py
│ │ │ └── sync_batchnorm_kernel.py
│ │ └── reparameterization
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── reparameterization.py
│ │ │ └── weight_norm.py
│ ├── csrc
│ │ ├── flatten_unflatten.cpp
│ │ ├── scale_check_overflow.cpp
│ │ ├── scale_check_overflow_kernel.cu
│ │ ├── syncbn.cpp
│ │ └── welford.cu
│ ├── dist
│ │ └── apex-0.1-py3.6-linux-x86_64.egg
│ ├── docs
│ │ ├── Makefile
│ │ └── source
│ │ │ ├── _static
│ │ │ └── css
│ │ │ │ └── pytorch_theme.css
│ │ │ ├── _templates
│ │ │ └── layout.html
│ │ │ ├── amp.rst
│ │ │ ├── conf.py
│ │ │ ├── fp16_utils.rst
│ │ │ ├── index.rst
│ │ │ ├── layernorm.rst
│ │ │ ├── optimizers.rst
│ │ │ └── parallel.rst
│ ├── examples
│ │ ├── FP16_Optimizer_simple
│ │ │ ├── README.md
│ │ │ ├── closure.py
│ │ │ ├── distributed_apex
│ │ │ │ ├── README.md
│ │ │ │ ├── distributed_data_parallel.py
│ │ │ │ └── run.sh
│ │ │ ├── distributed_apex_legacy_launcher
│ │ │ │ ├── README.md
│ │ │ │ ├── distributed_data_parallel.py
│ │ │ │ └── run.sh
│ │ │ ├── distributed_pytorch
│ │ │ │ ├── README.md
│ │ │ │ ├── distributed_data_parallel.py
│ │ │ │ └── run.sh
│ │ │ ├── minimal.py
│ │ │ └── save_load.py
│ │ ├── README.md
│ │ ├── distributed
│ │ │ ├── README.md
│ │ │ └── main.py
│ │ ├── docker
│ │ │ ├── Dockerfile
│ │ │ └── README.md
│ │ ├── imagenet
│ │ │ ├── README.md
│ │ │ ├── main.py
│ │ │ ├── main_amp.py
│ │ │ ├── main_fp16_optimizer.py
│ │ │ └── main_reducer.py
│ │ └── word_language_model
│ │ │ ├── README.md
│ │ │ ├── data.py
│ │ │ ├── data
│ │ │ └── wikitext-2
│ │ │ │ └── README
│ │ │ ├── generate.py
│ │ │ ├── main.py
│ │ │ ├── main_fp16_optimizer.py
│ │ │ └── model.py
│ ├── setup.py
│ └── tests
│ │ ├── RNN
│ │ └── RNN_tests.py
│ │ ├── distributed
│ │ ├── ddp_race_condition_test.py
│ │ └── run_race_test.sh
│ │ ├── run_amp
│ │ ├── __init__.py
│ │ ├── test_basic_casts.py
│ │ ├── test_cache.py
│ │ ├── test_promotion.py
│ │ ├── test_rnn.py
│ │ ├── test_scale.py
│ │ └── utils.py
│ │ ├── run_fp16_optimizer
│ │ ├── __init__.py
│ │ └── test_fp16_optimizer.py
│ │ ├── run_fp16util
│ │ ├── __init__.py
│ │ └── test_fp16util.py
│ │ ├── run_mixed_adam
│ │ ├── __init__.py
│ │ ├── test_fp16_optimizer.py
│ │ └── test_mixed_adam.py
│ │ ├── run_test.py
│ │ └── synced_batchnorm
│ │ ├── single_gpu_unit_test.py
│ │ ├── test_groups.py
│ │ ├── two_gpu_unit_test.py
│ │ └── unit_test.sh
├── base_model
│ ├── README.md
│ ├── __init__.py
│ ├── resnet.py
│ └── xception.py
├── datasets
│ ├── BaseDataset.py
│ └── __init__.py
├── engine
│ ├── __init__.py
│ ├── engine.py
│ ├── evaluator.py
│ ├── logger.py
│ ├── lr_policy.py
│ └── version.py
├── seg_opr
│ ├── __init__.py
│ ├── loss_opr.py
│ ├── metric.py
│ ├── parallel
│ │ └── parallel_apply.py
│ ├── seg_oprs.py
│ ├── sgd.py
│ └── sync_bn
│ │ ├── __init__.py
│ │ ├── comm.py
│ │ ├── functions.py
│ │ ├── parallel.py
│ │ ├── parallel_apply.py
│ │ ├── src
│ │ ├── __init__.py
│ │ ├── cpu
│ │ │ ├── .ninja_deps
│ │ │ ├── .ninja_log
│ │ │ ├── __init__.py
│ │ │ ├── build.ninja
│ │ │ ├── dist
│ │ │ │ └── syncbn_cpu-0.0.0-py3.6-linux-x86_64.egg
│ │ │ ├── operator.cpp
│ │ │ ├── operator.h
│ │ │ ├── operator.o
│ │ │ ├── setup.py
│ │ │ ├── syncbn_cpu.cpp
│ │ │ ├── syncbn_cpu.egg-info
│ │ │ │ └── PKG-INFO
│ │ │ ├── syncbn_cpu.o
│ │ │ └── syncbn_cpu.so
│ │ └── gpu
│ │ │ ├── .ninja_deps
│ │ │ ├── .ninja_log
│ │ │ ├── __init__.py
│ │ │ ├── build.ninja
│ │ │ ├── common.h
│ │ │ ├── device_tensor.h
│ │ │ ├── dist
│ │ │ └── syncbn_gpu-0.0.0-py3.6-linux-x86_64.egg
│ │ │ ├── operator.cpp
│ │ │ ├── operator.h
│ │ │ ├── operator.o
│ │ │ ├── setup.py
│ │ │ ├── syncbn_gpu.egg-info
│ │ │ └── PKG-INFO
│ │ │ ├── syncbn_gpu.so
│ │ │ ├── syncbn_kernel.cu
│ │ │ └── syncbn_kernel.cuda.o
│ │ └── syncbn.py
└── utils
│ ├── __init__.py
│ ├── img_utils.py
│ ├── init_func.py
│ ├── pyt_utils.py
│ └── visualize.py
├── install.md
├── model
└── sketch.nyu
│ ├── config.py
│ ├── dataloader.py
│ ├── eval.py
│ ├── network.py
│ ├── nyu.py
│ ├── resnet.py
│ ├── run.sh
│ └── train.py
└── ssc.yaml
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | log/
3 | *.npz
4 | *.npy
5 | *.png
6 | *.jpg
7 | *.log
8 | *.pth
9 | __pycache__
10 |
11 |
12 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2020 Xiaokang Chen
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # TorchSSC
2 |  
3 |
4 | Implementations of some state-of-the-art methods for the Semantic Scene Completion (SSC) task, in PyTorch.
5 |
6 |
7 |
8 | ## Highlights:
9 |
10 | - **Distributed training**
11 | - **Easy-to-modify benchmark code**
12 | - **High performance**
13 |
14 |
15 |
16 |
17 | ## News
18 |
19 | - 2020/07/28
20 |
21 | Code release for the paper **3D Sketch-aware Semantic Scene Completion via Semi-supervised Structure Prior**, *CVPR 2020*. [[arXiv]](https://arxiv.org/abs/2003.14052), [[Supplementary Material and Demo]](https://charlesCXK.github.io)
22 |
23 |
24 |
25 |
26 | ## Performance
27 |
28 | #### NYU
29 |
30 | | Method | Resolution | Trained on | SC IoU | SSC mIoU |
31 | | ------------------------- | ------------ | ---------- | -------- | -------- |
32 | | SSCNet | (240, 60) | NYU | 55.1 | 24.7 |
33 | | VVNetR-120 | (120, 60) | NYU+SUNCG | 61.1 | 32.9 |
34 | | DDRNet | (240, 60) | NYU | 61.0 | 30.4 |
35 | | ForkNet | (80, 80) | NYU | 63.4 | 37.1 |
36 | | CCPNet | (240, 240) | NYU | 63.5 | 38.5 |
37 | | **SketchAwareSSC (Ours)** | **(60, 60)** | **NYU** | **71.3** | **41.1** |
38 |
39 |
40 |
41 | ## Data Preparation and Environment Installation
42 |
43 | #### Pretrained ResNet-50
44 |
45 | Please download the pretrained ResNet-50 and then put it into `./DATA/pytorch-weight`.
46 |
47 | | Source | Link |
48 | | :----------: | :--------------------------------------: |
49 | | BaiDu Cloud | Link: https://pan.baidu.com/s/1wS1TozvS3cBdutsXRWUmUw Key: 4g9u |
50 | | Google Drive | https://drive.google.com/drive/folders/121yZXBZ8wV77WRXRur86YBA4ifJEhsJQ?usp=sharing |
51 |
52 | #### NYU Depth V2
53 |
54 | Please download the NYU dataset and then put it into `./DATA/NYU`.
55 |
56 | | Source | Link |
57 | | :----------: | :--------------------------------------: |
58 | | BaiDu Cloud | Link: https://pan.baidu.com/s/1GfWqAbsfMp3NOjFcEnL54A Key: v5ta |
59 | | Google Drive | https://drive.google.com/drive/folders/121yZXBZ8wV77WRXRur86YBA4ifJEhsJQ?usp=sharing |
60 |
61 | #### Environment Installation
62 |
63 | Please refer to [this documentation](./install.md).
64 |
65 |
66 |
67 | ## 3D Sketch-aware Semantic Scene Completion via Semi-supervised Structure Prior
68 |
69 |
70 |
71 | #### Training and Inference
72 |
73 | #### Training
74 |
75 | Training on NYU Depth V2:
76 |
77 | ```shell
78 | $ cd ./model/sketch.nyu
79 | $ export NGPUS=2
80 | $ python -m torch.distributed.launch --nproc_per_node=$NGPUS train.py -p 10097
81 | ```
82 |
83 | - `-p` sets the port used for distributed training. If you run more than one experiment on the same machine, use a different port for each.
84 | - The TensorBoard logs are saved in the `sketch.nyu/log/tb/` directory.
85 |
86 | #### Inference
87 |
88 | Inference on NYU Depth V2:
89 |
90 | ```shell
91 | $ cd ./model/sketch.nyu
92 | $ python eval.py -e 200-250 -d 0-1 --save_path results
93 | ```
94 |
95 | - Here, `200-250` means we evaluate every checkpoint whose epoch ID lies in [200, 250], such as epoch-200.pth, epoch-249.pth, etc.
96 | - Predictions are saved in `results/` and `results_sketch/`: the former stores the SSC predictions and the latter the sketch predictions. Performance is written to `log/*.log`; you should expect `0.411@SSC mIoU` and `0.713@SC IoU`.
97 |
98 |
99 |
100 |
101 | ## Citation
102 |
103 | If you find this work useful in your research, please consider citing:
104 |
105 | ```
106 | @InProceedings{Chen_2020_SketchAwareSSC,
107 | author = {Chen, Xiaokang and Lin, Kwan-Yee and Qian, Chen and Zeng, Gang and Li, Hongsheng},
108 | title = {3D Sketch-aware Semantic Scene Completion via Semi-supervised Structure Prior},
109 | booktitle = {The IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
110 | month = {June},
111 | year = {2020}
112 | }
113 | ```
114 |
115 |
116 |
117 | ## Acknowledgement
118 |
119 | Thanks to [TorchSeg](https://github.com/ycszen/TorchSeg) for their excellent project!
120 |
121 |
122 |
123 | ## TODO
124 |
125 | - [ ] Code on more datasets (NYUCAD/SUNCG).
126 | - [ ] More SSC models.
127 |
128 |
129 |
--------------------------------------------------------------------------------
/ReadmePic/arch.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/charlesCXK/TorchSSC/2fd21ad25af92cd9f9ad28de3c4bc897c0ae8b43/ReadmePic/arch.png
--------------------------------------------------------------------------------
/furnace/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/charlesCXK/TorchSSC/2fd21ad25af92cd9f9ad28de3c4bc897c0ae8b43/furnace/__init__.py
--------------------------------------------------------------------------------
/furnace/apex/LICENSE:
--------------------------------------------------------------------------------
1 | All rights reserved.
2 |
3 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
4 |
5 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
6 |
7 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
8 |
9 | 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
10 |
11 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/furnace/apex/apex.egg-info/PKG-INFO:
--------------------------------------------------------------------------------
1 | Metadata-Version: 1.0
2 | Name: apex
3 | Version: 0.1
4 | Summary: PyTorch Extensions written by NVIDIA
5 | Home-page: UNKNOWN
6 | Author: UNKNOWN
7 | Author-email: UNKNOWN
8 | License: UNKNOWN
9 | Description: UNKNOWN
10 | Platform: UNKNOWN
11 |
--------------------------------------------------------------------------------
/furnace/apex/apex.egg-info/SOURCES.txt:
--------------------------------------------------------------------------------
1 | README.md
2 | setup.py
3 | apex/__init__.py
4 | apex.egg-info/PKG-INFO
5 | apex.egg-info/SOURCES.txt
6 | apex.egg-info/dependency_links.txt
7 | apex.egg-info/top_level.txt
8 | apex/RNN/RNNBackend.py
9 | apex/RNN/__init__.py
10 | apex/RNN/cells.py
11 | apex/RNN/models.py
12 | apex/amp/__init__.py
13 | apex/amp/__version__.py
14 | apex/amp/amp.py
15 | apex/amp/compat.py
16 | apex/amp/handle.py
17 | apex/amp/opt.py
18 | apex/amp/rnn_compat.py
19 | apex/amp/scaler.py
20 | apex/amp/utils.py
21 | apex/amp/wrap.py
22 | apex/amp/lists/__init__.py
23 | apex/amp/lists/functional_overrides.py
24 | apex/amp/lists/tensor_overrides.py
25 | apex/amp/lists/torch_overrides.py
26 | apex/fp16_utils/__init__.py
27 | apex/fp16_utils/fp16_optimizer.py
28 | apex/fp16_utils/fp16util.py
29 | apex/fp16_utils/loss_scaler.py
30 | apex/normalization/__init__.py
31 | apex/normalization/fused_layer_norm.py
32 | apex/normalization/csrc/layer_norm_cuda.cpp
33 | apex/normalization/csrc/layer_norm_cuda_kernel.cu
34 | apex/optimizers/__init__.py
35 | apex/optimizers/fp16_optimizer.py
36 | apex/optimizers/fused_adam.py
37 | apex/optimizers/csrc/fused_adam_cuda.cpp
38 | apex/optimizers/csrc/fused_adam_cuda_kernel.cu
39 | apex/parallel/LARC.py
40 | apex/parallel/__init__.py
41 | apex/parallel/distributed.py
42 | apex/parallel/multiproc.py
43 | apex/parallel/optimized_sync_batchnorm.py
44 | apex/parallel/optimized_sync_batchnorm_kernel.py
45 | apex/parallel/sync_batchnorm.py
46 | apex/parallel/sync_batchnorm_kernel.py
47 | apex/reparameterization/__init__.py
48 | apex/reparameterization/reparameterization.py
49 | apex/reparameterization/weight_norm.py
50 | csrc/flatten_unflatten.cpp
51 | csrc/scale_check_overflow.cpp
52 | csrc/scale_check_overflow_kernel.cu
53 | csrc/syncbn.cpp
54 | csrc/welford.cu
--------------------------------------------------------------------------------
/furnace/apex/apex.egg-info/dependency_links.txt:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/furnace/apex/apex.egg-info/top_level.txt:
--------------------------------------------------------------------------------
1 | amp_C
2 | apex
3 | apex_C
4 | fused_adam_cuda
5 | fused_layer_norm_cuda
6 | syncbn
7 |
--------------------------------------------------------------------------------
/furnace/apex/apex/RNN/README.md:
--------------------------------------------------------------------------------
1 | Under construction...
2 |
--------------------------------------------------------------------------------
/furnace/apex/apex/RNN/__init__.py:
--------------------------------------------------------------------------------
1 | from .models import LSTM, GRU, ReLU, Tanh, mLSTM
2 |
3 | __all__ = ['models']
4 |
--------------------------------------------------------------------------------
/furnace/apex/apex/RNN/cells.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 |
5 | from .RNNBackend import RNNCell
6 |
7 | from torch.nn._functions.thnn import rnnFusedPointwise as fusedBackend
8 |
9 | import math
10 |
11 |
12 | class mLSTMRNNCell(RNNCell):
13 | """
14 | mLSTMRNNCell
15 | """
16 |
17 | def __init__(self, input_size, hidden_size, bias = False, output_size = None):
18 | gate_multiplier = 4
19 | super(mLSTMRNNCell, self).__init__(gate_multiplier, input_size, hidden_size, mLSTMCell, n_hidden_states = 2, bias = bias, output_size = output_size)
20 |
21 | self.w_mih = nn.Parameter(torch.Tensor(self.output_size, self.input_size))
22 | self.w_mhh = nn.Parameter(torch.Tensor(self.output_size, self.output_size))
23 |
24 | self.reset_parameters()
25 |
26 | def forward(self, input):
27 | """
28 | mLSTMRNNCell.forward()
29 | """
30 | #if not inited or bsz has changed this will create hidden states
31 | self.init_hidden(input.size()[0])
32 |
33 | hidden_state = self.hidden[0] if self.n_hidden_states == 1 else self.hidden
34 |
35 | self.hidden = list(
36 | self.cell(input, hidden_state, self.w_ih, self.w_hh, self.w_mih, self.w_mhh,
37 | b_ih=self.b_ih, b_hh=self.b_hh)
38 | )
39 |
40 | if self.output_size != self.hidden_size:
41 | self.hidden[0] = F.linear(self.hidden[0], self.w_ho)
42 | return tuple(self.hidden)
43 |
44 |
45 | def new_like(self, new_input_size=None):
46 | if new_input_size is None:
47 | new_input_size = self.input_size
48 |
49 | return type(self)(
50 | new_input_size,
51 | self.hidden_size,
52 | self.bias,
53 | self.output_size)
54 |
55 | def mLSTMCell(input, hidden, w_ih, w_hh, w_mih, w_mhh, b_ih=None, b_hh=None):
56 | """
57 | mLSTMCell
58 | """
59 |
60 | if input.is_cuda:
61 | igates = F.linear(input, w_ih)
62 | m = F.linear(input, w_mih) * F.linear(hidden[0], w_mhh)
63 | hgates = F.linear(m, w_hh)
64 |
65 | state = fusedBackend.LSTMFused.apply
66 | return state(igates, hgates, hidden[1], b_ih, b_hh)
67 |
68 | hx, cx = hidden
69 |
70 | m = F.linear(input, w_mih) * F.linear(hidden[0], w_mhh)
71 | gates = F.linear(input, w_ih, b_ih) + F.linear(m, w_hh, b_hh)
72 |
73 | ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1)
74 |
75 | ingate = F.sigmoid(ingate)
76 | forgetgate = F.sigmoid(forgetgate)
77 | cellgate = F.tanh(cellgate)
78 | outgate = F.sigmoid(outgate)
79 |
80 | cy = (forgetgate * cx) + (ingate * cellgate)
81 | hy = outgate * F.tanh(cy)
82 |
83 | return hy, cy
84 |
85 |
--------------------------------------------------------------------------------
/furnace/apex/apex/RNN/models.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | from torch.nn._functions.rnn import LSTMCell, RNNReLUCell, RNNTanhCell, GRUCell
4 |
5 | from .RNNBackend import bidirectionalRNN, stackedRNN, RNNCell
6 | from .cells import mLSTMRNNCell, mLSTMCell
7 |
8 | def toRNNBackend(inputRNN, num_layers, bidirectional=False, dropout = 0):
9 | """
10 | :class:`toRNNBackend`
11 | """
12 |
13 | if bidirectional:
14 | return bidirectionalRNN(inputRNN, num_layers, dropout = dropout)
15 | else:
16 | return stackedRNN(inputRNN, num_layers, dropout = dropout)
17 |
18 |
19 | def LSTM(input_size, hidden_size, num_layers, bias=True, batch_first=False, dropout=0, bidirectional=False, output_size = None):
20 | """
21 | :class:`LSTM`
22 | """
23 | inputRNN = RNNCell(4, input_size, hidden_size, LSTMCell, 2, bias, output_size)
24 | return toRNNBackend(inputRNN, num_layers, bidirectional, dropout=dropout)
25 |
26 | def GRU(input_size, hidden_size, num_layers, bias=True, batch_first=False, dropout=0, bidirectional=False, output_size = None):
27 | """
28 | :class:`GRU`
29 | """
30 | inputRNN = RNNCell(3, input_size, hidden_size, GRUCell, 1, bias, output_size)
31 | return toRNNBackend(inputRNN, num_layers, bidirectional, dropout=dropout)
32 |
33 | def ReLU(input_size, hidden_size, num_layers, bias=True, batch_first=False, dropout=0, bidirectional=False, output_size = None):
34 | """
35 | :class:`ReLU`
36 | """
37 | inputRNN = RNNCell(1, input_size, hidden_size, RNNReLUCell, 1, bias, output_size)
38 | return toRNNBackend(inputRNN, num_layers, bidirectional, dropout=dropout)
39 |
40 | def Tanh(input_size, hidden_size, num_layers, bias=True, batch_first=False, dropout=0, bidirectional=False, output_size = None):
41 | """
42 | :class:`Tanh`
43 | """
44 | inputRNN = RNNCell(1, input_size, hidden_size, RNNTanhCell, 1, bias, output_size)
45 | return toRNNBackend(inputRNN, num_layers, bidirectional, dropout=dropout)
46 |
47 | def mLSTM(input_size, hidden_size, num_layers, bias=True, batch_first=False, dropout=0, bidirectional=False, output_size = None):
48 | """
49 | :class:`mLSTM`
50 | """
51 | inputRNN = mLSTMRNNCell(input_size, hidden_size, bias=bias, output_size=output_size)
52 | return toRNNBackend(inputRNN, num_layers, bidirectional, dropout=dropout)
53 |
54 |
55 |
--------------------------------------------------------------------------------
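
The factory functions above largely mirror the constructor arguments of their `torch.nn` counterparts (plus an extra `output_size`). A constructor-only sketch, assuming a PyTorch version contemporary with this vendored apex (the cells import `torch.nn._functions`, which no longer exists in recent PyTorch); the returned backend's call convention is not shown in this file, so only construction is illustrated:

```python
from apex.RNN import LSTM, GRU

# Both factories return a stackedRNN (or bidirectionalRNN) built from RNNCell,
# using the keyword arguments visible in the signatures above.
lstm = LSTM(input_size=32, hidden_size=64, num_layers=2, dropout=0.1)
gru = GRU(input_size=32, hidden_size=64, num_layers=1, bidirectional=True)
```
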
/furnace/apex/apex/__init__.py:
--------------------------------------------------------------------------------
1 | from . import fp16_utils
2 | from . import parallel
3 | from . import amp
4 |
5 | # For optimizers and normalization there is no Python fallback.
6 | # Absence of cuda backend is a hard error.
7 | # I would like the errors from importing fused_adam_cuda or fused_layer_norm_cuda
8 | # to be triggered lazily, because if someone has installed with --cpp_ext and --cuda_ext
9 | # so they expect those backends to be available, but for some reason they actually aren't
10 | # available (for example because they built improperly in a way that isn't revealed until
11 | # load time) the error message is timely and visible.
12 | from . import optimizers
13 | from . import normalization
14 |
--------------------------------------------------------------------------------
/furnace/apex/apex/amp/__init__.py:
--------------------------------------------------------------------------------
1 | from .amp import init, half_function, float_function, promote_function,\
2 | register_half_function, register_float_function, register_promote_function
3 |
--------------------------------------------------------------------------------
/furnace/apex/apex/amp/__version__.py:
--------------------------------------------------------------------------------
1 | VERSION = (0, 1, 0)
2 | __version__ = '.'.join(map(str, VERSION))
3 |
--------------------------------------------------------------------------------
/furnace/apex/apex/amp/compat.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | # True for post-0.4, when Variables/Tensors merged.
4 | def variable_is_tensor():
5 | v = torch.autograd.Variable()
6 | return isinstance(v, torch.Tensor)
7 |
8 | def tensor_is_variable():
9 | x = torch.Tensor()
10 | return type(x) == torch.autograd.Variable
11 |
12 | # False for post-0.4
13 | def tensor_is_float_tensor():
14 | x = torch.Tensor()
15 | return type(x) == torch.FloatTensor
16 |
17 | # Akin to `torch.is_tensor`, but returns True for Variable
18 | # objects in pre-0.4.
19 | def is_tensor_like(x):
20 | return torch.is_tensor(x) or isinstance(x, torch.autograd.Variable)
21 |
22 | # Wraps `torch.is_floating_point` if present, otherwise checks
23 | # the suffix of `x.type()`.
24 | def is_floating_point(x):
25 | if hasattr(torch, 'is_floating_point'):
26 | return torch.is_floating_point(x)
27 | try:
28 | torch_type = x.type()
29 | return torch_type.endswith('FloatTensor') or \
30 | torch_type.endswith('HalfTensor') or \
31 | torch_type.endswith('DoubleTensor')
32 | except AttributeError:
33 | return False
34 |
35 | def scalar_python_val(x):
36 | if hasattr(x, 'item'):
37 | return x.item()
38 | else:
39 | if isinstance(x, torch.autograd.Variable):
40 | return x.data[0]
41 | else:
42 | return x[0]
43 |
--------------------------------------------------------------------------------
/furnace/apex/apex/amp/handle.py:
--------------------------------------------------------------------------------
1 | import contextlib
2 | import logging
3 | import warnings
4 |
5 | from . import utils
6 | from .opt import OptimWrapper
7 | from .scaler import LossScaler
8 |
9 | class AmpHandle(object):
10 | def __init__(self, enable_caching=True, verbose=False):
11 | self._enable_caching = enable_caching
12 | self._verbose = verbose
13 | self._cache = dict()
14 | self._default_scaler = LossScaler()
15 | self._is_active = True
16 | self._all_wrappers = []
17 |
18 | def is_active(self):
19 | return self._is_active
20 |
21 | @contextlib.contextmanager
22 | def _disable_casts(self):
23 | self._is_active = False
24 | yield
25 | self._is_active = True
26 |
27 | def wrap_optimizer(self, optimizer, num_loss=1):
28 | self._default_scaler = None
29 | return OptimWrapper(optimizer, self, num_loss)
30 |
31 | @contextlib.contextmanager
32 | def scale_loss(self, loss, optimizer):
33 | if not self.is_active():
34 | yield loss
35 | return
36 |
37 | if self._default_scaler is None:
38 | raise RuntimeError(
39 | 'After calling `handle.wrap_optimizer()`, you must explicitly ' +
40 | 'use `optimizer.scale_loss(loss)`.')
41 |
42 | # TODO: this code block is duplicated here and `opt.py`. Unify.
43 | loss_scale = self._default_scaler.loss_scale()
44 | yield loss * loss_scale
45 |
46 | should_skip = self._default_scaler.unscale_and_update(
47 | optimizer.param_groups, loss_scale)
48 | if should_skip:
49 | optimizer_step = optimizer.step
50 | def skip_step():
51 | logger = logging.getLogger('apex.amp')
52 | logger.warning('Gradient overflow, skipping update')
53 | optimizer.step = optimizer_step
54 | optimizer.step = skip_step
55 |
56 | self._clear_cache()
57 |
58 | def _clear_cache(self):
59 | self._cache.clear()
60 |
61 | # Experimental support for saving / restoring uncasted versions of functions
62 | def _save_func(self, mod, fn, func):
63 | self._all_wrappers.append((mod, fn, func))
64 |
65 | def _deactivate(self):
66 | for mod, fn, func in self._all_wrappers:
67 | utils.set_func(mod, fn, func)
68 | self._all_wrappers = []
69 |
70 | @property
71 | def has_cache(self):
72 | return self._enable_caching
73 |
74 | @property
75 | def cache(self):
76 | return self._cache
77 |
78 | def remove_cache(self, param):
79 | if self.has_cache and param in self.cache:
80 | del self.cache[param]
81 |
82 | @property
83 | def verbose(self):
84 | return self._verbose
85 |
86 | class NoOpHandle(object):
87 | def is_active(self):
88 | return False
89 |
90 | @contextlib.contextmanager
91 | def _disable_casts(self):
92 | yield
93 |
94 | def wrap_optimizer(self, optimizer, num_loss=1):
95 | return OptimWrapper(optimizer, self, num_loss)
96 |
97 | @contextlib.contextmanager
98 | def scale_loss(self, loss, optimizer):
99 | yield loss
100 |
101 | @property
102 | def has_cache(self):
103 | return False
104 |
105 | @property
106 | def verbose(self):
107 | return False
108 |
109 | def _deactivate(self):
110 | pass
111 |
--------------------------------------------------------------------------------
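
`AmpHandle.scale_loss` above is a context manager: it yields `loss * loss_scale`, and on exit unscales the gradients and arranges for `optimizer.step()` to be skipped if an overflow was detected. A minimal training-step sketch using the handle returned by `amp.init()` (the model and data below are placeholders):

```python
import torch
from apex import amp

model = torch.nn.Linear(10, 10).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
amp_handle = amp.init()  # AmpHandle, or NoOpHandle when amp is disabled

loss = model(torch.randn(4, 10).cuda()).sum()

optimizer.zero_grad()
with amp_handle.scale_loss(loss, optimizer) as scaled_loss:
    scaled_loss.backward()   # gradients are produced at the scaled magnitude
optimizer.step()             # replaced by a no-op for this step if grads overflowed
```
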
/furnace/apex/apex/amp/lists/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/charlesCXK/TorchSSC/2fd21ad25af92cd9f9ad28de3c4bc897c0ae8b43/furnace/apex/apex/amp/lists/__init__.py
--------------------------------------------------------------------------------
/furnace/apex/apex/amp/lists/functional_overrides.py:
--------------------------------------------------------------------------------
1 |
2 | # TODO: think about the following two. They do weird things.
3 | # - torch.nn.utils.clip_grad (but it should always be fp32 anyway)
4 | # - torch.nn.utils.weight_norm
5 |
6 | # Notes:
7 | # F.instance_norm uses batch_norm internally. Which correctly handles
8 | # fp16 in/out with fp32 weights. So we shouldn't do anything for
9 | # either of these.
10 | # F.normalize calls `input.norm()` internally, so it's redundant, but
11 | # kept here in case impl. changes.
12 | # F.cosine_similarity is same: calls `x.norm()` internally.
13 |
14 | import torch.nn.functional
15 |
16 | MODULE = torch.nn.functional
17 |
18 | FP16_FUNCS = [
19 | 'conv1d',
20 | 'conv2d',
21 | 'conv3d',
22 | 'conv_transpose1d',
23 | 'conv_transpose2d',
24 | 'conv_transpose3d',
25 | 'conv_tbc', # Undocumented / maybe new?
26 | 'linear',
27 | ]
28 |
29 | FP32_FUNCS = [
30 | # Pointwise
31 | 'softplus',
32 | 'softmin',
33 | 'log_softmax',
34 | 'softmax',
35 |
36 | # Normalization
37 | 'layer_norm',
38 | 'group_norm',
39 | 'local_response_norm',
40 | 'normalize',
41 | 'cosine_similarity',
42 |
43 | # Loss functions
44 | # TODO: which of these can be fp16?
45 | 'poisson_nll_loss',
46 | 'cosine_embedding_loss',
47 | 'cross_entropy',
48 | 'hinge_embedding_loss',
49 | 'kl_div',
50 | 'l1_loss',
51 | 'mse_loss',
52 | 'margin_ranking_loss',
53 | 'multilabel_margin_loss',
54 | 'multilabel_soft_margin_loss',
55 | 'multi_margin_loss',
56 | 'nll_loss',
57 | 'binary_cross_entropy_with_logits',
58 | 'smooth_l1_loss',
59 | 'soft_margin_loss',
60 | 'triplet_margin_loss'
61 | ]
62 |
63 | BANNED_FUNCS = [
64 | ('binary_cross_entropy',
65 | ("\namp does not work out-of-the-box with `F.binary_cross_entropy` or `torch.nn.BCELoss.` "
66 | "It requires that the output of the previous function be already a FloatTensor. \n\n"
67 | "Most models have a Sigmoid right before BCELoss. In that case, you can use\n"
68 | " torch.nn.BCEWithLogitsLoss\nto combine Sigmoid+BCELoss into a single layer "
69 | "that is compatible with amp.\nAnother option is to add\n"
70 | " amp.register_float_function(torch, 'sigmoid')\nbefore calling `amp.init()`.\n"
71 | "If you _really_ know what you are doing, you can disable this warning by passing "
72 | "allow_banned=True to `amp.init()`."))
73 | ]
74 |
--------------------------------------------------------------------------------
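
The `BANNED_FUNCS` entry above suggests `amp.register_float_function` as one workaround. A short sketch of registering an extra fp32 function before initializing amp; per the warning text, this makes `torch.sigmoid` return a FloatTensor so a following `BCELoss` is legal:

```python
import torch
from apex import amp

# Must be called before amp.init(), as the banned-function message above notes.
amp.register_float_function(torch, 'sigmoid')
amp_handle = amp.init()
```
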
/furnace/apex/apex/amp/lists/tensor_overrides.py:
--------------------------------------------------------------------------------
1 | from .. import compat
2 | from . import torch_overrides
3 |
4 | import importlib
5 |
6 | import torch
7 |
8 | if compat.variable_is_tensor() and not compat.tensor_is_variable():
9 | MODULE = torch.Tensor
10 | else:
11 | MODULE = torch.autograd.Variable
12 |
13 |
14 | FP16_FUNCS = [
15 | '__matmul__',
16 | ]
17 |
18 | FP32_FUNCS = [
19 | '__ipow__',
20 | '__pow__',
21 | '__rpow__',
22 |
23 | # Cast to fp32 before transfer to CPU
24 | 'cpu',
25 | ]
26 |
27 | CASTS = [
28 | '__add__',
29 | '__div__',
30 | '__eq__',
31 | '__ge__',
32 | '__gt__',
33 | '__iadd__',
34 | '__idiv__',
35 | '__imul__',
36 | '__isub__',
37 | '__itruediv__',
38 | '__le__',
39 | '__lt__',
40 | '__mul__',
41 | '__ne__',
42 | '__radd__',
43 | '__rdiv__',
44 | '__rmul__',
45 | '__rsub__',
46 | '__rtruediv__',
47 | '__sub__',
48 | '__truediv__',
49 | ]
50 |
51 | # None of these, but here to make code cleaner.
52 | SEQUENCE_CASTS = []
53 |
54 | # We need to grab all the methods from torch_overrides and add them to
55 | # the Tensor lists as well, as almost all methods are duplicated
56 | # between `torch` and `torch.Tensor` (and check with `hasattr`,
57 | # because a few random ones aren't defined on Tensor)
58 | _self_mod = importlib.import_module(__name__)
59 | for attrname in ['FP16_FUNCS', 'FP32_FUNCS', 'CASTS', 'SEQUENCE_CASTS']:
60 | lst = getattr(_self_mod, attrname)
61 | for fn in getattr(torch_overrides, attrname):
62 | if hasattr(MODULE, fn):
63 | lst.append(fn)
64 |
--------------------------------------------------------------------------------
/furnace/apex/apex/amp/lists/torch_overrides.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | MODULE = torch
4 |
5 | FP16_FUNCS = [
6 | # Math
7 | # TODO: why are these in top-level torch namespace?
8 | 'conv1d',
9 | 'conv2d',
10 | 'conv3d',
11 | 'conv_transpose1d',
12 | 'conv_transpose2d',
13 | 'conv_transpose3d',
14 | 'conv_tbc',
15 |
16 | # BLAS
17 | 'addmm',
18 | 'addmv',
19 | 'addr',
20 | 'matmul',
21 | 'mm',
22 | 'mv',
23 |
24 | ]
25 |
26 | # TODO: ban in-place versions of these in fp16
27 | FP32_FUNCS = [
28 | # Pointwise
29 | 'acos',
30 | 'asin',
31 | 'cosh',
32 | 'erfinv',
33 | 'exp',
34 | 'expm1',
35 | 'log',
36 | 'log10',
37 | 'log2',
38 | 'reciprocal',
39 | 'rsqrt',
40 | 'sinh',
41 | 'tan',
42 |
43 | # Other math
44 | 'pow',
45 |
46 | # Reduction
47 | 'cumprod',
48 | 'cumsum',
49 | 'dist',
50 | 'mean',
51 | 'norm',
52 | 'prod',
53 | 'std',
54 | 'sum',
55 | 'var',
56 |
57 | # Special reduction-like BLAS
58 | 'addbmm',
59 | 'baddbmm',
60 | 'bmm',
61 |
62 | # Misc
63 | 'renorm'
64 | ]
65 |
66 | # Multi-tensor fns that may need type promotion
67 | CASTS = [
68 | # Multi-tensor math
69 | 'addcdiv',
70 | 'addcmul',
71 | 'atan2',
72 | 'cross',
73 |
74 | # Element-wise _or_ tensor-wise math
75 | 'add',
76 | 'div',
77 | 'mul',
78 |
79 | # Comparison
80 | 'eq',
81 | 'equal',
82 | 'ge',
83 | 'gt',
84 | 'le',
85 | 'lt',
86 | 'ne'
87 | ]
88 |
89 | # Will possibly need to promote *all* elements of `seq`
90 | SEQUENCE_CASTS = [
91 | 'cat', # torch.cat(seq, dim=0, out=None)
92 | 'stack' # torch.stack(seq, dim=0, out=None)
93 | ]
94 |
--------------------------------------------------------------------------------
/furnace/apex/apex/amp/opt.py:
--------------------------------------------------------------------------------
1 | import contextlib
2 | import logging
3 | import warnings
4 |
5 | from .scaler import LossScaler, iter_params
6 |
7 | import numpy as np
8 |
9 | class OptimWrapper(object):
10 | def __init__(self, optimizer, amp_handle, num_loss):
11 | self._optimizer = optimizer
12 | self._amp_handle = amp_handle
13 | self._num_loss = num_loss
14 | self._loss_idx = 0
15 | self._skip_next = [False] * num_loss
16 | self._loss_scaler = [LossScaler() for _ in range(num_loss)]
17 |
18 | @contextlib.contextmanager
19 | def scale_loss(self, loss):
20 | if not self._amp_handle.is_active():
21 | yield loss
22 | return
23 |
24 | # When there are multiple losses per-optimizer, we need
25 | # to save out the current grad accumulation, since we won't be
26 | # able to unscale this particular loss once the grads are
27 | # all mixed together.
28 | cached_grads = []
29 | if self._loss_idx > 0:
30 | for p in iter_params(self._optimizer.param_groups):
31 | if p.grad is not None:
32 | cached_grads.append(p.grad.data.detach().clone())
33 | else:
34 | cached_grads.append(None)
35 | self._optimizer.zero_grad()
36 |
37 | loss_scale = self._cur_loss_scaler().loss_scale()
38 | yield loss * loss_scale
39 |
40 | self._skip_next[self._loss_idx] = self._cur_loss_scaler().unscale_and_update(
41 | self._optimizer.param_groups, loss_scale)
42 | self._loss_idx += 1
43 |
44 | if len(cached_grads) > 0:
45 | for p, cached_grad in zip(iter_params(self._optimizer.param_groups),
46 | cached_grads):
47 | if cached_grad is not None:
48 | p.grad.data.add_(cached_grad)
49 | cached_grads = []
50 |
51 | def _cur_loss_scaler(self):
52 | assert 0 <= self._loss_idx < self._num_loss
53 | return self._loss_scaler[self._loss_idx]
54 |
55 | def step(self, closure=None):
56 | if not self._amp_handle.is_active():
57 | return self._optimizer.step(closure=closure)
58 |
59 | self._loss_idx = 0
60 |
61 | for group in self._optimizer.param_groups:
62 | for p in group['params']:
63 | self._amp_handle.remove_cache(p)
64 |
65 | if closure is not None:
66 | raise NotImplementedError(
67 | 'The `closure` argument is unsupported by the amp ' +
68 | 'optimizer wrapper.')
69 | if any(self._skip_next):
70 | logger = logging.getLogger('apex.amp')
71 | logger.info('Gradient overflow, skipping update')
72 | self._skip_next = [False] * self._num_loss
73 | else:
74 | return self._optimizer.step(closure=closure)
75 |
76 | # Forward any attribute lookups
77 | def __getattr__(self, attr):
78 | return getattr(self._optimizer, attr)
79 |
80 | # Forward all torch.optim.Optimizer methods
81 | def __getstate__(self):
82 | return self._optimizer.__getstate__()
83 |
84 | def __setstate__(self):
85 | return self._optimizer.__setstate__()
86 |
87 | def __repr__(self):
88 | return self._optimizer.__repr__()
89 |
90 | def state_dict(self):
91 | return self._optimizer.state_dict()
92 |
93 | def load_state_dict(self, state_dict):
94 | return self._optimizer.load_state_dict(state_dict)
95 |
96 | def zero_grad(self):
97 | return self._optimizer.zero_grad()
98 |
99 | def add_param_group(self, param_group):
100 | return self._optimizer.add_param_group(param_group)
101 |
--------------------------------------------------------------------------------
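
A rough usage sketch of `OptimWrapper` with two losses sharing one optimizer, matching the per-loss scalers and gradient caching above (the two losses here are arbitrary illustrations):

```python
import torch
from apex import amp

model = torch.nn.Linear(8, 8).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

amp_handle = amp.init()
optimizer = amp_handle.wrap_optimizer(optimizer, num_loss=2)  # disables the handle's default scaler

out = model(torch.randn(4, 8).cuda())
loss_a, loss_b = out.mean(), out.pow(2).mean()

optimizer.zero_grad()
# Each loss gets its own LossScaler; grads from the first loss are cached and
# re-added after the second loss has been unscaled (see scale_loss above).
with optimizer.scale_loss(loss_a) as scaled_a:
    scaled_a.backward(retain_graph=True)
with optimizer.scale_loss(loss_b) as scaled_b:
    scaled_b.backward()
optimizer.step()  # resets the loss index; the update is skipped if any loss overflowed
```
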
/furnace/apex/apex/amp/rnn_compat.py:
--------------------------------------------------------------------------------
1 | from . import utils, wrap
2 |
3 | import torch
4 | _VF = torch._C._VariableFunctions
5 | RNN_NAMES = ['rnn_relu', 'rnn_tanh', 'gru', 'lstm']
6 |
7 | def _gen_VF_wrapper(name):
8 | def wrapper(*args, **kwargs):
9 | return getattr(_VF, name)(*args, **kwargs)
10 | return wrapper
11 |
12 | # Some python magic to generate an object that has the rnn cell functions
13 | # defined on it, all of which call into corresponding _VF version.
14 | class VariableFunctionsShim(object):
15 | def __init__(self):
16 | for name in RNN_NAMES:
17 | setattr(self, name + '_cell', _gen_VF_wrapper(name + '_cell'))
18 |
19 | def has_old_rnns():
20 | try:
21 | torch.nn.backends.thnn.backend.LSTMCell
22 | return True
23 | except:
24 | return False
25 |
26 | def whitelist_rnn_cells(handle, verbose):
27 | # Different module + function names in old/new RNN cases
28 | if has_old_rnns():
29 | fn_names = ['RNNReLUCell', 'RNNTanhCell', 'LSTMCell', 'GRUCell']
30 | mod = torch.nn.backends.thnn.backend
31 | else:
32 | fn_names = [x + '_cell' for x in RNN_NAMES]
33 | mod = torch.nn.modules.rnn._VF
34 | assert isinstance(mod, VariableFunctionsShim)
35 |
36 | # Insert casts on cell functions
37 | for fn in fn_names:
38 | wrap.cached_cast(mod, fn, utils.maybe_half, handle,
39 | try_caching=True, verbose=verbose)
40 |
41 | if has_old_rnns():
42 | # Special handling of `backward` for fused gru / lstm:
43 | # The `backward` method calls Tensor.sum() (blacklist) internally,
44 | # and then the resulting grad_input has the wrong type.
45 | # TODO: where else is this a problem?
46 | for rnn_type in ['GRUFused', 'LSTMFused']:
47 | mod = getattr(torch.nn._functions.thnn.rnnFusedPointwise, rnn_type)
48 | wrap.disable_casts(mod, 'backward', handle)
49 |
--------------------------------------------------------------------------------
/furnace/apex/apex/amp/scaler.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import logging
3 |
4 | # from apex_C import scale_check_overflow
5 |
6 | def scale_check_overflow_python(d_grads, scale):
7 | # Exception handling for 18.04 compatibility
8 | try:
9 | cpu_sum = float(d_grads.float().sum())
10 | except RuntimeError as instance:
11 | if "value cannot be converted" not in instance.args[0]:
12 | raise
13 | return True
14 | else:
15 | if cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum:
16 | return True
17 | d_grads.mul_(scale)
18 | return False
19 |
20 | class LossScaler(object):
21 | warned_no_fused_kernel = False
22 | warned_fp16_grad = False
23 | has_fused_kernel = False
24 |
25 | def __init__(self):
26 | self._loss_scale = 2.**16
27 | self._max_loss_scale = 2.**24
28 | self._scale_seq_len = 2000
29 | self._unskipped = 0
30 | self._has_overflow = False
31 | try:
32 | import amp_C
33 | LossScaler.has_fused_kernel = True
34 | LossScaler.scale_check_overflow_cuda = amp_C.scale_check_overflow
35 | self._overflow_buf = torch.cuda.IntTensor([0])
36 | except ImportError as err:
37 | if not LossScaler.warned_no_fused_kernel:
38 | print("Warning: Amp fused downscale kernel is unavailable, possibly because apex "
39 | "was installed without --cuda_ext. Using Python fallback. ImportError was: ",
40 | err)
41 | LossScaler.has_fused_kernel = False
42 | LossScaler.warned_no_fused_kernel = True
43 |
44 | def loss_scale(self):
45 | return self._loss_scale
46 |
47 | def unscale_and_update(self, param_groups, scale):
48 | if LossScaler.has_fused_kernel:
49 | self._overflow_buf.zero_()
50 | self._has_overflow = False
51 | for p in iter_params(param_groups):
52 | if p.grad is not None:
53 | if LossScaler.has_fused_kernel and p.grad.data.type() == "torch.cuda.FloatTensor":
54 | LossScaler.scale_check_overflow_cuda(p.grad.data,
55 | 1./scale,
56 | self._overflow_buf,
57 | p.grad.data)
58 | else:
59 | if (p.grad.data.type() != "torch.cuda.FloatTensor"
60 | and not LossScaler.warned_fp16_grad):
61 | logger = logging.getLogger("apex.amp")
62 | logger.warning("Incoming grads are not fp32 (not master grads). "
63 | "Downscaling non-fp32 grads may indicate an error. "
64 | "When using Amp, you don't need to call .half() on your model.")
65 | LossScaler.warned_fp16_grad = True
66 | self._has_overflow = scale_check_overflow_python(p.grad.data,
67 | 1./scale)
68 | if self._has_overflow:
69 | break
70 |
71 | # If the fused kernel is available, we only need one D2H memcopy and sync.
72 | if LossScaler.has_fused_kernel and not self._has_overflow:
73 | self._has_overflow = self._overflow_buf.item()
74 |
75 | if self._has_overflow:
76 | should_skip = True
77 | self._loss_scale /= 2.
78 | self._unskipped = 0
79 | else:
80 | should_skip = False
81 | self._unskipped += 1
82 |
83 | if self._unskipped == self._scale_seq_len:
84 | self._loss_scale = min(self._max_loss_scale, self._loss_scale * 2.)
85 | self._unskipped = 0
86 |
87 | return should_skip
88 |
89 | def iter_params(param_groups):
90 | for group in param_groups:
91 | for p in group['params']:
92 | yield p
93 |
--------------------------------------------------------------------------------
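
The dynamic policy above starts the loss scale at 2^16, halves it whenever a gradient sum comes back inf/NaN, and doubles it (capped at 2^24) after 2000 consecutive un-skipped steps. A tiny check of the pure-Python overflow test in isolation (assumes apex is importable, including whatever extensions its top-level `__init__.py` requires):

```python
import torch
from apex.amp.scaler import scale_check_overflow_python

good = torch.tensor([1.0, 2.0])
bad = torch.tensor([1.0, float('inf')])

# Finite sum: grads are downscaled in place and False (no overflow) is returned.
print(scale_check_overflow_python(good, 1.0 / 65536))   # False
# Non-finite sum: True is returned and the optimizer step should be skipped.
print(scale_check_overflow_python(bad, 1.0 / 65536))    # True
```
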
/furnace/apex/apex/fp16_utils/README.md:
--------------------------------------------------------------------------------
1 | fp16_optimizer.py contains `FP16_Optimizer`, a Python class designed to wrap an existing PyTorch optimizer and automatically enable master parameters and loss scaling in a manner transparent to the user. To use `FP16_Optimizer`, only two lines of one's training script need to change.
2 |
3 | #### [FP16_Optimizer API documentation](https://nvidia.github.io/apex/fp16_utils.html#automatic-management-of-master-params-loss-scaling)
4 |
5 | #### [Simple examples with FP16_Optimizer](https://github.com/NVIDIA/apex/tree/master/examples/FP16_Optimizer_simple)
6 |
7 | #### [Imagenet with FP16_Optimizer](https://github.com/NVIDIA/apex/tree/master/examples/imagenet)
8 |
9 | #### [word_language_model with FP16_Optimizer](https://github.com/NVIDIA/apex/tree/master/examples/word_language_model)
10 |
11 |
12 | fp16_util.py contains a number of utilities to manually manage master parameters and loss scaling, if the user chooses.
13 |
14 | #### [Manual management documentation](https://nvidia.github.io/apex/fp16_utils.html#manual-master-parameter-management)
15 |
16 | The [Imagenet with FP16_Optimizer](https://github.com/NVIDIA/apex/tree/master/examples/imagenet) and [word_language_model with FP16_Optimizer](https://github.com/NVIDIA/apex/tree/master/examples/word_language_model) directories also contain `main.py` files that demonstrate manual management of master parameters and static loss scaling. These examples illustrate what sort of operations `FP16_Optimizer` is performing automatically.
17 |
--------------------------------------------------------------------------------
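
The README above says only two lines need to change. A minimal sketch of those two changes (the static loss scale of 128 is just an illustrative choice; dynamic loss scaling is also available via `DynamicLossScaler`):

```python
import torch
from apex.fp16_utils import FP16_Optimizer

model = torch.nn.Linear(10, 10).cuda().half()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
optimizer = FP16_Optimizer(optimizer, static_loss_scale=128.0)  # change 1: wrap the optimizer

loss = model(torch.randn(4, 10).cuda().half()).float().sum()

optimizer.zero_grad()
optimizer.backward(loss)  # change 2: replaces loss.backward()
optimizer.step()
```
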
/furnace/apex/apex/fp16_utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .fp16util import (
2 | BN_convert_float,
3 | network_to_half,
4 | prep_param_lists,
5 | model_grads_to_master_grads,
6 | master_params_to_model_params,
7 | tofp16,
8 | to_python_float,
9 | clip_grad_norm,
10 | convert_module,
11 | convert_network,
12 | FP16Model,
13 | )
14 |
15 | from .fp16_optimizer import FP16_Optimizer
16 | from .loss_scaler import LossScaler, DynamicLossScaler
17 |
--------------------------------------------------------------------------------
/furnace/apex/apex/normalization/__init__.py:
--------------------------------------------------------------------------------
1 | from .fused_layer_norm import FusedLayerNorm
2 |
--------------------------------------------------------------------------------
/furnace/apex/apex/optimizers/__init__.py:
--------------------------------------------------------------------------------
1 | from .fused_adam import FusedAdam
2 | from .fp16_optimizer import FP16_Optimizer
3 |
--------------------------------------------------------------------------------
/furnace/apex/apex/optimizers/csrc/fused_adam_cuda.cpp:
--------------------------------------------------------------------------------
1 | #include <torch/extension.h>
2 |
3 | // CUDA forward declaration
4 | void fused_adam_cuda(at::Tensor & p, at::Tensor & p_copy, at::Tensor & m, at::Tensor & v, at::Tensor & g, float lr, float beta1, float beta2, float eps, float grad_scale, int step, int mode, int bias_correction, float decay);
5 |
6 | #define CHECK_CUDA(x) AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor")
7 | #define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
8 | #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
9 |
10 | // C++ interface
11 | void adam(at::Tensor & p, at::Tensor & p_copy, at::Tensor & m, at::Tensor & v, at::Tensor & g, float lr, float beta1, float beta2, float eps, float grad_scale, int step, int mode, int bias_correction, float decay) {
12 | CHECK_INPUT(p)
13 | if (p_copy.numel() > 0) CHECK_INPUT(p_copy);
14 | CHECK_INPUT(m);
15 | CHECK_INPUT(v);
16 | CHECK_INPUT(g);
17 | int64_t num_elem = p.numel();
18 | AT_ASSERTM(m.numel() == num_elem, "number of elements in m and p tensors should be equal");
19 | AT_ASSERTM(v.numel() == num_elem, "number of elements in v and p tensors should be equal");
20 | AT_ASSERTM(g.numel() == num_elem, "number of elements in g and p tensors should be equal");
21 | AT_ASSERTM(p_copy.numel() == num_elem || p_copy.numel() == 0, "number of elements in p_copy and p tensors should be equal, or p_copy should be empty");
22 |
23 | fused_adam_cuda(p, p_copy, m, v, g, lr, beta1, beta2, eps, grad_scale, step, mode, bias_correction, decay);
24 | }
25 |
26 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
27 | m.def("adam", &adam, "Adam optimized CUDA implementation.");
28 | }
29 |
--------------------------------------------------------------------------------
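
On the Python side this binding is consumed by `apex.optimizers.FusedAdam` (exported in `optimizers/__init__.py` above). A constructor-only sketch; the keyword arguments are assumed to mirror `torch.optim.Adam`:

```python
import torch
from apex.optimizers import FusedAdam  # requires apex built with --cuda_ext

model = torch.nn.Linear(10, 10).cuda()
# The fused `adam(...)` kernel bound above performs the whole parameter update
# in a single CUDA launch per tensor.
optimizer = FusedAdam(model.parameters(), lr=1e-3, betas=(0.9, 0.999),
                      eps=1e-8, weight_decay=0.0)

loss = model(torch.randn(4, 10).cuda()).sum()
loss.backward()
optimizer.step()
```
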
/furnace/apex/apex/parallel/LARC.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 | from torch.autograd import Variable
4 | from torch.nn.parameter import Parameter
5 |
6 | class LARC(object):
7 | """
8 | :class:`LARC` is a pytorch implementation of both the scaling and clipping variants of LARC,
9 | in which the ratio between gradient and parameter magnitudes is used to calculate an adaptive
10 | local learning rate for each individual parameter. The algorithm is designed to improve
11 | convergence of large batch training.
12 |
13 | See https://arxiv.org/abs/1708.03888 for calculation of the local learning rate.
14 |
15 | In practice it modifies the gradients of parameters as a proxy for modifying the learning rate
16 | of the parameters. This design allows it to be used as a wrapper around any torch.optim Optimizer.
17 |
18 | ```
19 | model = ...
20 | optim = torch.optim.Adam(model.parameters(), lr=...)
21 | optim = LARC(optim)
22 | ```
23 |
24 | It can even be used in conjunction with apex.fp16_utils.FP16_optimizer.
25 |
26 | ```
27 | model = ...
28 | optim = torch.optim.Adam(model.parameters(), lr=...)
29 | optim = LARC(optim)
30 | optim = apex.fp16_utils.FP16_Optimizer(optim)
31 | ```
32 |
33 | Args:
34 | optimizer: Pytorch optimizer to wrap and modify learning rate for.
35 | trust_coefficient: Trust coefficient for calculating the lr. See https://arxiv.org/abs/1708.03888
36 | clip: Decides between clipping or scaling mode of LARC. If `clip=True` the learning rate is set to `min(optimizer_lr, local_lr)` for each parameter. If `clip=False` the learning rate is set to `local_lr*optimizer_lr`.
37 | eps: epsilon kludge to help with numerical stability while calculating adaptive_lr
38 | """
39 |
40 | def __init__(self, optimizer, trust_coefficient=0.02, clip=True, eps=1e-8):
41 | self.param_groups = optimizer.param_groups
42 | self.optim = optimizer
43 | self.trust_coefficient = trust_coefficient
44 | self.eps = eps
45 | self.clip = clip
46 |
47 | def __getstate__(self):
48 | return self.optim.__getstate__()
49 |
50 | def __setstate__(self, state):
51 | self.optim.__setstate__(state)
52 |
53 | def __repr__(self):
54 | return self.optim.__repr__()
55 |
56 | def state_dict(self):
57 | return self.optim.state_dict()
58 |
59 | def load_state_dict(self, state_dict):
60 | self.optim.load_state_dict(state_dict)
61 |
62 | def zero_grad(self):
63 | self.optim.zero_grad()
64 |
65 | def add_param_group(self, param_group):
66 | self.optim.add_param_group( param_group)
67 |
68 | def step(self):
69 | with torch.no_grad():
70 | weight_decays = []
71 | for group in self.optim.param_groups:
72 | # absorb weight decay control from optimizer
73 | weight_decay = group['weight_decay'] if 'weight_decay' in group else 0
74 | weight_decays.append(weight_decay)
75 | group['weight_decay'] = 0
76 | for p in group['params']:
77 | if p.grad is None:
78 | continue
79 | param_norm = torch.norm(p.data)
80 | grad_norm = torch.norm(p.grad.data)
81 |
82 | if param_norm != 0 and grad_norm != 0:
83 | # calculate adaptive lr + weight decay
84 | adaptive_lr = self.trust_coefficient * (param_norm) / (grad_norm + param_norm * weight_decay + self.eps)
85 |
86 | # clip learning rate for LARC
87 | if self.clip:
88 | # calculation of adaptive_lr so that when multiplied by lr it equals `min(adaptive_lr, lr)`
89 | adaptive_lr = min(adaptive_lr/group['lr'], 1)
90 |
91 | p.grad.data += weight_decay * p.data
92 | p.grad.data *= adaptive_lr
93 |
94 | self.optim.step()
95 | # return weight decay control to optimizer
96 | for i, group in enumerate(self.optim.param_groups):
97 | group['weight_decay'] = weight_decays[i]
98 |
--------------------------------------------------------------------------------
/furnace/apex/apex/parallel/README.md:
--------------------------------------------------------------------------------
1 | ## Distributed Data Parallel
2 |
3 | distributed.py contains the source code for `apex.parallel.DistributedDataParallel`, a module wrapper that enables multi-process multi-GPU data parallel training optimized for NVIDIA's NCCL communication library.
4 |
5 | `apex.parallel.DistributedDataParallel` achieves high performance by overlapping communication with
6 | computation in the backward pass and bucketing smaller transfers to reduce the total number of
7 | transfers required.
8 |
9 | multiproc.py contains the source code for `apex.parallel.multiproc`, a launch utility that places one process on each of the node's available GPUs.
10 |
11 | #### [API Documentation](https://nvidia.github.io/apex/parallel.html)
12 |
13 | #### [Example/Walkthrough](https://github.com/NVIDIA/apex/tree/master/examples/distributed)
14 |
15 | #### [Imagenet example with Mixed Precision](https://github.com/NVIDIA/apex/tree/master/examples/imagenet)
16 |
17 | #### [Simple example with FP16_Optimizer](https://github.com/NVIDIA/apex/tree/master/examples/FP16_Optimizer_simple/distributed_apex)
18 |
19 | ### Synchronized Batch Normalization
20 |
21 | `apex.parallel.SyncBatchNorm` has a similar API to `torch.nn.BatchNorm*N*d`.
22 | It reduces stats on the first (channel) dimension of the Tensor and accepts
23 | arbitrary spatial dimensions.
24 |
25 | #### Installation
26 |
27 | Apex provides two sync BN implementations:
28 |
29 | 1. There is the Python-only implementation, which is the default implementation
30 | when installing with `python setup.py install`.
31 | It uses PyTorch primitive operations and the distributed communication package from
32 | `torch.distributed`.
33 |
34 | - _The Python-only implementation requires the input tensor to be of the same data type as the
35 | layer._
36 |
37 | 2. We also provide an implementation with kernels through a CUDA/C++ extension for
38 | improved performance. We are experimenting with Welford's algorithm and Kahan summation for the reduction,
39 | hoping to get better accuracy.
40 | To use the kernel implementation, users need to install Apex with the CUDA extension
41 | enabled: `python setup.py install --cuda_ext`.
42 |
43 | - _The custom kernel implementation supports fp16 input with an fp32 layer, as cuDNN does.
44 | This is required to run the imagenet example in fp16._
45 |
46 | - _Currently the kernel implementation only supports GPU._
47 |
48 | #### HowTo
49 |
50 | 1. Users can use `apex.parallel.SyncBatchNorm` by building their module with
51 | the layer explicitly.
52 |
53 | ```
54 | import torch, apex
55 | input_t = torch.randn(3, 5, 20).cuda()
56 | sbn = apex.parallel.SyncBatchNorm(5).cuda()
57 | output_t = sbn(input_t)
58 | ```
59 |
60 | 2. Users can also take a constructed `torch.nn.Module` and replace all of its `torch.nn.BatchNorm*N*d` modules with `apex.parallel.SyncBatchNorm` through the utility function `apex.parallel.convert_syncbn_model`.
61 |
62 | ```
63 | # model is an instance of torch.nn.Module
64 | import apex
65 | sync_bn_model = apex.parallel.convert_syncbn_model(model)
66 | ```
67 |
--------------------------------------------------------------------------------
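
A minimal sketch of the wrapper described above, assuming a launcher (e.g. `apex.parallel.multiproc` or `torch.distributed.launch`) has already set up the per-process rank and the usual `RANK`/`WORLD_SIZE`/`MASTER_ADDR`/`MASTER_PORT` environment variables:

```python
import torch
import apex

torch.distributed.init_process_group(backend='nccl', init_method='env://')
torch.cuda.set_device(torch.distributed.get_rank() % torch.cuda.device_count())

model = torch.nn.Linear(10, 10).cuda()
model = apex.parallel.convert_syncbn_model(model)     # optional: swap BatchNorm layers for SyncBatchNorm
model = apex.parallel.DistributedDataParallel(model)  # overlaps gradient allreduce with the backward pass
```
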
/furnace/apex/apex/parallel/__init__.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | if hasattr(torch.distributed, 'ReduceOp'):
4 | ReduceOp = torch.distributed.ReduceOp
5 | elif hasattr(torch.distributed, 'reduce_op'):
6 | ReduceOp = torch.distributed.reduce_op
7 | else:
8 | ReduceOp = torch.distributed.deprecated.reduce_op
9 |
10 | from .distributed import DistributedDataParallel, Reducer
11 | # This is tricky because I'd like SyncBatchNorm to be exposed the same way
12 | # for both the cuda-enabled and python-fallback versions, and I don't want
13 | # to suppress the error information.
14 | try:
15 | import syncbn
16 | from .optimized_sync_batchnorm import SyncBatchNorm
17 | except ImportError as err:
18 | from .sync_batchnorm import SyncBatchNorm
19 | SyncBatchNorm.syncbn_import_error = err
20 |
21 | def convert_syncbn_model(module, process_group=None, channel_last=False):
22 | '''
23 | Recursively traverse module and its children to replace all instances of
24 | ``torch.nn.modules.batchnorm._BatchNorm`` with :class:`apex.parallel.SyncBatchNorm`.
25 |
26 | All ``torch.nn.BatchNorm*N*d`` wrap around
27 | ``torch.nn.modules.batchnorm._BatchNorm``, so this function lets you easily switch
28 | to use sync BN.
29 |
30 | Args:
31 | module (torch.nn.Module): input module
32 |
33 | Example::
34 |
35 | >>> # model is an instance of torch.nn.Module
36 | >>> import apex
37 | >>> sync_bn_model = apex.parallel.convert_syncbn_model(model)
38 | '''
39 | mod = module
40 | if isinstance(module, torch.nn.modules.batchnorm._BatchNorm):
41 | mod = SyncBatchNorm(module.num_features, module.eps, module.momentum, module.affine, module.track_running_stats, process_group, channel_last=channel_last)
42 | mod.running_mean = module.running_mean
43 | mod.running_var = module.running_var
44 | if module.affine:
45 | mod.weight.data = module.weight.data.clone().detach()
46 | mod.bias.data = module.bias.data.clone().detach()
47 | for name, child in module.named_children():
48 | mod.add_module(name, convert_syncbn_model(child,
49 | process_group=process_group,
50 | channel_last=channel_last))
51 | # TODO(jie) should I delete model explicitly?
52 | del module
53 | return mod
54 |
55 | def create_syncbn_process_group(group_size):
56 | '''
57 | Creates process groups to be used for syncbn of a given ``group_size`` and returns the
58 | process group that the current GPU participates in.
59 |
60 | ``group_size`` must divide the total number of GPUs (world_size).
61 |
62 | A ``group_size`` of 0 is treated as equal to ``world_size``; in this case ``None`` will be returned.
63 |
64 | A ``group_size`` of 1 is equivalent to using non-sync bn, but will still carry the sync overhead.
65 |
66 | Args:
67 | group_size (int): number of GPUs to collaborate for sync bn
68 |
69 | Example::
70 |
71 | >>> # model is an instance of torch.nn.Module
72 | >>> import apex
73 | >>> group = apex.parallel.create_syncbn_process_group(group_size)
74 | '''
75 |
76 | if group_size==0:
77 | return None
78 |
79 | world_size = torch.distributed.get_world_size()
80 | assert(world_size >= group_size)
81 | assert(world_size % group_size == 0)
82 |
83 | group=None
84 | for group_num in (range(world_size//group_size)):
85 | group_ids = range(group_num*group_size, (group_num+1)*group_size)
86 | cur_group = torch.distributed.new_group(ranks=group_ids)
87 | if (torch.distributed.get_rank()//group_size == group_num):
88 | group = cur_group
89 | #can not drop out and return here, every process must go through creation of all subgroups
90 |
91 | assert(group is not None)
92 | return group
93 |
--------------------------------------------------------------------------------
/furnace/apex/apex/parallel/multiproc.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import sys
3 | import subprocess
4 |
5 | def docstring_hack():
6 | """
7 | Multiproc file which will launch a set of processes locally for multi-gpu
8 | usage: python -m apex.parallel.multiproc main.py ...
9 | """
10 | pass
11 |
12 | argslist = list(sys.argv)[1:]
13 | world_size = torch.cuda.device_count()
14 |
15 | if '--world-size' in argslist:
16 | world_size = int(argslist[argslist.index('--world-size')+1])
17 | else:
18 | argslist.append('--world-size')
19 | argslist.append(str(world_size))
20 |
21 | workers = []
22 |
23 | for i in range(world_size):
24 | if '--rank' in argslist:
25 | argslist[argslist.index('--rank')+1] = str(i)
26 | else:
27 | argslist.append('--rank')
28 | argslist.append(str(i))
29 | stdout = None if i == 0 else open("GPU_"+str(i)+".log", "w")
30 | print(argslist)
31 | p = subprocess.Popen([str(sys.executable)]+argslist, stdout=stdout)
32 | workers.append(p)
33 |
34 | for p in workers:
35 | p.wait()
36 |
--------------------------------------------------------------------------------
/furnace/apex/apex/parallel/optimized_sync_batchnorm.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch.nn.modules.batchnorm import _BatchNorm
3 | from torch.nn import functional as F
4 |
5 | import syncbn
6 | from .optimized_sync_batchnorm_kernel import SyncBatchnormFunction
7 |
8 |
9 | class SyncBatchNorm(_BatchNorm):
10 | """
11 | synchronized batch normalization module extended from `torch.nn.BatchNormNd`
12 | with the added stats reduction across multiple processes.
13 | :class:`apex.parallel.SyncBatchNorm` is designed to work with
14 | `DistributedDataParallel`.
15 |
16 | When running in training mode, the layer reduces stats across all processes
17 | to increase the effective batch size of the normalization layer. This is useful
18 | in applications where the per-process batch size is small enough that it would
19 | diminish the converged accuracy of the model. The module uses the collective
20 | communication package from `torch.distributed`.
21 |
22 | When running in evaluation mode, the layer falls back to
23 | `torch.nn.functional.batch_norm`.
24 |
25 | Args:
26 | num_features: :math:`C` from an expected input of size
27 | :math:`(N, C, L)` or :math:`L` from input of size :math:`(N, L)`
28 | eps: a value added to the denominator for numerical stability.
29 | Default: 1e-5
30 | momentum: the value used for the running_mean and running_var
31 | computation. Can be set to ``None`` for cumulative moving average
32 | (i.e. simple average). Default: 0.1
33 | affine: a boolean value that when set to ``True``, this module has
34 | learnable affine parameters. Default: ``True``
35 | track_running_stats: a boolean value that when set to ``True``, this
36 | module tracks the running mean and variance, and when set to ``False``,
37 | this module does not track such statistics and always uses batch
38 | statistics in both training and eval modes. Default: ``True``
39 | process_group: pass in a process group within which the stats of the
40 | mini-batch is being synchronized. ``None`` for using default process
41 | group
42 | channel_last: a boolean value that when set to ``True``, this module
43 | takes the last dimension of the input tensor to be the channel
44 | dimension. Default: ``False``
45 |
46 | Examples::
47 | >>> # channel first tensor
48 | >>> sbn = apex.parallel.SyncBatchNorm(100).cuda()
49 | >>> inp = torch.randn(10, 100, 14, 14).cuda()
50 | >>> out = sbn(inp)
51 | >>> inp = torch.randn(3, 100, 20).cuda()
52 | >>> out = sbn(inp)
53 | >>> # channel last tensor
54 | >>> sbn = apex.parallel.SyncBatchNorm(100, channel_last=True).cuda()
55 | >>> inp = torch.randn(10, 14, 14, 100).cuda()
56 | """
57 |
58 | def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True, track_running_stats=True, process_group=None, channel_last=False):
59 | super(SyncBatchNorm, self).__init__(num_features, eps=eps, momentum=momentum, affine=affine, track_running_stats=track_running_stats)
60 | self.process_group = process_group
61 | self.channel_last = channel_last
62 |
63 | def _specify_process_group(self, process_group):
64 | self.process_group = process_group
65 |
66 | def _specify_channel_last(self, channel_last):
67 | self.channel_last = channel_last
68 |
69 | def forward(self, input):
70 | if not self.training and self.track_running_stats and not self.channel_last:
71 | # fall back to pytorch implementation for inference
72 | return F.batch_norm(input, self.running_mean, self.running_var, self.weight, self.bias, False, 0.0, self.eps)
73 | else:
74 | exponential_average_factor = 0.0
75 | if self.training and self.track_running_stats:
76 | self.num_batches_tracked += 1
77 | if self.momentum is None:
78 | exponential_average_factor = 1.0 / float(self.num_batches_tracked)
79 | else:
80 | exponential_average_factor = self.momentum
81 | return SyncBatchnormFunction.apply(input, self.weight, self.bias, self.running_mean, self.running_var, self.eps, self.training or not self.track_running_stats, exponential_average_factor, self.process_group, self.channel_last)
82 |
--------------------------------------------------------------------------------
/furnace/apex/apex/parallel/optimized_sync_batchnorm_kernel.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch.autograd.function import Function
3 |
4 | import syncbn
5 | from apex.parallel import ReduceOp
6 |
7 | class SyncBatchnormFunction(Function):
8 |
9 | @staticmethod
10 | def forward(ctx, input, weight, bias, running_mean, running_variance, eps, track_running_stats = True, momentum = 1.0, process_group = None, channel_last = False):
11 | torch.cuda.nvtx.range_push("sync_BN_fw")
12 | input = input.contiguous()
13 | world_size = 0
14 |
15 | mean = None
16 | var_biased = None
17 | inv_std = None
18 | var = None
19 | out = None
20 | count = None
21 | if track_running_stats:
22 | if channel_last:
23 | count = int(input.numel()/input.size(-1))
24 | mean, var_biased = syncbn.welford_mean_var_c_last(input)
25 | else:
26 | count = int(input.numel()/input.size(1))
27 | mean, var_biased = syncbn.welford_mean_var(input)
28 |
29 | if torch.distributed.is_initialized():
30 | if not process_group:
31 | process_group = torch.distributed.group.WORLD
32 | world_size = torch.distributed.get_world_size(process_group)
33 | mean_all = torch.empty(world_size, mean.size(0), dtype=mean.dtype, device=mean.device)
34 | var_all = torch.empty(world_size, var_biased.size(0), dtype=var_biased.dtype, device=var_biased.device)
35 | mean_l = [mean_all.narrow(0, i, 1) for i in range(world_size)]
36 | var_l = [var_all.narrow(0, i, 1) for i in range(world_size)]
37 | torch.distributed.all_gather(mean_l, mean, process_group)
38 | torch.distributed.all_gather(var_l, var_biased, process_group)
39 | mean, var, inv_std = syncbn.welford_parallel(mean_all, var_all, count, eps)
40 | # TODO(Jie): should do fp32 math instead!
41 | else:
42 | inv_std = 1.0 / torch.sqrt(var_biased + eps)
43 | var = var_biased * (count) / (count-1)
44 |
45 | r_m_inc = mean if running_mean.dtype != torch.float16 else mean.half()
46 | r_v_inc = var if running_variance.dtype != torch.float16 else var.half()
47 | running_mean.data = running_mean.data * (1-momentum) + momentum*r_m_inc
48 | running_variance.data = running_variance.data * (1-momentum) + momentum*r_v_inc
49 | else:
50 | mean = running_mean.data
51 | inv_std = 1.0 / torch.sqrt(running_variance.data + eps)
52 |
53 | ctx.save_for_backward(input, weight, mean, inv_std)
54 | ctx.process_group = process_group
55 | ctx.channel_last = channel_last
56 | ctx.world_size = world_size
57 |
58 | if channel_last:
59 | out = syncbn.batchnorm_forward_c_last(input, mean, inv_std, weight, bias)
60 | else:
61 | out = syncbn.batchnorm_forward(input, mean, inv_std, weight, bias)
62 |
63 | torch.cuda.nvtx.range_pop()
64 | return out
65 |
66 | @staticmethod
67 | def backward(ctx, grad_output):
68 | grad_output = grad_output.contiguous()
69 | torch.cuda.nvtx.range_push("sync_BN_bw")
70 | # mini batch mean & var are calculated by forward path.
71 | # mu = 1./N*np.sum(h, axis = 0)
72 | # var = 1./N*np.sum((h-mu)**2, axis = 0)
73 | saved_input, weight, mean, inv_std = ctx.saved_tensors
74 | process_group = ctx.process_group
75 | channel_last = ctx.channel_last
76 | world_size = ctx.world_size
77 | grad_input = grad_weight = grad_bias = None
78 |
79 | # TODO(jie): why do I have to clone here? life time of grad_output?
80 | if channel_last:
81 | mean_dy, mean_dy_xmu, grad_weight, grad_bias = syncbn.reduce_bn_c_last(grad_output, saved_input, mean, inv_std, weight)
82 | else:
83 | mean_dy, mean_dy_xmu, grad_weight, grad_bias = syncbn.reduce_bn(grad_output, saved_input, mean, inv_std, weight)
84 |
85 | # calculate grad_input
86 | if ctx.needs_input_grad[0]:
87 |
88 | if torch.distributed.is_initialized():
89 | torch.distributed.all_reduce(
90 | mean_dy, ReduceOp.SUM, process_group)
91 | mean_dy = mean_dy / world_size
92 | torch.distributed.all_reduce(
93 | mean_dy_xmu, ReduceOp.SUM, process_group)
94 | mean_dy_xmu = mean_dy_xmu / world_size
95 | if channel_last:
96 | grad_input = syncbn.batchnorm_backward_c_last(grad_output, saved_input, mean, inv_std, weight, mean_dy, mean_dy_xmu)
97 | else:
98 | grad_input = syncbn.batchnorm_backward(grad_output, saved_input, mean, inv_std, weight, mean_dy, mean_dy_xmu)
99 |
100 | if weight is None or not ctx.needs_input_grad[1]:
101 | grad_weight = None
102 |
103 | if weight is None or not ctx.needs_input_grad[2]:
104 | grad_bias = None
105 |
106 | torch.cuda.nvtx.range_pop()
107 | return grad_input, grad_weight, grad_bias, None, None, None, None, None, None, None
108 |
--------------------------------------------------------------------------------
/furnace/apex/apex/parallel/sync_batchnorm_kernel.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch.autograd.function import Function
3 |
4 | from apex.parallel import ReduceOp
5 |
6 |
7 | class SyncBatchnormFunction(Function):
8 |
9 | @staticmethod
10 | def forward(ctx, input, weight, bias, running_mean, running_variance, eps, process_group, world_size):
11 | torch.cuda.nvtx.range_push("sync_BN_fw")
12 | # transpose it to channel last to support broadcasting for input with different rank
13 | c_last_input = input.transpose(1, -1).contiguous().clone()
14 |
15 | ctx.save_for_backward(c_last_input, weight, bias,
16 | running_mean, running_variance)
17 | ctx.eps = eps
18 | ctx.process_group = process_group
19 | ctx.world_size = world_size
20 |
21 | c_last_input = (c_last_input - running_mean) / \
22 | torch.sqrt(running_variance + eps)
23 |
24 | if weight is not None:
25 | c_last_input = c_last_input * weight
26 | if bias is not None:
27 | c_last_input = c_last_input + bias
28 |
29 | torch.cuda.nvtx.range_pop()
30 | return c_last_input.transpose(1, -1).contiguous().clone()
31 |
32 | @staticmethod
33 | def backward(ctx, grad_output):
34 | torch.cuda.nvtx.range_push("sync_BN_bw")
35 | # mini batch mean & var are calculated by forward path.
36 | # mu = 1./N*np.sum(h, axis = 0)
37 | # var = 1./N*np.sum((h-mu)**2, axis = 0)
38 | c_last_input, weight, bias, running_mean, running_variance = ctx.saved_tensors
39 |
40 | eps = ctx.eps
41 | process_group = ctx.process_group
42 | world_size = ctx.world_size
43 | grad_input = grad_weight = grad_bias = None
44 | num_features = running_mean.size()[0]
45 |
46 | # transpose it to channel last to support broadcasting for input with different rank
47 | torch.cuda.nvtx.range_push("carilli field")
48 | c_last_grad = grad_output.transpose(1, -1).contiguous()
49 | # squash non-channel dimension so we can easily calculate mean
50 | c_grad = c_last_grad.view(-1, num_features).contiguous()
51 | torch.cuda.nvtx.range_pop()
52 |
53 | # calculate grad_input
54 | if ctx.needs_input_grad[0]:
55 | # dh = gamma * (var + eps)**(-1. / 2.) * (dy - np.mean(dy, axis=0)
56 | # - (h - mu) * (var + eps)**(-1.0) * np.mean(dy * (h - mu), axis=0))
57 | mean_dy = c_grad.mean(0)
58 | mean_dy_xmu = (c_last_grad * (c_last_input -
59 | running_mean)).view(-1, num_features).mean(0)
60 | if torch.distributed.is_initialized():
61 | torch.distributed.all_reduce(
62 | mean_dy, ReduceOp.SUM, process_group)
63 | mean_dy = mean_dy / world_size
64 | torch.distributed.all_reduce(
65 | mean_dy_xmu, ReduceOp.SUM, process_group)
66 | mean_dy_xmu = mean_dy_xmu / world_size
67 | c_last_grad_input = (c_last_grad - mean_dy - (c_last_input - running_mean) / (
68 | running_variance + eps) * mean_dy_xmu) / torch.sqrt(running_variance + eps)
69 | if weight is not None:
70 | c_last_grad_input.mul_(weight)
71 | grad_input = c_last_grad_input.transpose(1, -1).contiguous()
72 |
73 | # calculate grad_weight
74 | grad_weight = None
75 | if weight is not None and ctx.needs_input_grad[1]:
76 | # dgamma = np.sum((h - mu) * (var + eps)**(-1. / 2.) * dy, axis=0)
77 | grad_weight = ((c_last_input - running_mean) / torch.sqrt(
78 | running_variance + eps) * c_last_grad).view(-1, num_features).sum(0)
79 |
80 | # calculate grad_bias
81 | grad_bias = None
82 | if bias is not None and ctx.needs_input_grad[2]:
83 | # dbeta = np.sum(dy, axis=0)
84 | grad_bias = c_grad.sum(0)
85 |
86 | torch.cuda.nvtx.range_pop()
87 | return grad_input, grad_weight, grad_bias, None, None, None, None, None
88 |
--------------------------------------------------------------------------------
/furnace/apex/apex/reparameterization/README.md:
--------------------------------------------------------------------------------
1 | Under construction...
2 |
--------------------------------------------------------------------------------
/furnace/apex/apex/reparameterization/weight_norm.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch.nn.parameter import Parameter
3 | from ..fp16_utils import Fused_Weight_Norm
4 | import time
5 |
6 | from .reparameterization import Reparameterization
7 |
8 | def _norm(p, dim):
9 | """Computes the norm over all dimensions except dim"""
10 | if dim is None:
11 | return p.norm()
12 | elif dim == 0:
13 | output_size = (p.size(0),) + (1,) * (p.dim() - 1)
14 | return p.contiguous().view(p.size(0), -1).norm(dim=1).view(*output_size)
15 | elif dim == p.dim() - 1:
16 | output_size = (1,) * (p.dim() - 1) + (p.size(-1),)
17 | return p.contiguous().view(-1, p.size(-1)).norm(dim=0).view(*output_size)
18 | return _norm(p.transpose(0, dim), 0).transpose(0, dim)
19 |
20 | HALF_TYPES = (torch.cuda.HalfTensor, torch.HalfTensor)
21 |
22 | class WeightNorm(Reparameterization):
23 | """
24 | Weight normalization is a reparameterization that decouples the magnitude
25 | of a weight tensor from its direction. This replaces the parameter specified
26 | by `name` (e.g. "weight") with two parameters: one specifying the magnitude
27 | (e.g. "weight_g") and one specifying the direction (e.g. "weight_v").
28 | Weight normalization is implemented via a hook that recomputes the weight
29 | tensor from the magnitude and direction before every :meth:`~Module.forward`
30 | call.
31 |
32 | .. math::
33 | \mathbf{w} = g \dfrac{\mathbf{v}}{\|\mathbf{v}\|}
34 |
35 | By default, with `dim=0`, the norm is computed independently per output
36 | channel/plane. To compute a norm over the entire weight tensor, use
37 | `dim=None`.
38 | """
39 | def compute_weight(self, module=None, name=None):
40 | """
41 | Computes the weight-normalized weight value to assign to the module attribute
42 | with name `name`.
43 | Arguments:
44 | module (nn.Module): module with weight we'd like to reparameterize
45 | Returns:
46 | w (Tensor): Tensor object containing value of reparameterized weight
47 | """
48 | if module is None:
49 | module = self.module
50 | if name is None:
51 | name = self.name
52 | module, name = Reparameterization.get_module_and_name(module, name)
53 | g = getattr(module, name + '_g')
54 | v = getattr(module, name + '_v')
55 |
56 | fused_weight_norm = Fused_Weight_Norm.apply
57 | v = v.contiguous()
58 | w = fused_weight_norm(v, g, self.dim)
59 |
60 | return w
61 |
62 | def reparameterize(self, name, weight, dim):
63 | """
64 | Creates Parameters v and g to be used for weight normalization
65 | and creates the names of the module attributes that these Parameters
66 | will correspond to. The parameters will be registered according to the names
67 | provided.
68 | Arguments:
69 | module (nn.Module): module with weight we'd like to reparameterize
70 | name (str, optional): name of weight parameter
71 | dim (int, optional): dimension over which to compute parameterization
72 | Returns:
73 | names (list, str): names of Parameters to be used for reparameterization
74 | params (list, Parameter): Parameters to be used for reparameterization
75 | """
76 | names = [name + '_g', name + '_v']
77 | params = [Parameter(_norm(weight, dim).data), Parameter(weight.data)]
78 | return names, params
79 |
--------------------------------------------------------------------------------
/furnace/apex/csrc/flatten_unflatten.cpp:
--------------------------------------------------------------------------------
1 | #include <torch/extension.h>
2 | #include <torch/csrc/utils/tensor_flatten.h>
3 | // https://github.com/pytorch/pytorch/blob/master/torch/csrc/utils/tensor_flatten.h
4 |
5 | at::Tensor flatten(std::vector<at::Tensor> tensors)
6 | {
7 | return torch::utils::flatten_dense_tensors(tensors);
8 | }
9 |
10 | std::vector<at::Tensor> unflatten(at::Tensor flat, std::vector<at::Tensor> tensors)
11 | {
12 | return torch::utils::unflatten_dense_tensors(flat, tensors);
13 | }
14 |
15 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
16 | m.def("flatten", &flatten, "Flatten dense tensors");
17 | m.def("unflatten", &unflatten, "Unflatten dense tensors");
18 | }
19 |
--------------------------------------------------------------------------------
/furnace/apex/csrc/scale_check_overflow.cpp:
--------------------------------------------------------------------------------
1 | #include <torch/extension.h>
2 |
3 | void scale_check_overflow_cuda(const at::Tensor& grads,
4 | float scale,
5 | const at::Tensor& d_buf,
6 | const at::Tensor& downscaled_grads);
7 |
8 | void scale_check_overflow(at::Tensor grads,
9 | float scale,
10 | at::Tensor overflow_buf,
11 | at::Tensor downscaled_grads)
12 | // const at::optional<at::Tensor> downscaled_grads)
13 | {
14 | AT_CHECK(grads.type().is_cuda(), "grads must be a CUDA tensor");
15 | AT_CHECK(grads.is_contiguous(), "grads must be contiguous");
16 | AT_CHECK(overflow_buf.type().is_cuda(), "overflow_buf must be a CUDA tensor");
17 | AT_CHECK(overflow_buf.is_contiguous(), "overflow_buf must be contiguous");
18 | AT_CHECK(downscaled_grads.type().is_cuda(), "downscaled_grads must be a CUDA tensor");
19 | AT_CHECK(downscaled_grads.is_contiguous(), "downscaled_grads must be contiguous");
20 | // Make sure we are downscaling the FP32 master grads
21 | AT_CHECK(downscaled_grads.type().scalarType() == at::ScalarType::Float,
22 | "The output grads supplied to scale_check_overflow should be fp32 (master grads).")
23 | AT_CHECK(grads.numel() == downscaled_grads.numel(), "Input and output grads must be the same size.");
24 |
25 | scale_check_overflow_cuda(grads, scale, overflow_buf, downscaled_grads);
26 | }
27 |
28 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
29 | m.def("scale_check_overflow", &scale_check_overflow, "Fused overflow check + scale for FP32 tensors");
30 | }
31 |
--------------------------------------------------------------------------------
/furnace/apex/csrc/scale_check_overflow_kernel.cu:
--------------------------------------------------------------------------------
1 | #include <ATen/ATen.h>
2 | #include <ATen/AccumulateType.h>
3 | #include <ATen/cuda/CUDAContext.h>
4 | #include <ATen/cuda/Exceptions.h>
5 |
6 | #include <cuda.h>
7 | #include <cuda_runtime.h>
8 |
9 | #define BLOCK_SIZE 1024
10 | #define NBLOCKS 160
11 |
12 | // It makes sense to lock the output type to fp32 because the downscaled
13 | // grads should be master grads (and in the case of Amp, the params and their
14 | // gradients should always be fp32).
15 |
16 | // This can be optimized with ILP but it's fine for now.
17 | template <typename in_t>
18 | __global__ void scale_reduce_overflow(in_t* in,
19 | float* out,
20 | int n,
21 | float scale,
22 | volatile int* overflow_global)
23 | {
24 | __shared__ int overflow;
25 |
26 | int tid = blockIdx.x*blockDim.x + threadIdx.x;
27 | int stride = gridDim.x*blockDim.x;
28 |
29 | // Non-divergent exit condition for the __syncthreads
30 | for(int i = tid; i - threadIdx.x < n; i += stride)
31 | {
32 | if(threadIdx.x == 0)
33 | overflow = *overflow_global;
34 |
35 | __syncthreads();
36 |
37 | if(overflow == 1)
38 | break;
39 |
40 | if(i < n)
41 | {
42 | float incoming_val = static_cast<float>(in[i]);
43 | if(isfinite(incoming_val))
44 | out[i] = incoming_val*scale;
45 | else
46 | *overflow_global = 1; // Blindly fire off a write. These will race but that's ok.
47 | // This is NOT guaranteed to be seen immediately by thread 0 on the next iteration.
48 | // I wonder if there's a way we can rig the short-circuiting with only one syncthreads.
49 | // It's possible we can just lean on the cache (no smem or syncs) and still be fast.
50 | }
51 | }
52 | }
53 |
54 |
55 | void scale_check_overflow_cuda
56 | (const at::Tensor& grads,
57 | float scale,
58 | const at::Tensor& overflow_buf,
59 | const at::Tensor& downscaled_grads)
60 | {
61 | using namespace at;
62 | cudaStream_t stream = at::cuda::getCurrentCUDAStream();
63 |
64 | int n = grads.numel();
65 |
66 | // Lock the output (downscaled) type to float.
67 | AT_DISPATCH_FLOATING_TYPES_AND_HALF(grads.type(),
68 | "scale_check_overflow_cuda",
69 | [&]
70 | {
71 | // using accscalar_t = acc_type<scalar_t, true>;
72 | scale_reduce_overflow<<<NBLOCKS, BLOCK_SIZE, 0, stream>>>
73 | (grads.data<scalar_t>(),
74 | downscaled_grads.data<float>(),
75 | n,
76 | scale,
77 | overflow_buf.data<int>());
78 | });
79 |
80 | AT_CUDA_CHECK(cudaGetLastError());
81 | }
82 |
--------------------------------------------------------------------------------
/furnace/apex/dist/apex-0.1-py3.6-linux-x86_64.egg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/charlesCXK/TorchSSC/2fd21ad25af92cd9f9ad28de3c4bc897c0ae8b43/furnace/apex/dist/apex-0.1-py3.6-linux-x86_64.egg
--------------------------------------------------------------------------------
/furnace/apex/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line.
5 | SPHINXOPTS =
6 | SPHINXBUILD = sphinx-build
7 | SPHINXPROJ = NVIDIAAPEX
8 | SOURCEDIR = source
9 | BUILDDIR = build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | gh-pages:
16 | git checkout gh-pages
17 | rm -rf build
18 | rm -rf source
19 | git checkout master -- .
20 | make html
21 | rm -rf ../_modules ../_sources ../_static
22 | mv -fv build/html/* ../
23 | rm -rf build
24 | git add -A
25 | git commit -m "Generated gh-pages for `git log master -1 --pretty=short --abbrev-commit`" && git push origin gh-pages ; git checkout master
26 |
27 | .PHONY: help Makefile
28 |
29 | # Catch-all target: route all unknown targets to Sphinx using the new
30 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
31 | %: Makefile
32 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
33 |
--------------------------------------------------------------------------------
/furnace/apex/docs/source/_static/css/pytorch_theme.css:
--------------------------------------------------------------------------------
1 | body {
2 | font-family: "Lato","proxima-nova","Helvetica Neue",Arial,sans-serif;
3 | }
4 |
5 | /* Default header fonts are ugly */
6 | h1, h2, .rst-content .toctree-wrapper p.caption, h3, h4, h5, h6, legend, p.caption {
7 | font-family: "Lato","proxima-nova","Helvetica Neue",Arial,sans-serif;
8 | }
9 |
10 | /* Use white for docs background */
11 | .wy-side-nav-search {
12 | background-color: #fff;
13 | }
14 |
15 | .wy-nav-content-wrap, .wy-menu li.current > a {
16 | background-color: #fff;
17 | }
18 |
19 | @media screen and (min-width: 1400px) {
20 | .wy-nav-content-wrap {
21 | background-color: rgba(0, 0, 0, 0.0470588);
22 | }
23 |
24 | .wy-nav-content {
25 | background-color: #fff;
26 | }
27 | }
28 |
29 | /* Fixes for mobile */
30 | .wy-nav-top {
31 | background-color: #fff;
32 | background-image: url('../img/apex.jpg');
33 | background-repeat: no-repeat;
34 | background-position: center;
35 | padding: 0;
36 | margin: 0.4045em 0.809em;
37 | color: #333;
38 | }
39 |
40 | .wy-nav-top > a {
41 | display: none;
42 | }
43 |
44 | @media screen and (max-width: 768px) {
45 | .wy-side-nav-search>a img.logo {
46 | height: 60px;
47 | }
48 | }
49 |
50 | /* This is needed to ensure that logo above search scales properly */
51 | .wy-side-nav-search a {
52 | display: block;
53 | }
54 |
55 | /* This ensures that multiple constructors will remain in separate lines. */
56 | .rst-content dl:not(.docutils) dt {
57 | display: table;
58 | }
59 |
60 | /* Use our red for literals (it's very similar to the original color) */
61 | .rst-content tt.literal, .rst-content tt.literal, .rst-content code.literal {
62 | color: #F05732;
63 | }
64 |
65 | .rst-content tt.xref, a .rst-content tt, .rst-content tt.xref,
66 | .rst-content code.xref, a .rst-content tt, a .rst-content code {
67 | color: #404040;
68 | }
69 |
70 | /* Change link colors (except for the menu) */
71 |
72 | a {
73 | color: #F05732;
74 | }
75 |
76 | a:hover {
77 | color: #F05732;
78 | }
79 |
80 |
81 | a:visited {
82 | color: #D44D2C;
83 | }
84 |
85 | .wy-menu a {
86 | color: #b3b3b3;
87 | }
88 |
89 | .wy-menu a:hover {
90 | color: #b3b3b3;
91 | }
92 |
93 | /* Default footer text is quite big */
94 | footer {
95 | font-size: 80%;
96 | }
97 |
98 | footer .rst-footer-buttons {
99 | font-size: 125%; /* revert footer settings - 1/80% = 125% */
100 | }
101 |
102 | footer p {
103 | font-size: 100%;
104 | }
105 |
106 | /* For hidden headers that appear in TOC tree */
107 | /* see http://stackoverflow.com/a/32363545/3343043 */
108 | .rst-content .hidden-section {
109 | display: none;
110 | }
111 |
112 | nav .hidden-section {
113 | display: inherit;
114 | }
115 |
116 | .wy-side-nav-search>div.version {
117 | color: #000;
118 | }
119 |
--------------------------------------------------------------------------------
/furnace/apex/docs/source/_templates/layout.html:
--------------------------------------------------------------------------------
1 | {% extends "!layout.html" %}
2 | {% block sidebartitle %} {{ super() }}
3 |
4 |
32 | {% endblock %}
33 |
34 | {% block footer %} {{ super() }}
35 |
36 |
51 | {% endblock %}
52 |
--------------------------------------------------------------------------------
/furnace/apex/docs/source/amp.rst:
--------------------------------------------------------------------------------
1 | .. role:: hidden
2 | :class: hidden-section
3 |
4 | apex.amp
5 | ===================================
6 |
7 | Amp (Automatic Mixed Precision) is a tool designed for ease of use and maximum safety in FP16 training. All potentially unsafe ops are performed in FP32 under the hood, while safe ops are performed using faster, Tensor Core-friendly FP16 math. Amp also automatically implements dynamic loss scaling.
8 |
9 | The intention of Amp is to be the "on-ramp" to easy FP16 training: achieve all the numerical stability of full FP32 training, with most of the performance benefits of full FP16 training.
10 |
11 | Currently, complete API documentation resides on the Github page: https://github.com/NVIDIA/apex/tree/master/apex/amp.
12 |
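13 | A minimal sketch of the handle-based usage documented in that README (``amp.init()``
14 | returns a handle whose ``scale_loss`` context manager wraps the backward pass; see the
15 | link above for the exact options)::
16 |
17 |     import torch
18 |     from apex import amp
19 |
20 |     amp_handle = amp.init()
21 |
22 |     model = torch.nn.Linear(4, 2).cuda()
23 |     optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
24 |     loss_fn = torch.nn.MSELoss()
25 |
26 |     x = torch.randn(8, 4, device='cuda')
27 |     y = torch.randn(8, 2, device='cuda')
28 |
29 |     optimizer.zero_grad()
30 |     loss = loss_fn(model(x), y)
31 |     with amp_handle.scale_loss(loss, optimizer) as scaled_loss:
32 |         scaled_loss.backward()
33 |     optimizer.step()
34 |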
--------------------------------------------------------------------------------
/furnace/apex/docs/source/fp16_utils.rst:
--------------------------------------------------------------------------------
1 | .. role:: hidden
2 | :class: hidden-section
3 |
4 | apex.fp16_utils
5 | ===================================
6 |
7 | This submodule contains utilities designed to streamline the mixed precision training recipe
8 | presented by NVIDIA `on Parallel Forall`_ and in GTC 2018 Sessions
9 | `Training Neural Networks with Mixed Precision: Theory and Practice`_ and
10 | `Training Neural Networks with Mixed Precision: Real Examples`_.
11 | For Pytorch users, Real Examples in particular is recommended.
12 |
13 | Full runnable Python scripts demonstrating ``apex.fp16_utils``
14 | can be found on the Github page:
15 |
16 | | `Simple FP16_Optimizer demos`_
17 | |
18 | | `Distributed Mixed Precision Training with imagenet`_
19 | |
20 | | `Mixed Precision Training with word_language_model`_
21 | |
22 | |
23 |
24 | .. _`on Parallel Forall`:
25 | https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/
26 | .. _`Training Neural Networks with Mixed Precision: Theory and Practice`:
27 | http://on-demand.gputechconf.com/gtc/2018/video/S8923/
28 | .. _`Training Neural Networks with Mixed Precision: Real Examples`:
29 | http://on-demand.gputechconf.com/gtc/2018/video/S81012/
30 | .. _`Simple FP16_Optimizer demos`:
31 | https://github.com/NVIDIA/apex/tree/master/examples/FP16_Optimizer_simple
32 | .. _`Distributed Mixed Precision Training with imagenet`:
33 | https://github.com/NVIDIA/apex/tree/master/examples/imagenet
34 | .. _`Mixed Precision Training with word_language_model`:
35 | https://github.com/NVIDIA/apex/tree/master/examples/word_language_model
36 |
37 | .. automodule:: apex.fp16_utils
38 | .. currentmodule:: apex.fp16_utils
39 |
40 | Automatic management of master params + loss scaling
41 | ----------------------------------------------------
42 |
43 | .. autoclass:: FP16_Optimizer
44 | :members:
45 |
46 | .. autoclass:: LossScaler
47 | :members:
48 |
49 | .. autoclass:: DynamicLossScaler
50 | :members:
51 |
52 | Manual master parameter management
53 | ----------------------------------
54 |
55 | .. autofunction:: prep_param_lists
56 |
57 | .. autofunction:: master_params_to_model_params
58 |
59 | .. autofunction:: model_grads_to_master_grads
60 |
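61 | A minimal sketch of the manual recipe these functions support (``model``, ``loss_scale``,
62 | and ``compute_loss()`` are placeholders for your own fp16 model, scale factor, and loss
63 | computation; the ``(model_params, master_params)`` argument order is assumed)::
64 |
65 |     import torch
66 |     from apex.fp16_utils import (prep_param_lists,
67 |                                  model_grads_to_master_grads,
68 |                                  master_params_to_model_params)
69 |
70 |     model_params, master_params = prep_param_lists(model)
71 |     optimizer = torch.optim.SGD(master_params, lr=1e-3)
72 |
73 |     model.zero_grad()
74 |     loss = compute_loss()
75 |     (loss * loss_scale).backward()                        # scaled backward on the fp16 model
76 |     model_grads_to_master_grads(model_params, master_params)
77 |     for param in master_params:
78 |         if param.grad is not None:
79 |             param.grad.data.mul_(1. / loss_scale)         # unscale the fp32 master grads
80 |     optimizer.step()
81 |     master_params_to_model_params(model_params, master_params)
82 |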
--------------------------------------------------------------------------------
/furnace/apex/docs/source/index.rst:
--------------------------------------------------------------------------------
1 | .. PyTorch documentation master file, created by
2 | sphinx-quickstart on Fri Dec 23 13:31:47 2016.
3 | You can adapt this file completely to your liking, but it should at least
4 | contain the root `toctree` directive.
5 |
6 | :github_url: https://github.com/nvidia/apex
7 |
8 | Apex (A PyTorch Extension)
9 | ===================================
10 |
11 | This site contains the API documentation for Apex (https://github.com/nvidia/apex),
12 | a Pytorch extension with NVIDIA-maintained utilities to streamline mixed precision and distributed training. Some of the code here will be included in upstream Pytorch eventually. The intention of Apex is to make up-to-date utilities available to users as quickly as possible.
13 |
14 | Installation requires CUDA 9 or later, PyTorch 0.4 or later, and Python 3. Install by running
15 |
16 | ::
17 |
18 | git clone https://www.github.com/nvidia/apex
19 | cd apex
20 | python setup.py install [--cuda_ext] [--cpp_ext]
21 |
22 |
23 | .. toctree::
24 | :maxdepth: 1
25 | :caption: AMP: Automatic Mixed Precision
26 |
27 | amp
28 |
29 | .. toctree::
30 | :maxdepth: 1
31 | :caption: FP16/Mixed Precision Utilities
32 |
33 | fp16_utils
34 |
35 | .. toctree::
36 | :maxdepth: 1
37 | :caption: Distributed Training
38 |
39 | parallel
40 |
41 | .. toctree::
42 | :maxdepth: 1
43 | :caption: Fused Optimizers
44 |
45 | optimizers
46 |
47 | .. toctree::
48 | :maxdepth: 1
49 | :caption: Fused Layer Norm
50 |
51 | layernorm
52 |
53 | .. reparameterization
54 | .. RNN
55 |
56 | Indices and tables
57 | ==================
58 |
59 | * :ref:`genindex`
60 | * :ref:`modindex`
61 |
--------------------------------------------------------------------------------
/furnace/apex/docs/source/layernorm.rst:
--------------------------------------------------------------------------------
1 | .. role:: hidden
2 | :class: hidden-section
3 |
4 | apex.normalization.fused_layer_norm
5 | ===================================
6 |
7 | .. automodule:: apex.normalization
8 | .. currentmodule:: apex.normalization
9 |
10 | .. FusedAdam
11 | ----------
12 |
13 | .. autoclass:: FusedLayerNorm
14 | :members:
15 |
--------------------------------------------------------------------------------
/furnace/apex/docs/source/optimizers.rst:
--------------------------------------------------------------------------------
1 | .. role:: hidden
2 | :class: hidden-section
3 |
4 | apex.optimizers
5 | ===================================
6 |
7 | .. automodule:: apex.optimizers
8 | .. currentmodule:: apex.optimizers
9 |
10 | .. FusedAdam
11 | ----------
12 |
13 | .. autoclass:: FusedAdam
14 | :members:
15 |
--------------------------------------------------------------------------------
/furnace/apex/docs/source/parallel.rst:
--------------------------------------------------------------------------------
1 | .. role:: hidden
2 | :class: hidden-section
3 |
4 | apex.parallel
5 | ===================================
6 |
7 | .. automodule:: apex.parallel
8 | .. currentmodule:: apex.parallel
9 |
10 | .. DistributedDataParallel
11 | ----------
12 |
13 | .. autoclass:: DistributedDataParallel
14 | :members:
15 |
16 | .. autoclass:: Reducer
17 | :members:
18 |
19 | .. autoclass:: SyncBatchNorm
20 | :members:
21 |
22 | Utility functions
23 | ----------------------------------
24 |
25 | .. autofunction:: convert_syncbn_model
26 |
--------------------------------------------------------------------------------
/furnace/apex/examples/FP16_Optimizer_simple/README.md:
--------------------------------------------------------------------------------
1 | # Simple examples of FP16_Optimizer functionality
2 |
3 | To use `FP16_Optimizer` on a half-precision model, or a model with a mixture of
4 | half and float parameters, only two lines of your training script need to change:
5 | 1. Construct an `FP16_Optimizer` instance from an existing optimizer.
6 | 2. Replace `loss.backward()` with `optimizer.backward(loss)`.
7 |
8 | #### [Full API Documentation](https://nvidia.github.io/apex/fp16_utils.html#automatic-management-of-master-params-loss-scaling)
9 |
10 | See "Other Options" at the bottom of this page for some cases that require special treatment.
11 |
12 | #### Minimal Working Sample
13 | `minimal.py` shows the basic usage of `FP16_Optimizer` with either static or dynamic loss scaling. Test via `python minimal.py`.
14 |
15 | #### Closures
16 | `FP16_Optimizer` supports closures with the same control flow as ordinary Pytorch optimizers.
17 | `closure.py` shows an example. Test via `python closure.py`.
18 |
19 | See [the API documentation](https://nvidia.github.io/apex/fp16_utils.html#apex.fp16_utils.FP16_Optimizer.step) for more details.
20 |
21 | #### Serialization/Deserialization
22 | `FP16_Optimizer` supports saving and loading with the same control flow as ordinary Pytorch optimizers.
23 | `save_load.py` shows an example. Test via `python save_load.py`.
24 |
25 | See [the API documentation](https://nvidia.github.io/apex/fp16_utils.html#apex.fp16_utils.FP16_Optimizer.load_state_dict) for more details.
26 |
27 | #### Distributed
28 | **distributed_apex** shows an example using `FP16_Optimizer` with Apex DistributedDataParallel.
29 | The usage of `FP16_Optimizer` with distributed does not need to change from ordinary single-process
30 | usage. Test via
31 | ```bash
32 | cd distributed_apex
33 | bash run.sh
34 | ```
35 |
36 | **distributed_pytorch** shows an example using `FP16_Optimizer` with Pytorch DistributedDataParallel.
37 | Again, the usage of `FP16_Optimizer` with distributed does not need to change from ordinary
38 | single-process usage. Test via
39 | ```bash
40 | cd distributed_pytorch
41 | bash run.sh
42 | ```
43 |
44 | #### Other Options
45 |
46 | Gradient clipping requires that calls to `torch.nn.utils.clip_grad_norm`
47 | be replaced with [fp16_optimizer_instance.clip_master_grads()](https://nvidia.github.io/apex/fp16_utils.html#apex.fp16_utils.FP16_Optimizer.clip_master_grads). The [word_language_model example](https://github.com/NVIDIA/apex/blob/master/examples/word_language_model/main_fp16_optimizer.py) uses this feature.
48 |
49 | Multiple losses will work if you simply replace
50 | ```python
51 | loss1.backward()
52 | loss2.backward()
53 | ```
54 | with
55 | ```python
56 | optimizer.backward(loss1)
57 | optimizer.backward(loss2)
58 | ```
59 | but `FP16_Optimizer` can be told to handle this more efficiently using the
60 | [update_master_grads()](https://nvidia.github.io/apex/fp16_utils.html#apex.fp16_utils.FP16_Optimizer.update_master_grads) option.
61 |
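62 | For example, a minimal sketch of the multiple-loss case (assuming `loss1` and `loss2`
63 | come from your own forward passes; see the `backward` and `update_master_grads` entries
64 | in the API documentation linked above for the exact signatures):
65 | ```python
66 | optimizer.zero_grad()
67 | optimizer.backward(loss1, update_master_grads=False)
68 | optimizer.backward(loss2, update_master_grads=False)
69 | optimizer.update_master_grads()  # copy/scale the fp16 grads into the fp32 master grads once
70 | optimizer.step()
71 | ```
72 |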
--------------------------------------------------------------------------------
/furnace/apex/examples/FP16_Optimizer_simple/closure.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from apex.fp16_utils import FP16_Optimizer
3 |
4 | torch.backends.cudnn.benchmark = True
5 |
6 | N, D_in, D_out = 64, 1024, 16
7 |
8 | x = torch.randn(N, D_in, device='cuda', dtype=torch.half)
9 | y = torch.randn(N, D_out, device='cuda', dtype=torch.half)
10 |
11 | model = torch.nn.Linear(D_in, D_out).cuda().half()
12 |
13 | optimizer = torch.optim.LBFGS(model.parameters())
14 | ### Construct FP16_Optimizer
15 | optimizer = FP16_Optimizer(optimizer, static_loss_scale=128.0)
16 | ###
17 |
18 | loss_fn = torch.nn.MSELoss()
19 |
20 | for t in range(5):
21 | def closure():
22 | optimizer.zero_grad()
23 | y_pred = model(x)
24 | loss = loss_fn(y_pred.float(), y.float())
25 | ### Change loss.backward() within the closure to: ###
26 | optimizer.backward(loss)
27 | ###
28 | return loss
29 | loss = optimizer.step(closure)
30 |
31 | print("final loss = ", loss)
32 |
--------------------------------------------------------------------------------
/furnace/apex/examples/FP16_Optimizer_simple/distributed_apex/README.md:
--------------------------------------------------------------------------------
1 | **distributed_data_parallel.py** and **run.sh** show an example using `FP16_Optimizer` with
2 | `apex.parallel.DistributedDataParallel` and the Pytorch multiprocess launcher script,
3 | [torch.distributed.launch](https://pytorch.org/docs/master/distributed.html#launch-utility).
4 | The usage of `FP16_Optimizer` with distributed does not need to change from ordinary
5 | single-process usage. Test via
6 | ```bash
7 | bash run.sh
8 | ```
9 |
--------------------------------------------------------------------------------
/furnace/apex/examples/FP16_Optimizer_simple/distributed_apex/distributed_data_parallel.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import argparse
3 | from apex.parallel import DistributedDataParallel as DDP
4 | from apex.fp16_utils import FP16_Optimizer
5 |
6 | parser = argparse.ArgumentParser()
7 | parser.add_argument("--local_rank", default=0, type=int)
8 | args = parser.parse_args()
9 |
10 | torch.cuda.set_device(args.local_rank)
11 | torch.distributed.init_process_group(backend='nccl',
12 | init_method='env://')
13 |
14 | torch.backends.cudnn.benchmark = True
15 |
16 | N, D_in, D_out = 64, 1024, 16
17 |
18 | x = torch.randn(N, D_in, device='cuda', dtype=torch.half)
19 | y = torch.randn(N, D_out, device='cuda', dtype=torch.half)
20 |
21 | model = torch.nn.Linear(D_in, D_out).cuda().half()
22 | model = DDP(model)
23 |
24 | optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
25 | ### Construct FP16_Optimizer ###
26 | optimizer = FP16_Optimizer(optimizer)
27 | ###
28 |
29 | loss_fn = torch.nn.MSELoss()
30 |
31 | for t in range(500):
32 | optimizer.zero_grad()
33 | y_pred = model(x)
34 | loss = loss_fn(y_pred.float(), y.float())
35 | ### Change loss.backward() to: ###
36 | optimizer.backward(loss)
37 | ###
38 | optimizer.step()
39 |
40 | print("final loss = ", loss)
41 |
42 |
--------------------------------------------------------------------------------
/furnace/apex/examples/FP16_Optimizer_simple/distributed_apex/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | python -m torch.distributed.launch --nproc_per_node=2 distributed_data_parallel.py
3 |
--------------------------------------------------------------------------------
/furnace/apex/examples/FP16_Optimizer_simple/distributed_apex_legacy_launcher/README.md:
--------------------------------------------------------------------------------
1 | **distributed_data_parallel.py** and **run.sh** show an example using `FP16_Optimizer` with
2 | `apex.parallel.DistributedDataParallel` in conjunction with the legacy Apex
3 | launcher script, `apex.parallel.multiproc`. See
4 | [FP16_Optimizer_simple/distributed_apex](https://github.com/NVIDIA/apex/tree/torch_launcher/examples/FP16_Optimizer_simple/distributed_apex) for a more up-to-date example that uses the Pytorch launcher
5 | script, `torch.distributed.launch`.
6 | The usage of `FP16_Optimizer` with distributed does not need to change from ordinary
7 | single-process usage. Test via
8 | ```bash
9 | bash run.sh
10 | ```
11 |
--------------------------------------------------------------------------------
/furnace/apex/examples/FP16_Optimizer_simple/distributed_apex_legacy_launcher/distributed_data_parallel.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import argparse
3 | from apex.parallel import DistributedDataParallel as DDP
4 | from apex.fp16_utils import FP16_Optimizer
5 |
6 | parser = argparse.ArgumentParser()
7 | parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str,
8 | help='url used to set up distributed training')
9 | parser.add_argument('--world-size', default=2, type=int,
10 | help='Number of distributed processes.')
11 | parser.add_argument("--rank", type=int,
12 | help='Rank of this process')
13 |
14 | args = parser.parse_args()
15 |
16 | torch.cuda.set_device(args.rank)
17 | torch.distributed.init_process_group(backend='nccl',
18 | init_method=args.dist_url,
19 | world_size=args.world_size,
20 | rank=args.rank)
21 |
22 | torch.backends.cudnn.benchmark = True
23 |
24 | N, D_in, D_out = 64, 1024, 16
25 |
26 | x = torch.randn(N, D_in, device='cuda', dtype=torch.half)
27 | y = torch.randn(N, D_out, device='cuda', dtype=torch.half)
28 |
29 | model = torch.nn.Linear(D_in, D_out).cuda().half()
30 | model = DDP(model)
31 |
32 | optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
33 | ### Construct FP16_Optimizer ###
34 | optimizer = FP16_Optimizer(optimizer)
35 | ###
36 |
37 | loss_fn = torch.nn.MSELoss()
38 |
39 | for t in range(500):
40 | optimizer.zero_grad()
41 | y_pred = model(x)
42 | loss = loss_fn(y_pred.float(), y.float())
43 | ### Change loss.backward() to: ###
44 | optimizer.backward(loss)
45 | ###
46 | optimizer.step()
47 |
48 | print("final loss = ", loss)
49 |
50 |
--------------------------------------------------------------------------------
/furnace/apex/examples/FP16_Optimizer_simple/distributed_apex_legacy_launcher/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # By default, apex.parallel.multiproc will attempt to use all available GPUs on the system.
3 | # The number of GPUs to use can be limited by setting CUDA_VISIBLE_DEVICES:
4 | export CUDA_VISIBLE_DEVICES=0,1
5 | python -m apex.parallel.multiproc distributed_data_parallel.py
6 |
--------------------------------------------------------------------------------
/furnace/apex/examples/FP16_Optimizer_simple/distributed_pytorch/README.md:
--------------------------------------------------------------------------------
1 | **distributed_data_parallel.py** and **run.sh** show an example using `FP16_Optimizer` with
2 | `torch.nn.parallel.DistributedDataParallel` and the Pytorch multiprocess launcher script,
3 | [torch.distributed.launch](https://pytorch.org/docs/master/distributed.html#launch-utility).
4 | The usage of `FP16_Optimizer` with distributed does not need to change from ordinary
5 | single-process usage. Test via
6 | ```bash
7 | bash run.sh
8 | ```
9 |
--------------------------------------------------------------------------------
/furnace/apex/examples/FP16_Optimizer_simple/distributed_pytorch/distributed_data_parallel.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import argparse
3 | from apex.fp16_utils import FP16_Optimizer
4 |
5 | parser = argparse.ArgumentParser()
6 | parser.add_argument("--local_rank", default=0, type=int)
7 | args = parser.parse_args()
8 |
9 | torch.cuda.set_device(args.local_rank)
10 | torch.distributed.init_process_group(backend='nccl',
11 | init_method='env://')
12 |
13 | torch.backends.cudnn.benchmark = True
14 |
15 | N, D_in, D_out = 64, 1024, 16
16 |
17 | x = torch.randn(N, D_in, device='cuda', dtype=torch.half)
18 | y = torch.randn(N, D_out, device='cuda', dtype=torch.half)
19 |
20 | model = torch.nn.Linear(D_in, D_out).cuda().half()
21 | model = torch.nn.parallel.DistributedDataParallel(model,
22 | device_ids=[args.local_rank],
23 | output_device=args.local_rank)
24 |
25 | optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
26 | ### Construct FP16_Optimizer ###
27 | optimizer = FP16_Optimizer(optimizer)
28 | ###
29 |
30 | loss_fn = torch.nn.MSELoss()
31 |
32 | for t in range(500):
33 | optimizer.zero_grad()
34 | y_pred = model(x)
35 | loss = loss_fn(y_pred.float(), y.float())
36 | ### Change loss.backward() to: ###
37 | optimizer.backward(loss)
38 | ###
39 | optimizer.step()
40 |
41 | print("final loss = ", loss)
42 |
43 |
--------------------------------------------------------------------------------
/furnace/apex/examples/FP16_Optimizer_simple/distributed_pytorch/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | python -m torch.distributed.launch --nproc_per_node=2 distributed_data_parallel.py
3 |
--------------------------------------------------------------------------------
/furnace/apex/examples/FP16_Optimizer_simple/minimal.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from apex.fp16_utils import FP16_Optimizer
3 |
4 | torch.backends.cudnn.benchmark = True
5 |
6 | N, D_in, D_out = 64, 1024, 16
7 |
8 | x = torch.randn(N, D_in, device='cuda', dtype=torch.half)
9 | y = torch.randn(N, D_out, device='cuda', dtype=torch.half)
10 |
11 | model = torch.nn.Linear(D_in, D_out).cuda().half()
12 |
13 | optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, momentum=0.9)
14 |
15 | ### Construct FP16_Optimizer
16 | ### FP16_Optimizer will ingest and remember the original optimizer's param_groups.
17 | ###
18 | ### Construct with static loss scaling...
19 | optimizer = FP16_Optimizer(optimizer, static_loss_scale=128.0)
20 | ### ...or dynamic loss scaling
21 | # optimizer = FP16_Optimizer(optimizer,
22 | # dynamic_loss_scale=True,
23 | # dynamic_loss_args={'scale_factor' : 2})
24 | ### dynamic_loss_args is optional, for "power users," and unnecessary in most cases.
25 |
26 | loss_fn = torch.nn.MSELoss()
27 |
28 | for t in range(200):
29 | optimizer.zero_grad()
30 | y_pred = model(x)
31 | loss = loss_fn(y_pred.float(), y.float())
32 | ### Change loss.backward() to:
33 | optimizer.backward(loss)
34 | ###
35 | optimizer.step()
36 |
37 | print("final loss = ", loss)
38 |
--------------------------------------------------------------------------------
/furnace/apex/examples/FP16_Optimizer_simple/save_load.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from apex.fp16_utils import FP16_Optimizer
3 |
4 | torch.backends.cudnn.benchmark = True
5 |
6 | N, D_in, D_out = 64, 1024, 16
7 |
8 | x = torch.randn(N, D_in, device='cuda', dtype=torch.half)
9 | y = torch.randn(N, D_out, device='cuda', dtype=torch.half)
10 |
11 | model = torch.nn.Linear(D_in, D_out).cuda().half()
12 |
13 | optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, momentum=0.9)
14 | ### Construct FP16_Optimizer with static loss scaling...
15 | optimizer = FP16_Optimizer(optimizer, static_loss_scale=128.0)
16 | ### ...or dynamic loss scaling
17 | # optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
18 |
19 | loss_fn = torch.nn.MSELoss()
20 |
21 | # The checkpointing shown here is identical to what you'd use without FP16_Optimizer.
22 | #
23 | # We save/load checkpoints within local scopes, so the "checkpoint" object
24 | # does not persist. This helps avoid dangling references to intermediate deserialized data,
25 | # and is good practice for Pytorch in general, not just with FP16_Optimizer.
26 | def save_checkpoint():
27 | checkpoint = {}
28 | checkpoint['model'] = model.state_dict()
29 | checkpoint['optimizer'] = optimizer.state_dict()
30 | torch.save(checkpoint, 'saved.pth')
31 |
32 | def load_checkpoint():
33 | checkpoint = torch.load('saved.pth',
34 | map_location = lambda storage, loc: storage.cuda(torch.cuda.current_device()))
35 | model.load_state_dict(checkpoint['model'])
36 | optimizer.load_state_dict(checkpoint['optimizer'])
37 |
38 | for t in range(100):
39 | optimizer.zero_grad()
40 | y_pred = model(x)
41 | loss = loss_fn(y_pred.float(), y.float())
42 | optimizer.backward(loss) ### formerly loss.backward()
43 | optimizer.step()
44 |
45 | save_checkpoint()
46 |
47 | load_checkpoint()
48 |
49 | for t in range(100):
50 | optimizer.zero_grad()
51 | y_pred = model(x)
52 | loss = loss_fn(y_pred.float(), y.float())
53 | optimizer.backward(loss) ### formerly loss.backward()
54 | optimizer.step()
55 |
56 | print("final loss = ", loss)
57 |
--------------------------------------------------------------------------------
/furnace/apex/examples/README.md:
--------------------------------------------------------------------------------
1 | ## Contents:
2 |
3 | **distributed**: Walkthrough of apex distributed data parallel utilities.
4 |
5 | **FP16_Optimizer_simple**: Simple examples demonstrating various use cases of `FP16_Optimizer` to automatically manage master parameters and static or dynamic loss scaling.
6 |
7 | **imagenet**: Example based on [https://github.com/pytorch/examples/tree/master/imagenet](https://github.com/pytorch/examples/tree/master/imagenet) showing the use of `FP16_Optimizer`, as well as manual management of master parameters and loss scaling for illustration/comparison.
8 |
9 | **word_language_model**: Example based on [https://github.com/pytorch/examples/tree/master/word_language_model](https://github.com/pytorch/examples/tree/master/word_language_model) showing the use of `FP16_Optimizer`, as well as manual management of master parameters and loss scaling for illustration/comparison.
10 |
11 | **docker**: Example of a minimal Dockerfile that installs Apex on top of an existing container.
12 |
--------------------------------------------------------------------------------
/furnace/apex/examples/distributed/README.md:
--------------------------------------------------------------------------------
1 | # Multiprocess Example based on pytorch/examples/mnist
2 |
3 | main.py demonstrates how to modify a simple model to enable multiprocess distributed data parallel
4 | training using the module wrapper `apex.parallel.DistributedDataParallel`
5 | (similar to `torch.nn.parallel.DistributedDataParallel`).
6 |
7 | Multiprocess distributed data parallel training frequently outperforms single-process
8 | data parallel training (such as that offered by `torch.nn.DataParallel`) because each process has its
9 | own python interpreter. Therefore, driving multiple GPUs with multiple processes reduces
10 | global interpreter lock contention versus having a single process (with a single GIL) drive all GPUs.
11 |
12 | `apex.parallel.DistributedDataParallel` is optimized for use with NCCL. It achieves high performance by
13 | overlapping communication with computation during ``backward()`` and bucketing smaller gradient
14 | transfers to reduce the total number of transfers required.
15 |
16 | #### [API Documentation](https://nvidia.github.io/apex/parallel.html)
17 |
18 | #### [Source Code](https://github.com/NVIDIA/apex/tree/master/apex/parallel)
19 |
20 | #### [Another example: Imagenet with mixed precision](https://github.com/NVIDIA/apex/tree/master/examples/imagenet)
21 |
22 | #### [Simple example with FP16_Optimizer](https://github.com/NVIDIA/apex/tree/master/examples/FP16_Optimizer_simple/distributed_apex)
23 |
24 | ## Getting started
25 | Prior to running please run
26 | ```pip install -r requirements.txt```
27 |
28 | To download the dataset, run
29 | ```python main.py```
30 | without any arguments. Once you have downloaded the dataset, you should not need to do this again.
31 |
32 | `main.py` runs multiprocess distributed data parallel jobs using the Pytorch launcher script
33 | [torch.distributed.launch](https://pytorch.org/docs/master/distributed.html#launch-utility).
34 | Jobs are launched via
35 | ```bash
36 | python -m torch.distributed.launch --nproc_per_node=N main.py args...
37 | ```
38 | `torch.distributed.launch` spawns `N` processes, each of which runs as
39 | `python main.py args... --local_rank <rank>`.
40 | The `local_rank` argument for each process is determined and appended by `torch.distributed.launch`,
41 | and varies between 0 and `N-1`. `torch.distributed.launch` also provides environment variables
42 | for each process.
43 | Internally, each process calls `set_device` according to its local
44 | rank and `init_process_group` with `init_method='env://'` to ingest the provided environment
45 | variables.
46 | For best performance, set `N` equal to the number of visible CUDA devices on the node.
47 |
48 | ## Converting your own model
49 |
50 | To understand how to convert your own model, please see all sections of main.py within ```#=====START: ADDED FOR DISTRIBUTED======``` and ```#=====END: ADDED FOR DISTRIBUTED======``` flags.
51 |
52 | ## Requirements
53 | Pytorch with NCCL available as a distributed backend. Pytorch 0.4+, installed as a pip or conda package, should have this by default. Otherwise, you can build Pytorch from source, in an environment where NCCL is installed and visible.
54 |
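55 | For reference, the core of the per-process setup described above (and marked between the
56 | `#=====START/END: ADDED FOR DISTRIBUTED======` flags in main.py) looks like the following
57 | minimal sketch; `build_model()` is only a placeholder for your own model construction:
58 | ```python
59 | import argparse
60 | import torch
61 | from apex.parallel import DistributedDataParallel as DDP
62 |
63 | parser = argparse.ArgumentParser()
64 | parser.add_argument("--local_rank", default=0, type=int)  # appended by torch.distributed.launch
65 | args = parser.parse_args()
66 |
67 | torch.cuda.set_device(args.local_rank)
68 | torch.distributed.init_process_group(backend='nccl', init_method='env://')
69 |
70 | model = build_model().cuda()  # build_model() is a placeholder for your own model
71 | model = DDP(model)
72 | ```
73 |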
--------------------------------------------------------------------------------
/furnace/apex/examples/docker/Dockerfile:
--------------------------------------------------------------------------------
1 | # Base image must at least have pytorch and CUDA installed.
2 | ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:18.12-py3
3 | FROM $BASE_IMAGE
4 | ARG BASE_IMAGE
5 | RUN echo "Installing Apex on top of ${BASE_IMAGE}"
6 | WORKDIR /workspace
7 | # uninstall Apex if present
8 | RUN pip uninstall -y apex || :
9 | # SHA is something the user can touch to force recreation of this Docker layer,
10 | # and therefore force cloning of the latest version of Apex
11 | RUN SHA=ToUcHMe git clone https://github.com/NVIDIA/apex.git
12 | WORKDIR /workspace/apex
13 | RUN python setup.py install
14 | WORKDIR /workspace
15 |
--------------------------------------------------------------------------------
/furnace/apex/examples/docker/README.md:
--------------------------------------------------------------------------------
1 | ## Option 1: Create a new container with Apex
2 |
3 | **Dockerfile** installs the latest Apex on top of an existing image. Run
4 | ```
5 | docker build -t image_with_apex .
6 | ```
7 | By default, **Dockerfile** uses NVIDIA's Pytorch container as the base image,
8 | which requires an NVIDIA GPU Cloud (NGC) account. If you don't have an NGC account, you can sign up for free by following the instructions [here](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html#generating-api-key).
9 |
10 | Alternatively, you can supply your own base image via the `BASE_IMAGE` build-arg.
11 | Any `BASE_IMAGE` you supply must have Pytorch and Cuda installed, for example:
12 | ```
13 | docker build --build-arg BASE_IMAGE=pytorch/pytorch:0.4-cuda9-cudnn7-devel -t image_with_apex .
14 | ```
15 |
16 | If you want to rebuild your image, and force the latest Apex to be cloned and installed, make any small change to the `SHA` variable in **Dockerfile**.
17 |
18 | **Warning:**
19 | Currently, Pytorch's default non-devel image on Dockerhub
20 | [pytorch/pytorch:0.4_cuda9_cudnn7](https://hub.docker.com/r/pytorch/pytorch/tags/) contains Pytorch installed with prebuilt binaries. It does not contain NVCC, which means it is not an eligible candidate for `BASE_IMAGE`.
21 |
22 | ## Option 2: Install Apex in a running container
23 |
24 | Instead of building a new container, it is also a viable option to `git clone https://github.com/NVIDIA/apex.git` on bare metal, then mount the Apex repo into your container at launch by running, for example,
25 | ```
26 | docker run --runtime=nvidia -it --rm --ipc=host -v /bare/metal/apex:/apex/in/container
27 | ```
28 | then go to `/apex/in/container` within the running container and run `python setup.py install [--cuda_ext] [--cpp_ext]`.
29 |
--------------------------------------------------------------------------------
/furnace/apex/examples/imagenet/README.md:
--------------------------------------------------------------------------------
1 | # ImageNet training in PyTorch
2 |
3 | This example is based on [https://github.com/pytorch/examples/tree/master/imagenet](https://github.com/pytorch/examples/tree/master/imagenet).
4 | It implements training of popular model architectures, such as ResNet, AlexNet, and VGG on the ImageNet dataset.
5 |
6 | `main.py` with the `--fp16` argument demonstrates mixed precision training with manual management of master parameters and loss scaling.
7 |
8 | `main_fp16_optimizer.py` with `--fp16` demonstrates use of `apex.fp16_utils.FP16_Optimizer` to automatically manage master parameters and loss scaling.
9 |
10 | `main_amp.py` with `--fp16` demonstrates use of Amp to automatically perform all FP16-friendly operations in half precision under the hood. Notice that with Amp:
11 | - you don't need to explicitly convert your model, or the input data, to `half()`. Conversions will occur on-the-fly internally within the Amp-patched torch functions.
12 | - dynamic loss scaling is always used under the hood.
13 |
14 | `main_reducer.py` is identical to `main.py`, except that it shows the use of [apex.parallel.Reduce](https://nvidia.github.io/apex/parallel.html#apex.parallel.Reducer) instead of `DistributedDataParallel`.
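
For orientation, here is a hedged, minimal sketch of the two mixed-precision styles referred to above, using a toy model rather than the actual training scripts (values are illustrative):

```python
import torch
from apex import amp
from apex.fp16_utils import FP16_Optimizer

# Style 1: FP16_Optimizer keeps FP32 master weights and applies loss scaling.
model_a = torch.nn.Linear(1024, 1024).cuda().half()
opt_a = FP16_Optimizer(torch.optim.SGD(model_a.parameters(), lr=0.1),
                       static_loss_scale=128.0)
loss_a = model_a(torch.randn(8, 1024, device='cuda', dtype=torch.half)).sum()
opt_a.backward(loss_a)   # replaces loss_a.backward()
opt_a.step()

# Style 2: Amp patches torch functions on the fly; the model stays FP32 and
# casts happen inside the patched ops, with dynamic loss scaling under the hood.
amp_handle = amp.init(enabled=True)
model_b = torch.nn.Linear(1024, 1024).cuda()
opt_b = torch.optim.SGD(model_b.parameters(), lr=0.1)
loss_b = model_b(torch.randn(8, 1024, device='cuda')).sum()
with amp_handle.scale_loss(loss_b, opt_b) as scaled_loss:
    scaled_loss.backward()
opt_b.step()
```
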
15 |
16 | ## Requirements
17 |
18 | - `pip install -r requirements.txt`
19 | - Download the ImageNet dataset and move validation images to labeled subfolders
20 | - To do this, you can use the following script: https://raw.githubusercontent.com/soumith/imagenetloader.torch/master/valprep.sh
21 |
22 | ## Training
23 |
24 | To train a model, run `main.py` with the desired model architecture and the path to the ImageNet dataset.
25 |
26 | The default learning rate schedule starts at 0.1 and decays by a factor of 10 every 30 epochs. This is appropriate for ResNet and models with batch normalization, but too high for AlexNet and VGG. Use 0.01 as the initial learning rate for AlexNet or VGG:
27 |
28 | ```bash
29 | python main.py -a alexnet --lr 0.01 /path/to/imagenet/folder
30 | ```
31 |
32 | The directory at `/path/to/imagenet/folder` should contain two subdirectories called "train"
33 | and "val" that contain the training and validation data respectively.
34 |
35 | ## Distributed training
36 |
37 | `main.py` and `main_fp16_optimizer.py` have been modified to use the `DistributedDataParallel` module in Apex instead of the one in upstream PyTorch. `apex.parallel.DistributedDataParallel`
38 | is a drop-in replacement for `torch.nn.parallel.DistributedDataParallel` (see our [distributed example](https://github.com/NVIDIA/apex/tree/master/examples/distributed)).
39 | The scripts can interact with
40 | [torch.distributed.launch](https://pytorch.org/docs/master/distributed.html#launch-utility)
41 | to spawn multiprocess jobs using the following syntax:
42 | ```
43 | python -m torch.distributed.launch --nproc_per_node=NUM_GPUS main.py args...
44 | ```
45 | `NUM_GPUS` should be less than or equal to the number of visible GPU devices on the node.
46 |
47 | Optionally, you can run ImageNet training with synchronized batch normalization by adding
48 | `--sync_bn` to `args...`
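
Under the hood this typically converts the model's BatchNorm layers to apex's synchronized implementation before the model is wrapped for distributed training. A hedged sketch, assuming `apex.parallel.convert_syncbn_model` is available in your apex build (see `main.py` for the actual flag handling):

```python
import torch
import apex

# Toy model for illustration only.
model = torch.nn.Sequential(torch.nn.Conv2d(3, 8, 3), torch.nn.BatchNorm2d(8)).cuda()

# With --sync_bn: replace every torch.nn.BatchNorm*d with a synchronized version,
# then wrap for distributed training as usual.
model = apex.parallel.convert_syncbn_model(model)
model = apex.parallel.DistributedDataParallel(model)
```
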
49 |
50 | ## Example commands
51 |
52 | (note: batch size `--b 224` assumes your GPUs have >=16GB of onboard memory)
53 |
54 | ```bash
55 | ### Softlink training dataset into current directory
56 | $ ln -sf /data/imagenet/train-jpeg/ train
57 | ### Softlink validation dataset into current directory
58 | $ ln -sf /data/imagenet/val-jpeg/ val
59 | ### Single-process training
60 | $ python main.py -a resnet50 --fp16 --b 224 --workers 4 --static-loss-scale 128.0 ./
61 | ### Single-process training with Amp. Amp's casting causes it to use a bit more memory,
62 | ### hence the batch size 128.
63 | $ python main_amp.py -a resnet50 --fp16 --b 128 --workers 4 ./
64 | ### Multi-process training (uses all visible GPUs on the node)
65 | $ python -m torch.distributed.launch --nproc_per_node=NUM_GPUS main.py -a resnet50 --fp16 --b 224 --workers 4 --static-loss-scale 128.0 ./
66 | ### Multi-process training on GPUs 0 and 1 only
67 | $ export CUDA_VISIBLE_DEVICES=0,1
68 | $ python -m torch.distributed.launch --nproc_per_node=2 main.py -a resnet50 --fp16 --b 224 --workers 4 ./
69 | ### Multi-process training with FP16_Optimizer, static loss scale 128.0 (still uses FP32 master params)
70 | $ python -m torch.distributed.launch --nproc_per_node=NUM_GPUS main_fp16_optimizer.py -a resnet50 --fp16 --b 224 --static-loss-scale 128.0 --workers 4 ./
71 | ### Multi-process training with FP16_Optimizer, dynamic loss scaling
72 | $ python -m torch.distributed.launch --nproc_per_node=NUM_GPUS main_fp16_optimizer.py -a resnet50 --fp16 --b 224 --dynamic-loss-scale --workers 4 ./
73 | ```
74 |
75 | ## Usage for `main.py` and `main_fp16_optimizer.py`
76 |
77 | `main_fp16_optimizer.py` also accepts the optional flag
78 | ```bash
79 | --dynamic-loss-scale Use dynamic loss scaling. If supplied, this argument
80 | supersedes --static-loss-scale.
81 | ```
82 |
83 |
--------------------------------------------------------------------------------
/furnace/apex/examples/word_language_model/data.py:
--------------------------------------------------------------------------------
1 | import os
2 | import torch
3 |
4 |
5 | class Dictionary(object):
6 | def __init__(self):
7 | self.word2idx = {}
8 | self.idx2word = []
9 |
10 | def add_word(self, word):
11 | if word not in self.word2idx:
12 | self.idx2word.append(word)
13 | self.word2idx[word] = len(self.idx2word) - 1
14 | return self.word2idx[word]
15 |
16 | def __len__(self):
17 | return len(self.idx2word)
18 |
19 |
20 | class Corpus(object):
21 | def __init__(self, path, pad_to_multiple_of=1):
22 | # Synthetic elements used to pad the dictionary length.
23 | # It is assumed that these synthetic elements do not appear in the actual data files.
24 | self.synthetic = ["vvvvvvvv" + str(i) for i in range(pad_to_multiple_of-1)]
25 |
26 | self.dictionary = Dictionary()
27 | self.train = self.tokenize(os.path.join(path, 'train.txt'))
28 | self.valid = self.tokenize(os.path.join(path, 'valid.txt'))
29 | self.test = self.tokenize(os.path.join(path, 'test.txt'))
30 |
31 | # Pad dictionary size to desired multiple. For example, padding to a multiple of 8
32 | # is necessary to ensure Tensor Core usage for the decoder.
33 | pad_elem = pad_to_multiple_of - len(self.dictionary)%pad_to_multiple_of
34 | if pad_elem != pad_to_multiple_of:
35 | for i in range(pad_elem):
36 | self.dictionary.add_word(self.synthetic[i])
37 |
38 | def tokenize(self, path):
39 | """Tokenizes a text file."""
40 | assert os.path.exists(path)
41 | # Add words to the dictionary
42 | with open(path, 'r') as f:
43 | tokens = 0
44 | for line in f:
45 | words = line.split() + ['<eos>']
46 | tokens += len(words)
47 | for word in words:
48 | self.dictionary.add_word(word)
49 |
50 | # Tokenize file content
51 | with open(path, 'r') as f:
52 | ids = torch.LongTensor(tokens)
53 | token = 0
54 | for line in f:
55 | words = line.split() + ['<eos>']
56 | for word in words:
57 | ids[token] = self.dictionary.word2idx[word]
58 | token += 1
59 |
60 | return ids
61 |
--------------------------------------------------------------------------------
/furnace/apex/examples/word_language_model/data/wikitext-2/README:
--------------------------------------------------------------------------------
1 | This is raw data from the wikitext-2 dataset.
2 |
3 | See https://www.salesforce.com/products/einstein/ai-research/the-wikitext-dependency-language-modeling-dataset/
4 |
--------------------------------------------------------------------------------
/furnace/apex/examples/word_language_model/generate.py:
--------------------------------------------------------------------------------
1 | ###############################################################################
2 | # Language Modeling on Penn Tree Bank
3 | #
4 | # This file generates new sentences sampled from the language model
5 | #
6 | ###############################################################################
7 |
8 | import argparse
9 |
10 | import torch
11 |
12 | import data
13 |
14 | parser = argparse.ArgumentParser(description='PyTorch Wikitext-2 Language Model')
15 |
16 | # Model parameters.
17 | parser.add_argument('--data', type=str, default='./data/wikitext-2',
18 | help='location of the data corpus')
19 | parser.add_argument('--checkpoint', type=str, default='./model.pt',
20 | help='model checkpoint to use')
21 | parser.add_argument('--outf', type=str, default='generated.txt',
22 | help='output file for generated text')
23 | parser.add_argument('--words', type=int, default='1000',
24 | help='number of words to generate')
25 | parser.add_argument('--seed', type=int, default=1111,
26 | help='random seed')
27 | parser.add_argument('--cuda', action='store_true',
28 | help='use CUDA')
29 | parser.add_argument('--temperature', type=float, default=1.0,
30 | help='temperature - higher will increase diversity')
31 | parser.add_argument('--log-interval', type=int, default=100,
32 | help='reporting interval')
33 | args = parser.parse_args()
34 |
35 | # Set the random seed manually for reproducibility.
36 | torch.manual_seed(args.seed)
37 | if torch.cuda.is_available():
38 | if not args.cuda:
39 | print("WARNING: You have a CUDA device, so you should probably run with --cuda")
40 |
41 | if args.temperature < 1e-3:
42 | parser.error("--temperature has to be greater than or equal to 1e-3")
43 |
44 | with open(args.checkpoint, 'rb') as f:
45 | model = torch.load(f)
46 | model.eval()
47 |
48 | if args.cuda:
49 | model.cuda()
50 | else:
51 | model.cpu()
52 |
53 | corpus = data.Corpus(args.data)
54 | ntokens = len(corpus.dictionary)
55 | hidden = model.init_hidden(1)
56 | with torch.no_grad():
57 | input = torch.rand(1, 1).mul(ntokens).long()
58 | if args.cuda:
59 | input = input.cuda()
60 |
61 | with open(args.outf, 'w') as outf:
62 | for i in range(args.words):
63 | output, hidden = model(input, hidden)
64 | word_weights = output.squeeze().float().data.div(args.temperature).exp().cpu()
65 | word_idx = torch.multinomial(word_weights, 1)[0]
66 | input.data.fill_(word_idx)
67 | word = corpus.dictionary.idx2word[word_idx]
68 |
69 | outf.write(word + ('\n' if i % 20 == 19 else ' '))
70 |
71 | if i % args.log_interval == 0:
72 | print('| Generated {}/{} words'.format(i, args.words))
73 |
--------------------------------------------------------------------------------
/furnace/apex/examples/word_language_model/model.py:
--------------------------------------------------------------------------------
1 | import torch.nn as nn
2 |
3 |
4 | class RNNModel(nn.Module):
5 | """Container module with an encoder, a recurrent module, and a decoder."""
6 |
7 | def __init__(self, rnn_type, ntoken, ninp, nhid, nlayers, dropout=0.5, tie_weights=False):
8 | super(RNNModel, self).__init__()
9 | self.drop = nn.Dropout(dropout)
10 | self.encoder = nn.Embedding(ntoken, ninp)
11 | if rnn_type in ['LSTM', 'GRU']:
12 | self.rnn = getattr(nn, rnn_type)(ninp, nhid, nlayers, dropout=dropout)
13 | else:
14 | try:
15 | nonlinearity = {'RNN_TANH': 'tanh', 'RNN_RELU': 'relu'}[rnn_type]
16 | except KeyError:
17 | raise ValueError("""An invalid option for `--model` was supplied,
18 | options are ['LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU']""")
19 | self.rnn = nn.RNN(ninp, nhid, nlayers, nonlinearity=nonlinearity, dropout=dropout)
20 | self.decoder = nn.Linear(nhid, ntoken)
21 |
22 | # Optionally tie weights as in:
23 | # "Using the Output Embedding to Improve Language Models" (Press & Wolf 2016)
24 | # https://arxiv.org/abs/1608.05859
25 | # and
26 | # "Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling" (Inan et al. 2016)
27 | # https://arxiv.org/abs/1611.01462
28 | if tie_weights:
29 | if nhid != ninp:
30 | raise ValueError('When using the tied flag, nhid must be equal to emsize')
31 | self.decoder.weight = self.encoder.weight
32 |
33 | self.init_weights()
34 |
35 | self.rnn_type = rnn_type
36 | self.nhid = nhid
37 | self.nlayers = nlayers
38 |
39 | def init_weights(self):
40 | initrange = 0.1
41 | self.encoder.weight.data.uniform_(-initrange, initrange)
42 | self.decoder.bias.data.fill_(0)
43 | self.decoder.weight.data.uniform_(-initrange, initrange)
44 |
45 | def forward(self, input, hidden):
46 | emb = self.drop(self.encoder(input))
47 | output, hidden = self.rnn(emb, hidden)
48 | output = self.drop(output)
49 | decoded = self.decoder(output.view(output.size(0)*output.size(1), output.size(2)))
50 | return decoded.view(output.size(0), output.size(1), decoded.size(1)), hidden
51 |
52 | def init_hidden(self, bsz):
53 | weight = next(self.parameters()).data
54 | if self.rnn_type == 'LSTM':
55 | return (weight.new(self.nlayers, bsz, self.nhid).zero_(),
56 | weight.new(self.nlayers, bsz, self.nhid).zero_())
57 | else:
58 | return weight.new(self.nlayers, bsz, self.nhid).zero_()
59 |
--------------------------------------------------------------------------------
/furnace/apex/setup.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from setuptools import setup, find_packages
3 |
4 | import sys
5 |
6 | if not torch.cuda.is_available():
7 | print("Warning: Torch did not find available GPUs on this system.\n",
8 | "If your intention is to cross-compile, this is not an error.")
9 |
10 | print("torch.__version__ = ", torch.__version__)
11 | TORCH_MAJOR = int(torch.__version__.split('.')[0])
12 | TORCH_MINOR = int(torch.__version__.split('.')[1])
13 |
14 | if TORCH_MAJOR == 0 and TORCH_MINOR < 4:
15 | raise RuntimeError("APEx requires Pytorch 0.4 or newer.\n" +
16 | "The latest stable release can be obtained from https://pytorch.org/")
17 |
18 | cmdclass = {}
19 | ext_modules = []
20 |
21 | if "--cpp_ext" in sys.argv or "--cuda_ext" in sys.argv:
22 | from torch.utils.cpp_extension import BuildExtension
23 | cmdclass['build_ext'] = BuildExtension
24 |
25 | if "--cpp_ext" in sys.argv:
26 | from torch.utils.cpp_extension import CppExtension
27 | sys.argv.remove("--cpp_ext")
28 | ext_modules.append(
29 | CppExtension('apex_C',
30 | ['csrc/flatten_unflatten.cpp',]))
31 |
32 | if "--cuda_ext" in sys.argv:
33 | from torch.utils.cpp_extension import CUDAExtension
34 | sys.argv.remove("--cuda_ext")
35 |
36 | if torch.utils.cpp_extension.CUDA_HOME is None:
37 | print("Warning: nvcc is not available. Ignoring --cuda-ext")
38 | else:
39 | ext_modules.append(
40 | CUDAExtension(name='amp_C',
41 | sources=['csrc/scale_check_overflow.cpp',
42 | 'csrc/scale_check_overflow_kernel.cu']))
43 | ext_modules.append(
44 | CUDAExtension(name='fused_adam_cuda',
45 | sources=['apex/optimizers/csrc/fused_adam_cuda.cpp',
46 | 'apex/optimizers/csrc/fused_adam_cuda_kernel.cu'],
47 | extra_compile_args={'cxx': ['-O3',],
48 | 'nvcc':['-O3',
49 | '--use_fast_math']}))
50 | ext_modules.append(
51 | CUDAExtension(name='syncbn',
52 | sources=['csrc/syncbn.cpp',
53 | 'csrc/welford.cu']))
54 | ext_modules.append(
55 | CUDAExtension(name='fused_layer_norm_cuda',
56 | sources=['apex/normalization/csrc/layer_norm_cuda.cpp',
57 | 'apex/normalization/csrc/layer_norm_cuda_kernel.cu'],
58 | extra_compile_args={'cxx': ['-O3',],
59 | 'nvcc':['-maxrregcount=50',
60 | '-O3',
61 | '--use_fast_math']}))
62 |
63 | setup(
64 | name='apex',
65 | version='0.1',
66 | packages=find_packages(exclude=('build',
67 | 'csrc',
68 | 'include',
69 | 'tests',
70 | 'dist',
71 | 'docs',
72 | 'tests',
73 | 'examples',
74 | 'apex.egg-info',)),
75 | description='PyTorch Extensions written by NVIDIA',
76 | ext_modules=ext_modules,
77 | cmdclass=cmdclass,
78 | )
79 |
--------------------------------------------------------------------------------
/furnace/apex/tests/RNN/RNN_tests.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from torch.autograd import Variable
4 | import apex
5 | from apex.RNN.models import bidirectionalRNN, stackedRNN, RNNCell
6 | from torch.nn._functions.rnn import LSTMCell
7 | import itertools
8 |
9 |
10 | torch.backends.cudnn.enabled=False
11 |
12 | batch_first = False #not implemented yet
13 | dropout = 0.0 #How to validate?
14 | bidirectional = False #True works, but differs in definition to PyTorch
15 |
16 | rnn_types = ['LSTM', 'GRU', 'ReLU', 'Tanh']
17 | sizes = [8,4,2]
18 |
19 | seq_sizes = sizes
20 | hidden_sizes = sizes
21 | inp_sizes = sizes
22 | batch_sizes = sizes
23 | num_layerss = sizes
24 |
25 | biases = [True]
26 |
27 | def copy_param_set(pyt_rnn, my_rnn, layer=0, reverse=False):
28 | my_params = None
29 |
30 | rnn = None
31 | if isinstance(my_rnn, bidirectionalRNN):
32 | rnn = my_rnn.fwd.rnns[layer] if not reverse else my_rnn.bckwrd.rnns[layer]
33 | elif isinstance(my_rnn, stackedRNN):
34 | rnn = my_rnn.rnns[layer]
35 | else:
36 | raise RuntimeError()
37 |
38 | param_names = ['w_ih', 'w_hh', 'b_ih', 'b_hh']
39 |
40 | if not hasattr(rnn, 'b_hh'):
41 | param_names = param_names[:2]
42 | my_params = [getattr(rnn, param_name) for param_name in param_names]
43 |
44 | pyt_params = None
45 | param_names = ['weight_ih_', 'weight_hh_', 'bias_ih_', 'bias_hh_']
46 | reverse_str = '_reverse' if reverse else ''
47 |
48 | if not hasattr(pyt_rnn, 'bias_hh_l0'):
49 | param_names=param_names[:2]
50 | pyt_params =[getattr(pyt_rnn, param_name + 'l' + str(layer) + reverse_str )
51 | for param_name in param_names ]
52 | for pyt_param, my_param in zip(pyt_params, my_params):
53 | pyt_param.data.copy_(my_param.data)
54 |
55 | def copy_all_params(pyt_rnn, my_rnn):
56 | for layer in range(num_layers):
57 | copy_param_set(pyt_rnn, my_rnn, layer)
58 | if bidirectional:
59 | copy_param_set(pyt_rnn, my_rnn, layer, bidirectional)
60 |
61 |
62 | def compare_variables(v1, v2, msg, params):
63 | diff = float((v1.data-v2.data).abs().max())
64 | if diff > 1e-5:
65 | print("Error of ", diff, " found for ", msg, " for case: ", str(params))
66 |
67 | def compare_tuple_variables(t1, t2, msg, params):
68 | for var1, var2 in zip(t1, t2):
69 | compare_variables(var1, var2, msg, params)
70 |
71 | def maybe_compare(v1, v2, msg, params):
72 | if isinstance(v1, Variable) and isinstance(v2, Variable):
73 | compare_variables(v1, v2, msg, params)
74 | else:
75 | compare_tuple_variables(v1, v2, msg, params)
76 |
77 | product = list(itertools.product(rnn_types, seq_sizes, hidden_sizes, inp_sizes, batch_sizes, num_layerss, biases))
78 |
79 | for test_case in product:
80 | rnn_type, seq_size, hidden_size, inp_size, batch_size, num_layers, bias = test_case
81 |
82 | inp = torch.cuda.FloatTensor(seq_size, batch_size, inp_size).uniform_()
83 |
84 | if rnn_type == 'ReLU' or rnn_type == 'Tanh':
85 | pytorch_rnn = nn.RNN(inp_size, hidden_size, num_layers, bias, batch_first, dropout, bidirectional, nonlinearity=rnn_type.lower()).cuda()
86 | else:
87 | pytorch_rnn = getattr(nn, rnn_type)(inp_size, hidden_size, num_layers, bias, batch_first, dropout, bidirectional).cuda()
88 | my_rnn = getattr(apex.RNN.models, rnn_type)(inp_size, hidden_size, num_layers, bias, batch_first, dropout, bidirectional).cuda()
89 |
90 | copy_all_params(pytorch_rnn, my_rnn)
91 |
92 | pyt_inp = Variable(inp, requires_grad=True)
93 | my_inp = Variable(inp, requires_grad=True)
94 |
95 | my_out, my_hiddens = my_rnn(my_inp)
96 | pyt_out, pyt_hiddens = pytorch_rnn(pyt_inp)
97 |
98 | pyt_out.sum().backward()
99 | my_out.sum().backward()
100 |
101 |
102 | maybe_compare(pyt_out, my_out, "out", test_case)
103 |
104 | #If there's only one hidden state PyTorch doesn't return it in a tuple,
105 | #apex does, so we wrap PyTorch's returned hidden state in a tuple.
106 | if not isinstance(pyt_hiddens, tuple):
107 | pyt_hiddens = (pyt_hiddens,)
108 |
109 | try:
110 | for i, (pyt_hid, my_hid) in enumerate(zip(pyt_hiddens, my_hiddens)):
111 | maybe_compare(pyt_hid, my_hid , "hx_"+str(i), test_case)
112 | except ValueError:
113 | maybe_compare(pyt_hiddens, my_hiddens , "hx_0", test_case)
114 |
115 |
116 | maybe_compare(pyt_inp.grad, my_inp.grad, "inp.grad", test_case)
117 |
118 | print("Test passed.")
119 |
--------------------------------------------------------------------------------
/furnace/apex/tests/distributed/ddp_race_condition_test.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.distributed as dist
3 | from torch.nn import Parameter
4 | from torch.nn import Module
5 | from apex.parallel import DistributedDataParallel as DDP
6 | import argparse
7 | import os
8 |
9 |
10 | parser = argparse.ArgumentParser(description='allreduce hook example')
11 | parser.add_argument("--local_rank", default=0, type=int)
12 | args = parser.parse_args()
13 |
14 | args.distributed = False
15 | if 'WORLD_SIZE' in os.environ:
16 | args.distributed = int(os.environ['WORLD_SIZE']) > 1
17 |
18 | if args.distributed:
19 | args.gpu = args.local_rank % torch.cuda.device_count()
20 | torch.cuda.set_device(args.gpu)
21 | torch.distributed.init_process_group(backend='nccl',
22 | init_method='env://')
23 | args.world_size = torch.distributed.get_world_size()
24 |
25 | torch.set_printoptions(precision=10)
26 | torch.manual_seed(args.local_rank)
27 |
28 | class Model(Module):
29 | def __init__(self):
30 | super(Model, self).__init__()
31 | self.a = Parameter(torch.cuda.FloatTensor(4096*4096).fill_(1.0))
32 | self.b = Parameter(torch.cuda.FloatTensor(4096*4096).fill_(2.0))
33 | def forward(self, input):
34 | return (input*self.a)*self.b
35 |
36 | model = Model()
37 | # model = DDP(model, message_size=1, gradient_predivide_factor=8.0)
38 | model = DDP(model, delay_allreduce=True)
39 | # model = DDP(model, message_size=1, allreduce_trigger_params=[model.b])
40 |
41 | x = torch.cuda.FloatTensor(4096*4096)
42 |
43 | passed = True
44 | torch.cuda.cudart().cudaProfilerStart()
45 | for i in range(10):
46 | x.fill_(i + args.local_rank) # fill x with new values every iteration for sanity
47 | model.zero_grad()
48 | out = model(x)
49 | loss = out.sum()
50 | # torch.cuda.nvtx.range_push("backward")
51 | loss.backward()
52 | # torch.cuda.nvtx.range_pop()
53 |
54 | # torch.cuda.nvtx.range_push("synchronize() + info")
55 | # torch.cuda.synchronize()
56 | print("i = {}".format(i))
57 | def info(name, param, val):
58 | expected = val*4096*4096*(2.*i+1)/2.
59 | actual = param.grad.data.sum().item()
60 | print(name+": grad.data_ptr() = {}, expected sum {}, got {}".format(
61 | param.grad.data_ptr(), expected, actual))
62 | return (expected == actual)
63 | if not info("model.a", model.module.a, 2.): passed = False
64 | if not info("model.b", model.module.b, 1.): passed = False
65 | # torch.cuda.nvtx.range_pop()
66 | torch.cuda.cudart().cudaProfilerStop()
67 |
68 | print("passed = ", passed)
69 |
--------------------------------------------------------------------------------
/furnace/apex/tests/distributed/run_race_test.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch --nproc_per_node=2 ddp_race_condition_test.py
4 |
--------------------------------------------------------------------------------
/furnace/apex/tests/run_amp/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/charlesCXK/TorchSSC/2fd21ad25af92cd9f9ad28de3c4bc897c0ae8b43/furnace/apex/tests/run_amp/__init__.py
--------------------------------------------------------------------------------
/furnace/apex/tests/run_amp/test_cache.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | import functools as ft
4 | import itertools as it
5 |
6 | from apex import amp
7 | import torch
8 | from torch import nn
9 | import torch.nn.functional as F
10 |
11 | from utils import common_init, HALF, FLOAT,\
12 | ALWAYS_HALF, ALWAYS_FLOAT, MATCH_INPUT
13 |
14 | def get_reference_grad(i, w, ops):
15 | # Creating new tensors ensures, among other things, that the new tensors are not in the cache.
16 | # In fact, they are guaranteed not to use the cache because they are not torch.nn.Parameters.
17 | fp32_i = i.detach().clone().float()
18 | fp32_w = w.detach().clone().float().requires_grad_()
19 | loss = ops(fp32_i, fp32_w)
20 | loss.backward()
21 | return fp32_w.grad
22 |
23 | class WhitelistModule(torch.nn.Module):
24 | def __init__(self, dtype):
25 | super(WhitelistModule, self).__init__()
26 | self.weight = torch.nn.Parameter(torch.arange(8*8, device='cuda', dtype=dtype).view(8,8))
27 |
28 | @staticmethod
29 | def ops(input, weight):
30 | return (input.mm(weight)).mm(weight).sum()
31 |
32 | def forward(self, input):
33 | return self.ops(input, self.weight)
34 |
35 |
36 | class BlacklistModule(torch.nn.Module):
37 | def __init__(self, dtype):
38 | super(BlacklistModule, self).__init__()
39 | self.weight = torch.nn.Parameter(torch.arange(2*8, device='cuda', dtype=dtype).view(2,8))
40 |
41 | @staticmethod
42 | def ops(input, weight):
43 | return (input + torch.pow(weight, 2) + torch.pow(weight, 2)).sum()
44 |
45 | def forward(self, input):
46 | return self.ops(input, self.weight)
47 |
48 |
49 | class PromoteModule(torch.nn.Module):
50 | def __init__(self, dtype):
51 | super(PromoteModule, self).__init__()
52 | self.weight = torch.nn.Parameter(torch.arange(2*8, device='cuda', dtype=dtype).view(2,8))
53 |
54 | @staticmethod
55 | def ops(input, weight):
56 | return ((input*weight)*weight).sum()
57 |
58 | def forward(self, input):
59 | return self.ops(input, self.weight)
60 |
61 | class TestCache(unittest.TestCase):
62 | def setUp(self):
63 | self.handle = amp.init(enabled=True)
64 | self.x = torch.ones((2, 8), device='cuda', dtype=torch.float32)
65 | common_init(self)
66 |
67 | def tearDown(self):
68 | self.handle._deactivate()
69 |
70 | def train_eval_train_test(self, module, t):
71 | model = module(t).cuda()
72 | dummy_optimizer = torch.optim.SGD(model.parameters(), lr=1.0)
73 |
74 | def training_step():
75 | for param in model.parameters():
76 | param.grad = None
77 |
78 | loss = model(self.x).sum()
79 | self.handle._default_scaler._loss_scale = 1.0
80 | with self.handle.scale_loss(loss, dummy_optimizer) as scaled_loss:
81 | scaled_loss.backward()
82 |
83 | self.assertEqual(len([p.grad for p in model.parameters() if p.grad is not None]), 1)
84 | self.assertEqual(model.weight.grad.type(), model.weight.type())
85 |
86 | reference_grad = get_reference_grad(self.x, model.weight, model.ops)
87 |
88 | # Currently there's no difference in the allclose calls, so no need for branching,
89 | # but I'm keeping this in case we want different tolerances for fp16 and fp32 checks.
90 | if model.weight.grad.type() == "torch.cuda.HalfTensor":
91 | self.assertTrue(torch.allclose(model.weight.grad.float(), reference_grad))
92 | elif model.weight.grad.type() == "torch.cuda.FloatTensor":
93 | self.assertTrue(torch.allclose(model.weight.grad.float(), reference_grad))
94 | else:
95 | raise RuntimeError("model.weight.grad.type = {}".format(model.weight.grad.type()))
96 |
97 | model.weight.data -= 1.
98 |
99 | # Simulates first epoch
100 | training_step()
101 |
102 | # Simulates eval
103 | with torch.no_grad():
104 | loss = model(self.x).sum()
105 |
106 | # Simulates resuming training after eval
107 | training_step()
108 |
109 | # I could easily have these as a set of for loops in a single test,
110 | # instead of going for granularity.
111 | def test_whitelist_module_fp16_weight(self):
112 | self.train_eval_train_test(WhitelistModule, torch.float16)
113 |
114 | def test_whitelist_module_fp32_weight(self):
115 | self.train_eval_train_test(WhitelistModule, torch.float32)
116 |
117 | def test_blacklist_module_fp16_weight(self):
118 | self.train_eval_train_test(BlacklistModule, torch.float16)
119 |
120 | def test_blacklist_module_fp32_weight(self):
121 | self.train_eval_train_test(BlacklistModule, torch.float32)
122 |
123 | def test_promote_module_fp16_weight(self):
124 | self.train_eval_train_test(PromoteModule, torch.float16)
125 |
126 | def test_promote_module_fp32_weight(self):
127 | self.train_eval_train_test(PromoteModule, torch.float32)
128 |
129 |
130 | if __name__ == '__main__':
131 | unittest.main()
132 |
--------------------------------------------------------------------------------
/furnace/apex/tests/run_amp/test_promotion.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | import itertools as it
4 |
5 | from apex import amp
6 | import torch
7 | from torch import nn
8 | import torch.nn.functional as F
9 |
10 | from utils import common_init, HALF, FLOAT, DTYPES
11 |
12 | class TestPromotion(unittest.TestCase):
13 | def setUp(self):
14 | self.handle = amp.init(enabled=True)
15 | common_init(self)
16 |
17 | def tearDown(self):
18 | self.handle._deactivate()
19 |
20 | def run_binary_promote_test(self, fns, input_shape, x_inplace=False):
21 | type_pairs = it.product(DTYPES, DTYPES)
22 | for fn, (xtype, ytype) in it.product(fns, type_pairs):
23 | x = torch.randn(input_shape, dtype=xtype).requires_grad_()
24 | x_leaf = x
25 | if x_inplace:
26 | # We need a non-leaf to call in place on
27 | x = x.clone()
28 | y = torch.randn(input_shape, dtype=ytype)
29 | out = fn(x, y)
30 | if x_inplace:
31 | # In place: always match xtype
32 | self.assertEqual(out.type(), x.type())
33 | else:
34 | # Out of place: match widest type
35 | if xtype == torch.float or ytype == torch.float:
36 | self.assertEqual(out.type(), FLOAT)
37 | else:
38 | self.assertEqual(out.type(), HALF)
39 | out.float().sum().backward()
40 | self.assertEqual(x_leaf.grad.dtype, xtype)
41 |
42 | def test_atan2_matches_widest(self):
43 | fns = [lambda x, y : torch.atan2(x, y),
44 | lambda x, y : x.atan2(y)]
45 | self.run_binary_promote_test(fns, (self.b,))
46 |
47 | def test_mul_matches_widest(self):
48 | fns = [lambda x, y : torch.mul(x, y),
49 | lambda x, y: x.mul(y)]
50 | self.run_binary_promote_test(fns, (self.b,))
51 |
52 | def test_cat_matches_widest(self):
53 | shape = self.b
54 | ys = [torch.randn(shape, dtype=torch.half) for _ in range(5)]
55 | x_float = torch.randn(shape)
56 | out = torch.cat(ys + [x_float])
57 | self.assertEqual(out.type(), FLOAT)
58 | x_half = torch.randn(shape, dtype=torch.half)
59 | out = torch.cat(ys + [x_half])
60 | self.assertEqual(out.type(), HALF)
61 |
62 | def test_inplace_exp_is_error_for_half(self):
63 | xs = torch.randn(self.b)
64 | xs.exp_()
65 | self.assertEqual(xs.type(), FLOAT)
66 | xs = torch.randn(self.b, dtype=torch.half)
67 | with self.assertRaises(NotImplementedError):
68 | xs.exp_()
69 |
70 | def test_inplace_add_matches_self(self):
71 | fn = lambda x, y: x.add_(y)
72 | self.run_binary_promote_test([fn], (self.b,), x_inplace=True)
73 |
74 | if __name__ == '__main__':
75 | unittest.main()
76 |
--------------------------------------------------------------------------------
/furnace/apex/tests/run_amp/test_rnn.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from apex import amp
4 | import random
5 | import torch
6 | from torch import nn
7 |
8 | from utils import common_init, HALF
9 |
10 | class TestRnnCells(unittest.TestCase):
11 | def setUp(self):
12 | self.handle = amp.init(enabled=True)
13 | common_init(self)
14 |
15 | def tearDown(self):
16 | self.handle._deactivate()
17 |
18 | def run_cell_test(self, cell, state_tuple=False):
19 | shape = (self.b, self.h)
20 | for typ in [torch.float, torch.half]:
21 | xs = [torch.randn(shape, dtype=typ).requires_grad_()
22 | for _ in range(self.t)]
23 | hidden_fn = lambda: torch.zeros(shape, dtype=typ)
24 | if state_tuple:
25 | hidden = (hidden_fn(), hidden_fn())
26 | else:
27 | hidden = hidden_fn()
28 | outputs = []
29 | for i in range(self.t):
30 | hidden = cell(xs[i], hidden)
31 | if state_tuple:
32 | output = hidden[0]
33 | else:
34 | output = hidden
35 | outputs.append(output)
36 | for y in outputs:
37 | self.assertEqual(y.type(), HALF)
38 | outputs[-1].float().sum().backward()
39 | for i, x in enumerate(xs):
40 | self.assertEqual(x.grad.dtype, x.dtype)
41 |
42 | def test_rnn_cell_is_half(self):
43 | cell = nn.RNNCell(self.h, self.h)
44 | self.run_cell_test(cell)
45 |
46 | def test_gru_cell_is_half(self):
47 | cell = nn.GRUCell(self.h, self.h)
48 | self.run_cell_test(cell)
49 |
50 | def test_lstm_cell_is_half(self):
51 | cell = nn.LSTMCell(self.h, self.h)
52 | self.run_cell_test(cell, state_tuple=True)
53 |
54 | class TestRnns(unittest.TestCase):
55 | def setUp(self):
56 | self.handle = amp.init(enabled=True)
57 | common_init(self)
58 |
59 | def tearDown(self):
60 | self.handle._deactivate()
61 |
62 | def run_rnn_test(self, rnn, layers, bidir, state_tuple=False):
63 | for typ in [torch.float, torch.half]:
64 | x = torch.randn((self.t, self.b, self.h), dtype=typ).requires_grad_()
65 | hidden_fn = lambda: torch.zeros((layers + (layers * bidir),
66 | self.b, self.h), dtype=typ)
67 | if state_tuple:
68 | hidden = (hidden_fn(), hidden_fn())
69 | else:
70 | hidden = hidden_fn()
71 | output, _ = rnn(x, hidden)
72 | self.assertEqual(output.type(), HALF)
73 | output[-1, :, :].float().sum().backward()
74 | self.assertEqual(x.grad.dtype, x.dtype)
75 |
76 | def test_rnn_is_half(self):
77 | configs = [(1, False), (2, False), (2, True)]
78 | for layers, bidir in configs:
79 | rnn = nn.RNN(input_size=self.h, hidden_size=self.h, num_layers=layers,
80 | nonlinearity='relu', bidirectional=bidir)
81 | self.run_rnn_test(rnn, layers, bidir)
82 |
83 | def test_gru_is_half(self):
84 | configs = [(1, False), (2, False), (2, True)]
85 | for layers, bidir in configs:
86 | rnn = nn.GRU(input_size=self.h, hidden_size=self.h, num_layers=layers,
87 | bidirectional=bidir)
88 | self.run_rnn_test(rnn, layers, bidir)
89 |
90 | def test_lstm_is_half(self):
91 | configs = [(1, False), (2, False), (2, True)]
92 | for layers, bidir in configs:
93 | rnn = nn.LSTM(input_size=self.h, hidden_size=self.h, num_layers=layers,
94 | bidirectional=bidir)
95 | self.run_rnn_test(rnn, layers, bidir, state_tuple=True)
96 |
97 | def test_rnn_packed_sequence(self):
98 | num_layers = 2
99 | rnn = nn.RNN(input_size=self.h, hidden_size=self.h, num_layers=num_layers)
100 | for typ in [torch.float, torch.half]:
101 | x = torch.randn((self.t, self.b, self.h), dtype=typ).requires_grad_()
102 | lens = sorted([random.randint(self.t // 2, self.t) for _ in range(self.b)],
103 | reverse=True)
104 | # `pack_padded_sequence` breaks if default tensor type is non-CPU
105 | torch.set_default_tensor_type(torch.FloatTensor)
106 | lens = torch.tensor(lens, dtype=torch.int64, device=torch.device('cpu'))
107 | packed_seq = nn.utils.rnn.pack_padded_sequence(x, lens)
108 | torch.set_default_tensor_type(torch.cuda.FloatTensor)
109 | hidden = torch.zeros((num_layers, self.b, self.h), dtype=typ)
110 | output, _ = rnn(packed_seq, hidden)
111 | self.assertEqual(output.data.type(), HALF)
112 | output.data.float().sum().backward()
113 | self.assertEqual(x.grad.dtype, x.dtype)
114 |
115 | if __name__ == '__main__':
116 | unittest.main()
117 |
--------------------------------------------------------------------------------
/furnace/apex/tests/run_amp/test_scale.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | import functools as ft
4 | import itertools as it
5 |
6 | from apex import amp
7 | import torch
8 | from torch import nn
9 | import torch.nn.functional as F
10 |
11 | from utils import common_init, HALF, FLOAT,\
12 | ALWAYS_HALF, ALWAYS_FLOAT, MATCH_INPUT
13 |
14 | try:
15 | import amp_C
16 | scale_check_overflow = amp_C.scale_check_overflow
17 | disabled = False
18 | except ImportError as err:
19 | print("amp_C fused kernel unavailable, disabling TestScale. ImportError was ", err)
20 | disabled = True
21 |
22 |
23 | class TestScale(unittest.TestCase):
24 |
25 | def setUp(self):
26 | self.scale = 128.0
27 | self.nx = 999
28 | self.ny = 888
29 |
30 | self.overflow_buf = torch.cuda.IntTensor([0])
31 | self.fp16 = torch.ones((self.ny, self.nx), device='cuda', dtype=torch.float16)
32 | self.fp32 = torch.ones((self.ny, self.nx), device='cuda', dtype=torch.float32)
33 | self.fp16_ref = torch.ones((1, 1), device='cuda', dtype=torch.float16)
34 | self.fp32_ref = torch.ones((1, 1), device='cuda', dtype=torch.float32)
35 |
36 | common_init(self)
37 |
38 | def tearDown(self):
39 | pass
40 |
41 | def downscale_test(self, input, output, ref):
42 | self.overflow_buf.zero_()
43 | input.fill_(1.0)
44 | if input is not output:
45 | output.fill_(3.0)
46 | input.mul_(self.scale)
47 | scale_check_overflow(input, 1./self.scale, self.overflow_buf, output)
48 | self.assertTrue(torch.allclose(output, ref))
49 | self.assertTrue(self.overflow_buf.item() == 0)
50 |
51 | def find_inf_test(self, input, output, ref, x, y, val):
52 | self.overflow_buf.zero_()
53 | input.fill_(1.0)
54 | if input is not output:
55 | output.fill_(3.0)
56 | input[x,y] = val
57 | scale_check_overflow(input, 1./self.scale, self.overflow_buf, output)
58 | self.assertTrue(self.overflow_buf.item())
59 |
60 | # Currently, the fused kernel gives a hard error if you attempt to downscale
61 | # into fp16 output, which imo is the desired behavior. Maybe someday we
62 | # will learn otherwise.
63 | # @unittest.skipIf(disabled, "amp_C is unavailable")
64 | # def test_fp16_to_fp16(self):
65 | # self.downscale_test(self.fp16, self.fp16, self.fp16_ref)
66 |
67 | @unittest.skipIf(disabled, "amp_C is unavailable")
68 | def test_fp16_to_fp32(self):
69 | self.downscale_test(self.fp16, self.fp32, self.fp32_ref)
70 |
71 | # @unittest.skipIf(disabled, "amp_C is unavailable")
72 | # def test_fp32_to_fp16(self):
73 | # self.downscale_test(self.fp32, self.fp16, self.fp16_ref)
74 |
75 | @unittest.skipIf(disabled, "amp_C is unavailable")
76 | def test_fp32_to_fp32(self):
77 | self.downscale_test(self.fp32, self.fp32, self.fp32_ref)
78 |
79 | @unittest.skipIf(disabled, "amp_C is unavailable")
80 | def test_fp16_to_fp32_find_inf_nan(self):
81 | self.find_inf_test(self.fp16, self.fp32, self.fp32_ref, 0, 0, float('nan'))
82 | self.find_inf_test(self.fp16, self.fp32, self.fp32_ref, self.ny//2, self.nx//2, float('inf'))
83 | self.find_inf_test(self.fp16, self.fp32, self.fp32_ref, self.ny-1, self.nx-1, float('nan'))
84 |
85 | @unittest.skipIf(disabled, "amp_C is unavailable")
86 | def test_fp32_to_fp32_find_inf_nan(self):
87 | self.find_inf_test(self.fp32, self.fp32, self.fp32_ref, 0, 0, float('inf'))
88 | self.find_inf_test(self.fp32, self.fp32, self.fp32_ref, self.ny//2, self.nx//2, float('nan'))
89 | self.find_inf_test(self.fp32, self.fp32, self.fp32_ref, self.ny-1, self.nx-1, float('inf'))
90 |
91 |
92 | if __name__ == '__main__':
93 | unittest.main()
94 |
--------------------------------------------------------------------------------
/furnace/apex/tests/run_amp/utils.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | HALF = 'torch.cuda.HalfTensor'
4 | FLOAT = 'torch.cuda.FloatTensor'
5 |
6 | DTYPES = [torch.half, torch.float]
7 |
8 | ALWAYS_HALF = {torch.float: HALF,
9 | torch.half: HALF}
10 | ALWAYS_FLOAT = {torch.float: FLOAT,
11 | torch.half: FLOAT}
12 | MATCH_INPUT = {torch.float: FLOAT,
13 | torch.half: HALF}
14 |
15 | def common_init(test_case):
16 | test_case.h = 64
17 | test_case.b = 16
18 | test_case.c = 16
19 | test_case.k = 3
20 | test_case.t = 10
21 | torch.set_default_tensor_type(torch.cuda.FloatTensor)
22 |
--------------------------------------------------------------------------------
/furnace/apex/tests/run_fp16_optimizer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/charlesCXK/TorchSSC/2fd21ad25af92cd9f9ad28de3c4bc897c0ae8b43/furnace/apex/tests/run_fp16_optimizer/__init__.py
--------------------------------------------------------------------------------
/furnace/apex/tests/run_fp16_optimizer/test_fp16_optimizer.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | import functools as ft
4 | import itertools as it
5 |
6 | import torch
7 | from apex.fp16_utils import FP16_Optimizer
8 |
9 | # Currently no-ops (tested via examples).
10 | # FP16_Optimizer to be deprecated and moved under unified Amp API.
11 | class TestFP16Optimizer(unittest.TestCase):
12 | def setUp(self):
13 | N, D_in, D_out = 64, 1024, 16
14 | self.N = N
15 | self.D_in = D_in
16 | self.D_out = D_out
17 | self.x = torch.randn((N, D_in), dtype=torch.float16, device='cuda')
18 | self.y = torch.randn((N, D_out), dtype=torch.float16, device='cuda')
19 | self.model = torch.nn.Linear(D_in, D_out).cuda().half()
20 |
21 | # def tearDown(self):
22 | # pass
23 |
24 | def test_minimal(self):
25 | pass
26 |
27 | def test_minimal_static(self):
28 | pass
29 |
30 | def test_minimal_dynamic(self):
31 | pass
32 |
33 | def test_closure(self):
34 | pass
35 |
36 | def test_closure_dynamic(self):
37 | pass
38 |
39 | def test_save_load(self):
40 | pass
41 |
42 | if __name__ == '__main__':
43 | unittest.main()
44 |
--------------------------------------------------------------------------------
/furnace/apex/tests/run_fp16util/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/charlesCXK/TorchSSC/2fd21ad25af92cd9f9ad28de3c4bc897c0ae8b43/furnace/apex/tests/run_fp16util/__init__.py
--------------------------------------------------------------------------------
/furnace/apex/tests/run_fp16util/test_fp16util.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | import torch
4 | import torch.nn as nn
5 |
6 | from apex.fp16_utils import FP16Model
7 |
8 |
9 | class DummyBlock(nn.Module):
10 | def __init__(self):
11 | super(DummyBlock, self).__init__()
12 |
13 | self.conv = nn.Conv2d(10, 10, 2)
14 | self.bn = nn.BatchNorm2d(10, affine=True)
15 |
16 | def forward(self, x):
17 | return self.conv(self.bn(x))
18 |
19 |
20 | class DummyNet(nn.Module):
21 | def __init__(self):
22 | super(DummyNet, self).__init__()
23 |
24 | self.conv1 = nn.Conv2d(3, 10, 2)
25 | self.bn1 = nn.BatchNorm2d(10, affine=False)
26 | self.db1 = DummyBlock()
27 | self.db2 = DummyBlock()
28 |
29 | def forward(self, x):
30 | out = x
31 | out = self.conv1(out)
32 | out = self.bn1(out)
33 | out = self.db1(out)
34 | out = self.db2(out)
35 | return out
36 |
37 |
38 | class DummyNetWrapper(nn.Module):
39 | def __init__(self):
40 | super(DummyNetWrapper, self).__init__()
41 |
42 | self.bn = nn.BatchNorm2d(3, affine=True)
43 | self.dn = DummyNet()
44 |
45 | def forward(self, x):
46 | return self.dn(self.bn(x))
47 |
48 |
49 | class TestFP16Model(unittest.TestCase):
50 | def setUp(self):
51 | self.N = 64
52 | self.C_in = 3
53 | self.H_in = 16
54 | self.W_in = 32
55 | self.in_tensor = torch.randn((self.N, self.C_in, self.H_in, self.W_in)).cuda()
56 | self.orig_model = DummyNetWrapper().cuda()
57 | self.fp16_model = FP16Model(self.orig_model)
58 |
59 | def test_params_and_buffers(self):
60 | exempted_modules = [
61 | self.fp16_model.network.bn,
62 | self.fp16_model.network.dn.db1.bn,
63 | self.fp16_model.network.dn.db2.bn,
64 | ]
65 | for m in self.fp16_model.modules():
66 | expected_dtype = torch.float if (m in exempted_modules) else torch.half
67 | for p in m.parameters(recurse=False):
68 | assert p.dtype == expected_dtype
69 | for b in m.buffers(recurse=False):
70 | assert b.dtype in (expected_dtype, torch.int64)
71 |
72 | def test_output_is_half(self):
73 | out_tensor = self.fp16_model(self.in_tensor)
74 | assert out_tensor.dtype == torch.half
75 |
76 |
--------------------------------------------------------------------------------
/furnace/apex/tests/run_mixed_adam/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/charlesCXK/TorchSSC/2fd21ad25af92cd9f9ad28de3c4bc897c0ae8b43/furnace/apex/tests/run_mixed_adam/__init__.py
--------------------------------------------------------------------------------
/furnace/apex/tests/run_test.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import sys
3 |
4 | test_dirs = ["run_amp", "run_fp16util", "run_mixed_adam"]
5 |
6 | runner = unittest.TextTestRunner(verbosity=2)
7 |
8 | errcode = 0
9 |
10 | for test_dir in test_dirs:
11 | suite = unittest.TestLoader().discover(test_dir)
12 |
13 | print("\nExecuting tests from " + test_dir)
14 |
15 | result = runner.run(suite)
16 |
17 | if not result.wasSuccessful():
18 | errcode = 1
19 |
20 | sys.exit(errcode)
21 |
--------------------------------------------------------------------------------
/furnace/apex/tests/synced_batchnorm/unit_test.sh:
--------------------------------------------------------------------------------
1 | python single_gpu_unit_test.py
2 | python -m torch.distributed.launch --nproc_per_node=2 two_gpu_unit_test.py
3 | python -m torch.distributed.launch --nproc_per_node=2 two_gpu_unit_test.py --fp64
4 | #beware, you need a system with at least 4 gpus to test group_size<world_size
--------------------------------------------------------------------------------
/furnace/base_model/xception.py:
--------------------------------------------------------------------------------
86 | has_proj = True if stride > 1 else False
87 | layers.append(block(self.in_channels, mid_out_channels, has_proj,
88 | stride=stride, norm_layer=norm_layer))
89 | self.in_channels = mid_out_channels * block.expansion
90 | for i in range(1, blocks):
91 | layers.append(block(self.in_channels, mid_out_channels,
92 | has_proj=False, stride=1,
93 | norm_layer=norm_layer))
94 |
95 | return nn.Sequential(*layers)
96 |
97 | def forward(self, x):
98 | x = self.conv1(x)
99 | x = self.maxpool(x)
100 |
101 | blocks = []
102 | x = self.layer1(x);
103 | blocks.append(x)
104 | x = self.layer2(x);
105 | blocks.append(x)
106 | x = self.layer3(x);
107 | blocks.append(x)
108 |
109 | return blocks
110 |
111 |
112 | def xception39(pretrained_model=None, **kwargs):
113 | model = Xception(Block, [4, 8, 4], [16, 32, 64], **kwargs)
114 |
115 | if pretrained_model is not None:
116 | model = load_model(model, pretrained_model)
117 | return model
118 |
--------------------------------------------------------------------------------
/furnace/datasets/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/charlesCXK/TorchSSC/2fd21ad25af92cd9f9ad28de3c4bc897c0ae8b43/furnace/datasets/__init__.py
--------------------------------------------------------------------------------
/furnace/engine/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/charlesCXK/TorchSSC/2fd21ad25af92cd9f9ad28de3c4bc897c0ae8b43/furnace/engine/__init__.py
--------------------------------------------------------------------------------
/furnace/engine/logger.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # encoding: utf-8
3 | # @Time : 2018/8/2 11:48 AM
4 | # @Author : yuchangqian
5 | # @Contact : changqian_yu@163.com
6 | # @File : logger.py
7 | import os
8 | import sys
9 | import logging
10 |
11 | from utils import pyt_utils
12 | # from utils.pyt_utils import ensure_dir
13 |
14 | _default_level_name = os.getenv('ENGINE_LOGGING_LEVEL', 'INFO')
15 | _default_level = logging.getLevelName(_default_level_name.upper())
16 |
17 |
18 | class LogFormatter(logging.Formatter):
19 | log_fout = None
20 | date_full = '[%(asctime)s %(lineno)d@%(filename)s:%(name)s] '
21 | date = '%(asctime)s '
22 | msg = '%(message)s'
23 |
24 | def format(self, record):
25 | if record.levelno == logging.DEBUG:
26 | mcl, mtxt = self._color_dbg, 'DBG'
27 | elif record.levelno == logging.WARNING:
28 | mcl, mtxt = self._color_warn, 'WRN'
29 | elif record.levelno == logging.ERROR:
30 | mcl, mtxt = self._color_err, 'ERR'
31 | else:
32 | mcl, mtxt = self._color_normal, ''
33 |
34 | if mtxt:
35 | mtxt += ' '
36 |
37 | if self.log_fout:
38 | self.__set_fmt(self.date_full + mtxt + self.msg)
39 | formatted = super(LogFormatter, self).format(record)
40 | # self.log_fout.write(formatted)
41 | # self.log_fout.write('\n')
42 | # self.log_fout.flush()
43 | return formatted
44 |
45 | self.__set_fmt(self._color_date(self.date) + mcl(mtxt + self.msg))
46 | formatted = super(LogFormatter, self).format(record)
47 |
48 | return formatted
49 |
50 | if sys.version_info.major < 3:
51 | def __set_fmt(self, fmt):
52 | self._fmt = fmt
53 | else:
54 | def __set_fmt(self, fmt):
55 | self._style._fmt = fmt
56 |
57 | @staticmethod
58 | def _color_dbg(msg):
59 | return '\x1b[36m{}\x1b[0m'.format(msg)
60 |
61 | @staticmethod
62 | def _color_warn(msg):
63 | return '\x1b[1;31m{}\x1b[0m'.format(msg)
64 |
65 | @staticmethod
66 | def _color_err(msg):
67 | return '\x1b[1;4;31m{}\x1b[0m'.format(msg)
68 |
69 | @staticmethod
70 | def _color_omitted(msg):
71 | return '\x1b[35m{}\x1b[0m'.format(msg)
72 |
73 | @staticmethod
74 | def _color_normal(msg):
75 | return msg
76 |
77 | @staticmethod
78 | def _color_date(msg):
79 | return '\x1b[32m{}\x1b[0m'.format(msg)
80 |
81 |
82 | def get_logger(log_dir=None, log_file=None, formatter=LogFormatter):
83 | logger = logging.getLogger()
84 | logger.setLevel(_default_level)
85 | del logger.handlers[:]
86 |
87 | if log_dir and log_file:
88 | pyt_utils.ensure_dir(log_dir)
89 | LogFormatter.log_fout = True
90 | file_handler = logging.FileHandler(log_file, mode='a')
91 | file_handler.setLevel(logging.INFO)
92 | file_handler.setFormatter(formatter)
93 | logger.addHandler(file_handler)
94 |
95 | stream_handler = logging.StreamHandler()
96 | stream_handler.setFormatter(formatter(datefmt='%d %H:%M:%S'))
97 | stream_handler.setLevel(0)
98 | logger.addHandler(stream_handler)
99 | return logger
100 |
--------------------------------------------------------------------------------
/furnace/engine/lr_policy.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # encoding: utf-8
3 | # @Time : 2018/8/1 1:50 AM
4 | # @Author : yuchangqian
5 | # @Contact : changqian_yu@163.com
6 | # @File : lr_policy.py
7 |
8 | from abc import ABCMeta, abstractmethod
9 |
10 |
11 | class BaseLR():
12 | __metaclass__ = ABCMeta
13 |
14 | @abstractmethod
15 | def get_lr(self, cur_iter): pass
16 |
17 |
18 | class PolyLR(BaseLR):
19 | def __init__(self, start_lr, lr_power, total_iters):
20 | self.start_lr = start_lr
21 | self.lr_power = lr_power
22 | self.total_iters = total_iters + 0.0
23 |
24 | def get_lr(self, cur_iter):
25 | return self.start_lr * (
26 | (1 - float(cur_iter) / self.total_iters) ** self.lr_power)
27 |
28 | class WarmUpPolyLR(BaseLR):
29 | def __init__(self, start_lr, lr_power, total_iters, warmup_steps):
30 | self.start_lr = start_lr
31 | self.lr_power = lr_power
32 | self.total_iters = total_iters + 0.0
33 | self.warmup_steps = warmup_steps
34 |
35 | def get_lr(self, cur_iter):
36 | if cur_iter < self.warmup_steps:
37 | return self.start_lr * (cur_iter / self.warmup_steps)
38 | else:
39 | return self.start_lr * (
40 | (1 - float(cur_iter) / self.total_iters) ** self.lr_power)
41 |
42 | class MultiStageLR(BaseLR):
43 | def __init__(self, lr_stages):
44 | assert type(lr_stages) in [list, tuple] and len(lr_stages[0]) == 2, \
45 | 'lr_stages must be list or tuple, with [iters, lr] format'
46 | self._lr_stagess = lr_stages
47 |
48 | def get_lr(self, epoch):
49 | for it_lr in self._lr_stagess:
50 | if epoch < it_lr[0]:
51 | return it_lr[1]
52 |
53 |
54 | class LinearIncreaseLR(BaseLR):
55 | def __init__(self, start_lr, end_lr, warm_iters):
56 | self._start_lr = start_lr
57 | self._end_lr = end_lr
58 | self._warm_iters = warm_iters
59 | self._delta_lr = (end_lr - start_lr) / warm_iters
60 |
61 | def get_lr(self, cur_epoch):
62 | return self._start_lr + cur_epoch * self._delta_lr
63 |
64 |
65 |
--------------------------------------------------------------------------------
/furnace/engine/version.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # encoding: utf-8
3 | # @Time : 2018/8/3 2:59 PM
4 | # @Author : yuchangqian
5 | # @Contact : changqian_yu@163.com
6 | # @File : version.py
7 |
8 | __version__ = '0.1.1'
--------------------------------------------------------------------------------
/furnace/seg_opr/__init__.py:
--------------------------------------------------------------------------------
1 | from .seg_oprs import *
2 |
--------------------------------------------------------------------------------
/furnace/seg_opr/metric.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 |
3 | import numpy as np
4 |
5 | np.seterr(divide='ignore', invalid='ignore')
6 |
7 |
8 | # voc cityscapes metric
9 | def hist_info(n_cl, pred, gt):
10 | assert (pred.shape == gt.shape)
11 | k = (gt >= 0) & (gt < n_cl)
12 | labeled = np.sum(k)
13 | correct = np.sum((pred[k] == gt[k]))
14 |
15 | return np.bincount(n_cl * gt[k].astype(int) + pred[k].astype(int),
16 | minlength=n_cl ** 2).reshape(n_cl,
17 | n_cl), labeled, correct
18 |
19 |
20 | def compute_score(hist, correct, labeled):
21 | iu = np.diag(hist) / (hist.sum(1) + hist.sum(0) - np.diag(hist))
22 | mean_IU = np.nanmean(iu)
23 | mean_IU_no_back = np.nanmean(iu[1:])
24 | freq = hist.sum(1) / hist.sum()
25 | freq_IU = (iu[freq > 0] * freq[freq > 0]).sum()
26 | mean_pixel_acc = correct / labeled
27 |
28 | return iu, mean_IU, mean_IU_no_back, mean_pixel_acc
29 |
30 |
31 | # ade metric
32 | def meanIoU(area_intersection, area_union):
33 | iou = 1.0 * np.sum(area_intersection, axis=1) / np.sum(area_union, axis=1)
34 | meaniou = np.nanmean(iou)
35 | meaniou_no_back = np.nanmean(iou[1:])
36 |
37 | return iou, meaniou, meaniou_no_back
38 |
39 |
40 | def intersectionAndUnion(imPred, imLab, numClass):
41 | # Remove classes from unlabeled pixels in gt image.
42 | # We should not penalize detections in unlabeled portions of the image.
43 | imPred = imPred * (imLab >= 0)
44 |
45 | # Compute area intersection:
46 | intersection = imPred * (imPred == imLab)
47 | (area_intersection, _) = np.histogram(intersection, bins=numClass,
48 | range=(1, numClass))
49 |
50 | # Compute area union:
51 | (area_pred, _) = np.histogram(imPred, bins=numClass, range=(1, numClass))
52 | (area_lab, _) = np.histogram(imLab, bins=numClass, range=(1, numClass))
53 | area_union = area_pred + area_lab - area_intersection
54 |
55 | return area_intersection, area_union
56 |
57 |
58 | def mean_pixel_accuracy(pixel_correct, pixel_labeled):
59 | mean_pixel_accuracy = 1.0 * np.sum(pixel_correct) / (
60 | np.spacing(1) + np.sum(pixel_labeled))
61 |
62 | return mean_pixel_accuracy
63 |
64 |
65 | def pixelAccuracy(imPred, imLab):
66 | # Remove classes from unlabeled pixels in gt image.
67 | # We should not penalize detections in unlabeled portions of the image.
68 | pixel_labeled = np.sum(imLab >= 0)
69 | pixel_correct = np.sum((imPred == imLab) * (imLab >= 0))
70 | pixel_accuracy = 1.0 * pixel_correct / pixel_labeled
71 |
72 | return pixel_accuracy, pixel_correct, pixel_labeled
73 |
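Editor's note: a toy example (not part of the repo) of how `hist_info` and `compute_score` fit together for the VOC/Cityscapes-style metric. `hist` is an `n_cl x n_cl` confusion matrix with ground-truth classes on the rows and predictions on the columns.

```python
# Hypothetical 3-class example of the confusion-matrix based metric.
import numpy as np

gt   = np.array([[0, 0, 1],
                 [1, 2, 2]])
pred = np.array([[0, 1, 1],
                 [1, 2, 0]])

hist, labeled, correct = hist_info(3, pred, gt)      # 3x3 confusion matrix, 6 labeled, 4 correct
iu, mean_IU, mean_IU_no_back, mean_pixel_acc = compute_score(hist, correct, labeled)

# Per-class IoU is diag(hist) / (row sum + column sum - diag(hist));
# here mean_pixel_acc == 4/6 and mean_IU == (1/3 + 2/3 + 1/2) / 3 == 0.5.
```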
--------------------------------------------------------------------------------
/furnace/seg_opr/parallel/parallel_apply.py:
--------------------------------------------------------------------------------
1 | # import threading
2 | import torch
3 | import torch.multiprocessing as mp
4 | from torch.cuda._utils import _get_device_index
5 |
6 |
7 | def get_a_var(obj):
8 | if isinstance(obj, torch.Tensor):
9 | return obj
10 |
11 | if isinstance(obj, list) or isinstance(obj, tuple):
12 | for result in map(get_a_var, obj):
13 | if isinstance(result, torch.Tensor):
14 | return result
15 | if isinstance(obj, dict):
16 | for result in map(get_a_var, obj.items()):
17 | if isinstance(result, torch.Tensor):
18 | return result
19 | return None
20 |
21 |
22 | def parallel_apply(modules, inputs, kwargs_tup=None, devices=None):
23 | r"""Applies each `module` in :attr:`modules` in parallel on arguments
24 | contained in :attr:`inputs` (positional) and :attr:`kwargs_tup` (keyword)
25 | on each of :attr:`devices`.
26 | Args:
27 | modules (Module): modules to be parallelized
28 | inputs (tensor): inputs to the modules
29 | devices (list of int or torch.device): CUDA devices
30 | :attr:`modules`, :attr:`inputs`, :attr:`kwargs_tup` (if given), and
31 | :attr:`devices` (if given) should all have same length. Moreover, each
32 | element of :attr:`inputs` can either be a single object as the only argument
33 | to a module, or a collection of positional arguments.
34 | """
35 | assert len(modules) == len(inputs)
36 | if kwargs_tup is not None:
37 | assert len(modules) == len(kwargs_tup)
38 | else:
39 | kwargs_tup = ({},) * len(modules)
40 | if devices is not None:
41 | assert len(modules) == len(devices)
42 | else:
43 | devices = [None] * len(modules)
44 | devices = list(map(lambda x: _get_device_index(x, True), devices))
45 | context = mp.get_context('spawn')
46 | # lock = threading.Lock()
47 | # results = {}
48 | # results = []
49 | results_queue = context.Queue(len(devices))
50 | grad_enabled = torch.is_grad_enabled()
51 |
52 | def _worker(i, module, input, kwargs, device=None):
53 | torch.set_grad_enabled(grad_enabled)
54 | if device is None:
55 | device = get_a_var(input).get_device()
56 | try:
57 | with torch.cuda.device(device):
58 | # this also avoids accidental slicing of `input` if it is a Tensor
59 | if not isinstance(input, (list, tuple)):
60 | input = (input,)
61 | output = module(*input, **kwargs)
62 | results_queue.put(output)
63 | # with lock:
64 | # results[i] = output
65 | except Exception as e:
66 | results_queue.put(e)
67 | # with lock:
68 | # results[i] = e
69 |
70 | if len(modules) > 1:
71 | processes = [context.Process(target=_worker,
72 | args=(i, module, input, kwargs, device))
73 | for i, (module, input, kwargs, device) in
74 | enumerate(zip(modules, inputs, kwargs_tup, devices))]
75 |
76 | for process in processes:
77 | process.start()
78 | for process in processes:
79 | process.join()
80 | else:
81 | _worker(0, modules[0], inputs[0], kwargs_tup[0], devices[0])
82 |
83 | outputs = []
84 | for i in range(len(inputs)):
85 | output = results_queue.get()
86 | if isinstance(output, Exception):
87 | raise output
88 | outputs.append(output)
89 | return outputs
90 |
--------------------------------------------------------------------------------
/furnace/seg_opr/sgd.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # encoding: utf-8
3 | # @Time : 2018/9/12 下午3:03
4 | # @Author : yuchangqian
5 | # @Contact : changqian_yu@163.com
6 | # @File : sgd.py
7 |
8 | import torch
9 | from torch.optim.sgd import SGD
10 |
11 |
12 | class StandardSGD(SGD):
13 | def step(self, closure=None):
14 | """Performs a single optimization step.
15 | Arguments:
16 | closure (callable, optional): A closure that reevaluates the model
17 | and returns the loss.
18 | """
19 | loss = None
20 | if closure is not None:
21 | loss = closure()
22 |
23 | for group in self.param_groups:
24 | weight_decay = group['weight_decay']
25 | momentum = group['momentum']
26 | dampening = group['dampening']
27 | nesterov = group['nesterov']
28 |
29 | for p in group['params']:
30 | if p.grad is None:
31 | continue
32 | d_p = p.grad.data
33 | if weight_decay != 0:
34 | d_p.add_(weight_decay, p.data)
35 | d_p.mul_(group['lr'])
36 | if momentum != 0:
37 | param_state = self.state[p]
38 | if 'momentum_buffer' not in param_state:
39 | buf = param_state['momentum_buffer'] = torch.zeros_like(
40 | p.data)
41 | buf.mul_(momentum).add_(d_p)
42 | else:
43 | buf = param_state['momentum_buffer']
44 | buf.mul_(momentum).add_(1 - dampening, d_p)
45 | if nesterov:
46 | d_p = d_p.add(momentum, buf)
47 | else:
48 | d_p = buf
49 |
50 | p.data.add_(-1, d_p)
51 |
52 | return loss
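Editor's note: a short usage sketch (not part of the repo). This variant differs from `torch.optim.SGD` in that the learning rate is multiplied into the gradient *before* it enters the momentum buffer, so the buffer already carries the lr and the final update is simply `p -= d_p` with no further scaling. The model below is a placeholder.

```python
# Hypothetical training step with StandardSGD; the constructor is inherited from torch.optim.SGD.
import torch
import torch.nn as nn

model = nn.Linear(4, 2)                                           # placeholder model
opt = StandardSGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)

x, y = torch.randn(8, 4), torch.randn(8, 2)
loss = nn.functional.mse_loss(model(x), y)

opt.zero_grad()
loss.backward()
opt.step()                                                        # update: p -= momentum-filtered (lr * grad)
```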
--------------------------------------------------------------------------------
/furnace/seg_opr/sync_bn/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # encoding: utf-8
3 | # @Time : 2018/10/3 下午2:10
4 | # @Author : yuchangqian
5 | # @Contact : changqian_yu@163.com
6 | # @File : __init__.py
7 |
8 | from .syncbn import *
9 | from .parallel import *
--------------------------------------------------------------------------------
/furnace/seg_opr/sync_bn/comm.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # File : comm.py
3 | # Author : Jiayuan Mao
4 | # Email : maojiayuan@gmail.com
5 | # Date : 27/01/2018
6 | #
7 | # This file is part of Synchronized-BatchNorm-PyTorch.
8 | # https://github.com/vacancy/Synchronized-BatchNorm-PyTorch
9 | # Distributed under MIT License.
10 |
11 | import queue
12 | import collections
13 | import threading
14 |
15 |
16 | __all__ = ['FutureResult', 'SlavePipe', 'SyncMaster']
17 |
18 |
19 | class FutureResult(object):
20 | """A thread-safe future implementation. Used only as one-to-one pipe."""
21 |
22 | def __init__(self):
23 | self._result = None
24 | self._lock = threading.Lock()
25 | self._cond = threading.Condition(self._lock)
26 |
27 | def put(self, result):
28 | with self._lock:
29 |             assert self._result is None, 'Previous result hasn\'t been fetched.'
30 | self._result = result
31 | self._cond.notify()
32 |
33 | def get(self):
34 | with self._lock:
35 | if self._result is None:
36 | self._cond.wait()
37 |
38 | res = self._result
39 | self._result = None
40 | return res
41 |
42 |
43 | _MasterRegistry = collections.namedtuple('MasterRegistry', ['result'])
44 | _SlavePipeBase = collections.namedtuple('_SlavePipeBase', ['identifier', 'queue', 'result'])
45 |
46 |
47 | class SlavePipe(_SlavePipeBase):
48 | """Pipe for master-slave communication."""
49 |
50 | def run_slave(self, msg):
51 | self.queue.put((self.identifier, msg))
52 | ret = self.result.get()
53 | self.queue.put(True)
54 | return ret
55 |
56 |
57 | class SyncMaster(object):
58 | """An abstract `SyncMaster` object.
59 |
60 |     - During replication, the data parallel wrapper triggers a callback on each module, so all slave devices should
61 |     call `register(id)` and obtain a `SlavePipe` to communicate with the master.
62 |     - During the forward pass, the master device invokes `run_master`; all messages from the slave devices are collected
63 |     and passed to a registered callback.
64 |     - After receiving the messages, the master device gathers the information and determines the message to be passed
65 |     back to each slave device.
66 | """
67 |
68 | def __init__(self, master_callback):
69 | """
70 |
71 | Args:
72 | master_callback: a callback to be invoked after having collected messages from slave devices.
73 | """
74 | self._master_callback = master_callback
75 | self._queue = queue.Queue()
76 | self._registry = collections.OrderedDict()
77 | self._activated = False
78 |
79 | def register_slave(self, identifier):
80 | """
81 |         Register a slave device.
82 |
83 | Args:
84 | identifier: an identifier, usually is the device id.
85 |
86 | Returns: a `SlavePipe` object which can be used to communicate with the master device.
87 |
88 | """
89 | if self._activated:
90 | assert self._queue.empty(), 'Queue is not clean before next initialization.'
91 | self._activated = False
92 | self._registry.clear()
93 | future = FutureResult()
94 | self._registry[identifier] = _MasterRegistry(future)
95 | return SlavePipe(identifier, self._queue, future)
96 |
97 | def run_master(self, master_msg):
98 | """
99 | Main entry for the master device in each forward pass.
100 |         Messages are first collected from each device (including the master device), and then
101 |         a callback is invoked to compute the message to be sent back to each device
102 |         (including the master device).
103 |
104 | Args:
105 |             master_msg: the message that the master wants to send to itself. This will be placed as the first
106 | message when calling `master_callback`. For detailed usage, see `_SynchronizedBatchNorm` for an example.
107 |
108 | Returns: the message to be sent back to the master device.
109 |
110 | """
111 | self._activated = True
112 |
113 | intermediates = [(0, master_msg)]
114 | for i in range(self.nr_slaves):
115 | intermediates.append(self._queue.get())
116 |
117 | results = self._master_callback(intermediates)
118 |         assert results[0][0] == 0, 'The first result should belong to the master.'
119 |
120 | for i, res in results:
121 | if i == 0:
122 | continue
123 | self._registry[i].result.put(res)
124 |
125 | for i in range(self.nr_slaves):
126 | assert self._queue.get() is True
127 |
128 | return results[0][1]
129 |
130 | @property
131 | def nr_slaves(self):
132 | return len(self._registry)
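Editor's note: a single-process sketch (not part of the repo) of the `SyncMaster` protocol, using plain threads in place of GPU replicas. Two "slave" threads each push a number, the master contributes its own, the callback reduces them, and every participant receives the same reduced value.

```python
# Hypothetical thread-based demo of SyncMaster / SlavePipe (real use is one replica per GPU).
import threading

def master_callback(intermediates):
    # intermediates: [(identifier, msg), ...]; index 0 is always the master's own message.
    total = sum(msg for _, msg in intermediates)
    return [(identifier, total) for identifier, _ in intermediates]

master = SyncMaster(master_callback)
pipes = {i: master.register_slave(i) for i in (1, 2)}        # register two slave devices

results = {}
def slave(identifier, msg):
    results[identifier] = pipes[identifier].run_slave(msg)   # blocks until the master replies

threads = [threading.Thread(target=slave, args=(1, 10)),
           threading.Thread(target=slave, args=(2, 20))]
for t in threads:
    t.start()
master_result = master.run_master(1)                         # master contributes msg=1
for t in threads:
    t.join()

assert master_result == results[1] == results[2] == 31       # everyone sees the reduced value
```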
--------------------------------------------------------------------------------
/furnace/seg_opr/sync_bn/functions.py:
--------------------------------------------------------------------------------
1 | ##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
2 | ## Created by: Hang Zhang
3 | ## Email: zhanghang0704@gmail.com
4 | ## Copyright (c) 2018
5 | ##
6 | ## This source code is licensed under the MIT-style license found in the
7 | ## LICENSE file in the root directory of this source tree
8 | ##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
9 |
10 | """Synchronized Cross-GPU Batch Normalization functions"""
11 | import torch
12 | from torch.autograd import Variable, Function
13 | from .src import *
14 |
15 | __all__ = ['sum_square', 'batchnormtrain']
16 |
17 | def sum_square(input):
18 | r"""Calculate sum of elements and sum of squares for Batch Normalization"""
19 | return _sum_square.apply(input)
20 |
21 |
22 | class _sum_square(Function):
23 | @staticmethod
24 | def forward(ctx, input):
25 | ctx.save_for_backward(input)
26 | if input.is_cuda:
27 | xsum, xsqusum = gpu.sumsquare_forward(input)
28 | else:
29 | xsum, xsqusum = cpu.sumsquare_forward(input)
30 | return xsum, xsqusum
31 |
32 | @staticmethod
33 | def backward(ctx, gradSum, gradSquare):
34 | input, = ctx.saved_variables
35 | if input.is_cuda:
36 | gradInput = gpu.sumsquare_backward(input, gradSum, gradSquare)
37 | else:
38 |             raise NotImplementedError('CPU backward for sum_square is not implemented')
39 | return gradInput
40 |
41 |
42 | class _batchnormtrain(Function):
43 | @staticmethod
44 | def forward(ctx, input, mean, std, gamma, beta):
45 | ctx.save_for_backward(input, mean, std, gamma, beta)
46 | if input.is_cuda:
47 | output = gpu.batchnorm_forward(input, mean, std, gamma, beta)
48 | else:
49 | output = cpu.batchnorm_forward(input, mean, std, gamma, beta)
50 | return output
51 |
52 | @staticmethod
53 | def backward(ctx, gradOutput):
54 | input, mean, std, gamma, beta = ctx.saved_variables
55 | if gradOutput.is_cuda:
56 | gradInput, gradMean, gradStd, gradGamma, gradBeta = \
57 | gpu.batchnorm_backward(gradOutput, input, mean,
58 | std, gamma, beta, True)
59 | else:
60 |             raise NotImplementedError('CPU backward for batchnormtrain is not implemented')
61 | return gradInput, gradMean, gradStd, gradGamma, gradBeta
62 |
63 |
64 | def batchnormtrain(input, mean, std, gamma, beta):
65 | r"""Applies Batch Normalization over a 3d input that is seen as a
66 | mini-batch.
67 |
68 | .. _encoding.batchnormtrain:
69 |
70 | .. math::
71 |
72 | y = \frac{x - \mu[x]}{ \sqrt{var[x] + \epsilon}} * \gamma + \beta
73 |
74 | Shape:
75 | - Input: :math:`(N, C)` or :math:`(N, C, L)`
76 | - Output: :math:`(N, C)` or :math:`(N, C, L)` (same shape as input)
77 |
78 | """
79 | return _batchnormtrain.apply(input, mean, std, gamma, beta)
80 |
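Editor's note: a pure-PyTorch sketch (not part of the repo, no compiled extension needed) of the statistics these functions exchange. Each device contributes per-channel `sum(x)` and `sum(x^2)`; after the cross-GPU reduction those two quantities yield the global mean and std that `batchnormtrain` applies.

```python
# Hypothetical illustration of the sum / sum-of-squares -> mean / std relation used by sync-BN.
import torch

x = torch.randn(4, 3, 8, 8)                      # (N, C, H, W)
count = x.numel() // x.size(1)                   # elements per channel over the whole batch

xsum = x.sum(dim=(0, 2, 3))                      # per-channel sum (roughly what sum_square computes)
xsqusum = (x * x).sum(dim=(0, 2, 3))             # per-channel sum of squares

mean = xsum / count
var = xsqusum / count - mean * mean              # E[x^2] - (E[x])^2
std = (var + 1e-5).sqrt()                        # eps added for numerical stability

# Equivalent (up to eps handling) to batchnormtrain(x, mean, std, gamma=1, beta=0):
y = (x - mean.view(1, -1, 1, 1)) / std.view(1, -1, 1, 1)
```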
--------------------------------------------------------------------------------
/furnace/seg_opr/sync_bn/parallel_apply.py:
--------------------------------------------------------------------------------
1 | # import threading
2 | import queue
3 | import torch
4 | import torch.multiprocessing as mp
5 | # from pathos.multiprocessing import ProcessPool as Pool
6 | from torch.cuda._utils import _get_device_index
7 |
8 |
9 | def get_a_var(obj):
10 | if isinstance(obj, torch.Tensor):
11 | return obj
12 |
13 | if isinstance(obj, list) or isinstance(obj, tuple):
14 | for result in map(get_a_var, obj):
15 | if isinstance(result, torch.Tensor):
16 | return result
17 | if isinstance(obj, dict):
18 | for result in map(get_a_var, obj.items()):
19 | if isinstance(result, torch.Tensor):
20 | return result
21 | return None
22 |
23 |
24 | def parallel_apply(modules, inputs, kwargs_tup=None, devices=None):
25 | r"""Applies each `module` in :attr:`modules` in parallel on arguments
26 | contained in :attr:`inputs` (positional) and :attr:`kwargs_tup` (keyword)
27 | on each of :attr:`devices`.
28 | Args:
29 | modules (Module): modules to be parallelized
30 | inputs (tensor): inputs to the modules
31 | devices (list of int or torch.device): CUDA devices
32 | :attr:`modules`, :attr:`inputs`, :attr:`kwargs_tup` (if given), and
33 | :attr:`devices` (if given) should all have same length. Moreover, each
34 | element of :attr:`inputs` can either be a single object as the only argument
35 | to a module, or a collection of positional arguments.
36 | """
37 | assert len(modules) == len(inputs)
38 | if kwargs_tup is not None:
39 | assert len(modules) == len(kwargs_tup)
40 | else:
41 | kwargs_tup = ({},) * len(modules)
42 | if devices is not None:
43 | assert len(modules) == len(devices)
44 | else:
45 | devices = [None] * len(modules)
46 | devices = list(map(lambda x: _get_device_index(x, True), devices))
47 | context = mp.get_context('spawn')
48 | # lock = threading.Lock()
49 | # results = {}
50 | # results = []
51 | # pool = context.Pool(len(devices))
52 | results_queue = queue.Queue(len(devices))
53 | grad_enabled = torch.is_grad_enabled()
54 |
55 |     def _worker(i, module, input, kwargs, device=None):
56 | torch.set_grad_enabled(grad_enabled)
57 | if device is None:
58 | device = get_a_var(input).get_device()
59 | try:
60 | with torch.cuda.device(device):
61 | # this also avoids accidental slicing of `input` if it is a Tensor
62 | if not isinstance(input, (list, tuple)):
63 | input = (input,)
64 | output = module(*input, **kwargs)
65 | results_queue.put(output)
66 | # with lock:
67 | # results[i] = output
68 | except Exception as e:
69 | results_queue.put(e)
70 | # with lock:
71 | # results[i] = e
72 |
73 | if len(modules) > 1:
74 | # pool.map(_worker, [modules, inputs, kwargs_tup, devices])
75 | processes = [context.Process(target=_worker,
76 | args=(i, module, input, kwargs, device))
77 | for i, (module, input, kwargs, device) in
78 | enumerate(zip(modules, inputs, kwargs_tup, devices))]
79 |
80 | for process in processes:
81 | process.start()
82 | for process in processes:
83 | process.join()
84 | else:
85 | _worker(0, modules[0], inputs[0], kwargs_tup[0], devices[0])
86 |
87 | outputs = []
88 | for i in range(len(inputs)):
89 | output = results_queue.get()
90 | if isinstance(output, Exception):
91 | raise output
92 | outputs.append(output)
93 | return outputs
94 |
--------------------------------------------------------------------------------
/furnace/seg_opr/sync_bn/src/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | import torch
3 | from torch.utils.cpp_extension import load
4 |
5 | cwd = os.path.dirname(os.path.realpath(__file__))
6 | cpu_path = os.path.join(cwd, 'cpu')
7 | gpu_path = os.path.join(cwd, 'gpu')
8 |
9 | cpu = load('syncbn_cpu', [
10 | os.path.join(cpu_path, 'operator.cpp'),
11 | os.path.join(cpu_path, 'syncbn_cpu.cpp'),
12 | ], build_directory=cpu_path, verbose=False)
13 |
14 | if torch.cuda.is_available():
15 | gpu = load('syncbn_gpu', [
16 | os.path.join(gpu_path, 'operator.cpp'),
17 | os.path.join(gpu_path, 'syncbn_kernel.cu'),
18 | ], build_directory=gpu_path, verbose=False)
19 |
--------------------------------------------------------------------------------
/furnace/seg_opr/sync_bn/src/cpu/.ninja_deps:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/charlesCXK/TorchSSC/2fd21ad25af92cd9f9ad28de3c4bc897c0ae8b43/furnace/seg_opr/sync_bn/src/cpu/.ninja_deps
--------------------------------------------------------------------------------
/furnace/seg_opr/sync_bn/src/cpu/.ninja_log:
--------------------------------------------------------------------------------
1 | # ninja log v5
2 | 1 3006 1563332513 syncbn_cpu.o 486ee2c6335a262c
3 | 1 6096 1563332516 operator.o df1c06f439a829e3
4 | 6096 6262 1563332517 syncbn_cpu.so 7b7138baea8e4fe0
5 | 0 3376 1578576757073544196 syncbn_cpu.o 238aaa649062d1c
6 | 0 4373 1578576758073181846 operator.o eedcce4cadeab94a
7 | 4373 4493 1578576758193138364 syncbn_cpu.so 7b7138baea8e4fe0
8 |
--------------------------------------------------------------------------------
/furnace/seg_opr/sync_bn/src/cpu/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/charlesCXK/TorchSSC/2fd21ad25af92cd9f9ad28de3c4bc897c0ae8b43/furnace/seg_opr/sync_bn/src/cpu/__init__.py
--------------------------------------------------------------------------------
/furnace/seg_opr/sync_bn/src/cpu/build.ninja:
--------------------------------------------------------------------------------
1 | ninja_required_version = 1.3
2 | cxx = c++
3 |
4 | cflags = -DTORCH_EXTENSION_NAME=syncbn_cpu -DTORCH_API_INCLUDE_EXTENSION_H -isystem /home/SENSETIME/chenxiaokang/anaconda3/lib/python3.6/site-packages/torch/lib/include -isystem /home/SENSETIME/chenxiaokang/anaconda3/lib/python3.6/site-packages/torch/lib/include/torch/csrc/api/include -isystem /home/SENSETIME/chenxiaokang/anaconda3/lib/python3.6/site-packages/torch/lib/include/TH -isystem /home/SENSETIME/chenxiaokang/anaconda3/lib/python3.6/site-packages/torch/lib/include/THC -isystem /home/SENSETIME/chenxiaokang/anaconda3/include/python3.6m -D_GLIBCXX_USE_CXX11_ABI=0 -fPIC -std=c++11
5 | ldflags = -shared
6 |
7 | rule compile
8 | command = $cxx -MMD -MF $out.d $cflags -c $in -o $out
9 | depfile = $out.d
10 | deps = gcc
11 |
12 | rule link
13 | command = $cxx $in $ldflags -o $out
14 |
15 | build operator.o: compile /home/SENSETIME/chenxiaokang/Desktop/seg/torchseg/furnace/seg_opr/sync_bn/src/cpu/operator.cpp
16 | build syncbn_cpu.o: compile /home/SENSETIME/chenxiaokang/Desktop/seg/torchseg/furnace/seg_opr/sync_bn/src/cpu/syncbn_cpu.cpp
17 |
18 | build syncbn_cpu.so: link operator.o syncbn_cpu.o
19 |
20 | default syncbn_cpu.so
21 |
22 |
--------------------------------------------------------------------------------
/furnace/seg_opr/sync_bn/src/cpu/dist/syncbn_cpu-0.0.0-py3.6-linux-x86_64.egg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/charlesCXK/TorchSSC/2fd21ad25af92cd9f9ad28de3c4bc897c0ae8b43/furnace/seg_opr/sync_bn/src/cpu/dist/syncbn_cpu-0.0.0-py3.6-linux-x86_64.egg
--------------------------------------------------------------------------------
/furnace/seg_opr/sync_bn/src/cpu/operator.cpp:
--------------------------------------------------------------------------------
1 | #include "operator.h"
2 |
3 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
4 | m.def("batchnorm_forward", &BatchNorm_Forward_CPU, "BatchNorm forward (CPU)");
5 | m.def("batchnorm_backward", &BatchNorm_Backward_CPU, "BatchNorm backward (CPU)");
6 | m.def("sumsquare_forward", &Sum_Square_Forward_CPU, "SumSqu forward (CPU)");
7 | m.def("sumsquare_backward", &Sum_Square_Backward_CPU, "SumSqu backward (CPU)");
8 | }
9 |
--------------------------------------------------------------------------------
/furnace/seg_opr/sync_bn/src/cpu/operator.h:
--------------------------------------------------------------------------------
1 | #include <torch/extension.h>
2 | #include <vector>
3 |
4 | at::Tensor BatchNorm_Forward_CPU(
5 | const at::Tensor input_,
6 | const at::Tensor mean_,
7 | const at::Tensor std_,
8 | const at::Tensor gamma_,
9 | const at::Tensor beta_);
10 |
11 | std::vector<at::Tensor> BatchNorm_Backward_CPU(
12 | const at::Tensor gradoutput_,
13 | const at::Tensor input_,
14 | const at::Tensor mean_,
15 | const at::Tensor std_,
16 | const at::Tensor gamma_,
17 | const at::Tensor beta_,
18 | bool train);
19 |
20 | std::vector<at::Tensor> Sum_Square_Forward_CPU(
21 | const at::Tensor input_);
22 |
23 | at::Tensor Sum_Square_Backward_CPU(
24 | const at::Tensor input_,
25 | const at::Tensor gradSum_,
26 | const at::Tensor gradSquare_);
--------------------------------------------------------------------------------
/furnace/seg_opr/sync_bn/src/cpu/operator.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/charlesCXK/TorchSSC/2fd21ad25af92cd9f9ad28de3c4bc897c0ae8b43/furnace/seg_opr/sync_bn/src/cpu/operator.o
--------------------------------------------------------------------------------
/furnace/seg_opr/sync_bn/src/cpu/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
2 | from torch.utils.cpp_extension import BuildExtension, CppExtension
3 |
4 | setup(
5 | name='syncbn_cpu',
6 | ext_modules=[
7 | CppExtension('syncbn_cpu', [
8 | 'operator.cpp',
9 | 'syncbn_cpu.cpp',
10 | ]),
11 | ],
12 | cmdclass={
13 | 'build_ext': BuildExtension
14 | })
15 |
--------------------------------------------------------------------------------
/furnace/seg_opr/sync_bn/src/cpu/syncbn_cpu.cpp:
--------------------------------------------------------------------------------
1 | #include <torch/extension.h>
2 | #include <ATen/ATen.h>
3 | #include <vector>
4 |
5 | at::Tensor broadcast_to(at::Tensor v, at::Tensor x) {
6 | if (x.ndimension() == 2) {
7 | return v;
8 | } else {
9 |     std::vector<int64_t> broadcast_size = {1, -1};
10 | for (int64_t i = 2; i < x.ndimension(); ++i)
11 | broadcast_size.push_back(1);
12 |
13 | return v.view(broadcast_size);
14 | }
15 | }
16 |
17 | at::Tensor BatchNorm_Forward_CPU(
18 | const at::Tensor input,
19 | const at::Tensor mean,
20 | const at::Tensor std,
21 | const at::Tensor gamma,
22 | const at::Tensor beta) {
23 | auto output = (input - broadcast_to(mean, input)) / broadcast_to(std, input);
24 | output = output * broadcast_to(gamma, input) + broadcast_to(beta, input);
25 | return output;
26 | }
27 |
28 | // Not implementing CPU backward for now
29 | std::vector<at::Tensor> BatchNorm_Backward_CPU(
30 | const at::Tensor gradoutput,
31 | const at::Tensor input,
32 | const at::Tensor mean,
33 | const at::Tensor std,
34 | const at::Tensor gamma,
35 | const at::Tensor beta,
36 | bool train) {
37 | /* outputs*/
38 | at::Tensor gradinput = at::zeros_like(input);
39 | at::Tensor gradgamma = at::zeros_like(gamma);
40 | at::Tensor gradbeta = at::zeros_like(beta);
41 | at::Tensor gradMean = at::zeros_like(mean);
42 | at::Tensor gradStd = at::zeros_like(std);
43 | return {gradinput, gradMean, gradStd, gradgamma, gradbeta};
44 | }
45 |
46 | std::vector<at::Tensor> Sum_Square_Forward_CPU(
47 | const at::Tensor input) {
48 | /* outputs */
49 | at::Tensor sum = torch::zeros({input.size(1)}, input.options());
50 | at::Tensor square = torch::zeros({input.size(1)}, input.options());
51 | return {sum, square};
52 | }
53 |
54 | at::Tensor Sum_Square_Backward_CPU(
55 | const at::Tensor input,
56 | const at::Tensor gradSum,
57 | const at::Tensor gradSquare) {
58 | /* outputs */
59 | at::Tensor gradInput = at::zeros_like(input);
60 | return gradInput;
61 | }
62 |
--------------------------------------------------------------------------------
/furnace/seg_opr/sync_bn/src/cpu/syncbn_cpu.egg-info/PKG-INFO:
--------------------------------------------------------------------------------
1 | Metadata-Version: 1.0
2 | Name: syncbn-cpu
3 | Version: 0.0.0
4 | Summary: UNKNOWN
5 | Home-page: UNKNOWN
6 | Author: UNKNOWN
7 | Author-email: UNKNOWN
8 | License: UNKNOWN
9 | Description: UNKNOWN
10 | Platform: UNKNOWN
11 |
--------------------------------------------------------------------------------
/furnace/seg_opr/sync_bn/src/cpu/syncbn_cpu.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/charlesCXK/TorchSSC/2fd21ad25af92cd9f9ad28de3c4bc897c0ae8b43/furnace/seg_opr/sync_bn/src/cpu/syncbn_cpu.o
--------------------------------------------------------------------------------
/furnace/seg_opr/sync_bn/src/cpu/syncbn_cpu.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/charlesCXK/TorchSSC/2fd21ad25af92cd9f9ad28de3c4bc897c0ae8b43/furnace/seg_opr/sync_bn/src/cpu/syncbn_cpu.so
--------------------------------------------------------------------------------
/furnace/seg_opr/sync_bn/src/gpu/.ninja_deps:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/charlesCXK/TorchSSC/2fd21ad25af92cd9f9ad28de3c4bc897c0ae8b43/furnace/seg_opr/sync_bn/src/gpu/.ninja_deps
--------------------------------------------------------------------------------
/furnace/seg_opr/sync_bn/src/gpu/.ninja_log:
--------------------------------------------------------------------------------
1 | # ninja log v5
2 | 1 6029 1563332523 operator.o 93fbaee254d44db4
3 | 1 11088 1563332528 syncbn_kernel.cuda.o ec50d81437939f2c
4 | 11088 11258 1563332528 syncbn_gpu.so a2b728e60c853ec3
5 | 0 2904 1578576761208045520 operator.o cecba1516d789115
6 | 0 7039 1578576765338548294 syncbn_kernel.cuda.o a19378e9f1e5d587
7 | 7039 7134 1578576765434513509 syncbn_gpu.so a2b728e60c853ec3
8 |
--------------------------------------------------------------------------------
/furnace/seg_opr/sync_bn/src/gpu/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/charlesCXK/TorchSSC/2fd21ad25af92cd9f9ad28de3c4bc897c0ae8b43/furnace/seg_opr/sync_bn/src/gpu/__init__.py
--------------------------------------------------------------------------------
/furnace/seg_opr/sync_bn/src/gpu/build.ninja:
--------------------------------------------------------------------------------
1 | ninja_required_version = 1.3
2 | cxx = c++
3 | nvcc = /usr/local/cuda/bin/nvcc
4 |
5 | cflags = -DTORCH_EXTENSION_NAME=syncbn_gpu -DTORCH_API_INCLUDE_EXTENSION_H -isystem /home/SENSETIME/chenxiaokang/anaconda3/lib/python3.6/site-packages/torch/lib/include -isystem /home/SENSETIME/chenxiaokang/anaconda3/lib/python3.6/site-packages/torch/lib/include/torch/csrc/api/include -isystem /home/SENSETIME/chenxiaokang/anaconda3/lib/python3.6/site-packages/torch/lib/include/TH -isystem /home/SENSETIME/chenxiaokang/anaconda3/lib/python3.6/site-packages/torch/lib/include/THC -isystem /usr/local/cuda/include -isystem /home/SENSETIME/chenxiaokang/anaconda3/include/python3.6m -D_GLIBCXX_USE_CXX11_ABI=0 -fPIC -std=c++11
6 | cuda_flags = -DTORCH_EXTENSION_NAME=syncbn_gpu -DTORCH_API_INCLUDE_EXTENSION_H -isystem /home/SENSETIME/chenxiaokang/anaconda3/lib/python3.6/site-packages/torch/lib/include -isystem /home/SENSETIME/chenxiaokang/anaconda3/lib/python3.6/site-packages/torch/lib/include/torch/csrc/api/include -isystem /home/SENSETIME/chenxiaokang/anaconda3/lib/python3.6/site-packages/torch/lib/include/TH -isystem /home/SENSETIME/chenxiaokang/anaconda3/lib/python3.6/site-packages/torch/lib/include/THC -isystem /usr/local/cuda/include -isystem /home/SENSETIME/chenxiaokang/anaconda3/include/python3.6m -D_GLIBCXX_USE_CXX11_ABI=0 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --compiler-options '-fPIC' -std=c++11
7 | ldflags = -shared -L/usr/local/cuda/lib64 -lcudart
8 |
9 | rule compile
10 | command = $cxx -MMD -MF $out.d $cflags -c $in -o $out
11 | depfile = $out.d
12 | deps = gcc
13 |
14 | rule cuda_compile
15 | command = $nvcc $cuda_flags -c $in -o $out
16 |
17 | rule link
18 | command = $cxx $in $ldflags -o $out
19 |
20 | build operator.o: compile /home/SENSETIME/chenxiaokang/Desktop/seg/torchseg/furnace/seg_opr/sync_bn/src/gpu/operator.cpp
21 | build syncbn_kernel.cuda.o: cuda_compile /home/SENSETIME/chenxiaokang/Desktop/seg/torchseg/furnace/seg_opr/sync_bn/src/gpu/syncbn_kernel.cu
22 |
23 | build syncbn_gpu.so: link operator.o syncbn_kernel.cuda.o
24 |
25 | default syncbn_gpu.so
26 |
27 |
--------------------------------------------------------------------------------
/furnace/seg_opr/sync_bn/src/gpu/device_tensor.h:
--------------------------------------------------------------------------------
1 | #include <ATen/ATen.h>
2 |
3 | template <typename DType, int Dim>
4 | struct DeviceTensor {
5 | public:
6 | inline __device__ __host__ DeviceTensor(DType *p, const int *size)
7 | : dptr_(p) {
8 | for (int i = 0; i < Dim; ++i) {
9 | size_[i] = size ? size[i] : 0;
10 | }
11 | }
12 |
13 | inline __device__ __host__ unsigned getSize(const int i) const {
14 | assert(i < Dim);
15 | return size_[i];
16 | }
17 |
18 | inline __device__ __host__ int numElements() const {
19 | int n = 1;
20 | for (int i = 0; i < Dim; ++i) {
21 | n *= size_[i];
22 | }
23 | return n;
24 | }
25 |
26 |   inline __device__ __host__ DeviceTensor<DType, Dim-1> select(const size_t x) const {
27 | assert(Dim > 1);
28 | int offset = x;
29 | for (int i = 1; i < Dim; ++i) {
30 | offset *= size_[i];
31 | }
32 |     DeviceTensor<DType, Dim-1> tensor(dptr_ + offset, nullptr);
33 | for (int i = 0; i < Dim - 1; ++i) {
34 | tensor.size_[i] = this->size_[i+1];
35 | }
36 | return tensor;
37 | }
38 |
39 |   inline __device__ __host__ DeviceTensor<DType, Dim-1> operator[](const size_t x) const {
40 | assert(Dim > 1);
41 | int offset = x;
42 | for (int i = 1; i < Dim; ++i) {
43 | offset *= size_[i];
44 | }
45 |     DeviceTensor<DType, Dim-1> tensor(dptr_ + offset, nullptr);
46 | for (int i = 0; i < Dim - 1; ++i) {
47 | tensor.size_[i] = this->size_[i+1];
48 | }
49 | return tensor;
50 | }
51 |
52 | inline __device__ __host__ size_t InnerSize() const {
53 | assert(Dim >= 3);
54 | size_t sz = 1;
55 | for (size_t i = 2; i < Dim; ++i) {
56 | sz *= size_[i];
57 | }
58 | return sz;
59 | }
60 |
61 | inline __device__ __host__ size_t ChannelCount() const {
62 | assert(Dim >= 3);
63 | return size_[1];
64 | }
65 |
66 | inline __device__ __host__ DType* data_ptr() const {
67 | return dptr_;
68 | }
69 |
70 | DType *dptr_;
71 | int size_[Dim];
72 | };
73 |
74 | template <typename DType>
75 | struct DeviceTensor<DType, 1> {
76 | inline __device__ __host__ DeviceTensor(DType *p, const int *size)
77 | : dptr_(p) {
78 | size_[0] = size ? size[0] : 0;
79 | }
80 |
81 | inline __device__ __host__ unsigned getSize(const int i) const {
82 | assert(i == 0);
83 | return size_[0];
84 | }
85 |
86 | inline __device__ __host__ int numElements() const {
87 | return size_[0];
88 | }
89 |
90 | inline __device__ __host__ DType &operator[](const size_t x) const {
91 | return *(dptr_ + x);
92 | }
93 |
94 | inline __device__ __host__ DType* data_ptr() const {
95 | return dptr_;
96 | }
97 |
98 | DType *dptr_;
99 | int size_[1];
100 | };
101 |
102 | template <typename DType, int Dim>
103 | static DeviceTensor<DType, Dim> devicetensor(const at::Tensor &blob) {
104 |   DType *data = blob.data<DType>();
105 |   DeviceTensor<DType, Dim> tensor(data, nullptr);
106 | for (int i = 0; i < Dim; ++i) {
107 | tensor.size_[i] = blob.size(i);
108 | }
109 | return tensor;
110 | }
111 |
--------------------------------------------------------------------------------
/furnace/seg_opr/sync_bn/src/gpu/dist/syncbn_gpu-0.0.0-py3.6-linux-x86_64.egg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/charlesCXK/TorchSSC/2fd21ad25af92cd9f9ad28de3c4bc897c0ae8b43/furnace/seg_opr/sync_bn/src/gpu/dist/syncbn_gpu-0.0.0-py3.6-linux-x86_64.egg
--------------------------------------------------------------------------------
/furnace/seg_opr/sync_bn/src/gpu/operator.cpp:
--------------------------------------------------------------------------------
1 | #include "operator.h"
2 |
3 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
4 | m.def("batchnorm_forward", &BatchNorm_Forward_CUDA, "BatchNorm forward (CUDA)");
5 | m.def("batchnorm_backward", &BatchNorm_Backward_CUDA, "BatchNorm backward (CUDA)");
6 | m.def("sumsquare_forward", &Sum_Square_Forward_CUDA, "SumSqu forward (CUDA)");
7 | m.def("sumsquare_backward", &Sum_Square_Backward_CUDA, "SumSqu backward (CUDA)");
8 |
9 | }
10 |
--------------------------------------------------------------------------------
/furnace/seg_opr/sync_bn/src/gpu/operator.h:
--------------------------------------------------------------------------------
1 | #include <torch/extension.h>
2 | #include <vector>
3 |
4 |
5 | at::Tensor BatchNorm_Forward_CUDA(
6 | const at::Tensor input_,
7 | const at::Tensor mean_,
8 | const at::Tensor std_,
9 | const at::Tensor gamma_,
10 | const at::Tensor beta_);
11 |
12 | std::vector<at::Tensor> BatchNorm_Backward_CUDA(
13 | const at::Tensor gradoutput_,
14 | const at::Tensor input_,
15 | const at::Tensor mean_,
16 | const at::Tensor std_,
17 | const at::Tensor gamma_,
18 | const at::Tensor beta_,
19 | bool train);
20 |
21 | std::vector<at::Tensor> Sum_Square_Forward_CUDA(
22 | const at::Tensor input_);
23 |
24 | at::Tensor Sum_Square_Backward_CUDA(
25 | const at::Tensor input_,
26 | const at::Tensor gradSum_,
27 | const at::Tensor gradSquare_);
28 |
--------------------------------------------------------------------------------
/furnace/seg_opr/sync_bn/src/gpu/operator.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/charlesCXK/TorchSSC/2fd21ad25af92cd9f9ad28de3c4bc897c0ae8b43/furnace/seg_opr/sync_bn/src/gpu/operator.o
--------------------------------------------------------------------------------
/furnace/seg_opr/sync_bn/src/gpu/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
2 | from torch.utils.cpp_extension import BuildExtension, CUDAExtension
3 |
4 | setup(
5 | name='syncbn_gpu',
6 | ext_modules=[
7 | CUDAExtension('syncbn_gpu', [
8 | 'operator.cpp',
9 | 'syncbn_kernel.cu',
10 | ]),
11 | ],
12 | cmdclass={
13 | 'build_ext': BuildExtension
14 | })
15 |
--------------------------------------------------------------------------------
/furnace/seg_opr/sync_bn/src/gpu/syncbn_gpu.egg-info/PKG-INFO:
--------------------------------------------------------------------------------
1 | Metadata-Version: 1.0
2 | Name: syncbn-gpu
3 | Version: 0.0.0
4 | Summary: UNKNOWN
5 | Home-page: UNKNOWN
6 | Author: UNKNOWN
7 | Author-email: UNKNOWN
8 | License: UNKNOWN
9 | Description: UNKNOWN
10 | Platform: UNKNOWN
11 |
--------------------------------------------------------------------------------
/furnace/seg_opr/sync_bn/src/gpu/syncbn_gpu.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/charlesCXK/TorchSSC/2fd21ad25af92cd9f9ad28de3c4bc897c0ae8b43/furnace/seg_opr/sync_bn/src/gpu/syncbn_gpu.so
--------------------------------------------------------------------------------
/furnace/seg_opr/sync_bn/src/gpu/syncbn_kernel.cuda.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/charlesCXK/TorchSSC/2fd21ad25af92cd9f9ad28de3c4bc897c0ae8b43/furnace/seg_opr/sync_bn/src/gpu/syncbn_kernel.cuda.o
--------------------------------------------------------------------------------
/furnace/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/charlesCXK/TorchSSC/2fd21ad25af92cd9f9ad28de3c4bc897c0ae8b43/furnace/utils/__init__.py
--------------------------------------------------------------------------------
/furnace/utils/init_func.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # encoding: utf-8
3 | # @Time : 2018/9/28 下午12:13
4 | # @Author : yuchangqian
5 | # @Contact : changqian_yu@163.com
6 | # @File  : init_func.py
7 | import torch
8 | import torch.nn as nn
9 |
10 |
11 | def __init_weight(feature, conv_init, norm_layer, bn_eps, bn_momentum,
12 | **kwargs):
13 | for name, m in feature.named_modules():
14 | if isinstance(m, (nn.Conv1d, nn.Conv2d, nn.Conv3d)):
15 | conv_init(m.weight, **kwargs)
16 | elif isinstance(m, norm_layer):
17 | m.eps = bn_eps
18 | m.momentum = bn_momentum
19 | nn.init.constant_(m.weight, 1)
20 | nn.init.constant_(m.bias, 0)
21 |
22 |
23 | def init_weight(module_list, conv_init, norm_layer, bn_eps, bn_momentum,
24 | **kwargs):
25 | if isinstance(module_list, list):
26 | for feature in module_list:
27 | __init_weight(feature, conv_init, norm_layer, bn_eps, bn_momentum,
28 | **kwargs)
29 | else:
30 | __init_weight(module_list, conv_init, norm_layer, bn_eps, bn_momentum,
31 | **kwargs)
32 |
33 |
34 | def group_weight(weight_group, module, norm_layer, lr):
35 | group_decay = []
36 | group_no_decay = []
37 | for m in module.modules():
38 | if isinstance(m, nn.Linear):
39 | group_decay.append(m.weight)
40 | if m.bias is not None:
41 | group_no_decay.append(m.bias)
42 | elif isinstance(m, (nn.Conv1d, nn.Conv2d, nn.Conv3d, nn.ConvTranspose2d, nn.ConvTranspose3d)):
43 | group_decay.append(m.weight)
44 | if m.bias is not None:
45 | group_no_decay.append(m.bias)
46 | elif isinstance(m, norm_layer) or isinstance(m, nn.BatchNorm1d) or isinstance(m, nn.BatchNorm2d) \
47 | or isinstance(m, nn.BatchNorm3d) or isinstance(m, nn.GroupNorm):
48 | if m.weight is not None:
49 | group_no_decay.append(m.weight)
50 | if m.bias is not None:
51 | group_no_decay.append(m.bias)
52 | elif isinstance(m, nn.Parameter):
53 | group_decay.append(m)
54 | # else:
55 | # print(m, norm_layer)
56 | # print(module.modules)
57 | # print( len(list(module.parameters())) , 'HHHHHHHHHHHHHHHHH', len(group_decay) + len(
58 | # group_no_decay))
59 | assert len(list(module.parameters())) == len(group_decay) + len(
60 | group_no_decay)
61 | weight_group.append(dict(params=group_decay, lr=lr))
62 | weight_group.append(dict(params=group_no_decay, weight_decay=.0, lr=lr))
63 | return weight_group
64 |
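Editor's note: a usage sketch (not part of the repo) for `group_weight`. Conv/linear weights go into a decayed group, while norm-layer weights and all biases go into a non-decayed group; both keep the same learning rate. The toy model below is a placeholder.

```python
# Hypothetical parameter grouping for an optimizer with selective weight decay.
import torch
import torch.nn as nn

model = nn.Sequential(nn.Conv2d(3, 8, 3), nn.BatchNorm2d(8), nn.ReLU(), nn.Conv2d(8, 2, 1))

param_groups = group_weight([], model, nn.BatchNorm2d, lr=0.1)

optimizer = torch.optim.SGD(param_groups, lr=0.1, momentum=0.9, weight_decay=5e-4)
# group 0 (conv weights) uses weight_decay=5e-4; group 1 (BN params and biases) uses weight_decay=0.
```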
--------------------------------------------------------------------------------
/furnace/utils/visualize.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import cv2
3 | import scipy.io as sio
4 |
5 | def set_img_color(colors, background, img, pred, gt, show255=False):
6 | for i in range(0, len(colors)):
7 | if i != background:
8 | img[np.where(pred == i)] = colors[i]
9 | if show255:
10 | img[np.where(gt==background)] = 255
11 | return img
12 |
13 | def show_prediction(colors, background, img, pred, gt):
14 | im = np.array(img, np.uint8)
15 | set_img_color(colors, background, im, pred, gt)
16 | final = np.array(im)
17 | return final
18 |
19 | def show_img(colors, background, img, clean, gt, *pds):
20 | im1 = np.array(img, np.uint8)
21 | #set_img_color(colors, background, im1, clean, gt)
22 | final = np.array(im1)
23 | # the pivot black bar
24 | pivot = np.zeros((im1.shape[0], 15, 3), dtype=np.uint8)
25 | for pd in pds:
26 | im = np.array(img, np.uint8)
27 | # pd[np.where(gt == 255)] = 255
28 | set_img_color(colors, background, im, pd, gt)
29 | final = np.column_stack((final, pivot))
30 | final = np.column_stack((final, im))
31 |
32 | im = np.array(img, np.uint8)
33 | set_img_color(colors, background, im, gt, True)
34 | final = np.column_stack((final, pivot))
35 | final = np.column_stack((final, im))
36 | return final
37 |
38 | def get_colors(class_num):
39 | colors = []
40 | for i in range(class_num):
41 | colors.append((np.random.random((1,3)) * 255).tolist()[0])
42 |
43 | return colors
44 |
45 | def get_ade_colors():
46 | colors = sio.loadmat('./color150.mat')['colors']
47 | colors = colors[:,::-1,]
48 | colors = np.array(colors).astype(int).tolist()
49 | colors.insert(0,[0,0,0])
50 |
51 | return colors
52 |
53 | def print_iou(iu, mean_pixel_acc, class_names=None, show_no_back=False, no_print=False):
54 | n = iu.size
55 | lines = []
56 | for i in range(n):
57 | if class_names is None:
58 | cls = 'Class %d:' % (i+1)
59 | else:
60 | cls = '%d %s' % (i+1, class_names[i])
61 | lines.append('%-8s\t%.3f%%' % (cls, iu[i] * 100))
62 | mean_IU = np.nanmean(iu)
63 | mean_IU_no_back = np.nanmean(iu[1:])
64 | if show_no_back:
65 | lines.append('---------------------------- %-8s\t%.3f%%\t%-8s\t%.3f%%\t%-8s\t%.3f%%' % ('mean_IU', mean_IU * 100, 'mean_IU_no_back', mean_IU_no_back*100,
66 | 'mean_pixel_ACC',mean_pixel_acc*100))
67 | else:
68 | print(mean_pixel_acc)
69 | lines.append('---------------------------- %-8s\t%.3f%%\t%-8s\t%.3f%%' % ('mean_IU', mean_IU * 100,'mean_pixel_ACC',mean_pixel_acc*100))
70 | line = "\n".join(lines)
71 | if not no_print:
72 | print(line)
73 | return line
74 |
75 |
76 |
--------------------------------------------------------------------------------
/install.md:
--------------------------------------------------------------------------------
1 | ## Installation
2 |
3 | The code is developed with Python 3.6 and PyTorch 1.0.0, and has been developed and tested on 2 GPU cards.
4 |
5 | 1. **Clone this repo.**
6 |
7 | ```shell
8 | $ git clone https://github.com/charlesCXK/TorchSSC.git
9 | $ cd TorchSSC
10 | ```
11 |
12 | 2. **Install dependencies.**
13 |
14 | **(1) Create a conda environment:**
15 |
16 | ```shell
17 | $ conda env create -f ssc.yaml
18 | $ conda activate ssc
19 | ```
20 |
21 | **(2) Install apex 0.1 (requires CUDA):**
22 |
23 | ```shell
24 | $ cd ./furnace/apex
25 | $ python setup.py install --cpp_ext --cuda_ext
26 | ```
27 |
28 |
--------------------------------------------------------------------------------
/model/sketch.nyu/config.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 |
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 |
7 | import os
8 | import os.path as osp
9 | import sys
10 | import time
11 | import numpy as np
12 | from easydict import EasyDict as edict
13 | import argparse
14 |
15 | C = edict()
16 | config = C
17 | cfg = C
18 |
19 | C.seed = 12345
20 |
21 | remoteip = os.popen('pwd').read()
22 | C.volna = '/home/chen/TorchSSC/' # this is the path to your repo 'TorchSSC'
23 |
24 |
25 | """Please configure ROOT_dir and user when first using this repo."""
26 | C.repo_name = 'TorchSSC'
27 | C.abs_dir = osp.realpath(".")
28 | C.this_dir = C.abs_dir.split(osp.sep)[-1]
29 |
30 |
31 | C.root_dir = C.abs_dir[:C.abs_dir.index(C.repo_name) + len(C.repo_name)]
32 | C.log_dir = osp.abspath('log')
33 | C.tb_dir = osp.abspath(osp.join(C.log_dir, "tb"))
34 |
35 | C.log_dir_link = osp.join(C.abs_dir, 'log')
36 | C.snapshot_dir = osp.abspath(osp.join(C.log_dir, "snapshot"))
37 |
38 | exp_time = time.strftime('%Y_%m_%d_%H_%M_%S', time.localtime())
39 | C.log_file = C.log_dir + '/log_' + exp_time + '.log'
40 | C.link_log_file = C.log_file + '/log_last.log'
41 | C.val_log_file = C.log_dir + '/val_' + exp_time + '.log'
42 | C.link_val_log_file = C.log_dir + '/val_last.log'
43 |
44 | """Data Dir and Weight Dir"""
45 | C.dataset_path = osp.join(C.volna, 'DATA/NYU/')
46 | C.img_root_folder = C.dataset_path
47 | C.gt_root_folder = C.dataset_path
48 | C.hha_root_folder = osp.join(C.dataset_path, 'HHA')
49 | C.mapping_root_folder = osp.join(C.dataset_path, 'Mapping')
50 | C.train_source = osp.join(C.dataset_path, "train.txt")
51 | C.eval_source = osp.join(C.dataset_path, "test.txt")
52 | C.is_test = False
53 |
54 | """Path Config"""
55 |
56 |
57 | def add_path(path):
58 | if path not in sys.path:
59 | sys.path.insert(0, path)
60 |
61 |
62 | add_path(osp.join(C.root_dir, 'furnace'))
63 |
64 | from utils.pyt_utils import model_urls
65 |
66 | """Image Config"""
67 | C.num_classes = 12
68 | C.background = 255
69 | C.image_mean = np.array([0.485, 0.456, 0.406])
70 | C.image_std = np.array([0.229, 0.224, 0.225])
71 | C.image_height = 480
72 | C.image_width = 640
73 | C.num_train_imgs = 795
74 | C.num_eval_imgs = 654
75 |
76 | """ Settings for network, this would be different for each kind of model"""
77 | C.fix_bias = True
78 | C.bn_eps = 1e-5
79 | C.bn_momentum = 0.1
80 | C.pretrained_model = C.volna + 'DATA/pytorch-weight/resnet50-imagenet.pth'
81 |
82 | """Train Config"""
83 | C.lr = 0.1
84 | C.lr_power = 0.9
85 | C.momentum = 0.9
86 | C.weight_decay = 5e-4
87 | C.batch_size = 4
88 | C.nepochs = 250
89 | C.niters_per_epoch = 795 // C.batch_size
90 | C.num_workers = C.batch_size
91 |
92 | C.train_scale_array = [1]
93 | C.warm_up_epoch = 0
94 |
95 | """Eval Config"""
96 | C.eval_iter = 30
97 | C.eval_stride_rate = 2 / 3
98 | C.eval_scale_array = [1, ]
99 | C.eval_flip = False
100 | C.eval_base_size = 480
101 | C.eval_crop_size = 640
102 |
103 | """Display Config"""
104 | C.snapshot_iter = 10
105 | C.record_info_iter = 20
106 | C.display_iter = 50
107 | C.sketch_weight = 1
108 | C.sketch_weight_gsnn = 1.5
109 |
110 | C.kld_weight = 2
111 | C.samples = 4
112 | C.lantent_size = 16
113 | C.empty_loss_weight = 1
114 |
115 | def open_tensorboard():
116 | pass
117 |
118 | if __name__ == '__main__':
119 | print(config.nepochs)
120 | parser = argparse.ArgumentParser()
121 | parser.add_argument(
122 | '-tb', '--tensorboard', default=False, action='store_true')
123 | args = parser.parse_args()
124 |
125 | if args.tensorboard:
126 | open_tensorboard()
--------------------------------------------------------------------------------
/model/sketch.nyu/dataloader.py:
--------------------------------------------------------------------------------
1 | import cv2
2 | import torch
3 | import numpy as np
4 | from torch.utils import data
5 | import random
6 | from config import config
7 | from utils.img_utils import normalize, \
8 | generate_random_crop_pos, random_crop_pad_to_shape
9 |
10 | class TrainPre(object):
11 | def __init__(self, img_mean, img_std):
12 | self.img_mean = img_mean
13 | self.img_std = img_std
14 |
15 | def __call__(self, img, hha):
16 | img = normalize(img, self.img_mean, self.img_std)
17 | hha = normalize(hha, self.img_mean, self.img_std)
18 |
19 | p_img = img.transpose(2, 0, 1)
20 | p_hha = hha.transpose(2, 0, 1)
21 |
22 | extra_dict = {'hha_img': p_hha}
23 |
24 | return p_img, extra_dict
25 | class ValPre(object):
26 | def __call__(self, img, hha):
27 | extra_dict = {'hha_img': hha}
28 | return img, extra_dict
29 |
30 |
31 | def get_train_loader(engine, dataset, s3client=None):
32 | data_setting = {'img_root': config.img_root_folder,
33 | 'gt_root': config.gt_root_folder,
34 | 'hha_root':config.hha_root_folder,
35 | 'mapping_root': config.mapping_root_folder,
36 | 'train_source': config.train_source,
37 | 'eval_source': config.eval_source}
38 | train_preprocess = TrainPre(config.image_mean, config.image_std)
39 |
40 | train_dataset = dataset(data_setting, "train", train_preprocess,
41 | config.batch_size * config.niters_per_epoch, s3client=s3client)
42 |
43 | train_sampler = None
44 | is_shuffle = True
45 | batch_size = config.batch_size
46 |
47 | if engine.distributed:
48 | train_sampler = torch.utils.data.distributed.DistributedSampler(
49 | train_dataset)
50 | batch_size = config.batch_size // engine.world_size
51 | is_shuffle = False
52 |
53 | train_loader = data.DataLoader(train_dataset,
54 | batch_size=batch_size,
55 | num_workers=config.num_workers,
56 | drop_last=True,
57 | shuffle=is_shuffle,
58 | pin_memory=True,
59 | sampler=train_sampler)
60 |
61 | return train_loader, train_sampler
62 |
--------------------------------------------------------------------------------
/model/sketch.nyu/run.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | export NGPUS=2
3 | export CUDA_VISIBLE_DEVICES=0,1
4 | python -m torch.distributed.launch --nproc_per_node=$NGPUS train.py -p 10097
5 | python eval.py -e 200-250 -d 0-1 --save_path results
--------------------------------------------------------------------------------