├── .gitignore ├── LICENSE ├── README.md ├── ReadmePic └── arch.png ├── furnace ├── __init__.py ├── apex │ ├── LICENSE │ ├── README.md │ ├── apex.egg-info │ │ ├── PKG-INFO │ │ ├── SOURCES.txt │ │ ├── dependency_links.txt │ │ └── top_level.txt │ ├── apex │ │ ├── RNN │ │ │ ├── README.md │ │ │ ├── RNNBackend.py │ │ │ ├── __init__.py │ │ │ ├── cells.py │ │ │ └── models.py │ │ ├── __init__.py │ │ ├── amp │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── __version__.py │ │ │ ├── amp.py │ │ │ ├── compat.py │ │ │ ├── handle.py │ │ │ ├── lists │ │ │ │ ├── __init__.py │ │ │ │ ├── functional_overrides.py │ │ │ │ ├── tensor_overrides.py │ │ │ │ └── torch_overrides.py │ │ │ ├── opt.py │ │ │ ├── rnn_compat.py │ │ │ ├── scaler.py │ │ │ ├── utils.py │ │ │ └── wrap.py │ │ ├── fp16_utils │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── fp16_optimizer.py │ │ │ ├── fp16util.py │ │ │ └── loss_scaler.py │ │ ├── normalization │ │ │ ├── __init__.py │ │ │ ├── csrc │ │ │ │ ├── layer_norm_cuda.cpp │ │ │ │ └── layer_norm_cuda_kernel.cu │ │ │ └── fused_layer_norm.py │ │ ├── optimizers │ │ │ ├── __init__.py │ │ │ ├── csrc │ │ │ │ ├── fused_adam_cuda.cpp │ │ │ │ └── fused_adam_cuda_kernel.cu │ │ │ ├── fp16_optimizer.py │ │ │ └── fused_adam.py │ │ ├── parallel │ │ │ ├── LARC.py │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── distributed.py │ │ │ ├── multiproc.py │ │ │ ├── optimized_sync_batchnorm.py │ │ │ ├── optimized_sync_batchnorm_kernel.py │ │ │ ├── sync_batchnorm.py │ │ │ └── sync_batchnorm_kernel.py │ │ └── reparameterization │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── reparameterization.py │ │ │ └── weight_norm.py │ ├── csrc │ │ ├── flatten_unflatten.cpp │ │ ├── scale_check_overflow.cpp │ │ ├── scale_check_overflow_kernel.cu │ │ ├── syncbn.cpp │ │ └── welford.cu │ ├── dist │ │ └── apex-0.1-py3.6-linux-x86_64.egg │ ├── docs │ │ ├── Makefile │ │ └── source │ │ │ ├── _static │ │ │ └── css │ │ │ │ └── pytorch_theme.css │ │ │ ├── _templates │ │ │ └── layout.html │ │ │ ├── amp.rst │ │ │ ├── conf.py │ │ │ ├── fp16_utils.rst │ │ │ ├── index.rst │ │ │ ├── layernorm.rst │ │ │ ├── optimizers.rst │ │ │ └── parallel.rst │ ├── examples │ │ ├── FP16_Optimizer_simple │ │ │ ├── README.md │ │ │ ├── closure.py │ │ │ ├── distributed_apex │ │ │ │ ├── README.md │ │ │ │ ├── distributed_data_parallel.py │ │ │ │ └── run.sh │ │ │ ├── distributed_apex_legacy_launcher │ │ │ │ ├── README.md │ │ │ │ ├── distributed_data_parallel.py │ │ │ │ └── run.sh │ │ │ ├── distributed_pytorch │ │ │ │ ├── README.md │ │ │ │ ├── distributed_data_parallel.py │ │ │ │ └── run.sh │ │ │ ├── minimal.py │ │ │ └── save_load.py │ │ ├── README.md │ │ ├── distributed │ │ │ ├── README.md │ │ │ └── main.py │ │ ├── docker │ │ │ ├── Dockerfile │ │ │ └── README.md │ │ ├── imagenet │ │ │ ├── README.md │ │ │ ├── main.py │ │ │ ├── main_amp.py │ │ │ ├── main_fp16_optimizer.py │ │ │ └── main_reducer.py │ │ └── word_language_model │ │ │ ├── README.md │ │ │ ├── data.py │ │ │ ├── data │ │ │ └── wikitext-2 │ │ │ │ └── README │ │ │ ├── generate.py │ │ │ ├── main.py │ │ │ ├── main_fp16_optimizer.py │ │ │ └── model.py │ ├── setup.py │ └── tests │ │ ├── RNN │ │ └── RNN_tests.py │ │ ├── distributed │ │ ├── ddp_race_condition_test.py │ │ └── run_race_test.sh │ │ ├── run_amp │ │ ├── __init__.py │ │ ├── test_basic_casts.py │ │ ├── test_cache.py │ │ ├── test_promotion.py │ │ ├── test_rnn.py │ │ ├── test_scale.py │ │ └── utils.py │ │ ├── run_fp16_optimizer │ │ ├── __init__.py │ │ └── test_fp16_optimizer.py │ │ ├── run_fp16util │ │ ├── __init__.py │ │ └── test_fp16util.py │ │ 
├── run_mixed_adam │ │ ├── __init__.py │ │ ├── test_fp16_optimizer.py │ │ └── test_mixed_adam.py │ │ ├── run_test.py │ │ └── synced_batchnorm │ │ ├── single_gpu_unit_test.py │ │ ├── test_groups.py │ │ ├── two_gpu_unit_test.py │ │ └── unit_test.sh ├── base_model │ ├── README.md │ ├── __init__.py │ ├── resnet.py │ └── xception.py ├── datasets │ ├── BaseDataset.py │ └── __init__.py ├── engine │ ├── __init__.py │ ├── engine.py │ ├── evaluator.py │ ├── logger.py │ ├── lr_policy.py │ └── version.py ├── seg_opr │ ├── __init__.py │ ├── loss_opr.py │ ├── metric.py │ ├── parallel │ │ └── parallel_apply.py │ ├── seg_oprs.py │ ├── sgd.py │ └── sync_bn │ │ ├── __init__.py │ │ ├── comm.py │ │ ├── functions.py │ │ ├── parallel.py │ │ ├── parallel_apply.py │ │ ├── src │ │ ├── __init__.py │ │ ├── cpu │ │ │ ├── .ninja_deps │ │ │ ├── .ninja_log │ │ │ ├── __init__.py │ │ │ ├── build.ninja │ │ │ ├── dist │ │ │ │ └── syncbn_cpu-0.0.0-py3.6-linux-x86_64.egg │ │ │ ├── operator.cpp │ │ │ ├── operator.h │ │ │ ├── operator.o │ │ │ ├── setup.py │ │ │ ├── syncbn_cpu.cpp │ │ │ ├── syncbn_cpu.egg-info │ │ │ │ └── PKG-INFO │ │ │ ├── syncbn_cpu.o │ │ │ └── syncbn_cpu.so │ │ └── gpu │ │ │ ├── .ninja_deps │ │ │ ├── .ninja_log │ │ │ ├── __init__.py │ │ │ ├── build.ninja │ │ │ ├── common.h │ │ │ ├── device_tensor.h │ │ │ ├── dist │ │ │ └── syncbn_gpu-0.0.0-py3.6-linux-x86_64.egg │ │ │ ├── operator.cpp │ │ │ ├── operator.h │ │ │ ├── operator.o │ │ │ ├── setup.py │ │ │ ├── syncbn_gpu.egg-info │ │ │ └── PKG-INFO │ │ │ ├── syncbn_gpu.so │ │ │ ├── syncbn_kernel.cu │ │ │ └── syncbn_kernel.cuda.o │ │ └── syncbn.py └── utils │ ├── __init__.py │ ├── img_utils.py │ ├── init_func.py │ ├── pyt_utils.py │ └── visualize.py ├── install.md ├── model └── sketch.nyu │ ├── config.py │ ├── dataloader.py │ ├── eval.py │ ├── network.py │ ├── nyu.py │ ├── resnet.py │ ├── run.sh │ └── train.py └── ssc.yaml /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | log/ 3 | *.npz 4 | *.npy 5 | *.png 6 | *.jpg 7 | *.log 8 | *.pth 9 | __pycache__ 10 | 11 | 12 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Xiaokang Chen 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # TorchSSC 2 | ![license](https://img.shields.io/badge/license-MIT-green) ![PyTorch-1.0.0](https://img.shields.io/badge/PyTorch-1.0.0-blue) 3 | 4 | Implementations of some state-of-the-art methods for the Semantic Scene Completion (SSC) task in PyTorch. 5 | 6 | 7 | 8 | ## Highlights: 9 | 10 | - **Distributed training** 11 | - **Easy-to-modify benchmark code** 12 | - **High performance** 13 | 14 | 15 | 16 | 17 | ## News 18 | 19 | - 2020/07/28 20 | 21 | Code release for the paper **3D Sketch-aware Semantic Scene Completion via Semi-supervised Structure Prior**, *CVPR 2020*. [[arXiv]](https://arxiv.org/abs/2003.14052), [[Supplementary Material and Demo]](https://charlesCXK.github.io) 22 | 23 | 24 | 25 | 26 | ## Performance 27 | 28 | #### NYU 29 | 30 | | Method | Resolution | Trained on | SC IoU | SSC mIoU | 31 | | ------------------------- | ------------ | ---------- | -------- | -------- | 32 | | SSCNet | (240, 60) | NYU | 55.1 | 24.7 | 33 | | VVNetR-120 | (120, 60) | NYU+SUNCG | 61.1 | 32.9 | 34 | | DDRNet | (240, 60) | NYU | 61.0 | 30.4 | 35 | | ForkNet | (80, 80) | NYU | 63.4 | 37.1 | 36 | | CCPNet | (240, 240) | NYU | 63.5 | 38.5 | 37 | | **SketchAwareSSC (Ours)** | **(60, 60)** | **NYU** | **71.3** | **41.1** | 38 | 39 | 40 | 41 | ## Data Preparation && Environment Installation 42 | 43 | #### Pretrained ResNet-50 44 | 45 | Please download the pretrained ResNet-50 and then put it into `./DATA/pytorch-weight`. 46 | 47 | | Source | Link | 48 | | :----------: | :--------------------------------------: | 49 | | BaiDu Cloud | Link: https://pan.baidu.com/s/1wS1TozvS3cBdutsXRWUmUw Key: 4g9u | 50 | | Google Drive | https://drive.google.com/drive/folders/121yZXBZ8wV77WRXRur86YBA4ifJEhsJQ?usp=sharing | 51 | 52 | #### NYU Depth V2 53 | 54 | Please download the NYU dataset and then put it into `./DATA/NYU`. 55 | 56 | | Source | Link | 57 | | :----------: | :--------------------------------------: | 58 | | BaiDu Cloud | Link: https://pan.baidu.com/s/1GfWqAbsfMp3NOjFcEnL54A Key: v5ta | 59 | | Google Drive | https://drive.google.com/drive/folders/121yZXBZ8wV77WRXRur86YBA4ifJEhsJQ?usp=sharing | 60 | 61 | #### Environment Installation 62 | 63 | Please refer to [this documentation](./install.md). 64 | 65 | 66 | 67 | ## 3D Sketch-aware Semantic Scene Completion via Semi-supervised Structure Prior 68 | 69 | 70 | 71 | #### Training and Inference 72 | 73 | #### Training 74 | 75 | Training on NYU Depth V2: 76 | 77 | ```shell 78 | $ cd ./model/sketch.nyu 79 | $ export NGPUS=2 80 | $ python -m torch.distributed.launch --nproc_per_node=$NGPUS train.py -p 10097 81 | ``` 82 | 83 | - `-p` specifies the port used for distributed training. If you run more than one experiment on the same machine, set a different port for each of them. 84 | - The TensorBoard logs are saved in the `sketch.nyu/log/tb/` directory. 85 | 86 | #### Inference 87 | 88 | Inference on NYU Depth V2: 89 | 90 | ```shell 91 | $ cd ./model/sketch.nyu 92 | $ python eval.py -e 200-250 -d 0-1 --save_path results 93 | ``` 94 | 95 | - Here, 200-250 means we evaluate the checkpoints whose IDs are in [200, 250], such as epoch-200.pth, epoch-249.pth, etc. 96 | - The predictions will be saved in `results/` and `results_sketch/`: the former stores the SSC predictions and the latter stores the sketch predictions. Performance will be written to `log/*.log`. 
You can expect `0.411@SSC mIoU` and `0.713@SC IoU`. 97 | 98 | 99 | 100 | 101 | ## Citation 102 | 103 | If you find this work useful in your research, please consider citing: 104 | 105 | ``` 106 | @InProceedings{Chen_2020_SketchAwareSSC, 107 | author = {Chen, Xiaokang and Lin, Kwan-Yee and Qian, Chen and Zeng, Gang and Li, Hongsheng}, 108 | title = {3D Sketch-aware Semantic Scene Completion via Semi-supervised Structure Prior}, 109 | booktitle = {The IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, 110 | month = {June}, 111 | year = {2020} 112 | } 113 | ``` 114 | 115 | 116 | 117 | ## Acknowledgement 118 | 119 | Thanks to [TorchSeg](https://github.com/ycszen/TorchSeg) for their excellent project! 120 | 121 | 122 | 123 | ## TODO 124 | 125 | - [ ] Code on more datasets (NYUCAD/SUNCG). 126 | - [ ] More SSC models. 127 | 128 | 129 | -------------------------------------------------------------------------------- /ReadmePic/arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesCXK/TorchSSC/2fd21ad25af92cd9f9ad28de3c4bc897c0ae8b43/ReadmePic/arch.png -------------------------------------------------------------------------------- /furnace/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesCXK/TorchSSC/2fd21ad25af92cd9f9ad28de3c4bc897c0ae8b43/furnace/__init__.py -------------------------------------------------------------------------------- /furnace/apex/LICENSE: -------------------------------------------------------------------------------- 1 | All rights reserved. 2 | 3 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 4 | 5 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 6 | 7 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 8 | 9 | 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 10 | 11 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-------------------------------------------------------------------------------- /furnace/apex/apex.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 1.0 2 | Name: apex 3 | Version: 0.1 4 | Summary: PyTorch Extensions written by NVIDIA 5 | Home-page: UNKNOWN 6 | Author: UNKNOWN 7 | Author-email: UNKNOWN 8 | License: UNKNOWN 9 | Description: UNKNOWN 10 | Platform: UNKNOWN 11 | -------------------------------------------------------------------------------- /furnace/apex/apex.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | README.md 2 | setup.py 3 | apex/__init__.py 4 | apex.egg-info/PKG-INFO 5 | apex.egg-info/SOURCES.txt 6 | apex.egg-info/dependency_links.txt 7 | apex.egg-info/top_level.txt 8 | apex/RNN/RNNBackend.py 9 | apex/RNN/__init__.py 10 | apex/RNN/cells.py 11 | apex/RNN/models.py 12 | apex/amp/__init__.py 13 | apex/amp/__version__.py 14 | apex/amp/amp.py 15 | apex/amp/compat.py 16 | apex/amp/handle.py 17 | apex/amp/opt.py 18 | apex/amp/rnn_compat.py 19 | apex/amp/scaler.py 20 | apex/amp/utils.py 21 | apex/amp/wrap.py 22 | apex/amp/lists/__init__.py 23 | apex/amp/lists/functional_overrides.py 24 | apex/amp/lists/tensor_overrides.py 25 | apex/amp/lists/torch_overrides.py 26 | apex/fp16_utils/__init__.py 27 | apex/fp16_utils/fp16_optimizer.py 28 | apex/fp16_utils/fp16util.py 29 | apex/fp16_utils/loss_scaler.py 30 | apex/normalization/__init__.py 31 | apex/normalization/fused_layer_norm.py 32 | apex/normalization/csrc/layer_norm_cuda.cpp 33 | apex/normalization/csrc/layer_norm_cuda_kernel.cu 34 | apex/optimizers/__init__.py 35 | apex/optimizers/fp16_optimizer.py 36 | apex/optimizers/fused_adam.py 37 | apex/optimizers/csrc/fused_adam_cuda.cpp 38 | apex/optimizers/csrc/fused_adam_cuda_kernel.cu 39 | apex/parallel/LARC.py 40 | apex/parallel/__init__.py 41 | apex/parallel/distributed.py 42 | apex/parallel/multiproc.py 43 | apex/parallel/optimized_sync_batchnorm.py 44 | apex/parallel/optimized_sync_batchnorm_kernel.py 45 | apex/parallel/sync_batchnorm.py 46 | apex/parallel/sync_batchnorm_kernel.py 47 | apex/reparameterization/__init__.py 48 | apex/reparameterization/reparameterization.py 49 | apex/reparameterization/weight_norm.py 50 | csrc/flatten_unflatten.cpp 51 | csrc/scale_check_overflow.cpp 52 | csrc/scale_check_overflow_kernel.cu 53 | csrc/syncbn.cpp 54 | csrc/welford.cu -------------------------------------------------------------------------------- /furnace/apex/apex.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /furnace/apex/apex.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | amp_C 2 | apex 3 | apex_C 4 | fused_adam_cuda 5 | fused_layer_norm_cuda 6 | syncbn 7 | -------------------------------------------------------------------------------- /furnace/apex/apex/RNN/README.md: -------------------------------------------------------------------------------- 1 | Under construction... 
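This README is still a stub; for orientation, the factory functions exported from `apex.RNN` can be constructed roughly as in the sketch below. This is only a sketch based on the signatures in `apex/RNN/models.py` further down in this repo; the sizes are arbitrary placeholders and the forward/hidden-state handling is not shown (see `RNNBackend.py` for the runtime behaviour).

```
from apex.RNN import LSTM, mLSTM

# signatures from apex/RNN/models.py:
#   LSTM(input_size, hidden_size, num_layers, bias=True, batch_first=False,
#        dropout=0, bidirectional=False, output_size=None)
rnn = LSTM(input_size=128, hidden_size=256, num_layers=2, dropout=0.2)

# mLSTM additionally learns multiplicative weights (see cells.py); passing
# output_size projects the hidden state from hidden_size down to output_size
mrnn = mLSTM(input_size=128, hidden_size=256, num_layers=1, output_size=64)
```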
2 | -------------------------------------------------------------------------------- /furnace/apex/apex/RNN/__init__.py: -------------------------------------------------------------------------------- 1 | from .models import LSTM, GRU, ReLU, Tanh, mLSTM 2 | 3 | __all__ = ['models'] 4 | -------------------------------------------------------------------------------- /furnace/apex/apex/RNN/cells.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from .RNNBackend import RNNCell 6 | 7 | from torch.nn._functions.thnn import rnnFusedPointwise as fusedBackend 8 | 9 | import math 10 | 11 | 12 | class mLSTMRNNCell(RNNCell): 13 | """ 14 | mLSTMRNNCell 15 | """ 16 | 17 | def __init__(self, input_size, hidden_size, bias = False, output_size = None): 18 | gate_multiplier = 4 19 | super(mLSTMRNNCell, self).__init__(gate_multiplier, input_size, hidden_size, mLSTMCell, n_hidden_states = 2, bias = bias, output_size = output_size) 20 | 21 | self.w_mih = nn.Parameter(torch.Tensor(self.output_size, self.input_size)) 22 | self.w_mhh = nn.Parameter(torch.Tensor(self.output_size, self.output_size)) 23 | 24 | self.reset_parameters() 25 | 26 | def forward(self, input): 27 | """ 28 | mLSTMRNNCell.forward() 29 | """ 30 | #if not inited or bsz has changed this will create hidden states 31 | self.init_hidden(input.size()[0]) 32 | 33 | hidden_state = self.hidden[0] if self.n_hidden_states == 1 else self.hidden 34 | 35 | self.hidden = list( 36 | self.cell(input, hidden_state, self.w_ih, self.w_hh, self.w_mih, self.w_mhh, 37 | b_ih=self.b_ih, b_hh=self.b_hh) 38 | ) 39 | 40 | if self.output_size != self.hidden_size: 41 | self.hidden[0] = F.linear(self.hidden[0], self.w_ho) 42 | return tuple(self.hidden) 43 | 44 | 45 | def new_like(self, new_input_size=None): 46 | if new_input_size is None: 47 | new_input_size = self.input_size 48 | 49 | return type(self)( 50 | new_input_size, 51 | self.hidden_size, 52 | self.bias, 53 | self.output_size) 54 | 55 | def mLSTMCell(input, hidden, w_ih, w_hh, w_mih, w_mhh, b_ih=None, b_hh=None): 56 | """ 57 | mLSTMCell 58 | """ 59 | 60 | if input.is_cuda: 61 | igates = F.linear(input, w_ih) 62 | m = F.linear(input, w_mih) * F.linear(hidden[0], w_mhh) 63 | hgates = F.linear(m, w_hh) 64 | 65 | state = fusedBackend.LSTMFused.apply 66 | return state(igates, hgates, hidden[1], b_ih, b_hh) 67 | 68 | hx, cx = hidden 69 | 70 | m = F.linear(input, w_mih) * F.linear(hidden[0], w_mhh) 71 | gates = F.linear(input, w_ih, b_ih) + F.linear(m, w_hh, b_hh) 72 | 73 | ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1) 74 | 75 | ingate = F.sigmoid(ingate) 76 | forgetgate = F.sigmoid(forgetgate) 77 | cellgate = F.tanh(cellgate) 78 | outgate = F.sigmoid(outgate) 79 | 80 | cy = (forgetgate * cx) + (ingate * cellgate) 81 | hy = outgate * F.tanh(cy) 82 | 83 | return hy, cy 84 | 85 | -------------------------------------------------------------------------------- /furnace/apex/apex/RNN/models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from torch.nn._functions.rnn import LSTMCell, RNNReLUCell, RNNTanhCell, GRUCell 4 | 5 | from .RNNBackend import bidirectionalRNN, stackedRNN, RNNCell 6 | from .cells import mLSTMRNNCell, mLSTMCell 7 | 8 | def toRNNBackend(inputRNN, num_layers, bidirectional=False, dropout = 0): 9 | """ 10 | :class:`toRNNBackend` 11 | """ 12 | 13 | if bidirectional: 14 | return bidirectionalRNN(inputRNN, 
num_layers, dropout = dropout) 15 | else: 16 | return stackedRNN(inputRNN, num_layers, dropout = dropout) 17 | 18 | 19 | def LSTM(input_size, hidden_size, num_layers, bias=True, batch_first=False, dropout=0, bidirectional=False, output_size = None): 20 | """ 21 | :class:`LSTM` 22 | """ 23 | inputRNN = RNNCell(4, input_size, hidden_size, LSTMCell, 2, bias, output_size) 24 | return toRNNBackend(inputRNN, num_layers, bidirectional, dropout=dropout) 25 | 26 | def GRU(input_size, hidden_size, num_layers, bias=True, batch_first=False, dropout=0, bidirectional=False, output_size = None): 27 | """ 28 | :class:`GRU` 29 | """ 30 | inputRNN = RNNCell(3, input_size, hidden_size, GRUCell, 1, bias, output_size) 31 | return toRNNBackend(inputRNN, num_layers, bidirectional, dropout=dropout) 32 | 33 | def ReLU(input_size, hidden_size, num_layers, bias=True, batch_first=False, dropout=0, bidirectional=False, output_size = None): 34 | """ 35 | :class:`ReLU` 36 | """ 37 | inputRNN = RNNCell(1, input_size, hidden_size, RNNReLUCell, 1, bias, output_size) 38 | return toRNNBackend(inputRNN, num_layers, bidirectional, dropout=dropout) 39 | 40 | def Tanh(input_size, hidden_size, num_layers, bias=True, batch_first=False, dropout=0, bidirectional=False, output_size = None): 41 | """ 42 | :class:`Tanh` 43 | """ 44 | inputRNN = RNNCell(1, input_size, hidden_size, RNNTanhCell, 1, bias, output_size) 45 | return toRNNBackend(inputRNN, num_layers, bidirectional, dropout=dropout) 46 | 47 | def mLSTM(input_size, hidden_size, num_layers, bias=True, batch_first=False, dropout=0, bidirectional=False, output_size = None): 48 | """ 49 | :class:`mLSTM` 50 | """ 51 | inputRNN = mLSTMRNNCell(input_size, hidden_size, bias=bias, output_size=output_size) 52 | return toRNNBackend(inputRNN, num_layers, bidirectional, dropout=dropout) 53 | 54 | 55 | -------------------------------------------------------------------------------- /furnace/apex/apex/__init__.py: -------------------------------------------------------------------------------- 1 | from . import fp16_utils 2 | from . import parallel 3 | from . import amp 4 | 5 | # For optimizers and normalization there is no Python fallback. 6 | # Absence of cuda backend is a hard error. 7 | # I would like the errors from importing fused_adam_cuda or fused_layer_norm_cuda 8 | # to be triggered lazily, because if someone has installed with --cpp_ext and --cuda_ext 9 | # so they expect those backends to be available, but for some reason they actually aren't 10 | # available (for example because they built improperly in a way that isn't revealed until 11 | # load time) the error message is timely and visible. 12 | from . import optimizers 13 | from . 
import normalization 14 | -------------------------------------------------------------------------------- /furnace/apex/apex/amp/__init__.py: -------------------------------------------------------------------------------- 1 | from .amp import init, half_function, float_function, promote_function,\ 2 | register_half_function, register_float_function, register_promote_function 3 | -------------------------------------------------------------------------------- /furnace/apex/apex/amp/__version__.py: -------------------------------------------------------------------------------- 1 | VERSION = (0, 1, 0) 2 | __version__ = '.'.join(map(str, VERSION)) 3 | -------------------------------------------------------------------------------- /furnace/apex/apex/amp/compat.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | # True for post-0.4, when Variables/Tensors merged. 4 | def variable_is_tensor(): 5 | v = torch.autograd.Variable() 6 | return isinstance(v, torch.Tensor) 7 | 8 | def tensor_is_variable(): 9 | x = torch.Tensor() 10 | return type(x) == torch.autograd.Variable 11 | 12 | # False for post-0.4 13 | def tensor_is_float_tensor(): 14 | x = torch.Tensor() 15 | return type(x) == torch.FloatTensor 16 | 17 | # Akin to `torch.is_tensor`, but returns True for Variable 18 | # objects in pre-0.4. 19 | def is_tensor_like(x): 20 | return torch.is_tensor(x) or isinstance(x, torch.autograd.Variable) 21 | 22 | # Wraps `torch.is_floating_point` if present, otherwise checks 23 | # the suffix of `x.type()`. 24 | def is_floating_point(x): 25 | if hasattr(torch, 'is_floating_point'): 26 | return torch.is_floating_point(x) 27 | try: 28 | torch_type = x.type() 29 | return torch_type.endswith('FloatTensor') or \ 30 | torch_type.endswith('HalfTensor') or \ 31 | torch_type.endswith('DoubleTensor') 32 | except AttributeError: 33 | return False 34 | 35 | def scalar_python_val(x): 36 | if hasattr(x, 'item'): 37 | return x.item() 38 | else: 39 | if isinstance(x, torch.autograd.Variable): 40 | return x.data[0] 41 | else: 42 | return x[0] 43 | -------------------------------------------------------------------------------- /furnace/apex/apex/amp/handle.py: -------------------------------------------------------------------------------- 1 | import contextlib 2 | import logging 3 | import warnings 4 | 5 | from . import utils 6 | from .opt import OptimWrapper 7 | from .scaler import LossScaler 8 | 9 | class AmpHandle(object): 10 | def __init__(self, enable_caching=True, verbose=False): 11 | self._enable_caching = enable_caching 12 | self._verbose = verbose 13 | self._cache = dict() 14 | self._default_scaler = LossScaler() 15 | self._is_active = True 16 | self._all_wrappers = [] 17 | 18 | def is_active(self): 19 | return self._is_active 20 | 21 | @contextlib.contextmanager 22 | def _disable_casts(self): 23 | self._is_active = False 24 | yield 25 | self._is_active = True 26 | 27 | def wrap_optimizer(self, optimizer, num_loss=1): 28 | self._default_scaler = None 29 | return OptimWrapper(optimizer, self, num_loss) 30 | 31 | @contextlib.contextmanager 32 | def scale_loss(self, loss, optimizer): 33 | if not self.is_active(): 34 | yield loss 35 | return 36 | 37 | if self._default_scaler is None: 38 | raise RuntimeError( 39 | 'After calling `handle.wrap_optimizer()`, you must explicitly ' + 40 | 'use `optimizer.scale_loss(loss)`.') 41 | 42 | # TODO: this code block is duplicated here and `opt.py`. Unify. 
43 | loss_scale = self._default_scaler.loss_scale() 44 | yield loss * loss_scale 45 | 46 | should_skip = self._default_scaler.unscale_and_update( 47 | optimizer.param_groups, loss_scale) 48 | if should_skip: 49 | optimizer_step = optimizer.step 50 | def skip_step(): 51 | logger = logging.getLogger('apex.amp') 52 | logger.warning('Gradient overflow, skipping update') 53 | optimizer.step = optimizer_step 54 | optimizer.step = skip_step 55 | 56 | self._clear_cache() 57 | 58 | def _clear_cache(self): 59 | self._cache.clear() 60 | 61 | # Experimental support for saving / restoring uncasted versions of functions 62 | def _save_func(self, mod, fn, func): 63 | self._all_wrappers.append((mod, fn, func)) 64 | 65 | def _deactivate(self): 66 | for mod, fn, func in self._all_wrappers: 67 | utils.set_func(mod, fn, func) 68 | self._all_wrappers = [] 69 | 70 | @property 71 | def has_cache(self): 72 | return self._enable_caching 73 | 74 | @property 75 | def cache(self): 76 | return self._cache 77 | 78 | def remove_cache(self, param): 79 | if self.has_cache and param in self.cache: 80 | del self.cache[param] 81 | 82 | @property 83 | def verbose(self): 84 | return self._verbose 85 | 86 | class NoOpHandle(object): 87 | def is_active(self): 88 | return False 89 | 90 | @contextlib.contextmanager 91 | def _disable_casts(self): 92 | yield 93 | 94 | def wrap_optimizer(self, optimizer, num_loss=1): 95 | return OptimWrapper(optimizer, self, num_loss) 96 | 97 | @contextlib.contextmanager 98 | def scale_loss(self, loss, optimizer): 99 | yield loss 100 | 101 | @property 102 | def has_cache(self): 103 | return False 104 | 105 | @property 106 | def verbose(self): 107 | return False 108 | 109 | def _deactivate(self): 110 | pass 111 | -------------------------------------------------------------------------------- /furnace/apex/apex/amp/lists/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesCXK/TorchSSC/2fd21ad25af92cd9f9ad28de3c4bc897c0ae8b43/furnace/apex/apex/amp/lists/__init__.py -------------------------------------------------------------------------------- /furnace/apex/apex/amp/lists/functional_overrides.py: -------------------------------------------------------------------------------- 1 | 2 | # TODO: think about the following two. They do weird things. 3 | # - torch.nn.utils.clip_grad (but it should always be fp32 anyway) 4 | # - torch.nn.utils.weight_norm 5 | 6 | # Notes: 7 | # F.instance_norm uses batch_norm internally. Which correctly handles 8 | # fp16 in/out with fp32 weights. So we shouldn't do anything for 9 | # either of these. 10 | # F.normalize calls `input.norm()` internally, so it's redundant, but 11 | # kept here in case impl. changes. 12 | # F.cosine_similarity is same: calls `x.norm()` internally. 13 | 14 | import torch.nn.functional 15 | 16 | MODULE = torch.nn.functional 17 | 18 | FP16_FUNCS = [ 19 | 'conv1d', 20 | 'conv2d', 21 | 'conv3d', 22 | 'conv_transpose1d', 23 | 'conv_transpose2d', 24 | 'conv_transpose3d', 25 | 'conv_tbc', # Undocumented / maybe new? 26 | 'linear', 27 | ] 28 | 29 | FP32_FUNCS = [ 30 | # Pointwise 31 | 'softplus', 32 | 'softmin', 33 | 'log_softmax', 34 | 'softmax', 35 | 36 | # Normalization 37 | 'layer_norm', 38 | 'group_norm', 39 | 'local_response_norm', 40 | 'normalize', 41 | 'cosine_similarity', 42 | 43 | # Loss functions 44 | # TODO: which of these can be fp16? 
45 | 'poisson_nll_loss', 46 | 'cosine_embedding_loss', 47 | 'cross_entropy', 48 | 'hinge_embedding_loss', 49 | 'kl_div', 50 | 'l1_loss', 51 | 'mse_loss', 52 | 'margin_ranking_loss', 53 | 'multilabel_margin_loss', 54 | 'multilabel_soft_margin_loss', 55 | 'multi_margin_loss', 56 | 'nll_loss', 57 | 'binary_cross_entropy_with_logits', 58 | 'smooth_l1_loss', 59 | 'soft_margin_loss', 60 | 'triplet_margin_loss' 61 | ] 62 | 63 | BANNED_FUNCS = [ 64 | ('binary_cross_entropy', 65 | ("\namp does not work out-of-the-box with `F.binary_cross_entropy` or `torch.nn.BCELoss.` " 66 | "It requires that the output of the previous function be already a FloatTensor. \n\n" 67 | "Most models have a Sigmoid right before BCELoss. In that case, you can use\n" 68 | " torch.nn.BCEWithLogitsLoss\nto combine Sigmoid+BCELoss into a single layer " 69 | "that is compatible with amp.\nAnother option is to add\n" 70 | " amp.register_float_function(torch, 'sigmoid')\nbefore calling `amp.init()`.\n" 71 | "If you _really_ know what you are doing, you can disable this warning by passing " 72 | "allow_banned=True to `amp.init()`.")) 73 | ] 74 | -------------------------------------------------------------------------------- /furnace/apex/apex/amp/lists/tensor_overrides.py: -------------------------------------------------------------------------------- 1 | from .. import compat 2 | from . import torch_overrides 3 | 4 | import importlib 5 | 6 | import torch 7 | 8 | if compat.variable_is_tensor() and not compat.tensor_is_variable(): 9 | MODULE = torch.Tensor 10 | else: 11 | MODULE = torch.autograd.Variable 12 | 13 | 14 | FP16_FUNCS = [ 15 | '__matmul__', 16 | ] 17 | 18 | FP32_FUNCS = [ 19 | '__ipow__', 20 | '__pow__', 21 | '__rpow__', 22 | 23 | # Cast to fp32 before transfer to CPU 24 | 'cpu', 25 | ] 26 | 27 | CASTS = [ 28 | '__add__', 29 | '__div__', 30 | '__eq__', 31 | '__ge__', 32 | '__gt__', 33 | '__iadd__', 34 | '__idiv__', 35 | '__imul__', 36 | '__isub__', 37 | '__itruediv__', 38 | '__le__', 39 | '__lt__', 40 | '__mul__', 41 | '__ne__', 42 | '__radd__', 43 | '__rdiv__', 44 | '__rmul__', 45 | '__rsub__', 46 | '__rtruediv__', 47 | '__sub__', 48 | '__truediv__', 49 | ] 50 | 51 | # None of these, but here to make code cleaner. 52 | SEQUENCE_CASTS = [] 53 | 54 | # We need to grab all the methods from torch_overrides and add them to 55 | # the Tensor lists as well, as almost all methods are duplicated 56 | # between `torch` and `torch.Tensor` (and check with `hasattr`, 57 | # because a few random ones aren't defined on Tensor) 58 | _self_mod = importlib.import_module(__name__) 59 | for attrname in ['FP16_FUNCS', 'FP32_FUNCS', 'CASTS', 'SEQUENCE_CASTS']: 60 | lst = getattr(_self_mod, attrname) 61 | for fn in getattr(torch_overrides, attrname): 62 | if hasattr(MODULE, fn): 63 | lst.append(fn) 64 | -------------------------------------------------------------------------------- /furnace/apex/apex/amp/lists/torch_overrides.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | MODULE = torch 4 | 5 | FP16_FUNCS = [ 6 | # Math 7 | # TODO: why are these in top-level torch namespace? 
8 | 'conv1d', 9 | 'conv2d', 10 | 'conv3d', 11 | 'conv_transpose1d', 12 | 'conv_transpose2d', 13 | 'conv_transpose3d', 14 | 'conv_tbc', 15 | 16 | # BLAS 17 | 'addmm', 18 | 'addmv', 19 | 'addr', 20 | 'matmul', 21 | 'mm', 22 | 'mv', 23 | 24 | ] 25 | 26 | # TODO: ban in-place versions of these in fp16 27 | FP32_FUNCS = [ 28 | # Pointwise 29 | 'acos', 30 | 'asin', 31 | 'cosh', 32 | 'erfinv', 33 | 'exp', 34 | 'expm1', 35 | 'log', 36 | 'log10', 37 | 'log2', 38 | 'reciprocal', 39 | 'rsqrt', 40 | 'sinh', 41 | 'tan', 42 | 43 | # Other math 44 | 'pow', 45 | 46 | # Reduction 47 | 'cumprod', 48 | 'cumsum', 49 | 'dist', 50 | 'mean', 51 | 'norm', 52 | 'prod', 53 | 'std', 54 | 'sum', 55 | 'var', 56 | 57 | # Special reduction-like BLAS 58 | 'addbmm', 59 | 'baddbmm', 60 | 'bmm', 61 | 62 | # Misc 63 | 'renorm' 64 | ] 65 | 66 | # Multi-tensor fns that may need type promotion 67 | CASTS = [ 68 | # Multi-tensor math 69 | 'addcdiv', 70 | 'addcmul', 71 | 'atan2', 72 | 'cross', 73 | 74 | # Element-wise _or_ tensor-wise math 75 | 'add', 76 | 'div', 77 | 'mul', 78 | 79 | # Comparison 80 | 'eq', 81 | 'equal', 82 | 'ge', 83 | 'gt', 84 | 'le', 85 | 'lt', 86 | 'ne' 87 | ] 88 | 89 | # Will possibly need to promote *all* elements of `seq` 90 | SEQUENCE_CASTS = [ 91 | 'cat', # torch.cat(seq, dim=0, out=None) 92 | 'stack' # torch.stack(seq, dim=0, out=None) 93 | ] 94 | -------------------------------------------------------------------------------- /furnace/apex/apex/amp/opt.py: -------------------------------------------------------------------------------- 1 | import contextlib 2 | import logging 3 | import warnings 4 | 5 | from .scaler import LossScaler, iter_params 6 | 7 | import numpy as np 8 | 9 | class OptimWrapper(object): 10 | def __init__(self, optimizer, amp_handle, num_loss): 11 | self._optimizer = optimizer 12 | self._amp_handle = amp_handle 13 | self._num_loss = num_loss 14 | self._loss_idx = 0 15 | self._skip_next = [False] * num_loss 16 | self._loss_scaler = [LossScaler() for _ in range(num_loss)] 17 | 18 | @contextlib.contextmanager 19 | def scale_loss(self, loss): 20 | if not self._amp_handle.is_active(): 21 | yield loss 22 | return 23 | 24 | # When there are multiple losses per-optimizer, we need 25 | # to save out current grad accumulation, since we won't be 26 | # able to unscale this particulare loss once the grads are 27 | # all mixed together. 
28 | cached_grads = [] 29 | if self._loss_idx > 0: 30 | for p in iter_params(self._optimizer.param_groups): 31 | if p.grad is not None: 32 | cached_grads.append(p.grad.data.detach().clone()) 33 | else: 34 | cached_grads.append(None) 35 | self._optimizer.zero_grad() 36 | 37 | loss_scale = self._cur_loss_scaler().loss_scale() 38 | yield loss * loss_scale 39 | 40 | self._skip_next[self._loss_idx] = self._cur_loss_scaler().unscale_and_update( 41 | self._optimizer.param_groups, loss_scale) 42 | self._loss_idx += 1 43 | 44 | if len(cached_grads) > 0: 45 | for p, cached_grad in zip(iter_params(self._optimizer.param_groups), 46 | cached_grads): 47 | if cached_grad is not None: 48 | p.grad.data.add_(cached_grad) 49 | cached_grads = [] 50 | 51 | def _cur_loss_scaler(self): 52 | assert 0 <= self._loss_idx < self._num_loss 53 | return self._loss_scaler[self._loss_idx] 54 | 55 | def step(self, closure=None): 56 | if not self._amp_handle.is_active(): 57 | return self._optimizer.step(closure=closure) 58 | 59 | self._loss_idx = 0 60 | 61 | for group in self._optimizer.param_groups: 62 | for p in group['params']: 63 | self._amp_handle.remove_cache(p) 64 | 65 | if closure is not None: 66 | raise NotImplementedError( 67 | 'The `closure` argument is unsupported by the amp ' + 68 | 'optimizer wrapper.') 69 | if any(self._skip_next): 70 | logger = logging.getLogger('apex.amp') 71 | logger.info('Gradient overflow, skipping update') 72 | self._skip_next = [False] * self._num_loss 73 | else: 74 | return self._optimizer.step(closure=closure) 75 | 76 | # Forward any attribute lookups 77 | def __getattr__(self, attr): 78 | return getattr(self._optimizer, attr) 79 | 80 | # Forward all torch.optim.Optimizer methods 81 | def __getstate__(self): 82 | return self._optimizer.__getstate__() 83 | 84 | def __setstate__(self): 85 | return self._optimizer.__setstate__() 86 | 87 | def __repr__(self): 88 | return self._optimizer.__repr__() 89 | 90 | def state_dict(self): 91 | return self._optimizer.state_dict() 92 | 93 | def load_state_dict(self, state_dict): 94 | return self._optimizer.load_state_dict(state_dict) 95 | 96 | def zero_grad(self): 97 | return self._optimizer.zero_grad() 98 | 99 | def add_param_group(self, param_group): 100 | return self._optimizer.add_param_group(param_group) 101 | -------------------------------------------------------------------------------- /furnace/apex/apex/amp/rnn_compat.py: -------------------------------------------------------------------------------- 1 | from . import utils, wrap 2 | 3 | import torch 4 | _VF = torch._C._VariableFunctions 5 | RNN_NAMES = ['rnn_relu', 'rnn_tanh', 'gru', 'lstm'] 6 | 7 | def _gen_VF_wrapper(name): 8 | def wrapper(*args, **kwargs): 9 | return getattr(_VF, name)(*args, **kwargs) 10 | return wrapper 11 | 12 | # Some python magic to generate an object that has the rnn cell functions 13 | # defined on it, all of which call into corresponding _VF version. 
14 | class VariableFunctionsShim(object): 15 | def __init__(self): 16 | for name in RNN_NAMES: 17 | setattr(self, name + '_cell', _gen_VF_wrapper(name + '_cell')) 18 | 19 | def has_old_rnns(): 20 | try: 21 | torch.nn.backends.thnn.backend.LSTMCell 22 | return True 23 | except: 24 | return False 25 | 26 | def whitelist_rnn_cells(handle, verbose): 27 | # Different module + function names in old/new RNN cases 28 | if has_old_rnns(): 29 | fn_names = ['RNNReLUCell', 'RNNTanhCell', 'LSTMCell', 'GRUCell'] 30 | mod = torch.nn.backends.thnn.backend 31 | else: 32 | fn_names = [x + '_cell' for x in RNN_NAMES] 33 | mod = torch.nn.modules.rnn._VF 34 | assert isinstance(mod, VariableFunctionsShim) 35 | 36 | # Insert casts on cell functions 37 | for fn in fn_names: 38 | wrap.cached_cast(mod, fn, utils.maybe_half, handle, 39 | try_caching=True, verbose=verbose) 40 | 41 | if has_old_rnns(): 42 | # Special handling of `backward` for fused gru / lstm: 43 | # The `backward` method calls Tensor.sum() (blacklist) internally, 44 | # and then the resulting grad_input has the wrong type. 45 | # TODO: where else is this a problem? 46 | for rnn_type in ['GRUFused', 'LSTMFused']: 47 | mod = getattr(torch.nn._functions.thnn.rnnFusedPointwise, rnn_type) 48 | wrap.disable_casts(mod, 'backward', handle) 49 | -------------------------------------------------------------------------------- /furnace/apex/apex/amp/scaler.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import logging 3 | 4 | # from apex_C import scale_check_overflow 5 | 6 | def scale_check_overflow_python(d_grads, scale): 7 | # Exception handling for 18.04 compatibility 8 | try: 9 | cpu_sum = float(d_grads.float().sum()) 10 | except RuntimeError as instance: 11 | if "value cannot be converted" not in instance.args[0]: 12 | raise 13 | return True 14 | else: 15 | if cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum: 16 | return True 17 | d_grads.mul_(scale) 18 | return False 19 | 20 | class LossScaler(object): 21 | warned_no_fused_kernel = False 22 | warned_fp16_grad = False 23 | has_fused_kernel = False 24 | 25 | def __init__(self): 26 | self._loss_scale = 2.**16 27 | self._max_loss_scale = 2.**24 28 | self._scale_seq_len = 2000 29 | self._unskipped = 0 30 | self._has_overflow = False 31 | try: 32 | import amp_C 33 | LossScaler.has_fused_kernel = True 34 | LossScaler.scale_check_overflow_cuda = amp_C.scale_check_overflow 35 | self._overflow_buf = torch.cuda.IntTensor([0]) 36 | except ImportError as err: 37 | if not LossScaler.warned_no_fused_kernel: 38 | print("Warning: Amp fused downscale kernel is unavailable, possibly because apex " 39 | "was installed without --cuda_ext. Using Python fallback. 
ImportError was: ", 40 | err) 41 | LossScaler.has_fused_kernel = False 42 | LossScaler.warned_no_fused_kernel = True 43 | 44 | def loss_scale(self): 45 | return self._loss_scale 46 | 47 | def unscale_and_update(self, param_groups, scale): 48 | if LossScaler.has_fused_kernel: 49 | self._overflow_buf.zero_() 50 | self._has_overflow = False 51 | for p in iter_params(param_groups): 52 | if p.grad is not None: 53 | if LossScaler.has_fused_kernel and p.grad.data.type() == "torch.cuda.FloatTensor": 54 | LossScaler.scale_check_overflow_cuda(p.grad.data, 55 | 1./scale, 56 | self._overflow_buf, 57 | p.grad.data) 58 | else: 59 | if (p.grad.data.type() != "torch.cuda.FloatTensor" 60 | and not LossScaler.warned_fp16_grad): 61 | logger = logging.getLogger("apex.amp") 62 | logger.warning("Incoming grads are not fp32 (not master grads). " 63 | "Downscaling non-fp32 grads may indicate an error. " 64 | "When using Amp, you don't need to call .half() on your model.") 65 | LossScaler.warned_fp16_grad = True 66 | self._has_overflow = scale_check_overflow_python(p.grad.data, 67 | 1./scale) 68 | if self._has_overflow: 69 | break 70 | 71 | # If the fused kernel is available, we only need one D2H memcopy and sync. 72 | if LossScaler.has_fused_kernel and not self._has_overflow: 73 | self._has_overflow = self._overflow_buf.item() 74 | 75 | if self._has_overflow: 76 | should_skip = True 77 | self._loss_scale /= 2. 78 | self._unskipped = 0 79 | else: 80 | should_skip = False 81 | self._unskipped += 1 82 | 83 | if self._unskipped == self._scale_seq_len: 84 | self._loss_scale = min(self._max_loss_scale, self._loss_scale * 2.) 85 | self._unskipped = 0 86 | 87 | return should_skip 88 | 89 | def iter_params(param_groups): 90 | for group in param_groups: 91 | for p in group['params']: 92 | yield p 93 | -------------------------------------------------------------------------------- /furnace/apex/apex/fp16_utils/README.md: -------------------------------------------------------------------------------- 1 | fp16_optimizer.py contains `FP16_Optimizer`, a Python class designed to wrap an existing Pytorch optimizer and automatically enable master parameters and loss scaling in a manner transparent to the user. To use `FP16_Optimizer`, only two lines of one's Python model need to change. 2 | 3 | #### [FP16_Optimizer API documentation](https://nvidia.github.io/apex/fp16_utils.html#automatic-management-of-master-params-loss-scaling) 4 | 5 | #### [Simple examples with FP16_Optimizer](https://github.com/NVIDIA/apex/tree/master/examples/FP16_Optimizer_simple) 6 | 7 | #### [Imagenet with FP16_Optimizer](https://github.com/NVIDIA/apex/tree/master/examples/imagenet) 8 | 9 | #### [word_language_model with FP16_Optimizer](https://github.com/NVIDIA/apex/tree/master/examples/word_language_model) 10 | 11 | 12 | fp16_util.py contains a number of utilities to manually manage master parameters and loss scaling, if the user chooses. 13 | 14 | #### [Manual management documentation](https://nvidia.github.io/apex/fp16_utils.html#manual-master-parameter-management) 15 | 16 | The [Imagenet with FP16_Optimizer](https://github.com/NVIDIA/apex/tree/master/examples/imagenet) and [word_language_model with FP16_Optimizer](https://github.com/NVIDIA/apex/tree/master/examples/word_language_model) directories also contain `main.py` files that demonstrate manual management of master parameters and static loss scaling. These examples illustrate what sort of operations `FP16_Optimizer` is performing automatically. 
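
For orientation, the "two lines" mentioned above are (1) wrapping the existing optimizer in `FP16_Optimizer` and (2) calling `optimizer.backward(loss)` instead of `loss.backward()`. Below is a minimal sketch; the toy model, data, and hyperparameters (including `dynamic_loss_scale=True`) are placeholders chosen for illustration, so see the linked examples and API documentation for authoritative usage.

```
import torch
import torch.nn.functional as F
from apex.fp16_utils import FP16_Optimizer, network_to_half

# toy fp16 model and a standard optimizer (illustrative only; requires a GPU)
model = network_to_half(torch.nn.Linear(32, 10).cuda())
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)  # change 1: wrap the optimizer

x = torch.randn(8, 32).cuda().half()
y = torch.randint(0, 10, (8,)).cuda()

optimizer.zero_grad()
loss = F.cross_entropy(model(x).float(), y)
optimizer.backward(loss)                                        # change 2: replaces loss.backward()
optimizer.step()
```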
17 | -------------------------------------------------------------------------------- /furnace/apex/apex/fp16_utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .fp16util import ( 2 | BN_convert_float, 3 | network_to_half, 4 | prep_param_lists, 5 | model_grads_to_master_grads, 6 | master_params_to_model_params, 7 | tofp16, 8 | to_python_float, 9 | clip_grad_norm, 10 | convert_module, 11 | convert_network, 12 | FP16Model, 13 | ) 14 | 15 | from .fp16_optimizer import FP16_Optimizer 16 | from .loss_scaler import LossScaler, DynamicLossScaler 17 | -------------------------------------------------------------------------------- /furnace/apex/apex/normalization/__init__.py: -------------------------------------------------------------------------------- 1 | from .fused_layer_norm import FusedLayerNorm 2 | -------------------------------------------------------------------------------- /furnace/apex/apex/optimizers/__init__.py: -------------------------------------------------------------------------------- 1 | from .fused_adam import FusedAdam 2 | from .fp16_optimizer import FP16_Optimizer 3 | -------------------------------------------------------------------------------- /furnace/apex/apex/optimizers/csrc/fused_adam_cuda.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | // CUDA forward declaration 4 | void fused_adam_cuda(at::Tensor & p, at::Tensor & p_copy, at::Tensor & m, at::Tensor & v, at::Tensor & g, float lr, float beta1, float beta2, float eps, float grad_scale, int step, int mode, int bias_correction, float decay); 5 | 6 | #define CHECK_CUDA(x) AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor") 7 | #define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous") 8 | #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) 9 | 10 | // C++ interface 11 | void adam(at::Tensor & p, at::Tensor & p_copy, at::Tensor & m, at::Tensor & v, at::Tensor & g, float lr, float beta1, float beta2, float eps, float grad_scale, int step, int mode, int bias_correction, float decay) { 12 | CHECK_INPUT(p) 13 | if (p_copy.numel() > 0) CHECK_INPUT(p_copy); 14 | CHECK_INPUT(m); 15 | CHECK_INPUT(v); 16 | CHECK_INPUT(g); 17 | int64_t num_elem = p.numel(); 18 | AT_ASSERTM(m.numel() == num_elem, "number of elements in m and p tensors should be equal"); 19 | AT_ASSERTM(v.numel() == num_elem, "number of elements in v and p tensors should be equal"); 20 | AT_ASSERTM(g.numel() == num_elem, "number of elements in g and p tensors should be equal"); 21 | AT_ASSERTM(p_copy.numel() == num_elem || p_copy.numel() == 0, "number of elements in p_copy and p tensors should be equal, or p_copy should be empty"); 22 | 23 | fused_adam_cuda(p, p_copy, m, v, g, lr, beta1, beta2, eps, grad_scale, step, mode, bias_correction, decay); 24 | } 25 | 26 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 27 | m.def("adam", &adam, "Adam optimized CUDA implementation."); 28 | } 29 | -------------------------------------------------------------------------------- /furnace/apex/apex/parallel/LARC.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.autograd import Variable 4 | from torch.nn.parameter import Parameter 5 | 6 | class LARC(object): 7 | """ 8 | :class:`LARC` is a pytorch implementation of both the scaling and clipping variants of LARC, 9 | in which the ratio between gradient and parameter magnitudes is used to 
calculate an adaptive 10 | local learning rate for each individual parameter. The algorithm is designed to improve 11 | convergence of large batch training. 12 | 13 | See https://arxiv.org/abs/1708.03888 for calculation of the local learning rate. 14 | 15 | In practice it modifies the gradients of parameters as a proxy for modifying the learning rate 16 | of the parameters. This design allows it to be used as a wrapper around any torch.optim Optimizer. 17 | 18 | ``` 19 | model = ... 20 | optim = torch.optim.Adam(model.parameters(), lr=...) 21 | optim = LARC(optim) 22 | ``` 23 | 24 | It can even be used in conjunction with apex.fp16_utils.FP16_optimizer. 25 | 26 | ``` 27 | model = ... 28 | optim = torch.optim.Adam(model.parameters(), lr=...) 29 | optim = LARC(optim) 30 | optim = apex.fp16_utils.FP16_Optimizer(optim) 31 | ``` 32 | 33 | Args: 34 | optimizer: Pytorch optimizer to wrap and modify learning rate for. 35 | trust_coefficient: Trust coefficient for calculating the lr. See https://arxiv.org/abs/1708.03888 36 | clip: Decides between clipping or scaling mode of LARC. If `clip=True` the learning rate is set to `min(optimizer_lr, local_lr)` for each parameter. If `clip=False` the learning rate is set to `local_lr*optimizer_lr`. 37 | eps: epsilon kludge to help with numerical stability while calculating adaptive_lr 38 | """ 39 | 40 | def __init__(self, optimizer, trust_coefficient=0.02, clip=True, eps=1e-8): 41 | self.param_groups = optimizer.param_groups 42 | self.optim = optimizer 43 | self.trust_coefficient = trust_coefficient 44 | self.eps = eps 45 | self.clip = clip 46 | 47 | def __getstate__(self): 48 | return self.optim.__getstate__() 49 | 50 | def __setstate__(self, state): 51 | self.optim.__setstate__(state) 52 | 53 | def __repr__(self): 54 | return self.optim.__repr__() 55 | 56 | def state_dict(self): 57 | return self.optim.state_dict() 58 | 59 | def load_state_dict(self, state_dict): 60 | self.optim.load_state_dict(state_dict) 61 | 62 | def zero_grad(self): 63 | self.optim.zero_grad() 64 | 65 | def add_param_group(self, param_group): 66 | self.optim.add_param_group( param_group) 67 | 68 | def step(self): 69 | with torch.no_grad(): 70 | weight_decays = [] 71 | for group in self.optim.param_groups: 72 | # absorb weight decay control from optimizer 73 | weight_decay = group['weight_decay'] if 'weight_decay' in group else 0 74 | weight_decays.append(weight_decay) 75 | group['weight_decay'] = 0 76 | for p in group['params']: 77 | if p.grad is None: 78 | continue 79 | param_norm = torch.norm(p.data) 80 | grad_norm = torch.norm(p.grad.data) 81 | 82 | if param_norm != 0 and grad_norm != 0: 83 | # calculate adaptive lr + weight decay 84 | adaptive_lr = self.trust_coefficient * (param_norm) / (grad_norm + param_norm * weight_decay + self.eps) 85 | 86 | # clip learning rate for LARC 87 | if self.clip: 88 | # calculation of adaptive_lr so that when multiplied by lr it equals `min(adaptive_lr, lr)` 89 | adaptive_lr = min(adaptive_lr/group['lr'], 1) 90 | 91 | p.grad.data += weight_decay * p.data 92 | p.grad.data *= adaptive_lr 93 | 94 | self.optim.step() 95 | # return weight decay control to optimizer 96 | for i, group in enumerate(self.optim.param_groups): 97 | group['weight_decay'] = weight_decays[i] 98 | -------------------------------------------------------------------------------- /furnace/apex/apex/parallel/README.md: -------------------------------------------------------------------------------- 1 | ## Distributed Data Parallel 2 | 3 | distributed.py contains the source code for 
`apex.parallel.DistributedDataParallel`, a module wrapper that enables multi-process multi-GPU data parallel training optimized for NVIDIA's NCCL communication library. 4 | 5 | `apex.parallel.DistributedDataParallel` achieves high performance by overlapping communication with 6 | computation in the backward pass and bucketing smaller transfers to reduce the total number of 7 | transfers required. 8 | 9 | multiproc.py contains the source code for `apex.parallel.multiproc`, a launch utility that places one process on each of the node's available GPUs. 10 | 11 | #### [API Documentation](https://nvidia.github.io/apex/parallel.html) 12 | 13 | #### [Example/Walkthrough](https://github.com/NVIDIA/apex/tree/master/examples/distributed) 14 | 15 | #### [Imagenet example with Mixed Precision](https://github.com/NVIDIA/apex/tree/master/examples/imagenet) 16 | 17 | #### [Simple example with FP16_Optimizer](https://github.com/NVIDIA/apex/tree/master/examples/FP16_Optimizer_simple/distributed_apex) 18 | 19 | ### Synchronized Batch Normalization 20 | 21 | `apex.parallel.SyncBatchNorm` has a similar API to `torch.nn.BatchNorm*N*d`. 22 | It reduces stats on the first (channel) dimension of the Tensor and accepts 23 | arbitrary spatial dimensions. 24 | 25 | #### Installation 26 | 27 | Apex provides two sync BN implementations: 28 | 29 | 1. The Python-only implementation, which is the default implementation 30 | when installing with `python setup.py install`. 31 | It uses PyTorch primitive operations and the distributed communication package from 32 | `torch.distributed`. 33 | 34 | - _The Python-only implementation requires the input tensor to be of the same data type as 35 | the layer._ 36 | 37 | 2. An implementation with kernels through a CUDA/C++ extension, with 38 | improved performance. We are experimenting with Welford and Kahan summation for the reduction, 39 | hoping to get better accuracy. 40 | To use the kernel implementation, users need to install Apex with the CUDA extension 41 | enabled: `python setup.py install --cuda_ext`. 42 | 43 | - _The custom kernel implementation supports fp16 input with fp32 layers, as cuDNN does. 44 | This is required to run the ImageNet example in fp16._ 45 | 46 | - _Currently the kernel implementation only supports GPU._ 47 | 48 | #### HowTo 49 | 50 | 1. Users can use `apex.parallel.SyncBatchNorm` by building their modules with 51 | the layer explicitly. 52 | 53 | ``` 54 | import apex 55 | input_t = torch.randn(3, 5, 20).cuda() 56 | sbn = apex.parallel.SyncBatchNorm(5).cuda() 57 | output_t = sbn(input_t) 58 | ``` 59 | 60 | 2. Users can also take a constructed `torch.nn.Module` and replace all its `torch.nn.BatchNorm*N*d` modules with `apex.parallel.SyncBatchNorm` through the utility function `apex.parallel.convert_syncbn_model`. 
61 | 62 | ``` 63 | # model is an instance of torch.nn.Module 64 | import apex 65 | sync_bn_model = apex.parallel.convert_syncbn_model(model) 66 | ``` 67 | -------------------------------------------------------------------------------- /furnace/apex/apex/parallel/__init__.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | if hasattr(torch.distributed, 'ReduceOp'): 4 | ReduceOp = torch.distributed.ReduceOp 5 | elif hasattr(torch.distributed, 'reduce_op'): 6 | ReduceOp = torch.distributed.reduce_op 7 | else: 8 | ReduceOp = torch.distributed.deprecated.reduce_op 9 | 10 | from .distributed import DistributedDataParallel, Reducer 11 | # This is tricky because I'd like SyncBatchNorm to be exposed the same way 12 | # for both the cuda-enabled and python-fallback versions, and I don't want 13 | # to suppress the error information. 14 | try: 15 | import syncbn 16 | from .optimized_sync_batchnorm import SyncBatchNorm 17 | except ImportError as err: 18 | from .sync_batchnorm import SyncBatchNorm 19 | SyncBatchNorm.syncbn_import_error = err 20 | 21 | def convert_syncbn_model(module, process_group=None, channel_last=False): 22 | ''' 23 | Recursively traverse module and its children to replace all instances of 24 | ``torch.nn.modules.batchnorm._BatchNorm`` with :class:`apex.parallel.SyncBatchNorm`. 25 | 26 | All ``torch.nn.BatchNorm*N*d`` wrap around 27 | ``torch.nn.modules.batchnorm._BatchNorm``, so this function lets you easily switch 28 | to use sync BN. 29 | 30 | Args: 31 | module (torch.nn.Module): input module 32 | 33 | Example:: 34 | 35 | >>> # model is an instance of torch.nn.Module 36 | >>> import apex 37 | >>> sync_bn_model = apex.parallel.convert_syncbn_model(model) 38 | ''' 39 | mod = module 40 | if isinstance(module, torch.nn.modules.batchnorm._BatchNorm): 41 | mod = SyncBatchNorm(module.num_features, module.eps, module.momentum, module.affine, module.track_running_stats, process_group, channel_last=channel_last) 42 | mod.running_mean = module.running_mean 43 | mod.running_var = module.running_var 44 | if module.affine: 45 | mod.weight.data = module.weight.data.clone().detach() 46 | mod.bias.data = module.bias.data.clone().detach() 47 | for name, child in module.named_children(): 48 | mod.add_module(name, convert_syncbn_model(child, 49 | process_group=process_group, 50 | channel_last=channel_last)) 51 | # TODO(jie) should I delete model explicitly? 52 | del module 53 | return mod 54 | 55 | def create_syncbn_process_group(group_size): 56 | ''' 57 | Creates process groups to be used for syncbn of a give ``group_size`` and returns 58 | process group that current GPU participates in. 59 | 60 | ``group_size`` must divide the total number of GPUs (world_size). 61 | 62 | ``group_size`` of 0 would be considered as =world_size. In this case ``None`` will be returned. 63 | 64 | ``group_size`` of 1 would be equivalent to using non-sync bn, but will still carry the overhead. 
65 | 66 | Args: 67 | group_size (int): number of GPU's to collaborate for sync bn 68 | 69 | Example:: 70 | 71 | >>> # model is an instance of torch.nn.Module 72 | >>> import apex 73 | >>> group = apex.parallel.create_syncbn_process_group(group_size) 74 | ''' 75 | 76 | if group_size==0: 77 | return None 78 | 79 | world_size = torch.distributed.get_world_size() 80 | assert(world_size >= group_size) 81 | assert(world_size % group_size == 0) 82 | 83 | group=None 84 | for group_num in (range(world_size//group_size)): 85 | group_ids = range(group_num*group_size, (group_num+1)*group_size) 86 | cur_group = torch.distributed.new_group(ranks=group_ids) 87 | if (torch.distributed.get_rank()//group_size == group_num): 88 | group = cur_group 89 | #can not drop out and return here, every process must go through creation of all subgroups 90 | 91 | assert(group is not None) 92 | return group 93 | -------------------------------------------------------------------------------- /furnace/apex/apex/parallel/multiproc.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import sys 3 | import subprocess 4 | 5 | def docstring_hack(): 6 | """ 7 | Multiproc file which will launch a set of processes locally for multi-gpu 8 | usage: python -m apex.parallel.multiproc main.py ... 9 | """ 10 | pass 11 | 12 | argslist = list(sys.argv)[1:] 13 | world_size = torch.cuda.device_count() 14 | 15 | if '--world-size' in argslist: 16 | world_size = int(argslist[argslist.index('--world-size')+1]) 17 | else: 18 | argslist.append('--world-size') 19 | argslist.append(str(world_size)) 20 | 21 | workers = [] 22 | 23 | for i in range(world_size): 24 | if '--rank' in argslist: 25 | argslist[argslist.index('--rank')+1] = str(i) 26 | else: 27 | argslist.append('--rank') 28 | argslist.append(str(i)) 29 | stdout = None if i == 0 else open("GPU_"+str(i)+".log", "w") 30 | print(argslist) 31 | p = subprocess.Popen([str(sys.executable)]+argslist, stdout=stdout) 32 | workers.append(p) 33 | 34 | for p in workers: 35 | p.wait() 36 | -------------------------------------------------------------------------------- /furnace/apex/apex/parallel/optimized_sync_batchnorm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn.modules.batchnorm import _BatchNorm 3 | from torch.nn import functional as F 4 | 5 | import syncbn 6 | from .optimized_sync_batchnorm_kernel import SyncBatchnormFunction 7 | 8 | 9 | class SyncBatchNorm(_BatchNorm): 10 | """ 11 | synchronized batch normalization module extented from `torch.nn.BatchNormNd` 12 | with the added stats reduction across multiple processes. 13 | :class:`apex.parallel.SyncBatchNorm` is designed to work with 14 | `DistributedDataParallel`. 15 | 16 | When running in training mode, the layer reduces stats across all processes 17 | to increase the effective batchsize for normalization layer. This is useful 18 | in applications where batch size is small on a given process that would 19 | diminish converged accuracy of the model. The model uses collective 20 | communication package from `torch.distributed`. 21 | 22 | When running in evaluation mode, the layer falls back to 23 | `torch.nn.functional.batch_norm` 24 | 25 | Args: 26 | num_features: :math:`C` from an expected input of size 27 | :math:`(N, C, L)` or :math:`L` from input of size :math:`(N, L)` 28 | eps: a value added to the denominator for numerical stability. 
29 | Default: 1e-5 30 | momentum: the value used for the running_mean and running_var 31 | computation. Can be set to ``None`` for cumulative moving average 32 | (i.e. simple average). Default: 0.1 33 | affine: a boolean value that when set to ``True``, this module has 34 | learnable affine parameters. Default: ``True`` 35 | track_running_stats: a boolean value that when set to ``True``, this 36 | module tracks the running mean and variance, and when set to ``False``, 37 | this module does not track such statistics and always uses batch 38 | statistics in both training and eval modes. Default: ``True`` 39 | process_group: pass in a process group within which the stats of the 40 | mini-batch is being synchronized. ``None`` for using default process 41 | group 42 | channel_last: a boolean value that when set to ``True``, this module 43 | take the last dimension of the input tensor to be the channel 44 | dimension. Default: False 45 | 46 | Examples:: 47 | >>> # channel first tensor 48 | >>> sbn = apex.parallel.SyncBatchNorm(100).cuda() 49 | >>> inp = torch.randn(10, 100, 14, 14).cuda() 50 | >>> out = sbn(inp) 51 | >>> inp = torch.randn(3, 100, 20).cuda() 52 | >>> out = sbn(inp) 53 | >>> # channel last tensor 54 | >>> sbn = apex.parallel.SyncBatchNorm(100, channel_last=True).cuda() 55 | >>> inp = torch.randn(10, 14, 14, 100).cuda() 56 | """ 57 | 58 | def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True, track_running_stats=True, process_group=None, channel_last=False): 59 | super(SyncBatchNorm, self).__init__(num_features, eps=eps, momentum=momentum, affine=affine, track_running_stats=track_running_stats) 60 | self.process_group = process_group 61 | self.channel_last = channel_last 62 | 63 | def _specify_process_group(self, process_group): 64 | self.process_group = process_group 65 | 66 | def _specify_channel_last(self, channel_last): 67 | self.channel_last = channel_last 68 | 69 | def forward(self, input): 70 | if not self.training and self.track_running_stats and not self.channel_last: 71 | # fall back to pytorch implementation for inference 72 | return F.batch_norm(input, self.running_mean, self.running_var, self.weight, self.bias, False, 0.0, self.eps) 73 | else: 74 | exponential_average_factor = 0.0 75 | if self.training and self.track_running_stats: 76 | self.num_batches_tracked += 1 77 | if self.momentum is None: 78 | exponential_average_factor = 1.0 / float(self.num_batches_tracked) 79 | else: 80 | exponential_average_factor = self.momentum 81 | return SyncBatchnormFunction.apply(input, self.weight, self.bias, self.running_mean, self.running_var, self.eps, self.training or not self.track_running_stats, exponential_average_factor, self.process_group, self.channel_last) 82 | -------------------------------------------------------------------------------- /furnace/apex/apex/parallel/optimized_sync_batchnorm_kernel.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd.function import Function 3 | 4 | import syncbn 5 | from apex.parallel import ReduceOp 6 | 7 | class SyncBatchnormFunction(Function): 8 | 9 | @staticmethod 10 | def forward(ctx, input, weight, bias, running_mean, running_variance, eps, track_running_stats = True, momentum = 1.0, process_group = None, channel_last = False): 11 | torch.cuda.nvtx.range_push("sync_BN_fw") 12 | input = input.contiguous() 13 | world_size = 0 14 | 15 | mean = None 16 | var_biased = None 17 | inv_std = None 18 | var = None 19 | out = None 20 | count = None 21 | if 
track_running_stats: 22 | if channel_last: 23 | count = int(input.numel()/input.size(-1)) 24 | mean, var_biased = syncbn.welford_mean_var_c_last(input) 25 | else : 26 | count = int(input.numel()/input.size(1)) 27 | mean, var_biased = syncbn.welford_mean_var(input) 28 | 29 | if torch.distributed.is_initialized(): 30 | if not process_group: 31 | process_group = torch.distributed.group.WORLD 32 | world_size = torch.distributed.get_world_size(process_group) 33 | mean_all = torch.empty(world_size, mean.size(0), dtype=mean.dtype, device=mean.device) 34 | var_all = torch.empty(world_size, var_biased.size(0), dtype=var_biased.dtype, device=var_biased.device) 35 | mean_l = [mean_all.narrow(0, i, 1) for i in range(world_size)] 36 | var_l = [var_all.narrow(0, i, 1) for i in range(world_size)] 37 | torch.distributed.all_gather(mean_l, mean, process_group) 38 | torch.distributed.all_gather(var_l, var_biased, process_group) 39 | mean, var, inv_std = syncbn.welford_parallel(mean_all, var_all, count, eps) 40 | # TODO(Jie): should do fp32 math instead! 41 | else: 42 | inv_std = 1.0 / torch.sqrt(var_biased + eps) 43 | var = var_biased * (count) / (count-1) 44 | 45 | r_m_inc = mean if running_mean.dtype != torch.float16 else mean.half() 46 | r_v_inc = var if running_variance.dtype != torch.float16 else var.half() 47 | running_mean.data = running_mean.data * (1-momentum) + momentum*r_m_inc 48 | running_variance.data = running_variance.data * (1-momentum) + momentum*r_v_inc 49 | else: 50 | mean = running_mean.data 51 | inv_std = 1.0 / torch.sqrt(running_variance.data + eps) 52 | 53 | ctx.save_for_backward(input, weight, mean, inv_std) 54 | ctx.process_group = process_group 55 | ctx.channel_last = channel_last 56 | ctx.world_size = world_size 57 | 58 | if channel_last: 59 | out = syncbn.batchnorm_forward_c_last(input, mean, inv_std, weight, bias) 60 | else: 61 | out = syncbn.batchnorm_forward(input, mean, inv_std, weight, bias) 62 | 63 | torch.cuda.nvtx.range_pop() 64 | return out 65 | 66 | @staticmethod 67 | def backward(ctx, grad_output): 68 | grad_output = grad_output.contiguous() 69 | torch.cuda.nvtx.range_push("sync_BN_bw") 70 | # mini batch mean & var are calculated by forward path. 71 | # mu = 1./N*np.sum(h, axis = 0) 72 | # var = 1./N*np.sum((h-mu)**2, axis = 0) 73 | saved_input, weight, mean, inv_std = ctx.saved_tensors 74 | process_group = ctx.process_group 75 | channel_last = ctx.channel_last 76 | world_size = ctx.world_size 77 | grad_input = grad_weight = grad_bias = None 78 | 79 | # TODO(jie): why do I have to clone here? life time of grad_output?
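# reduce_bn / reduce_bn_c_last return, for this process's local batch, the
# per-channel mean of grad_output (mean_dy), the per-channel mean of
# grad_output * (input - mean) (mean_dy_xmu), and the local contributions to
# grad_weight and grad_bias. mean_dy and mean_dy_xmu are all-reduced and
# averaged over world_size below, so that grad_input matches a batch norm
# computed over the combined global batch.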
80 | if channel_last: 81 | mean_dy, mean_dy_xmu, grad_weight, grad_bias = syncbn.reduce_bn_c_last(grad_output, saved_input, mean, inv_std, weight) 82 | else: 83 | mean_dy, mean_dy_xmu, grad_weight, grad_bias = syncbn.reduce_bn(grad_output, saved_input, mean, inv_std, weight) 84 | 85 | # calculate grad_input 86 | if ctx.needs_input_grad[0]: 87 | 88 | if torch.distributed.is_initialized(): 89 | torch.distributed.all_reduce( 90 | mean_dy, ReduceOp.SUM, process_group) 91 | mean_dy = mean_dy / world_size 92 | torch.distributed.all_reduce( 93 | mean_dy_xmu, ReduceOp.SUM, process_group) 94 | mean_dy_xmu = mean_dy_xmu / world_size 95 | if channel_last: 96 | grad_input = syncbn.batchnorm_backward_c_last(grad_output, saved_input, mean, inv_std, weight, mean_dy, mean_dy_xmu) 97 | else: 98 | grad_input = syncbn.batchnorm_backward(grad_output, saved_input, mean, inv_std, weight, mean_dy, mean_dy_xmu) 99 | 100 | if weight is None or not ctx.needs_input_grad[1]: 101 | grad_weight = None 102 | 103 | if weight is None or not ctx.needs_input_grad[2]: 104 | grad_bias = None 105 | 106 | torch.cuda.nvtx.range_pop() 107 | return grad_input, grad_weight, grad_bias, None, None, None, None, None, None, None 108 | -------------------------------------------------------------------------------- /furnace/apex/apex/parallel/sync_batchnorm_kernel.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd.function import Function 3 | 4 | from apex.parallel import ReduceOp 5 | 6 | 7 | class SyncBatchnormFunction(Function): 8 | 9 | @staticmethod 10 | def forward(ctx, input, weight, bias, running_mean, running_variance, eps, process_group, world_size): 11 | torch.cuda.nvtx.range_push("sync_BN_fw") 12 | # transpose it to channel last to support broadcasting for input with different rank 13 | c_last_input = input.transpose(1, -1).contiguous().clone() 14 | 15 | ctx.save_for_backward(c_last_input, weight, bias, 16 | running_mean, running_variance) 17 | ctx.eps = eps 18 | ctx.process_group = process_group 19 | ctx.world_size = world_size 20 | 21 | c_last_input = (c_last_input - running_mean) / \ 22 | torch.sqrt(running_variance + eps) 23 | 24 | if weight is not None: 25 | c_last_input = c_last_input * weight 26 | if bias is not None: 27 | c_last_input = c_last_input + bias 28 | 29 | torch.cuda.nvtx.range_pop() 30 | return c_last_input.transpose(1, -1).contiguous().clone() 31 | 32 | @staticmethod 33 | def backward(ctx, grad_output): 34 | torch.cuda.nvtx.range_push("sync_BN_bw") 35 | # mini batch mean & var are calculated by forward path. 36 | # mu = 1./N*np.sum(h, axis = 0) 37 | # var = 1./N*np.sum((h-mu)**2, axis = 0) 38 | c_last_input, weight, bias, running_mean, running_variance = ctx.saved_tensors 39 | 40 | eps = ctx.eps 41 | process_group = ctx.process_group 42 | world_size = ctx.world_size 43 | grad_input = grad_weight = grad_bias = None 44 | num_features = running_mean.size()[0] 45 | 46 | # transpose it to channel last to support broadcasting for input with different rank 47 | torch.cuda.nvtx.range_push("carilli field") 48 | c_last_grad = grad_output.transpose(1, -1).contiguous() 49 | # squash non-channel dimension so we can easily calculate mean 50 | c_grad = c_last_grad.view(-1, num_features).contiguous() 51 | torch.cuda.nvtx.range_pop() 52 | 53 | # calculate grad_input 54 | if ctx.needs_input_grad[0]: 55 | # dh = gamma * (var + eps)**(-1. / 2.) 
* (dy - np.mean(dy, axis=0) 56 | # - (h - mu) * (var + eps)**(-1.0) * np.mean(dy * (h - mu), axis=0)) 57 | mean_dy = c_grad.mean(0) 58 | mean_dy_xmu = (c_last_grad * (c_last_input - 59 | running_mean)).view(-1, num_features).mean(0) 60 | if torch.distributed.is_initialized(): 61 | torch.distributed.all_reduce( 62 | mean_dy, ReduceOp.SUM, process_group) 63 | mean_dy = mean_dy / world_size 64 | torch.distributed.all_reduce( 65 | mean_dy_xmu, ReduceOp.SUM, process_group) 66 | mean_dy_xmu = mean_dy_xmu / world_size 67 | c_last_grad_input = (c_last_grad - mean_dy - (c_last_input - running_mean) / ( 68 | running_variance + eps) * mean_dy_xmu) / torch.sqrt(running_variance + eps) 69 | if weight is not None: 70 | c_last_grad_input.mul_(weight) 71 | grad_input = c_last_grad_input.transpose(1, -1).contiguous() 72 | 73 | # calculate grad_weight 74 | grad_weight = None 75 | if weight is not None and ctx.needs_input_grad[1]: 76 | # dgamma = np.sum((h - mu) * (var + eps)**(-1. / 2.) * dy, axis=0) 77 | grad_weight = ((c_last_input - running_mean) / torch.sqrt( 78 | running_variance + eps) * c_last_grad).view(-1, num_features).sum(0) 79 | 80 | # calculate grad_bias 81 | grad_bias = None 82 | if bias is not None and ctx.needs_input_grad[2]: 83 | # dbeta = np.sum(dy, axis=0) 84 | grad_bias = c_grad.sum(0) 85 | 86 | torch.cuda.nvtx.range_pop() 87 | return grad_input, grad_weight, grad_bias, None, None, None, None, None 88 | -------------------------------------------------------------------------------- /furnace/apex/apex/reparameterization/README.md: -------------------------------------------------------------------------------- 1 | Under construction... 2 | -------------------------------------------------------------------------------- /furnace/apex/apex/reparameterization/weight_norm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn.parameter import Parameter 3 | from ..fp16_utils import Fused_Weight_Norm 4 | import time 5 | 6 | from .reparameterization import Reparameterization 7 | 8 | def _norm(p, dim): 9 | """Computes the norm over all dimensions except dim""" 10 | if dim is None: 11 | return p.norm() 12 | elif dim == 0: 13 | output_size = (p.size(0),) + (1,) * (p.dim() - 1) 14 | return p.contiguous().view(p.size(0), -1).norm(dim=1).view(*output_size) 15 | elif dim == p.dim() - 1: 16 | output_size = (1,) * (p.dim() - 1) + (p.size(-1),) 17 | return p.contiguous().view(-1, p.size(-1)).norm(dim=0).view(*output_size) 18 | return _norm(p.transpose(0, dim), 0).transpose(0, dim) 19 | 20 | HALF_TYPES = (torch.cuda.HalfTensor, torch.HalfTensor) 21 | 22 | class WeightNorm(Reparameterization): 23 | """ 24 | Weight normalization is a reparameterization that decouples the magnitude 25 | of a weight tensor from its direction. This replaces the parameter specified 26 | by `name` (e.g. "weight") with two parameters: one specifying the magnitude 27 | (e.g. "weight_g") and one specifying the direction (e.g. "weight_v"). 28 | Weight normalization is implemented via a hook that recomputes the weight 29 | tensor from the magnitude and direction before every :meth:`~Module.forward` 30 | call. 31 | 32 | .. math:: 33 | \mathbf{w} = g \dfrac{\mathbf{v}}{\|\mathbf{v}\|} 34 | 35 | By default, with `dim=0`, the norm is computed independently per output 36 | channel/plane. To compute a norm over the entire weight tensor, use 37 | `dim=None`. 
38 | """ 39 | def compute_weight(self, module=None, name=None): 40 | """ 41 | Computes weight normalized weight value to assign value to module attribute 42 | with name `name`. 43 | Arguments: 44 | module (nn.Module): module with weight we'd like to reparameterize 45 | Returns: 46 | w (Tensor): Tensor object containing value of reparameterized weight 47 | """ 48 | if module is None: 49 | module = self.module 50 | if name is None: 51 | name = self.name 52 | module, name = Reparameterization.get_module_and_name(module, name) 53 | g = getattr(module, name + '_g') 54 | v = getattr(module, name + '_v') 55 | 56 | fused_weight_norm = Fused_Weight_Norm.apply 57 | v = v.contiguous() 58 | w = fused_weight_norm(v, g, self.dim) 59 | 60 | return w 61 | 62 | def reparameterize(self, name, weight, dim): 63 | """ 64 | Creates Parameters v and gto be used for weight normalization 65 | and creates names that for attributes for the module these Parameters 66 | will correspond to. The parameters will be registered according to the names 67 | provided. 68 | Arguments: 69 | module (nn.Module): module with weight we'd like to reparameterize 70 | name (str, optional): name of weight parameter 71 | dim (int, optional): dimension over which to compute parameterization 72 | Returns: 73 | names (list, str): names of Parameters to be used for reparameterization 74 | params (list, Parameter): Parameters to be used for reparameterization 75 | """ 76 | names = [name + '_g', name + '_v'] 77 | params = [Parameter(_norm(weight, dim).data), Parameter(weight.data)] 78 | return names, params 79 | -------------------------------------------------------------------------------- /furnace/apex/csrc/flatten_unflatten.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | // https://github.com/pytorch/pytorch/blob/master/torch/csrc/utils/tensor_flatten.h 4 | 5 | at::Tensor flatten(std::vector tensors) 6 | { 7 | return torch::utils::flatten_dense_tensors(tensors); 8 | } 9 | 10 | std::vector unflatten(at::Tensor flat, std::vector tensors) 11 | { 12 | return torch::utils::unflatten_dense_tensors(flat, tensors); 13 | } 14 | 15 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 16 | m.def("flatten", &flatten, "Flatten dense tensors"); 17 | m.def("unflatten", &unflatten, "Unflatten dense tensors"); 18 | } 19 | -------------------------------------------------------------------------------- /furnace/apex/csrc/scale_check_overflow.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | void scale_check_overflow_cuda(const at::Tensor& grads, 4 | float scale, 5 | const at::Tensor& d_buf, 6 | const at::Tensor& downscaled_grads); 7 | 8 | void scale_check_overflow(at::Tensor grads, 9 | float scale, 10 | at::Tensor overflow_buf, 11 | at::Tensor downscaled_grads) 12 | // const at::optional downscaled_grads) 13 | { 14 | AT_CHECK(grads.type().is_cuda(), "grads must be a CUDA tensor"); 15 | AT_CHECK(grads.is_contiguous(), "grads must be contiguous"); 16 | AT_CHECK(overflow_buf.type().is_cuda(), "overflow_buf must be a CUDA tensor"); 17 | AT_CHECK(overflow_buf.is_contiguous(), "overflow_buf must be contiguous"); 18 | AT_CHECK(downscaled_grads.type().is_cuda(), "downscaled_grads must be a CUDA tensor"); 19 | AT_CHECK(downscaled_grads.is_contiguous(), "downscaled_grads must be contiguous"); 20 | // Make sure we are downscaling the FP32 master grads 21 | AT_CHECK(downscaled_grads.type().scalarType() == at::ScalarType::Float, 22 | "The output grads 
supplied to scale_check_overflow should be fp32 (master grads).") 23 | AT_CHECK(grads.numel() == downscaled_grads.numel(), "Input and output grads must be the same size."); 24 | 25 | scale_check_overflow_cuda(grads, scale, overflow_buf, downscaled_grads); 26 | } 27 | 28 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 29 | m.def("scale_check_overflow", &scale_check_overflow, "Fused overflow check + scale for FP32 tensors"); 30 | } 31 | -------------------------------------------------------------------------------- /furnace/apex/csrc/scale_check_overflow_kernel.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | 9 | #define BLOCK_SIZE 1024 10 | #define NBLOCKS 160 11 | 12 | // It makes sense to lock the output type to fp32 because the downscaled 13 | // grads should be master grads (and in the case of Amp, the params and their 14 | // gradients should always be fp32. 15 | 16 | // This can be optimized with ILP but it's fine for now. 17 | template 18 | __global__ void scale_reduce_overflow(in_t* in, 19 | float* out, 20 | int n, 21 | float scale, 22 | volatile int* overflow_global) 23 | { 24 | __shared__ int overflow; 25 | 26 | int tid = blockIdx.x*blockDim.x + threadIdx.x; 27 | int stride = gridDim.x*blockDim.x; 28 | 29 | // Non-divergent exit condition for the __syncthreads 30 | for(int i = tid; i - threadIdx.x < n; i += stride) 31 | { 32 | if(threadIdx.x == 0) 33 | overflow = *overflow_global; 34 | 35 | __syncthreads(); 36 | 37 | if(overflow == 1) 38 | break; 39 | 40 | if(i < n) 41 | { 42 | float incoming_val = static_cast(in[i]); 43 | if(isfinite(incoming_val)) 44 | out[i] = incoming_val*scale; 45 | else 46 | *overflow_global = 1; // Blindly fire off a write. These will race but that's ok. 47 | // This is NOT guaranteed to be seen immediately by thread 0 on the next iteration. 48 | // I wonder if there's a way we can rig the short-circuiting with only one syncthreads. 49 | // It's possible we can just lean on the cache (no smem or syncs) and still be fast. 50 | } 51 | } 52 | } 53 | 54 | 55 | void scale_check_overflow_cuda 56 | (const at::Tensor& grads, 57 | float scale, 58 | const at::Tensor& overflow_buf, 59 | const at::Tensor& downscaled_grads) 60 | { 61 | using namespace at; 62 | cudaStream_t stream = at::cuda::getCurrentCUDAStream(); 63 | 64 | int n = grads.numel(); 65 | 66 | // Lock the output (downscaled) type to float. 67 | AT_DISPATCH_FLOATING_TYPES_AND_HALF(grads.type(), 68 | "scale_check_overflow_cuda", 69 | [&] 70 | { 71 | // using accscalar_t = acc_type; 72 | scale_reduce_overflow<<>> 73 | (grads.data(), 74 | downscaled_grads.data(), 75 | n, 76 | scale, 77 | overflow_buf.data()); 78 | }); 79 | 80 | AT_CUDA_CHECK(cudaGetLastError()); 81 | } 82 | -------------------------------------------------------------------------------- /furnace/apex/dist/apex-0.1-py3.6-linux-x86_64.egg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesCXK/TorchSSC/2fd21ad25af92cd9f9ad28de3c4bc897c0ae8b43/furnace/apex/dist/apex-0.1-py3.6-linux-x86_64.egg -------------------------------------------------------------------------------- /furnace/apex/docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 
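# For example, `make html SPHINXOPTS="-W"` builds the HTML docs with Sphinx
# warnings treated as errors; any extra options are passed through to sphinx-build.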
5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = NVIDIAAPEX 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | gh-pages: 16 | git checkout gh-pages 17 | rm -rf build 18 | rm -rf source 19 | git checkout master -- . 20 | make html 21 | rm -rf ../_modules ../_sources ../_static 22 | mv -fv build/html/* ../ 23 | rm -rf build 24 | git add -A 25 | git commit -m "Generated gh-pages for `git log master -1 --pretty=short --abbrev-commit`" && git push origin gh-pages ; git checkout master 26 | 27 | .PHONY: help Makefile 28 | 29 | # Catch-all target: route all unknown targets to Sphinx using the new 30 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 31 | %: Makefile 32 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 33 | -------------------------------------------------------------------------------- /furnace/apex/docs/source/_static/css/pytorch_theme.css: -------------------------------------------------------------------------------- 1 | body { 2 | font-family: "Lato","proxima-nova","Helvetica Neue",Arial,sans-serif; 3 | } 4 | 5 | /* Default header fonts are ugly */ 6 | h1, h2, .rst-content .toctree-wrapper p.caption, h3, h4, h5, h6, legend, p.caption { 7 | font-family: "Lato","proxima-nova","Helvetica Neue",Arial,sans-serif; 8 | } 9 | 10 | /* Use white for docs background */ 11 | .wy-side-nav-search { 12 | background-color: #fff; 13 | } 14 | 15 | .wy-nav-content-wrap, .wy-menu li.current > a { 16 | background-color: #fff; 17 | } 18 | 19 | @media screen and (min-width: 1400px) { 20 | .wy-nav-content-wrap { 21 | background-color: rgba(0, 0, 0, 0.0470588); 22 | } 23 | 24 | .wy-nav-content { 25 | background-color: #fff; 26 | } 27 | } 28 | 29 | /* Fixes for mobile */ 30 | .wy-nav-top { 31 | background-color: #fff; 32 | background-image: url('../img/apex.jpg'); 33 | background-repeat: no-repeat; 34 | background-position: center; 35 | padding: 0; 36 | margin: 0.4045em 0.809em; 37 | color: #333; 38 | } 39 | 40 | .wy-nav-top > a { 41 | display: none; 42 | } 43 | 44 | @media screen and (max-width: 768px) { 45 | .wy-side-nav-search>a img.logo { 46 | height: 60px; 47 | } 48 | } 49 | 50 | /* This is needed to ensure that logo above search scales properly */ 51 | .wy-side-nav-search a { 52 | display: block; 53 | } 54 | 55 | /* This ensures that multiple constructors will remain in separate lines. 
*/ 56 | .rst-content dl:not(.docutils) dt { 57 | display: table; 58 | } 59 | 60 | /* Use our red for literals (it's very similar to the original color) */ 61 | .rst-content tt.literal, .rst-content tt.literal, .rst-content code.literal { 62 | color: #F05732; 63 | } 64 | 65 | .rst-content tt.xref, a .rst-content tt, .rst-content tt.xref, 66 | .rst-content code.xref, a .rst-content tt, a .rst-content code { 67 | color: #404040; 68 | } 69 | 70 | /* Change link colors (except for the menu) */ 71 | 72 | a { 73 | color: #F05732; 74 | } 75 | 76 | a:hover { 77 | color: #F05732; 78 | } 79 | 80 | 81 | a:visited { 82 | color: #D44D2C; 83 | } 84 | 85 | .wy-menu a { 86 | color: #b3b3b3; 87 | } 88 | 89 | .wy-menu a:hover { 90 | color: #b3b3b3; 91 | } 92 | 93 | /* Default footer text is quite big */ 94 | footer { 95 | font-size: 80%; 96 | } 97 | 98 | footer .rst-footer-buttons { 99 | font-size: 125%; /* revert footer settings - 1/80% = 125% */ 100 | } 101 | 102 | footer p { 103 | font-size: 100%; 104 | } 105 | 106 | /* For hidden headers that appear in TOC tree */ 107 | /* see http://stackoverflow.com/a/32363545/3343043 */ 108 | .rst-content .hidden-section { 109 | display: none; 110 | } 111 | 112 | nav .hidden-section { 113 | display: inherit; 114 | } 115 | 116 | .wy-side-nav-search>div.version { 117 | color: #000; 118 | } 119 | -------------------------------------------------------------------------------- /furnace/apex/docs/source/_templates/layout.html: -------------------------------------------------------------------------------- 1 | {% extends "!layout.html" %} 2 | {% block sidebartitle %} {{ super() }} 3 | 4 | 32 | {% endblock %} 33 | 34 | {% block footer %} {{ super() }} 35 | 36 | 51 | {% endblock %} 52 | -------------------------------------------------------------------------------- /furnace/apex/docs/source/amp.rst: -------------------------------------------------------------------------------- 1 | .. role:: hidden 2 | :class: hidden-section 3 | 4 | apex.amp 5 | =================================== 6 | 7 | Amp (Automatic Mixed Precision) is a tool designed for ease of use and maximum safety in FP16 training. All potentially unsafe ops are performed in FP32 under the hood, while safe ops are performed using faster, Tensor Core-friendly FP16 math. Amp also automatically implements dynamic loss scaling. 8 | 9 | The intention of Amp is to be the "on-ramp" to easy FP16 training: achieve all the numerical stability of full FP32 training, with most of the performance benefits of full FP16 training. 10 | 11 | Currently, complete API documentation resides on the Github page: https://github.com/NVIDIA/apex/tree/master/apex/amp. 12 | -------------------------------------------------------------------------------- /furnace/apex/docs/source/fp16_utils.rst: -------------------------------------------------------------------------------- 1 | .. role:: hidden 2 | :class: hidden-section 3 | 4 | apex.fp16_utils 5 | =================================== 6 | 7 | This submodule contains utilities designed to streamline the mixed precision training recipe 8 | presented by NVIDIA `on Parallel Forall`_ and in GTC 2018 Sessions 9 | `Training Neural Networks with Mixed Precision: Theory and Practice`_ and 10 | `Training Neural Networks with Mixed Precision: Real Examples`_. 11 | For Pytorch users, Real Examples in particular is recommended. 
12 | 13 | Full runnable Python scripts demonstrating ``apex.fp16_utils`` 14 | can be found on the Github page: 15 | 16 | | `Simple FP16_Optimizer demos`_ 17 | | 18 | | `Distributed Mixed Precision Training with imagenet`_ 19 | | 20 | | `Mixed Precision Training with word_language_model`_ 21 | | 22 | | 23 | 24 | .. _`on Parallel Forall`: 25 | https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/ 26 | .. _`Training Neural Networks with Mixed Precision: Theory and Practice`: 27 | http://on-demand.gputechconf.com/gtc/2018/video/S8923/ 28 | .. _`Training Neural Networks with Mixed Precision: Real Examples`: 29 | http://on-demand.gputechconf.com/gtc/2018/video/S81012/ 30 | .. _`Simple FP16_Optimizer demos`: 31 | https://github.com/NVIDIA/apex/tree/master/examples/FP16_Optimizer_simple 32 | .. _`Distributed Mixed Precision Training with imagenet`: 33 | https://github.com/NVIDIA/apex/tree/master/examples/imagenet 34 | .. _`Mixed Precision Training with word_language_model`: 35 | https://github.com/NVIDIA/apex/tree/master/examples/word_language_model 36 | 37 | .. automodule:: apex.fp16_utils 38 | .. currentmodule:: apex.fp16_utils 39 | 40 | Automatic management of master params + loss scaling 41 | ---------------------------------------------------- 42 | 43 | .. autoclass:: FP16_Optimizer 44 | :members: 45 | 46 | .. autoclass:: LossScaler 47 | :members: 48 | 49 | .. autoclass:: DynamicLossScaler 50 | :members: 51 | 52 | Manual master parameter management 53 | ---------------------------------- 54 | 55 | .. autofunction:: prep_param_lists 56 | 57 | .. autofunction:: master_params_to_model_params 58 | 59 | .. autofunction:: model_grads_to_master_grads 60 | -------------------------------------------------------------------------------- /furnace/apex/docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. PyTorch documentation master file, created by 2 | sphinx-quickstart on Fri Dec 23 13:31:47 2016. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | :github_url: https://github.com/nvidia/apex 7 | 8 | Apex (A PyTorch Extension) 9 | =================================== 10 | 11 | This site contains the API documentation for Apex (https://github.com/nvidia/apex), 12 | a Pytorch extension with NVIDIA-maintained utilities to streamline mixed precision and distributed training. Some of the code here will be included in upstream Pytorch eventually. The intention of Apex is to make up-to-date utilities available to users as quickly as possible. 13 | 14 | Installation requires CUDA 9 or later, PyTorch 0.4 or later, and Python 3. Install by running 15 | 16 | :: 17 | 18 | git clone https://www.github.com/nvidia/apex 19 | cd apex 20 | python setup.py install [--cuda_ext] [--cpp_ext] 21 | 22 | 23 | .. toctree:: 24 | :maxdepth: 1 25 | :caption: AMP: Automatic Mixed Precision 26 | 27 | amp 28 | 29 | .. toctree:: 30 | :maxdepth: 1 31 | :caption: FP16/Mixed Precision Utilities 32 | 33 | fp16_utils 34 | 35 | .. toctree:: 36 | :maxdepth: 1 37 | :caption: Distributed Training 38 | 39 | parallel 40 | 41 | .. toctree:: 42 | :maxdepth: 1 43 | :caption: Fused Optimizers 44 | 45 | optimizers 46 | 47 | .. toctree:: 48 | :maxdepth: 1 49 | :caption: Fused Layer Norm 50 | 51 | layernorm 52 | 53 | .. reparameterization 54 | .. 
RNN 55 | 56 | Indices and tables 57 | ================== 58 | 59 | * :ref:`genindex` 60 | * :ref:`modindex` 61 | -------------------------------------------------------------------------------- /furnace/apex/docs/source/layernorm.rst: -------------------------------------------------------------------------------- 1 | .. role:: hidden 2 | :class: hidden-section 3 | 4 | apex.normalization.fused_layer_norm 5 | =================================== 6 | 7 | .. automodule:: apex.normalization 8 | .. currentmodule:: apex.normalization 9 | 10 | .. FusedAdam 11 | ---------- 12 | 13 | .. autoclass:: FusedLayerNorm 14 | :members: 15 | -------------------------------------------------------------------------------- /furnace/apex/docs/source/optimizers.rst: -------------------------------------------------------------------------------- 1 | .. role:: hidden 2 | :class: hidden-section 3 | 4 | apex.optimizers 5 | =================================== 6 | 7 | .. automodule:: apex.optimizers 8 | .. currentmodule:: apex.optimizers 9 | 10 | .. FusedAdam 11 | ---------- 12 | 13 | .. autoclass:: FusedAdam 14 | :members: 15 | -------------------------------------------------------------------------------- /furnace/apex/docs/source/parallel.rst: -------------------------------------------------------------------------------- 1 | .. role:: hidden 2 | :class: hidden-section 3 | 4 | apex.parallel 5 | =================================== 6 | 7 | .. automodule:: apex.parallel 8 | .. currentmodule:: apex.parallel 9 | 10 | .. DistributedDataParallel 11 | ---------- 12 | 13 | .. autoclass:: DistributedDataParallel 14 | :members: 15 | 16 | .. autoclass:: Reducer 17 | :members: 18 | 19 | .. autoclass:: SyncBatchNorm 20 | :members: 21 | 22 | Utility functions 23 | ---------------------------------- 24 | 25 | .. autofunction:: convert_syncbn_model 26 | -------------------------------------------------------------------------------- /furnace/apex/examples/FP16_Optimizer_simple/README.md: -------------------------------------------------------------------------------- 1 | # Simple examples of FP16_Optimizer functionality 2 | 3 | To use `FP16_Optimizer` on a half-precision model, or a model with a mixture of 4 | half and float parameters, only two lines of your training script need to change: 5 | 1. Construct an `FP16_Optimizer` instance from an existing optimizer. 6 | 2. Replace `loss.backward()` with `optimizer.backward(loss)`. 7 | 8 | #### [Full API Documentation](https://nvidia.github.io/apex/fp16_utils.html#automatic-management-of-master-params-loss-scaling) 9 | 10 | See "Other Options" at the bottom of this page for some cases that require special treatment. 11 | 12 | #### Minimal Working Sample 13 | `minimal.py` shows the basic usage of `FP16_Optimizer` with either static or dynamic loss scaling. Test via `python minimal.py`. 14 | 15 | #### Closures 16 | `FP16_Optimizer` supports closures with the same control flow as ordinary Pytorch optimizers. 17 | `closure.py` shows an example. Test via `python closure.py`. 18 | 19 | See [the API documentation](https://nvidia.github.io/apex/fp16_utils.html#apex.fp16_utils.FP16_Optimizer.step) for more details. 20 | 21 | #### Serialization/Deserialization 22 | `FP16_Optimizer` supports saving and loading with the same control flow as ordinary Pytorch optimizers. 23 | `save_load.py` shows an example. Test via `python save_load.py`. 24 | 25 | See [the API documentation](https://nvidia.github.io/apex/fp16_utils.html#apex.fp16_utils.FP16_Optimizer.load_state_dict) for more details. 
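As a minimal sketch of the save/load flow described above (variable names are illustrative; `save_load.py` contains the complete runnable version):

```python
# `model` is a half-precision torch.nn.Module and `optimizer` is an FP16_Optimizer
# wrapping an ordinary torch.optim optimizer, constructed as described above.
checkpoint = {
    'model': model.state_dict(),
    'optimizer': optimizer.state_dict(),  # also captures the loss scaler state
}
torch.save(checkpoint, 'saved.pth')

# Later, restore both exactly as you would with an ordinary Pytorch optimizer:
checkpoint = torch.load('saved.pth')
model.load_state_dict(checkpoint['model'])
optimizer.load_state_dict(checkpoint['optimizer'])
```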
26 | 27 | #### Distributed 28 | **distributed_apex** shows an example using `FP16_Optimizer` with Apex DistributedDataParallel. 29 | The usage of `FP16_Optimizer` with distributed does not need to change from ordinary single-process 30 | usage. Test via 31 | ```bash 32 | cd distributed_apex 33 | bash run.sh 34 | ``` 35 | 36 | **distributed_pytorch** shows an example using `FP16_Optimizer` with Pytorch DistributedDataParallel. 37 | Again, the usage of `FP16_Optimizer` with distributed does not need to change from ordinary 38 | single-process usage. Test via 39 | ```bash 40 | cd distributed_pytorch 41 | bash run.sh 42 | ``` 43 | 44 | #### Other Options 45 | 46 | Gradient clipping requires that calls to `torch.nn.utils.clip_grad_norm` 47 | be replaced with [fp16_optimizer_instance.clip_master_grads()](https://nvidia.github.io/apex/fp16_utils.html#apex.fp16_utils.FP16_Optimizer.clip_master_grads). The [word_language_model example](https://github.com/NVIDIA/apex/blob/master/examples/word_language_model/main_fp16_optimizer.py) uses this feature. 48 | 49 | Multiple losses will work if you simply replace 50 | ```bash 51 | loss1.backward() 52 | loss2.backward() 53 | ``` 54 | with 55 | ```bash 56 | optimizer.backward(loss1) 57 | optimizer.backward(loss2) 58 | ``` 59 | but `FP16_Optimizer` can be told to handle this more efficiently using the 60 | [update_master_grads()](https://nvidia.github.io/apex/fp16_utils.html#apex.fp16_utils.FP16_Optimizer.update_master_grads) option. 61 | -------------------------------------------------------------------------------- /furnace/apex/examples/FP16_Optimizer_simple/closure.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from apex.fp16_utils import FP16_Optimizer 3 | 4 | torch.backends.cudnn.benchmark = True 5 | 6 | N, D_in, D_out = 64, 1024, 16 7 | 8 | x = torch.randn(N, D_in, device='cuda', dtype=torch.half) 9 | y = torch.randn(N, D_out, device='cuda', dtype=torch.half) 10 | 11 | model = torch.nn.Linear(D_in, D_out).cuda().half() 12 | 13 | optimizer = torch.optim.LBFGS(model.parameters()) 14 | ### Construct FP16_Optimizer 15 | optimizer = FP16_Optimizer(optimizer, static_loss_scale=128.0) 16 | ### 17 | 18 | loss_fn = torch.nn.MSELoss() 19 | 20 | for t in range(5): 21 | def closure(): 22 | optimizer.zero_grad() 23 | y_pred = model(x) 24 | loss = loss_fn(y_pred.float(), y.float()) 25 | ### Change loss.backward() within the closure to: ### 26 | optimizer.backward(loss) 27 | ### 28 | return loss 29 | loss = optimizer.step(closure) 30 | 31 | print("final loss = ", loss) 32 | -------------------------------------------------------------------------------- /furnace/apex/examples/FP16_Optimizer_simple/distributed_apex/README.md: -------------------------------------------------------------------------------- 1 | **distributed_data_parallel.py** and **run.sh** show an example using `FP16_Optimizer` with 2 | `apex.parallel.DistributedDataParallel` and the Pytorch multiprocess launcher script, 3 | [torch.distributed.launch](https://pytorch.org/docs/master/distributed.html#launch-utility). 4 | The usage of `FP16_Optimizer` with distributed does not need to change from ordinary 5 | single-process usage. 
Test via 6 | ```bash 7 | bash run.sh 8 | ``` 9 | -------------------------------------------------------------------------------- /furnace/apex/examples/FP16_Optimizer_simple/distributed_apex/distributed_data_parallel.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import argparse 3 | from apex.parallel import DistributedDataParallel as DDP 4 | from apex.fp16_utils import FP16_Optimizer 5 | 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument("--local_rank", default=0, type=int) 8 | args = parser.parse_args() 9 | 10 | torch.cuda.set_device(args.local_rank) 11 | torch.distributed.init_process_group(backend='nccl', 12 | init_method='env://') 13 | 14 | torch.backends.cudnn.benchmark = True 15 | 16 | N, D_in, D_out = 64, 1024, 16 17 | 18 | x = torch.randn(N, D_in, device='cuda', dtype=torch.half) 19 | y = torch.randn(N, D_out, device='cuda', dtype=torch.half) 20 | 21 | model = torch.nn.Linear(D_in, D_out).cuda().half() 22 | model = DDP(model) 23 | 24 | optimizer = torch.optim.SGD(model.parameters(), lr=1e-3) 25 | ### Construct FP16_Optimizer ### 26 | optimizer = FP16_Optimizer(optimizer) 27 | ### 28 | 29 | loss_fn = torch.nn.MSELoss() 30 | 31 | for t in range(500): 32 | optimizer.zero_grad() 33 | y_pred = model(x) 34 | loss = loss_fn(y_pred.float(), y.float()) 35 | ### Change loss.backward() to: ### 36 | optimizer.backward(loss) 37 | ### 38 | optimizer.step() 39 | 40 | print("final loss = ", loss) 41 | 42 | -------------------------------------------------------------------------------- /furnace/apex/examples/FP16_Optimizer_simple/distributed_apex/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python -m torch.distributed.launch --nproc_per_node=2 distributed_data_parallel.py 3 | -------------------------------------------------------------------------------- /furnace/apex/examples/FP16_Optimizer_simple/distributed_apex_legacy_launcher/README.md: -------------------------------------------------------------------------------- 1 | **distributed_data_parallel.py** and **run.sh** show an example using `FP16_Optimizer` with 2 | `apex.parallel.DistributedDataParallel` in conjuction with the legacy Apex 3 | launcher script, `apex.parallel.multiproc`. See 4 | [FP16_Optimizer_simple/distributed_apex](https://github.com/NVIDIA/apex/tree/torch_launcher/examples/FP16_Optimizer_simple/distributed_apex) for a more up-to-date example that uses the Pytorch launcher 5 | script, `torch.distributed.launch`. 6 | The usage of `FP16_Optimizer` with distributed does not need to change from ordinary 7 | single-process usage. 
Test via 8 | ```bash 9 | bash run.sh 10 | ``` 11 | -------------------------------------------------------------------------------- /furnace/apex/examples/FP16_Optimizer_simple/distributed_apex_legacy_launcher/distributed_data_parallel.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import argparse 3 | from apex.parallel import DistributedDataParallel as DDP 4 | from apex.fp16_utils import FP16_Optimizer 5 | 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str, 8 | help='url used to set up distributed training') 9 | parser.add_argument('--world-size', default=2, type=int, 10 | help='Number of distributed processes.') 11 | parser.add_argument("--rank", type=int, 12 | help='Rank of this process') 13 | 14 | args = parser.parse_args() 15 | 16 | torch.cuda.set_device(args.rank) 17 | torch.distributed.init_process_group(backend='nccl', 18 | init_method=args.dist_url, 19 | world_size=args.world_size, 20 | rank=args.rank) 21 | 22 | torch.backends.cudnn.benchmark = True 23 | 24 | N, D_in, D_out = 64, 1024, 16 25 | 26 | x = torch.randn(N, D_in, device='cuda', dtype=torch.half) 27 | y = torch.randn(N, D_out, device='cuda', dtype=torch.half) 28 | 29 | model = torch.nn.Linear(D_in, D_out).cuda().half() 30 | model = DDP(model) 31 | 32 | optimizer = torch.optim.SGD(model.parameters(), lr=1e-3) 33 | ### Construct FP16_Optimizer ### 34 | optimizer = FP16_Optimizer(optimizer) 35 | ### 36 | 37 | loss_fn = torch.nn.MSELoss() 38 | 39 | for t in range(500): 40 | optimizer.zero_grad() 41 | y_pred = model(x) 42 | loss = loss_fn(y_pred.float(), y.float()) 43 | ### Change loss.backward() to: ### 44 | optimizer.backward(loss) 45 | ### 46 | optimizer.step() 47 | 48 | print("final loss = ", loss) 49 | 50 | -------------------------------------------------------------------------------- /furnace/apex/examples/FP16_Optimizer_simple/distributed_apex_legacy_launcher/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # By default, apex.parallel.multiproc will attempt to use all available GPUs on the system. 3 | # The number of GPUs to use can be limited by setting CUDA_VISIBLE_DEVICES: 4 | export CUDA_VISIBLE_DEVICES=0,1 5 | python -m apex.parallel.multiproc distributed_data_parallel.py 6 | -------------------------------------------------------------------------------- /furnace/apex/examples/FP16_Optimizer_simple/distributed_pytorch/README.md: -------------------------------------------------------------------------------- 1 | **distributed_data_parallel.py** and **run.sh** show an example using `FP16_Optimizer` with 2 | `torch.nn.parallel.DistributedDataParallel` and the Pytorch multiprocess launcher script, 3 | [torch.distributed.launch](https://pytorch.org/docs/master/distributed.html#launch-utility). 4 | The usage of `FP16_Optimizer` with distributed does not need to change from ordinary 5 | single-process usage. 
Test via 6 | ```bash 7 | bash run.sh 8 | ``` 9 | -------------------------------------------------------------------------------- /furnace/apex/examples/FP16_Optimizer_simple/distributed_pytorch/distributed_data_parallel.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import argparse 3 | from apex.fp16_utils import FP16_Optimizer 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--local_rank", default=0, type=int) 7 | args = parser.parse_args() 8 | 9 | torch.cuda.set_device(args.local_rank) 10 | torch.distributed.init_process_group(backend='nccl', 11 | init_method='env://') 12 | 13 | torch.backends.cudnn.benchmark = True 14 | 15 | N, D_in, D_out = 64, 1024, 16 16 | 17 | x = torch.randn(N, D_in, device='cuda', dtype=torch.half) 18 | y = torch.randn(N, D_out, device='cuda', dtype=torch.half) 19 | 20 | model = torch.nn.Linear(D_in, D_out).cuda().half() 21 | model = torch.nn.parallel.DistributedDataParallel(model, 22 | device_ids=[args.local_rank], 23 | output_device=args.local_rank) 24 | 25 | optimizer = torch.optim.SGD(model.parameters(), lr=1e-3) 26 | ### Construct FP16_Optimizer ### 27 | optimizer = FP16_Optimizer(optimizer) 28 | ### 29 | 30 | loss_fn = torch.nn.MSELoss() 31 | 32 | for t in range(500): 33 | optimizer.zero_grad() 34 | y_pred = model(x) 35 | loss = loss_fn(y_pred.float(), y.float()) 36 | ### Change loss.backward() to: ### 37 | optimizer.backward(loss) 38 | ### 39 | optimizer.step() 40 | 41 | print("final loss = ", loss) 42 | 43 | -------------------------------------------------------------------------------- /furnace/apex/examples/FP16_Optimizer_simple/distributed_pytorch/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python -m torch.distributed.launch --nproc_per_node=2 distributed_data_parallel.py 3 | -------------------------------------------------------------------------------- /furnace/apex/examples/FP16_Optimizer_simple/minimal.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from apex.fp16_utils import FP16_Optimizer 3 | 4 | torch.backends.cudnn.benchmark = True 5 | 6 | N, D_in, D_out = 64, 1024, 16 7 | 8 | x = torch.randn(N, D_in, device='cuda', dtype=torch.half) 9 | y = torch.randn(N, D_out, device='cuda', dtype=torch.half) 10 | 11 | model = torch.nn.Linear(D_in, D_out).cuda().half() 12 | 13 | optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, momentum=0.9) 14 | 15 | ### Construct FP16_Optimizer 16 | ### FP16_Optimizer will ingest and remember the original optimizer's param_groups. 17 | ### 18 | ### Construct with static loss scaling... 19 | optimizer = FP16_Optimizer(optimizer, static_loss_scale=128.0) 20 | ### ...or dynamic loss scaling 21 | # optimizer = FP16_Optimizer(optimizer, 22 | # dynamic_loss_scale=True, 23 | # dynamic_loss_args={'scale_factor' : 2}) 24 | ### dynamic_loss_args is optional, for "power users," and unnecessary in most cases. 
25 | 26 | loss_fn = torch.nn.MSELoss() 27 | 28 | for t in range(200): 29 | optimizer.zero_grad() 30 | y_pred = model(x) 31 | loss = loss_fn(y_pred.float(), y.float()) 32 | ### Change loss.backward() to: 33 | optimizer.backward(loss) 34 | ### 35 | optimizer.step() 36 | 37 | print("final loss = ", loss) 38 | -------------------------------------------------------------------------------- /furnace/apex/examples/FP16_Optimizer_simple/save_load.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from apex.fp16_utils import FP16_Optimizer 3 | 4 | torch.backends.cudnn.benchmark = True 5 | 6 | N, D_in, D_out = 64, 1024, 16 7 | 8 | x = torch.randn(N, D_in, device='cuda', dtype=torch.half) 9 | y = torch.randn(N, D_out, device='cuda', dtype=torch.half) 10 | 11 | model = torch.nn.Linear(D_in, D_out).cuda().half() 12 | 13 | optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, momentum=0.9) 14 | ### Construct FP16_Optimizer with static loss scaling... 15 | optimizer = FP16_Optimizer(optimizer, static_loss_scale=128.0) 16 | ### ...or dynamic loss scaling 17 | # optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) 18 | 19 | loss_fn = torch.nn.MSELoss() 20 | 21 | # The checkpointing shown here is identical to what you'd use without FP16_Optimizer. 22 | # 23 | # We save/load checkpoints within local scopes, so the "checkpoint" object 24 | # does not persist. This helps avoid dangling references to intermediate deserialized data, 25 | # and is good practice for Pytorch in general, not just with FP16_Optimizer. 26 | def save_checkpoint(): 27 | checkpoint = {} 28 | checkpoint['model'] = model.state_dict() 29 | checkpoint['optimizer'] = optimizer.state_dict() 30 | torch.save(checkpoint, 'saved.pth') 31 | 32 | def load_checkpoint(): 33 | checkpoint = torch.load('saved.pth', 34 | map_location = lambda storage, loc: storage.cuda(torch.cuda.current_device())) 35 | model.load_state_dict(checkpoint['model']) 36 | optimizer.load_state_dict(checkpoint['optimizer']) 37 | 38 | for t in range(100): 39 | optimizer.zero_grad() 40 | y_pred = model(x) 41 | loss = loss_fn(y_pred.float(), y.float()) 42 | optimizer.backward(loss) ### formerly loss.backward() 43 | optimizer.step() 44 | 45 | save_checkpoint() 46 | 47 | load_checkpoint() 48 | 49 | for t in range(100): 50 | optimizer.zero_grad() 51 | y_pred = model(x) 52 | loss = loss_fn(y_pred.float(), y.float()) 53 | optimizer.backward(loss) ### formerly loss.backward() 54 | optimizer.step() 55 | 56 | print("final loss = ", loss) 57 | -------------------------------------------------------------------------------- /furnace/apex/examples/README.md: -------------------------------------------------------------------------------- 1 | ## Contents: 2 | 3 | **distributed**: Walkthrough of apex distributed data parallel utilities. 4 | 5 | **FP16_Optimizer_simple**: Simple examples demonstrating various use cases of `FP16_Optimizer` to automatically manage master parameters and static or dynamic loss scaling. 6 | 7 | **imagenet**: Example based on [https://github.com/pytorch/examples/tree/master/imagenet](https://github.com/pytorch/examples/tree/master/imagenet) showing the use of `FP16_Optimizer`, as well as manual management of master parameters and loss scaling for illustration/comparison. 
8 | 9 | **word_language_model**: Example based on [https://github.com/pytorch/examples/tree/master/word_language_model](https://github.com/pytorch/examples/tree/master/word_language_model) showing the use of `FP16_Optimizer`, as well as manual management of master parameters and loss scaling for illustration/comparison. 10 | 11 | **docker**: Example of a minimal Dockerfile that installs Apex on top of an existing container. 12 | -------------------------------------------------------------------------------- /furnace/apex/examples/distributed/README.md: -------------------------------------------------------------------------------- 1 | # Multiprocess Example based on pytorch/examples/mnist 2 | 3 | main.py demonstrates how to modify a simple model to enable multiprocess distributed data parallel 4 | training using the module wrapper `apex.parallel.DistributedDataParallel` 5 | (similar to `torch.nn.parallel.DistributedDataParallel`). 6 | 7 | Multiprocess distributed data parallel training frequently outperforms single-process 8 | data parallel training (such as that offered by `torch.nn.DataParallel`) because each process has its 9 | own python interpreter. Therefore, driving multiple GPUs with multiple processes reduces 10 | global interpreter lock contention versus having a single process (with a single GIL) drive all GPUs. 11 | 12 | `apex.parallel.DistributedDataParallel` is optimized for use with NCCL. It achieves high performance by 13 | overlapping communication with computation during ``backward()`` and bucketing smaller gradient 14 | transfers to reduce the total number of transfers required. 15 | 16 | #### [API Documentation](https://nvidia.github.io/apex/parallel.html) 17 | 18 | #### [Source Code](https://github.com/NVIDIA/apex/tree/master/apex/parallel) 19 | 20 | #### [Another example: Imagenet with mixed precision](https://github.com/NVIDIA/apex/tree/master/examples/imagenet) 21 | 22 | #### [Simple example with FP16_Optimizer](https://github.com/NVIDIA/apex/tree/master/examples/FP16_Optimizer_simple/distributed_apex) 23 | 24 | ## Getting started 25 | Prior to running please run 26 | ```pip install -r requirements.txt``` 27 | 28 | To download the dataset, run 29 | ```python main.py``` 30 | without any arguments. Once you have downloaded the dataset, you should not need to do this again. 31 | 32 | `main.py` runs multiprocess distributed data parallel jobs using the Pytorch launcher script 33 | [torch.distributed.launch](https://pytorch.org/docs/master/distributed.html#launch-utility). 34 | Jobs are launched via 35 | ```bash 36 | python -m torch.distributed.launch --nproc_per_node=N main.py args... 37 | ``` 38 | `torch.distributed.launch` spawns `N` processes, each of which runs as 39 | `python main.py args... --local_rank `. 40 | The `local_rank` argument for each process is determined and appended by `torch.distributed.launch`, 41 | and varies between 0 and `N-1`. `torch.distributed.launch` also provides environment variables 42 | for each process. 43 | Internally, each process calls `set_device` according to its local 44 | rank and `init_process_group` with `init_method=`env://' to ingest the provided environment 45 | variables. 46 | For best performance, set `N` equal to the number of visible CUDA devices on the node. 47 | 48 | ## Converting your own model 49 | 50 | To understand how to convert your own model, please see all sections of main.py within ```#=====START: ADDED FOR DISTRIBUTED======``` and ```#=====END: ADDED FOR DISTRIBUTED======``` flags. 
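In outline, the conversion amounts to the following sketch (names such as `MyModel` are placeholders rather than the exact code in `main.py`):

```python
import argparse
import torch
from apex.parallel import DistributedDataParallel as DDP

parser = argparse.ArgumentParser()
# torch.distributed.launch appends --local_rank for each process it spawns
parser.add_argument("--local_rank", default=0, type=int)
args = parser.parse_args()

# Each process drives exactly one GPU, selected by its local rank
torch.cuda.set_device(args.local_rank)
# Rank and world size are read from the environment variables set by the launcher
torch.distributed.init_process_group(backend='nccl', init_method='env://')

model = MyModel().cuda()  # MyModel is a placeholder for your network
model = DDP(model)        # gradients are all-reduced across processes during backward()
```

Each process should also load its own shard of the dataset, for example with `torch.utils.data.distributed.DistributedSampler`, so that the processes do not train on identical batches.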
51 | 52 | ## Requirements 53 | Pytorch with NCCL available as a distributed backend. Pytorch 0.4+, installed as a pip or conda package, should have this by default. Otherwise, you can build Pytorch from source, in an environment where NCCL is installed and visible. 54 | -------------------------------------------------------------------------------- /furnace/apex/examples/docker/Dockerfile: -------------------------------------------------------------------------------- 1 | # Base image must at least have pytorch and CUDA installed. 2 | ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:18.12-py3 3 | FROM $BASE_IMAGE 4 | ARG BASE_IMAGE 5 | RUN echo "Installing Apex on top of ${BASE_IMAGE}" 6 | WORKDIR /workspace 7 | # uninstall Apex if present 8 | RUN pip uninstall -y apex || : 9 | # SHA is something the user can touch to force recreation of this Docker layer, 10 | # and therefore force cloning of the latest version of Apex 11 | RUN SHA=ToUcHMe git clone https://github.com/NVIDIA/apex.git 12 | WORKDIR /workspace/apex 13 | RUN python setup.py install 14 | WORKDIR /workspace 15 | -------------------------------------------------------------------------------- /furnace/apex/examples/docker/README.md: -------------------------------------------------------------------------------- 1 | ## Option 1: Create a new container with Apex 2 | 3 | **Dockerfile** installs the latest Apex on top of an existing image. Run 4 | ``` 5 | docker build -t image_with_apex . 6 | ``` 7 | By default, **Dockerfile** uses NVIDIA's Pytorch container as the base image, 8 | which requires an NVIDIA GPU Cloud (NGC) account. If you don't have an NGC account, you can sign up for free by following the instructions [here](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html#generating-api-key). 9 | 10 | Alternatively, you can supply your own base image via the `BASE_IMAGE` build-arg. 11 | Any `BASE_IMAGE` you supply must have Pytorch and Cuda installed, for example: 12 | ``` 13 | docker build --build-arg BASE_IMAGE=pytorch/pytorch:0.4-cuda9-cudnn7-devel -t image_with_apex . 14 | ``` 15 | 16 | If you want to rebuild your image, and force the latest Apex to be cloned and installed, make any small change to the `SHA` variable in **Dockerfile**. 17 | 18 | **Warning:** 19 | Currently, Pytorch's default non-devel image on Dockerhub 20 | [pytorch/pytorch:0.4_cuda9_cudnn7](https://hub.docker.com/r/pytorch/pytorch/tags/) contains Pytorch installed with prebuilt binaries. It does not contain NVCC, which means it is not an eligible candidate for ``. 21 | 22 | ## Option 2: Install Apex in a running container 23 | 24 | Instead of building a new container, it is also a viable option to `git clone https://github.com/NVIDIA/apex.git` on bare metal, mount the Apex repo into your container at launch by running, for example, 25 | ``` 26 | docker run --runtime=nvidia -it --rm --ipc=host -v /bare/metal/apex:/apex/in/container 27 | ``` 28 | then go to /apex/in/container within the running container and `python setup.py install [--cuda_ext] [--cpp_ext]`. 29 | -------------------------------------------------------------------------------- /furnace/apex/examples/imagenet/README.md: -------------------------------------------------------------------------------- 1 | # ImageNet training in PyTorch 2 | 3 | This example is based on [https://github.com/pytorch/examples/tree/master/imagenet](https://github.com/pytorch/examples/tree/master/imagenet). 
4 | It implements training of popular model architectures, such as ResNet, AlexNet, and VGG on the ImageNet dataset. 5 | 6 | `main.py` with the `--fp16` argument demonstrates mixed precision training with manual management of master parameters and loss scaling. 7 | 8 | `main_fp16_optimizer.py` with `--fp16` demonstrates use of `apex.fp16_utils.FP16_Optimizer` to automatically manage master parameters and loss scaling. 9 | 10 | `main_amp.py` with `--fp16` demonstrates use of Amp to automatically perform all FP16-friendly operations in half precision under the hood. Notice that with Amp: 11 | * you don't need to explicitly convert your model, or the input data, to `.half()`. Conversions will occur on-the-fly internally within the Amp-patched torch functions. 12 | * dynamic loss scaling is always used under the hood. 13 | 14 | `main_reducer.py` is identical to `main.py`, except that it shows the use of [apex.parallel.Reducer](https://nvidia.github.io/apex/parallel.html#apex.parallel.Reducer) instead of `DistributedDataParallel`. 15 | 16 | ## Requirements 17 | 18 | - `pip install -r requirements.txt` 19 | - Download the ImageNet dataset and move validation images to labeled subfolders 20 | - To do this, you can use the following script: https://raw.githubusercontent.com/soumith/imagenetloader.torch/master/valprep.sh 21 | 22 | ## Training 23 | 24 | To train a model, run `main.py` with the desired model architecture and the path to the ImageNet dataset. 25 | 26 | The default learning rate schedule starts at 0.1 and decays by a factor of 10 every 30 epochs. This is appropriate for ResNet and models with batch normalization, but too high for AlexNet and VGG. Use 0.01 as the initial learning rate for AlexNet or VGG: 27 | 28 | ```bash 29 | python main.py -a alexnet --lr 0.01 /path/to/imagenet/folder 30 | ``` 31 | 32 | The directory at /path/to/imagenet/folder should contain two subdirectories called "train" 33 | and "val" that contain the training and validation data respectively. 34 | 35 | ## Distributed training 36 | 37 | `main.py` and `main_fp16_optimizer.py` have been modified to use the `DistributedDataParallel` module in Apex instead of the one in upstream PyTorch. `apex.parallel.DistributedDataParallel` 38 | is a drop-in replacement for `torch.nn.parallel.DistributedDataParallel` (see our [distributed example](https://github.com/NVIDIA/apex/tree/master/examples/distributed)). 39 | The scripts can interact with 40 | [torch.distributed.launch](https://pytorch.org/docs/master/distributed.html#launch-utility) 41 | to spawn multiprocess jobs using the following syntax: 42 | ``` 43 | python -m torch.distributed.launch --nproc_per_node=NUM_GPUS main.py args... 44 | ``` 45 | `NUM_GPUS` should be less than or equal to the number of visible GPU devices on the node. 46 | 47 | Optionally one can run imagenet with sync batch normalization by adding 48 | `--sync_bn` to the `args...` 49 | 50 | ## Example commands 51 | 52 | (note: batch size `--b 224` assumes your GPUs have >=16GB of onboard memory) 53 | 54 | ```bash 55 | ### Softlink training dataset into current directory 56 | $ ln -sf /data/imagenet/train-jpeg/ train 57 | ### Softlink validation dataset into current directory 58 | $ ln -sf /data/imagenet/val-jpeg/ val 59 | ### Single-process training 60 | $ python main.py -a resnet50 --fp16 --b 224 --workers 4 --static-loss-scale 128.0 ./ 61 | ### Single-process training with Amp. Amp's casting causes it to use a bit more memory, 62 | ### hence the batch size 128.
63 | $ python main_amp.py -a resnet50 --fp16 --b 128 --workers 4 ./ 64 | ### Multi-process training (uses all visible GPUs on the node) 65 | $ python -m torch.distributed.launch --nproc_per_node=NUM_GPUS main.py -a resnet50 --fp16 --b 224 --workers 4 --static-loss-scale 128.0 ./ 66 | ### Multi-process training on GPUs 0 and 1 only 67 | $ export CUDA_VISIBLE_DEVICES=0,1 68 | $ python -m torch.distributed.launch --nproc_per_node=2 main.py -a resnet50 --fp16 --b 224 --workers 4 ./ 69 | ### Multi-process training with FP16_Optimizer, static loss scale 128.0 (still uses FP32 master params) 70 | $ python -m torch.distributed.launch --nproc_per_node=NUM_GPUS main_fp16_optimizer.py -a resnet50 --fp16 --b 224 --static-loss-scale 128.0 --workers 4 ./ 71 | ### Multi-process training with FP16_Optimizer, dynamic loss scaling 72 | $ python -m torch.distributed.launch --nproc_per_node=NUM_GPUS main_fp16_optimizer.py -a resnet50 --fp16 --b 224 --dynamic-loss-scale --workers 4 ./ 73 | ``` 74 | 75 | ## Usage for `main.py` and `main_fp16_optimizer.py` 76 | 77 | `main_fp16_optimizer.py` also accepts the optional flag 78 | ```bash 79 | --dynamic-loss-scale Use dynamic loss scaling. If supplied, this argument 80 | supersedes --static-loss-scale. 81 | ``` 82 | 83 | -------------------------------------------------------------------------------- /furnace/apex/examples/word_language_model/data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | 4 | 5 | class Dictionary(object): 6 | def __init__(self): 7 | self.word2idx = {} 8 | self.idx2word = [] 9 | 10 | def add_word(self, word): 11 | if word not in self.word2idx: 12 | self.idx2word.append(word) 13 | self.word2idx[word] = len(self.idx2word) - 1 14 | return self.word2idx[word] 15 | 16 | def __len__(self): 17 | return len(self.idx2word) 18 | 19 | 20 | class Corpus(object): 21 | def __init__(self, path, pad_to_multiple_of=1): 22 | # Synthetic elements used to pad the dictionary length. 23 | # It is assumed that these synthetic elements do not appear in the actual data files. 24 | self.synthetic = ["vvvvvvvv" + str(i) for i in range(pad_to_multiple_of-1)] 25 | 26 | self.dictionary = Dictionary() 27 | self.train = self.tokenize(os.path.join(path, 'train.txt')) 28 | self.valid = self.tokenize(os.path.join(path, 'valid.txt')) 29 | self.test = self.tokenize(os.path.join(path, 'test.txt')) 30 | 31 | # Pad dictionary size to desired multiple. For example, padding to a multiple of 8 32 | # is necessary to ensure Tensor Core usage for the decoder. 
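        # Illustrative arithmetic (added note, not part of the original file): with
        # pad_to_multiple_of=8 and 10003 words already in the dictionary,
        # pad_elem = 8 - 10003 % 8 = 5, so five synthetic words are appended below;
        # if the size is already a multiple of 8, pad_elem equals 8 and nothing is added.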
33 | pad_elem = pad_to_multiple_of - len(self.dictionary)%pad_to_multiple_of 34 | if pad_elem != pad_to_multiple_of: 35 | for i in range(pad_elem): 36 | self.dictionary.add_word(self.synthetic[i]) 37 | 38 | def tokenize(self, path): 39 | """Tokenizes a text file.""" 40 | assert os.path.exists(path) 41 | # Add words to the dictionary 42 | with open(path, 'r') as f: 43 | tokens = 0 44 | for line in f: 45 | words = line.split() + ['<eos>'] 46 | tokens += len(words) 47 | for word in words: 48 | self.dictionary.add_word(word) 49 | 50 | # Tokenize file content 51 | with open(path, 'r') as f: 52 | ids = torch.LongTensor(tokens) 53 | token = 0 54 | for line in f: 55 | words = line.split() + ['<eos>'] 56 | for word in words: 57 | ids[token] = self.dictionary.word2idx[word] 58 | token += 1 59 | 60 | return ids 61 | -------------------------------------------------------------------------------- /furnace/apex/examples/word_language_model/data/wikitext-2/README: -------------------------------------------------------------------------------- 1 | This is raw data from the wikitext-2 dataset. 2 | 3 | See https://www.salesforce.com/products/einstein/ai-research/the-wikitext-dependency-language-modeling-dataset/ 4 | -------------------------------------------------------------------------------- /furnace/apex/examples/word_language_model/generate.py: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # Language Modeling on Penn Tree Bank 3 | # 4 | # This file generates new sentences sampled from the language model 5 | # 6 | ############################################################################### 7 | 8 | import argparse 9 | 10 | import torch 11 | 12 | import data 13 | 14 | parser = argparse.ArgumentParser(description='PyTorch Wikitext-2 Language Model') 15 | 16 | # Model parameters. 17 | parser.add_argument('--data', type=str, default='./data/wikitext-2', 18 | help='location of the data corpus') 19 | parser.add_argument('--checkpoint', type=str, default='./model.pt', 20 | help='model checkpoint to use') 21 | parser.add_argument('--outf', type=str, default='generated.txt', 22 | help='output file for generated text') 23 | parser.add_argument('--words', type=int, default='1000', 24 | help='number of words to generate') 25 | parser.add_argument('--seed', type=int, default=1111, 26 | help='random seed') 27 | parser.add_argument('--cuda', action='store_true', 28 | help='use CUDA') 29 | parser.add_argument('--temperature', type=float, default=1.0, 30 | help='temperature - higher will increase diversity') 31 | parser.add_argument('--log-interval', type=int, default=100, 32 | help='reporting interval') 33 | args = parser.parse_args() 34 | 35 | # Set the random seed manually for reproducibility.
36 | torch.manual_seed(args.seed) 37 | if torch.cuda.is_available(): 38 | if not args.cuda: 39 | print("WARNING: You have a CUDA device, so you should probably run with --cuda") 40 | 41 | if args.temperature < 1e-3: 42 | parser.error("--temperature has to be greater or equal 1e-3") 43 | 44 | with open(args.checkpoint, 'rb') as f: 45 | model = torch.load(f) 46 | model.eval() 47 | 48 | if args.cuda: 49 | model.cuda() 50 | else: 51 | model.cpu() 52 | 53 | corpus = data.Corpus(args.data) 54 | ntokens = len(corpus.dictionary) 55 | hidden = model.init_hidden(1) 56 | with torch.no_grad(): 57 | input = torch.rand(1, 1).mul(ntokens).long() 58 | if args.cuda: 59 | input = input.cuda() 60 | 61 | with open(args.outf, 'w') as outf: 62 | for i in range(args.words): 63 | output, hidden = model(input, hidden) 64 | word_weights = output.squeeze().float().data.div(args.temperature).exp().cpu() 65 | word_idx = torch.multinomial(word_weights, 1)[0] 66 | input.data.fill_(word_idx) 67 | word = corpus.dictionary.idx2word[word_idx] 68 | 69 | outf.write(word + ('\n' if i % 20 == 19 else ' ')) 70 | 71 | if i % args.log_interval == 0: 72 | print('| Generated {}/{} words'.format(i, args.words)) 73 | -------------------------------------------------------------------------------- /furnace/apex/examples/word_language_model/model.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | 4 | class RNNModel(nn.Module): 5 | """Container module with an encoder, a recurrent module, and a decoder.""" 6 | 7 | def __init__(self, rnn_type, ntoken, ninp, nhid, nlayers, dropout=0.5, tie_weights=False): 8 | super(RNNModel, self).__init__() 9 | self.drop = nn.Dropout(dropout) 10 | self.encoder = nn.Embedding(ntoken, ninp) 11 | if rnn_type in ['LSTM', 'GRU']: 12 | self.rnn = getattr(nn, rnn_type)(ninp, nhid, nlayers, dropout=dropout) 13 | else: 14 | try: 15 | nonlinearity = {'RNN_TANH': 'tanh', 'RNN_RELU': 'relu'}[rnn_type] 16 | except KeyError: 17 | raise ValueError("""An invalid option for `--model` was supplied, 18 | options are ['LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU']""") 19 | self.rnn = nn.RNN(ninp, nhid, nlayers, nonlinearity=nonlinearity, dropout=dropout) 20 | self.decoder = nn.Linear(nhid, ntoken) 21 | 22 | # Optionally tie weights as in: 23 | # "Using the Output Embedding to Improve Language Models" (Press & Wolf 2016) 24 | # https://arxiv.org/abs/1608.05859 25 | # and 26 | # "Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling" (Inan et al. 
2016) 27 | # https://arxiv.org/abs/1611.01462 28 | if tie_weights: 29 | if nhid != ninp: 30 | raise ValueError('When using the tied flag, nhid must be equal to emsize') 31 | self.decoder.weight = self.encoder.weight 32 | 33 | self.init_weights() 34 | 35 | self.rnn_type = rnn_type 36 | self.nhid = nhid 37 | self.nlayers = nlayers 38 | 39 | def init_weights(self): 40 | initrange = 0.1 41 | self.encoder.weight.data.uniform_(-initrange, initrange) 42 | self.decoder.bias.data.fill_(0) 43 | self.decoder.weight.data.uniform_(-initrange, initrange) 44 | 45 | def forward(self, input, hidden): 46 | emb = self.drop(self.encoder(input)) 47 | output, hidden = self.rnn(emb, hidden) 48 | output = self.drop(output) 49 | decoded = self.decoder(output.view(output.size(0)*output.size(1), output.size(2))) 50 | return decoded.view(output.size(0), output.size(1), decoded.size(1)), hidden 51 | 52 | def init_hidden(self, bsz): 53 | weight = next(self.parameters()).data 54 | if self.rnn_type == 'LSTM': 55 | return (weight.new(self.nlayers, bsz, self.nhid).zero_(), 56 | weight.new(self.nlayers, bsz, self.nhid).zero_()) 57 | else: 58 | return weight.new(self.nlayers, bsz, self.nhid).zero_() 59 | -------------------------------------------------------------------------------- /furnace/apex/setup.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from setuptools import setup, find_packages 3 | 4 | import sys 5 | 6 | if not torch.cuda.is_available(): 7 | print("Warning: Torch did not find available GPUs on this system.\n", 8 | "If your intention is to cross-compile, this is not an error.") 9 | 10 | print("torch.__version__ = ", torch.__version__) 11 | TORCH_MAJOR = int(torch.__version__.split('.')[0]) 12 | TORCH_MINOR = int(torch.__version__.split('.')[1]) 13 | 14 | if TORCH_MAJOR == 0 and TORCH_MINOR < 4: 15 | raise RuntimeError("APEx requires Pytorch 0.4 or newer.\n" + 16 | "The latest stable release can be obtained from https://pytorch.org/") 17 | 18 | cmdclass = {} 19 | ext_modules = [] 20 | 21 | if "--cpp_ext" in sys.argv or "--cuda_ext" in sys.argv: 22 | from torch.utils.cpp_extension import BuildExtension 23 | cmdclass['build_ext'] = BuildExtension 24 | 25 | if "--cpp_ext" in sys.argv: 26 | from torch.utils.cpp_extension import CppExtension 27 | sys.argv.remove("--cpp_ext") 28 | ext_modules.append( 29 | CppExtension('apex_C', 30 | ['csrc/flatten_unflatten.cpp',])) 31 | 32 | if "--cuda_ext" in sys.argv: 33 | from torch.utils.cpp_extension import CUDAExtension 34 | sys.argv.remove("--cuda_ext") 35 | 36 | if torch.utils.cpp_extension.CUDA_HOME is None: 37 | print("Warning: nvcc is not available. 
Ignoring --cuda-ext") 38 | else: 39 | ext_modules.append( 40 | CUDAExtension(name='amp_C', 41 | sources=['csrc/scale_check_overflow.cpp', 42 | 'csrc/scale_check_overflow_kernel.cu'])) 43 | ext_modules.append( 44 | CUDAExtension(name='fused_adam_cuda', 45 | sources=['apex/optimizers/csrc/fused_adam_cuda.cpp', 46 | 'apex/optimizers/csrc/fused_adam_cuda_kernel.cu'], 47 | extra_compile_args={'cxx': ['-O3',], 48 | 'nvcc':['-O3', 49 | '--use_fast_math']})) 50 | ext_modules.append( 51 | CUDAExtension(name='syncbn', 52 | sources=['csrc/syncbn.cpp', 53 | 'csrc/welford.cu'])) 54 | ext_modules.append( 55 | CUDAExtension(name='fused_layer_norm_cuda', 56 | sources=['apex/normalization/csrc/layer_norm_cuda.cpp', 57 | 'apex/normalization/csrc/layer_norm_cuda_kernel.cu'], 58 | extra_compile_args={'cxx': ['-O3',], 59 | 'nvcc':['-maxrregcount=50', 60 | '-O3', 61 | '--use_fast_math']})) 62 | 63 | setup( 64 | name='apex', 65 | version='0.1', 66 | packages=find_packages(exclude=('build', 67 | 'csrc', 68 | 'include', 69 | 'tests', 70 | 'dist', 71 | 'docs', 72 | 'tests', 73 | 'examples', 74 | 'apex.egg-info',)), 75 | description='PyTorch Extensions written by NVIDIA', 76 | ext_modules=ext_modules, 77 | cmdclass=cmdclass, 78 | ) 79 | -------------------------------------------------------------------------------- /furnace/apex/tests/RNN/RNN_tests.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.autograd import Variable 4 | import apex 5 | from apex.RNN.models import bidirectionalRNN, stackedRNN, RNNCell 6 | from torch.nn._functions.rnn import LSTMCell 7 | import itertools 8 | 9 | 10 | torch.backends.cudnn.enabled=False 11 | 12 | batch_first = False #not implemented yet 13 | dropout = 0.0 #How to validate? 
14 | bidirectional = False #True works, but differs in definition to PyTorch 15 | 16 | rnn_types = ['LSTM', 'GRU', 'ReLU', 'Tanh'] 17 | sizes = [8,4,2] 18 | 19 | seq_sizes = sizes 20 | hidden_sizes = sizes 21 | inp_sizes = sizes 22 | batch_sizes = sizes 23 | num_layerss = sizes 24 | 25 | biases = [True] 26 | 27 | def copy_param_set(pyt_rnn, my_rnn, layer=0, reverse=False): 28 | my_params = None 29 | 30 | rnn = None 31 | if isinstance(my_rnn, bidirectionalRNN): 32 | rnn = my_rnn.fwd.rnns[layer] if not reverse else my_rnn.bckwrd.rnns[layer] 33 | elif isinstance(my_rnn, stackedRNN): 34 | rnn = my_rnn.rnns[layer] 35 | else: 36 | raise RuntimeError() 37 | 38 | param_names = ['w_ih', 'w_hh', 'b_ih', 'b_hh'] 39 | 40 | if not hasattr(rnn, 'b_hh'): 41 | param_names = param_names[:2] 42 | my_params = [getattr(rnn, param_name) for param_name in param_names] 43 | 44 | pyt_params = None 45 | param_names = ['weight_ih_', 'weight_hh_', 'bias_ih_', 'bias_hh_'] 46 | reverse_str = '_reverse' if reverse else '' 47 | 48 | if not hasattr(pyt_rnn, 'bias_hh_l0'): 49 | param_names=param_names[:2] 50 | pyt_params =[getattr(pyt_rnn, param_name + 'l' + str(layer) + reverse_str ) 51 | for param_name in param_names ] 52 | for pyt_param, my_param in zip(pyt_params, my_params): 53 | pyt_param.data.copy_(my_param.data) 54 | 55 | def copy_all_params(pyt_rnn, my_rnn): 56 | for layer in range(num_layers): 57 | copy_param_set(pyt_rnn, my_rnn, layer) 58 | if bidirectional: 59 | copy_param_set(pyt_rnn, my_rnn, layer, bidirectional) 60 | 61 | 62 | def compare_variables(v1, v2, msg, params): 63 | diff = float((v1.data-v2.data).abs().max()) 64 | if diff > 1e-5: 65 | print("Error of ", diff, " found for ", msg, " for case: ", str(params)) 66 | 67 | def compare_tuple_variables(t1, t2, msg, params): 68 | for var1, var2 in zip(t1, t2): 69 | compare_variables(var1, var2, msg, params) 70 | 71 | def maybe_compare(v1, v2, msg, params): 72 | if isinstance(v1, Variable) and isinstance(v2, Variable): 73 | compare_variables(v1, v2, msg, params) 74 | else: 75 | compare_tuple_variables(v1, v2, msg, params) 76 | 77 | product = list(itertools.product(rnn_types, seq_sizes, hidden_sizes, inp_sizes, batch_sizes, num_layerss, biases)) 78 | 79 | for test_case in product: 80 | rnn_type, seq_size, hidden_size, inp_size, batch_size, num_layers, bias = test_case 81 | 82 | inp = torch.cuda.FloatTensor(seq_size, batch_size, inp_size).uniform_() 83 | 84 | if rnn_type == 'ReLU' or rnn_type == 'Tanh': 85 | pytorch_rnn = nn.RNN(inp_size, hidden_size, num_layers, bias, batch_first, dropout, bidirectional, nonlinearity=rnn_type.lower()).cuda() 86 | else: 87 | pytorch_rnn = getattr(nn, rnn_type)(inp_size, hidden_size, num_layers, bias, batch_first, dropout, bidirectional).cuda() 88 | my_rnn = getattr(apex.RNN.models, rnn_type)(inp_size, hidden_size, num_layers, bias, batch_first, dropout, bidirectional).cuda() 89 | 90 | copy_all_params(pytorch_rnn, my_rnn) 91 | 92 | pyt_inp = Variable(inp, requires_grad=True) 93 | my_inp = Variable(inp, requires_grad=True) 94 | 95 | my_out, my_hiddens = my_rnn(my_inp) 96 | pyt_out, pyt_hiddens = pytorch_rnn(pyt_inp) 97 | 98 | pyt_out.sum().backward() 99 | my_out.sum().backward() 100 | 101 | 102 | maybe_compare(pyt_out, my_out, "out", test_case) 103 | 104 | #If there's only one hidden state PyTorch doesn't return it in a tuple, 105 | #apex does, so we wrap PyTorch's returned hidden state in a tuple. 
106 | if not isinstance(pyt_hiddens, tuple): 107 | pyt_hiddens = (pyt_hiddens,) 108 | 109 | try: 110 | for i, (pyt_hid, my_hid) in enumerate(zip(pyt_hiddens, my_hiddens)): 111 | maybe_compare(pyt_hid, my_hid , "hx_"+str(i), test_case) 112 | except ValueError: 113 | maybe_compare(pyt_hiddens, my_hiddens , "hx_0", test_case) 114 | 115 | 116 | maybe_compare(pyt_inp.grad, my_inp.grad, "inp.grad", test_case) 117 | 118 | print("Test passed.") 119 | -------------------------------------------------------------------------------- /furnace/apex/tests/distributed/ddp_race_condition_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.distributed as dist 3 | from torch.nn import Parameter 4 | from torch.nn import Module 5 | from apex.parallel import DistributedDataParallel as DDP 6 | import argparse 7 | import os 8 | 9 | 10 | parser = argparse.ArgumentParser(description='allreduce hook example') 11 | parser.add_argument("--local_rank", default=0, type=int) 12 | args = parser.parse_args() 13 | 14 | args.distributed = False 15 | if 'WORLD_SIZE' in os.environ: 16 | args.distributed = int(os.environ['WORLD_SIZE']) > 1 17 | 18 | if args.distributed: 19 | args.gpu = args.local_rank % torch.cuda.device_count() 20 | torch.cuda.set_device(args.gpu) 21 | torch.distributed.init_process_group(backend='nccl', 22 | init_method='env://') 23 | args.world_size = torch.distributed.get_world_size() 24 | 25 | torch.set_printoptions(precision=10) 26 | torch.manual_seed(args.local_rank) 27 | 28 | class Model(Module): 29 | def __init__(self): 30 | super(Model, self).__init__() 31 | self.a = Parameter(torch.cuda.FloatTensor(4096*4096).fill_(1.0)) 32 | self.b = Parameter(torch.cuda.FloatTensor(4096*4096).fill_(2.0)) 33 | def forward(self, input): 34 | return (input*self.a)*self.b 35 | 36 | model = Model() 37 | # model = DDP(model, message_size=1, gradient_predivide_factor=8.0) 38 | model = DDP(model, delay_allreduce=True) 39 | # model = DDP(model, message_size=1, allreduce_trigger_params=[model.b]) 40 | 41 | x = torch.cuda.FloatTensor(4096*4096) 42 | 43 | passed = True 44 | torch.cuda.cudart().cudaProfilerStart() 45 | for i in range(10): 46 | x.fill_(i + args.local_rank) # fill x with new values every iteration for sanity 47 | model.zero_grad() 48 | out = model(x) 49 | loss = out.sum() 50 | # torch.cuda.nvtx.range_push("backward") 51 | loss.backward() 52 | # torch.cuda.nvtx.range_pop() 53 | 54 | # torch.cuda.nvtx.range_push("synchronize() + info") 55 | # torch.cuda.synchronize() 56 | print("i = {}".format(i)) 57 | def info(name, param, val): 58 | expected = val*4096*4096*(2.*i+1)/2. 
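        # Added note on the constant (not in the original test): loss = sum((x*a)*b), so the
        # per-element gradient of one parameter is x times the other parameter's fill value (val).
        # x holds (i + local_rank), which the allreduce averages over the two launched ranks to
        # (2*i + 1)/2, and the sum runs over all 4096*4096 elements.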
59 | actual = param.grad.data.sum().item() 60 | print(name+": grad.data_ptr() = {}, expected sum {}, got {}".format( 61 | param.grad.data_ptr(), expected, actual)) 62 | return (expected == actual) 63 | if not info("model.a", model.module.a, 2.): passed = False 64 | if not info("model.b", model.module.b, 1.): passed = False 65 | # torch.cuda.nvtx.range_pop() 66 | torch.cuda.cudart().cudaProfilerStop() 67 | 68 | print("passed = ", passed) 69 | -------------------------------------------------------------------------------- /furnace/apex/tests/distributed/run_race_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch --nproc_per_node=2 ddp_race_condition_test.py 4 | -------------------------------------------------------------------------------- /furnace/apex/tests/run_amp/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesCXK/TorchSSC/2fd21ad25af92cd9f9ad28de3c4bc897c0ae8b43/furnace/apex/tests/run_amp/__init__.py -------------------------------------------------------------------------------- /furnace/apex/tests/run_amp/test_cache.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import functools as ft 4 | import itertools as it 5 | 6 | from apex import amp 7 | import torch 8 | from torch import nn 9 | import torch.nn.functional as F 10 | 11 | from utils import common_init, HALF, FLOAT,\ 12 | ALWAYS_HALF, ALWAYS_FLOAT, MATCH_INPUT 13 | 14 | def get_reference_grad(i, w, ops): 15 | # Creating new tensors ensures, among other things, that the new tensors are not in the cache. 16 | # In fact, they are guaranteed not to use the cache because they are not torch.nn.Parameters. 
17 | fp32_i = i.detach().clone().float() 18 | fp32_w = w.detach().clone().float().requires_grad_() 19 | loss = ops(fp32_i, fp32_w) 20 | loss.backward() 21 | return fp32_w.grad 22 | 23 | class WhitelistModule(torch.nn.Module): 24 | def __init__(self, dtype): 25 | super(WhitelistModule, self).__init__() 26 | self.weight = torch.nn.Parameter(torch.arange(8*8, device='cuda', dtype=dtype).view(8,8)) 27 | 28 | @staticmethod 29 | def ops(input, weight): 30 | return (input.mm(weight)).mm(weight).sum() 31 | 32 | def forward(self, input): 33 | return self.ops(input, self.weight) 34 | 35 | 36 | class BlacklistModule(torch.nn.Module): 37 | def __init__(self, dtype): 38 | super(BlacklistModule, self).__init__() 39 | self.weight = torch.nn.Parameter(torch.arange(2*8, device='cuda', dtype=dtype).view(2,8)) 40 | 41 | @staticmethod 42 | def ops(input, weight): 43 | return (input + torch.pow(weight, 2) + torch.pow(weight, 2)).sum() 44 | 45 | def forward(self, input): 46 | return self.ops(input, self.weight) 47 | 48 | 49 | class PromoteModule(torch.nn.Module): 50 | def __init__(self, dtype): 51 | super(PromoteModule, self).__init__() 52 | self.weight = torch.nn.Parameter(torch.arange(2*8, device='cuda', dtype=dtype).view(2,8)) 53 | 54 | @staticmethod 55 | def ops(input, weight): 56 | return ((input*weight)*weight).sum() 57 | 58 | def forward(self, input): 59 | return self.ops(input, self.weight) 60 | 61 | class TestCache(unittest.TestCase): 62 | def setUp(self): 63 | self.handle = amp.init(enabled=True) 64 | self.x = torch.ones((2, 8), device='cuda', dtype=torch.float32) 65 | common_init(self) 66 | 67 | def tearDown(self): 68 | self.handle._deactivate() 69 | 70 | def train_eval_train_test(self, module, t): 71 | model = module(t).cuda() 72 | dummy_optimizer = torch.optim.SGD(model.parameters(), lr=1.0) 73 | 74 | def training_step(): 75 | for param in model.parameters(): 76 | param.grad = None 77 | 78 | loss = model(self.x).sum() 79 | self.handle._default_scaler._loss_scale = 1.0 80 | with self.handle.scale_loss(loss, dummy_optimizer) as scaled_loss: 81 | scaled_loss.backward() 82 | 83 | self.assertEqual(len([p.grad for p in model.parameters() if p.grad is not None]), 1) 84 | self.assertEqual(model.weight.grad.type(), model.weight.type()) 85 | 86 | reference_grad = get_reference_grad(self.x, model.weight, model.ops) 87 | 88 | # Currently there's no difference in the allclose calls, so no need for branching, 89 | # but I'm keeping this in case we want different tolerances for fp16 and fp32 checks. 90 | if model.weight.grad.type() == "torch.cuda.HalfTensor": 91 | self.assertTrue(torch.allclose(model.weight.grad.float(), reference_grad)) 92 | elif model.weight.grad.type() == "torch.cuda.FloatTensor": 93 | self.assertTrue(torch.allclose(model.weight.grad.float(), reference_grad)) 94 | else: 95 | raise RuntimeError("model.weight.grad.type = {}".format(model.weight.grad.type())) 96 | 97 | model.weight.data -= 1. 98 | 99 | # Simulates first epoch 100 | training_step() 101 | 102 | # Simulates eval 103 | with torch.no_grad(): 104 | loss = model(self.x).sum() 105 | 106 | # Simulates resuming training after eval 107 | training_step() 108 | 109 | # I could easily have these as a set of for loops in a single test, 110 | # instead of going for granularity. 
111 | def test_whitelist_module_fp16_weight(self): 112 | self.train_eval_train_test(WhitelistModule, torch.float16) 113 | 114 | def test_whitelist_module_fp32_weight(self): 115 | self.train_eval_train_test(WhitelistModule, torch.float32) 116 | 117 | def test_blacklist_module_fp16_weight(self): 118 | self.train_eval_train_test(BlacklistModule, torch.float16) 119 | 120 | def test_blacklist_module_fp32_weight(self): 121 | self.train_eval_train_test(BlacklistModule, torch.float32) 122 | 123 | def test_promote_module_fp16_weight(self): 124 | self.train_eval_train_test(PromoteModule, torch.float16) 125 | 126 | def test_promote_module_fp32_weight(self): 127 | self.train_eval_train_test(PromoteModule, torch.float32) 128 | 129 | 130 | if __name__ == '__main__': 131 | unittest.main() 132 | -------------------------------------------------------------------------------- /furnace/apex/tests/run_amp/test_promotion.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import itertools as it 4 | 5 | from apex import amp 6 | import torch 7 | from torch import nn 8 | import torch.nn.functional as F 9 | 10 | from utils import common_init, HALF, FLOAT, DTYPES 11 | 12 | class TestPromotion(unittest.TestCase): 13 | def setUp(self): 14 | self.handle = amp.init(enabled=True) 15 | common_init(self) 16 | 17 | def tearDown(self): 18 | self.handle._deactivate() 19 | 20 | def run_binary_promote_test(self, fns, input_shape, x_inplace=False): 21 | type_pairs = it.product(DTYPES, DTYPES) 22 | for fn, (xtype, ytype) in it.product(fns, type_pairs): 23 | x = torch.randn(input_shape, dtype=xtype).requires_grad_() 24 | x_leaf = x 25 | if x_inplace: 26 | # We need a non-leaf to call in place on 27 | x = x.clone() 28 | y = torch.randn(input_shape, dtype=ytype) 29 | out = fn(x, y) 30 | if x_inplace: 31 | # In place: always match xtype 32 | self.assertEqual(out.type(), x.type()) 33 | else: 34 | # Out of place: match widest type 35 | if xtype == torch.float or ytype == torch.float: 36 | self.assertEqual(out.type(), FLOAT) 37 | else: 38 | self.assertEqual(out.type(), HALF) 39 | out.float().sum().backward() 40 | self.assertEqual(x_leaf.grad.dtype, xtype) 41 | 42 | def test_atan2_matches_widest(self): 43 | fns = [lambda x, y : torch.atan2(x, y), 44 | lambda x, y : x.atan2(y)] 45 | self.run_binary_promote_test(fns, (self.b,)) 46 | 47 | def test_mul_matches_widest(self): 48 | fns = [lambda x, y : torch.mul(x, y), 49 | lambda x, y: x.mul(y)] 50 | self.run_binary_promote_test(fns, (self.b,)) 51 | 52 | def test_cat_matches_widest(self): 53 | shape = self.b 54 | ys = [torch.randn(shape, dtype=torch.half) for _ in range(5)] 55 | x_float = torch.randn(shape) 56 | out = torch.cat(ys + [x_float]) 57 | self.assertEqual(out.type(), FLOAT) 58 | x_half = torch.randn(shape, dtype=torch.half) 59 | out = torch.cat(ys + [x_half]) 60 | self.assertEqual(out.type(), HALF) 61 | 62 | def test_inplace_exp_is_error_for_half(self): 63 | xs = torch.randn(self.b) 64 | xs.exp_() 65 | self.assertEqual(xs.type(), FLOAT) 66 | xs = torch.randn(self.b, dtype=torch.half) 67 | with self.assertRaises(NotImplementedError): 68 | xs.exp_() 69 | 70 | def test_inplace_add_matches_self(self): 71 | fn = lambda x, y: x.add_(y) 72 | self.run_binary_promote_test([fn], (self.b,), x_inplace=True) 73 | 74 | if __name__ == '__main__': 75 | unittest.main() 76 | -------------------------------------------------------------------------------- /furnace/apex/tests/run_amp/test_rnn.py: 
-------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from apex import amp 4 | import random 5 | import torch 6 | from torch import nn 7 | 8 | from utils import common_init, HALF 9 | 10 | class TestRnnCells(unittest.TestCase): 11 | def setUp(self): 12 | self.handle = amp.init(enabled=True) 13 | common_init(self) 14 | 15 | def tearDown(self): 16 | self.handle._deactivate() 17 | 18 | def run_cell_test(self, cell, state_tuple=False): 19 | shape = (self.b, self.h) 20 | for typ in [torch.float, torch.half]: 21 | xs = [torch.randn(shape, dtype=typ).requires_grad_() 22 | for _ in range(self.t)] 23 | hidden_fn = lambda: torch.zeros(shape, dtype=typ) 24 | if state_tuple: 25 | hidden = (hidden_fn(), hidden_fn()) 26 | else: 27 | hidden = hidden_fn() 28 | outputs = [] 29 | for i in range(self.t): 30 | hidden = cell(xs[i], hidden) 31 | if state_tuple: 32 | output = hidden[0] 33 | else: 34 | output = hidden 35 | outputs.append(output) 36 | for y in outputs: 37 | self.assertEqual(y.type(), HALF) 38 | outputs[-1].float().sum().backward() 39 | for i, x in enumerate(xs): 40 | self.assertEqual(x.grad.dtype, x.dtype) 41 | 42 | def test_rnn_cell_is_half(self): 43 | cell = nn.RNNCell(self.h, self.h) 44 | self.run_cell_test(cell) 45 | 46 | def test_gru_cell_is_half(self): 47 | cell = nn.GRUCell(self.h, self.h) 48 | self.run_cell_test(cell) 49 | 50 | def test_lstm_cell_is_half(self): 51 | cell = nn.LSTMCell(self.h, self.h) 52 | self.run_cell_test(cell, state_tuple=True) 53 | 54 | class TestRnns(unittest.TestCase): 55 | def setUp(self): 56 | self.handle = amp.init(enabled=True) 57 | common_init(self) 58 | 59 | def tearDown(self): 60 | self.handle._deactivate() 61 | 62 | def run_rnn_test(self, rnn, layers, bidir, state_tuple=False): 63 | for typ in [torch.float, torch.half]: 64 | x = torch.randn((self.t, self.b, self.h), dtype=typ).requires_grad_() 65 | hidden_fn = lambda: torch.zeros((layers + (layers * bidir), 66 | self.b, self.h), dtype=typ) 67 | if state_tuple: 68 | hidden = (hidden_fn(), hidden_fn()) 69 | else: 70 | hidden = hidden_fn() 71 | output, _ = rnn(x, hidden) 72 | self.assertEqual(output.type(), HALF) 73 | output[-1, :, :].float().sum().backward() 74 | self.assertEqual(x.grad.dtype, x.dtype) 75 | 76 | def test_rnn_is_half(self): 77 | configs = [(1, False), (2, False), (2, True)] 78 | for layers, bidir in configs: 79 | rnn = nn.RNN(input_size=self.h, hidden_size=self.h, num_layers=layers, 80 | nonlinearity='relu', bidirectional=bidir) 81 | self.run_rnn_test(rnn, layers, bidir) 82 | 83 | def test_gru_is_half(self): 84 | configs = [(1, False), (2, False), (2, True)] 85 | for layers, bidir in configs: 86 | rnn = nn.GRU(input_size=self.h, hidden_size=self.h, num_layers=layers, 87 | bidirectional=bidir) 88 | self.run_rnn_test(rnn, layers, bidir) 89 | 90 | def test_lstm_is_half(self): 91 | configs = [(1, False), (2, False), (2, True)] 92 | for layers, bidir in configs: 93 | rnn = nn.LSTM(input_size=self.h, hidden_size=self.h, num_layers=layers, 94 | bidirectional=bidir) 95 | self.run_rnn_test(rnn, layers, bidir, state_tuple=True) 96 | 97 | def test_rnn_packed_sequence(self): 98 | num_layers = 2 99 | rnn = nn.RNN(input_size=self.h, hidden_size=self.h, num_layers=num_layers) 100 | for typ in [torch.float, torch.half]: 101 | x = torch.randn((self.t, self.b, self.h), dtype=typ).requires_grad_() 102 | lens = sorted([random.randint(self.t // 2, self.t) for _ in range(self.b)], 103 | reverse=True) 104 | # `pack_padded_sequence` breaks if default tensor type is 
non-CPU 105 | torch.set_default_tensor_type(torch.FloatTensor) 106 | lens = torch.tensor(lens, dtype=torch.int64, device=torch.device('cpu')) 107 | packed_seq = nn.utils.rnn.pack_padded_sequence(x, lens) 108 | torch.set_default_tensor_type(torch.cuda.FloatTensor) 109 | hidden = torch.zeros((num_layers, self.b, self.h), dtype=typ) 110 | output, _ = rnn(packed_seq, hidden) 111 | self.assertEqual(output.data.type(), HALF) 112 | output.data.float().sum().backward() 113 | self.assertEqual(x.grad.dtype, x.dtype) 114 | 115 | if __name__ == '__main__': 116 | unittest.main() 117 | -------------------------------------------------------------------------------- /furnace/apex/tests/run_amp/test_scale.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import functools as ft 4 | import itertools as it 5 | 6 | from apex import amp 7 | import torch 8 | from torch import nn 9 | import torch.nn.functional as F 10 | 11 | from utils import common_init, HALF, FLOAT,\ 12 | ALWAYS_HALF, ALWAYS_FLOAT, MATCH_INPUT 13 | 14 | try: 15 | import amp_C 16 | scale_check_overflow = amp_C.scale_check_overflow 17 | disabled = False 18 | except ImportError as err: 19 | print("amp_C fused kernel unavailable, disabling TestScale. ImportError was ", err) 20 | disabled = True 21 | 22 | 23 | class TestScale(unittest.TestCase): 24 | 25 | def setUp(self): 26 | self.scale = 128.0 27 | self.nx = 999 28 | self.ny = 888 29 | 30 | self.overflow_buf = torch.cuda.IntTensor([0]) 31 | self.fp16 = torch.ones((self.ny, self.nx), device='cuda', dtype=torch.float16) 32 | self.fp32 = torch.ones((self.ny, self.nx), device='cuda', dtype=torch.float32) 33 | self.fp16_ref = torch.ones((1, 1), device='cuda', dtype=torch.float16) 34 | self.fp32_ref = torch.ones((1, 1), device='cuda', dtype=torch.float32) 35 | 36 | common_init(self) 37 | 38 | def tearDown(self): 39 | pass 40 | 41 | def downscale_test(self, input, output, ref): 42 | self.overflow_buf.zero_() 43 | input.fill_(1.0) 44 | if input is not output: 45 | output.fill_(3.0) 46 | input.mul_(self.scale) 47 | scale_check_overflow(input, 1./self.scale, self.overflow_buf, output) 48 | self.assertTrue(torch.allclose(output, ref)) 49 | self.assertTrue(self.overflow_buf.item() == 0) 50 | 51 | def find_inf_test(self, input, output, ref, x, y, val): 52 | self.overflow_buf.zero_() 53 | input.fill_(1.0) 54 | if input is not output: 55 | output.fill_(3.0) 56 | input[x,y] = val 57 | scale_check_overflow(input, 1./self.scale, self.overflow_buf, output) 58 | self.assertTrue(self.overflow_buf.item()) 59 | 60 | # Currently, the fused kernel gives a hard error if you attempt to downscale 61 | # into fp16 output, which imo is the desired behavior. Maybe someday we 62 | # will learn otherwise. 
63 | # @unittest.skipIf(disabled, "amp_C is unavailable") 64 | # def test_fp16_to_fp16(self): 65 | # self.downscale_test(self.fp16, self.fp16, self.fp16_ref) 66 | 67 | @unittest.skipIf(disabled, "amp_C is unavailable") 68 | def test_fp16_to_fp32(self): 69 | self.downscale_test(self.fp16, self.fp32, self.fp32_ref) 70 | 71 | # @unittest.skipIf(disabled, "amp_C is unavailable") 72 | # def test_fp32_to_fp16(self): 73 | # self.downscale_test(self.fp32, self.fp16, self.fp16_ref) 74 | 75 | @unittest.skipIf(disabled, "amp_C is unavailable") 76 | def test_fp32_to_fp32(self): 77 | self.downscale_test(self.fp32, self.fp32, self.fp32_ref) 78 | 79 | @unittest.skipIf(disabled, "amp_C is unavailable") 80 | def test_fp16_to_fp32_find_inf_nan(self): 81 | self.find_inf_test(self.fp16, self.fp32, self.fp32_ref, 0, 0, float('nan')) 82 | self.find_inf_test(self.fp16, self.fp32, self.fp32_ref, self.ny//2, self.nx//2, float('inf')) 83 | self.find_inf_test(self.fp16, self.fp32, self.fp32_ref, self.ny-1, self.nx-1, float('nan')) 84 | 85 | @unittest.skipIf(disabled, "amp_C is unavailable") 86 | def test_fp32_to_fp32_find_inf_nan(self): 87 | self.find_inf_test(self.fp32, self.fp32, self.fp32_ref, 0, 0, float('inf')) 88 | self.find_inf_test(self.fp32, self.fp32, self.fp32_ref, self.ny//2, self.nx//2, float('nan')) 89 | self.find_inf_test(self.fp32, self.fp32, self.fp32_ref, self.ny-1, self.nx-1, float('inf')) 90 | 91 | 92 | if __name__ == '__main__': 93 | unittest.main() 94 | -------------------------------------------------------------------------------- /furnace/apex/tests/run_amp/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | HALF = 'torch.cuda.HalfTensor' 4 | FLOAT = 'torch.cuda.FloatTensor' 5 | 6 | DTYPES = [torch.half, torch.float] 7 | 8 | ALWAYS_HALF = {torch.float: HALF, 9 | torch.half: HALF} 10 | ALWAYS_FLOAT = {torch.float: FLOAT, 11 | torch.half: FLOAT} 12 | MATCH_INPUT = {torch.float: FLOAT, 13 | torch.half: HALF} 14 | 15 | def common_init(test_case): 16 | test_case.h = 64 17 | test_case.b = 16 18 | test_case.c = 16 19 | test_case.k = 3 20 | test_case.t = 10 21 | torch.set_default_tensor_type(torch.cuda.FloatTensor) 22 | -------------------------------------------------------------------------------- /furnace/apex/tests/run_fp16_optimizer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesCXK/TorchSSC/2fd21ad25af92cd9f9ad28de3c4bc897c0ae8b43/furnace/apex/tests/run_fp16_optimizer/__init__.py -------------------------------------------------------------------------------- /furnace/apex/tests/run_fp16_optimizer/test_fp16_optimizer.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import functools as ft 4 | import itertools as it 5 | 6 | import torch 7 | from apex.fp16_utils import FP16_Optimizer 8 | 9 | # Currently no-ops (tested via examples). 10 | # FP16_Optimizer to be deprecated and moved under unified Amp API. 
11 | class TestFP16Optimizer(unittest.TestCase): 12 | def setUp(self): 13 | N, D_in, D_out = 64, 1024, 16 14 | self.N = N 15 | self.D_in = D_in 16 | self.D_out = D_out 17 | self.x = torch.randn((N, D_in), dtype=torch.float16, device='cuda') 18 | self.y = torch.randn((N, D_out), dtype=torch.float16, device='cuda') 19 | self.model = torch.nn.Linear(D_in, D_out).cuda().half() 20 | 21 | # def tearDown(self): 22 | # pass 23 | 24 | def test_minimal(self): 25 | pass 26 | 27 | def test_minimal_static(self): 28 | pass 29 | 30 | def test_minimal_dynamic(self): 31 | pass 32 | 33 | def test_closure(self): 34 | pass 35 | 36 | def test_closure_dynamic(self): 37 | pass 38 | 39 | def test_save_load(self): 40 | pass 41 | 42 | if __name__ == '__main__': 43 | unittest.main() 44 | -------------------------------------------------------------------------------- /furnace/apex/tests/run_fp16util/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesCXK/TorchSSC/2fd21ad25af92cd9f9ad28de3c4bc897c0ae8b43/furnace/apex/tests/run_fp16util/__init__.py -------------------------------------------------------------------------------- /furnace/apex/tests/run_fp16util/test_fp16util.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | from apex.fp16_utils import FP16Model 7 | 8 | 9 | class DummyBlock(nn.Module): 10 | def __init__(self): 11 | super(DummyBlock, self).__init__() 12 | 13 | self.conv = nn.Conv2d(10, 10, 2) 14 | self.bn = nn.BatchNorm2d(10, affine=True) 15 | 16 | def forward(self, x): 17 | return self.conv(self.bn(x)) 18 | 19 | 20 | class DummyNet(nn.Module): 21 | def __init__(self): 22 | super(DummyNet, self).__init__() 23 | 24 | self.conv1 = nn.Conv2d(3, 10, 2) 25 | self.bn1 = nn.BatchNorm2d(10, affine=False) 26 | self.db1 = DummyBlock() 27 | self.db2 = DummyBlock() 28 | 29 | def forward(self, x): 30 | out = x 31 | out = self.conv1(out) 32 | out = self.bn1(out) 33 | out = self.db1(out) 34 | out = self.db2(out) 35 | return out 36 | 37 | 38 | class DummyNetWrapper(nn.Module): 39 | def __init__(self): 40 | super(DummyNetWrapper, self).__init__() 41 | 42 | self.bn = nn.BatchNorm2d(3, affine=True) 43 | self.dn = DummyNet() 44 | 45 | def forward(self, x): 46 | return self.dn(self.bn(x)) 47 | 48 | 49 | class TestFP16Model(unittest.TestCase): 50 | def setUp(self): 51 | self.N = 64 52 | self.C_in = 3 53 | self.H_in = 16 54 | self.W_in = 32 55 | self.in_tensor = torch.randn((self.N, self.C_in, self.H_in, self.W_in)).cuda() 56 | self.orig_model = DummyNetWrapper().cuda() 57 | self.fp16_model = FP16Model(self.orig_model) 58 | 59 | def test_params_and_buffers(self): 60 | exempted_modules = [ 61 | self.fp16_model.network.bn, 62 | self.fp16_model.network.dn.db1.bn, 63 | self.fp16_model.network.dn.db2.bn, 64 | ] 65 | for m in self.fp16_model.modules(): 66 | expected_dtype = torch.float if (m in exempted_modules) else torch.half 67 | for p in m.parameters(recurse=False): 68 | assert p.dtype == expected_dtype 69 | for b in m.buffers(recurse=False): 70 | assert b.dtype in (expected_dtype, torch.int64) 71 | 72 | def test_output_is_half(self): 73 | out_tensor = self.fp16_model(self.in_tensor) 74 | assert out_tensor.dtype == torch.half 75 | 76 | -------------------------------------------------------------------------------- /furnace/apex/tests/run_mixed_adam/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesCXK/TorchSSC/2fd21ad25af92cd9f9ad28de3c4bc897c0ae8b43/furnace/apex/tests/run_mixed_adam/__init__.py -------------------------------------------------------------------------------- /furnace/apex/tests/run_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import sys 3 | 4 | test_dirs = ["run_amp", "run_fp16util", "run_mixed_adam"] 5 | 6 | runner = unittest.TextTestRunner(verbosity=2) 7 | 8 | errcode = 0 9 | 10 | for test_dir in test_dirs: 11 | suite = unittest.TestLoader().discover(test_dir) 12 | 13 | print("\nExecuting tests from " + test_dir) 14 | 15 | result = runner.run(suite) 16 | 17 | if not result.wasSuccessful(): 18 | errcode = 1 19 | 20 | sys.exit(errcode) 21 | -------------------------------------------------------------------------------- /furnace/apex/tests/synced_batchnorm/unit_test.sh: -------------------------------------------------------------------------------- 1 | python single_gpu_unit_test.py 2 | python -m torch.distributed.launch --nproc_per_node=2 two_gpu_unit_test.py 3 | python -m torch.distributed.launch --nproc_per_node=2 two_gpu_unit_test.py --fp64 4 | #beware, you need a system with at least 4 gpus to test group_size 1 else False 87 | layers.append(block(self.in_channels, mid_out_channels, has_proj, 88 | stride=stride, norm_layer=norm_layer)) 89 | self.in_channels = mid_out_channels * block.expansion 90 | for i in range(1, blocks): 91 | layers.append(block(self.in_channels, mid_out_channels, 92 | has_proj=False, stride=1, 93 | norm_layer=norm_layer)) 94 | 95 | return nn.Sequential(*layers) 96 | 97 | def forward(self, x): 98 | x = self.conv1(x) 99 | x = self.maxpool(x) 100 | 101 | blocks = [] 102 | x = self.layer1(x); 103 | blocks.append(x) 104 | x = self.layer2(x); 105 | blocks.append(x) 106 | x = self.layer3(x); 107 | blocks.append(x) 108 | 109 | return blocks 110 | 111 | 112 | def xception39(pretrained_model=None, **kwargs): 113 | model = Xception(Block, [4, 8, 4], [16, 32, 64], **kwargs) 114 | 115 | if pretrained_model is not None: 116 | model = load_model(model, pretrained_model) 117 | return model 118 | -------------------------------------------------------------------------------- /furnace/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesCXK/TorchSSC/2fd21ad25af92cd9f9ad28de3c4bc897c0ae8b43/furnace/datasets/__init__.py -------------------------------------------------------------------------------- /furnace/engine/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesCXK/TorchSSC/2fd21ad25af92cd9f9ad28de3c4bc897c0ae8b43/furnace/engine/__init__.py -------------------------------------------------------------------------------- /furnace/engine/logger.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # encoding: utf-8 3 | # @Time : 2018/8/2 上午11:48 4 | # @Author : yuchangqian 5 | # @Contact : changqian_yu@163.com 6 | # @File : logger.py 7 | import os 8 | import sys 9 | import logging 10 | 11 | from utils import pyt_utils 12 | # from utils.pyt_utils import ensure_dir 13 | 14 | _default_level_name = os.getenv('ENGINE_LOGGING_LEVEL', 'INFO') 15 | _default_level = logging.getLevelName(_default_level_name.upper()) 16 | 17 | 18 | class 
LogFormatter(logging.Formatter): 19 | log_fout = None 20 | date_full = '[%(asctime)s %(lineno)d@%(filename)s:%(name)s] ' 21 | date = '%(asctime)s ' 22 | msg = '%(message)s' 23 | 24 | def format(self, record): 25 | if record.levelno == logging.DEBUG: 26 | mcl, mtxt = self._color_dbg, 'DBG' 27 | elif record.levelno == logging.WARNING: 28 | mcl, mtxt = self._color_warn, 'WRN' 29 | elif record.levelno == logging.ERROR: 30 | mcl, mtxt = self._color_err, 'ERR' 31 | else: 32 | mcl, mtxt = self._color_normal, '' 33 | 34 | if mtxt: 35 | mtxt += ' ' 36 | 37 | if self.log_fout: 38 | self.__set_fmt(self.date_full + mtxt + self.msg) 39 | formatted = super(LogFormatter, self).format(record) 40 | # self.log_fout.write(formatted) 41 | # self.log_fout.write('\n') 42 | # self.log_fout.flush() 43 | return formatted 44 | 45 | self.__set_fmt(self._color_date(self.date) + mcl(mtxt + self.msg)) 46 | formatted = super(LogFormatter, self).format(record) 47 | 48 | return formatted 49 | 50 | if sys.version_info.major < 3: 51 | def __set_fmt(self, fmt): 52 | self._fmt = fmt 53 | else: 54 | def __set_fmt(self, fmt): 55 | self._style._fmt = fmt 56 | 57 | @staticmethod 58 | def _color_dbg(msg): 59 | return '\x1b[36m{}\x1b[0m'.format(msg) 60 | 61 | @staticmethod 62 | def _color_warn(msg): 63 | return '\x1b[1;31m{}\x1b[0m'.format(msg) 64 | 65 | @staticmethod 66 | def _color_err(msg): 67 | return '\x1b[1;4;31m{}\x1b[0m'.format(msg) 68 | 69 | @staticmethod 70 | def _color_omitted(msg): 71 | return '\x1b[35m{}\x1b[0m'.format(msg) 72 | 73 | @staticmethod 74 | def _color_normal(msg): 75 | return msg 76 | 77 | @staticmethod 78 | def _color_date(msg): 79 | return '\x1b[32m{}\x1b[0m'.format(msg) 80 | 81 | 82 | def get_logger(log_dir=None, log_file=None, formatter=LogFormatter): 83 | logger = logging.getLogger() 84 | logger.setLevel(_default_level) 85 | del logger.handlers[:] 86 | 87 | if log_dir and log_file: 88 | pyt_utils.ensure_dir(log_dir) 89 | LogFormatter.log_fout = True 90 | file_handler = logging.FileHandler(log_file, mode='a') 91 | file_handler.setLevel(logging.INFO) 92 | file_handler.setFormatter(formatter) 93 | logger.addHandler(file_handler) 94 | 95 | stream_handler = logging.StreamHandler() 96 | stream_handler.setFormatter(formatter(datefmt='%d %H:%M:%S')) 97 | stream_handler.setLevel(0) 98 | logger.addHandler(stream_handler) 99 | return logger 100 | -------------------------------------------------------------------------------- /furnace/engine/lr_policy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # encoding: utf-8 3 | # @Time : 2018/8/1 上午1:50 4 | # @Author : yuchangqian 5 | # @Contact : changqian_yu@163.com 6 | # @File : lr_policy.py.py 7 | 8 | from abc import ABCMeta, abstractmethod 9 | 10 | 11 | class BaseLR(): 12 | __metaclass__ = ABCMeta 13 | 14 | @abstractmethod 15 | def get_lr(self, cur_iter): pass 16 | 17 | 18 | class PolyLR(BaseLR): 19 | def __init__(self, start_lr, lr_power, total_iters): 20 | self.start_lr = start_lr 21 | self.lr_power = lr_power 22 | self.total_iters = total_iters + 0.0 23 | 24 | def get_lr(self, cur_iter): 25 | return self.start_lr * ( 26 | (1 - float(cur_iter) / self.total_iters) ** self.lr_power) 27 | 28 | class WarmUpPolyLR(BaseLR): 29 | def __init__(self, start_lr, lr_power, total_iters, warmup_steps): 30 | self.start_lr = start_lr 31 | self.lr_power = lr_power 32 | self.total_iters = total_iters + 0.0 33 | self.warmup_steps = warmup_steps 34 | 35 | def get_lr(self, cur_iter): 36 | if cur_iter < 
self.warmup_steps: 37 | return self.start_lr * (cur_iter / self.warmup_steps) 38 | else: 39 | return self.start_lr * ( 40 | (1 - float(cur_iter) / self.total_iters) ** self.lr_power) 41 | 42 | class MultiStageLR(BaseLR): 43 | def __init__(self, lr_stages): 44 | assert type(lr_stages) in [list, tuple] and len(lr_stages[0]) == 2, \ 45 | 'lr_stages must be list or tuple, with [iters, lr] format' 46 | self._lr_stagess = lr_stages 47 | 48 | def get_lr(self, epoch): 49 | for it_lr in self._lr_stagess: 50 | if epoch < it_lr[0]: 51 | return it_lr[1] 52 | 53 | 54 | class LinearIncreaseLR(BaseLR): 55 | def __init__(self, start_lr, end_lr, warm_iters): 56 | self._start_lr = start_lr 57 | self._end_lr = end_lr 58 | self._warm_iters = warm_iters 59 | self._delta_lr = (end_lr - start_lr) / warm_iters 60 | 61 | def get_lr(self, cur_epoch): 62 | return self._start_lr + cur_epoch * self._delta_lr 63 | 64 | 65 | -------------------------------------------------------------------------------- /furnace/engine/version.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # encoding: utf-8 3 | # @Time : 2018/8/3 下午2:59 4 | # @Author : yuchangqian 5 | # @Contact : changqian_yu@163.com 6 | # @File : version.py 7 | 8 | __version__ = '0.1.1' -------------------------------------------------------------------------------- /furnace/seg_opr/__init__.py: -------------------------------------------------------------------------------- 1 | from .seg_oprs import * 2 | -------------------------------------------------------------------------------- /furnace/seg_opr/metric.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | import numpy as np 4 | 5 | np.seterr(divide='ignore', invalid='ignore') 6 | 7 | 8 | # voc cityscapes metric 9 | def hist_info(n_cl, pred, gt): 10 | assert (pred.shape == gt.shape) 11 | k = (gt >= 0) & (gt < n_cl) 12 | labeled = np.sum(k) 13 | correct = np.sum((pred[k] == gt[k])) 14 | 15 | return np.bincount(n_cl * gt[k].astype(int) + pred[k].astype(int), 16 | minlength=n_cl ** 2).reshape(n_cl, 17 | n_cl), labeled, correct 18 | 19 | 20 | def compute_score(hist, correct, labeled): 21 | iu = np.diag(hist) / (hist.sum(1) + hist.sum(0) - np.diag(hist)) 22 | mean_IU = np.nanmean(iu) 23 | mean_IU_no_back = np.nanmean(iu[1:]) 24 | freq = hist.sum(1) / hist.sum() 25 | freq_IU = (iu[freq > 0] * freq[freq > 0]).sum() 26 | mean_pixel_acc = correct / labeled 27 | 28 | return iu, mean_IU, mean_IU_no_back, mean_pixel_acc 29 | 30 | 31 | # ade metric 32 | def meanIoU(area_intersection, area_union): 33 | iou = 1.0 * np.sum(area_intersection, axis=1) / np.sum(area_union, axis=1) 34 | meaniou = np.nanmean(iou) 35 | meaniou_no_back = np.nanmean(iou[1:]) 36 | 37 | return iou, meaniou, meaniou_no_back 38 | 39 | 40 | def intersectionAndUnion(imPred, imLab, numClass): 41 | # Remove classes from unlabeled pixels in gt image. 42 | # We should not penalize detections in unlabeled portions of the image. 
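    # Added note: pixels with a negative label zero out the corresponding predictions here, and
    # the histograms below only count values in [1, numClass], so such pixels contribute to
    # neither intersection nor union.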
43 | imPred = imPred * (imLab >= 0) 44 | 45 | # Compute area intersection: 46 | intersection = imPred * (imPred == imLab) 47 | (area_intersection, _) = np.histogram(intersection, bins=numClass, 48 | range=(1, numClass)) 49 | 50 | # Compute area union: 51 | (area_pred, _) = np.histogram(imPred, bins=numClass, range=(1, numClass)) 52 | (area_lab, _) = np.histogram(imLab, bins=numClass, range=(1, numClass)) 53 | area_union = area_pred + area_lab - area_intersection 54 | 55 | return area_intersection, area_union 56 | 57 | 58 | def mean_pixel_accuracy(pixel_correct, pixel_labeled): 59 | mean_pixel_accuracy = 1.0 * np.sum(pixel_correct) / ( 60 | np.spacing(1) + np.sum(pixel_labeled)) 61 | 62 | return mean_pixel_accuracy 63 | 64 | 65 | def pixelAccuracy(imPred, imLab): 66 | # Remove classes from unlabeled pixels in gt image. 67 | # We should not penalize detections in unlabeled portions of the image. 68 | pixel_labeled = np.sum(imLab >= 0) 69 | pixel_correct = np.sum((imPred == imLab) * (imLab >= 0)) 70 | pixel_accuracy = 1.0 * pixel_correct / pixel_labeled 71 | 72 | return pixel_accuracy, pixel_correct, pixel_labeled 73 | -------------------------------------------------------------------------------- /furnace/seg_opr/parallel/parallel_apply.py: -------------------------------------------------------------------------------- 1 | # import threading 2 | import torch 3 | import torch.multiprocessing as mp 4 | from torch.cuda._utils import _get_device_index 5 | 6 | 7 | def get_a_var(obj): 8 | if isinstance(obj, torch.Tensor): 9 | return obj 10 | 11 | if isinstance(obj, list) or isinstance(obj, tuple): 12 | for result in map(get_a_var, obj): 13 | if isinstance(result, torch.Tensor): 14 | return result 15 | if isinstance(obj, dict): 16 | for result in map(get_a_var, obj.items()): 17 | if isinstance(result, torch.Tensor): 18 | return result 19 | return None 20 | 21 | 22 | def parallel_apply(modules, inputs, kwargs_tup=None, devices=None): 23 | r"""Applies each `module` in :attr:`modules` in parallel on arguments 24 | contained in :attr:`inputs` (positional) and :attr:`kwargs_tup` (keyword) 25 | on each of :attr:`devices`. 26 | Args: 27 | modules (Module): modules to be parallelized 28 | inputs (tensor): inputs to the modules 29 | devices (list of int or torch.device): CUDA devices 30 | :attr:`modules`, :attr:`inputs`, :attr:`kwargs_tup` (if given), and 31 | :attr:`devices` (if given) should all have same length. Moreover, each 32 | element of :attr:`inputs` can either be a single object as the only argument 33 | to a module, or a collection of positional arguments. 
34 | """ 35 | assert len(modules) == len(inputs) 36 | if kwargs_tup is not None: 37 | assert len(modules) == len(kwargs_tup) 38 | else: 39 | kwargs_tup = ({},) * len(modules) 40 | if devices is not None: 41 | assert len(modules) == len(devices) 42 | else: 43 | devices = [None] * len(modules) 44 | devices = list(map(lambda x: _get_device_index(x, True), devices)) 45 | context = mp.get_context('spawn') 46 | # lock = threading.Lock() 47 | # results = {} 48 | # results = [] 49 | results_queue = context.Queue(len(devices)) 50 | grad_enabled = torch.is_grad_enabled() 51 | 52 | def _worker(i, module, input, kwargs, device=None): 53 | torch.set_grad_enabled(grad_enabled) 54 | if device is None: 55 | device = get_a_var(input).get_device() 56 | try: 57 | with torch.cuda.device(device): 58 | # this also avoids accidental slicing of `input` if it is a Tensor 59 | if not isinstance(input, (list, tuple)): 60 | input = (input,) 61 | output = module(*input, **kwargs) 62 | results_queue.put(output) 63 | # with lock: 64 | # results[i] = output 65 | except Exception as e: 66 | results_queue.put(e) 67 | # with lock: 68 | # results[i] = e 69 | 70 | if len(modules) > 1: 71 | processes = [context.Process(target=_worker, 72 | args=(i, module, input, kwargs, device)) 73 | for i, (module, input, kwargs, device) in 74 | enumerate(zip(modules, inputs, kwargs_tup, devices))] 75 | 76 | for process in processes: 77 | process.start() 78 | for process in processes: 79 | process.join() 80 | else: 81 | _worker(0, modules[0], inputs[0], kwargs_tup[0], devices[0]) 82 | 83 | outputs = [] 84 | for i in range(len(inputs)): 85 | output = results_queue.get() 86 | if isinstance(output, Exception): 87 | raise output 88 | outputs.append(output) 89 | return outputs 90 | -------------------------------------------------------------------------------- /furnace/seg_opr/sgd.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # encoding: utf-8 3 | # @Time : 2018/9/12 下午3:03 4 | # @Author : yuchangqian 5 | # @Contact : changqian_yu@163.com 6 | # @File : sgd.py 7 | 8 | import torch 9 | from torch.optim.sgd import SGD 10 | 11 | 12 | class StandardSGD(SGD): 13 | def step(self, closure=None): 14 | """Performs a single optimization step. 15 | Arguments: 16 | closure (callable, optional): A closure that reevaluates the model 17 | and returns the loss. 
18 | """ 19 | loss = None 20 | if closure is not None: 21 | loss = closure() 22 | 23 | for group in self.param_groups: 24 | weight_decay = group['weight_decay'] 25 | momentum = group['momentum'] 26 | dampening = group['dampening'] 27 | nesterov = group['nesterov'] 28 | 29 | for p in group['params']: 30 | if p.grad is None: 31 | continue 32 | d_p = p.grad.data 33 | if weight_decay != 0: 34 | d_p.add_(weight_decay, p.data) 35 | d_p.mul_(group['lr']) 36 | if momentum != 0: 37 | param_state = self.state[p] 38 | if 'momentum_buffer' not in param_state: 39 | buf = param_state['momentum_buffer'] = torch.zeros_like( 40 | p.data) 41 | buf.mul_(momentum).add_(d_p) 42 | else: 43 | buf = param_state['momentum_buffer'] 44 | buf.mul_(momentum).add_(1 - dampening, d_p) 45 | if nesterov: 46 | d_p = d_p.add(momentum, buf) 47 | else: 48 | d_p = buf 49 | 50 | p.data.add_(-1, d_p) 51 | 52 | return loss -------------------------------------------------------------------------------- /furnace/seg_opr/sync_bn/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # encoding: utf-8 3 | # @Time : 2018/10/3 下午2:10 4 | # @Author : yuchangqian 5 | # @Contact : changqian_yu@163.com 6 | # @File : __init__.py 7 | 8 | from .syncbn import * 9 | from .parallel import * -------------------------------------------------------------------------------- /furnace/seg_opr/sync_bn/comm.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # File : comm.py 3 | # Author : Jiayuan Mao 4 | # Email : maojiayuan@gmail.com 5 | # Date : 27/01/2018 6 | # 7 | # This file is part of Synchronized-BatchNorm-PyTorch. 8 | # https://github.com/vacancy/Synchronized-BatchNorm-PyTorch 9 | # Distributed under MIT License. 10 | 11 | import queue 12 | import collections 13 | import threading 14 | 15 | 16 | __all__ = ['FutureResult', 'SlavePipe', 'SyncMaster'] 17 | 18 | 19 | class FutureResult(object): 20 | """A thread-safe future implementation. Used only as one-to-one pipe.""" 21 | 22 | def __init__(self): 23 | self._result = None 24 | self._lock = threading.Lock() 25 | self._cond = threading.Condition(self._lock) 26 | 27 | def put(self, result): 28 | with self._lock: 29 | assert self._result is None, 'Previous result has\'t been fetched.' 30 | self._result = result 31 | self._cond.notify() 32 | 33 | def get(self): 34 | with self._lock: 35 | if self._result is None: 36 | self._cond.wait() 37 | 38 | res = self._result 39 | self._result = None 40 | return res 41 | 42 | 43 | _MasterRegistry = collections.namedtuple('MasterRegistry', ['result']) 44 | _SlavePipeBase = collections.namedtuple('_SlavePipeBase', ['identifier', 'queue', 'result']) 45 | 46 | 47 | class SlavePipe(_SlavePipeBase): 48 | """Pipe for master-slave communication.""" 49 | 50 | def run_slave(self, msg): 51 | self.queue.put((self.identifier, msg)) 52 | ret = self.result.get() 53 | self.queue.put(True) 54 | return ret 55 | 56 | 57 | class SyncMaster(object): 58 | """An abstract `SyncMaster` object. 59 | 60 | - During the replication, as the data parallel will trigger an callback of each module, all slave devices should 61 | call `register(id)` and obtain an `SlavePipe` to communicate with the master. 62 | - During the forward pass, master device invokes `run_master`, all messages from slave devices will be collected, 63 | and passed to a registered callback. 
64 | - After receiving the messages, the master device should gather the information and determine the message to be passed 65 | back to each slave device. 66 | """ 67 | 68 | def __init__(self, master_callback): 69 | """ 70 | 71 | Args: 72 | master_callback: a callback to be invoked after having collected messages from slave devices. 73 | """ 74 | self._master_callback = master_callback 75 | self._queue = queue.Queue() 76 | self._registry = collections.OrderedDict() 77 | self._activated = False 78 | 79 | def register_slave(self, identifier): 80 | """ 81 | Register a slave device. 82 | 83 | Args: 84 | identifier: an identifier, usually the device id. 85 | 86 | Returns: a `SlavePipe` object which can be used to communicate with the master device. 87 | 88 | """ 89 | if self._activated: 90 | assert self._queue.empty(), 'Queue is not clean before next initialization.' 91 | self._activated = False 92 | self._registry.clear() 93 | future = FutureResult() 94 | self._registry[identifier] = _MasterRegistry(future) 95 | return SlavePipe(identifier, self._queue, future) 96 | 97 | def run_master(self, master_msg): 98 | """ 99 | Main entry for the master device in each forward pass. 100 | The messages are first collected from each device (including the master device), and then 101 | a callback will be invoked to compute the message to be sent back to each device 102 | (including the master device). 103 | 104 | Args: 105 | master_msg: the message that the master wants to send to itself. This will be placed as the first 106 | message when calling `master_callback`. For detailed usage, see `_SynchronizedBatchNorm` for an example. 107 | 108 | Returns: the message to be sent back to the master device. 109 | 110 | """ 111 | self._activated = True 112 | 113 | intermediates = [(0, master_msg)] 114 | for i in range(self.nr_slaves): 115 | intermediates.append(self._queue.get()) 116 | 117 | results = self._master_callback(intermediates) 118 | assert results[0][0] == 0, 'The first result should belong to the master.'
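# Send every non-master result back through the corresponding slave's FutureResult; each slave is blocked in run_slave() until this put() happens.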
119 | 120 | for i, res in results: 121 | if i == 0: 122 | continue 123 | self._registry[i].result.put(res) 124 | 125 | for i in range(self.nr_slaves): 126 | assert self._queue.get() is True 127 | 128 | return results[0][1] 129 | 130 | @property 131 | def nr_slaves(self): 132 | return len(self._registry) -------------------------------------------------------------------------------- /furnace/seg_opr/sync_bn/functions.py: -------------------------------------------------------------------------------- 1 | ##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 | ## Created by: Hang Zhang 3 | ## Email: zhanghang0704@gmail.com 4 | ## Copyright (c) 2018 5 | ## 6 | ## This source code is licensed under the MIT-style license found in the 7 | ## LICENSE file in the root directory of this source tree 8 | ##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 9 | 10 | """Synchronized Cross-GPU Batch Normalization functions""" 11 | import torch 12 | from torch.autograd import Variable, Function 13 | from .src import * 14 | 15 | __all__ = ['sum_square', 'batchnormtrain'] 16 | 17 | def sum_square(input): 18 | r"""Calculate sum of elements and sum of squares for Batch Normalization""" 19 | return _sum_square.apply(input) 20 | 21 | 22 | class _sum_square(Function): 23 | @staticmethod 24 | def forward(ctx, input): 25 | ctx.save_for_backward(input) 26 | if input.is_cuda: 27 | xsum, xsqusum = gpu.sumsquare_forward(input) 28 | else: 29 | xsum, xsqusum = cpu.sumsquare_forward(input) 30 | return xsum, xsqusum 31 | 32 | @staticmethod 33 | def backward(ctx, gradSum, gradSquare): 34 | input, = ctx.saved_variables 35 | if input.is_cuda: 36 | gradInput = gpu.sumsquare_backward(input, gradSum, gradSquare) 37 | else: 38 | raise NotImplementedError 39 | return gradInput 40 | 41 | 42 | class _batchnormtrain(Function): 43 | @staticmethod 44 | def forward(ctx, input, mean, std, gamma, beta): 45 | ctx.save_for_backward(input, mean, std, gamma, beta) 46 | if input.is_cuda: 47 | output = gpu.batchnorm_forward(input, mean, std, gamma, beta) 48 | else: 49 | output = cpu.batchnorm_forward(input, mean, std, gamma, beta) 50 | return output 51 | 52 | @staticmethod 53 | def backward(ctx, gradOutput): 54 | input, mean, std, gamma, beta = ctx.saved_variables 55 | if gradOutput.is_cuda: 56 | gradInput, gradMean, gradStd, gradGamma, gradBeta = \ 57 | gpu.batchnorm_backward(gradOutput, input, mean, 58 | std, gamma, beta, True) 59 | else: 60 | raise NotImplementedError 61 | return gradInput, gradMean, gradStd, gradGamma, gradBeta 62 | 63 | 64 | def batchnormtrain(input, mean, std, gamma, beta): 65 | r"""Applies Batch Normalization over a 3d input that is seen as a 66 | mini-batch. 67 | 68 | .. _encoding.batchnormtrain: 69 | 70 | .. 
math:: 71 | 72 | y = \frac{x - \mu[x]}{ \sqrt{var[x] + \epsilon}} * \gamma + \beta 73 | 74 | Shape: 75 | - Input: :math:`(N, C)` or :math:`(N, C, L)` 76 | - Output: :math:`(N, C)` or :math:`(N, C, L)` (same shape as input) 77 | 78 | """ 79 | return _batchnormtrain.apply(input, mean, std, gamma, beta) 80 | -------------------------------------------------------------------------------- /furnace/seg_opr/sync_bn/parallel_apply.py: -------------------------------------------------------------------------------- 1 | # import threading 2 | import queue 3 | import torch 4 | import torch.multiprocessing as mp 5 | # from pathos.multiprocessing import ProcessPool as Pool 6 | from torch.cuda._utils import _get_device_index 7 | 8 | 9 | def get_a_var(obj): 10 | if isinstance(obj, torch.Tensor): 11 | return obj 12 | 13 | if isinstance(obj, list) or isinstance(obj, tuple): 14 | for result in map(get_a_var, obj): 15 | if isinstance(result, torch.Tensor): 16 | return result 17 | if isinstance(obj, dict): 18 | for result in map(get_a_var, obj.items()): 19 | if isinstance(result, torch.Tensor): 20 | return result 21 | return None 22 | 23 | 24 | def parallel_apply(modules, inputs, kwargs_tup=None, devices=None): 25 | r"""Applies each `module` in :attr:`modules` in parallel on arguments 26 | contained in :attr:`inputs` (positional) and :attr:`kwargs_tup` (keyword) 27 | on each of :attr:`devices`. 28 | Args: 29 | modules (Module): modules to be parallelized 30 | inputs (tensor): inputs to the modules 31 | devices (list of int or torch.device): CUDA devices 32 | :attr:`modules`, :attr:`inputs`, :attr:`kwargs_tup` (if given), and 33 | :attr:`devices` (if given) should all have same length. Moreover, each 34 | element of :attr:`inputs` can either be a single object as the only argument 35 | to a module, or a collection of positional arguments. 
36 | """ 37 | assert len(modules) == len(inputs) 38 | if kwargs_tup is not None: 39 | assert len(modules) == len(kwargs_tup) 40 | else: 41 | kwargs_tup = ({},) * len(modules) 42 | if devices is not None: 43 | assert len(modules) == len(devices) 44 | else: 45 | devices = [None] * len(modules) 46 | devices = list(map(lambda x: _get_device_index(x, True), devices)) 47 | context = mp.get_context('spawn') 48 | # lock = threading.Lock() 49 | # results = {} 50 | # results = [] 51 | # pool = context.Pool(len(devices)) 52 | results_queue = queue.Queue(len(devices)) 53 | grad_enabled = torch.is_grad_enabled() 54 | 55 | def _worker(module, input, kwargs, device=None): 56 | torch.set_grad_enabled(grad_enabled) 57 | if device is None: 58 | device = get_a_var(input).get_device() 59 | try: 60 | with torch.cuda.device(device): 61 | # this also avoids accidental slicing of `input` if it is a Tensor 62 | if not isinstance(input, (list, tuple)): 63 | input = (input,) 64 | output = module(*input, **kwargs) 65 | results_queue.put(output) 66 | # with lock: 67 | # results[i] = output 68 | except Exception as e: 69 | results_queue.put(e) 70 | # with lock: 71 | # results[i] = e 72 | 73 | if len(modules) > 1: 74 | # pool.map(_worker, [modules, inputs, kwargs_tup, devices]) 75 | processes = [context.Process(target=_worker, 76 | args=(i, module, input, kwargs, device)) 77 | for i, (module, input, kwargs, device) in 78 | enumerate(zip(modules, inputs, kwargs_tup, devices))] 79 | 80 | for process in processes: 81 | process.start() 82 | for process in processes: 83 | process.join() 84 | else: 85 | _worker(0, modules[0], inputs[0], kwargs_tup[0], devices[0]) 86 | 87 | outputs = [] 88 | for i in range(len(inputs)): 89 | output = results_queue.get() 90 | if isinstance(output, Exception): 91 | raise output 92 | outputs.append(output) 93 | return outputs 94 | -------------------------------------------------------------------------------- /furnace/seg_opr/sync_bn/src/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from torch.utils.cpp_extension import load 4 | 5 | cwd = os.path.dirname(os.path.realpath(__file__)) 6 | cpu_path = os.path.join(cwd, 'cpu') 7 | gpu_path = os.path.join(cwd, 'gpu') 8 | 9 | cpu = load('syncbn_cpu', [ 10 | os.path.join(cpu_path, 'operator.cpp'), 11 | os.path.join(cpu_path, 'syncbn_cpu.cpp'), 12 | ], build_directory=cpu_path, verbose=False) 13 | 14 | if torch.cuda.is_available(): 15 | gpu = load('syncbn_gpu', [ 16 | os.path.join(gpu_path, 'operator.cpp'), 17 | os.path.join(gpu_path, 'syncbn_kernel.cu'), 18 | ], build_directory=gpu_path, verbose=False) 19 | -------------------------------------------------------------------------------- /furnace/seg_opr/sync_bn/src/cpu/.ninja_deps: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesCXK/TorchSSC/2fd21ad25af92cd9f9ad28de3c4bc897c0ae8b43/furnace/seg_opr/sync_bn/src/cpu/.ninja_deps -------------------------------------------------------------------------------- /furnace/seg_opr/sync_bn/src/cpu/.ninja_log: -------------------------------------------------------------------------------- 1 | # ninja log v5 2 | 1 3006 1563332513 syncbn_cpu.o 486ee2c6335a262c 3 | 1 6096 1563332516 operator.o df1c06f439a829e3 4 | 6096 6262 1563332517 syncbn_cpu.so 7b7138baea8e4fe0 5 | 0 3376 1578576757073544196 syncbn_cpu.o 238aaa649062d1c 6 | 0 4373 1578576758073181846 operator.o eedcce4cadeab94a 7 | 4373 4493 
1578576758193138364 syncbn_cpu.so 7b7138baea8e4fe0 8 | -------------------------------------------------------------------------------- /furnace/seg_opr/sync_bn/src/cpu/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesCXK/TorchSSC/2fd21ad25af92cd9f9ad28de3c4bc897c0ae8b43/furnace/seg_opr/sync_bn/src/cpu/__init__.py -------------------------------------------------------------------------------- /furnace/seg_opr/sync_bn/src/cpu/build.ninja: -------------------------------------------------------------------------------- 1 | ninja_required_version = 1.3 2 | cxx = c++ 3 | 4 | cflags = -DTORCH_EXTENSION_NAME=syncbn_cpu -DTORCH_API_INCLUDE_EXTENSION_H -isystem /home/SENSETIME/chenxiaokang/anaconda3/lib/python3.6/site-packages/torch/lib/include -isystem /home/SENSETIME/chenxiaokang/anaconda3/lib/python3.6/site-packages/torch/lib/include/torch/csrc/api/include -isystem /home/SENSETIME/chenxiaokang/anaconda3/lib/python3.6/site-packages/torch/lib/include/TH -isystem /home/SENSETIME/chenxiaokang/anaconda3/lib/python3.6/site-packages/torch/lib/include/THC -isystem /home/SENSETIME/chenxiaokang/anaconda3/include/python3.6m -D_GLIBCXX_USE_CXX11_ABI=0 -fPIC -std=c++11 5 | ldflags = -shared 6 | 7 | rule compile 8 | command = $cxx -MMD -MF $out.d $cflags -c $in -o $out 9 | depfile = $out.d 10 | deps = gcc 11 | 12 | rule link 13 | command = $cxx $in $ldflags -o $out 14 | 15 | build operator.o: compile /home/SENSETIME/chenxiaokang/Desktop/seg/torchseg/furnace/seg_opr/sync_bn/src/cpu/operator.cpp 16 | build syncbn_cpu.o: compile /home/SENSETIME/chenxiaokang/Desktop/seg/torchseg/furnace/seg_opr/sync_bn/src/cpu/syncbn_cpu.cpp 17 | 18 | build syncbn_cpu.so: link operator.o syncbn_cpu.o 19 | 20 | default syncbn_cpu.so 21 | 22 | -------------------------------------------------------------------------------- /furnace/seg_opr/sync_bn/src/cpu/dist/syncbn_cpu-0.0.0-py3.6-linux-x86_64.egg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesCXK/TorchSSC/2fd21ad25af92cd9f9ad28de3c4bc897c0ae8b43/furnace/seg_opr/sync_bn/src/cpu/dist/syncbn_cpu-0.0.0-py3.6-linux-x86_64.egg -------------------------------------------------------------------------------- /furnace/seg_opr/sync_bn/src/cpu/operator.cpp: -------------------------------------------------------------------------------- 1 | #include "operator.h" 2 | 3 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 4 | m.def("batchnorm_forward", &BatchNorm_Forward_CPU, "BatchNorm forward (CPU)"); 5 | m.def("batchnorm_backward", &BatchNorm_Backward_CPU, "BatchNorm backward (CPU)"); 6 | m.def("sumsquare_forward", &Sum_Square_Forward_CPU, "SumSqu forward (CPU)"); 7 | m.def("sumsquare_backward", &Sum_Square_Backward_CPU, "SumSqu backward (CPU)"); 8 | } 9 | -------------------------------------------------------------------------------- /furnace/seg_opr/sync_bn/src/cpu/operator.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | at::Tensor BatchNorm_Forward_CPU( 5 | const at::Tensor input_, 6 | const at::Tensor mean_, 7 | const at::Tensor std_, 8 | const at::Tensor gamma_, 9 | const at::Tensor beta_); 10 | 11 | std::vector BatchNorm_Backward_CPU( 12 | const at::Tensor gradoutput_, 13 | const at::Tensor input_, 14 | const at::Tensor mean_, 15 | const at::Tensor std_, 16 | const at::Tensor gamma_, 17 | const at::Tensor beta_, 18 | bool train); 19 | 20 | std::vector 
Sum_Square_Forward_CPU( 21 | const at::Tensor input_); 22 | 23 | at::Tensor Sum_Square_Backward_CPU( 24 | const at::Tensor input_, 25 | const at::Tensor gradSum_, 26 | const at::Tensor gradSquare_); -------------------------------------------------------------------------------- /furnace/seg_opr/sync_bn/src/cpu/operator.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesCXK/TorchSSC/2fd21ad25af92cd9f9ad28de3c4bc897c0ae8b43/furnace/seg_opr/sync_bn/src/cpu/operator.o -------------------------------------------------------------------------------- /furnace/seg_opr/sync_bn/src/cpu/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | from torch.utils.cpp_extension import BuildExtension, CppExtension 3 | 4 | setup( 5 | name='syncbn_cpu', 6 | ext_modules=[ 7 | CppExtension('syncbn_cpu', [ 8 | 'operator.cpp', 9 | 'syncbn_cpu.cpp', 10 | ]), 11 | ], 12 | cmdclass={ 13 | 'build_ext': BuildExtension 14 | }) 15 | -------------------------------------------------------------------------------- /furnace/seg_opr/sync_bn/src/cpu/syncbn_cpu.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | at::Tensor broadcast_to(at::Tensor v, at::Tensor x) { 6 | if (x.ndimension() == 2) { 7 | return v; 8 | } else { 9 | std::vector broadcast_size = {1, -1}; 10 | for (int64_t i = 2; i < x.ndimension(); ++i) 11 | broadcast_size.push_back(1); 12 | 13 | return v.view(broadcast_size); 14 | } 15 | } 16 | 17 | at::Tensor BatchNorm_Forward_CPU( 18 | const at::Tensor input, 19 | const at::Tensor mean, 20 | const at::Tensor std, 21 | const at::Tensor gamma, 22 | const at::Tensor beta) { 23 | auto output = (input - broadcast_to(mean, input)) / broadcast_to(std, input); 24 | output = output * broadcast_to(gamma, input) + broadcast_to(beta, input); 25 | return output; 26 | } 27 | 28 | // Not implementing CPU backward for now 29 | std::vector BatchNorm_Backward_CPU( 30 | const at::Tensor gradoutput, 31 | const at::Tensor input, 32 | const at::Tensor mean, 33 | const at::Tensor std, 34 | const at::Tensor gamma, 35 | const at::Tensor beta, 36 | bool train) { 37 | /* outputs*/ 38 | at::Tensor gradinput = at::zeros_like(input); 39 | at::Tensor gradgamma = at::zeros_like(gamma); 40 | at::Tensor gradbeta = at::zeros_like(beta); 41 | at::Tensor gradMean = at::zeros_like(mean); 42 | at::Tensor gradStd = at::zeros_like(std); 43 | return {gradinput, gradMean, gradStd, gradgamma, gradbeta}; 44 | } 45 | 46 | std::vector Sum_Square_Forward_CPU( 47 | const at::Tensor input) { 48 | /* outputs */ 49 | at::Tensor sum = torch::zeros({input.size(1)}, input.options()); 50 | at::Tensor square = torch::zeros({input.size(1)}, input.options()); 51 | return {sum, square}; 52 | } 53 | 54 | at::Tensor Sum_Square_Backward_CPU( 55 | const at::Tensor input, 56 | const at::Tensor gradSum, 57 | const at::Tensor gradSquare) { 58 | /* outputs */ 59 | at::Tensor gradInput = at::zeros_like(input); 60 | return gradInput; 61 | } 62 | -------------------------------------------------------------------------------- /furnace/seg_opr/sync_bn/src/cpu/syncbn_cpu.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 1.0 2 | Name: syncbn-cpu 3 | Version: 0.0.0 4 | Summary: UNKNOWN 5 | Home-page: UNKNOWN 6 | Author: UNKNOWN 7 | Author-email: UNKNOWN 8 | License: UNKNOWN 9 | Description: 
UNKNOWN 10 | Platform: UNKNOWN 11 | -------------------------------------------------------------------------------- /furnace/seg_opr/sync_bn/src/cpu/syncbn_cpu.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesCXK/TorchSSC/2fd21ad25af92cd9f9ad28de3c4bc897c0ae8b43/furnace/seg_opr/sync_bn/src/cpu/syncbn_cpu.o -------------------------------------------------------------------------------- /furnace/seg_opr/sync_bn/src/cpu/syncbn_cpu.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesCXK/TorchSSC/2fd21ad25af92cd9f9ad28de3c4bc897c0ae8b43/furnace/seg_opr/sync_bn/src/cpu/syncbn_cpu.so -------------------------------------------------------------------------------- /furnace/seg_opr/sync_bn/src/gpu/.ninja_deps: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesCXK/TorchSSC/2fd21ad25af92cd9f9ad28de3c4bc897c0ae8b43/furnace/seg_opr/sync_bn/src/gpu/.ninja_deps -------------------------------------------------------------------------------- /furnace/seg_opr/sync_bn/src/gpu/.ninja_log: -------------------------------------------------------------------------------- 1 | # ninja log v5 2 | 1 6029 1563332523 operator.o 93fbaee254d44db4 3 | 1 11088 1563332528 syncbn_kernel.cuda.o ec50d81437939f2c 4 | 11088 11258 1563332528 syncbn_gpu.so a2b728e60c853ec3 5 | 0 2904 1578576761208045520 operator.o cecba1516d789115 6 | 0 7039 1578576765338548294 syncbn_kernel.cuda.o a19378e9f1e5d587 7 | 7039 7134 1578576765434513509 syncbn_gpu.so a2b728e60c853ec3 8 | -------------------------------------------------------------------------------- /furnace/seg_opr/sync_bn/src/gpu/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesCXK/TorchSSC/2fd21ad25af92cd9f9ad28de3c4bc897c0ae8b43/furnace/seg_opr/sync_bn/src/gpu/__init__.py -------------------------------------------------------------------------------- /furnace/seg_opr/sync_bn/src/gpu/build.ninja: -------------------------------------------------------------------------------- 1 | ninja_required_version = 1.3 2 | cxx = c++ 3 | nvcc = /usr/local/cuda/bin/nvcc 4 | 5 | cflags = -DTORCH_EXTENSION_NAME=syncbn_gpu -DTORCH_API_INCLUDE_EXTENSION_H -isystem /home/SENSETIME/chenxiaokang/anaconda3/lib/python3.6/site-packages/torch/lib/include -isystem /home/SENSETIME/chenxiaokang/anaconda3/lib/python3.6/site-packages/torch/lib/include/torch/csrc/api/include -isystem /home/SENSETIME/chenxiaokang/anaconda3/lib/python3.6/site-packages/torch/lib/include/TH -isystem /home/SENSETIME/chenxiaokang/anaconda3/lib/python3.6/site-packages/torch/lib/include/THC -isystem /usr/local/cuda/include -isystem /home/SENSETIME/chenxiaokang/anaconda3/include/python3.6m -D_GLIBCXX_USE_CXX11_ABI=0 -fPIC -std=c++11 6 | cuda_flags = -DTORCH_EXTENSION_NAME=syncbn_gpu -DTORCH_API_INCLUDE_EXTENSION_H -isystem /home/SENSETIME/chenxiaokang/anaconda3/lib/python3.6/site-packages/torch/lib/include -isystem /home/SENSETIME/chenxiaokang/anaconda3/lib/python3.6/site-packages/torch/lib/include/torch/csrc/api/include -isystem /home/SENSETIME/chenxiaokang/anaconda3/lib/python3.6/site-packages/torch/lib/include/TH -isystem /home/SENSETIME/chenxiaokang/anaconda3/lib/python3.6/site-packages/torch/lib/include/THC -isystem /usr/local/cuda/include -isystem /home/SENSETIME/chenxiaokang/anaconda3/include/python3.6m 
-D_GLIBCXX_USE_CXX11_ABI=0 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --compiler-options '-fPIC' -std=c++11 7 | ldflags = -shared -L/usr/local/cuda/lib64 -lcudart 8 | 9 | rule compile 10 | command = $cxx -MMD -MF $out.d $cflags -c $in -o $out 11 | depfile = $out.d 12 | deps = gcc 13 | 14 | rule cuda_compile 15 | command = $nvcc $cuda_flags -c $in -o $out 16 | 17 | rule link 18 | command = $cxx $in $ldflags -o $out 19 | 20 | build operator.o: compile /home/SENSETIME/chenxiaokang/Desktop/seg/torchseg/furnace/seg_opr/sync_bn/src/gpu/operator.cpp 21 | build syncbn_kernel.cuda.o: cuda_compile /home/SENSETIME/chenxiaokang/Desktop/seg/torchseg/furnace/seg_opr/sync_bn/src/gpu/syncbn_kernel.cu 22 | 23 | build syncbn_gpu.so: link operator.o syncbn_kernel.cuda.o 24 | 25 | default syncbn_gpu.so 26 | 27 | -------------------------------------------------------------------------------- /furnace/seg_opr/sync_bn/src/gpu/device_tensor.h: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | template 4 | struct DeviceTensor { 5 | public: 6 | inline __device__ __host__ DeviceTensor(DType *p, const int *size) 7 | : dptr_(p) { 8 | for (int i = 0; i < Dim; ++i) { 9 | size_[i] = size ? size[i] : 0; 10 | } 11 | } 12 | 13 | inline __device__ __host__ unsigned getSize(const int i) const { 14 | assert(i < Dim); 15 | return size_[i]; 16 | } 17 | 18 | inline __device__ __host__ int numElements() const { 19 | int n = 1; 20 | for (int i = 0; i < Dim; ++i) { 21 | n *= size_[i]; 22 | } 23 | return n; 24 | } 25 | 26 | inline __device__ __host__ DeviceTensor select(const size_t x) const { 27 | assert(Dim > 1); 28 | int offset = x; 29 | for (int i = 1; i < Dim; ++i) { 30 | offset *= size_[i]; 31 | } 32 | DeviceTensor tensor(dptr_ + offset, nullptr); 33 | for (int i = 0; i < Dim - 1; ++i) { 34 | tensor.size_[i] = this->size_[i+1]; 35 | } 36 | return tensor; 37 | } 38 | 39 | inline __device__ __host__ DeviceTensor operator[](const size_t x) const { 40 | assert(Dim > 1); 41 | int offset = x; 42 | for (int i = 1; i < Dim; ++i) { 43 | offset *= size_[i]; 44 | } 45 | DeviceTensor tensor(dptr_ + offset, nullptr); 46 | for (int i = 0; i < Dim - 1; ++i) { 47 | tensor.size_[i] = this->size_[i+1]; 48 | } 49 | return tensor; 50 | } 51 | 52 | inline __device__ __host__ size_t InnerSize() const { 53 | assert(Dim >= 3); 54 | size_t sz = 1; 55 | for (size_t i = 2; i < Dim; ++i) { 56 | sz *= size_[i]; 57 | } 58 | return sz; 59 | } 60 | 61 | inline __device__ __host__ size_t ChannelCount() const { 62 | assert(Dim >= 3); 63 | return size_[1]; 64 | } 65 | 66 | inline __device__ __host__ DType* data_ptr() const { 67 | return dptr_; 68 | } 69 | 70 | DType *dptr_; 71 | int size_[Dim]; 72 | }; 73 | 74 | template 75 | struct DeviceTensor { 76 | inline __device__ __host__ DeviceTensor(DType *p, const int *size) 77 | : dptr_(p) { 78 | size_[0] = size ? 
size[0] : 0; 79 | } 80 | 81 | inline __device__ __host__ unsigned getSize(const int i) const { 82 | assert(i == 0); 83 | return size_[0]; 84 | } 85 | 86 | inline __device__ __host__ int numElements() const { 87 | return size_[0]; 88 | } 89 | 90 | inline __device__ __host__ DType &operator[](const size_t x) const { 91 | return *(dptr_ + x); 92 | } 93 | 94 | inline __device__ __host__ DType* data_ptr() const { 95 | return dptr_; 96 | } 97 | 98 | DType *dptr_; 99 | int size_[1]; 100 | }; 101 | 102 | template 103 | static DeviceTensor devicetensor(const at::Tensor &blob) { 104 | DType *data = blob.data(); 105 | DeviceTensor tensor(data, nullptr); 106 | for (int i = 0; i < Dim; ++i) { 107 | tensor.size_[i] = blob.size(i); 108 | } 109 | return tensor; 110 | } 111 | -------------------------------------------------------------------------------- /furnace/seg_opr/sync_bn/src/gpu/dist/syncbn_gpu-0.0.0-py3.6-linux-x86_64.egg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesCXK/TorchSSC/2fd21ad25af92cd9f9ad28de3c4bc897c0ae8b43/furnace/seg_opr/sync_bn/src/gpu/dist/syncbn_gpu-0.0.0-py3.6-linux-x86_64.egg -------------------------------------------------------------------------------- /furnace/seg_opr/sync_bn/src/gpu/operator.cpp: -------------------------------------------------------------------------------- 1 | #include "operator.h" 2 | 3 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 4 | m.def("batchnorm_forward", &BatchNorm_Forward_CUDA, "BatchNorm forward (CUDA)"); 5 | m.def("batchnorm_backward", &BatchNorm_Backward_CUDA, "BatchNorm backward (CUDA)"); 6 | m.def("sumsquare_forward", &Sum_Square_Forward_CUDA, "SumSqu forward (CUDA)"); 7 | m.def("sumsquare_backward", &Sum_Square_Backward_CUDA, "SumSqu backward (CUDA)"); 8 | 9 | } 10 | -------------------------------------------------------------------------------- /furnace/seg_opr/sync_bn/src/gpu/operator.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | 5 | at::Tensor BatchNorm_Forward_CUDA( 6 | const at::Tensor input_, 7 | const at::Tensor mean_, 8 | const at::Tensor std_, 9 | const at::Tensor gamma_, 10 | const at::Tensor beta_); 11 | 12 | std::vector BatchNorm_Backward_CUDA( 13 | const at::Tensor gradoutput_, 14 | const at::Tensor input_, 15 | const at::Tensor mean_, 16 | const at::Tensor std_, 17 | const at::Tensor gamma_, 18 | const at::Tensor beta_, 19 | bool train); 20 | 21 | std::vector Sum_Square_Forward_CUDA( 22 | const at::Tensor input_); 23 | 24 | at::Tensor Sum_Square_Backward_CUDA( 25 | const at::Tensor input_, 26 | const at::Tensor gradSum_, 27 | const at::Tensor gradSquare_); 28 | -------------------------------------------------------------------------------- /furnace/seg_opr/sync_bn/src/gpu/operator.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesCXK/TorchSSC/2fd21ad25af92cd9f9ad28de3c4bc897c0ae8b43/furnace/seg_opr/sync_bn/src/gpu/operator.o -------------------------------------------------------------------------------- /furnace/seg_opr/sync_bn/src/gpu/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | from torch.utils.cpp_extension import BuildExtension, CUDAExtension 3 | 4 | setup( 5 | name='syncbn_gpu', 6 | ext_modules=[ 7 | CUDAExtension('syncbn_gpu', [ 8 | 'operator.cpp', 9 | 'syncbn_kernel.cu', 10 | ]), 11 | ], 12 | cmdclass={ 13 | 
'build_ext': BuildExtension 14 | }) 15 | -------------------------------------------------------------------------------- /furnace/seg_opr/sync_bn/src/gpu/syncbn_gpu.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 1.0 2 | Name: syncbn-gpu 3 | Version: 0.0.0 4 | Summary: UNKNOWN 5 | Home-page: UNKNOWN 6 | Author: UNKNOWN 7 | Author-email: UNKNOWN 8 | License: UNKNOWN 9 | Description: UNKNOWN 10 | Platform: UNKNOWN 11 | -------------------------------------------------------------------------------- /furnace/seg_opr/sync_bn/src/gpu/syncbn_gpu.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesCXK/TorchSSC/2fd21ad25af92cd9f9ad28de3c4bc897c0ae8b43/furnace/seg_opr/sync_bn/src/gpu/syncbn_gpu.so -------------------------------------------------------------------------------- /furnace/seg_opr/sync_bn/src/gpu/syncbn_kernel.cuda.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesCXK/TorchSSC/2fd21ad25af92cd9f9ad28de3c4bc897c0ae8b43/furnace/seg_opr/sync_bn/src/gpu/syncbn_kernel.cuda.o -------------------------------------------------------------------------------- /furnace/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesCXK/TorchSSC/2fd21ad25af92cd9f9ad28de3c4bc897c0ae8b43/furnace/utils/__init__.py -------------------------------------------------------------------------------- /furnace/utils/init_func.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # encoding: utf-8 3 | # @Time : 2018/9/28 下午12:13 4 | # @Author : yuchangqian 5 | # @Contact : changqian_yu@163.com 6 | # @File : init_func.py.py 7 | import torch 8 | import torch.nn as nn 9 | 10 | 11 | def __init_weight(feature, conv_init, norm_layer, bn_eps, bn_momentum, 12 | **kwargs): 13 | for name, m in feature.named_modules(): 14 | if isinstance(m, (nn.Conv1d, nn.Conv2d, nn.Conv3d)): 15 | conv_init(m.weight, **kwargs) 16 | elif isinstance(m, norm_layer): 17 | m.eps = bn_eps 18 | m.momentum = bn_momentum 19 | nn.init.constant_(m.weight, 1) 20 | nn.init.constant_(m.bias, 0) 21 | 22 | 23 | def init_weight(module_list, conv_init, norm_layer, bn_eps, bn_momentum, 24 | **kwargs): 25 | if isinstance(module_list, list): 26 | for feature in module_list: 27 | __init_weight(feature, conv_init, norm_layer, bn_eps, bn_momentum, 28 | **kwargs) 29 | else: 30 | __init_weight(module_list, conv_init, norm_layer, bn_eps, bn_momentum, 31 | **kwargs) 32 | 33 | 34 | def group_weight(weight_group, module, norm_layer, lr): 35 | group_decay = [] 36 | group_no_decay = [] 37 | for m in module.modules(): 38 | if isinstance(m, nn.Linear): 39 | group_decay.append(m.weight) 40 | if m.bias is not None: 41 | group_no_decay.append(m.bias) 42 | elif isinstance(m, (nn.Conv1d, nn.Conv2d, nn.Conv3d, nn.ConvTranspose2d, nn.ConvTranspose3d)): 43 | group_decay.append(m.weight) 44 | if m.bias is not None: 45 | group_no_decay.append(m.bias) 46 | elif isinstance(m, norm_layer) or isinstance(m, nn.BatchNorm1d) or isinstance(m, nn.BatchNorm2d) \ 47 | or isinstance(m, nn.BatchNorm3d) or isinstance(m, nn.GroupNorm): 48 | if m.weight is not None: 49 | group_no_decay.append(m.weight) 50 | if m.bias is not None: 51 | group_no_decay.append(m.bias) 52 | elif isinstance(m, nn.Parameter): 53 | group_decay.append(m) 54 | # 
else: 55 | # print(m, norm_layer) 56 | # print(module.modules) 57 | # print( len(list(module.parameters())) , 'HHHHHHHHHHHHHHHHH', len(group_decay) + len( 58 | # group_no_decay)) 59 | assert len(list(module.parameters())) == len(group_decay) + len( 60 | group_no_decay) 61 | weight_group.append(dict(params=group_decay, lr=lr)) 62 | weight_group.append(dict(params=group_no_decay, weight_decay=.0, lr=lr)) 63 | return weight_group 64 | -------------------------------------------------------------------------------- /furnace/utils/visualize.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import cv2 3 | import scipy.io as sio 4 | 5 | def set_img_color(colors, background, img, pred, gt, show255=False): 6 | for i in range(0, len(colors)): 7 | if i != background: 8 | img[np.where(pred == i)] = colors[i] 9 | if show255: 10 | img[np.where(gt==background)] = 255 11 | return img 12 | 13 | def show_prediction(colors, background, img, pred, gt): 14 | im = np.array(img, np.uint8) 15 | set_img_color(colors, background, im, pred, gt) 16 | final = np.array(im) 17 | return final 18 | 19 | def show_img(colors, background, img, clean, gt, *pds): 20 | im1 = np.array(img, np.uint8) 21 | #set_img_color(colors, background, im1, clean, gt) 22 | final = np.array(im1) 23 | # the pivot black bar 24 | pivot = np.zeros((im1.shape[0], 15, 3), dtype=np.uint8) 25 | for pd in pds: 26 | im = np.array(img, np.uint8) 27 | # pd[np.where(gt == 255)] = 255 28 | set_img_color(colors, background, im, pd, gt) 29 | final = np.column_stack((final, pivot)) 30 | final = np.column_stack((final, im)) 31 | 32 | im = np.array(img, np.uint8) 33 | set_img_color(colors, background, im, gt, True) 34 | final = np.column_stack((final, pivot)) 35 | final = np.column_stack((final, im)) 36 | return final 37 | 38 | def get_colors(class_num): 39 | colors = [] 40 | for i in range(class_num): 41 | colors.append((np.random.random((1,3)) * 255).tolist()[0]) 42 | 43 | return colors 44 | 45 | def get_ade_colors(): 46 | colors = sio.loadmat('./color150.mat')['colors'] 47 | colors = colors[:,::-1,] 48 | colors = np.array(colors).astype(int).tolist() 49 | colors.insert(0,[0,0,0]) 50 | 51 | return colors 52 | 53 | def print_iou(iu, mean_pixel_acc, class_names=None, show_no_back=False, no_print=False): 54 | n = iu.size 55 | lines = [] 56 | for i in range(n): 57 | if class_names is None: 58 | cls = 'Class %d:' % (i+1) 59 | else: 60 | cls = '%d %s' % (i+1, class_names[i]) 61 | lines.append('%-8s\t%.3f%%' % (cls, iu[i] * 100)) 62 | mean_IU = np.nanmean(iu) 63 | mean_IU_no_back = np.nanmean(iu[1:]) 64 | if show_no_back: 65 | lines.append('---------------------------- %-8s\t%.3f%%\t%-8s\t%.3f%%\t%-8s\t%.3f%%' % ('mean_IU', mean_IU * 100, 'mean_IU_no_back', mean_IU_no_back*100, 66 | 'mean_pixel_ACC',mean_pixel_acc*100)) 67 | else: 68 | print(mean_pixel_acc) 69 | lines.append('---------------------------- %-8s\t%.3f%%\t%-8s\t%.3f%%' % ('mean_IU', mean_IU * 100,'mean_pixel_ACC',mean_pixel_acc*100)) 70 | line = "\n".join(lines) 71 | if not no_print: 72 | print(line) 73 | return line 74 | 75 | 76 | -------------------------------------------------------------------------------- /install.md: -------------------------------------------------------------------------------- 1 | ## Installation 2 | 3 | The code is developed using Python 3.6 with PyTorch 1.0.0. The code is developed and tested using 2 GPU cards. 4 | 5 | 1. 
**Clone this repo.** 6 | 7 | ```shell 8 | $ git clone https://github.com/charlesCXK/TorchSSC.git 9 | $ cd TorchSSC 10 | ``` 11 | 12 | 2. **Install dependencies.** 13 | 14 | **(1) Create a conda environment:** 15 | 16 | ```shell 17 | $ conda env create -f ssc.yaml 18 | $ conda activate ssc 19 | ``` 20 | 21 | **(2) Install apex 0.1 (needs CUDA):** 22 | 23 | ```shell 24 | $ cd ./furnace/apex 25 | $ python setup.py install --cpp_ext --cuda_ext 26 | ``` 27 | 28 | -------------------------------------------------------------------------------- /model/sketch.nyu/config.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import os 8 | import os.path as osp 9 | import sys 10 | import time 11 | import numpy as np 12 | from easydict import EasyDict as edict 13 | import argparse 14 | 15 | C = edict() 16 | config = C 17 | cfg = C 18 | 19 | C.seed = 12345 20 | 21 | remoteip = os.popen('pwd').read() 22 | C.volna = '/home/chen/TorchSSC/' # this is the path to your repo 'TorchSSC' 23 | 24 | 25 | """please configure ROOT_dir and user when first using""" 26 | C.repo_name = 'TorchSSC' 27 | C.abs_dir = osp.realpath(".") 28 | C.this_dir = C.abs_dir.split(osp.sep)[-1] 29 | 30 | 31 | C.root_dir = C.abs_dir[:C.abs_dir.index(C.repo_name) + len(C.repo_name)] 32 | C.log_dir = osp.abspath('log') 33 | C.tb_dir = osp.abspath(osp.join(C.log_dir, "tb")) 34 | 35 | C.log_dir_link = osp.join(C.abs_dir, 'log') 36 | C.snapshot_dir = osp.abspath(osp.join(C.log_dir, "snapshot")) 37 | 38 | exp_time = time.strftime('%Y_%m_%d_%H_%M_%S', time.localtime()) 39 | C.log_file = C.log_dir + '/log_' + exp_time + '.log' 40 | C.link_log_file = C.log_dir + '/log_last.log' 41 | C.val_log_file = C.log_dir + '/val_' + exp_time + '.log' 42 | C.link_val_log_file = C.log_dir + '/val_last.log' 43 | 44 | """Data Dir and Weight Dir""" 45 | C.dataset_path = osp.join(C.volna, 'DATA/NYU/') 46 | C.img_root_folder = C.dataset_path 47 | C.gt_root_folder = C.dataset_path 48 | C.hha_root_folder = osp.join(C.dataset_path, 'HHA') 49 | C.mapping_root_folder = osp.join(C.dataset_path, 'Mapping') 50 | C.train_source = osp.join(C.dataset_path, "train.txt") 51 | C.eval_source = osp.join(C.dataset_path, "test.txt") 52 | C.is_test = False 53 | 54 | """Path Config""" 55 | 56 | 57 | def add_path(path): 58 | if path not in sys.path: 59 | sys.path.insert(0, path) 60 | 61 | 62 | add_path(osp.join(C.root_dir, 'furnace')) 63 | 64 | from utils.pyt_utils import model_urls 65 | 66 | """Image Config""" 67 | C.num_classes = 12 68 | C.background = 255 69 | C.image_mean = np.array([0.485, 0.456, 0.406]) 70 | C.image_std = np.array([0.229, 0.224, 0.225]) 71 | C.image_height = 480 72 | C.image_width = 640 73 | C.num_train_imgs = 795 74 | C.num_eval_imgs = 654 75 | 76 | """ Settings for network, this would be different for each kind of model""" 77 | C.fix_bias = True 78 | C.bn_eps = 1e-5 79 | C.bn_momentum = 0.1 80 | C.pretrained_model = C.volna + 'DATA/pytorch-weight/resnet50-imagenet.pth' 81 | 82 | """Train Config""" 83 | C.lr = 0.1 84 | C.lr_power = 0.9 85 | C.momentum = 0.9 86 | C.weight_decay = 5e-4 87 | C.batch_size = 4 88 | C.nepochs = 250 89 | C.niters_per_epoch = 795 // C.batch_size 90 | C.num_workers = C.batch_size 91 | 92 | C.train_scale_array = [1] 93 | C.warm_up_epoch = 0 94 | 95 | """Eval Config""" 96 | C.eval_iter = 30 97 | C.eval_stride_rate = 2 / 3 98 | C.eval_scale_array = [1, ] 99 
| C.eval_flip = False 100 | C.eval_base_size = 480 101 | C.eval_crop_size = 640 102 | 103 | """Display Config""" 104 | C.snapshot_iter = 10 105 | C.record_info_iter = 20 106 | C.display_iter = 50 107 | C.sketch_weight = 1 108 | C.sketch_weight_gsnn = 1.5 109 | 110 | C.kld_weight = 2 111 | C.samples = 4 112 | C.lantent_size = 16 113 | C.empty_loss_weight = 1 114 | 115 | def open_tensorboard(): 116 | pass 117 | 118 | if __name__ == '__main__': 119 | print(config.nepochs) 120 | parser = argparse.ArgumentParser() 121 | parser.add_argument( 122 | '-tb', '--tensorboard', default=False, action='store_true') 123 | args = parser.parse_args() 124 | 125 | if args.tensorboard: 126 | open_tensorboard() -------------------------------------------------------------------------------- /model/sketch.nyu/dataloader.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import torch 3 | import numpy as np 4 | from torch.utils import data 5 | import random 6 | from config import config 7 | from utils.img_utils import normalize, \ 8 | generate_random_crop_pos, random_crop_pad_to_shape 9 | 10 | class TrainPre(object): 11 | def __init__(self, img_mean, img_std): 12 | self.img_mean = img_mean 13 | self.img_std = img_std 14 | 15 | def __call__(self, img, hha): 16 | img = normalize(img, self.img_mean, self.img_std) 17 | hha = normalize(hha, self.img_mean, self.img_std) 18 | 19 | p_img = img.transpose(2, 0, 1) 20 | p_hha = hha.transpose(2, 0, 1) 21 | 22 | extra_dict = {'hha_img': p_hha} 23 | 24 | return p_img, extra_dict 25 | class ValPre(object): 26 | def __call__(self, img, hha): 27 | extra_dict = {'hha_img': hha} 28 | return img, extra_dict 29 | 30 | 31 | def get_train_loader(engine, dataset, s3client=None): 32 | data_setting = {'img_root': config.img_root_folder, 33 | 'gt_root': config.gt_root_folder, 34 | 'hha_root':config.hha_root_folder, 35 | 'mapping_root': config.mapping_root_folder, 36 | 'train_source': config.train_source, 37 | 'eval_source': config.eval_source} 38 | train_preprocess = TrainPre(config.image_mean, config.image_std) 39 | 40 | train_dataset = dataset(data_setting, "train", train_preprocess, 41 | config.batch_size * config.niters_per_epoch, s3client=s3client) 42 | 43 | train_sampler = None 44 | is_shuffle = True 45 | batch_size = config.batch_size 46 | 47 | if engine.distributed: 48 | train_sampler = torch.utils.data.distributed.DistributedSampler( 49 | train_dataset) 50 | batch_size = config.batch_size // engine.world_size 51 | is_shuffle = False 52 | 53 | train_loader = data.DataLoader(train_dataset, 54 | batch_size=batch_size, 55 | num_workers=config.num_workers, 56 | drop_last=True, 57 | shuffle=is_shuffle, 58 | pin_memory=True, 59 | sampler=train_sampler) 60 | 61 | return train_loader, train_sampler 62 | -------------------------------------------------------------------------------- /model/sketch.nyu/run.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export NGPUS=2 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | python -m torch.distributed.launch --nproc_per_node=$NGPUS train.py -p 10097 5 | python eval.py -e 200-250 -d 0-1 --save_path results --------------------------------------------------------------------------------
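As a closing reference, the metric helpers in `furnace/seg_opr/metric.py` shown earlier are plain NumPy functions and can be exercised outside the training pipeline. The snippet below is a minimal, illustrative sketch rather than repository code: the array shapes and class count are placeholders, and it assumes `furnace` has been added to `sys.path` (as `model/sketch.nyu/config.py` arranges via `add_path`).

```python
import numpy as np

# Assumes 'furnace' is on sys.path, as config.py arranges with add_path().
from seg_opr.metric import pixelAccuracy, mean_pixel_accuracy

# Hypothetical prediction / ground-truth label maps; values >= 0 count as labelled pixels.
pred = np.random.randint(0, 12, (480, 640))
label = np.random.randint(0, 12, (480, 640))

# Per-image accuracy over labelled pixels, plus the raw counts.
acc, correct, labeled = pixelAccuracy(pred, label)

# Dataset-level accuracy: accumulate the per-image counts, then normalise once.
mean_acc = mean_pixel_accuracy(np.array([correct]), np.array([labeled]))
print(acc, mean_acc)
```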