├── .gitignore ├── AE.py ├── DAE.py ├── LICENSE ├── README.md ├── VAE.py ├── fp_32_16.py ├── out.txt ├── out_mix.txt ├── test.py └── torch_float16.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | -------------------------------------------------------------------------------- /AE.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | from torch import nn 5 | from torch.autograd import Variable 6 | from torch.utils.data import DataLoader 7 | from torchvision import transforms 8 | from torchvision.datasets import MNIST 9 | from torchvision.utils import save_image 10 | 11 | if not os.path.exists('./mlp_img'): 12 | os.mkdir('./mlp_img') 13 | 14 | 15 | def to_img(x): 16 | x = x.view(x.size(0), 1, 28, 28) 17 | return x 18 | 19 | num_epochs = 200 20 | batch_size = 128 21 | learning_rate = 1e-3 22 | 23 | 24 | def plot_sample_img(img, name): 25 | img = img.view(1, 28, 28) 26 | save_image(img, './sample_{}.png'.format(name)) 27 | 28 | 29 | def min_max_normalization(tensor, min_value, max_value): 30 | min_tensor = tensor.min() 31 | tensor = (tensor - min_tensor) 32 | max_tensor = tensor.max() 33 | tensor = tensor / max_tensor 34 | tensor = tensor * (max_value - min_value) + min_value 35 | return tensor 36 | 37 | 38 | def tensor_round(tensor): 39 | return torch.round(tensor) 40 | 41 | img_transform = transforms.Compose([ 42 | transforms.ToTensor(), 43 | transforms.Lambda(lambda tensor:min_max_normalization(tensor, 0, 1)), 44 | transforms.Lambda(lambda tensor:tensor_round(tensor)) 45 | ]) 46 | 47 | dataset = MNIST('./data', transform=img_transform, download=True) 48 | dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True) 49 | 50 | 51 | class autoencoder(nn.Module): 52 | def __init__(self): 53 | super(autoencoder, self).__init__() 54 | self.encoder = nn.Sequential( 
55 | nn.Linear(28 * 28, 256), 56 | nn.ReLU(True), 57 | nn.Linear(256, 64), 58 | nn.ReLU(True)) 59 | self.decoder = nn.Sequential( 60 | nn.Linear(64, 256), 61 | nn.ReLU(True), 62 | nn.Linear(256, 28 * 28), 63 | nn.Sigmoid()) 64 | 65 | def forward(self, x): 66 | x = self.encoder(x) 67 | x = self.decoder(x) 68 | return x 69 | 70 | 71 | model = autoencoder().cuda() 72 | criterion = nn.BCELoss() 73 | optimizer = torch.optim.Adam( 74 | model.parameters(), lr=learning_rate, weight_decay=1e-5) 75 | 76 | for epoch in range(num_epochs): 77 | for data in dataloader: 78 | img, _ = data 79 | img = img.view(img.size(0), -1) 80 | img = Variable(img).cuda() 81 | # ===================forward===================== 82 | output = model(img) 83 | loss = criterion(output, img) 84 | MSE_loss = nn.MSELoss()(output, img) 85 | # ===================backward==================== 86 | optimizer.zero_grad() 87 | loss.backward() 88 | optimizer.step() 89 | # ===================log======================== 90 | print('epoch [{}/{}], loss:{:.4f}, MSE_loss:{:.4f}' 91 | .format(epoch + 1, num_epochs, loss.data[0], MSE_loss.data[0])) 92 | if epoch % 10 == 0: 93 | x = to_img(img.cpu().data) 94 | x_hat = to_img(output.cpu().data) 95 | save_image(x, './mlp_img/x_{}.png'.format(epoch)) 96 | save_image(x_hat, './mlp_img/x_hat_{}.png'.format(epoch)) 97 | 98 | torch.save(model.state_dict(), './sim_autoencoder.pth') 99 | -------------------------------------------------------------------------------- /DAE.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | from torch import nn 5 | from torch.autograd import Variable 6 | from torch.utils.data import DataLoader 7 | from torchvision import transforms 8 | from torchvision.datasets import MNIST 9 | from torchvision.utils import save_image 10 | 11 | if not os.path.exists('./DAEmlp_img'): 12 | os.mkdir('./DAEmlp_img') 13 | 14 | 15 | def to_img(x): 16 | x = x.view(x.size(0), 1, 28, 28) 17 | return x 18 | 19 | num_epochs = 200 20 | batch_size = 128 21 | learning_rate = 1e-3 22 | 23 | 24 | def add_noise(img): 25 | noise = torch.randn(img.size()) * 0.4 26 | noisy_img = img + noise 27 | return noisy_img 28 | 29 | 30 | def plot_sample_img(img, name): 31 | img = img.view(1, 28, 28) 32 | save_image(img, './sample_{}.png'.format(name)) 33 | 34 | 35 | def min_max_normalization(tensor, min_value, max_value): 36 | min_tensor = tensor.min() 37 | tensor = (tensor - min_tensor) 38 | max_tensor = tensor.max() 39 | tensor = tensor / max_tensor 40 | tensor = tensor * (max_value - min_value) + min_value 41 | return tensor 42 | 43 | 44 | def tensor_round(tensor): 45 | return torch.round(tensor) 46 | 47 | 48 | img_transform = transforms.Compose([ 49 | transforms.ToTensor(), 50 | transforms.Lambda(lambda tensor:min_max_normalization(tensor, 0, 1)), 51 | transforms.Lambda(lambda tensor:tensor_round(tensor)) 52 | ]) 53 | 54 | dataset = MNIST('./data', transform=img_transform, download=True) 55 | dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True) 56 | 57 | 58 | class autoencoder(nn.Module): 59 | def __init__(self): 60 | super(autoencoder, self).__init__() 61 | self.encoder = nn.Sequential( 62 | nn.Linear(28 * 28, 256), 63 | nn.ReLU(True), 64 | nn.Linear(256, 64), 65 | nn.ReLU(True)) 66 | self.decoder = nn.Sequential( 67 | nn.Linear(64, 256), 68 | nn.ReLU(True), 69 | nn.Linear(256, 28 * 28), 70 | nn.Sigmoid()) 71 | 72 | def forward(self, x): 73 | x = self.encoder(x) 74 | x = self.decoder(x) 75 | return x 76 | 77 | 78 | model = 
autoencoder().cuda()
 79 | criterion = nn.BCELoss()
 80 | optimizer = torch.optim.Adam(
 81 |     model.parameters(), lr=learning_rate, weight_decay=1e-5)
 82 | 
 83 | for epoch in range(num_epochs):
 84 |     for data in dataloader:
 85 |         img, _ = data
 86 |         img = img.view(img.size(0), -1)
 87 |         noisy_img = add_noise(img)
 88 |         noisy_img = Variable(noisy_img).cuda()
 89 |         img = Variable(img).cuda()
 90 |         # ===================forward=====================
 91 |         output = model(noisy_img)
 92 |         loss = criterion(output, img)
 93 |         MSE_loss = nn.MSELoss()(output, img)
 94 |         # ===================backward====================
 95 |         optimizer.zero_grad()
 96 |         loss.backward()
 97 |         optimizer.step()
 98 |     # ===================log========================
 99 |     print('epoch [{}/{}], loss:{:.4f}, MSE_loss:{:.4f}'
100 |           .format(epoch + 1, num_epochs, loss.data[0], MSE_loss.data[0]))
101 |     if epoch % 10 == 0:
102 |         x = to_img(img.cpu().data)
103 |         x_hat = to_img(output.cpu().data)
104 |         x_noisy = to_img(noisy_img.cpu().data)
105 |         weights = to_img(model.encoder[0].weight.cpu().data)
106 |         save_image(x, './DAEmlp_img/x_{}.png'.format(epoch))
107 |         save_image(x_hat, './DAEmlp_img/x_hat_{}.png'.format(epoch))
108 |         save_image(x_noisy, './DAEmlp_img/x_noisy_{}.png'.format(epoch))
109 |         if not os.path.exists('./filters'):
110 |             os.mkdir('./filters')
111 |         save_image(weights, './filters/epoch_{}.png'.format(epoch))
112 | 
113 | torch.save(model.state_dict(), './sim_dautoencoder.pth')
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2017 Reyhane Askari
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | For the full tutorial, see: https://reyhaneaskari.github.io/AE.htm
--------------------------------------------------------------------------------
/VAE.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | 
  3 | import torch
  4 | from torch import nn
  5 | from torch.autograd import Variable
  6 | from torch.utils.data import DataLoader
  7 | from torchvision import transforms
  8 | from torchvision.datasets import MNIST
  9 | from torchvision.utils import save_image
 10 | 
 11 | if not os.path.exists('./mlp_img'):
 12 |     os.mkdir('./mlp_img')
 13 | 
 14 | 
 15 | def to_img(x):
 16 |     x = x.view(x.size(0), 1, 28, 28)
 17 |     return x
 18 | 
 19 | num_epochs = 200
 20 | batch_size = 128
 21 | learning_rate = 1e-3
 22 | 
 23 | 
 24 | def plot_sample_img(img, name):
 25 |     img = img.view(1, 28, 28)
 26 |     save_image(img, './sample_{}.png'.format(name))
 27 | 
 28 | 
 29 | def min_max_normalization(tensor, min_value, max_value):
 30 |     min_tensor = tensor.min()
 31 |     tensor = (tensor - min_tensor)
 32 |     max_tensor = tensor.max()
 33 |     tensor = tensor / max_tensor
 34 |     tensor = tensor * (max_value - min_value) + min_value
 35 |     return tensor
 36 | 
 37 | 
 38 | def tensor_round(tensor):
 39 |     return torch.round(tensor)
 40 | 
 41 | 
 42 | img_transform = transforms.Compose([
 43 |     transforms.ToTensor(),
 44 |     transforms.Lambda(lambda tensor:min_max_normalization(tensor, 0, 1)),
 45 |     transforms.Lambda(lambda tensor:tensor_round(tensor))
 46 | ])
 47 | 
 48 | dataset = MNIST('./data', transform=img_transform, download=True)
 49 | dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
 50 | 
 51 | 
 52 | class VariationalAutoencoder(nn.Module):
 53 |     def __init__(self):
 54 |         super(VariationalAutoencoder, self).__init__()
 55 |         self.encoder = nn.Sequential(
 56 |             nn.Linear(28 * 28, 400),
 57 |             nn.ReLU(True),
 58 |             nn.Linear(400, 40))
 59 |         self.decoder = nn.Sequential(
 60 |             nn.Linear(20, 400),
 61 |             nn.ReLU(True),
 62 |             nn.Linear(400, 28 * 28),
 63 |             nn.Sigmoid())
 64 | 
 65 |     def reparametrize(self, mu, logvar):
 66 |         var = logvar.exp()
 67 |         std = var.sqrt()
 68 |         eps = Variable(torch.cuda.FloatTensor(std.size()).normal_())
 69 |         return eps.mul(std).add(mu)
 70 | 
 71 |     def forward(self, x):
 72 |         h = self.encoder(x)
 73 |         mu = h[:, :20]
 74 |         logvar = h[:, 20:]
 75 |         z = self.reparametrize(mu, logvar)
 76 |         x_hat = self.decoder(z)
 77 |         return x_hat, mu, logvar
 78 | 
 79 |     def generation_with_interpolation(self, x_one, x_two, alpha):
 80 |         hidden_one = self.encoder(x_one)
 81 |         hidden_two = self.encoder(x_two)
 82 |         mu_one = hidden_one[:, :20]
 83 |         logvar_one = hidden_one[:, 20:]
 84 |         mu_two = hidden_two[:, :20]
 85 |         logvar_two = hidden_two[:, 20:]
 86 |         mu = (1 - alpha) * mu_one + alpha * mu_two
 87 |         logvar = (1 - alpha) * logvar_one + alpha * logvar_two
 88 |         z = self.reparametrize(mu, logvar)
 89 |         generated_image = self.decoder(z)
 90 |         return generated_image
 91 | 
 92 | model = VariationalAutoencoder().cuda()
 93 | BCE = nn.BCELoss()
 94 | optimizer = torch.optim.Adam(
 95 |     model.parameters(), lr=learning_rate)
 96 | 
 97 | for epoch in range(num_epochs):
 98 |     for data in dataloader:
 99 |         img, _ = data
100 |         img = img.view(img.size(0), -1)
101 |         img = Variable(img).cuda()
102 |         # ===================forward=====================
103 |         x_hat, mu, logvar = model(img)
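104 |         # Closed-form KL divergence between the approximate posterior
105 |         # q(z|x) = N(mu, exp(logvar)) and the prior N(0, I):
106 |         #   KLD = -0.5 * sum(1 + logvar - mu^2 - exp(logvar))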
107 |         NKLD = mu.pow(2).add(logvar.exp()).mul(-1).add(logvar.add(1))
108 |         KLD = torch.sum(NKLD).mul(-0.5)
109 |         KLD /= img.size(0) * 784  # match the element-averaged BCE term
110 |         loss = BCE(x_hat, img) + KLD
111 |         # ===================backward====================
112 |         optimizer.zero_grad()
113 |         loss.backward()
114 |         optimizer.step()
115 |     # ===================log========================
116 |     print('epoch [{}/{}], loss:{:.4f}'
117 |           .format(epoch + 1, num_epochs, loss.data[0]))
118 |     if epoch % 10 == 0:
119 |         x = to_img(img.cpu().data)
120 |         x_hat = to_img(x_hat.cpu().data)
121 |         save_image(x, './mlp_img/x_{}.png'.format(epoch))
122 |         save_image(x_hat, './mlp_img/x_hat_{}.png'.format(epoch))
123 |         batch = next(iter(dataloader))[0]
124 |         batch = batch.view(batch.size(0), -1)
125 |         batch = Variable(batch).cuda()
126 |         x_one = batch[0:1]
127 |         x_two = batch[1:2]
128 |         generated_images = []
129 |         for alpha in torch.arange(0.0, 1.0, 0.1):
130 |             generated_images.append(model.generation_with_interpolation(
131 |                 x_one, x_two, alpha))
132 |         generated_images = torch.cat(generated_images, 0).cpu().data
133 |         if not os.path.exists('./generated'):
134 |             os.mkdir('./generated')
135 |         save_image(generated_images.view(-1, 1, 28, 28),
136 |                    './generated/output_interpolate_{}.png'.format(epoch),
137 |                    nrow=1)
138 | torch.save(model.state_dict(), './sim_variational_autoencoder.pth')
--------------------------------------------------------------------------------
/fp_32_16.py:
--------------------------------------------------------------------------------
 1 | from __future__ import print_function
 2 | import argparse
 3 | import numpy as np
 4 | import torch
 5 | import torch.nn as nn
 6 | import torch.nn.functional as F
 7 | import torch.optim as optim
 8 | from torch.autograd import Variable
 9 | import time
10 | 
11 | # Training settings
12 | parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
13 | parser.add_argument('--batch-size', type=int, default=64, metavar='N',
14 |                     help='input batch size for training (default: 64)')
15 | parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
16 |                     help='input batch size for testing (default: 1000)')
17 | parser.add_argument('--epochs', type=int, default=1, metavar='N',
18 |                     help='number of epochs to train (default: 1)')
19 | parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
20 |                     help='learning rate (default: 0.01)')
21 | parser.add_argument('--seed', type=int, default=1, metavar='S',
22 |                     help='random seed (default: 1)')
23 | parser.add_argument('--mixf', action='store_true', default=False,
24 |                     help='enables using mixed float precision')
25 | args = parser.parse_args()
26 | 
27 | torch.manual_seed(args.seed)
28 | torch.cuda.manual_seed(args.seed)
29 | 
30 | 
31 | class Net(nn.Module):
32 |     def __init__(self):
33 | 
34 |         super(Net, self).__init__()
35 |         self.conv1 = nn.Conv2d(2048, 2048, kernel_size=1)
36 | 
37 |     def forward(self, x):
38 |         x = F.relu(self.conv1(x))
39 |         return x
40 | 
41 | model = Net()
42 | if args.mixf:
43 |     model.cuda().half()
44 | else:
45 |     model.cuda()
46 | 
47 | if args.mixf:
48 |     params_copy = [param.clone().type(torch.cuda.FloatTensor).detach()
49 |                    for param
50 |                    in model.parameters()]
51 |     for param in params_copy:
52 |         param.requires_grad = True
53 | 
54 |     optimizer = optim.SGD(params_copy, lr=args.lr, momentum=0.9)
55 | else:
56 |     optimizer = optim.SGD(model.parameters(),
57 |                           lr=args.lr,
58 |                           momentum=0.9)
59 | 
60 | 
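61 | # set_grad copies the gradients computed on the fp16 model into the fp32
62 | # master parameters (params_copy), so the optimizer step can run in full
63 | # precision -- the usual mixed-precision "master weights" pattern. Note
64 | # that the forward-only timing loop below never actually calls it.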
65 | def set_grad(params, params_with_grad):
66 |     for param, param_w_grad in zip(params, params_with_grad):
67 |         if param.grad is None:
68 |             param.grad = torch.nn.Parameter(
69 |                 param.data.new().resize_(*param.data.size()))
70 |         param.grad.data.copy_(param_w_grad.grad.data)
71 | 
72 | 
73 | def train(epoch):
74 |     model.train()
75 |     # dummy dataset the same size as imagenet
76 |     data_ = torch.FloatTensor(np.random.randn(4096, 2048, 1, 1))
77 |     target_ = torch.FloatTensor(np.random.randint(0, 128, (4096)))
78 |     total_forward = 0
79 |     for batch_idx in range(300):
80 |         if args.mixf:
81 |             data, target = data_.cuda().half(), target_.cuda().half()
82 |         else:
83 |             data, target = data_.cuda(), target_.cuda()
84 |         data, target = Variable(data), Variable(target)
85 |         optimizer.zero_grad()
86 |         t_0 = time.time()
87 |         output = model(data)
88 |         total_forward += time.time() - t_0
89 |         if batch_idx % 100 == 0:
90 |             print('\tbatch_idx: ' + str(batch_idx))
91 |     print(total_forward)
92 | 
93 | for epoch in range(1, args.epochs + 1):
94 |     train(epoch)
--------------------------------------------------------------------------------
/out.txt:
--------------------------------------------------------------------------------
 1 | ==146== NVPROF is profiling process 146, command: python fp_32_16.py
 2 | 	batch_idx: 0
 3 | 	batch_idx: 100
 4 | 	batch_idx: 200
 5 | 1.0285618305206299
 6 | ==146== Profiling application: python fp_32_16.py
 7 | ==146== Profiling result:
 8 | Type  Time(%)  Time  Calls  Avg  Min  Max  Name
 9 | GPU activities:  62.13%  1.57035s  300  5.2345ms  4.8927ms  5.5544ms  volta_scudnn_128x64_relu_interior_nn_v1
10 | 35.01%  884.81ms  603  1.4673ms  1.4400us  3.3088ms  [CUDA memcpy HtoD]
11 | 1.44%  36.313ms  300  121.04us  119.68us  123.14us  void add_tensor_kernel_v3(cudnnTensorStruct, float*, cudnnTensorStruct, float const *, float, float)
12 | 1.39%  35.246ms  300  117.49us  116.74us  118.62us  void kernelPointwiseApply2, float, float, unsigned int, int=-2, int=-2>(TensorInfo, float>, TensorInfo, float, float)
13 | 0.03%  781.28us  300  2.6040us  2.5600us  2.7520us  cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams)
14 | 0.00%  960ns  1  960ns  960ns  960ns  [CUDA memset]
15 | API calls:  49.13%  3.36543s  12  280.45ms  11.568us  3.36314s  cudaMalloc
16 | 36.13%  2.47512s  602  4.1115ms  15.294us  8.5905ms  cudaMemcpyAsync
17 | 13.93%  954.19ms  8  119.27ms  24.437us  954.01ms  cudaStreamCreateWithFlags
18 | 0.39%  26.837ms  602  44.579us  6.1980us  84.567us  cudaStreamSynchronize
19 | 0.26%  17.723ms  1200  14.769us  8.5910us  37.295us  cudaLaunch
20 | 0.05%  3.2636ms  9617  339ns  214ns  4.6070us  cudaGetDevice
21 | 0.03%  1.9861ms  4  496.52us  468.49us  524.73us  cudaGetDeviceProperties
22 | 0.02%  1.6143ms  900  1.7930us  442ns  9.9500us  cudaEventRecord
23 | 0.01%  947.95us  3600  263ns  83ns  300.80us  cudaSetupArgument
24 | 0.01%  858.53us  185  4.6400us  100ns  211.19us  cuDeviceGetAttribute
25 | 0.01%  617.76us  1  617.76us  617.76us  617.76us  cudaHostAlloc
26 | 0.01%  524.85us  604  868ns  528ns  2.2010us  cudaSetDevice
27 | 0.01%  484.35us  1200  403ns  120ns  8.7620us  cudaConfigureCall
28 | 0.01%  481.02us  1800  267ns  81ns  907ns  cudaGetLastError
29 | 0.00%  308.11us  2  154.05us  136.54us  171.57us  cuDeviceTotalMem
30 | 0.00%  103.24us  1  103.24us  103.24us  103.24us  cudaStreamCreateWithPriority
31 | 0.00%  88.301us  2  44.150us  37.462us  50.839us  cuDeviceGetName
32 | 0.00%  26.747us  1  26.747us  26.747us  26.747us  cudaMemsetAsync
33 | 0.00%  25.943us  32  810ns  636ns  2.1950us  cudaFuncSetAttribute
34 | 0.00%  15.508us  1  15.508us  15.508us  15.508us  cudaMemcpy
35 | 0.00%  15.101us  25  604ns  349ns  2.2380us  cudaEventCreateWithFlags
36 | 0.00%  7.0690us  26  271ns  206ns  842ns  cudaDeviceGetAttribute
37 | 0.00%  2.3160us  4  579ns  193ns  1.2450us  cuDeviceGetCount
38 | 0.00%  2.2090us  7  315ns  155ns  768ns  cudaGetDeviceCount
39 | 0.00% 2.0890us 1 2.0890us 2.0890us 2.0890us cudaHostGetDevicePointer 40 | 0.00% 1.3720us 2 686ns 577ns 795ns cudaFree 41 | 0.00% 1.2490us 3 416ns 251ns 736ns cuDeviceGet 42 | 0.00% 1.1280us 1 1.1280us 1.1280us 1.1280us cudaDeviceGetStreamPriorityRange 43 | 0.00% 716ns 1 716ns 716ns 716ns cuInit 44 | 0.00% 350ns 1 350ns 350ns 350ns cuDriverGetVersion 45 | -------------------------------------------------------------------------------- /out_mix.txt: -------------------------------------------------------------------------------- 1 | ==158== NVPROF is profiling process 158, command: python fp_32_16.py --mixf 2 | batch_idx: 0 3 | batch_idx: 100 4 | batch_idx: 200 5 | 1.0206332206726074 6 | ==158== Profiling application: python fp_32_16.py --mixf 7 | ==158== Profiling result: 8 | Type Time(%) Time Calls Avg Min Max Name 9 | GPU activities: 58.88% 904.81ms 603 1.5005ms 1.6000us 3.2814ms [CUDA memcpy HtoD] 10 | 20.11% 309.05ms 300 1.0302ms 1.0115ms 1.0491ms volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_interior_nhwc2nchw_tn_v1 11 | 16.29% 250.36ms 300 834.53us 834.01us 835.74us void nchwToNhwcKernel<__half, __half, float, bool=1>(int, int, int, int, __half const *, __half*, float, float) 12 | 1.73% 26.591ms 602 44.171us 2.0470us 86.944us void kernelPointwiseApply2, __half, float, unsigned int, int=-2, int=-2>(TensorInfo, TensorInfo, __half>, __half, __half) 13 | 1.60% 24.596ms 300 81.986us 79.200us 85.152us void add_tensor_kernel_v3(cudnnTensorStruct, __half*, cudnnTensorStruct, __half const *, float, float) 14 | 1.33% 20.496ms 300 68.319us 67.839us 69.184us void kernelPointwiseApply2, __half, __half, unsigned int, int=-2, int=-2>(TensorInfo, __half>, TensorInfo<__half, __half>, __half, __half) 15 | 0.05% 789.76us 300 2.6320us 2.5920us 2.8480us cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) 16 | 0.00% 47.775us 2 23.887us 1.9200us 45.855us void kernelPointwiseApply2, float, __half, unsigned int, int=-2, int=-2>(TensorInfo<__half, float>, TensorInfo, float>, float, float) 17 | 0.00% 32.992us 2 16.496us 2.2400us 30.752us [CUDA memcpy DtoD] 18 | 0.00% 992ns 1 992ns 992ns 992ns [CUDA memset] 19 | API calls: 55.78% 3.14244s 11 285.68ms 12.657us 3.14044s cudaMalloc 20 | 25.89% 1.45850s 604 2.4147ms 15.041us 5.1147ms cudaMemcpyAsync 21 | 16.89% 951.34ms 8 118.92ms 22.695us 951.17ms cudaStreamCreateWithFlags 22 | 0.70% 39.528ms 602 65.660us 6.4400us 86.703us cudaStreamSynchronize 23 | 0.48% 27.167ms 2104 12.911us 7.0580us 309.46us cudaLaunch 24 | 0.09% 4.8819ms 15675 311ns 204ns 4.1210us cudaGetDevice 25 | 0.04% 2.4174ms 4 604.35us 593.57us 620.02us cudaGetDeviceProperties 26 | 0.03% 1.5145ms 900 1.6820us 482ns 4.4120us cudaEventRecord 27 | 0.02% 1.2087ms 8416 143ns 83ns 1.0500us cudaSetupArgument 28 | 0.02% 990.99us 185 5.3560us 99ns 249.57us cuDeviceGetAttribute 29 | 0.01% 815.82us 1216 670ns 229ns 3.1180us cudaSetDevice 30 | 0.01% 689.08us 2104 327ns 120ns 1.2640us cudaConfigureCall 31 | 0.01% 613.83us 1 613.83us 613.83us 613.83us cudaHostAlloc 32 | 0.01% 545.94us 2706 201ns 80ns 670ns cudaGetLastError 33 | 0.01% 511.82us 2 255.91us 141.80us 370.02us cuDeviceTotalMem 34 | 0.00% 120.97us 2 60.483us 57.867us 63.100us cuDeviceGetName 35 | 0.00% 96.137us 1 96.137us 96.137us 96.137us cudaStreamCreateWithPriority 36 | 0.00% 25.387us 1 25.387us 25.387us 25.387us cudaMemsetAsync 37 | 0.00% 25.168us 32 786ns 581ns 2.1460us cudaFuncSetAttribute 38 | 0.00% 15.191us 1 15.191us 15.191us 15.191us cudaMemcpy 39 | 0.00% 12.384us 25 495ns 346ns 1.6920us cudaEventCreateWithFlags 40 | 
0.00% 7.2860us 26 280ns 207ns 956ns cudaDeviceGetAttribute 41 | 0.00% 3.8750us 7 553ns 240ns 1.4820us cudaGetDeviceCount 42 | 0.00% 3.6940us 4 923ns 266ns 2.0270us cuDeviceGetCount 43 | 0.00% 2.2230us 1 2.2230us 2.2230us 2.2230us cudaHostGetDevicePointer 44 | 0.00% 1.9380us 3 646ns 172ns 1.3780us cuDeviceGet 45 | 0.00% 1.3260us 2 663ns 562ns 764ns cudaFree 46 | 0.00% 1.0780us 1 1.0780us 1.0780us 1.0780us cudaDeviceGetStreamPriorityRange 47 | 0.00% 865ns 1 865ns 865ns 865ns cuInit 48 | 0.00% 323ns 1 323ns 323ns 323ns cuDriverGetVersion 49 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import argparse 3 | import numpy as np 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import torch.optim as optim 8 | from torch.autograd import Variable 9 | import time 10 | 11 | # Training settings 12 | parser = argparse.ArgumentParser(description='PyTorch MNIST Example') 13 | parser.add_argument('--epochs', type=int, default=1, metavar='N', 14 | help='number of epochs to train (default: 1)') 15 | parser.add_argument('--seed', type=int, default=1, metavar='S', 16 | help='random seed (default: 1)') 17 | parser.add_argument('--mixf', action='store_true', default=False, 18 | help='enables using mixed float precision') 19 | args = parser.parse_args() 20 | 21 | torch.manual_seed(args.seed) 22 | torch.cuda.manual_seed(args.seed) 23 | 24 | 25 | class Net(nn.Module): 26 | def __init__(self): 27 | 28 | super(Net, self).__init__() 29 | self.conv1 = nn.Conv2d(2048, 2048, kernel_size=1) 30 | 31 | def forward(self, x): 32 | x = F.relu(self.conv1(x)) 33 | return x 34 | 35 | model = Net() 36 | 37 | if args.mixf: 38 | model.cuda().half() 39 | else: 40 | model.cuda() 41 | 42 | ITERS = 300 43 | 44 | def train(epoch): 45 | model.train() 46 | # dummy dataset the same size as imagenet 47 | data_ = torch.FloatTensor(np.random.randn(4096, 2048, 1, 1)) 48 | 49 | #lets get copy time out of conv time: 50 | if args.mixf: 51 | data = data_.cuda().half() 52 | else: 53 | data = data_.cuda() 54 | 55 | #time the entire thing, with proper cuda synchronization 56 | torch.cuda.synchronize() 57 | start = time.time() 58 | 59 | for batch_idx in range(ITERS): 60 | output = model(Variable(data)) 61 | 62 | torch.cuda.synchronize() 63 | print("Time / iteration: ", (time.time()-start)/ITERS) 64 | 65 | for epoch in range(1, args.epochs + 1): 66 | train(epoch) 67 | -------------------------------------------------------------------------------- /torch_float16.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import argparse 3 | from datetime import datetime 4 | import torch 5 | import torch.nn as nn 6 | from torch.autograd import Variable 7 | 8 | 9 | def build_img(args): 10 | if args.conv: 11 | img = np.random.rand(args.batch_size, args.input_channel, 12 | args.image_width, args.image_width) 13 | else: 14 | img = np.random.rand(args.batch_size, args.nin) 15 | if args.mixf: 16 | img = torch.Tensor(img).cuda().half() 17 | else: 18 | img = torch.Tensor(img).cuda() 19 | return Variable(img) 20 | 21 | 22 | class BuildModel(object): 23 | def __init__(self, args): 24 | if args.conv: 25 | self.sizes = ([args.input_channel] + 26 | [args.hidden_channel] * (args.layers + 1)) 27 | self.convs = [] 28 | else: 29 | self.sizes = ([args.nin] + ([args.layer_neurons] * args.layers) + 30 | [args.nout]) 31 
| self.weights = [] 32 | for i in range(1, len(self.sizes)): 33 | if args.conv: 34 | if args.mixf: 35 | self.convs += [nn.Conv2d(self.sizes[i - 1], 36 | self.sizes[i], 37 | kernel_size=args.kernel_size).cuda().half()] 38 | else: 39 | self.convs += [nn.Conv2d(self.sizes[i - 1], 40 | self.sizes[i], 41 | kernel_size=args.kernel_size).cuda()] 42 | else: 43 | shape = (self.sizes[i - 1], self.sizes[i]) 44 | init_value = np.random.rand(*shape) 45 | if args.mixf: 46 | self.weights += [torch.Tensor(init_value).cuda().half()] 47 | else: 48 | self.weights += [torch.Tensor(init_value).cuda()] 49 | 50 | def run(self, img): 51 | self.outputs = [img] 52 | for i in range(1, len(self.sizes)): 53 | out = torch.mm(self.outputs[-1], self.weights[i - 1]) 54 | self.outputs.append(out) 55 | return self.outputs[-1] 56 | 57 | def run_conv(self, img): 58 | self.outputs = [img] 59 | for i in range(1, len(self.sizes)): 60 | out = self.convs[i - 1](self.outputs[-1]) 61 | self.outputs.append(out) 62 | return self.outputs[-1] 63 | 64 | 65 | def run_benchmark(args): 66 | img = build_img(args) 67 | model = BuildModel(args) 68 | # Start profiling 69 | time_start = datetime.now() 70 | if args.conv: 71 | run_fnc = model.run_conv 72 | else: 73 | run_fnc = model.run 74 | for i in range(args.nsteps): 75 | run_fnc(img) 76 | if (i + 1) % 100 == 0: 77 | print("Step %d/%d " % (i + 1, args.nsteps)) 78 | time_end = datetime.now() # end profiling 79 | time_spent = time_end - time_start 80 | seconds = time_spent.seconds + time_spent.days * 24 * 3600 81 | profile_message = 'execution time: %s sec + %s microsec' % (seconds, time_spent.microseconds) 82 | print (profile_message) 83 | 84 | 85 | if __name__ == '__main__': 86 | np.random.seed(12345678) 87 | default_batch_size = 4096 88 | default_nin = 2048 89 | default_nout = 2048 90 | default_nsteps = 2000 91 | default_layers = 1 92 | default_layer_neurons = 2048 93 | 94 | default_input_channel = 3 95 | default_hidden_channel = 128 96 | default_kernel_size = 5 97 | default_image_size = 64 98 | 99 | parser = argparse.ArgumentParser() 100 | parser.add_argument('--mixf', action='store_true', default=False, 101 | help='Enables mixed float precision') 102 | parser.add_argument("--batch_size", type=int, default=default_batch_size, 103 | help='Batch size of the layer (default %d)' % default_batch_size) 104 | parser.add_argument("--nin", type=int, default=default_nin, 105 | help='Input size of the layer (default %d)' % default_nin) 106 | parser.add_argument("--nout", type=int, default=default_nout, 107 | help='Output size of the layer (default %d)' % default_nout) 108 | parser.add_argument("--nsteps", type=int, default=default_nsteps, 109 | help='Number of training steps (default %d)' % default_nsteps) 110 | parser.add_argument("--layers", type=int, default=default_layers, 111 | help='Number of layers (default %d)' % default_layers) 112 | parser.add_argument("--layer-neurons", type=int, default=default_layer_neurons, 113 | help='Number of neurons per layer (default %d)' % default_layer_neurons) 114 | parser.add_argument("--conv", action='store_true', default=False) 115 | parser.add_argument("--input_channel", type=int, default=default_input_channel) 116 | parser.add_argument("--hidden_channel", type=int, default=default_hidden_channel) 117 | parser.add_argument("--kernel_size", type=int, default=default_kernel_size) 118 | parser.add_argument("--image_width", type=int, default=default_image_size) 119 | 120 | args = parser.parse_args() 121 | run_benchmark(args) 122 | 
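123 | # Example invocations (assuming a CUDA-capable GPU; flags are defined above):
124 | #   python torch_float16.py                 # fp32 matmul benchmark
125 | #   python torch_float16.py --mixf          # fp16 matmul benchmark
126 | #   python torch_float16.py --conv --mixf   # fp16 convolution benchmark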
--------------------------------------------------------------------------------