├── .gitignore ├── AE.py ├── DAE.py ├── LICENSE ├── README.md ├── VAE.py ├── fp_32_16.py ├── out.txt ├── out_mix.txt ├── test.py └── torch_float16.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | -------------------------------------------------------------------------------- /AE.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | from torch import nn 5 | from torch.autograd import Variable 6 | from torch.utils.data import DataLoader 7 | from torchvision import transforms 8 | from torchvision.datasets import MNIST 9 | from torchvision.utils import save_image 10 | 11 | if not os.path.exists('./mlp_img'): 12 | os.mkdir('./mlp_img') 13 | 14 | 15 | def to_img(x): 16 | x = x.view(x.size(0), 1, 28, 28) 17 | return x 18 | 19 | num_epochs = 200 20 | batch_size = 128 21 | learning_rate = 1e-3 22 | 23 | 24 | def plot_sample_img(img, name): 25 | img = img.view(1, 28, 28) 26 | save_image(img, './sample_{}.png'.format(name)) 27 | 28 | 29 | def min_max_normalization(tensor, min_value, max_value): 30 | min_tensor = tensor.min() 31 | tensor = (tensor - min_tensor) 32 | max_tensor = tensor.max() 33 | tensor = tensor / max_tensor 34 | tensor = tensor * (max_value - min_value) + min_value 35 | return tensor 36 | 37 | 38 | def tensor_round(tensor): 39 | return torch.round(tensor) 40 | 41 | img_transform = transforms.Compose([ 42 | transforms.ToTensor(), 43 | transforms.Lambda(lambda tensor:min_max_normalization(tensor, 0, 1)), 44 | transforms.Lambda(lambda tensor:tensor_round(tensor)) 45 | ]) 46 | 47 | dataset = MNIST('./data', transform=img_transform, download=True) 48 | dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True) 49 | 50 | 51 | class autoencoder(nn.Module): 52 | def __init__(self): 53 | super(autoencoder, self).__init__() 54 | self.encoder = nn.Sequential( 
55 | nn.Linear(28 * 28, 256), 56 | nn.ReLU(True), 57 | nn.Linear(256, 64), 58 | nn.ReLU(True)) 59 | self.decoder = nn.Sequential( 60 | nn.Linear(64, 256), 61 | nn.ReLU(True), 62 | nn.Linear(256, 28 * 28), 63 | nn.Sigmoid()) 64 | 65 | def forward(self, x): 66 | x = self.encoder(x) 67 | x = self.decoder(x) 68 | return x 69 | 70 | 71 | model = autoencoder().cuda() 72 | criterion = nn.BCELoss() 73 | optimizer = torch.optim.Adam( 74 | model.parameters(), lr=learning_rate, weight_decay=1e-5) 75 | 76 | for epoch in range(num_epochs): 77 | for data in dataloader: 78 | img, _ = data 79 | img = img.view(img.size(0), -1) 80 | img = Variable(img).cuda() 81 | # ===================forward===================== 82 | output = model(img) 83 | loss = criterion(output, img) 84 | MSE_loss = nn.MSELoss()(output, img) 85 | # ===================backward==================== 86 | optimizer.zero_grad() 87 | loss.backward() 88 | optimizer.step() 89 | # ===================log======================== 90 | print('epoch [{}/{}], loss:{:.4f}, MSE_loss:{:.4f}' 91 | .format(epoch + 1, num_epochs, loss.data[0], MSE_loss.data[0])) 92 | if epoch % 10 == 0: 93 | x = to_img(img.cpu().data) 94 | x_hat = to_img(output.cpu().data) 95 | save_image(x, './mlp_img/x_{}.png'.format(epoch)) 96 | save_image(x_hat, './mlp_img/x_hat_{}.png'.format(epoch)) 97 | 98 | torch.save(model.state_dict(), './sim_autoencoder.pth') 99 | -------------------------------------------------------------------------------- /DAE.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | from torch import nn 5 | from torch.autograd import Variable 6 | from torch.utils.data import DataLoader 7 | from torchvision import transforms 8 | from torchvision.datasets import MNIST 9 | from torchvision.utils import save_image 10 | 11 | if not os.path.exists('./DAEmlp_img'): 12 | os.mkdir('./DAEmlp_img') 13 | 14 | 15 | def to_img(x): 16 | x = x.view(x.size(0), 1, 28, 28) 17 | return x 18 | 19 | num_epochs = 200 20 | batch_size = 128 21 | learning_rate = 1e-3 22 | 23 | 24 | def add_noise(img): 25 | noise = torch.randn(img.size()) * 0.4 26 | noisy_img = img + noise 27 | return noisy_img 28 | 29 | 30 | def plot_sample_img(img, name): 31 | img = img.view(1, 28, 28) 32 | save_image(img, './sample_{}.png'.format(name)) 33 | 34 | 35 | def min_max_normalization(tensor, min_value, max_value): 36 | min_tensor = tensor.min() 37 | tensor = (tensor - min_tensor) 38 | max_tensor = tensor.max() 39 | tensor = tensor / max_tensor 40 | tensor = tensor * (max_value - min_value) + min_value 41 | return tensor 42 | 43 | 44 | def tensor_round(tensor): 45 | return torch.round(tensor) 46 | 47 | 48 | img_transform = transforms.Compose([ 49 | transforms.ToTensor(), 50 | transforms.Lambda(lambda tensor:min_max_normalization(tensor, 0, 1)), 51 | transforms.Lambda(lambda tensor:tensor_round(tensor)) 52 | ]) 53 | 54 | dataset = MNIST('./data', transform=img_transform, download=True) 55 | dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True) 56 | 57 | 58 | class autoencoder(nn.Module): 59 | def __init__(self): 60 | super(autoencoder, self).__init__() 61 | self.encoder = nn.Sequential( 62 | nn.Linear(28 * 28, 256), 63 | nn.ReLU(True), 64 | nn.Linear(256, 64), 65 | nn.ReLU(True)) 66 | self.decoder = nn.Sequential( 67 | nn.Linear(64, 256), 68 | nn.ReLU(True), 69 | nn.Linear(256, 28 * 28), 70 | nn.Sigmoid()) 71 | 72 | def forward(self, x): 73 | x = self.encoder(x) 74 | x = self.decoder(x) 75 | return x 76 | 77 | 78 | model = 
autoencoder().cuda()
 79 | criterion = nn.BCELoss()
 80 | optimizer = torch.optim.Adam(
 81 |     model.parameters(), lr=learning_rate, weight_decay=1e-5)
 82 | 
 83 | for epoch in range(num_epochs):
 84 |     for data in dataloader:
 85 |         img, _ = data
 86 |         img = img.view(img.size(0), -1)
 87 |         noisy_img = add_noise(img)
 88 |         noisy_img = Variable(noisy_img).cuda()
 89 |         img = Variable(img).cuda()
 90 |         # ===================forward=====================
 91 |         output = model(noisy_img)
 92 |         loss = criterion(output, img)
 93 |         MSE_loss = nn.MSELoss()(output, img)
 94 |         # ===================backward====================
 95 |         optimizer.zero_grad()
 96 |         loss.backward()
 97 |         optimizer.step()
 98 |     # ===================log========================
 99 |     print('epoch [{}/{}], loss:{:.4f}, MSE_loss:{:.4f}'
100 |           .format(epoch + 1, num_epochs, loss.data[0], MSE_loss.data[0]))
101 |     if epoch % 10 == 0:
102 |         x = to_img(img.cpu().data)
103 |         x_hat = to_img(output.cpu().data)
104 |         x_noisy = to_img(noisy_img.cpu().data)
105 |         weights = to_img(model.encoder[0].weight.cpu().data)
106 |         save_image(x, './DAEmlp_img/x_{}.png'.format(epoch))
107 |         save_image(x_hat, './DAEmlp_img/x_hat_{}.png'.format(epoch))
108 |         save_image(x_noisy, './DAEmlp_img/x_noisy_{}.png'.format(epoch))
109 |         if not os.path.exists('./filters'):
110 |             os.mkdir('./filters')
111 |         save_image(weights, './filters/epoch_{}.png'.format(epoch))
112 | 
113 | torch.save(model.state_dict(), './sim_dautoencoder.pth')
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2017 Reyhane Askari
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | For the full tutorial, see: https://reyhaneaskari.github.io/AE.htm
--------------------------------------------------------------------------------
/VAE.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | 
  3 | import torch
  4 | from torch import nn
  5 | from torch.autograd import Variable
  6 | from torch.utils.data import DataLoader
  7 | from torchvision import transforms
  8 | from torchvision.datasets import MNIST
  9 | from torchvision.utils import save_image
 10 | 
 11 | if not os.path.exists('./mlp_img'):
 12 |     os.mkdir('./mlp_img')
 13 | 
 14 | 
 15 | def to_img(x):
 16 |     x = x.view(x.size(0), 1, 28, 28)
 17 |     return x
 18 | 
 19 | num_epochs = 200
 20 | batch_size = 128
 21 | learning_rate = 1e-3
 22 | 
 23 | 
 24 | def plot_sample_img(img, name):
 25 |     img = img.view(1, 28, 28)
 26 |     save_image(img, './sample_{}.png'.format(name))
 27 | 
 28 | 
 29 | def min_max_normalization(tensor, min_value, max_value):
 30 |     min_tensor = tensor.min()
 31 |     tensor = (tensor - min_tensor)
 32 |     max_tensor = tensor.max()
 33 |     tensor = tensor / max_tensor
 34 |     tensor = tensor * (max_value - min_value) + min_value
 35 |     return tensor
 36 | 
 37 | 
 38 | def tensor_round(tensor):
 39 |     return torch.round(tensor)
 40 | 
 41 | 
 42 | img_transform = transforms.Compose([
 43 |     transforms.ToTensor(),
 44 |     transforms.Lambda(lambda tensor:min_max_normalization(tensor, 0, 1)),
 45 |     transforms.Lambda(lambda tensor:tensor_round(tensor))
 46 | ])
 47 | 
 48 | dataset = MNIST('./data', transform=img_transform, download=True)
 49 | dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
 50 | 
 51 | 
 52 | class VariationalAutoencoder(nn.Module):
 53 |     def __init__(self):
 54 |         super(VariationalAutoencoder, self).__init__()
 55 |         self.encoder = nn.Sequential(
 56 |             nn.Linear(28 * 28, 400),
 57 |             nn.ReLU(True),
 58 |             nn.Linear(400, 40))
 59 |         self.decoder = nn.Sequential(
 60 |             nn.Linear(20, 400),
 61 |             nn.ReLU(True),
 62 |             nn.Linear(400, 28 * 28),
 63 |             nn.Sigmoid())
 64 | 
 65 |     def reparametrize(self, mu, logvar):
 66 |         var = logvar.exp()
 67 |         std = var.sqrt()
 68 |         eps = Variable(torch.cuda.FloatTensor(std.size()).normal_())
 69 |         return eps.mul(std).add(mu)
 70 | 
 71 |     def forward(self, x):
 72 |         h = self.encoder(x)
 73 |         mu = h[:, :20]
 74 |         logvar = h[:, 20:]
 75 |         z = self.reparametrize(mu, logvar)
 76 |         x_hat = self.decoder(z)
 77 |         return x_hat, mu, logvar
 78 | 
 79 |     def generation_with_interpolation(self, x_one, x_two, alpha):
 80 |         hidden_one = self.encoder(x_one)
 81 |         hidden_two = self.encoder(x_two)
 82 |         mu_one = hidden_one[:, :20]
 83 |         logvar_one = hidden_one[:, 20:]
 84 |         mu_two = hidden_two[:, :20]
 85 |         logvar_two = hidden_two[:, 20:]
 86 |         mu = (1 - alpha) * mu_one + alpha * mu_two
 87 |         logvar = (1 - alpha) * logvar_one + alpha * logvar_two
 88 |         z = self.reparametrize(mu, logvar)
 89 |         generated_image = self.decoder(z)
 90 |         return generated_image
 91 | 
 92 | model = VariationalAutoencoder().cuda()
 93 | BCE = nn.BCELoss()
 94 | optimizer = torch.optim.Adam(
 95 |     model.parameters(), lr=learning_rate)
 96 | 
 97 | for epoch in range(num_epochs):
 98 |     for data in dataloader:
 99 |         img, _ = data
100 |         img = img.view(img.size(0), -1)
101 |         img = Variable(img).cuda()
102 |         # ===================forward=====================
103 |         x_hat, mu, logvar = model(img)
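104 |         # Closed-form KL divergence between the approximate posterior
105 |         # q(z|x) = N(mu, exp(logvar)) and the prior N(0, I):
106 |         #   KLD = -0.5 * sum(1 + logvar - mu^2 - exp(logvar))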
107 |         NKLD = mu.pow(2).add(logvar.exp()).mul(-1).add(logvar.add(1))
108 |         KLD = torch.sum(NKLD).mul(-0.5)
109 |         KLD /= img.size(0) * 784  # match the element-averaged BCE term
110 |         loss = BCE(x_hat, img) + KLD
111 |         # ===================backward====================
112 |         optimizer.zero_grad()
113 |         loss.backward()
114 |         optimizer.step()
115 |     # ===================log========================
116 |     print('epoch [{}/{}], loss:{:.4f}'
117 |           .format(epoch + 1, num_epochs, loss.data[0]))
118 |     if epoch % 10 == 0:
119 |         x = to_img(img.cpu().data)
120 |         x_hat = to_img(x_hat.cpu().data)
121 |         save_image(x, './mlp_img/x_{}.png'.format(epoch))
122 |         save_image(x_hat, './mlp_img/x_hat_{}.png'.format(epoch))
123 |         batch = next(iter(dataloader))[0]
124 |         batch = batch.view(batch.size(0), -1)
125 |         batch = Variable(batch).cuda()
126 |         x_one = batch[0:1]
127 |         x_two = batch[1:2]
128 |         generated_images = []
129 |         for alpha in torch.arange(0.0, 1.0, 0.1):
130 |             generated_images.append(model.generation_with_interpolation(
131 |                 x_one, x_two, alpha))
132 |         generated_images = torch.cat(generated_images, 0).cpu().data
133 |         if not os.path.exists('./generated'):
134 |             os.mkdir('./generated')
135 |         save_image(generated_images.view(-1, 1, 28, 28),
136 |                    './generated/output_interpolate_{}.png'.format(epoch),
137 |                    nrow=1)
138 | torch.save(model.state_dict(), './sim_variational_autoencoder.pth')
--------------------------------------------------------------------------------
/fp_32_16.py:
--------------------------------------------------------------------------------
 1 | from __future__ import print_function
 2 | import argparse
 3 | import numpy as np
 4 | import torch
 5 | import torch.nn as nn
 6 | import torch.nn.functional as F
 7 | import torch.optim as optim
 8 | from torch.autograd import Variable
 9 | import time
10 | 
11 | # Training settings
12 | parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
13 | parser.add_argument('--batch-size', type=int, default=64, metavar='N',
14 |                     help='input batch size for training (default: 64)')
15 | parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
16 |                     help='input batch size for testing (default: 1000)')
17 | parser.add_argument('--epochs', type=int, default=1, metavar='N',
18 |                     help='number of epochs to train (default: 1)')
19 | parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
20 |                     help='learning rate (default: 0.01)')
21 | parser.add_argument('--seed', type=int, default=1, metavar='S',
22 |                     help='random seed (default: 1)')
23 | parser.add_argument('--mixf', action='store_true', default=False,
24 |                     help='enables using mixed float precision')
25 | args = parser.parse_args()
26 | 
27 | torch.manual_seed(args.seed)
28 | torch.cuda.manual_seed(args.seed)
29 | 
30 | 
31 | class Net(nn.Module):
32 |     def __init__(self):
33 | 
34 |         super(Net, self).__init__()
35 |         self.conv1 = nn.Conv2d(2048, 2048, kernel_size=1)
36 | 
37 |     def forward(self, x):
38 |         x = F.relu(self.conv1(x))
39 |         return x
40 | 
41 | model = Net()
42 | if args.mixf:
43 |     model.cuda().half()
44 | else:
45 |     model.cuda()
46 | 
47 | if args.mixf:
48 |     params_copy = [param.clone().type(torch.cuda.FloatTensor).detach()
49 |                    for param
50 |                    in model.parameters()]
51 |     for param in params_copy:
52 |         param.requires_grad = True
53 | 
54 |     optimizer = optim.SGD(params_copy, lr=args.lr, momentum=0.9)
55 | else:
56 |     optimizer = optim.SGD(model.parameters(),
57 |                           lr=args.lr,
58 |                           momentum=0.9)
59 | 
60 | 
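61 | # set_grad copies the gradients computed on the fp16 model into the fp32
62 | # master parameters (params_copy), so the optimizer step can run in full
63 | # precision -- the usual mixed-precision "master weights" pattern. Note
64 | # that the forward-only timing loop below never actually calls it.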
65 | def set_grad(params, params_with_grad):
66 |     for param, param_w_grad in zip(params, params_with_grad):
67 |         if param.grad is None:
68 |             param.grad = torch.nn.Parameter(
69 |                 param.data.new().resize_(*param.data.size()))
70 |         param.grad.data.copy_(param_w_grad.grad.data)
71 | 
72 | 
73 | def train(epoch):
74 |     model.train()
75 |     # dummy dataset the same size as imagenet
76 |     data_ = torch.FloatTensor(np.random.randn(4096, 2048, 1, 1))
77 |     target_ = torch.FloatTensor(np.random.randint(0, 128, (4096)))
78 |     total_forward = 0
79 |     for batch_idx in range(300):
80 |         if args.mixf:
81 |             data, target = data_.cuda().half(), target_.cuda().half()
82 |         else:
83 |             data, target = data_.cuda(), target_.cuda()
84 |         data, target = Variable(data), Variable(target)
85 |         optimizer.zero_grad()
86 |         t_0 = time.time()
87 |         output = model(data)
88 |         total_forward += time.time() - t_0
89 |         if batch_idx % 100 == 0:
90 |             print('\tbatch_idx: ' + str(batch_idx))
91 |     print(total_forward)
92 | 
93 | for epoch in range(1, args.epochs + 1):
94 |     train(epoch)
--------------------------------------------------------------------------------
/out.txt:
--------------------------------------------------------------------------------
 1 | ==146== NVPROF is profiling process 146, command: python fp_32_16.py
 2 | 	batch_idx: 0
 3 | 	batch_idx: 100
 4 | 	batch_idx: 200
 5 | 1.0285618305206299
 6 | ==146== Profiling application: python fp_32_16.py
 7 | ==146== Profiling result:
 8 | Type  Time(%)  Time  Calls  Avg  Min  Max  Name
 9 | GPU activities:  62.13%  1.57035s  300  5.2345ms  4.8927ms  5.5544ms  volta_scudnn_128x64_relu_interior_nn_v1
10 | 35.01%  884.81ms  603  1.4673ms  1.4400us  3.3088ms  [CUDA memcpy HtoD]
11 | 1.44%  36.313ms  300  121.04us  119.68us  123.14us  void add_tensor_kernel_v3(cudnnTensorStruct, float*, cudnnTensorStruct, float const *, float, float)
12 | 1.39%  35.246ms  300  117.49us  116.74us  118.62us  void kernelPointwiseApply2, float, float, unsigned int, int=-2, int=-2>(TensorInfo, float>, TensorInfo, float, float)
13 | 0.03%  781.28us  300  2.6040us  2.5600us  2.7520us  cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams)
14 | 0.00%  960ns  1  960ns  960ns  960ns  [CUDA memset]
15 | API calls:  49.13%  3.36543s  12  280.45ms  11.568us  3.36314s  cudaMalloc
16 | 36.13%  2.47512s  602  4.1115ms  15.294us  8.5905ms  cudaMemcpyAsync
17 | 13.93%  954.19ms  8  119.27ms  24.437us  954.01ms  cudaStreamCreateWithFlags
18 | 0.39%  26.837ms  602  44.579us  6.1980us  84.567us  cudaStreamSynchronize
19 | 0.26%  17.723ms  1200  14.769us  8.5910us  37.295us  cudaLaunch
20 | 0.05%  3.2636ms  9617  339ns  214ns  4.6070us  cudaGetDevice
21 | 0.03%  1.9861ms  4  496.52us  468.49us  524.73us  cudaGetDeviceProperties
22 | 0.02%  1.6143ms  900  1.7930us  442ns  9.9500us  cudaEventRecord
23 | 0.01%  947.95us  3600  263ns  83ns  300.80us  cudaSetupArgument
24 | 0.01%  858.53us  185  4.6400us  100ns  211.19us  cuDeviceGetAttribute
25 | 0.01%  617.76us  1  617.76us  617.76us  617.76us  cudaHostAlloc
26 | 0.01%  524.85us  604  868ns  528ns  2.2010us  cudaSetDevice
27 | 0.01%  484.35us  1200  403ns  120ns  8.7620us  cudaConfigureCall
28 | 0.01%  481.02us  1800  267ns  81ns  907ns  cudaGetLastError
29 | 0.00%  308.11us  2  154.05us  136.54us  171.57us  cuDeviceTotalMem
30 | 0.00%  103.24us  1  103.24us  103.24us  103.24us  cudaStreamCreateWithPriority
31 | 0.00%  88.301us  2  44.150us  37.462us  50.839us  cuDeviceGetName
32 | 0.00%  26.747us  1  26.747us  26.747us  26.747us  cudaMemsetAsync
33 | 0.00%  25.943us  32  810ns  636ns  2.1950us  cudaFuncSetAttribute
34 | 0.00%  15.508us  1  15.508us  15.508us  15.508us  cudaMemcpy
35 | 0.00%  15.101us  25  604ns  349ns  2.2380us  cudaEventCreateWithFlags
36 | 0.00%  7.0690us  26  271ns  206ns  842ns  cudaDeviceGetAttribute
37 | 0.00%  2.3160us  4  579ns  193ns  1.2450us  cuDeviceGetCount
38 | 0.00%  2.2090us  7  315ns  155ns  768ns  cudaGetDeviceCount
39 | 0.00% 2.0890us 1 2.0890us 2.0890us 2.0890us cudaHostGetDevicePointer 40 | 0.00% 1.3720us 2 686ns 577ns 795ns cudaFree 41 | 0.00% 1.2490us 3 416ns 251ns 736ns cuDeviceGet 42 | 0.00% 1.1280us 1 1.1280us 1.1280us 1.1280us cudaDeviceGetStreamPriorityRange 43 | 0.00% 716ns 1 716ns 716ns 716ns cuInit 44 | 0.00% 350ns 1 350ns 350ns 350ns cuDriverGetVersion 45 | -------------------------------------------------------------------------------- /out_mix.txt: -------------------------------------------------------------------------------- 1 | ==158== NVPROF is profiling process 158, command: python fp_32_16.py --mixf 2 | batch_idx: 0 3 | batch_idx: 100 4 | batch_idx: 200 5 | 1.0206332206726074 6 | ==158== Profiling application: python fp_32_16.py --mixf 7 | ==158== Profiling result: 8 | Type Time(%) Time Calls Avg Min Max Name 9 | GPU activities: 58.88% 904.81ms 603 1.5005ms 1.6000us 3.2814ms [CUDA memcpy HtoD] 10 | 20.11% 309.05ms 300 1.0302ms 1.0115ms 1.0491ms volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_interior_nhwc2nchw_tn_v1 11 | 16.29% 250.36ms 300 834.53us 834.01us 835.74us void nchwToNhwcKernel<__half, __half, float, bool=1>(int, int, int, int, __half const *, __half*, float, float) 12 | 1.73% 26.591ms 602 44.171us 2.0470us 86.944us void kernelPointwiseApply2, __half, float, unsigned int, int=-2, int=-2>(TensorInfo, TensorInfo, __half>, __half, __half) 13 | 1.60% 24.596ms 300 81.986us 79.200us 85.152us void add_tensor_kernel_v3(cudnnTensorStruct, __half*, cudnnTensorStruct, __half const *, float, float) 14 | 1.33% 20.496ms 300 68.319us 67.839us 69.184us void kernelPointwiseApply2, __half, __half, unsigned int, int=-2, int=-2>(TensorInfo, __half>, TensorInfo<__half, __half>, __half, __half) 15 | 0.05% 789.76us 300 2.6320us 2.5920us 2.8480us cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) 16 | 0.00% 47.775us 2 23.887us 1.9200us 45.855us void kernelPointwiseApply2, float, __half, unsigned int, int=-2, int=-2>(TensorInfo<__half, float>, TensorInfo, float>, float, float) 17 | 0.00% 32.992us 2 16.496us 2.2400us 30.752us [CUDA memcpy DtoD] 18 | 0.00% 992ns 1 992ns 992ns 992ns [CUDA memset] 19 | API calls: 55.78% 3.14244s 11 285.68ms 12.657us 3.14044s cudaMalloc 20 | 25.89% 1.45850s 604 2.4147ms 15.041us 5.1147ms cudaMemcpyAsync 21 | 16.89% 951.34ms 8 118.92ms 22.695us 951.17ms cudaStreamCreateWithFlags 22 | 0.70% 39.528ms 602 65.660us 6.4400us 86.703us cudaStreamSynchronize 23 | 0.48% 27.167ms 2104 12.911us 7.0580us 309.46us cudaLaunch 24 | 0.09% 4.8819ms 15675 311ns 204ns 4.1210us cudaGetDevice 25 | 0.04% 2.4174ms 4 604.35us 593.57us 620.02us cudaGetDeviceProperties 26 | 0.03% 1.5145ms 900 1.6820us 482ns 4.4120us cudaEventRecord 27 | 0.02% 1.2087ms 8416 143ns 83ns 1.0500us cudaSetupArgument 28 | 0.02% 990.99us 185 5.3560us 99ns 249.57us cuDeviceGetAttribute 29 | 0.01% 815.82us 1216 670ns 229ns 3.1180us cudaSetDevice 30 | 0.01% 689.08us 2104 327ns 120ns 1.2640us cudaConfigureCall 31 | 0.01% 613.83us 1 613.83us 613.83us 613.83us cudaHostAlloc 32 | 0.01% 545.94us 2706 201ns 80ns 670ns cudaGetLastError 33 | 0.01% 511.82us 2 255.91us 141.80us 370.02us cuDeviceTotalMem 34 | 0.00% 120.97us 2 60.483us 57.867us 63.100us cuDeviceGetName 35 | 0.00% 96.137us 1 96.137us 96.137us 96.137us cudaStreamCreateWithPriority 36 | 0.00% 25.387us 1 25.387us 25.387us 25.387us cudaMemsetAsync 37 | 0.00% 25.168us 32 786ns 581ns 2.1460us cudaFuncSetAttribute 38 | 0.00% 15.191us 1 15.191us 15.191us 15.191us cudaMemcpy 39 | 0.00% 12.384us 25 495ns 346ns 1.6920us cudaEventCreateWithFlags 40 | 
0.00% 7.2860us 26 280ns 207ns 956ns cudaDeviceGetAttribute 41 | 0.00% 3.8750us 7 553ns 240ns 1.4820us cudaGetDeviceCount 42 | 0.00% 3.6940us 4 923ns 266ns 2.0270us cuDeviceGetCount 43 | 0.00% 2.2230us 1 2.2230us 2.2230us 2.2230us cudaHostGetDevicePointer 44 | 0.00% 1.9380us 3 646ns 172ns 1.3780us cuDeviceGet 45 | 0.00% 1.3260us 2 663ns 562ns 764ns cudaFree 46 | 0.00% 1.0780us 1 1.0780us 1.0780us 1.0780us cudaDeviceGetStreamPriorityRange 47 | 0.00% 865ns 1 865ns 865ns 865ns cuInit 48 | 0.00% 323ns 1 323ns 323ns 323ns cuDriverGetVersion 49 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import argparse 3 | import numpy as np 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import torch.optim as optim 8 | from torch.autograd import Variable 9 | import time 10 | 11 | # Training settings 12 | parser = argparse.ArgumentParser(description='PyTorch MNIST Example') 13 | parser.add_argument('--epochs', type=int, default=1, metavar='N', 14 | help='number of epochs to train (default: 1)') 15 | parser.add_argument('--seed', type=int, default=1, metavar='S', 16 | help='random seed (default: 1)') 17 | parser.add_argument('--mixf', action='store_true', default=False, 18 | help='enables using mixed float precision') 19 | args = parser.parse_args() 20 | 21 | torch.manual_seed(args.seed) 22 | torch.cuda.manual_seed(args.seed) 23 | 24 | 25 | class Net(nn.Module): 26 | def __init__(self): 27 | 28 | super(Net, self).__init__() 29 | self.conv1 = nn.Conv2d(2048, 2048, kernel_size=1) 30 | 31 | def forward(self, x): 32 | x = F.relu(self.conv1(x)) 33 | return x 34 | 35 | model = Net() 36 | 37 | if args.mixf: 38 | model.cuda().half() 39 | else: 40 | model.cuda() 41 | 42 | ITERS = 300 43 | 44 | def train(epoch): 45 | model.train() 46 | # dummy dataset the same size as imagenet 47 | data_ = torch.FloatTensor(np.random.randn(4096, 2048, 1, 1)) 48 | 49 | #lets get copy time out of conv time: 50 | if args.mixf: 51 | data = data_.cuda().half() 52 | else: 53 | data = data_.cuda() 54 | 55 | #time the entire thing, with proper cuda synchronization 56 | torch.cuda.synchronize() 57 | start = time.time() 58 | 59 | for batch_idx in range(ITERS): 60 | output = model(Variable(data)) 61 | 62 | torch.cuda.synchronize() 63 | print("Time / iteration: ", (time.time()-start)/ITERS) 64 | 65 | for epoch in range(1, args.epochs + 1): 66 | train(epoch) 67 | -------------------------------------------------------------------------------- /torch_float16.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import argparse 3 | from datetime import datetime 4 | import torch 5 | import torch.nn as nn 6 | from torch.autograd import Variable 7 | 8 | 9 | def build_img(args): 10 | if args.conv: 11 | img = np.random.rand(args.batch_size, args.input_channel, 12 | args.image_width, args.image_width) 13 | else: 14 | img = np.random.rand(args.batch_size, args.nin) 15 | if args.mixf: 16 | img = torch.Tensor(img).cuda().half() 17 | else: 18 | img = torch.Tensor(img).cuda() 19 | return Variable(img) 20 | 21 | 22 | class BuildModel(object): 23 | def __init__(self, args): 24 | if args.conv: 25 | self.sizes = ([args.input_channel] + 26 | [args.hidden_channel] * (args.layers + 1)) 27 | self.convs = [] 28 | else: 29 | self.sizes = ([args.nin] + ([args.layer_neurons] * args.layers) + 30 | [args.nout]) 31 
| self.weights = [] 32 | for i in range(1, len(self.sizes)): 33 | if args.conv: 34 | if args.mixf: 35 | self.convs += [nn.Conv2d(self.sizes[i - 1], 36 | self.sizes[i], 37 | kernel_size=args.kernel_size).cuda().half()] 38 | else: 39 | self.convs += [nn.Conv2d(self.sizes[i - 1], 40 | self.sizes[i], 41 | kernel_size=args.kernel_size).cuda()] 42 | else: 43 | shape = (self.sizes[i - 1], self.sizes[i]) 44 | init_value = np.random.rand(*shape) 45 | if args.mixf: 46 | self.weights += [torch.Tensor(init_value).cuda().half()] 47 | else: 48 | self.weights += [torch.Tensor(init_value).cuda()] 49 | 50 | def run(self, img): 51 | self.outputs = [img] 52 | for i in range(1, len(self.sizes)): 53 | out = torch.mm(self.outputs[-1], self.weights[i - 1]) 54 | self.outputs.append(out) 55 | return self.outputs[-1] 56 | 57 | def run_conv(self, img): 58 | self.outputs = [img] 59 | for i in range(1, len(self.sizes)): 60 | out = self.convs[i - 1](self.outputs[-1]) 61 | self.outputs.append(out) 62 | return self.outputs[-1] 63 | 64 | 65 | def run_benchmark(args): 66 | img = build_img(args) 67 | model = BuildModel(args) 68 | # Start profiling 69 | time_start = datetime.now() 70 | if args.conv: 71 | run_fnc = model.run_conv 72 | else: 73 | run_fnc = model.run 74 | for i in range(args.nsteps): 75 | run_fnc(img) 76 | if (i + 1) % 100 == 0: 77 | print("Step %d/%d " % (i + 1, args.nsteps)) 78 | time_end = datetime.now() # end profiling 79 | time_spent = time_end - time_start 80 | seconds = time_spent.seconds + time_spent.days * 24 * 3600 81 | profile_message = 'execution time: %s sec + %s microsec' % (seconds, time_spent.microseconds) 82 | print (profile_message) 83 | 84 | 85 | if __name__ == '__main__': 86 | np.random.seed(12345678) 87 | default_batch_size = 4096 88 | default_nin = 2048 89 | default_nout = 2048 90 | default_nsteps = 2000 91 | default_layers = 1 92 | default_layer_neurons = 2048 93 | 94 | default_input_channel = 3 95 | default_hidden_channel = 128 96 | default_kernel_size = 5 97 | default_image_size = 64 98 | 99 | parser = argparse.ArgumentParser() 100 | parser.add_argument('--mixf', action='store_true', default=False, 101 | help='Enables mixed float precision') 102 | parser.add_argument("--batch_size", type=int, default=default_batch_size, 103 | help='Batch size of the layer (default %d)' % default_batch_size) 104 | parser.add_argument("--nin", type=int, default=default_nin, 105 | help='Input size of the layer (default %d)' % default_nin) 106 | parser.add_argument("--nout", type=int, default=default_nout, 107 | help='Output size of the layer (default %d)' % default_nout) 108 | parser.add_argument("--nsteps", type=int, default=default_nsteps, 109 | help='Number of training steps (default %d)' % default_nsteps) 110 | parser.add_argument("--layers", type=int, default=default_layers, 111 | help='Number of layers (default %d)' % default_layers) 112 | parser.add_argument("--layer-neurons", type=int, default=default_layer_neurons, 113 | help='Number of neurons per layer (default %d)' % default_layer_neurons) 114 | parser.add_argument("--conv", action='store_true', default=False) 115 | parser.add_argument("--input_channel", type=int, default=default_input_channel) 116 | parser.add_argument("--hidden_channel", type=int, default=default_hidden_channel) 117 | parser.add_argument("--kernel_size", type=int, default=default_kernel_size) 118 | parser.add_argument("--image_width", type=int, default=default_image_size) 119 | 120 | args = parser.parse_args() 121 | run_benchmark(args) 122 | 
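123 | # Example invocations (assuming a CUDA-capable GPU; flags are defined above):
124 | #   python torch_float16.py                 # fp32 matmul benchmark
125 | #   python torch_float16.py --mixf          # fp16 matmul benchmark
126 | #   python torch_float16.py --conv --mixf   # fp16 convolution benchmark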
--------------------------------------------------------------------------------