├── .gitignore ├── README.md ├── example_mnist.py └── gpu_profile.py /.gitignore: -------------------------------------------------------------------------------- 1 | data 2 | __pycache__ 3 | *prof.txt 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # gpu_memory_profiling 2 | Profile the GPU memory usage of every line in a PyTorch program 3 | 4 | ## Example Usage 5 | ```bash 6 | python example_mnist.py 7 | ``` 8 | 9 | ## Dependency 10 | This code depends on [py3nvml](https://github.com/fbcotter/py3nvml). Pip install is available here: 11 | ``` 12 | pip install py3nvml 13 | ``` 14 | 15 | Tested with pytorch version 0.4.0 and py3nvml version 0.2.0 16 | 17 | ### Acknowledgement 18 | The gpu_profile.py is a modified version of [this script](https://gist.github.com/MInner/8968b3b120c95d3f50b8a22a74bf66bc). 19 | 20 | The example_mnist.py is modified from [this script](https://github.com/pytorch/examples/blob/master/mnist/main.py). 
21 | -------------------------------------------------------------------------------- /example_mnist.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import os 3 | os.environ['CUDA_VISIBLE_DEVICES']='2' 4 | os.environ['GPU_DEBUG']='2' 5 | 6 | import argparse 7 | import torch 8 | import torch.nn as nn 9 | import torch.nn.functional as F 10 | import torch.optim as optim 11 | from torchvision import datasets, transforms 12 | 13 | class Net(nn.Module): 14 | def __init__(self): 15 | super(Net, self).__init__() 16 | self.conv1 = nn.Conv2d(1, 10, kernel_size=5) 17 | self.conv2 = nn.Conv2d(10, 20, kernel_size=5) 18 | self.conv2_drop = nn.Dropout2d() 19 | self.fc1 = nn.Linear(320, 50) 20 | self.fc2 = nn.Linear(50, 10) 21 | 22 | def forward(self, x): 23 | x = F.relu(F.max_pool2d(self.conv1(x), 2)) 24 | x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2)) 25 | x = x.view(-1, 320) 26 | x = F.relu(self.fc1(x)) 27 | x = F.dropout(x, training=self.training) 28 | x = self.fc2(x) 29 | return F.log_softmax(x, dim=1) 30 | 31 | import sys 32 | from gpu_profile import gpu_profile 33 | 34 | def train(args, model, device, train_loader, optimizer, epoch): 35 | model.train() 36 | for batch_idx, (data, target) in enumerate(train_loader): 37 | data, target = data.to(device), target.to(device) 38 | optimizer.zero_grad() 39 | output = model(data) 40 | loss = F.nll_loss(output, target) 41 | loss.backward() 42 | optimizer.step() 43 | if batch_idx % args.log_interval == 0: 44 | print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( 45 | epoch, batch_idx * len(data), len(train_loader.dataset), 46 | 100. 
* batch_idx / len(train_loader), loss.item())) 47 | 48 | #gpu_profile(frame=sys._getframe(), event='line', arg=None) 49 | 50 | def test(args, model, device, test_loader): 51 | model.eval() 52 | test_loss = 0 53 | correct = 0 54 | with torch.no_grad(): 55 | for data, target in test_loader: 56 | data, target = data.to(device), target.to(device) 57 | output = model(data) 58 | test_loss += F.nll_loss(output, target, size_average=False).item() # sum up batch loss 59 | pred = output.max(1, keepdim=True)[1] # get the index of the max log-probability 60 | correct += pred.eq(target.view_as(pred)).sum().item() 61 | 62 | test_loss /= len(test_loader.dataset) 63 | print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format( 64 | test_loss, correct, len(test_loader.dataset), 65 | 100. * correct / len(test_loader.dataset))) 66 | 67 | def main(): 68 | # Training settings 69 | parser = argparse.ArgumentParser(description='PyTorch MNIST Example') 70 | parser.add_argument('--batch-size', type=int, default=64, metavar='N', 71 | help='input batch size for training (default: 64)') 72 | parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N', 73 | help='input batch size for testing (default: 1000)') 74 | parser.add_argument('--epochs', type=int, default=10, metavar='N', 75 | help='number of epochs to train (default: 10)') 76 | parser.add_argument('--lr', type=float, default=0.01, metavar='LR', 77 | help='learning rate (default: 0.01)') 78 | parser.add_argument('--momentum', type=float, default=0.5, metavar='M', 79 | help='SGD momentum (default: 0.5)') 80 | parser.add_argument('--no-cuda', action='store_true', default=False, 81 | help='disables CUDA training') 82 | parser.add_argument('--seed', type=int, default=1, metavar='S', 83 | help='random seed (default: 1)') 84 | parser.add_argument('--log-interval', type=int, default=10, metavar='N', 85 | help='how many batches to wait before logging training status') 86 | args = parser.parse_args() 87 | 
use_cuda = not args.no_cuda and torch.cuda.is_available() 88 | 89 | torch.manual_seed(args.seed) 90 | 91 | device = torch.device("cuda" if use_cuda else "cpu") 92 | 93 | kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {} 94 | train_loader = torch.utils.data.DataLoader( 95 | datasets.MNIST('./data', train=True, download=True, 96 | transform=transforms.Compose([ 97 | transforms.ToTensor(), 98 | transforms.Normalize((0.1307,), (0.3081,)) 99 | ])), 100 | batch_size=args.batch_size, shuffle=True, **kwargs) 101 | test_loader = torch.utils.data.DataLoader( 102 | datasets.MNIST('./data', train=False, transform=transforms.Compose([ 103 | transforms.ToTensor(), 104 | transforms.Normalize((0.1307,), (0.3081,)) 105 | ])), 106 | batch_size=args.test_batch_size, shuffle=True, **kwargs) 107 | 108 | 109 | model = Net().to(device) 110 | optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum) 111 | 112 | for epoch in range(1, args.epochs+1): 113 | train(args, model, device, train_loader, optimizer, epoch) 114 | test(args, model, device, test_loader) 115 | 116 | 117 | if __name__ == '__main__': 118 | sys.settrace(gpu_profile) 119 | main() -------------------------------------------------------------------------------- /gpu_profile.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import linecache 3 | import os 4 | 5 | os.environ['CUDA_LAUNCH_BLOCKING']='1' 6 | 7 | #import pynvml3 8 | from py3nvml import py3nvml 9 | import torch 10 | import socket 11 | 12 | # different settings 13 | print_tensor_sizes = False 14 | use_incremental = False 15 | 16 | 17 | if 'GPU_DEBUG' in os.environ: 18 | gpu_profile_fn = f"Host_{socket.gethostname()}_gpu{os.environ['GPU_DEBUG']}_mem_prof-{datetime.datetime.now():%d-%b-%y-%H-%M-%S}.prof.txt" 19 | print('profiling gpu usage to ', gpu_profile_fn) 20 | 21 | 22 | ## Global variables 23 | last_tensor_sizes = set() 24 | last_meminfo_used = 0 25 | lineno = None 26 
| func_name = None 27 | filename = None 28 | module_name = None 29 | 30 | 31 | def gpu_profile(frame, event, arg): 32 | # it is _about to_ execute (!) 33 | global last_tensor_sizes 34 | global last_meminfo_used 35 | global lineno, func_name, filename, module_name 36 | 37 | if event == 'line': 38 | try: 39 | # about _previous_ line (!) 40 | if lineno is not None: 41 | py3nvml.nvmlInit() 42 | handle = py3nvml.nvmlDeviceGetHandleByIndex(int(os.environ['GPU_DEBUG'])) 43 | meminfo = py3nvml.nvmlDeviceGetMemoryInfo(handle) 44 | line = linecache.getline(filename, lineno) 45 | where_str = module_name+' '+func_name+':'+str(lineno) 46 | 47 | new_meminfo_used = meminfo.used 48 | mem_display = new_meminfo_used-last_meminfo_used if use_incremental else new_meminfo_used 49 | with open(gpu_profile_fn, 'a+') as f: 50 | f.write(f"{where_str:<50}" 51 | f":{(mem_display)/1024**2:<7.1f}Mb " 52 | f"{line.rstrip()}\n") 53 | 54 | last_meminfo_used = new_meminfo_used 55 | if print_tensor_sizes is True: 56 | for tensor in get_tensors(): 57 | if not hasattr(tensor, 'dbg_alloc_where'): 58 | tensor.dbg_alloc_where = where_str 59 | new_tensor_sizes = {(type(x), tuple(x.size()), x.dbg_alloc_where) 60 | for x in get_tensors()} 61 | for t, s, loc in new_tensor_sizes - last_tensor_sizes: 62 | f.write(f'+ {loc:<50} {str(s):<20} {str(t):<10}\n') 63 | for t, s, loc in last_tensor_sizes - new_tensor_sizes: 64 | f.write(f'- {loc:<50} {str(s):<20} {str(t):<10}\n') 65 | last_tensor_sizes = new_tensor_sizes 66 | py3nvml.nvmlShutdown() 67 | 68 | # save details about line _to be_ executed 69 | lineno = None 70 | 71 | func_name = frame.f_code.co_name 72 | filename = frame.f_globals["__file__"] 73 | if (filename.endswith(".pyc") or 74 | filename.endswith(".pyo")): 75 | filename = filename[:-1] 76 | module_name = frame.f_globals["__name__"] 77 | lineno = frame.f_lineno 78 | 79 | #only profile codes within the parenet folder, otherwise there are too many function calls into other pytorch scripts 80 | #need to 
modify the key words below to suit your case. 81 | if 'gpu_memory_profiling' not in os.path.dirname(os.path.abspath(filename)): 82 | lineno = None # skip current line evaluation 83 | 84 | if ('car_datasets' in filename 85 | or '_exec_config' in func_name 86 | or 'gpu_profile' in module_name 87 | or 'tee_stdout' in module_name): 88 | lineno = None # skip othe unnecessary lines 89 | 90 | return gpu_profile 91 | 92 | except (KeyError, AttributeError): 93 | pass 94 | 95 | return gpu_profile 96 | 97 | 98 | def get_tensors(gpu_only=True): 99 | import gc 100 | for obj in gc.get_objects(): 101 | try: 102 | if torch.is_tensor(obj): 103 | tensor = obj 104 | elif hasattr(obj, 'data') and torch.is_tensor(obj.data): 105 | tensor = obj.data 106 | else: 107 | continue 108 | 109 | if tensor.is_cuda: 110 | yield tensor 111 | except Exception as e: 112 | pass 113 | --------------------------------------------------------------------------------