├── .gitignore ├── samples ├── cnn.png ├── biases.png └── weights.png ├── example ├── train.py └── models.py ├── README.md ├── plot_percentiles.py └── monitor.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | *.swp 3 | *.swn 4 | *.swo 5 | 6 | result 7 | -------------------------------------------------------------------------------- /samples/cnn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hvy/chainer-param-monitor/HEAD/samples/cnn.png -------------------------------------------------------------------------------- /samples/biases.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hvy/chainer-param-monitor/HEAD/samples/biases.png -------------------------------------------------------------------------------- /samples/weights.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hvy/chainer-param-monitor/HEAD/samples/weights.png -------------------------------------------------------------------------------- /example/train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from chainer import links as L 3 | from chainer import optimizers, cuda 4 | from chainer import datasets, iterators, training 5 | from chainer.training import extensions 6 | from models import CNN 7 | 8 | 9 | def parse_args(): 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('-G', '--gpu', type=int, default=2) 12 | parser.add_argument('-E', '--epochs', type=int, default=100) 13 | parser.add_argument('-B', '--batchsize', type=int, default=128) 14 | return parser.parse_args() 15 | 16 | 17 | def main(args): 18 | train, test = datasets.get_mnist(withlabel=True, ndim=3) 19 | train_iter = iterators.SerialIterator(train, args.batchsize) 20 | test_iter = iterators.SerialIterator(test, args.batchsize, repeat=False, 21 | shuffle=False) 22 | 23 | model = L.Classifier(CNN()) 24 | 25 | if args.gpu >= 0: 26 | cuda.check_cuda_available() 27 | cuda.get_device(args.gpu).use() 28 | model.to_gpu() 29 | 30 | optimizer = optimizers.Adam() 31 | optimizer.setup(model) 32 | 33 | updater = training.StandardUpdater(train_iter, optimizer, device=args.gpu) 34 | 35 | trainer = training.Trainer(updater, (args.epochs, 'epoch')) 36 | trainer.extend(extensions.Evaluator(test_iter, model, device=args.gpu)) 37 | trainer.extend(extensions.LogReport()) # Default log report 38 | trainer.extend(extensions.PrintReport(['epoch', 'main/loss', 39 | 'main/accuracy', 40 | 'validation/main/loss', 41 | 'validation/main/accuracy'])) 42 | trainer.extend(extensions.ProgressBar()) 43 | trainer.run() 44 | 45 | 46 | if __name__ == '__main__': 47 | args = parse_args() 48 | main(args) 49 | -------------------------------------------------------------------------------- /example/models.py: -------------------------------------------------------------------------------- 1 | import chainer 2 | from chainer import links as L 3 | from chainer import functions as F 4 | 5 | import os, sys 6 | project_root = os.path.abspath('..') 7 | if project_root not in sys.path: 8 | sys.path.insert(0, project_root) 9 | import monitor 10 | 11 | 12 | class CNN(chainer.Chain): 13 | def __init__(self): 14 | super().__init__( 15 | conv1=L.Convolution2D(1, 32, 4, stride=2, pad=1), 16 | conv2=L.Convolution2D(32, 64, 4, stride=2, pad=1), 17 | conv3=L.Convolution2D(64, 128, 4, stride=2, pad=1), 18 | fc1=L.Linear(None, 1024), 19 | fc2=L.Linear(1024, 10) 20 | ) 21 | self.monitored_layers = ['conv1', 'conv2', 'conv3', 'fc1', 'fc2'] 22 | 23 | def __call__(self, x): 24 | # Collect and report the statistics from the previous call before 25 | # proceeding with this forward propagation. 26 | self.report() 27 | 28 | h = self.conv1(x) 29 | h = self.conv2(h) 30 | h = self.conv3(h) 31 | h = self.fc1(h) 32 | h = self.fc2(h) 33 | return h 34 | 35 | def report(self): 36 | # To aggregate statistics over all layers, skip the layer argument 37 | # paramstats = monitor.weight_statistics(self) 38 | # chainer.report(paramstats) 39 | 40 | for layer in self.monitored_layers: 41 | stats = monitor.weight_statistics(self, layer) 42 | chainer.report(stats) 43 | 44 | stats = monitor.bias_statistics(self, layer) 45 | chainer.report(stats) 46 | 47 | stats = monitor.weight_gradient_statistics(self, layer) 48 | chainer.report(stats) 49 | 50 | stats = monitor.bias_gradient_statistics(self, layer) 51 | chainer.report(stats) 52 | 53 | stats = monitor.sparsity(self, layer) 54 | chainer.report(stats) 55 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Neural Network Monitoring for Chainer Models 2 | 3 | This is a Chainer plugin for computing statistics over weights, biases and gradients during training. 4 | 5 | You can collect the above mentioned data from any [chainer.Chain](http://docs.chainer.org/en/stable/reference/core/link.html) and repeat it for each iteration or epoch, saving them to a log using e.g. [chainer.report()](http://docs.chainer.org/en/stable/reference/util/reporter.html) to plot the statistical changes over the course of training later on. 6 | 7 | *Note: It is not yet optimized for speed. Computing percentiles is for instance slow.* 8 | 9 | ## Statistics 10 | 11 | 12 | 13 | *An example plot of weights, biases and gradients from different convolutional and fully connected layers.* 14 | 15 | ### Data 16 | 17 | - Mean 18 | - Standard deviation 19 | - Min 20 | - Max 21 | - Percentiles 22 | - Sparsity (actually just counting number of zeros) 23 | 24 | ### Targets 25 | 26 | - Weights 27 | - Biases 28 | - Gradients 29 | 30 | For a **specific layer** or the aggregated data over the **entire model**. 31 | 32 | ### Dependencies 33 | 34 | Chainer 1.18.0 (including NumPy 1.11.2) 35 | 36 | ## Example 37 | 38 | ### Usage 39 | 40 | ```python 41 | # This is simplified code, see the 'example' directory for a working example. 42 | import monitor 43 | 44 | # Prepare the model. 45 | model = MLP() 46 | optimizer.setup(model) 47 | 48 | # Forward computation, back propagation and a parameter update. 49 | # The gradients are still stored inside each parameter after those steps. 50 | loss = model(x, t) 51 | loss.backward() 52 | optimizer.update() 53 | 54 | # Use the plugin to collect data and nicely ask Chainer to include it in the log. 55 | weight_report = monitor.weight_statistics(model) 56 | chainer.report(weight_report) # Mean, std, min, max, percentiles 57 | 58 | bias_report = monitor.bias_statistics(model) 59 | chainer.report(bias_report) 60 | 61 | fst_layer_grads = monitor.weight_gradient_statistics(model, layer_name='fc1') 62 | chainer.report(fst_layer_grads) 63 | 64 | zeros = monitor.sparsity(model, include_bias=False) 65 | chainer.report(zeros) 66 | ``` 67 | 68 | ### Plotting the Statistics 69 | 70 | Weights and biases when training a small convolutional neural network for classification for 100 epochs aggregated over all layers (including final fully connected linear layers). The different alphas show different percentiles. 71 | 72 | #### Weights 73 | 74 | 75 | 76 | #### Biases 77 | 78 | 79 | -------------------------------------------------------------------------------- /plot_percentiles.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import collections 3 | import matplotlib.pyplot as plt 4 | import json 5 | 6 | 7 | ylabels = ['Weights', 'Biases', 'Weight Gradients', 'Bias Gradients'] 8 | log_key_templates = ['predictor/{layer}/W/data/{statistic}', 9 | 'predictor/{layer}/b/data/{statistic}', 10 | 'predictor/{layer}/W/grad/{statistic}', 11 | 'predictor/{layer}/b/grad/{statistic}'] 12 | 13 | 14 | def parse_args(): 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument('--log', type=str, default='example/result/log') 17 | parser.add_argument('--out', type=str, default='plot.png') 18 | parser.add_argument('--layers', nargs='+', type=str, 19 | default=['conv1', 'conv2', 'conv3', 'fc1', 'fc2']) 20 | return parser.parse_args() 21 | 22 | 23 | def load_log(filename, keys=None): 24 | 25 | """Parse a JSON file and return a dictionary with the given keys. Each 26 | key maps to a list of corresponding data measurements in the file.""" 27 | 28 | log = collections.defaultdict(list) 29 | 30 | with open(filename) as f: 31 | for data in json.load(f): # For each type of data 32 | if keys is not None: 33 | for key in keys: 34 | log[key].append(data[key]) 35 | else: 36 | for key, value in data.items(): 37 | log[key].append(value) 38 | return log 39 | 40 | 41 | def plot_percentile_log(filename, log, layer_names, color='green', dpi=100): 42 | 43 | n_rows = len(layer_names) 44 | n_cols = len(log_key_templates) 45 | 46 | figsize = (1024*n_cols/dpi, 1024*n_rows/dpi) 47 | fig, axes = plt.subplots(n_rows, n_cols, figsize=figsize, dpi=dpi) 48 | 49 | if n_rows == 1: 50 | axes = axes.reshape(1, -1) 51 | elif n_cols == 1: 52 | axes = axes.reshape(-1, 1) 53 | 54 | for row in range(n_rows): 55 | for col in range(n_cols): 56 | 57 | ax = axes[row, col] 58 | key_template = log_key_templates[col] 59 | 60 | # Min, Max 61 | pmin_key = key_template.format(layer=layer_names[row], 62 | statistic='min') 63 | pmax_key = key_template.format(layer=layer_names[row], 64 | statistic='max') 65 | pmin = log[pmin_key] 66 | pmax = log[pmax_key] 67 | ax.fill_between(range(len(pmin)), pmin, pmax, facecolor=color, 68 | alpha=0.2, linewidth=0) 69 | 70 | # Median 71 | z_key = key_template.format(layer=layer_names[row], 72 | statistic='percentile/3') 73 | z = log[z_key] 74 | ax.plot(range(len(z)), z, color=color, alpha=0.2) 75 | 76 | # Get all percentiles and fill between 77 | n_percentiles = 3 78 | for p in range(n_percentiles): 79 | s_key = key_template.format(layer=layer_names[row], 80 | statistic='percentile/{}'.format(p)) 81 | ns_key = key_template.format(layer=layer_names[row], 82 | statistic='percentile/{}'.format(6-p)) 83 | s = log[s_key] 84 | ns = log[ns_key] 85 | ax.fill_between(range(len(s)), s, ns, facecolor=color, 86 | alpha=0.2, linewidth=0) 87 | 88 | ax.set_xlabel('Epochs') 89 | ax.set_ylabel(ylabels[col]) 90 | ax.set_title(layer_names[row]) 91 | 92 | plt.savefig(filename, bbox_inches='tight', dpi=dpi) 93 | plt.clf() 94 | plt.close() 95 | 96 | 97 | if __name__ == '__main__': 98 | args = parse_args() 99 | log = load_log(args.log) 100 | plot_percentile_log(args.out, log, args.layers) 101 | -------------------------------------------------------------------------------- /monitor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from functools import reduce 3 | import cupy 4 | import chainer 5 | 6 | 7 | # The name template of the statistic to collect and include in the report, 8 | # e.g. 'predictor/conv1/W/grad/percentile/sigma_one' 9 | key_template = '{model}/{layer}/{param}/{attr}/{statistic}' 10 | 11 | 12 | def weight_statistics(model, layer_name=None): 13 | 14 | """Collect weight statistict from the given model and return it as a 15 | ``dict``. 16 | 17 | Args: 18 | model (~chainer.Chain): The model from which statistics are collected. 19 | layer_name (str): Name of the layer which may be specified or set to 20 | ``None`` to aggregate over all layers. 21 | 22 | Returns: 23 | dict: Parameter statistics. 24 | """ 25 | 26 | return parameter_statistics(model, 'W', 'data', layer_name) 27 | 28 | 29 | def bias_statistics(model, layer_name=None): 30 | 31 | """Collect bias statistict from the given model and return it as a 32 | ``dict``. 33 | 34 | Args: 35 | model (~chainer.Chain): The model from which statistics are collected. 36 | layer_name (str): Name of the layer which may be specified or set to 37 | ``None`` to aggregate over all layers. 38 | 39 | Returns: 40 | dict: Parameter statistics. 41 | """ 42 | 43 | return parameter_statistics(model, 'b', 'data', layer_name) 44 | 45 | 46 | def weight_gradient_statistics(model, layer_name=None): 47 | 48 | """Collect weight gradient statistict from the given model and return it 49 | as a ``dict``. 50 | 51 | Args: 52 | model (~chainer.Chain): The model from which statistics are collected. 53 | layer_name (str): Name of the layer which may be specified or set to 54 | ``None`` to aggregate over all layers. 55 | 56 | Returns: 57 | dict: Parameter statistics. 58 | """ 59 | 60 | return parameter_statistics(model, 'W', 'grad', layer_name) 61 | 62 | 63 | def bias_gradient_statistics(model, layer_name=None): 64 | 65 | """Collect bias gradient statistict from the given model and return it 66 | as a ``dict``. 67 | 68 | Args: 69 | model (~chainer.Chain): The model from which statistics are collected. 70 | layer_name (str): Name of the layer which may be specified or set to 71 | ``None`` to aggregate over all layers. 72 | 73 | Returns: 74 | dict: Parameter statistics. 75 | """ 76 | 77 | return parameter_statistics(model, 'b', 'grad', layer_name) 78 | 79 | 80 | def sparsity(model, include_bias=False, layer_name=None): 81 | 82 | """Count the number of parameters with the value zero for the given model 83 | and return it as a ``dict``. 84 | 85 | Args: 86 | model (~chainer.Chain): The model from which statistics are collected. 87 | include_bias (bool): ``True`` to include the number of biases that are 88 | zero, ``False`` to exclude them. 89 | layer_name (str): Name of the layer which may be specified or set to 90 | ``None`` to aggregate over all layers. 91 | 92 | Returns: 93 | dict: Parameter statistics. 94 | """ 95 | 96 | xp = model.xp 97 | 98 | def reduce_count_zeros(acc, param): 99 | if param.name == 'W' or (include_bias and param.name == 'b'): 100 | acc += param.data.size - xp.count_nonzero(param.data) 101 | return acc 102 | 103 | if layer_name is not None: 104 | sparsity = reduce(reduce_count_zeros, [getattr(model, layer_name)], 0) 105 | else: 106 | sparsity = reduce(reduce_count_zeros, model.params(), 0) 107 | 108 | key = key_template.format(model=model.name, 109 | layer='*' if layer_name is None else layer_name, 110 | param='Wb' if include_bias else 'W' , 111 | attr='sparsity', 112 | statistic='zeros') 113 | 114 | return { key: sparsity } 115 | 116 | 117 | def parameter_statistics(model, param_name, attr_name, layer_name=None): 118 | 119 | """Collect statistict from the given model and return it as a ``dict``. 120 | 121 | The returned ``dict`` contains a key for each metric, mapping to a NumPy 122 | or CuPy ``float32`` value depending on if the given model was on the CPU or 123 | the GPU. 124 | 125 | Args: 126 | model (~chainer.Chain): The model from which statistics are collected. 127 | param_name (str): Name of the parameter, ``'W'`` or ``'b'``. 128 | attr_name (str): Name of the attribute, ``'data'`` or ``'grad'``. 129 | layer_name (str): Name of the layer which may be specified or set to 130 | ``None`` to aggregate over all layers. 131 | 132 | Returns: 133 | dict: Parameter statistics. 134 | """ 135 | 136 | if layer_name is not None: # Collect statistics for a single layer only 137 | l = getattr(model, layer_name) 138 | lp = layer_params(l, param_name, attr_name) 139 | return as_statistics(lp, model.name, param_name, attr_name, 140 | layer_name=layer_name) 141 | 142 | lp = layers_params(model, param_name, attr_name) 143 | return as_statistics(lp, model.name, param_name, attr_name) 144 | 145 | 146 | def layer_params(layer, param_name, attr_name): 147 | 148 | """Return parameters in a flattened array from the given layer or an empty 149 | array if the parameters are not found. 150 | 151 | Args: 152 | layer (~chainer.Link): The layer from which parameters are collected. 153 | param_name (str): Name of the parameter, ``'W'`` or ``'b'``. 154 | attr_name (str): Name of the attribute, ``'data'`` or ``'grad'``. 155 | 156 | Returns: 157 | array: Flattened array of parameters. 158 | """ 159 | 160 | if isinstance(layer, chainer.Chain): 161 | # Nested chainer.Chain, aggregate all underlying statistics 162 | return layers_params(layer, param_name, attr_name) 163 | elif not hasattr(layer, param_name): 164 | return layer.xp.array([]) 165 | 166 | params = getattr(layer, param_name) 167 | params = getattr(params, attr_name) 168 | return params.flatten() 169 | 170 | 171 | def layers_params(model, param_name, attr_name): 172 | 173 | """Return all parameters in a flattened array from the given model. 174 | 175 | Args: 176 | model (~chainer.Chain): The model from which parameters are collected. 177 | param_name (str): Name of the parameter, ``'W'`` or ``'b'``. 178 | attr_name (str): Name of the attribute, ``'data'`` or ``'grad'``. 179 | 180 | Returns: 181 | array: Flattened array of parameters. 182 | """ 183 | 184 | xp = model.xp 185 | params = xp.array([], dtype=xp.float32) 186 | 187 | for param in model.params(): 188 | if param.name == param_name: 189 | values = getattr(param, attr_name) 190 | values = values.flatten() 191 | params = xp.concatenate((params, values)) # Slow? 192 | 193 | return params 194 | 195 | 196 | def as_statistics(data, model_name, param_name, attr_name, *, layer_name=None, 197 | statistics=('min', 'max', 'mean', 'std'), 198 | percentiles=(0.13, 2.28, 15.87, 50, 84.13, 97.72, 99.87)): 199 | 200 | """Compute statistics based on the given data and return it as a ``dict``. 201 | 202 | Args: 203 | data (array): NumPy or CuPy array of data. 204 | model_name (str): Name of the model, e.g. ``predictor``. 205 | param_name (str): Name of the parameter, ``'W'`` or ``'b'``. 206 | attr_name (str): Name of the attribute, ``'data'`` or ``'grad'``. 207 | layer_name (str): Name of the layer which may be specified or set to 208 | ``None``. In the case of ``None`` the layer name will be set to 209 | ``'*'``. 210 | 211 | Returns: 212 | dict: Parameter statistics. 213 | """ 214 | 215 | stats = {} 216 | 217 | if layer_name is None: 218 | layer_name = '*' 219 | 220 | if percentiles: 221 | ps = get_percentiles(data, sigma=percentiles) 222 | for i, percentile in enumerate(ps): 223 | key = key_template.format(model=model_name, 224 | layer=layer_name, 225 | param=param_name, 226 | attr=attr_name, 227 | statistic='percentile/{}'.format(i)) 228 | stats[key] = percentile 229 | 230 | for s in statistics: 231 | key = key_template.format(model=model_name, 232 | layer=layer_name, 233 | param=param_name, 234 | attr=attr_name, 235 | statistic=s) 236 | try: 237 | stats[key] = getattr(data, s)() 238 | except ValueError: 239 | # If data is missing from uninitialized model parameters, add 240 | # NaN placeholders instead of skipping the measurements completely 241 | # or registering zeros 242 | stats[key] = float('NaN') 243 | 244 | return stats 245 | 246 | 247 | def get_percentiles(data, sigma): 248 | 249 | """Compute percentiles for data and return an array with the same length 250 | as the number of elements in ``sigma``. 251 | 252 | Args: 253 | data (array): 1-dimensional NumPy or CuPy arryay. 254 | sigma (tuple): Sigmas for which percentiles are computed. 255 | 256 | Returns: 257 | array: Array of percentiles. 258 | """ 259 | 260 | def _get_percentiles(_data, _sigma): 261 | try: 262 | return np.percentile(_data, _sigma) 263 | except IndexError: # Handle uninitialized model parameters 264 | return np.array((float('NaN'),) * 7) 265 | 266 | if isinstance(data, cupy.ndarray): 267 | # TODO(hvy): Make percentile computation faster for GPUs 268 | data = cupy.asnumpy(data) 269 | return cupy.asarray(_get_percentiles(data, sigma)) 270 | 271 | return _get_percentiles(data, sigma) 272 | --------------------------------------------------------------------------------