├── .gitignore
├── samples
    ├── cnn.png
    ├── biases.png
    └── weights.png
├── example
    ├── train.py
    └── models.py
├── README.md
├── plot_percentiles.py
└── monitor.py


/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__/
2 | *.swp
3 | *.swn
4 | *.swo
5 | 
6 | result
7 | 


--------------------------------------------------------------------------------
/samples/cnn.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hvy/chainer-param-monitor/HEAD/samples/cnn.png


--------------------------------------------------------------------------------
/samples/biases.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hvy/chainer-param-monitor/HEAD/samples/biases.png


--------------------------------------------------------------------------------
/samples/weights.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hvy/chainer-param-monitor/HEAD/samples/weights.png


--------------------------------------------------------------------------------
/example/train.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | from chainer import links as L
 3 | from chainer import optimizers, cuda
 4 | from chainer import datasets, iterators, training
 5 | from chainer.training import extensions
 6 | from models import CNN
 7 | 
 8 | 
 9 | def parse_args():
10 |     parser = argparse.ArgumentParser()
11 |     parser.add_argument('-G', '--gpu', type=int, default=2)
12 |     parser.add_argument('-E', '--epochs', type=int, default=100)
13 |     parser.add_argument('-B', '--batchsize', type=int, default=128)
14 |     return parser.parse_args()
15 | 
16 | 
17 | def main(args):
18 |     train, test = datasets.get_mnist(withlabel=True, ndim=3)
19 |     train_iter = iterators.SerialIterator(train, args.batchsize)
20 |     test_iter = iterators.SerialIterator(test, args.batchsize, repeat=False,
21 |                                          shuffle=False)
22 | 
23 |     model = L.Classifier(CNN())
24 | 
25 |     if args.gpu >= 0:
26 |         cuda.check_cuda_available()
27 |         cuda.get_device(args.gpu).use()
28 |         model.to_gpu()
29 | 
30 |     optimizer = optimizers.Adam()
31 |     optimizer.setup(model)
32 | 
33 |     updater = training.StandardUpdater(train_iter, optimizer, device=args.gpu)
34 | 
35 |     trainer = training.Trainer(updater, (args.epochs, 'epoch'))
36 |     trainer.extend(extensions.Evaluator(test_iter, model, device=args.gpu))
37 |     trainer.extend(extensions.LogReport())  # Default log report
38 |     trainer.extend(extensions.PrintReport(['epoch', 'main/loss',
39 |                                            'main/accuracy',
40 |                                            'validation/main/loss',
41 |                                            'validation/main/accuracy']))
42 |     trainer.extend(extensions.ProgressBar())
43 |     trainer.run()
44 | 
45 | 
46 | if __name__ == '__main__':
47 |     args = parse_args()
48 |     main(args)
49 | 


--------------------------------------------------------------------------------
/example/models.py:
--------------------------------------------------------------------------------
 1 | import chainer
 2 | from chainer import links as L
 3 | from chainer import functions as F
 4 | 
 5 | import os, sys
 6 | project_root = os.path.abspath('..')
 7 | if project_root not in sys.path:
 8 |     sys.path.insert(0, project_root)
 9 | import monitor
10 | 
11 | 
12 | class CNN(chainer.Chain):
13 |     def __init__(self):
14 |         super().__init__(
15 |             conv1=L.Convolution2D(1, 32, 4, stride=2, pad=1),
16 |             conv2=L.Convolution2D(32, 64, 4, stride=2, pad=1),
17 |             conv3=L.Convolution2D(64, 128, 4, stride=2, pad=1),
18 |             fc1=L.Linear(None, 1024),
19 |             fc2=L.Linear(1024, 10)
20 |         )
21 |         self.monitored_layers = ['conv1', 'conv2', 'conv3', 'fc1', 'fc2']
22 | 
23 |     def __call__(self, x):
24 |         # Collect and report the statistics from the previous call before
25 |         # proceeding with this forward propagation.
26 |         self.report()
27 | 
28 |         h = self.conv1(x)
29 |         h = self.conv2(h)
30 |         h = self.conv3(h)
31 |         h = self.fc1(h)
32 |         h = self.fc2(h)
33 |         return h
34 | 
35 |     def report(self):
36 |         # To aggregate statistics over all layers, skip the layer argument
37 |         # paramstats = monitor.weight_statistics(self)
38 |         # chainer.report(paramstats)
39 | 
40 |         for layer in self.monitored_layers:
41 |             stats = monitor.weight_statistics(self, layer)
42 |             chainer.report(stats)
43 | 
44 |             stats = monitor.bias_statistics(self, layer)
45 |             chainer.report(stats)
46 | 
47 |             stats = monitor.weight_gradient_statistics(self, layer)
48 |             chainer.report(stats)
49 | 
50 |             stats = monitor.bias_gradient_statistics(self, layer)
51 |             chainer.report(stats)
52 | 
53 |             stats = monitor.sparsity(self, layer)
54 |             chainer.report(stats)
55 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Neural Network Monitoring for Chainer Models
 2 | 
 3 | This is a Chainer plugin for computing statistics over weights, biases and gradients during training.
 4 | 
 5 | You can collect the above mentioned data from any [chainer.Chain](http://docs.chainer.org/en/stable/reference/core/link.html) and repeat it for each iteration or epoch, saving them to a log using e.g. [chainer.report()](http://docs.chainer.org/en/stable/reference/util/reporter.html) to plot the statistical changes over the course of training later on.
 6 | 
 7 | *Note: It is not yet optimized for speed. Computing percentiles is for instance slow.*
 8 | 
 9 | ## Statistics
10 | 
11 | <img src="./samples/cnn.png" width="1024px;"/>
12 | 
13 | *An example plot of weights, biases and gradients from different convolutional and fully connected layers.*
14 | 
15 | ### Data
16 | 
17 | - Mean
18 | - Standard deviation
19 | - Min
20 | - Max
21 | - Percentiles
22 | - Sparsity (actually just counting number of zeros)
23 | 
24 | ### Targets
25 | 
26 | - Weights
27 | - Biases
28 | - Gradients
29 | 
30 | For a **specific layer** or the aggregated data over the **entire model**.
31 | 
32 | ### Dependencies
33 | 
34 | Chainer 1.18.0 (including NumPy 1.11.2)
35 | 
36 | ## Example
37 | 
38 | ### Usage
39 | 
40 | ```python
41 | # This is simplified code, see the 'example' directory for a working example.
42 | import monitor
43 | 
44 | # Prepare the model.
45 | model = MLP()
46 | optimizer.setup(model)
47 | 
48 | # Forward computation, back propagation and a parameter update.
49 | # The gradients are still stored inside each parameter after those steps.
50 | loss = model(x, t)
51 | loss.backward()
52 | optimizer.update()
53 | 
54 | # Use the plugin to collect data and nicely ask Chainer to include it in the log.
55 | weight_report = monitor.weight_statistics(model)
56 | chainer.report(weight_report) # Mean, std, min, max, percentiles
57 | 
58 | bias_report = monitor.bias_statistics(model)
59 | chainer.report(bias_report)
60 | 
61 | fst_layer_grads = monitor.weight_gradient_statistics(model, layer_name='fc1')
62 | chainer.report(fst_layer_grads)
63 | 
64 | zeros = monitor.sparsity(model, include_bias=False)
65 | chainer.report(zeros)
66 | ```
67 | 
68 | ### Plotting the Statistics
69 | 
70 | Weights and biases when training a small convolutional neural network for classification for 100 epochs aggregated over all layers (including final fully connected linear layers). The different alphas show different percentiles.
71 | 
72 | #### Weights
73 | 
74 | <img src="./samples/weights.png" width="512px;"/>
75 | 
76 | #### Biases
77 | 
78 | <img src="./samples/biases.png" width="512px;"/>
79 | 


--------------------------------------------------------------------------------
/plot_percentiles.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import collections
  3 | import matplotlib.pyplot as plt
  4 | import json
  5 | 
  6 | 
  7 | ylabels = ['Weights', 'Biases', 'Weight Gradients', 'Bias Gradients']
  8 | log_key_templates = ['predictor/{layer}/W/data/{statistic}',
  9 |                      'predictor/{layer}/b/data/{statistic}',
 10 |                      'predictor/{layer}/W/grad/{statistic}',
 11 |                      'predictor/{layer}/b/grad/{statistic}']
 12 | 
 13 | 
 14 | def parse_args():
 15 |     parser = argparse.ArgumentParser()
 16 |     parser.add_argument('--log', type=str, default='example/result/log')
 17 |     parser.add_argument('--out', type=str, default='plot.png')
 18 |     parser.add_argument('--layers', nargs='+', type=str,
 19 |                         default=['conv1', 'conv2', 'conv3', 'fc1', 'fc2'])
 20 |     return parser.parse_args()
 21 | 
 22 | 
 23 | def load_log(filename, keys=None):
 24 | 
 25 |     """Parse a JSON file and return a dictionary with the given keys. Each
 26 |     key maps to a list of corresponding data measurements in the file."""
 27 | 
 28 |     log = collections.defaultdict(list)
 29 | 
 30 |     with open(filename) as f:
 31 |         for data in json.load(f):  # For each type of data
 32 |             if keys is not None:
 33 |                 for key in keys:
 34 |                     log[key].append(data[key])
 35 |             else:
 36 |                 for key, value in data.items():
 37 |                     log[key].append(value)
 38 |     return log
 39 | 
 40 | 
 41 | def plot_percentile_log(filename, log, layer_names, color='green', dpi=100):
 42 | 
 43 |     n_rows = len(layer_names)
 44 |     n_cols = len(log_key_templates)
 45 | 
 46 |     figsize = (1024*n_cols/dpi, 1024*n_rows/dpi)
 47 |     fig, axes = plt.subplots(n_rows, n_cols, figsize=figsize, dpi=dpi)
 48 | 
 49 |     if n_rows == 1:
 50 |         axes = axes.reshape(1, -1)
 51 |     elif n_cols == 1:
 52 |         axes = axes.reshape(-1, 1)
 53 | 
 54 |     for row in range(n_rows):
 55 |         for col in range(n_cols):
 56 | 
 57 |             ax = axes[row, col]
 58 |             key_template = log_key_templates[col]
 59 | 
 60 |             # Min, Max
 61 |             pmin_key = key_template.format(layer=layer_names[row],
 62 |                     statistic='min')
 63 |             pmax_key = key_template.format(layer=layer_names[row],
 64 |                     statistic='max')
 65 |             pmin = log[pmin_key]
 66 |             pmax = log[pmax_key]
 67 |             ax.fill_between(range(len(pmin)), pmin, pmax, facecolor=color,
 68 |                             alpha=0.2, linewidth=0)
 69 | 
 70 |             # Median
 71 |             z_key = key_template.format(layer=layer_names[row],
 72 |                     statistic='percentile/3')
 73 |             z = log[z_key]
 74 |             ax.plot(range(len(z)), z, color=color, alpha=0.2)
 75 | 
 76 |             # Get all percentiles and fill between
 77 |             n_percentiles = 3
 78 |             for p in range(n_percentiles):
 79 |                 s_key = key_template.format(layer=layer_names[row],
 80 |                         statistic='percentile/{}'.format(p))
 81 |                 ns_key = key_template.format(layer=layer_names[row],
 82 |                         statistic='percentile/{}'.format(6-p))
 83 |                 s = log[s_key]
 84 |                 ns = log[ns_key]
 85 |                 ax.fill_between(range(len(s)), s, ns, facecolor=color,
 86 |                                 alpha=0.2, linewidth=0)
 87 | 
 88 |             ax.set_xlabel('Epochs')
 89 |             ax.set_ylabel(ylabels[col])
 90 |             ax.set_title(layer_names[row])
 91 | 
 92 |     plt.savefig(filename, bbox_inches='tight', dpi=dpi)
 93 |     plt.clf()
 94 |     plt.close()
 95 | 
 96 | 
 97 | if __name__ == '__main__':
 98 |     args = parse_args()
 99 |     log = load_log(args.log)
100 |     plot_percentile_log(args.out, log, args.layers)
101 | 


--------------------------------------------------------------------------------
/monitor.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | from functools import reduce
  3 | import cupy
  4 | import chainer
  5 | 
  6 | 
  7 | # The name template of the statistic to collect and include in the report,
  8 | # e.g. 'predictor/conv1/W/grad/percentile/sigma_one'
  9 | key_template = '{model}/{layer}/{param}/{attr}/{statistic}'
 10 | 
 11 | 
 12 | def weight_statistics(model, layer_name=None):
 13 | 
 14 |     """Collect weight statistict from the given model and return it as a
 15 |     ``dict``.
 16 | 
 17 |     Args:
 18 |         model (~chainer.Chain): The model from which statistics are collected.
 19 |         layer_name (str): Name of the layer which may be specified or set to
 20 |             ``None`` to aggregate over all layers.
 21 | 
 22 |     Returns:
 23 |         dict: Parameter statistics.
 24 |     """
 25 | 
 26 |     return parameter_statistics(model, 'W', 'data', layer_name)
 27 | 
 28 | 
 29 | def bias_statistics(model, layer_name=None):
 30 | 
 31 |     """Collect bias statistict from the given model and return it as a
 32 |     ``dict``.
 33 | 
 34 |     Args:
 35 |         model (~chainer.Chain): The model from which statistics are collected.
 36 |         layer_name (str): Name of the layer which may be specified or set to
 37 |             ``None`` to aggregate over all layers.
 38 | 
 39 |     Returns:
 40 |         dict: Parameter statistics.
 41 |     """
 42 | 
 43 |     return parameter_statistics(model, 'b', 'data', layer_name)
 44 | 
 45 | 
 46 | def weight_gradient_statistics(model, layer_name=None):
 47 | 
 48 |     """Collect weight gradient statistict from the given model and return it
 49 |     as a ``dict``.
 50 | 
 51 |     Args:
 52 |         model (~chainer.Chain): The model from which statistics are collected.
 53 |         layer_name (str): Name of the layer which may be specified or set to
 54 |             ``None`` to aggregate over all layers.
 55 | 
 56 |     Returns:
 57 |         dict: Parameter statistics.
 58 |     """
 59 | 
 60 |     return parameter_statistics(model, 'W', 'grad', layer_name)
 61 | 
 62 | 
 63 | def bias_gradient_statistics(model, layer_name=None):
 64 | 
 65 |     """Collect bias gradient statistict from the given model and return it
 66 |     as a ``dict``.
 67 | 
 68 |     Args:
 69 |         model (~chainer.Chain): The model from which statistics are collected.
 70 |         layer_name (str): Name of the layer which may be specified or set to
 71 |             ``None`` to aggregate over all layers.
 72 | 
 73 |     Returns:
 74 |         dict: Parameter statistics.
 75 |     """
 76 | 
 77 |     return parameter_statistics(model, 'b', 'grad', layer_name)
 78 | 
 79 | 
 80 | def sparsity(model, include_bias=False, layer_name=None):
 81 | 
 82 |     """Count the number of parameters with the value zero for the given model
 83 |     and return it as a ``dict``.
 84 | 
 85 |     Args:
 86 |         model (~chainer.Chain): The model from which statistics are collected.
 87 |         include_bias (bool): ``True`` to include the number of biases that are
 88 |             zero, ``False`` to exclude them.
 89 |         layer_name (str): Name of the layer which may be specified or set to
 90 |             ``None`` to aggregate over all layers.
 91 | 
 92 |     Returns:
 93 |         dict: Parameter statistics.
 94 |     """
 95 | 
 96 |     xp = model.xp
 97 | 
 98 |     def reduce_count_zeros(acc, param):
 99 |         if param.name == 'W' or (include_bias and param.name == 'b'):
100 |             acc += param.data.size - xp.count_nonzero(param.data)
101 |         return acc
102 | 
103 |     if layer_name is not None:
104 |         sparsity = reduce(reduce_count_zeros, [getattr(model, layer_name)], 0)
105 |     else:
106 |         sparsity = reduce(reduce_count_zeros, model.params(), 0)
107 | 
108 |     key = key_template.format(model=model.name,
109 |                               layer='*' if layer_name is None else layer_name,
110 |                               param='Wb' if include_bias else 'W' ,
111 |                               attr='sparsity',
112 |                               statistic='zeros')
113 | 
114 |     return { key: sparsity }
115 | 
116 | 
117 | def parameter_statistics(model, param_name, attr_name, layer_name=None):
118 | 
119 |     """Collect statistict from the given model and return it as a ``dict``.
120 | 
121 |     The returned ``dict`` contains a key for each metric, mapping to a NumPy
122 |     or CuPy ``float32`` value depending on if the given model was on the CPU or
123 |     the GPU.
124 | 
125 |     Args:
126 |         model (~chainer.Chain): The model from which statistics are collected.
127 |         param_name (str): Name of the parameter, ``'W'`` or ``'b'``.
128 |         attr_name (str): Name of the attribute, ``'data'`` or ``'grad'``.
129 |         layer_name (str): Name of the layer which may be specified or set to
130 |             ``None`` to aggregate over all layers.
131 | 
132 |     Returns:
133 |         dict: Parameter statistics.
134 |     """
135 | 
136 |     if layer_name is not None:  # Collect statistics for a single layer only
137 |         l = getattr(model, layer_name)
138 |         lp = layer_params(l, param_name, attr_name)
139 |         return as_statistics(lp, model.name, param_name, attr_name,
140 |                              layer_name=layer_name)
141 | 
142 |     lp = layers_params(model, param_name, attr_name)
143 |     return as_statistics(lp, model.name, param_name, attr_name)
144 | 
145 | 
146 | def layer_params(layer, param_name, attr_name):
147 | 
148 |     """Return parameters in a flattened array from the given layer or an empty
149 |     array if the parameters are not found.
150 | 
151 |     Args:
152 |         layer (~chainer.Link): The layer from which parameters are collected.
153 |         param_name (str): Name of the parameter, ``'W'`` or ``'b'``.
154 |         attr_name (str): Name of the attribute, ``'data'`` or ``'grad'``.
155 | 
156 |     Returns:
157 |         array: Flattened array of parameters.
158 |     """
159 | 
160 |     if isinstance(layer, chainer.Chain):
161 |         # Nested chainer.Chain, aggregate all underlying statistics
162 |         return layers_params(layer, param_name, attr_name)
163 |     elif not hasattr(layer, param_name):
164 |         return layer.xp.array([])
165 | 
166 |     params = getattr(layer, param_name)
167 |     params = getattr(params, attr_name)
168 |     return params.flatten()
169 | 
170 | 
171 | def layers_params(model, param_name, attr_name):
172 | 
173 |     """Return all parameters in a flattened array from the given model.
174 | 
175 |     Args:
176 |         model (~chainer.Chain): The model from which parameters are collected.
177 |         param_name (str): Name of the parameter, ``'W'`` or ``'b'``.
178 |         attr_name (str): Name of the attribute, ``'data'`` or ``'grad'``.
179 | 
180 |     Returns:
181 |         array: Flattened array of parameters.
182 |     """
183 | 
184 |     xp = model.xp
185 |     params = xp.array([], dtype=xp.float32)
186 | 
187 |     for param in model.params():
188 |         if param.name == param_name:
189 |             values = getattr(param, attr_name)
190 |             values = values.flatten()
191 |             params = xp.concatenate((params, values))  # Slow?
192 | 
193 |     return params
194 | 
195 | 
196 | def as_statistics(data, model_name, param_name, attr_name, *, layer_name=None,
197 |                   statistics=('min', 'max', 'mean', 'std'),
198 |                   percentiles=(0.13, 2.28, 15.87, 50, 84.13, 97.72, 99.87)):
199 | 
200 |     """Compute statistics based on the given data and return it as a ``dict``.
201 | 
202 |     Args:
203 |         data (array): NumPy or CuPy array of data.
204 |         model_name (str): Name of the model,  e.g. ``predictor``.
205 |         param_name (str): Name of the parameter, ``'W'`` or ``'b'``.
206 |         attr_name (str): Name of the attribute, ``'data'`` or ``'grad'``.
207 |         layer_name (str): Name of the layer which may be specified or set to
208 |             ``None``. In the case of ``None`` the layer name will be set to
209 |             ``'*'``.
210 | 
211 |     Returns:
212 |         dict: Parameter statistics.
213 |     """
214 | 
215 |     stats = {}
216 | 
217 |     if layer_name is None:
218 |         layer_name = '*'
219 | 
220 |     if percentiles:
221 |         ps = get_percentiles(data, sigma=percentiles)
222 |         for i, percentile in enumerate(ps):
223 |             key = key_template.format(model=model_name,
224 |                                       layer=layer_name,
225 |                                       param=param_name,
226 |                                       attr=attr_name,
227 |                                       statistic='percentile/{}'.format(i))
228 |             stats[key] = percentile
229 | 
230 |     for s in statistics:
231 |         key = key_template.format(model=model_name,
232 |                                   layer=layer_name,
233 |                                   param=param_name,
234 |                                   attr=attr_name,
235 |                                   statistic=s)
236 |         try:
237 |             stats[key] = getattr(data, s)()
238 |         except ValueError:
239 |             # If data is missing from uninitialized model parameters, add
240 |             # NaN placeholders instead of skipping the measurements completely
241 |             # or registering zeros
242 |             stats[key] = float('NaN')
243 | 
244 |     return stats
245 | 
246 | 
247 | def get_percentiles(data, sigma):
248 | 
249 |     """Compute percentiles for data and return an array with the same length
250 |     as the number of elements in ``sigma``.
251 | 
252 |     Args:
253 |         data (array): 1-dimensional NumPy or CuPy arryay.
254 |         sigma (tuple): Sigmas for which percentiles are computed.
255 | 
256 |     Returns:
257 |         array: Array of percentiles.
258 |     """
259 | 
260 |     def _get_percentiles(_data, _sigma):
261 |         try:
262 |             return np.percentile(_data, _sigma)
263 |         except IndexError:  # Handle uninitialized model parameters
264 |             return np.array((float('NaN'),) * 7)
265 | 
266 |     if isinstance(data, cupy.ndarray):
267 |         # TODO(hvy): Make percentile computation faster for GPUs
268 |         data = cupy.asnumpy(data)
269 |         return cupy.asarray(_get_percentiles(data, sigma))
270 | 
271 |     return _get_percentiles(data, sigma)
272 | 


--------------------------------------------------------------------------------