├── .gitignore
├── samples
│   ├── cnn.png
│   ├── biases.png
│   └── weights.png
├── example
│   ├── train.py
│   └── models.py
├── README.md
├── plot_percentiles.py
└── monitor.py
/.gitignore:
--------------------------------------------------------------------------------
__pycache__/
*.swp
*.swn
*.swo

result
--------------------------------------------------------------------------------
/samples/cnn.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hvy/chainer-param-monitor/HEAD/samples/cnn.png
--------------------------------------------------------------------------------
/samples/biases.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hvy/chainer-param-monitor/HEAD/samples/biases.png
--------------------------------------------------------------------------------
/samples/weights.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hvy/chainer-param-monitor/HEAD/samples/weights.png
--------------------------------------------------------------------------------
/example/train.py:
--------------------------------------------------------------------------------
import argparse
from chainer import links as L
from chainer import optimizers, cuda
from chainer import datasets, iterators, training
from chainer.training import extensions
from models import CNN


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('-G', '--gpu', type=int, default=-1)  # -1 means CPU
    parser.add_argument('-E', '--epochs', type=int, default=100)
    parser.add_argument('-B', '--batchsize', type=int, default=128)
    return parser.parse_args()


def main(args):
    train, test = datasets.get_mnist(withlabel=True, ndim=3)
    train_iter = iterators.SerialIterator(train, args.batchsize)
    test_iter = iterators.SerialIterator(test, args.batchsize, repeat=False,
                                         shuffle=False)

    model = L.Classifier(CNN())

    if args.gpu >= 0:
        cuda.check_cuda_available()
        cuda.get_device(args.gpu).use()
        model.to_gpu()

    optimizer = optimizers.Adam()
    optimizer.setup(model)

    updater = training.StandardUpdater(train_iter, optimizer, device=args.gpu)

    trainer = training.Trainer(updater, (args.epochs, 'epoch'))
    trainer.extend(extensions.Evaluator(test_iter, model, device=args.gpu))
    trainer.extend(extensions.LogReport())  # Default log report
    trainer.extend(extensions.PrintReport(['epoch', 'main/loss',
                                           'main/accuracy',
                                           'validation/main/loss',
                                           'validation/main/accuracy']))
    trainer.extend(extensions.ProgressBar())
    trainer.run()


if __name__ == '__main__':
    args = parse_args()
    main(args)
--------------------------------------------------------------------------------
/example/models.py:
--------------------------------------------------------------------------------
import chainer
from chainer import links as L
from chainer import functions as F

import os, sys
# Resolve the project root relative to this file so that `monitor` can be
# imported regardless of the current working directory.
project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
if project_root not in sys.path:
    sys.path.insert(0, project_root)
import monitor


class CNN(chainer.Chain):
    def __init__(self):
        super().__init__(
            conv1=L.Convolution2D(1, 32, 4, stride=2, pad=1),
            conv2=L.Convolution2D(32, 64, 4, stride=2, pad=1),
            conv3=L.Convolution2D(64, 128, 4, stride=2, pad=1),
            fc1=L.Linear(None, 1024),
            fc2=L.Linear(1024, 10)
        )
        self.monitored_layers = ['conv1', 'conv2', 'conv3', 'fc1', 'fc2']

    def __call__(self, x):
        # Collect and report the statistics from the previous call before
        # proceeding with this forward propagation.
        self.report()

        h = F.relu(self.conv1(x))
        h = F.relu(self.conv2(h))
        h = F.relu(self.conv3(h))
        h = F.relu(self.fc1(h))
        h = self.fc2(h)
        return h

    def report(self):
        # To aggregate statistics over all layers, skip the layer argument:
        # paramstats = monitor.weight_statistics(self)
        # chainer.report(paramstats)

        for layer in self.monitored_layers:
            stats = monitor.weight_statistics(self, layer)
            chainer.report(stats)

            stats = monitor.bias_statistics(self, layer)
            chainer.report(stats)

            stats = monitor.weight_gradient_statistics(self, layer)
            chainer.report(stats)

            stats = monitor.bias_gradient_statistics(self, layer)
            chainer.report(stats)

            stats = monitor.sparsity(self, layer_name=layer)
            chainer.report(stats)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Neural Network Monitoring for Chainer Models

This is a Chainer plugin for computing statistics over weights, biases and gradients during training.

You can collect this data from any [chainer.Chain](http://docs.chainer.org/en/stable/reference/core/link.html) once per iteration or epoch and save it to a log using e.g. [chainer.report()](http://docs.chainer.org/en/stable/reference/util/reporter.html), so that the statistical changes over the course of training can be plotted later on.

*Note: It is not yet optimized for speed. Computing percentiles, for instance, is slow.*

## Statistics

![Example statistics plot](samples/cnn.png)

*An example plot of weights, biases and gradients from different convolutional and fully connected layers.*

### Data

- Mean
- Standard deviation
- Min
- Max
- Percentiles (see the mapping below)
- Sparsity (the number of parameters that are exactly zero)
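
Percentiles are reported under the keys `percentile/0` through `percentile/6`. The defaults in `monitor.py` are the Gaussian sigma levels, so index 3 is the median:

```python
# Default percentiles computed by monitor.as_statistics:
#   index:        0      1      2      3     4      5      6
#   percentile:   0.13   2.28   15.87  50    84.13  97.72  99.87
#   sigma level: -3     -2     -1      0    +1     +2     +3
percentiles = (0.13, 2.28, 15.87, 50, 84.13, 97.72, 99.87)
```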

### Targets

- Weights
- Biases
- Gradients

For a **specific layer** or the aggregated data over the **entire model**.
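
For instance, using `weight_statistics` from `monitor.py` (the key layout follows its `key_template`; the values here are made up for illustration):

```python
import monitor

# Aggregate over the entire model; the layer field of each key becomes '*'.
stats = monitor.weight_statistics(model)
# e.g. {'predictor/*/W/data/mean': 0.002, 'predictor/*/W/data/std': 0.05, ...}
# (the first field is the chain's name, e.g. 'predictor' under L.Classifier)

# Or restrict the statistics to a single named layer.
stats = monitor.weight_statistics(model, layer_name='conv1')
# e.g. {'predictor/conv1/W/data/mean': 0.001, ...}
```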

### Dependencies

Chainer 1.18.0 (tested with NumPy 1.11.2)

## Example

### Usage

```python
# This is simplified code; see the 'example' directory for a working example.
import chainer
import monitor

# Prepare the model.
model = MLP()
optimizer.setup(model)

# Forward computation, back propagation and a parameter update.
# The gradients are still stored inside each parameter after those steps.
loss = model(x, t)
loss.backward()
optimizer.update()

# Use the plugin to collect data and report it so that Chainer includes it in the log.
weight_report = monitor.weight_statistics(model)
chainer.report(weight_report)  # Mean, std, min, max, percentiles

bias_report = monitor.bias_statistics(model)
chainer.report(bias_report)

fst_layer_grads = monitor.weight_gradient_statistics(model, layer_name='fc1')
chainer.report(fst_layer_grads)

zeros = monitor.sparsity(model, include_bias=False)
chainer.report(zeros)
```
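
Since everything goes through [chainer.report()](http://docs.chainer.org/en/stable/reference/util/reporter.html), the standard trainer extensions pick the values up automatically. A minimal sketch, assuming the monitored chain is wrapped in `L.Classifier` and therefore named `predictor`:

```python
from chainer.training import extensions

trainer.extend(extensions.LogReport())
trainer.extend(extensions.PrintReport([
    'epoch',
    'predictor/fc1/W/data/mean',  # mean of the fc1 weights
    'predictor/fc1/W/grad/std',   # std of the fc1 weight gradients
]))
```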

### Plotting the Statistics

Weights and biases when training a small convolutional neural network for classification for 100 epochs, aggregated over all layers (including the final fully connected layers). The different alpha levels show different percentile ranges.
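
Plots like the ones below can be generated with `plot_percentiles.py` in the repository root. A sketch of calling its helpers directly, assuming the training run wrote its log to `example/result/log` (the script's default):

```python
from plot_percentiles import load_log, plot_percentile_log

log = load_log('example/result/log')
plot_percentile_log('plot.png', log, ['conv1', 'conv2', 'conv3', 'fc1', 'fc2'])
```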

#### Weights

![Weight statistics](samples/weights.png)

#### Biases

![Bias statistics](samples/biases.png)
--------------------------------------------------------------------------------
/plot_percentiles.py:
--------------------------------------------------------------------------------
import argparse
import collections
import matplotlib.pyplot as plt
import json


ylabels = ['Weights', 'Biases', 'Weight Gradients', 'Bias Gradients']
log_key_templates = ['predictor/{layer}/W/data/{statistic}',
                     'predictor/{layer}/b/data/{statistic}',
                     'predictor/{layer}/W/grad/{statistic}',
                     'predictor/{layer}/b/grad/{statistic}']


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--log', type=str, default='example/result/log')
    parser.add_argument('--out', type=str, default='plot.png')
    parser.add_argument('--layers', nargs='+', type=str,
                        default=['conv1', 'conv2', 'conv3', 'fc1', 'fc2'])
    return parser.parse_args()


def load_log(filename, keys=None):

    """Parse a JSON file and return a dictionary with the given keys. Each
    key maps to a list of corresponding data measurements in the file."""

    log = collections.defaultdict(list)

    with open(filename) as f:
        for data in json.load(f):  # One entry per reported interval, e.g. epoch
            if keys is not None:
                for key in keys:
                    log[key].append(data[key])
            else:
                for key, value in data.items():
                    log[key].append(value)
    return log


def plot_percentile_log(filename, log, layer_names, color='green', dpi=100):

    n_rows = len(layer_names)
    n_cols = len(log_key_templates)

    figsize = (1024*n_cols/dpi, 1024*n_rows/dpi)
    fig, axes = plt.subplots(n_rows, n_cols, figsize=figsize, dpi=dpi)

    if n_rows == 1:
        axes = axes.reshape(1, -1)
    elif n_cols == 1:
        axes = axes.reshape(-1, 1)

    for row in range(n_rows):
        for col in range(n_cols):

            ax = axes[row, col]
            key_template = log_key_templates[col]

            # Min, Max
            pmin_key = key_template.format(layer=layer_names[row],
                                           statistic='min')
            pmax_key = key_template.format(layer=layer_names[row],
                                           statistic='max')
            pmin = log[pmin_key]
            pmax = log[pmax_key]
            ax.fill_between(range(len(pmin)), pmin, pmax, facecolor=color,
                            alpha=0.2, linewidth=0)

            # Median (percentile index 3 is the 50th percentile)
            z_key = key_template.format(layer=layer_names[row],
                                        statistic='percentile/3')
            z = log[z_key]
            ax.plot(range(len(z)), z, color=color, alpha=0.2)

            # Fill between each symmetric pair of percentiles,
            # i.e. (0, 6), (1, 5) and (2, 4)
            n_percentiles = 3
            for p in range(n_percentiles):
                s_key = key_template.format(layer=layer_names[row],
                                            statistic='percentile/{}'.format(p))
                ns_key = key_template.format(layer=layer_names[row],
                                             statistic='percentile/{}'.format(6-p))
                s = log[s_key]
                ns = log[ns_key]
                ax.fill_between(range(len(s)), s, ns, facecolor=color,
                                alpha=0.2, linewidth=0)

            ax.set_xlabel('Epochs')
            ax.set_ylabel(ylabels[col])
            ax.set_title(layer_names[row])

    plt.savefig(filename, bbox_inches='tight', dpi=dpi)
    plt.clf()
    plt.close()


if __name__ == '__main__':
    args = parse_args()
    log = load_log(args.log)
    plot_percentile_log(args.out, log, args.layers)
--------------------------------------------------------------------------------
/monitor.py:
--------------------------------------------------------------------------------
import numpy as np
from functools import reduce
import chainer
from chainer import cuda


# The name template of the statistic to collect and include in the report,
# e.g. 'predictor/conv1/W/grad/percentile/3'
key_template = '{model}/{layer}/{param}/{attr}/{statistic}'


def weight_statistics(model, layer_name=None):

    """Collect weight statistics from the given model and return them as a
    ``dict``.

    Args:
        model (~chainer.Chain): The model from which statistics are collected.
        layer_name (str): Name of the layer which may be specified or set to
            ``None`` to aggregate over all layers.

    Returns:
        dict: Parameter statistics.
    """

    return parameter_statistics(model, 'W', 'data', layer_name)


def bias_statistics(model, layer_name=None):

    """Collect bias statistics from the given model and return them as a
    ``dict``.

    Args:
        model (~chainer.Chain): The model from which statistics are collected.
        layer_name (str): Name of the layer which may be specified or set to
            ``None`` to aggregate over all layers.

    Returns:
        dict: Parameter statistics.
    """

    return parameter_statistics(model, 'b', 'data', layer_name)


def weight_gradient_statistics(model, layer_name=None):

    """Collect weight gradient statistics from the given model and return
    them as a ``dict``.

    Args:
        model (~chainer.Chain): The model from which statistics are collected.
        layer_name (str): Name of the layer which may be specified or set to
            ``None`` to aggregate over all layers.

    Returns:
        dict: Parameter statistics.
    """

    return parameter_statistics(model, 'W', 'grad', layer_name)


def bias_gradient_statistics(model, layer_name=None):

    """Collect bias gradient statistics from the given model and return them
    as a ``dict``.

    Args:
        model (~chainer.Chain): The model from which statistics are collected.
        layer_name (str): Name of the layer which may be specified or set to
            ``None`` to aggregate over all layers.

    Returns:
        dict: Parameter statistics.
    """

    return parameter_statistics(model, 'b', 'grad', layer_name)


def sparsity(model, include_bias=False, layer_name=None):

    """Count the number of parameters with the value zero for the given model
    and return it as a ``dict``.

    Args:
        model (~chainer.Chain): The model from which statistics are collected.
        include_bias (bool): ``True`` to include the number of biases that are
            zero, ``False`` to exclude them.
        layer_name (str): Name of the layer which may be specified or set to
            ``None`` to aggregate over all layers.

    Returns:
        dict: Parameter statistics.
    """

    xp = model.xp

    def reduce_count_zeros(acc, param):
        if param.name == 'W' or (include_bias and param.name == 'b'):
            acc += param.data.size - xp.count_nonzero(param.data)
        return acc

    if layer_name is not None:
        # Reduce over the parameters of the given layer, not over the layer
        # link itself (a link is not named 'W' or 'b').
        layer = getattr(model, layer_name)
        sparsity = reduce(reduce_count_zeros, layer.params(), 0)
    else:
        sparsity = reduce(reduce_count_zeros, model.params(), 0)

    key = key_template.format(model=model.name,
                              layer='*' if layer_name is None else layer_name,
                              param='Wb' if include_bias else 'W',
                              attr='sparsity',
                              statistic='zeros')

    return {key: sparsity}


def parameter_statistics(model, param_name, attr_name, layer_name=None):

    """Collect statistics from the given model and return them as a ``dict``.

    The returned ``dict`` contains a key for each metric, mapping to a NumPy
    or CuPy ``float32`` value depending on whether the given model is on the
    CPU or the GPU.

    Args:
        model (~chainer.Chain): The model from which statistics are collected.
        param_name (str): Name of the parameter, ``'W'`` or ``'b'``.
        attr_name (str): Name of the attribute, ``'data'`` or ``'grad'``.
        layer_name (str): Name of the layer which may be specified or set to
            ``None`` to aggregate over all layers.

    Returns:
        dict: Parameter statistics.
    """

    if layer_name is not None:  # Collect statistics for a single layer only
        l = getattr(model, layer_name)
        lp = layer_params(l, param_name, attr_name)
        return as_statistics(lp, model.name, param_name, attr_name,
                             layer_name=layer_name)

    lp = layers_params(model, param_name, attr_name)
    return as_statistics(lp, model.name, param_name, attr_name)


def layer_params(layer, param_name, attr_name):

    """Return parameters in a flattened array from the given layer or an
    empty array if the parameters are not found.

    Args:
        layer (~chainer.Link): The layer from which parameters are collected.
        param_name (str): Name of the parameter, ``'W'`` or ``'b'``.
        attr_name (str): Name of the attribute, ``'data'`` or ``'grad'``.

    Returns:
        array: Flattened array of parameters.
    """

    if isinstance(layer, chainer.Chain):
        # Nested chainer.Chain, aggregate all underlying statistics
        return layers_params(layer, param_name, attr_name)
    elif not hasattr(layer, param_name):
        return layer.xp.array([])

    params = getattr(layer, param_name)
    params = getattr(params, attr_name)
    return params.flatten()


def layers_params(model, param_name, attr_name):

    """Return all parameters in a flattened array from the given model.

    Args:
        model (~chainer.Chain): The model from which parameters are collected.
        param_name (str): Name of the parameter, ``'W'`` or ``'b'``.
        attr_name (str): Name of the attribute, ``'data'`` or ``'grad'``.

    Returns:
        array: Flattened array of parameters.
    """

    xp = model.xp
    params = xp.array([], dtype=xp.float32)

    for param in model.params():
        if param.name == param_name:
            values = getattr(param, attr_name)
            values = values.flatten()
            params = xp.concatenate((params, values))  # Slow?

    return params


def as_statistics(data, model_name, param_name, attr_name, *, layer_name=None,
                  statistics=('min', 'max', 'mean', 'std'),
                  percentiles=(0.13, 2.28, 15.87, 50, 84.13, 97.72, 99.87)):

    """Compute statistics based on the given data and return them as a
    ``dict``.

    Args:
        data (array): NumPy or CuPy array of data.
        model_name (str): Name of the model, e.g. ``predictor``.
        param_name (str): Name of the parameter, ``'W'`` or ``'b'``.
        attr_name (str): Name of the attribute, ``'data'`` or ``'grad'``.
        layer_name (str): Name of the layer which may be specified or set to
            ``None``. In the case of ``None`` the layer name will be set to
            ``'*'``.
        statistics (tuple): Names of the ``data`` methods to invoke, each
            yielding a scalar statistic.
        percentiles (tuple): Percentiles to compute. The defaults correspond
            to the Gaussian -3, -2, -1, 0, +1, +2 and +3 sigma levels.

    Returns:
        dict: Parameter statistics.
    """

    stats = {}

    if layer_name is None:
        layer_name = '*'

    if percentiles:
        ps = get_percentiles(data, sigma=percentiles)
        for i, percentile in enumerate(ps):
            key = key_template.format(model=model_name,
                                      layer=layer_name,
                                      param=param_name,
                                      attr=attr_name,
                                      statistic='percentile/{}'.format(i))
            stats[key] = percentile

    for s in statistics:
        key = key_template.format(model=model_name,
                                  layer=layer_name,
                                  param=param_name,
                                  attr=attr_name,
                                  statistic=s)
        try:
            stats[key] = getattr(data, s)()
        except ValueError:
            # If data is missing from uninitialized model parameters, add
            # NaN placeholders instead of skipping the measurements
            # completely or registering zeros.
            stats[key] = float('NaN')

    return stats


def get_percentiles(data, sigma):

    """Compute percentiles for data and return an array with the same length
    as the number of elements in ``sigma``.

    Args:
        data (array): 1-dimensional NumPy or CuPy array.
        sigma (tuple): Sigmas for which percentiles are computed.

    Returns:
        array: Array of percentiles.
    """

    def _get_percentiles(_data, _sigma):
        try:
            return np.percentile(_data, _sigma)
        except IndexError:  # Handle uninitialized model parameters
            return np.array((float('NaN'),) * len(_sigma))

    if cuda.available and isinstance(data, cuda.cupy.ndarray):
        # TODO(hvy): Make percentile computation faster for GPUs
        data = cuda.to_cpu(data)
        return cuda.to_gpu(_get_percentiles(data, sigma))

    return _get_percentiles(data, sigma)
--------------------------------------------------------------------------------