├── README.md
├── knowledge_distillation_layer.hpp
└── knowledge_distillation_layer.cpp
/README.md:
--------------------------------------------------------------------------------
1 | # KnowledgeDistillation Layer (Caffe implementation)
2 | This is a CPU implementation of knowledge distillation in Caffe.
3 | This code is heavily based on [softmax_loss_layer.hpp](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/softmax_loss_layer.hpp) and [softmax_loss_layer.cpp](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/softmax_loss_layer.cpp).
4 | Please refer to the [paper](https://arxiv.org/abs/1503.02531)
5 | ```
6 | Hinton, G., Vinyals, O., and Dean, J. Distilling the Knowledge in a Neural Network. 2015.
7 | ```
8 |
9 |
10 | ## Installation
11 | 1. Install [Caffe](https://github.com/BVLC/caffe/) in your directory `$CAFFE`
12 | 2. Clone this repository in your directory `$ROOT`
13 | ```bash
14 | cd $ROOT
15 | git clone https://github.com/wentianli/knowledge_distillation_caffe.git
16 | ```
17 | 3. Copy the two files into your Caffe folder
18 | ```bash
19 | cp $ROOT/knowledge_distillation_caffe/knowledge_distillation_layer.hpp $CAFFE/include/caffe/layers
20 | cp $ROOT/knowledge_distillation_caffe/knowledge_distillation_layer.cpp $CAFFE/src/caffe/layers
21 | ```
22 | 4. Modify `$CAFFE/src/caffe/proto/caffe.proto`:
add an `optional KnowledgeDistillationParameter` field to `LayerParameter`, using the next available layer-specific ID
23 | ```proto
24 | message LayerParameter {
25 | ...
26 |
27 | //next available layer-specific ID
28 | optional KnowledgeDistillationParameter knowledge_distillation_param = 147;
29 | }
30 | ```
31 |
add `message KnowledgeDistillationParameter`
32 | ```proto
33 | message KnowledgeDistillationParameter {
34 | optional float temperature = 1 [default = 1];
35 | }
36 | ```
37 | 5. Build Caffe
38 |
39 |
40 | ## Usage
41 | The KnowledgeDistillation layer has one layer-specific parameter, `temperature`.
The layer takes 2 or 3 input blobs:
42 | - `bottom[0]`: the logits of the student
43 | - `bottom[1]`: the logits of the teacher
44 | - `bottom[2]` (*optional*): the labels
45 | The logits are first divided by the temperature T and then mapped to probability distributions over classes with the softmax function. The layer computes the KL divergence between the two distributions (rather than the cross entropy), and the gradients are multiplied by T^2, as suggested in the [paper](https://arxiv.org/abs/1503.02531).
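For reference, here is a sketch of what the layer computes. With z the student logits, p = softmax(z/T) the student's softened probabilities, q the teacher's softened probabilities, and N the normalizer (e.g. the number of non-ignored positions):
```latex
E = \frac{1}{N}\sum_{n}\sum_{k=1}^{K} q_{nk}\,\log\frac{q_{nk}}{p_{nk}},
\qquad
T^{2}\,\frac{\partial E}{\partial z_{nk}} = \frac{T\,(p_{nk} - q_{nk})}{N}
```
This matches `Backward_cpu`, which sets the student's diff to T·(p − q) and then scales it by `loss_weight` divided by the normalizer.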
46 | 1. Common setting in `prototxt` (2 input blobs are given)
47 | ```
48 | layer {
49 | name: "KD"
50 | type: "KnowledgeDistillation"
51 | bottom: "student_logits"
52 | bottom: "taecher_logits"
53 | top: "KL_div"
54 | include { phase: TRAIN }
55 | knowledge_distillation_param { temperature: 4 } # usually larger than 1
56 | loss_weight: 1
57 | }
58 | ```
59 | 2. If an `ignore_label` is used, all 3 input blobs must be given
60 | ```
61 | layer {
62 | name: "KD"
63 | type: "KnowledgeDistillation"
64 | bottom: "student_logits"
65 | bottom: "taecher_logits"
66 | bottom: "label"
67 | top: "KL_div"
68 | include { phase: TRAIN }
69 | knowledge_distillation_param { temperature: 4 }
70 | loss_param { ignore_label: 2 }
71 | loss_weight: 1
72 | }
73 | ```
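3. The KD loss is typically combined with the usual hard-label loss on the student, as discussed in the [paper](https://arxiv.org/abs/1503.02531). A minimal sketch, assuming your network also provides a `label` blob; the relative `loss_weight` values are a tunable choice, not prescribed by the layer
```
layer {
  name: "KD"
  type: "KnowledgeDistillation"
  bottom: "student_logits"
  bottom: "teacher_logits"
  top: "KL_div"
  include { phase: TRAIN }
  knowledge_distillation_param { temperature: 4 }
  loss_weight: 0.9   # weight on the soft-target (KD) loss
}
layer {
  name: "softmax_loss"
  type: "SoftmaxWithLoss"
  bottom: "student_logits"
  bottom: "label"
  top: "softmax_loss"
  loss_weight: 0.1   # weight on the hard-label cross-entropy loss
}
```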
--------------------------------------------------------------------------------
/knowledge_distillation_layer.hpp:
--------------------------------------------------------------------------------
1 | #ifndef CAFFE_KNOWLEDGE_DISTILLATION_LAYER_HPP_
2 | #define CAFFE_KNOWLEDGE_DISTILLATION_LAYER_HPP_
3 |
4 | #include <vector>
5 |
6 | #include "caffe/blob.hpp"
7 | #include "caffe/layer.hpp"
8 | #include "caffe/proto/caffe.pb.h"
9 |
10 | #include "caffe/layers/loss_layer.hpp"
11 | #include "caffe/layers/softmax_layer.hpp"
12 |
13 | namespace caffe {
14 |
15 | /**
16 | * @brief Computes KL divergence of two probability distributions,
17 | * using the logits of the student and the teacher.
18 | *
19 | * @param bottom input Blob vector (length 2 or 3)
20 | * -# @f$ (N \times C \times H \times W) @f$
21 | * the student's predictions @f$ x @f$, a Blob with values in
22 | * @f$ [-\infty, +\infty] @f$ indicating the predicted score for each of
23 | * the @f$ K = CHW @f$ classes. This layer maps these scores to a
24 | * probability distribution over classes using the softmax function
25 | * @f$ \hat{p}_{nk} = \exp(x_{nk}/T) /
26 | * \left[\sum_{k'} \exp(x_{nk'}/T)\right] @f$ (see SoftmaxLayer)
27 | * with T indicating the temperature in knowledge distillation.
28 | * -# @f$ (N \times C \times H \times W) @f$
29 | * the teacher's predictions @f$ x @f$, similar to that of the student.
30 | * -# @f$ (N \times 1 \times 1 \times 1) @f$ (optional)
31 | * the labels @f$ l @f$, an integer-valued Blob with values
32 | * @f$ l_n \in [0, 1, 2, ..., K - 1] @f$
33 | * indicating the correct class label among the @f$ K @f$ classes
34 | * @param top output Blob vector (length 1)
35 | * -# @f$ (1 \times 1 \times 1 \times 1) @f$
36 | * the computed KL divergence: @f$ E =
37 | * \frac{-1}{N} \sum\limits_{n=1}^N \sum\limits_{l=0}^{K-1} \hat{q}_{n,l} \log(\frac{\hat{p}_{n,l}}{\hat{q}_{n,l}})
38 | * @f$, for softmax output class probabilities @f$ \hat{p} @f$ by the student
39 | * and @f$ \hat{q} @f$ by the teacher
40 | *
41 | * [1] Hinton, G., Vinyals, O., and Dean, J. Distilling the Knowledge in a Neural Network. 2015.
42 | *
43 | * TODO: GPU implementation
44 | */
45 | template <typename Dtype>
46 | class KnowledgeDistillationLayer : public LossLayer<Dtype> {
47 | public:
48 | /**
49 | * @param param provides LossParameter loss_param, with options:
50 | * - temperature (optional, default 1)
51 | * Both logits are divided by the temperature T.
52 | * The gradients are multiplied by T^2.
53 | * - ignore_label (optional)
54 | * Specify a label value that should be ignored when computing the loss.
55 | * Only valid when label inputs are given as bottom[2].
56 | * - normalize (optional, default true)
57 | * If true, the loss is normalized by the number of (nonignored) labels
58 | * present; otherwise the loss is simply summed over spatial locations.
59 | */
60 | explicit KnowledgeDistillationLayer(const LayerParameter& param)
61 | : LossLayer<Dtype>(param) {}
62 | virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
63 | const vector<Blob<Dtype>*>& top);
64 | virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
65 | const vector<Blob<Dtype>*>& top);
66 |
67 | virtual inline const char* type() const { return "KnowledgeDistillation"; }
68 | virtual inline int ExactNumBottomBlobs() const { return -1; }
69 | virtual inline int MinBottomBlobs() const { return 2; }
70 | virtual inline int MaxBottomBlobs() const { return 3; }
71 | virtual inline int ExactNumTopBlobs() const { return 1; }
72 |
73 | protected:
74 | virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
75 | const vector<Blob<Dtype>*>& top);
76 | //virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
77 | //const vector<Blob<Dtype>*>& top);
78 | /**
79 | * @brief Computes the softmax loss error gradient w.r.t. the predictions.
80 | *
81 | * Gradients are not computed with respect to the teacher's inputs (bottom[1])
82 | * nor label inputs (bottom[2], optional), crashing
83 | * if propagate_down[1] or propagate_down[2] is set.
84 | */
85 | virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
86 | const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
87 | //virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
88 | //const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
89 |
90 | /// Read the normalization mode parameter and compute the normalizer based
91 | /// on the blob size. If normalization_mode is VALID, the count of valid
92 | /// outputs will be read from valid_count, unless it is -1 in which case
93 | /// all outputs are assumed to be valid.
94 | virtual Dtype get_normalizer(
95 | LossParameter_NormalizationMode normalization_mode, int valid_count);
96 |
97 | /// The internal SoftmaxLayer(s) used to map predictions to a distribution. s for student, t for teacher.
98 | shared_ptr<Layer<Dtype> > s_softmax_layer_;
99 | shared_ptr<Layer<Dtype> > t_softmax_layer_;
100 | /// s_logit_/t_logit_ store the input logits divided by the temperature T.
101 | Blob<Dtype> s_logit_;
102 | Blob<Dtype> t_logit_;
103 | /// s_prob_/t_prob_ store the output probability predictions from the SoftmaxLayer.
104 | Blob<Dtype> s_prob_;
105 | Blob<Dtype> t_prob_;
106 | /// bottom vector holder used in call to the underlying SoftmaxLayer::Forward
107 | vector<Blob<Dtype>*> s_softmax_bottom_vec_;
108 | vector<Blob<Dtype>*> t_softmax_bottom_vec_;
109 | /// top vector holder used in call to the underlying SoftmaxLayer::Forward
110 | vector<Blob<Dtype>*> s_softmax_top_vec_;
111 | vector<Blob<Dtype>*> t_softmax_top_vec_;
112 | /// Whether to ignore instances with a certain label.
113 | bool has_ignore_label_;
114 | /// The label indicating that an instance should be ignored.
115 | int ignore_label_;
116 | /// How to normalize the output loss.
117 | LossParameter_NormalizationMode normalization_;
118 |
119 | int softmax_axis_, outer_num_, inner_num_;
120 | /// temperature
121 | Dtype T;
122 | };
123 |
124 | } // namespace caffe
125 |
126 | #endif // CAFFE_KNOWLEDGE_DISTILLATION_LAYER_HPP_
127 |
--------------------------------------------------------------------------------
/knowledge_distillation_layer.cpp:
--------------------------------------------------------------------------------
1 | #include <algorithm>
2 | #include <cfloat>
3 | #include <cmath>
4 | #include <vector>
5 |
6 | #include "caffe/layers/knowledge_distillation_layer.hpp"
7 | #include "caffe/util/math_functions.hpp"
8 |
9 | namespace caffe {
10 |
11 | template <typename Dtype>
12 | void KnowledgeDistillationLayer<Dtype>::LayerSetUp(
13 | const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
14 | LossLayer<Dtype>::LayerSetUp(bottom, top);
15 | LayerParameter softmax_param;
16 | softmax_param.set_type("Softmax");
17 | softmax_param.mutable_softmax_param()->set_axis(this->layer_param_.softmax_param().axis());
18 | s_logit_.Reshape(bottom[0]->shape());
19 | s_softmax_layer_ = LayerRegistry<Dtype>::CreateLayer(softmax_param);
20 | s_softmax_bottom_vec_.clear();
21 | s_softmax_bottom_vec_.push_back(&s_logit_);
22 | s_softmax_top_vec_.clear();
23 | s_softmax_top_vec_.push_back(&s_prob_);
24 | s_softmax_layer_->SetUp(s_softmax_bottom_vec_, s_softmax_top_vec_);
25 | t_logit_.Reshape(bottom[1]->shape());
26 | t_softmax_layer_ = LayerRegistry<Dtype>::CreateLayer(softmax_param);
27 | t_softmax_bottom_vec_.clear();
28 | t_softmax_bottom_vec_.push_back(&t_logit_);
29 | t_softmax_top_vec_.clear();
30 | t_softmax_top_vec_.push_back(&t_prob_);
31 | t_softmax_layer_->SetUp(t_softmax_bottom_vec_, t_softmax_top_vec_);
32 |
33 | has_ignore_label_ =
34 | this->layer_param_.loss_param().has_ignore_label();
35 | if (has_ignore_label_) {
36 | ignore_label_ = this->layer_param_.loss_param().ignore_label();
37 | }
38 | if (!this->layer_param_.loss_param().has_normalization() &&
39 | this->layer_param_.loss_param().has_normalize()) {
40 | normalization_ = this->layer_param_.loss_param().normalize() ?
41 | LossParameter_NormalizationMode_VALID :
42 | LossParameter_NormalizationMode_BATCH_SIZE;
43 | } else {
44 | normalization_ = this->layer_param_.loss_param().normalization();
45 | }
46 |
47 | T = this->layer_param_.knowledge_distillation_param().temperature();
48 | }
49 |
50 | template <typename Dtype>
51 | void KnowledgeDistillationLayer<Dtype>::Reshape(
52 | const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
53 | LossLayer<Dtype>::Reshape(bottom, top);
54 | s_logit_.Reshape(bottom[0]->shape());
55 | t_logit_.Reshape(bottom[1]->shape());
56 | s_softmax_layer_->Reshape(s_softmax_bottom_vec_, s_softmax_top_vec_);
57 | t_softmax_layer_->Reshape(t_softmax_bottom_vec_, t_softmax_top_vec_);
58 | softmax_axis_ =
59 | bottom[0]->CanonicalAxisIndex(this->layer_param_.softmax_param().axis());
60 | outer_num_ = bottom[0]->count(0, softmax_axis_);
61 | inner_num_ = bottom[0]->count(softmax_axis_ + 1);
62 |
63 | CHECK_EQ(outer_num_, bottom[1]->count(0, softmax_axis_))
64 | << "Outer number of soft labels must match outer number of predictions.";
65 | CHECK_EQ(inner_num_, bottom[1]->count(softmax_axis_ + 1))
66 | << "Inner number of soft labels must match inner number of predictions.";
67 | CHECK_EQ(bottom.size() == 3, has_ignore_label_)
68 | << "ignore_label is only valid when label inputs are given as bottom[2].";
69 | if (bottom.size() == 3 && has_ignore_label_) {
70 | CHECK_EQ(outer_num_ * inner_num_, bottom[2]->count())
71 | << "Number of labels must match number of predictions; "
72 | << "e.g., if softmax axis == 1 and prediction shape is (N, C, H, W), "
73 | << "label count (number of labels) must be N*H*W, "
74 | << "with integer values in {0, 1, ..., C-1}.";
75 | }
76 | }
77 |
78 | template <typename Dtype>
79 | Dtype KnowledgeDistillationLayer<Dtype>::get_normalizer(
80 | LossParameter_NormalizationMode normalization_mode, int valid_count) {
81 | Dtype normalizer;
82 | switch (normalization_mode) {
83 | case LossParameter_NormalizationMode_FULL:
84 | normalizer = Dtype(outer_num_ * inner_num_);
85 | break;
86 | case LossParameter_NormalizationMode_VALID:
87 | if (valid_count == -1) {
88 | normalizer = Dtype(outer_num_ * inner_num_);
89 | } else {
90 | normalizer = Dtype(valid_count);
91 | }
92 | break;
93 | case LossParameter_NormalizationMode_BATCH_SIZE:
94 | normalizer = Dtype(outer_num_);
95 | break;
96 | case LossParameter_NormalizationMode_NONE:
97 | normalizer = Dtype(1);
98 | break;
99 | default:
100 | LOG(FATAL) << "Unknown normalization mode: "
101 | << LossParameter_NormalizationMode_Name(normalization_mode);
102 | }
103 | // Some users will have no labels for some examples in order to 'turn off' a
104 | // particular loss in a multi-task setup. The max prevents NaNs in that case.
105 | return std::max(Dtype(1.0), normalizer);
106 | }
107 |
108 | template <typename Dtype>
109 | void KnowledgeDistillationLayer<Dtype>::Forward_cpu(
110 | const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
111 | // Both logits are divided by the temperature T.
112 | caffe_copy(bottom[0]->count(), bottom[0]->cpu_data(), s_logit_.mutable_cpu_data());
113 | caffe_scal(bottom[0]->count(), Dtype(1)/T, s_logit_.mutable_cpu_data());
114 | caffe_copy(bottom[1]->count(), bottom[1]->cpu_data(), t_logit_.mutable_cpu_data());
115 | caffe_scal(bottom[1]->count(), Dtype(1)/T, t_logit_.mutable_cpu_data());
116 | // The forward pass computes the softmax prob values.
117 | s_softmax_layer_->Forward(s_softmax_bottom_vec_, s_softmax_top_vec_);
118 | t_softmax_layer_->Forward(t_softmax_bottom_vec_, t_softmax_top_vec_);
119 | const Dtype* prob_data = s_prob_.cpu_data();
120 | const Dtype* soft_label = t_prob_.cpu_data();
121 | int dim = s_prob_.count() / outer_num_;
122 | int count = 0;
123 | Dtype loss = 0;
124 | int pos;
125 | // Compute KL divergence.
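// Per spatial position, KL(q||p) = sum_k q_k * (log q_k - log p_k),
// with q = t_prob_ (teacher) and p = s_prob_ (student); the "loss -=" lines
// below accumulate exactly this quantity.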
126 | if (bottom.size() == 3 && has_ignore_label_) { // label inputs and ignore_label are given.
127 | const Dtype* label = bottom[2]->cpu_data();
128 | for (int i = 0; i < outer_num_; ++i) {
129 | for (int j = 0; j < inner_num_; ++j) {
130 | const int label_value = static_cast<int>(label[i * inner_num_ + j]);
131 | if (label_value == ignore_label_) {
132 | continue;
133 | }
134 |
135 | for (int c = 0; c < bottom[0]->shape(softmax_axis_); ++c) {
136 | pos = i * dim + c * inner_num_ + j;
137 | loss -= soft_label[pos] * (log(std::max(prob_data[pos], Dtype(FLT_MIN)))-log(std::max(soft_label[pos], Dtype(FLT_MIN))));
138 | }
139 | ++count;
140 | }
141 | }
142 | } else { // label inputs or ignore_label are not given.
143 | count = outer_num_ * inner_num_;
144 | for (int i = 0; i < outer_num_; ++i) {
145 | for (int j = 0; j < inner_num_; ++j) {
146 | for (int c = 0; c < bottom[0]->shape(softmax_axis_); ++c) {
147 | pos = i * dim + c * inner_num_ + j;
148 | loss -= soft_label[pos] * (log(std::max(prob_data[pos], Dtype(FLT_MIN)))-log(std::max(soft_label[pos], Dtype(FLT_MIN))));
149 | }
150 | }
151 | }
152 | }
153 |
154 | top[0]->mutable_cpu_data()[0] = loss / get_normalizer(normalization_, count);
155 | }
156 |
157 | template <typename Dtype>
158 | void KnowledgeDistillationLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
159 | const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
160 | if (propagate_down[1] || (bottom.size() == 3 && propagate_down[2])) {
161 | LOG(FATAL) << this->type()
162 | << " Layer cannot backpropagate to soft label nor label inputs.";
163 | }
164 | if (propagate_down[0]) {
165 | Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
166 | const Dtype* prob_data = s_prob_.cpu_data();
167 | caffe_copy(s_prob_.count(), prob_data, bottom_diff);
168 | const Dtype* soft_label = t_prob_.cpu_data();
169 | int dim = s_prob_.count() / outer_num_;
170 | int count = outer_num_ * inner_num_;
171 | // The gradients here are multiplied by T,
172 | // which is T^2 (as suggested in the paper) * 1/T (logits divided by T).
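// i.e., bottom_diff = T * (s_prob_ - t_prob_) before the loss_weight scaling below.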
173 | caffe_cpu_axpby(outer_num_*dim, -T, soft_label, T, bottom_diff);
174 | // If label inputs are given, set the gradients to 0 w.r.t. ignore_label.
175 | if (bottom.size() == 3 && has_ignore_label_) {
176 | count = 0;
177 | const Dtype* label = bottom[2]->cpu_data();
178 | for (int i = 0; i < outer_num_; ++i) {
179 | for (int j = 0; j < inner_num_; ++j) {
180 | const int label_value = static_cast<int>(label[i * inner_num_ + j]);
181 | if (label_value == ignore_label_) {
182 | for (int c = 0; c < bottom[0]->shape(softmax_axis_); ++c) {
183 | bottom_diff[i * dim + c * inner_num_ + j] = 0;
184 | }
185 | } else {
186 | ++count;
187 | }
188 | }
189 | }
190 | }
191 |
192 | // Scale gradient
193 | Dtype loss_weight = top[0]->cpu_diff()[0] /
194 | get_normalizer(normalization_, count);
195 | caffe_scal(s_prob_.count(), loss_weight, bottom_diff);
196 | }
197 | }
198 |
199 |
200 | //#ifdef CPU_ONLY
201 | //STUB_GPU(KnowledgeDistillationLayer);
202 | //#endif
203 |
204 | INSTANTIATE_CLASS(KnowledgeDistillationLayer);
205 | REGISTER_LAYER_CLASS(KnowledgeDistillation);
206 |
207 | } // namespace caffe
208 |
--------------------------------------------------------------------------------