├── .gitattributes ├── center_loss.png ├── multibox_center_loss_layer.cpp ├── center_loss_layer.hpp ├── center_loss_layer.cu ├── multibox_focal_loss_layer.hpp ├── multibox_center_loss_layer.hpp ├── center_loss_layer.cpp ├── README.md ├── multibox_focal_loss_layer.cpp ├── center_loss_test.prototxt ├── center_loss_train.prototxt └── caffe.proto /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /center_loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BOBrown/SSD-Centerloss/HEAD/center_loss.png -------------------------------------------------------------------------------- /multibox_center_loss_layer.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BOBrown/SSD-Centerloss/HEAD/multibox_center_loss_layer.cpp -------------------------------------------------------------------------------- /center_loss_layer.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CAFFE_CENTER_LOSS_LAYER_HPP_ 2 | #define CAFFE_CENTER_LOSS_LAYER_HPP_ 3 | 4 | #include 5 | 6 | #include "caffe/blob.hpp" 7 | #include "caffe/layer.hpp" 8 | #include "caffe/proto/caffe.pb.h" 9 | 10 | #include "caffe/layers/loss_layer.hpp" 11 | 12 | namespace caffe { 13 | 14 | template 15 | class CenterLossLayer : public LossLayer { 16 | public: 17 | explicit CenterLossLayer(const LayerParameter& param) 18 | : LossLayer(param) {} 19 | virtual void LayerSetUp(const vector*>& bottom, 20 | const vector*>& top); 21 | virtual void Reshape(const vector*>& bottom, 22 | const vector*>& top); 23 | 24 | virtual inline const char* type() const { return "CenterLoss"; } 25 | virtual inline int ExactNumBottomBlobs() const { return 2; } 26 | 
virtual inline int ExactNumTopBlobs() const { return -1; } 27 | 28 | protected: 29 | virtual void Forward_cpu(const vector*>& bottom, 30 | const vector*>& top); 31 | virtual void Forward_gpu(const vector*>& bottom, 32 | const vector*>& top); 33 | virtual void Backward_cpu(const vector*>& top, 34 | const vector& propagate_down, const vector*>& bottom); 35 | virtual void Backward_gpu(const vector*>& top, 36 | const vector& propagate_down, const vector*>& bottom); 37 | 38 | int M_; 39 | int K_; 40 | int N_; 41 | 42 | Blob distance_; 43 | Blob variation_sum_; 44 | }; 45 | 46 | } // namespace caffe 47 | 48 | #endif // CAFFE_CENTER_LOSS_LAYER_HPP_ -------------------------------------------------------------------------------- /center_loss_layer.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "caffe/filler.hpp" 4 | #include "caffe/layers/center_loss_layer.hpp" 5 | #include "caffe/util/math_functions.hpp" 6 | 7 | namespace caffe { 8 | 9 | template 10 | __global__ void Compute_distance_data_gpu(int nthreads, const int K, const Dtype* bottom, 11 | const Dtype* label, const Dtype* center, Dtype* distance) { 12 | CUDA_KERNEL_LOOP(index, nthreads) { 13 | int m = index / K; 14 | int k = index % K; 15 | const int label_value = static_cast(label[m]); 16 | // distance(i) = x(i) - c_{y(i)} 17 | distance[index] = bottom[index] - center[label_value * K + k]; 18 | } 19 | } 20 | 21 | template 22 | __global__ void Compute_center_diff_gpu(int nthreads, const int M, const int K, 23 | const Dtype* label, const Dtype* distance, Dtype* variation_sum, 24 | Dtype* center_diff) { 25 | CUDA_KERNEL_LOOP(index, nthreads) { 26 | int count = 0; 27 | for (int m = 0; m < M; m++) { 28 | const int label_value = static_cast(label[m]); 29 | if (label_value == index) { 30 | count++; 31 | for (int k = 0; k < K; k++) { 32 | variation_sum[index * K + k] -= distance[m * K + k]; 33 | } 34 | } 35 | } 36 | for (int k = 0; k < K; k++) { 37 | 
center_diff[index * K + k] = variation_sum[index * K + k] /(count + (Dtype)1.); 38 | } 39 | } 40 | } 41 | 42 | 43 | template 44 | void CenterLossLayer::Forward_gpu(const vector*>& bottom, 45 | const vector*>& top) { 46 | int nthreads = M_ * K_; 47 | Compute_distance_data_gpu<<>>(nthreads, K_, bottom[0]->gpu_data(), bottom[1]->gpu_data(), 49 | this->blobs_[0]->gpu_data(), distance_.mutable_gpu_data()); 50 | Dtype dot; 51 | caffe_gpu_dot(M_ * K_, distance_.gpu_data(), distance_.gpu_data(), &dot); 52 | Dtype loss = dot / M_ / Dtype(2); 53 | top[0]->mutable_cpu_data()[0] = loss; 54 | } 55 | 56 | template 57 | void CenterLossLayer::Backward_gpu(const vector*>& top, 58 | const vector& propagate_down, 59 | const vector*>& bottom) { 60 | int nthreads = N_; 61 | caffe_gpu_set(N_ * K_, (Dtype)0., variation_sum_.mutable_gpu_data()); // must be the device pointer: the kernel below accumulates into this buffer on the GPU 62 | Compute_center_diff_gpu<<>>(nthreads, M_, K_, bottom[1]->gpu_data(), distance_.gpu_data(), 64 | variation_sum_.mutable_gpu_data(), this->blobs_[0]->mutable_gpu_diff()); 65 | 66 | if (propagate_down[0]) { 67 | caffe_gpu_scale(M_ * K_, top[0]->cpu_diff()[0] / M_, 68 | distance_.gpu_data(), bottom[0]->mutable_gpu_diff()); 69 | } 70 | if (propagate_down[1]) { 71 | LOG(FATAL) << this->type() 72 | << " Layer cannot backpropagate to label inputs."; 73 | } 74 | } 75 | 76 | INSTANTIATE_LAYER_GPU_FUNCS(CenterLossLayer); 77 | 78 | } // namespace caffe 79 | -------------------------------------------------------------------------------- /multibox_focal_loss_layer.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CAFFE_MULTIBOX_LOSS_LAYER_HPP_ 2 | #define CAFFE_MULTIBOX_LOSS_LAYER_HPP_ 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include "caffe/blob.hpp" 9 | #include "caffe/layer.hpp" 10 | #include "caffe/proto/caffe.pb.h" 11 | #include "caffe/util/bbox_util.hpp" 12 | 13 | #include "caffe/layers/loss_layer.hpp" 14 | 15 | namespace caffe { 16 | 17 | /** 18 | * @brief Perform MultiBox operations. 
Including the following: 19 | * 20 | * - decode the predictions. 21 | * - perform matching between priors/predictions and ground truth. 22 | * - use matched boxes and confidences to compute loss. 23 | * 24 | */ 25 | template 26 | class MultiBoxFocalLossLayer : public LossLayer { 27 | public: 28 | explicit MultiBoxFocalLossLayer(const LayerParameter& param) 29 | : LossLayer(param) {} 30 | virtual void LayerSetUp(const vector*>& bottom, 31 | const vector*>& top); 32 | virtual void Reshape(const vector*>& bottom, 33 | const vector*>& top); 34 | 35 | virtual inline const char* type() const { return "MultiBoxFocalLoss"; } 36 | // bottom[0] stores the location predictions. 37 | // bottom[1] stores the confidence predictions. 38 | // bottom[2] stores the prior bounding boxes. 39 | // bottom[3] stores the ground truth bounding boxes. 40 | virtual inline int ExactNumBottomBlobs() const { return 4; } 41 | virtual inline int ExactNumTopBlobs() const { return 1; } 42 | 43 | protected: 44 | virtual void Forward_cpu(const vector*>& bottom, 45 | const vector*>& top); 46 | virtual void Backward_cpu(const vector*>& top, 47 | const vector& propagate_down, const vector*>& bottom); 48 | 49 | // The internal localization loss layer. 50 | shared_ptr > loc_loss_layer_; 51 | LocLossType loc_loss_type_; 52 | float loc_weight_; 53 | // bottom vector holder used in Forward function. 54 | vector*> loc_bottom_vec_; 55 | // top vector holder used in Forward function. 56 | vector*> loc_top_vec_; 57 | // blob which stores the matched location prediction. 58 | Blob loc_pred_; 59 | // blob which stores the corresponding matched ground truth. 60 | Blob loc_gt_; 61 | // localization loss. 62 | Blob loc_loss_; 63 | 64 | // The internal confidence loss layer. 65 | shared_ptr > conf_loss_layer_; 66 | ConfLossType conf_loss_type_; 67 | // bottom vector holder used in Forward function. 68 | vector*> conf_bottom_vec_; 69 | // top vector holder used in Forward function. 
70 | vector*> conf_top_vec_; 71 | // blob which stores the confidence prediction. 72 | Blob conf_pred_; 73 | // blob which stores the corresponding ground truth label. 74 | Blob conf_gt_; 75 | // confidence loss. 76 | Blob conf_loss_; 77 | 78 | MultiBoxLossParameter multibox_loss_param_; 79 | int num_classes_; 80 | bool share_location_; 81 | MatchType match_type_; 82 | float overlap_threshold_; 83 | bool use_prior_for_matching_; 84 | int background_label_id_; 85 | bool use_difficult_gt_; 86 | bool do_neg_mining_; 87 | float neg_pos_ratio_; 88 | float neg_overlap_; 89 | CodeType code_type_; 90 | bool encode_variance_in_target_; 91 | bool map_object_to_agnostic_; 92 | bool ignore_cross_boundary_bbox_; 93 | bool bp_inside_; 94 | MiningType mining_type_; 95 | 96 | int loc_classes_; 97 | int num_gt_; 98 | int num_; 99 | int num_priors_; 100 | 101 | int num_matches_; 102 | int num_conf_; 103 | vector > > all_match_indices_; 104 | vector > all_neg_indices_; 105 | float alpha_; 106 | float gamma_; 107 | 108 | // How to normalize the loss. 109 | LossParameter_NormalizationMode normalization_; 110 | }; 111 | 112 | } // namespace caffe 113 | 114 | #endif // CAFFE_MULTIBOX_LOSS_LAYER_HPP_ 115 | -------------------------------------------------------------------------------- /multibox_center_loss_layer.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CAFFE_MULTIBOX_LOSS_LAYER_HPP_ 2 | #define CAFFE_MULTIBOX_LOSS_LAYER_HPP_ 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include "caffe/blob.hpp" 9 | #include "caffe/layer.hpp" 10 | #include "caffe/proto/caffe.pb.h" 11 | #include "caffe/util/bbox_util.hpp" 12 | 13 | #include "caffe/layers/loss_layer.hpp" 14 | 15 | namespace caffe { 16 | 17 | /** 18 | * @brief Perform MultiBox operations. Including the following: 19 | * 20 | * - decode the predictions. 21 | * - perform matching between priors/predictions and ground truth. 
22 | * - use matched boxes and confidences to compute loss. 23 | * 24 | */ 25 | template 26 | class MultiBoxCenterLossLayer : public LossLayer { 27 | public: 28 | explicit MultiBoxCenterLossLayer(const LayerParameter& param) 29 | : LossLayer(param) {} 30 | virtual void LayerSetUp(const vector*>& bottom, 31 | const vector*>& top); 32 | virtual void Reshape(const vector*>& bottom, 33 | const vector*>& top); 34 | 35 | virtual inline const char* type() const { return "MultiBoxCenterLoss"; } 36 | // bottom[0] stores the location predictions. 37 | // bottom[1] stores the confidence predictions. 38 | // bottom[2] stores the prior bounding boxes. 39 | // bottom[3] stores the ground truth bounding boxes. 40 | virtual inline int ExactNumBottomBlobs() const { return 5; } 41 | virtual inline int ExactNumTopBlobs() const { return 1; } 42 | 43 | protected: 44 | virtual void Forward_cpu(const vector*>& bottom, 45 | const vector*>& top); 46 | virtual void Backward_cpu(const vector*>& top, 47 | const vector& propagate_down, const vector*>& bottom); 48 | 49 | // The internal localization loss layer. 50 | shared_ptr > loc_loss_layer_; 51 | LocLossType loc_loss_type_; 52 | float loc_weight_; 53 | float center_loss_weight_; //weights for penalizing the center_loss 54 | // bottom vector holder used in Forward function. 55 | vector*> loc_bottom_vec_; 56 | // top vector holder used in Forward function. 57 | vector*> loc_top_vec_; 58 | // blob which stores the matched location prediction. 59 | Blob loc_pred_; 60 | // blob which stores the corresponding matched ground truth. 61 | Blob loc_gt_; 62 | // localization loss. 63 | Blob loc_loss_; 64 | 65 | // The internal confidence loss layer. 66 | shared_ptr > conf_loss_layer_; 67 | // The center confidence loss layer. 68 | shared_ptr > conf_center_loss_layer_; 69 | 70 | ConfLossType conf_loss_type_; 71 | // bottom vector holder used in Forward function. 72 | vector*> conf_bottom_vec_; 73 | // top vector holder used in Forward function. 
74 | vector*> conf_top_vec_; 75 | // blob which stores the confidence prediction. 76 | // bottom vector holder used in Forward function. 77 | vector*> conf_center_bottom_vec_; 78 | // top vector holder used in Forward function. 79 | vector*> conf_center_top_vec_; 80 | // blob which stores the confidence prediction. 81 | Blob conf_pred_; 82 | // blob which stores the confidence prediction used by center loss. 83 | Blob conf_center_pred_; 84 | // blob which stores the corresponding ground truth label. 85 | Blob conf_gt_; 86 | // confidence loss. 87 | Blob conf_loss_; 88 | // confidence center loss. 89 | Blob conf_center_loss_; 90 | 91 | MultiBoxLossParameter multibox_loss_param_; 92 | int num_classes_; 93 | int center_features_; 94 | bool share_location_; 95 | MatchType match_type_; 96 | float overlap_threshold_; 97 | bool use_prior_for_matching_; 98 | int background_label_id_; 99 | bool use_difficult_gt_; 100 | bool do_neg_mining_; 101 | float neg_pos_ratio_; 102 | float neg_overlap_; 103 | CodeType code_type_; 104 | bool encode_variance_in_target_; 105 | bool map_object_to_agnostic_; 106 | bool ignore_cross_boundary_bbox_; 107 | bool bp_inside_; 108 | MiningType mining_type_; 109 | 110 | int loc_classes_; 111 | int num_gt_; 112 | int num_; 113 | int num_priors_; 114 | 115 | int num_matches_; 116 | int num_conf_; 117 | vector > > all_match_indices_; 118 | vector > all_neg_indices_; 119 | 120 | // How to normalize the loss. 
121 | LossParameter_NormalizationMode normalization_; 122 | }; 123 | 124 | } // namespace caffe 125 | 126 | #endif // CAFFE_MULTIBOX_LOSS_LAYER_HPP_ 127 | -------------------------------------------------------------------------------- /center_loss_layer.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "caffe/filler.hpp" 4 | #include "caffe/layers/center_loss_layer.hpp" 5 | #include "caffe/util/math_functions.hpp" 6 | 7 | namespace caffe { 8 | 9 | template 10 | void CenterLossLayer::LayerSetUp(const vector*>& bottom, 11 | const vector*>& top) { 12 | const int num_output = this->layer_param_.center_loss_param().num_output(); 13 | N_ = num_output; 14 | const int axis = bottom[0]->CanonicalAxisIndex( 15 | this->layer_param_.center_loss_param().axis()); 16 | // Dimensions starting from "axis" are "flattened" into a single 17 | // length K_ vector. For example, if bottom[0]'s shape is (N, C, H, W), 18 | // and axis == 1, N inner products with dimension CHW are performed. 
19 | K_ = bottom[0]->count(axis); 20 | // Check if we need to set up the weights 21 | if (this->blobs_.size() > 0) { 22 | LOG(INFO) << "Skipping parameter initialization"; 23 | } else { 24 | this->blobs_.resize(1); 25 | // Intialize the weight 26 | vector center_shape(2); 27 | center_shape[0] = N_; 28 | center_shape[1] = K_; 29 | this->blobs_[0].reset(new Blob(center_shape)); 30 | // fill the weights 31 | shared_ptr > center_filler(GetFiller( 32 | this->layer_param_.center_loss_param().center_filler())); 33 | center_filler->Fill(this->blobs_[0].get()); 34 | 35 | } // parameter initialization 36 | this->param_propagate_down_.resize(this->blobs_.size(), true); 37 | } 38 | 39 | template 40 | void CenterLossLayer::Reshape(const vector*>& bottom, 41 | const vector*>& top) { 42 | CHECK_EQ(bottom[1]->channels(), 1); 43 | CHECK_EQ(bottom[1]->height(), 1); 44 | CHECK_EQ(bottom[1]->width(), 1); 45 | M_ = bottom[0]->num(); 46 | // The top shape will be the bottom shape with the flattened axes dropped, 47 | // and replaced by a single axis with dimension num_output (N_). 
48 | LossLayer::Reshape(bottom, top); 49 | distance_.ReshapeLike(*bottom[0]); 50 | variation_sum_.ReshapeLike(*this->blobs_[0]); 51 | } 52 | 53 | template 54 | void CenterLossLayer::Forward_cpu(const vector*>& bottom, 55 | const vector*>& top) { 56 | const Dtype* bottom_data = bottom[0]->cpu_data(); 57 | const Dtype* label = bottom[1]->cpu_data(); 58 | const Dtype* center = this->blobs_[0]->cpu_data(); 59 | Dtype* distance_data = distance_.mutable_cpu_data(); 60 | 61 | // the i-th distance_data 62 | for (int i = 0; i < M_; i++) { 63 | const int label_value = static_cast(label[i]); 64 | // D(i,:) = X(i,:) - C(y(i),:) 65 | caffe_sub(K_, bottom_data + i * K_, center + label_value * K_, distance_data + i * K_); // sub bottom_data + i * K_ with center + label_value * K_ 66 | } 67 | Dtype dot = caffe_cpu_dot(M_ * K_, distance_.cpu_data(), distance_.cpu_data()); 68 | Dtype loss = dot / M_ / Dtype(2); 69 | top[0]->mutable_cpu_data()[0] = loss; 70 | } 71 | 72 | template 73 | void CenterLossLayer::Backward_cpu(const vector*>& top, 74 | const vector& propagate_down, 75 | const vector*>& bottom) { 76 | // Gradient with respect to centers 77 | if (this->param_propagate_down_[0]) { 78 | const Dtype* label = bottom[1]->cpu_data(); 79 | Dtype* center_diff = this->blobs_[0]->mutable_cpu_diff(); 80 | Dtype* variation_sum_data = variation_sum_.mutable_cpu_data(); 81 | const Dtype* distance_data = distance_.cpu_data(); 82 | 83 | // \sum_{y_i==j} 84 | caffe_set(N_ * K_, (Dtype)0., variation_sum_.mutable_cpu_data()); //initialize each value in variation_sum_ using (Dtype)0 85 | for (int n = 0; n < N_; n++) { 86 | int count = 0; 87 | for (int m = 0; m < M_; m++) { 88 | const int label_value = static_cast(label[m]); 89 | if (label_value == n) { 90 | count++; 91 | caffe_sub(K_, variation_sum_data + n * K_, distance_data + m * K_, variation_sum_data + n * K_); 92 | } 93 | } 94 | caffe_axpy(K_, (Dtype)1./(count + (Dtype)1.), variation_sum_data + n * K_, center_diff + n * K_); 95 | } 96 | } 
97 | // Gradient with respect to bottom data 98 | if (propagate_down[0]) { 99 | caffe_copy(M_ * K_, distance_.cpu_data(), bottom[0]->mutable_cpu_diff()); 100 | caffe_scal(M_ * K_, top[0]->cpu_diff()[0] / M_, bottom[0]->mutable_cpu_diff()); 101 | } 102 | if (propagate_down[1]) { 103 | LOG(FATAL) << this->type() 104 | << " Layer cannot backpropagate to label inputs."; 105 | } 106 | } 107 | 108 | #ifdef CPU_ONLY 109 | STUB_GPU(CenterLossLayer); 110 | #endif 111 | 112 | INSTANTIATE_CLASS(CenterLossLayer); 113 | REGISTER_LAYER_CLASS(CenterLoss); 114 | 115 | } // namespace caffe 116 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SSD-Centerloss 2 | This is an unofficial trial applying Centerloss to SSD multibox_loss function 3 | 4 | Technical details are in the paper: **A Discriminative Feature Learning Approach for Deep Face Recognition** 5 | https://pan.baidu.com/s/1up_PWpR85HqVe10yhFzHoQ 6 | 7 | SSD(Single Shot MultiBox Detector) implements the multibox_loss function in the https://github.com/weiliu89/caffe/tree/ssd. We can read the loss function through the coding multibox_loss_layer.h/multibox_loss_layer.cpp 8 | 9 | # Motivation: 10 | When detecting objects on the image, we often employ, including SSD, softmax function to classify the object and L1 regression to localize the object. 11 | 12 | $$ L(x,c,l,g) = \frac{1}{N}(L_{conf}(x,c)+ \alpha L_{loc}(x,l,g))$$ 13 | 14 | In the equation above, $L_{conf}$ represents the function that classifies each object, $L_{loc}$ stands for the localization function. $N$ is the number of default boxes. This equation means that averaging the sum of all of default boxes loss. Each default box will contribute to the final loss. 15 | 16 | However, for some objects that are similar to each other, learning the location information may be easy. 
In that case the softmax classifier struggles, because the features of the foreground samples are too similar to separate. Center loss can effectively reduce the feature variation among samples of the same object class. 17 |
18 |
19 | ![image](https://github.com/BOBrown/SSD-Centerloss/blob/master/center_loss.png) 20 | 21 | 22 | # How to use SSD-Centerloss 23 | ### (1) Add the center loss layer: 24 | The notation ~~ denotes the root path of your caffe-ssd dir 25 | ``` 26 | cp center_loss_layer.cpp ~~/caffe-ssd/src/caffe/layers/ 27 | cp center_loss_layer.hpp ~~/caffe-ssd/include/caffe/layers/ 28 | cp multibox_center_loss_layer.cpp ~~/caffe-ssd/src/caffe/layers/ 29 | cp multibox_center_loss_layer.hpp ~~/caffe-ssd/include/caffe/layers/ 30 | ``` 31 | ### (2) Then add the following code to caffe.proto 32 | ``` 33 | message CenterLossParameter { 34 | optional uint32 num_output = 1; // The number of outputs for the layer 35 | optional FillerParameter center_filler = 2; // The filler for the centers 36 | // The first axis to be lumped into a single inner product computation; 37 | // all preceding axes are retained in the output. 38 | // May be negative to index from the end (e.g., -1 for the last axis). 39 | optional int32 axis = 3 [default = 1]; 40 | } 41 | 42 | message MultiBoxCenterLossParameter{ 43 | //center_features represents the length of features that is equal to the length of object centers in each default box. 44 | optional uint32 center_features = 1; 45 | } 46 | ``` 47 | Add the following fields to the LayerParameter message 48 | ``` 49 | optional MultiBoxCenterLossParameter multibox_center_loss_param = 211;// this field number must be unique within this message 50 | optional CenterLossParameter center_loss_param = 149; 51 | ``` 52 | ### (3) Getting the center_features of each default box 53 | For instance, the fc7_norm layer has 4 anchors, with aspect ratios sqrt(2),1,1/2,2. Each anchor has 16 center_features. Therefore the num_output is 64. 
54 | ``` 55 | layer { 56 | name: "fc7_norm_center_mbox_conf_new" 57 | type: "Convolution" 58 | bottom: "fc7_norm" 59 | top: "fc7_norm_center_mbox_conf" 60 | param { 61 | lr_mult: 1 62 | decay_mult: 1 63 | } 64 | param { 65 | lr_mult: 2 66 | decay_mult: 0 67 | } 68 | convolution_param { 69 | num_output: 64 70 | pad: 1 71 | kernel_size: 3 72 | stride: 1 73 | weight_filler { 74 | type: "xavier" 75 | } 76 | bias_filler { 77 | type: "constant" 78 | value: 0 79 | } 80 | } 81 | } 82 | layer { 83 | name: "fc7_norm_center_mbox_conf_perm" 84 | type: "Permute" 85 | bottom: "fc7_norm_center_mbox_conf" 86 | top: "fc7_norm_center_mbox_conf_perm" 87 | permute_param { 88 | order: 0 89 | order: 2 90 | order: 3 91 | order: 1 92 | } 93 | } 94 | layer { 95 | name: "fc7_norm_mbox_center_conf_flat" 96 | type: "Flatten" 97 | bottom: "fc7_norm_center_mbox_conf_perm" 98 | top: "fc7_norm_mbox_center_conf_flat" 99 | flatten_param { 100 | axis: 1 101 | } 102 | } 103 | ``` 104 | ### (4) Changing the function type in train.prototxt 105 | ``` 106 | layer { 107 | name: "mbox_loss" 108 | type: "MultiBoxCenterLoss" # the type was changed 109 | bottom: "mbox_loc" 110 | bottom: "mbox_conf" 111 | bottom: "mbox_priorbox" 112 | bottom: "label" 113 | bottom: "mbox_center_conf" #mbox_center_conf is the concatenation of all the center_features in all default box. 114 | top: "mbox_loss" 115 | include { 116 | phase: TRAIN 117 | } 118 | propagate_down: true 119 | propagate_down: true 120 | propagate_down: false 121 | propagate_down: false 122 | propagate_down: true #center_features layers need backward. 
123 | loss_param { 124 | normalization: VALID 125 | } 126 | multibox_loss_param { 127 | loc_loss_type: SMOOTH_L1 128 | conf_loss_type: SOFTMAX 129 | loc_weight: 1 130 | num_classes: 21 131 | share_location: true 132 | match_type: PER_PREDICTION 133 | overlap_threshold: 0.2 134 | use_prior_for_matching: true 135 | background_label_id: 0 136 | use_difficult_gt: true 137 | neg_pos_ratio: 3 138 | neg_overlap: 0.1 139 | code_type: CENTER_SIZE 140 | ignore_cross_boundary_bbox: false 141 | mining_type: MAX_NEGATIVE 142 | } 143 | multibox_center_loss_param { 144 | center_features: 16 # center_features represents the length of features that is equal to the length of object centers in each default box. 145 | } 146 | } 147 | ``` -------------------------------------------------------------------------------- /multibox_focal_loss_layer.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "caffe/layers/multibox_focal_loss_layer.hpp" 7 | #include "caffe/util/math_functions.hpp" 8 | 9 | namespace caffe { 10 | 11 | template 12 | void MultiBoxFocalLossLayer::LayerSetUp(const vector*>& bottom, 13 | const vector*>& top) { 14 | LossLayer::LayerSetUp(bottom, top); 15 | if (this->layer_param_.propagate_down_size() == 0) { 16 | this->layer_param_.add_propagate_down(true); 17 | this->layer_param_.add_propagate_down(true); 18 | this->layer_param_.add_propagate_down(false); 19 | this->layer_param_.add_propagate_down(false); 20 | } 21 | 22 | alpha_ = this->layer_param_.focal_loss_param().alpha(); 23 | gamma_ = this->layer_param_.focal_loss_param().gamma(); 24 | 25 | const MultiBoxLossParameter& multibox_loss_param = 26 | this->layer_param_.multibox_loss_param(); 27 | multibox_loss_param_ = this->layer_param_.multibox_loss_param(); 28 | 29 | num_ = bottom[0]->num(); 30 | num_priors_ = bottom[2]->height() / 4; 31 | // Get other parameters. 
32 | CHECK(multibox_loss_param.has_num_classes()) << "Must provide num_classes."; 33 | num_classes_ = multibox_loss_param.num_classes(); 34 | CHECK_GE(num_classes_, 1) << "num_classes should not be less than 1."; 35 | share_location_ = multibox_loss_param.share_location(); 36 | loc_classes_ = share_location_ ? 1 : num_classes_; 37 | background_label_id_ = multibox_loss_param.background_label_id(); 38 | use_difficult_gt_ = multibox_loss_param.use_difficult_gt(); 39 | mining_type_ = multibox_loss_param.mining_type(); 40 | if (multibox_loss_param.has_do_neg_mining()) { 41 | LOG(WARNING) << "do_neg_mining is deprecated, use mining_type instead."; 42 | do_neg_mining_ = multibox_loss_param.do_neg_mining(); 43 | CHECK_EQ(do_neg_mining_, 44 | mining_type_ != MultiBoxLossParameter_MiningType_NONE); 45 | } 46 | do_neg_mining_ = mining_type_ != MultiBoxLossParameter_MiningType_NONE; 47 | 48 | if (!this->layer_param_.loss_param().has_normalization() && 49 | this->layer_param_.loss_param().has_normalize()) { 50 | normalization_ = this->layer_param_.loss_param().normalize() ? 51 | LossParameter_NormalizationMode_VALID : 52 | LossParameter_NormalizationMode_BATCH_SIZE; 53 | } else { 54 | normalization_ = this->layer_param_.loss_param().normalization(); 55 | } 56 | 57 | if (do_neg_mining_) { 58 | CHECK(share_location_) 59 | << "Currently only support negative mining if share_location is true."; 60 | } 61 | 62 | vector loss_shape(1, 1); 63 | // Set up localization loss layer. 64 | loc_weight_ = multibox_loss_param.loc_weight(); 65 | loc_loss_type_ = multibox_loss_param.loc_loss_type(); 66 | // fake shape. 
67 | vector loc_shape(1, 1); 68 | loc_shape.push_back(4); 69 | loc_pred_.Reshape(loc_shape); 70 | loc_gt_.Reshape(loc_shape); 71 | loc_bottom_vec_.push_back(&loc_pred_); 72 | loc_bottom_vec_.push_back(&loc_gt_); 73 | loc_loss_.Reshape(loss_shape); 74 | loc_top_vec_.push_back(&loc_loss_); 75 | if (loc_loss_type_ == MultiBoxLossParameter_LocLossType_L2) { 76 | LayerParameter layer_param; 77 | layer_param.set_name(this->layer_param_.name() + "_l2_loc"); 78 | layer_param.set_type("EuclideanLoss"); 79 | layer_param.add_loss_weight(loc_weight_); 80 | loc_loss_layer_ = LayerRegistry::CreateLayer(layer_param); 81 | loc_loss_layer_->SetUp(loc_bottom_vec_, loc_top_vec_); 82 | } else if (loc_loss_type_ == MultiBoxLossParameter_LocLossType_SMOOTH_L1) { 83 | LayerParameter layer_param; 84 | layer_param.set_name(this->layer_param_.name() + "_smooth_L1_loc"); 85 | layer_param.set_type("SmoothL1Loss"); 86 | layer_param.add_loss_weight(loc_weight_); 87 | loc_loss_layer_ = LayerRegistry::CreateLayer(layer_param); 88 | loc_loss_layer_->SetUp(loc_bottom_vec_, loc_top_vec_); 89 | } else { 90 | LOG(FATAL) << "Unknown localization loss type."; 91 | } 92 | // Set up confidence loss layer. 
93 | conf_loss_type_ = multibox_loss_param.conf_loss_type(); 94 | conf_bottom_vec_.push_back(&conf_pred_); 95 | conf_bottom_vec_.push_back(&conf_gt_); 96 | conf_loss_.Reshape(loss_shape); 97 | conf_top_vec_.push_back(&conf_loss_); 98 | if (conf_loss_type_ == MultiBoxLossParameter_ConfLossType_SOFTMAX) { 99 | CHECK_GE(background_label_id_, 0) 100 | << "background_label_id should be within [0, num_classes) for Softmax."; 101 | CHECK_LT(background_label_id_, num_classes_) 102 | << "background_label_id should be within [0, num_classes) for Softmax."; 103 | LayerParameter layer_param; 104 | layer_param.set_name(this->layer_param_.name() + "_focal_conf"); 105 | layer_param.set_type("FocalLoss"); 106 | layer_param.add_loss_weight(Dtype(1.)); 107 | layer_param.mutable_loss_param()->set_normalization( 108 | LossParameter_NormalizationMode_NONE); 109 | FocalLossParameter* faocal_param = layer_param.mutable_focal_loss_param(); 110 | faocal_param->set_axis(1); 111 | faocal_param->set_alpha(alpha_); 112 | faocal_param->set_gamma(gamma_); 113 | // Fake reshape. 114 | vector conf_shape(1, 1); 115 | conf_gt_.Reshape(conf_shape); 116 | conf_shape.push_back(num_classes_); 117 | conf_pred_.Reshape(conf_shape); 118 | conf_loss_layer_ = LayerRegistry::CreateLayer(layer_param); 119 | conf_loss_layer_->SetUp(conf_bottom_vec_, conf_top_vec_); 120 | } else if (conf_loss_type_ == MultiBoxLossParameter_ConfLossType_LOGISTIC) { 121 | LayerParameter layer_param; 122 | layer_param.set_name(this->layer_param_.name() + "_logistic_conf"); 123 | layer_param.set_type("SigmoidCrossEntropyLoss"); 124 | layer_param.add_loss_weight(Dtype(1.)); 125 | // Fake reshape. 
// NOTE(review): this listing appears to have lost its template argument lists
// (e.g. `template`, `vector*>&`, `map >`) during extraction; the code tokens
// below are reproduced exactly as they appear and are not repaired here.
//
// NOTE(review): the statements up to the first closing brace at column 0 are
// the tail of a function that starts before this chunk (presumably
// LayerSetUp -- confirm). They give conf_gt_ and conf_pred_ a
// {1, num_classes_} shape, create conf_loss_layer_ from layer_param and set it
// up on conf_bottom_vec_ / conf_top_vec_; any other confidence loss type is
// fatal.
    vector conf_shape(1, 1);
    conf_shape.push_back(num_classes_);
    conf_gt_.Reshape(conf_shape);
    conf_pred_.Reshape(conf_shape);
    conf_loss_layer_ = LayerRegistry::CreateLayer(layer_param);
    conf_loss_layer_->SetUp(conf_bottom_vec_, conf_top_vec_);
  } else {
    LOG(FATAL) << "Unknown confidence loss type.";
  }
}

// Reshape: refreshes the cached batch/prior/ground-truth counts from the
// bottom blobs and checks that the prediction blobs are consistent with them.
//
//   num_        <- bottom[0]->num()
//   num_priors_ <- bottom[2]->height() / 4
//   num_gt_     <- bottom[3]->height()
//
// The checks require bottom[0] and bottom[1] to have the same num(), and
// their channels() to equal num_priors_ * loc_classes_ * 4 and
// num_priors_ * num_classes_ respectively.
template
void MultiBoxFocalLossLayer::Reshape(const vector*>& bottom,
      const vector*>& top) {
  LossLayer::Reshape(bottom, top);
  num_ = bottom[0]->num();
  // presumably 4 values are stored per prior box -- TODO confirm against the
  // layer that produces bottom[2].
  num_priors_ = bottom[2]->height() / 4;
  num_gt_ = bottom[3]->height();
  CHECK_EQ(bottom[0]->num(), bottom[1]->num());
  CHECK_EQ(num_priors_ * loc_classes_ * 4, bottom[0]->channels())
      << "Number of priors must match number of location predictions.";
  CHECK_EQ(num_priors_ * num_classes_, bottom[1]->channels())
      << "Number of priors must match number of confidence predictions.";
}

// Forward_cpu: computes the combined localization + confidence loss on the
// CPU and writes the scalar result to top[0].
//
// Inputs, as read below:
//   bottom[0]  location predictions
//   bottom[1]  confidence predictions
//   bottom[2]  prior boxes
//   bottom[3]  ground truth
//
// Outline:
//   1. Retrieve ground truth, priors and location predictions via helpers.
//   2. FindMatches / MineHardExamples fill all_match_indices_,
//      all_neg_indices_, num_matches_ and the local num_negs.
//   3. Localization: with at least one match, loc_pred_ / loc_gt_ are reshaped
//      to {1, num_matches_ * 4}, filled by EncodeLocPrediction and passed
//      through loc_loss_layer_; otherwise loc_loss_ is set to 0.
//   4. Confidence: num_conf_ is num_matches_ + num_negs when do_neg_mining_ is
//      set, else num_ * num_priors_. With at least one entry, conf_pred_ /
//      conf_gt_ are reshaped, filled and passed through conf_loss_layer_;
//      otherwise conf_loss_ is set to 0.
//   5. top[0] accumulates each term divided by the normalizer, and only for
//      the bottoms whose layer_param_.propagate_down(i) flag is true.
//
// State written here (all_match_indices_, all_neg_indices_, num_matches_,
// num_conf_, loc_pred_, conf_pred_) is read again by Backward_cpu.
template
void MultiBoxFocalLossLayer::Forward_cpu(const vector*>& bottom,
    const vector*>& top) {
  const Dtype* loc_data = bottom[0]->cpu_data();
  const Dtype* conf_data = bottom[1]->cpu_data();
  const Dtype* prior_data = bottom[2]->cpu_data();
  const Dtype* gt_data = bottom[3]->cpu_data();

  // Retrieve all ground truth.
  map > all_gt_bboxes;
  GetGroundTruth(gt_data, num_gt_, background_label_id_, use_difficult_gt_,
                 &all_gt_bboxes);

  // Retrieve all prior bboxes. It is same within a batch since we assume all
  // images in a batch are of same dimension.
  vector prior_bboxes;
  vector > prior_variances;
  GetPriorBBoxes(prior_data, num_priors_, &prior_bboxes, &prior_variances);

  // Retrieve all predictions.
  vector all_loc_preds;
  GetLocPredictions(loc_data, num_, num_priors_, loc_classes_, share_location_,
                    &all_loc_preds);

  // Find matches between source bboxes and ground truth bboxes.
  vector > > all_match_overlaps;
  FindMatches(all_loc_preds, all_gt_bboxes, prior_bboxes, prior_variances,
              multibox_loss_param_, &all_match_overlaps, &all_match_indices_);

  // Both counters are reset here and then filled in by MineHardExamples.
  num_matches_ = 0;
  int num_negs = 0;
  // Sample hard negative (and positive) examples based on mining type.
  MineHardExamples(*bottom[1], all_loc_preds, all_gt_bboxes, prior_bboxes,
                   prior_variances, all_match_overlaps, multibox_loss_param_,
                   &num_matches_, &num_negs, &all_match_indices_,
                   &all_neg_indices_);

  if (num_matches_ >= 1) {
    // Form data to pass on to loc_loss_layer_.
    // Shape is {1, num_matches_ * 4}: four values per matched prior.
    vector loc_shape(2);
    loc_shape[0] = 1;
    loc_shape[1] = num_matches_ * 4;
    loc_pred_.Reshape(loc_shape);
    loc_gt_.Reshape(loc_shape);
    Dtype* loc_pred_data = loc_pred_.mutable_cpu_data();
    Dtype* loc_gt_data = loc_gt_.mutable_cpu_data();
    EncodeLocPrediction(all_loc_preds, all_gt_bboxes, all_match_indices_,
                        prior_bboxes, prior_variances, multibox_loss_param_,
                        loc_pred_data, loc_gt_data);
    // presumably loc_bottom_vec_ / loc_top_vec_ were wired to loc_pred_,
    // loc_gt_ and loc_loss_ during setup -- verify against LayerSetUp.
    loc_loss_layer_->Reshape(loc_bottom_vec_, loc_top_vec_);
    loc_loss_layer_->Forward(loc_bottom_vec_, loc_top_vec_);
  } else {
    // No matched prior in the whole batch: the localization term is zero.
    loc_loss_.mutable_cpu_data()[0] = 0;
  }

  // Form data to pass on to conf_loss_layer_.
  if (do_neg_mining_) {
    num_conf_ = num_matches_ + num_negs;
  } else {
    num_conf_ = num_ * num_priors_;
  }
  if (num_conf_ >= 1) {
    // Reshape the confidence data.
    vector conf_shape;
    if (conf_loss_type_ == MultiBoxLossParameter_ConfLossType_SOFTMAX) {
      // conf_gt_ is {num_conf_}; conf_pred_ is {num_conf_, num_classes_}.
      conf_shape.push_back(num_conf_);
      conf_gt_.Reshape(conf_shape);
      conf_shape.push_back(num_classes_);
      conf_pred_.Reshape(conf_shape);
    } else if (conf_loss_type_ == MultiBoxLossParameter_ConfLossType_LOGISTIC) {
      // Both blobs are {1, num_conf_, num_classes_}.
      conf_shape.push_back(1);
      conf_shape.push_back(num_conf_);
      conf_shape.push_back(num_classes_);
      conf_gt_.Reshape(conf_shape);
      conf_pred_.Reshape(conf_shape);
    } else {
      LOG(FATAL) << "Unknown confidence loss type.";
    }
    if (!do_neg_mining_) {
      // Consider all scores.
      // Share data and diff with bottom[1].
      // (Only the data is shared at this point; the diff is shared in
      // Backward_cpu through bottom[1]->ShareDiff(conf_pred_).)
      CHECK_EQ(conf_pred_.count(), bottom[1]->count());
      conf_pred_.ShareData(*(bottom[1]));
    }
    Dtype* conf_pred_data = conf_pred_.mutable_cpu_data();
    Dtype* conf_gt_data = conf_gt_.mutable_cpu_data();
    // Every ground-truth slot starts as background before encoding.
    caffe_set(conf_gt_.count(), Dtype(background_label_id_), conf_gt_data);
    EncodeConfPrediction(conf_data, num_, num_priors_, multibox_loss_param_,
                         all_match_indices_, all_neg_indices_, all_gt_bboxes,
                         conf_pred_data, conf_gt_data);
    conf_loss_layer_->Reshape(conf_bottom_vec_, conf_top_vec_);
    conf_loss_layer_->Forward(conf_bottom_vec_, conf_top_vec_);
  } else {
    // Nothing to classify: the confidence term is zero.
    conf_loss_.mutable_cpu_data()[0] = 0;
  }

  // Combine the two terms. Each one contributes only when the corresponding
  // bottom is flagged for back-propagation in the layer parameters.
  top[0]->mutable_cpu_data()[0] = 0;
  if (this->layer_param_.propagate_down(0)) {
    Dtype normalizer = LossLayer::GetNormalizer(
        normalization_, num_, num_priors_, num_matches_);
    top[0]->mutable_cpu_data()[0] +=
        loc_weight_ * loc_loss_.cpu_data()[0] / normalizer;
  }
  if (this->layer_param_.propagate_down(1)) {
    Dtype normalizer = LossLayer::GetNormalizer(
        normalization_, num_, num_priors_, num_matches_);
    top[0]->mutable_cpu_data()[0] += conf_loss_.cpu_data()[0] / normalizer;
  }
}

// Backward_cpu: propagates the loss gradient to the location (bottom[0]) and
// confidence (bottom[1]) predictions.
//
// Requesting gradients for bottom[2] (priors) or bottom[3] (labels) is fatal.
//
// For each requested bottom the diff is first zeroed. The internal loss layer
// is then run backward on its prediction input only, the packed gradient is
// scaled by top[0]->cpu_diff()[0] / normalizer, and the result is scattered
// back to the positions of the selected priors.
//
// The match statistics produced by Forward_cpu (all_match_indices_,
// all_neg_indices_) are cleared at the end, so a Forward_cpu call must precede
// every Backward_cpu call.
template
void MultiBoxFocalLossLayer::Backward_cpu(const vector*>& top,
    const vector& propagate_down,
    const vector*>& bottom) {

  if (propagate_down[2]) {
    LOG(FATAL) << this->type()
        << " Layer cannot backpropagate to prior inputs.";
  }
  if (propagate_down[3]) {
    LOG(FATAL) << this->type()
        << " Layer cannot backpropagate to label inputs.";
  }

  // Back propagate on location prediction.
  if (propagate_down[0]) {
    Dtype* loc_bottom_diff = bottom[0]->mutable_cpu_diff();
    // Priors without a match keep a zero gradient.
    caffe_set(bottom[0]->count(), Dtype(0), loc_bottom_diff);
    if (num_matches_ >= 1) {
      vector loc_propagate_down;
      // Only back propagate on prediction, not ground truth.
      loc_propagate_down.push_back(true);
      loc_propagate_down.push_back(false);
      loc_loss_layer_->Backward(loc_top_vec_, loc_propagate_down,
                                loc_bottom_vec_);
      // Scale gradient.
      // NOTE(review): loc_weight_ is applied in Forward_cpu but not here;
      // presumably it reaches the gradient through loc_loss_layer_'s own loss
      // weight -- verify against LayerSetUp.
      Dtype normalizer = LossLayer::GetNormalizer(
          normalization_, num_, num_priors_, num_matches_);
      Dtype loss_weight = top[0]->cpu_diff()[0] / normalizer;
      caffe_scal(loc_pred_.count(), loss_weight, loc_pred_.mutable_cpu_diff());
      // Copy gradient back to bottom[0].
      const Dtype* loc_pred_diff = loc_pred_.cpu_diff();
      // `count` indexes the packed loc_pred_ entries and runs across the whole
      // batch, so this traversal has to visit matches in the same order in
      // which loc_pred_ was filled (presumably by EncodeLocPrediction --
      // confirm).
      int count = 0;
      for (int i = 0; i < num_; ++i) {
        for (map >::iterator it =
             all_match_indices_[i].begin();
             it != all_match_indices_[i].end(); ++it) {
          // With shared locations there is a single location class (index 0).
          const int label = share_location_ ? 0 : it->first;
          const vector& match_index = it->second;
          for (int j = 0; j < match_index.size(); ++j) {
            // Entries <= -1 mark priors that were not selected.
            if (match_index[j] <= -1) {
              continue;
            }
            // Copy the diff to the right place.
            int start_idx = loc_classes_ * 4 * j + label * 4;
            caffe_copy(4, loc_pred_diff + count * 4,
                              loc_bottom_diff + start_idx);
            ++count;
          }
        }
        // Advance to the next image in the batch.
        loc_bottom_diff += bottom[0]->offset(1);
      }
    }
  }

  // Back propagate on confidence prediction.
  if (propagate_down[1]) {
    Dtype* conf_bottom_diff = bottom[1]->mutable_cpu_diff();
    // Priors that were not selected keep a zero gradient.
    caffe_set(bottom[1]->count(), Dtype(0), conf_bottom_diff);
    if (num_conf_ >= 1) {
      vector conf_propagate_down;
      // Only back propagate on prediction, not ground truth.
      conf_propagate_down.push_back(true);
      conf_propagate_down.push_back(false);
      conf_loss_layer_->Backward(conf_top_vec_, conf_propagate_down,
                                 conf_bottom_vec_);
      // Scale gradient.
      Dtype normalizer = LossLayer::GetNormalizer(
          normalization_, num_, num_priors_, num_matches_);
      Dtype loss_weight = top[0]->cpu_diff()[0] / normalizer;
      caffe_scal(conf_pred_.count(), loss_weight,
                 conf_pred_.mutable_cpu_diff());
      // Copy gradient back to bottom[1].
      const Dtype* conf_pred_diff = conf_pred_.cpu_diff();
      if (do_neg_mining_) {
        // conf_pred_ holds only the selected priors: for every image the
        // positives come first, followed by the mined negatives. `count`
        // walks that packed layout across the whole batch.
        int count = 0;
        for (int i = 0; i < num_; ++i) {
          // Copy matched (positive) bboxes scores' diff.
          const map >& match_indices = all_match_indices_[i];
          for (map >::const_iterator it =
               match_indices.begin(); it != match_indices.end(); ++it) {
            const vector& match_index = it->second;
            CHECK_EQ(match_index.size(), num_priors_);
            for (int j = 0; j < num_priors_; ++j) {
              // Entries <= -1 mark priors that were not selected.
              if (match_index[j] <= -1) {
                continue;
              }
              // Copy the diff to the right place.
              caffe_copy(num_classes_,
                                conf_pred_diff + count * num_classes_,
                                conf_bottom_diff + j * num_classes_);
              ++count;
            }
          }
          // Copy negative bboxes scores' diff.
          for (int n = 0; n < all_neg_indices_[i].size(); ++n) {
            int j = all_neg_indices_[i][n];
            CHECK_LT(j, num_priors_);
            caffe_copy(num_classes_,
                       conf_pred_diff + count * num_classes_,
                       conf_bottom_diff + j * num_classes_);
            ++count;
          }
          // Advance to the next image in the batch.
          conf_bottom_diff += bottom[1]->offset(1);
        }
      } else {
        // The diff is already computed and stored.
        bottom[1]->ShareDiff(conf_pred_);
      }
    }
  }

  // After backward, remove match statistics.
  all_match_indices_.clear();
  all_neg_indices_.clear();
}

// Instantiate the layer template and register it under the type name
// "MultiBoxFocalLoss".
INSTANTIATE_CLASS(MultiBoxFocalLossLayer);
REGISTER_LAYER_CLASS(MultiBoxFocalLoss);

}  // namespace caffe

-------------------------------------------------------------------------------- /center_loss_test.prototxt: -------------------------------------------------------------------------------- 1 | layer { 2 | name: "data" 3 | type: "AnnotatedData" 4 | top: "data" 5 | top: "label" 6 | include { 7 | phase: TEST 8 | } 9 | transform_param { 10 | mean_value: 28 11 | mean_value: 28 12 | mean_value: 28 13 | resize_param { 14 | prob: 1 15 | resize_mode: WARP 16 | height: 300 17 | width: 300 18 | interp_mode: LINEAR 19 | } 20 | } 21 | data_param { 22 | source: "/VOC0712/lmdb/VOC0712_test_lmdb/" 23 | batch_size: 1 24 | backend: LMDB 25 | } 26 | annotated_data_param { 27 | batch_sampler { 28 | } 29 | label_map_file: "/VOC0712/labelmap_voc.prototxt" 30 | } 31 | } 32 | layer { 33 | name: "conv1_1" 34 | type: "Convolution" 35 | bottom: "data" 36 | top: "conv1_1" 37 | param { 38 | lr_mult: 1 39 | decay_mult: 1 40 | } 41 | param { 42 | lr_mult: 2 43 | decay_mult: 0 44 | } 45 | convolution_param { 46 | num_output: 64 47 | pad: 1 48 | kernel_size: 3 49 | weight_filler { 50 | type: "xavier" 51 | } 52 | bias_filler { 53 | type: "constant" 54 | value: 0 55 | } 56 | } 57 | } 58 | layer { 59 | name: "relu1_1" 60 | type: "ReLU" 61 | bottom: "conv1_1" 62 | top: "conv1_1" 63 |
} 64 | layer { 65 | name: "conv1_2" 66 | type: "Convolution" 67 | bottom: "conv1_1" 68 | top: "conv1_2" 69 | param { 70 | lr_mult: 1 71 | decay_mult: 1 72 | } 73 | param { 74 | lr_mult: 2 75 | decay_mult: 0 76 | } 77 | convolution_param { 78 | num_output: 64 79 | pad: 1 80 | kernel_size: 3 81 | weight_filler { 82 | type: "xavier" 83 | } 84 | bias_filler { 85 | type: "constant" 86 | value: 0 87 | } 88 | } 89 | } 90 | layer { 91 | name: "relu1_2" 92 | type: "ReLU" 93 | bottom: "conv1_2" 94 | top: "conv1_2" 95 | } 96 | layer { 97 | name: "pool1" 98 | type: "Pooling" 99 | bottom: "conv1_2" 100 | top: "pool1" 101 | pooling_param { 102 | pool: MAX 103 | kernel_size: 2 104 | stride: 2 105 | } 106 | } 107 | layer { 108 | name: "conv2_1" 109 | type: "Convolution" 110 | bottom: "pool1" 111 | top: "conv2_1" 112 | param { 113 | lr_mult: 1 114 | decay_mult: 1 115 | } 116 | param { 117 | lr_mult: 2 118 | decay_mult: 0 119 | } 120 | convolution_param { 121 | num_output: 128 122 | pad: 1 123 | kernel_size: 3 124 | weight_filler { 125 | type: "xavier" 126 | } 127 | bias_filler { 128 | type: "constant" 129 | value: 0 130 | } 131 | } 132 | } 133 | layer { 134 | name: "relu2_1" 135 | type: "ReLU" 136 | bottom: "conv2_1" 137 | top: "conv2_1" 138 | } 139 | layer { 140 | name: "conv2_2" 141 | type: "Convolution" 142 | bottom: "conv2_1" 143 | top: "conv2_2" 144 | param { 145 | lr_mult: 1 146 | decay_mult: 1 147 | } 148 | param { 149 | lr_mult: 2 150 | decay_mult: 0 151 | } 152 | convolution_param { 153 | num_output: 128 154 | pad: 1 155 | kernel_size: 3 156 | weight_filler { 157 | type: "xavier" 158 | } 159 | bias_filler { 160 | type: "constant" 161 | value: 0 162 | } 163 | } 164 | } 165 | layer { 166 | name: "relu2_2" 167 | type: "ReLU" 168 | bottom: "conv2_2" 169 | top: "conv2_2" 170 | } 171 | layer { 172 | name: "pool2" 173 | type: "Pooling" 174 | bottom: "conv2_2" 175 | top: "pool2" 176 | pooling_param { 177 | pool: MAX 178 | kernel_size: 2 179 | stride: 2 180 | } 181 | } 182 | layer 
{ 183 | name: "conv3_1" 184 | type: "Convolution" 185 | bottom: "pool2" 186 | top: "conv3_1" 187 | param { 188 | lr_mult: 1 189 | decay_mult: 1 190 | } 191 | param { 192 | lr_mult: 2 193 | decay_mult: 0 194 | } 195 | convolution_param { 196 | num_output: 256 197 | pad: 1 198 | kernel_size: 3 199 | weight_filler { 200 | type: "xavier" 201 | } 202 | bias_filler { 203 | type: "constant" 204 | value: 0 205 | } 206 | } 207 | } 208 | layer { 209 | name: "relu3_1" 210 | type: "ReLU" 211 | bottom: "conv3_1" 212 | top: "conv3_1" 213 | } 214 | layer { 215 | name: "conv3_2" 216 | type: "Convolution" 217 | bottom: "conv3_1" 218 | top: "conv3_2" 219 | param { 220 | lr_mult: 1 221 | decay_mult: 1 222 | } 223 | param { 224 | lr_mult: 2 225 | decay_mult: 0 226 | } 227 | convolution_param { 228 | num_output: 256 229 | pad: 1 230 | kernel_size: 3 231 | weight_filler { 232 | type: "xavier" 233 | } 234 | bias_filler { 235 | type: "constant" 236 | value: 0 237 | } 238 | } 239 | } 240 | layer { 241 | name: "relu3_2" 242 | type: "ReLU" 243 | bottom: "conv3_2" 244 | top: "conv3_2" 245 | } 246 | layer { 247 | name: "conv3_3" 248 | type: "Convolution" 249 | bottom: "conv3_2" 250 | top: "conv3_3" 251 | param { 252 | lr_mult: 1 253 | decay_mult: 1 254 | } 255 | param { 256 | lr_mult: 2 257 | decay_mult: 0 258 | } 259 | convolution_param { 260 | num_output: 256 261 | pad: 1 262 | kernel_size: 3 263 | weight_filler { 264 | type: "xavier" 265 | } 266 | bias_filler { 267 | type: "constant" 268 | value: 0 269 | } 270 | } 271 | } 272 | layer { 273 | name: "relu3_3" 274 | type: "ReLU" 275 | bottom: "conv3_3" 276 | top: "conv3_3" 277 | } 278 | layer { 279 | name: "pool3" 280 | type: "Pooling" 281 | bottom: "conv3_3" 282 | top: "pool3" 283 | pooling_param { 284 | pool: MAX 285 | kernel_size: 2 286 | stride: 2 287 | } 288 | } 289 | layer { 290 | name: "conv4_1" 291 | type: "Convolution" 292 | bottom: "pool3" 293 | top: "conv4_1" 294 | param { 295 | lr_mult: 1 296 | decay_mult: 1 297 | } 298 | param { 
299 | lr_mult: 2 300 | decay_mult: 0 301 | } 302 | convolution_param { 303 | num_output: 512 304 | pad: 1 305 | kernel_size: 3 306 | weight_filler { 307 | type: "xavier" 308 | } 309 | bias_filler { 310 | type: "constant" 311 | value: 0 312 | } 313 | } 314 | } 315 | layer { 316 | name: "relu4_1" 317 | type: "ReLU" 318 | bottom: "conv4_1" 319 | top: "conv4_1" 320 | } 321 | layer { 322 | name: "conv4_2" 323 | type: "Convolution" 324 | bottom: "conv4_1" 325 | top: "conv4_2" 326 | param { 327 | lr_mult: 1 328 | decay_mult: 1 329 | } 330 | param { 331 | lr_mult: 2 332 | decay_mult: 0 333 | } 334 | convolution_param { 335 | num_output: 512 336 | pad: 1 337 | kernel_size: 3 338 | weight_filler { 339 | type: "xavier" 340 | } 341 | bias_filler { 342 | type: "constant" 343 | value: 0 344 | } 345 | } 346 | } 347 | layer { 348 | name: "relu4_2" 349 | type: "ReLU" 350 | bottom: "conv4_2" 351 | top: "conv4_2" 352 | } 353 | layer { 354 | name: "conv4_3" 355 | type: "Convolution" 356 | bottom: "conv4_2" 357 | top: "conv4_3" 358 | param { 359 | lr_mult: 1 360 | decay_mult: 1 361 | } 362 | param { 363 | lr_mult: 2 364 | decay_mult: 0 365 | } 366 | convolution_param { 367 | num_output: 512 368 | pad: 1 369 | kernel_size: 3 370 | weight_filler { 371 | type: "xavier" 372 | } 373 | bias_filler { 374 | type: "constant" 375 | value: 0 376 | } 377 | } 378 | } 379 | layer { 380 | name: "relu4_3" 381 | type: "ReLU" 382 | bottom: "conv4_3" 383 | top: "conv4_3" 384 | } 385 | layer { 386 | name: "pool4" 387 | type: "Pooling" 388 | bottom: "conv4_3" 389 | top: "pool4" 390 | pooling_param { 391 | pool: MAX 392 | kernel_size: 3 393 | stride: 1 394 | pad: 1 395 | } 396 | } 397 | layer { 398 | name: "conv5_1" 399 | type: "Convolution" 400 | bottom: "pool4" 401 | top: "conv5_1" 402 | param { 403 | lr_mult: 1 404 | decay_mult: 1 405 | } 406 | param { 407 | lr_mult: 2 408 | decay_mult: 0 409 | } 410 | convolution_param { 411 | num_output: 512 412 | pad: 2 413 | kernel_size: 3 414 | weight_filler { 415 | 
type: "xavier" 416 | } 417 | bias_filler { 418 | type: "constant" 419 | value: 0 420 | } 421 | dilation: 2 422 | } 423 | } 424 | layer { 425 | name: "relu5_1" 426 | type: "ReLU" 427 | bottom: "conv5_1" 428 | top: "conv5_1" 429 | } 430 | layer { 431 | name: "conv5_2" 432 | type: "Convolution" 433 | bottom: "conv5_1" 434 | top: "conv5_2" 435 | param { 436 | lr_mult: 1 437 | decay_mult: 1 438 | } 439 | param { 440 | lr_mult: 2 441 | decay_mult: 0 442 | } 443 | convolution_param { 444 | num_output: 512 445 | pad: 2 446 | kernel_size: 3 447 | weight_filler { 448 | type: "xavier" 449 | } 450 | bias_filler { 451 | type: "constant" 452 | value: 0 453 | } 454 | dilation: 2 455 | } 456 | } 457 | layer { 458 | name: "relu5_2" 459 | type: "ReLU" 460 | bottom: "conv5_2" 461 | top: "conv5_2" 462 | } 463 | layer { 464 | name: "conv5_3" 465 | type: "Convolution" 466 | bottom: "conv5_2" 467 | top: "conv5_3" 468 | param { 469 | lr_mult: 1 470 | decay_mult: 1 471 | } 472 | param { 473 | lr_mult: 2 474 | decay_mult: 0 475 | } 476 | convolution_param { 477 | num_output: 512 478 | pad: 2 479 | kernel_size: 3 480 | weight_filler { 481 | type: "xavier" 482 | } 483 | bias_filler { 484 | type: "constant" 485 | value: 0 486 | } 487 | dilation: 2 488 | } 489 | } 490 | layer { 491 | name: "relu5_3" 492 | type: "ReLU" 493 | bottom: "conv5_3" 494 | top: "conv5_3" 495 | } 496 | layer { 497 | name: "pool5" 498 | type: "Pooling" 499 | bottom: "conv5_3" 500 | top: "pool5" 501 | pooling_param { 502 | pool: MAX 503 | kernel_size: 3 504 | stride: 1 505 | pad: 1 506 | } 507 | } 508 | layer { 509 | name: "fc6" 510 | type: "Convolution" 511 | bottom: "pool5" 512 | top: "fc6" 513 | param { 514 | lr_mult: 1 515 | decay_mult: 1 516 | } 517 | param { 518 | lr_mult: 2 519 | decay_mult: 0 520 | } 521 | convolution_param { 522 | num_output: 1024 523 | pad: 6 524 | kernel_size: 3 525 | weight_filler { 526 | type: "xavier" 527 | } 528 | bias_filler { 529 | type: "constant" 530 | value: 0 531 | } 532 | dilation: 6 
533 | } 534 | } 535 | layer { 536 | name: "relu6" 537 | type: "ReLU" 538 | bottom: "fc6" 539 | top: "fc6" 540 | } 541 | layer { 542 | name: "fc7" 543 | type: "Convolution" 544 | bottom: "fc6" 545 | top: "fc7" 546 | param { 547 | lr_mult: 1 548 | decay_mult: 1 549 | } 550 | param { 551 | lr_mult: 2 552 | decay_mult: 0 553 | } 554 | convolution_param { 555 | num_output: 1024 556 | kernel_size: 1 557 | weight_filler { 558 | type: "xavier" 559 | } 560 | bias_filler { 561 | type: "constant" 562 | value: 0 563 | } 564 | } 565 | } 566 | layer { 567 | name: "relu7" 568 | type: "ReLU" 569 | bottom: "fc7" 570 | top: "fc7" 571 | } 572 | layer { 573 | name: "conv6_1" 574 | type: "Convolution" 575 | bottom: "fc7" 576 | top: "conv6_1" 577 | param { 578 | lr_mult: 1 579 | decay_mult: 1 580 | } 581 | param { 582 | lr_mult: 2 583 | decay_mult: 0 584 | } 585 | convolution_param { 586 | num_output: 256 587 | pad: 0 588 | kernel_size: 1 589 | stride: 1 590 | weight_filler { 591 | type: "xavier" 592 | } 593 | bias_filler { 594 | type: "constant" 595 | value: 0 596 | } 597 | } 598 | } 599 | layer { 600 | name: "conv6_1_relu" 601 | type: "ReLU" 602 | bottom: "conv6_1" 603 | top: "conv6_1" 604 | } 605 | layer { 606 | name: "conv6_2" 607 | type: "Convolution" 608 | bottom: "conv6_1" 609 | top: "conv6_2" 610 | param { 611 | lr_mult: 1 612 | decay_mult: 1 613 | } 614 | param { 615 | lr_mult: 2 616 | decay_mult: 0 617 | } 618 | convolution_param { 619 | num_output: 512 620 | pad: 6 621 | kernel_size: 3 622 | stride: 1 623 | weight_filler { 624 | type: "xavier" 625 | } 626 | bias_filler { 627 | type: "constant" 628 | value: 0 629 | } 630 | dilation: 6 631 | } 632 | } 633 | layer { 634 | name: "conv6_2_relu" 635 | type: "ReLU" 636 | bottom: "conv6_2" 637 | top: "conv6_2" 638 | } 639 | layer { 640 | name: "conv7_1" 641 | type: "Convolution" 642 | bottom: "conv6_2" 643 | top: "conv7_1" 644 | param { 645 | lr_mult: 1 646 | decay_mult: 1 647 | } 648 | param { 649 | lr_mult: 2 650 | decay_mult: 0 
651 | } 652 | convolution_param { 653 | num_output: 128 654 | pad: 0 655 | kernel_size: 1 656 | stride: 1 657 | weight_filler { 658 | type: "xavier" 659 | } 660 | bias_filler { 661 | type: "constant" 662 | value: 0 663 | } 664 | } 665 | } 666 | layer { 667 | name: "conv7_1_relu" 668 | type: "ReLU" 669 | bottom: "conv7_1" 670 | top: "conv7_1" 671 | } 672 | layer { 673 | name: "conv7_2" 674 | type: "Convolution" 675 | bottom: "conv7_1" 676 | top: "conv7_2" 677 | param { 678 | lr_mult: 1 679 | decay_mult: 1 680 | } 681 | param { 682 | lr_mult: 2 683 | decay_mult: 0 684 | } 685 | convolution_param { 686 | num_output: 256 687 | pad: 1 688 | kernel_size: 3 689 | stride: 2 690 | weight_filler { 691 | type: "xavier" 692 | } 693 | bias_filler { 694 | type: "constant" 695 | value: 0 696 | } 697 | } 698 | } 699 | layer { 700 | name: "conv7_2_relu" 701 | type: "ReLU" 702 | bottom: "conv7_2" 703 | top: "conv7_2" 704 | } 705 | layer { 706 | name: "conv8_1" 707 | type: "Convolution" 708 | bottom: "conv7_2" 709 | top: "conv8_1" 710 | param { 711 | lr_mult: 1 712 | decay_mult: 1 713 | } 714 | param { 715 | lr_mult: 2 716 | decay_mult: 0 717 | } 718 | convolution_param { 719 | num_output: 128 720 | pad: 0 721 | kernel_size: 1 722 | stride: 1 723 | weight_filler { 724 | type: "xavier" 725 | } 726 | bias_filler { 727 | type: "constant" 728 | value: 0 729 | } 730 | } 731 | } 732 | layer { 733 | name: "conv8_1_relu" 734 | type: "ReLU" 735 | bottom: "conv8_1" 736 | top: "conv8_1" 737 | } 738 | layer { 739 | name: "conv8_2" 740 | type: "Convolution" 741 | bottom: "conv8_1" 742 | top: "conv8_2" 743 | param { 744 | lr_mult: 1 745 | decay_mult: 1 746 | } 747 | param { 748 | lr_mult: 2 749 | decay_mult: 0 750 | } 751 | convolution_param { 752 | num_output: 256 753 | pad: 1 754 | kernel_size: 3 755 | stride: 2 756 | weight_filler { 757 | type: "xavier" 758 | } 759 | bias_filler { 760 | type: "constant" 761 | value: 0 762 | } 763 | } 764 | } 765 | layer { 766 | name: "conv8_2_relu" 767 | type: 
"ReLU" 768 | bottom: "conv8_2" 769 | top: "conv8_2" 770 | } 771 | layer { 772 | name: "fc7_norm" 773 | type: "Normalize" 774 | bottom: "fc7" 775 | top: "fc7_norm" 776 | norm_param { 777 | across_spatial: false 778 | scale_filler { 779 | type: "constant" 780 | value: 20 781 | } 782 | channel_shared: false 783 | } 784 | } 785 | layer { 786 | name: "fc7_norm_mbox_loc" 787 | type: "Convolution" 788 | bottom: "fc7_norm" 789 | top: "fc7_norm_mbox_loc" 790 | param { 791 | lr_mult: 1 792 | decay_mult: 1 793 | } 794 | param { 795 | lr_mult: 2 796 | decay_mult: 0 797 | } 798 | convolution_param { 799 | num_output: 40 800 | pad: 1 801 | kernel_size: 3 802 | stride: 1 803 | weight_filler { 804 | type: "xavier" 805 | } 806 | bias_filler { 807 | type: "constant" 808 | value: 0 809 | } 810 | } 811 | } 812 | layer { 813 | name: "fc7_norm_mbox_loc_perm" 814 | type: "Permute" 815 | bottom: "fc7_norm_mbox_loc" 816 | top: "fc7_norm_mbox_loc_perm" 817 | permute_param { 818 | order: 0 819 | order: 2 820 | order: 3 821 | order: 1 822 | } 823 | } 824 | layer { 825 | name: "fc7_norm_mbox_loc_flat" 826 | type: "Flatten" 827 | bottom: "fc7_norm_mbox_loc_perm" 828 | top: "fc7_norm_mbox_loc_flat" 829 | flatten_param { 830 | axis: 1 831 | } 832 | } 833 | layer { 834 | name: "fc7_norm_mbox_conf_new" 835 | type: "Convolution" 836 | bottom: "fc7_norm" 837 | top: "fc7_norm_mbox_conf" 838 | param { 839 | lr_mult: 1 840 | decay_mult: 1 841 | } 842 | param { 843 | lr_mult: 2 844 | decay_mult: 0 845 | } 846 | convolution_param { 847 | num_output: 60 848 | pad: 1 849 | kernel_size: 3 850 | stride: 1 851 | weight_filler { 852 | type: "xavier" 853 | } 854 | bias_filler { 855 | type: "constant" 856 | value: 0 857 | } 858 | } 859 | } 860 | layer { 861 | name: "fc7_norm_mbox_conf_perm" 862 | type: "Permute" 863 | bottom: "fc7_norm_mbox_conf" 864 | top: "fc7_norm_mbox_conf_perm" 865 | permute_param { 866 | order: 0 867 | order: 2 868 | order: 3 869 | order: 1 870 | } 871 | } 872 | layer { 873 | name: 
"fc7_norm_mbox_conf_flat" 874 | type: "Flatten" 875 | bottom: "fc7_norm_mbox_conf_perm" 876 | top: "fc7_norm_mbox_conf_flat" 877 | flatten_param { 878 | axis: 1 879 | } 880 | } 881 | layer { 882 | name: "fc7_norm_mbox_priorbox" 883 | type: "PriorBox" 884 | bottom: "fc7_norm" 885 | bottom: "data" 886 | top: "fc7_norm_mbox_priorbox" 887 | prior_box_param { 888 | min_size: 12 889 | max_size: 24 890 | aspect_ratio: 2 891 | aspect_ratio: 3 892 | aspect_ratio: 4 893 | aspect_ratio: 5 894 | flip: true 895 | clip: false 896 | variance: 0.1 897 | variance: 0.1 898 | variance: 0.2 899 | variance: 0.2 900 | step: 8 901 | offset: 0.5 902 | } 903 | } 904 | layer { 905 | name: "conv6_2_mbox_loc" 906 | type: "Convolution" 907 | bottom: "conv6_2" 908 | top: "conv6_2_mbox_loc" 909 | param { 910 | lr_mult: 1 911 | decay_mult: 1 912 | } 913 | param { 914 | lr_mult: 2 915 | decay_mult: 0 916 | } 917 | convolution_param { 918 | num_output: 40 919 | pad: 1 920 | kernel_size: 3 921 | stride: 1 922 | weight_filler { 923 | type: "xavier" 924 | } 925 | bias_filler { 926 | type: "constant" 927 | value: 0 928 | } 929 | } 930 | } 931 | layer { 932 | name: "conv6_2_mbox_loc_perm" 933 | type: "Permute" 934 | bottom: "conv6_2_mbox_loc" 935 | top: "conv6_2_mbox_loc_perm" 936 | permute_param { 937 | order: 0 938 | order: 2 939 | order: 3 940 | order: 1 941 | } 942 | } 943 | layer { 944 | name: "conv6_2_mbox_loc_flat" 945 | type: "Flatten" 946 | bottom: "conv6_2_mbox_loc_perm" 947 | top: "conv6_2_mbox_loc_flat" 948 | flatten_param { 949 | axis: 1 950 | } 951 | } 952 | layer { 953 | name: "conv6_2_mbox_conf" 954 | type: "Convolution" 955 | bottom: "conv6_2" 956 | top: "conv6_2_mbox_conf" 957 | param { 958 | lr_mult: 1 959 | decay_mult: 1 960 | } 961 | param { 962 | lr_mult: 2 963 | decay_mult: 0 964 | } 965 | convolution_param { 966 | num_output: 60 967 | pad: 1 968 | kernel_size: 3 969 | stride: 1 970 | weight_filler { 971 | type: "xavier" 972 | } 973 | bias_filler { 974 | type: "constant" 975 | 
value: 0 976 | } 977 | } 978 | } 979 | layer { 980 | name: "conv6_2_mbox_conf_perm" 981 | type: "Permute" 982 | bottom: "conv6_2_mbox_conf" 983 | top: "conv6_2_mbox_conf_perm" 984 | permute_param { 985 | order: 0 986 | order: 2 987 | order: 3 988 | order: 1 989 | } 990 | } 991 | layer { 992 | name: "conv6_2_mbox_conf_flat" 993 | type: "Flatten" 994 | bottom: "conv6_2_mbox_conf_perm" 995 | top: "conv6_2_mbox_conf_flat" 996 | flatten_param { 997 | axis: 1 998 | } 999 | } 1000 | layer { 1001 | name: "conv6_2_mbox_priorbox" 1002 | type: "PriorBox" 1003 | bottom: "conv6_2" 1004 | bottom: "data" 1005 | top: "conv6_2_mbox_priorbox" 1006 | prior_box_param { 1007 | min_size: 24 1008 | max_size: 36 1009 | aspect_ratio: 2 1010 | aspect_ratio: 3 1011 | aspect_ratio: 4 1012 | aspect_ratio: 5 1013 | flip: true 1014 | clip: false 1015 | variance: 0.1 1016 | variance: 0.1 1017 | variance: 0.2 1018 | variance: 0.2 1019 | step: 8 1020 | offset: 0.5 1021 | } 1022 | } 1023 | layer { 1024 | name: "conv7_2_mbox_loc" 1025 | type: "Convolution" 1026 | bottom: "conv7_2" 1027 | top: "conv7_2_mbox_loc" 1028 | param { 1029 | lr_mult: 1 1030 | decay_mult: 1 1031 | } 1032 | param { 1033 | lr_mult: 2 1034 | decay_mult: 0 1035 | } 1036 | convolution_param { 1037 | num_output: 40 1038 | pad: 1 1039 | kernel_size: 3 1040 | stride: 1 1041 | weight_filler { 1042 | type: "xavier" 1043 | } 1044 | bias_filler { 1045 | type: "constant" 1046 | value: 0 1047 | } 1048 | } 1049 | } 1050 | layer { 1051 | name: "conv7_2_mbox_loc_perm" 1052 | type: "Permute" 1053 | bottom: "conv7_2_mbox_loc" 1054 | top: "conv7_2_mbox_loc_perm" 1055 | permute_param { 1056 | order: 0 1057 | order: 2 1058 | order: 3 1059 | order: 1 1060 | } 1061 | } 1062 | layer { 1063 | name: "conv7_2_mbox_loc_flat" 1064 | type: "Flatten" 1065 | bottom: "conv7_2_mbox_loc_perm" 1066 | top: "conv7_2_mbox_loc_flat" 1067 | flatten_param { 1068 | axis: 1 1069 | } 1070 | } 1071 | layer { 1072 | name: "conv7_2_mbox_conf" 1073 | type: "Convolution" 1074 
| bottom: "conv7_2" 1075 | top: "conv7_2_mbox_conf" 1076 | param { 1077 | lr_mult: 1 1078 | decay_mult: 1 1079 | } 1080 | param { 1081 | lr_mult: 2 1082 | decay_mult: 0 1083 | } 1084 | convolution_param { 1085 | num_output: 60 1086 | pad: 1 1087 | kernel_size: 3 1088 | stride: 1 1089 | weight_filler { 1090 | type: "xavier" 1091 | } 1092 | bias_filler { 1093 | type: "constant" 1094 | value: 0 1095 | } 1096 | } 1097 | } 1098 | layer { 1099 | name: "conv7_2_mbox_conf_perm" 1100 | type: "Permute" 1101 | bottom: "conv7_2_mbox_conf" 1102 | top: "conv7_2_mbox_conf_perm" 1103 | permute_param { 1104 | order: 0 1105 | order: 2 1106 | order: 3 1107 | order: 1 1108 | } 1109 | } 1110 | layer { 1111 | name: "conv7_2_mbox_conf_flat" 1112 | type: "Flatten" 1113 | bottom: "conv7_2_mbox_conf_perm" 1114 | top: "conv7_2_mbox_conf_flat" 1115 | flatten_param { 1116 | axis: 1 1117 | } 1118 | } 1119 | layer { 1120 | name: "conv7_2_mbox_priorbox" 1121 | type: "PriorBox" 1122 | bottom: "conv7_2" 1123 | bottom: "data" 1124 | top: "conv7_2_mbox_priorbox" 1125 | prior_box_param { 1126 | min_size: 36 1127 | max_size: 48 1128 | aspect_ratio: 2 1129 | aspect_ratio: 3 1130 | aspect_ratio: 4 1131 | aspect_ratio: 5 1132 | flip: true 1133 | clip: false 1134 | variance: 0.1 1135 | variance: 0.1 1136 | variance: 0.2 1137 | variance: 0.2 1138 | step: 16 1139 | offset: 0.5 1140 | } 1141 | } 1142 | layer { 1143 | name: "conv8_2_mbox_loc" 1144 | type: "Convolution" 1145 | bottom: "conv8_2" 1146 | top: "conv8_2_mbox_loc" 1147 | param { 1148 | lr_mult: 1 1149 | decay_mult: 1 1150 | } 1151 | param { 1152 | lr_mult: 2 1153 | decay_mult: 0 1154 | } 1155 | convolution_param { 1156 | num_output: 24 1157 | pad: 1 1158 | kernel_size: 3 1159 | stride: 1 1160 | weight_filler { 1161 | type: "xavier" 1162 | } 1163 | bias_filler { 1164 | type: "constant" 1165 | value: 0 1166 | } 1167 | } 1168 | } 1169 | layer { 1170 | name: "conv8_2_mbox_loc_perm" 1171 | type: "Permute" 1172 | bottom: "conv8_2_mbox_loc" 1173 | top: 
"conv8_2_mbox_loc_perm" 1174 | permute_param { 1175 | order: 0 1176 | order: 2 1177 | order: 3 1178 | order: 1 1179 | } 1180 | } 1181 | layer { 1182 | name: "conv8_2_mbox_loc_flat" 1183 | type: "Flatten" 1184 | bottom: "conv8_2_mbox_loc_perm" 1185 | top: "conv8_2_mbox_loc_flat" 1186 | flatten_param { 1187 | axis: 1 1188 | } 1189 | } 1190 | layer { 1191 | name: "conv8_2_mbox_conf" 1192 | type: "Convolution" 1193 | bottom: "conv8_2" 1194 | top: "conv8_2_mbox_conf" 1195 | param { 1196 | lr_mult: 1 1197 | decay_mult: 1 1198 | } 1199 | param { 1200 | lr_mult: 2 1201 | decay_mult: 0 1202 | } 1203 | convolution_param { 1204 | num_output: 36 1205 | pad: 1 1206 | kernel_size: 3 1207 | stride: 1 1208 | weight_filler { 1209 | type: "xavier" 1210 | } 1211 | bias_filler { 1212 | type: "constant" 1213 | value: 0 1214 | } 1215 | } 1216 | } 1217 | layer { 1218 | name: "conv8_2_mbox_conf_perm" 1219 | type: "Permute" 1220 | bottom: "conv8_2_mbox_conf" 1221 | top: "conv8_2_mbox_conf_perm" 1222 | permute_param { 1223 | order: 0 1224 | order: 2 1225 | order: 3 1226 | order: 1 1227 | } 1228 | } 1229 | layer { 1230 | name: "conv8_2_mbox_conf_flat" 1231 | type: "Flatten" 1232 | bottom: "conv8_2_mbox_conf_perm" 1233 | top: "conv8_2_mbox_conf_flat" 1234 | flatten_param { 1235 | axis: 1 1236 | } 1237 | } 1238 | layer { 1239 | name: "conv8_2_mbox_priorbox" 1240 | type: "PriorBox" 1241 | bottom: "conv8_2" 1242 | bottom: "data" 1243 | top: "conv8_2_mbox_priorbox" 1244 | prior_box_param { 1245 | min_size: 48 1246 | max_size: 60 1247 | aspect_ratio: 2 1248 | aspect_ratio: 3 1249 | flip: true 1250 | clip: false 1251 | variance: 0.1 1252 | variance: 0.1 1253 | variance: 0.2 1254 | variance: 0.2 1255 | step: 30 1256 | offset: 0.5 1257 | } 1258 | } 1259 | layer { 1260 | name: "mbox_loc" 1261 | type: "Concat" 1262 | bottom: "fc7_norm_mbox_loc_flat" 1263 | bottom: "conv6_2_mbox_loc_flat" 1264 | bottom: "conv7_2_mbox_loc_flat" 1265 | bottom: "conv8_2_mbox_loc_flat" 1266 | top: "mbox_loc" 1267 | 
concat_param { 1268 | axis: 1 1269 | } 1270 | } 1271 | layer { 1272 | name: "mbox_conf" 1273 | type: "Concat" 1274 | bottom: "fc7_norm_mbox_conf_flat" 1275 | bottom: "conv6_2_mbox_conf_flat" 1276 | bottom: "conv7_2_mbox_conf_flat" 1277 | bottom: "conv8_2_mbox_conf_flat" 1278 | top: "mbox_conf" 1279 | concat_param { 1280 | axis: 1 1281 | } 1282 | } 1283 | layer { 1284 | name: "mbox_priorbox" 1285 | type: "Concat" 1286 | bottom: "fc7_norm_mbox_priorbox" 1287 | bottom: "conv6_2_mbox_priorbox" 1288 | bottom: "conv7_2_mbox_priorbox" 1289 | bottom: "conv8_2_mbox_priorbox" 1290 | top: "mbox_priorbox" 1291 | concat_param { 1292 | axis: 2 1293 | } 1294 | } 1295 | layer { 1296 | name: "mbox_conf_reshape" 1297 | type: "Reshape" 1298 | bottom: "mbox_conf" 1299 | top: "mbox_conf_reshape" 1300 | reshape_param { 1301 | shape { 1302 | dim: 0 1303 | dim: -1 1304 | dim: 6 1305 | } 1306 | } 1307 | } 1308 | layer { 1309 | name: "mbox_conf_softmax" 1310 | type: "Softmax" 1311 | bottom: "mbox_conf_reshape" 1312 | top: "mbox_conf_softmax" 1313 | softmax_param { 1314 | axis: 2 1315 | } 1316 | } 1317 | layer { 1318 | name: "mbox_conf_flatten" 1319 | type: "Flatten" 1320 | bottom: "mbox_conf_softmax" 1321 | top: "mbox_conf_flatten" 1322 | flatten_param { 1323 | axis: 1 1324 | } 1325 | } 1326 | layer { 1327 | name: "detection_out" 1328 | type: "DetectionOutput" 1329 | bottom: "mbox_loc" 1330 | bottom: "mbox_conf_flatten" 1331 | bottom: "mbox_priorbox" 1332 | top: "detection_out" 1333 | include { 1334 | phase: TEST 1335 | } 1336 | detection_output_param { 1337 | num_classes: 6 1338 | share_location: true 1339 | background_label_id: 0 1340 | nms_param { 1341 | nms_threshold: 0.2 1342 | top_k: 800 1343 | } 1344 | code_type: CENTER_SIZE 1345 | keep_top_k: 200 1346 | confidence_threshold: 0.01 1347 | } 1348 | } 1349 | layer { 1350 | name: "detection_eval" 1351 | type: "DetectionEvaluate" 1352 | bottom: "detection_out" 1353 | bottom: "label" 1354 | top: "detection_eval" 1355 | include { 1356 | 
phase: TEST 1357 | } 1358 | detection_evaluate_param { 1359 | num_classes: 6 1360 | background_label_id: 0 1361 | overlap_threshold: 0.5 1362 | evaluate_difficult_gt: false 1363 | name_size_file: "/VOC0712/test_name_size.txt" 1364 | } 1365 | } 1366 | -------------------------------------------------------------------------------- /center_loss_train.prototxt: -------------------------------------------------------------------------------- 1 | #center_loss_ssd 2 | layer { 3 | name: "data" 4 | type: "AnnotatedData" 5 | top: "data" 6 | top: "label" 7 | include { 8 | phase: TRAIN 9 | } 10 | transform_param { 11 | mirror: true 12 | mean_value: 28 13 | mean_value: 28 14 | mean_value: 28 15 | resize_param { 16 | prob: 1 17 | resize_mode: WARP 18 | height: 300 19 | width: 300 20 | interp_mode: LINEAR 21 | interp_mode: AREA 22 | interp_mode: NEAREST 23 | interp_mode: CUBIC 24 | interp_mode: LANCZOS4 25 | } 26 | emit_constraint { 27 | emit_type: CENTER 28 | } 29 | distort_param { 30 | brightness_prob: 0.8 31 | brightness_delta: 32 32 | contrast_prob: 0.5 33 | contrast_lower: 0.5 34 | contrast_upper: 1.5 35 | hue_prob: 0.5 36 | hue_delta: 18 37 | saturation_prob: 0.5 38 | saturation_lower: 0.5 39 | saturation_upper: 1.5 40 | random_order_prob: 0 41 | } 42 | expand_param { 43 | prob: 0.5 44 | max_expand_ratio: 2 45 | } 46 | } 47 | data_param { 48 | source: "/VOC0712/lmdb/VOC0712_trainval_lmdb/" 49 | batch_size: 16 50 | backend: LMDB 51 | } 52 | annotated_data_param { 53 | batch_sampler { 54 | max_sample: 1 55 | max_trials: 1 56 | } 57 | batch_sampler { 58 | sampler { 59 | min_scale: 0.3 60 | max_scale: 1 61 | min_aspect_ratio: 0.5 62 | max_aspect_ratio: 2 63 | } 64 | sample_constraint { 65 | min_jaccard_overlap: 0.1 66 | } 67 | max_sample: 1 68 | max_trials: 50 69 | } 70 | batch_sampler { 71 | sampler { 72 | min_scale: 0.3 73 | max_scale: 1 74 | min_aspect_ratio: 0.5 75 | max_aspect_ratio: 2 76 | } 77 | sample_constraint { 78 | min_jaccard_overlap: 0.3 79 | } 80 | max_sample: 1 
81 | max_trials: 50 82 | } 83 | batch_sampler { 84 | sampler { 85 | min_scale: 0.3 86 | max_scale: 1 87 | min_aspect_ratio: 0.5 88 | max_aspect_ratio: 2 89 | } 90 | sample_constraint { 91 | min_jaccard_overlap: 0.5 92 | } 93 | max_sample: 1 94 | max_trials: 50 95 | } 96 | batch_sampler { 97 | sampler { 98 | min_scale: 0.3 99 | max_scale: 1 100 | min_aspect_ratio: 0.5 101 | max_aspect_ratio: 2 102 | } 103 | sample_constraint { 104 | min_jaccard_overlap: 0.7 105 | } 106 | max_sample: 1 107 | max_trials: 50 108 | } 109 | batch_sampler { 110 | sampler { 111 | min_scale: 0.3 112 | max_scale: 1 113 | min_aspect_ratio: 0.5 114 | max_aspect_ratio: 2 115 | } 116 | sample_constraint { 117 | min_jaccard_overlap: 0.9 118 | } 119 | max_sample: 1 120 | max_trials: 50 121 | } 122 | batch_sampler { 123 | sampler { 124 | min_scale: 0.3 125 | max_scale: 1 126 | min_aspect_ratio: 0.5 127 | max_aspect_ratio: 2 128 | } 129 | sample_constraint { 130 | max_jaccard_overlap: 1 131 | } 132 | max_sample: 1 133 | max_trials: 50 134 | } 135 | label_map_file: "/VOC0712/labelmap_voc.prototxt" 136 | } 137 | } 138 | layer { 139 | name: "conv1_1" 140 | type: "Convolution" 141 | bottom: "data" 142 | top: "conv1_1" 143 | param { 144 | lr_mult: 1 145 | decay_mult: 1 146 | } 147 | param { 148 | lr_mult: 2 149 | decay_mult: 0 150 | } 151 | convolution_param { 152 | num_output: 64 153 | pad: 1 154 | kernel_size: 3 155 | weight_filler { 156 | type: "xavier" 157 | } 158 | bias_filler { 159 | type: "constant" 160 | value: 0 161 | } 162 | } 163 | } 164 | layer { 165 | name: "relu1_1" 166 | type: "ReLU" 167 | bottom: "conv1_1" 168 | top: "conv1_1" 169 | } 170 | layer { 171 | name: "conv1_2" 172 | type: "Convolution" 173 | bottom: "conv1_1" 174 | top: "conv1_2" 175 | param { 176 | lr_mult: 1 177 | decay_mult: 1 178 | } 179 | param { 180 | lr_mult: 2 181 | decay_mult: 0 182 | } 183 | convolution_param { 184 | num_output: 64 185 | pad: 1 186 | kernel_size: 3 187 | weight_filler { 188 | type: "xavier" 189 | } 190 
| bias_filler { 191 | type: "constant" 192 | value: 0 193 | } 194 | } 195 | } 196 | layer { 197 | name: "relu1_2" 198 | type: "ReLU" 199 | bottom: "conv1_2" 200 | top: "conv1_2" 201 | } 202 | layer { 203 | name: "pool1" 204 | type: "Pooling" 205 | bottom: "conv1_2" 206 | top: "pool1" 207 | pooling_param { 208 | pool: MAX 209 | kernel_size: 2 210 | stride: 2 211 | } 212 | } 213 | layer { 214 | name: "conv2_1" 215 | type: "Convolution" 216 | bottom: "pool1" 217 | top: "conv2_1" 218 | param { 219 | lr_mult: 1 220 | decay_mult: 1 221 | } 222 | param { 223 | lr_mult: 2 224 | decay_mult: 0 225 | } 226 | convolution_param { 227 | num_output: 128 228 | pad: 1 229 | kernel_size: 3 230 | weight_filler { 231 | type: "xavier" 232 | } 233 | bias_filler { 234 | type: "constant" 235 | value: 0 236 | } 237 | } 238 | } 239 | layer { 240 | name: "relu2_1" 241 | type: "ReLU" 242 | bottom: "conv2_1" 243 | top: "conv2_1" 244 | } 245 | layer { 246 | name: "conv2_2" 247 | type: "Convolution" 248 | bottom: "conv2_1" 249 | top: "conv2_2" 250 | param { 251 | lr_mult: 1 252 | decay_mult: 1 253 | } 254 | param { 255 | lr_mult: 2 256 | decay_mult: 0 257 | } 258 | convolution_param { 259 | num_output: 128 260 | pad: 1 261 | kernel_size: 3 262 | weight_filler { 263 | type: "xavier" 264 | } 265 | bias_filler { 266 | type: "constant" 267 | value: 0 268 | } 269 | } 270 | } 271 | layer { 272 | name: "relu2_2" 273 | type: "ReLU" 274 | bottom: "conv2_2" 275 | top: "conv2_2" 276 | } 277 | layer { 278 | name: "pool2" 279 | type: "Pooling" 280 | bottom: "conv2_2" 281 | top: "pool2" 282 | pooling_param { 283 | pool: MAX 284 | kernel_size: 2 285 | stride: 2 286 | } 287 | } 288 | layer { 289 | name: "conv3_1" 290 | type: "Convolution" 291 | bottom: "pool2" 292 | top: "conv3_1" 293 | param { 294 | lr_mult: 1 295 | decay_mult: 1 296 | } 297 | param { 298 | lr_mult: 2 299 | decay_mult: 0 300 | } 301 | convolution_param { 302 | num_output: 256 303 | pad: 1 304 | kernel_size: 3 305 | weight_filler { 306 | type: 
"xavier" 307 | } 308 | bias_filler { 309 | type: "constant" 310 | value: 0 311 | } 312 | } 313 | } 314 | layer { 315 | name: "relu3_1" 316 | type: "ReLU" 317 | bottom: "conv3_1" 318 | top: "conv3_1" 319 | } 320 | layer { 321 | name: "conv3_2" 322 | type: "Convolution" 323 | bottom: "conv3_1" 324 | top: "conv3_2" 325 | param { 326 | lr_mult: 1 327 | decay_mult: 1 328 | } 329 | param { 330 | lr_mult: 2 331 | decay_mult: 0 332 | } 333 | convolution_param { 334 | num_output: 256 335 | pad: 1 336 | kernel_size: 3 337 | weight_filler { 338 | type: "xavier" 339 | } 340 | bias_filler { 341 | type: "constant" 342 | value: 0 343 | } 344 | } 345 | } 346 | layer { 347 | name: "relu3_2" 348 | type: "ReLU" 349 | bottom: "conv3_2" 350 | top: "conv3_2" 351 | } 352 | layer { 353 | name: "conv3_3" 354 | type: "Convolution" 355 | bottom: "conv3_2" 356 | top: "conv3_3" 357 | param { 358 | lr_mult: 1 359 | decay_mult: 1 360 | } 361 | param { 362 | lr_mult: 2 363 | decay_mult: 0 364 | } 365 | convolution_param { 366 | num_output: 256 367 | pad: 1 368 | kernel_size: 3 369 | weight_filler { 370 | type: "xavier" 371 | } 372 | bias_filler { 373 | type: "constant" 374 | value: 0 375 | } 376 | } 377 | } 378 | layer { 379 | name: "relu3_3" 380 | type: "ReLU" 381 | bottom: "conv3_3" 382 | top: "conv3_3" 383 | } 384 | layer { 385 | name: "pool3" 386 | type: "Pooling" 387 | bottom: "conv3_3" 388 | top: "pool3" 389 | pooling_param { 390 | pool: MAX 391 | kernel_size: 2 392 | stride: 2 393 | } 394 | } 395 | layer { 396 | name: "conv4_1" 397 | type: "Convolution" 398 | bottom: "pool3" 399 | top: "conv4_1" 400 | param { 401 | lr_mult: 1 402 | decay_mult: 1 403 | } 404 | param { 405 | lr_mult: 2 406 | decay_mult: 0 407 | } 408 | convolution_param { 409 | num_output: 512 410 | pad: 1 411 | kernel_size: 3 412 | weight_filler { 413 | type: "xavier" 414 | } 415 | bias_filler { 416 | type: "constant" 417 | value: 0 418 | } 419 | } 420 | } 421 | layer { 422 | name: "relu4_1" 423 | type: "ReLU" 424 | bottom: 
"conv4_1" 425 | top: "conv4_1" 426 | } 427 | layer { 428 | name: "conv4_2" 429 | type: "Convolution" 430 | bottom: "conv4_1" 431 | top: "conv4_2" 432 | param { 433 | lr_mult: 1 434 | decay_mult: 1 435 | } 436 | param { 437 | lr_mult: 2 438 | decay_mult: 0 439 | } 440 | convolution_param { 441 | num_output: 512 442 | pad: 1 443 | kernel_size: 3 444 | weight_filler { 445 | type: "xavier" 446 | } 447 | bias_filler { 448 | type: "constant" 449 | value: 0 450 | } 451 | } 452 | } 453 | layer { 454 | name: "relu4_2" 455 | type: "ReLU" 456 | bottom: "conv4_2" 457 | top: "conv4_2" 458 | } 459 | layer { 460 | name: "conv4_3" 461 | type: "Convolution" 462 | bottom: "conv4_2" 463 | top: "conv4_3" 464 | param { 465 | lr_mult: 1 466 | decay_mult: 1 467 | } 468 | param { 469 | lr_mult: 2 470 | decay_mult: 0 471 | } 472 | convolution_param { 473 | num_output: 512 474 | pad: 1 475 | kernel_size: 3 476 | weight_filler { 477 | type: "xavier" 478 | } 479 | bias_filler { 480 | type: "constant" 481 | value: 0 482 | } 483 | } 484 | } 485 | layer { 486 | name: "relu4_3" 487 | type: "ReLU" 488 | bottom: "conv4_3" 489 | top: "conv4_3" 490 | } 491 | layer { 492 | name: "pool4" 493 | type: "Pooling" 494 | bottom: "conv4_3" 495 | top: "pool4" 496 | pooling_param { 497 | pool: MAX 498 | kernel_size: 3 499 | stride: 1 500 | pad: 1 501 | } 502 | } 503 | layer { 504 | name: "conv5_1" 505 | type: "Convolution" 506 | bottom: "pool4" 507 | top: "conv5_1" 508 | param { 509 | lr_mult: 1 510 | decay_mult: 1 511 | } 512 | param { 513 | lr_mult: 2 514 | decay_mult: 0 515 | } 516 | convolution_param { 517 | num_output: 512 518 | pad: 2 519 | kernel_size: 3 520 | weight_filler { 521 | type: "xavier" 522 | } 523 | bias_filler { 524 | type: "constant" 525 | value: 0 526 | } 527 | dilation: 2 528 | } 529 | } 530 | layer { 531 | name: "relu5_1" 532 | type: "ReLU" 533 | bottom: "conv5_1" 534 | top: "conv5_1" 535 | } 536 | layer { 537 | name: "conv5_2" 538 | type: "Convolution" 539 | bottom: "conv5_1" 540 | top: 
"conv5_2" 541 | param { 542 | lr_mult: 1 543 | decay_mult: 1 544 | } 545 | param { 546 | lr_mult: 2 547 | decay_mult: 0 548 | } 549 | convolution_param { 550 | num_output: 512 551 | pad: 2 552 | kernel_size: 3 553 | weight_filler { 554 | type: "xavier" 555 | } 556 | bias_filler { 557 | type: "constant" 558 | value: 0 559 | } 560 | dilation: 2 561 | } 562 | } 563 | layer { 564 | name: "relu5_2" 565 | type: "ReLU" 566 | bottom: "conv5_2" 567 | top: "conv5_2" 568 | } 569 | layer { 570 | name: "conv5_3" 571 | type: "Convolution" 572 | bottom: "conv5_2" 573 | top: "conv5_3" 574 | param { 575 | lr_mult: 1 576 | decay_mult: 1 577 | } 578 | param { 579 | lr_mult: 2 580 | decay_mult: 0 581 | } 582 | convolution_param { 583 | num_output: 512 584 | pad: 2 585 | kernel_size: 3 586 | weight_filler { 587 | type: "xavier" 588 | } 589 | bias_filler { 590 | type: "constant" 591 | value: 0 592 | } 593 | dilation: 2 594 | } 595 | } 596 | layer { 597 | name: "relu5_3" 598 | type: "ReLU" 599 | bottom: "conv5_3" 600 | top: "conv5_3" 601 | } 602 | layer { 603 | name: "pool5" 604 | type: "Pooling" 605 | bottom: "conv5_3" 606 | top: "pool5" 607 | pooling_param { 608 | pool: MAX 609 | kernel_size: 3 610 | stride: 1 611 | pad: 1 612 | } 613 | } 614 | layer { 615 | name: "fc6" 616 | type: "Convolution" 617 | bottom: "pool5" 618 | top: "fc6" 619 | param { 620 | lr_mult: 1 621 | decay_mult: 1 622 | } 623 | param { 624 | lr_mult: 2 625 | decay_mult: 0 626 | } 627 | convolution_param { 628 | num_output: 1024 629 | pad: 6 630 | kernel_size: 3 631 | weight_filler { 632 | type: "xavier" 633 | } 634 | bias_filler { 635 | type: "constant" 636 | value: 0 637 | } 638 | dilation: 6 639 | } 640 | } 641 | layer { 642 | name: "relu6" 643 | type: "ReLU" 644 | bottom: "fc6" 645 | top: "fc6" 646 | } 647 | layer { 648 | name: "fc7" 649 | type: "Convolution" 650 | bottom: "fc6" 651 | top: "fc7" 652 | param { 653 | lr_mult: 1 654 | decay_mult: 1 655 | } 656 | param { 657 | lr_mult: 2 658 | decay_mult: 0 659 | } 
660 | convolution_param { 661 | num_output: 1024 662 | kernel_size: 1 663 | weight_filler { 664 | type: "xavier" 665 | } 666 | bias_filler { 667 | type: "constant" 668 | value: 0 669 | } 670 | } 671 | } 672 | layer { 673 | name: "relu7" 674 | type: "ReLU" 675 | bottom: "fc7" 676 | top: "fc7" 677 | } 678 | layer { 679 | name: "conv6_1" 680 | type: "Convolution" 681 | bottom: "fc7" 682 | top: "conv6_1" 683 | param { 684 | lr_mult: 1 685 | decay_mult: 1 686 | } 687 | param { 688 | lr_mult: 2 689 | decay_mult: 0 690 | } 691 | convolution_param { 692 | num_output: 256 693 | pad: 0 694 | kernel_size: 1 695 | stride: 1 696 | weight_filler { 697 | type: "xavier" 698 | } 699 | bias_filler { 700 | type: "constant" 701 | value: 0 702 | } 703 | } 704 | } 705 | layer { 706 | name: "conv6_1_relu" 707 | type: "ReLU" 708 | bottom: "conv6_1" 709 | top: "conv6_1" 710 | } 711 | layer { 712 | name: "conv6_2" 713 | type: "Convolution" 714 | bottom: "conv6_1" 715 | top: "conv6_2" 716 | param { 717 | lr_mult: 1 718 | decay_mult: 1 719 | } 720 | param { 721 | lr_mult: 2 722 | decay_mult: 0 723 | } 724 | convolution_param { 725 | num_output: 512 726 | pad: 6 727 | kernel_size: 3 728 | stride: 1 729 | weight_filler { 730 | type: "xavier" 731 | } 732 | bias_filler { 733 | type: "constant" 734 | value: 0 735 | } 736 | dilation: 6 737 | } 738 | } 739 | layer { 740 | name: "conv6_2_relu" 741 | type: "ReLU" 742 | bottom: "conv6_2" 743 | top: "conv6_2" 744 | } 745 | layer { 746 | name: "conv7_1" 747 | type: "Convolution" 748 | bottom: "conv6_2" 749 | top: "conv7_1" 750 | param { 751 | lr_mult: 1 752 | decay_mult: 1 753 | } 754 | param { 755 | lr_mult: 2 756 | decay_mult: 0 757 | } 758 | convolution_param { 759 | num_output: 128 760 | pad: 0 761 | kernel_size: 1 762 | stride: 1 763 | weight_filler { 764 | type: "xavier" 765 | } 766 | bias_filler { 767 | type: "constant" 768 | value: 0 769 | } 770 | } 771 | } 772 | layer { 773 | name: "conv7_1_relu" 774 | type: "ReLU" 775 | bottom: "conv7_1" 776 | 
top: "conv7_1" 777 | } 778 | layer { 779 | name: "conv7_2" 780 | type: "Convolution" 781 | bottom: "conv7_1" 782 | top: "conv7_2" 783 | param { 784 | lr_mult: 1 785 | decay_mult: 1 786 | } 787 | param { 788 | lr_mult: 2 789 | decay_mult: 0 790 | } 791 | convolution_param { 792 | num_output: 256 793 | pad: 1 794 | kernel_size: 3 795 | stride: 2 796 | weight_filler { 797 | type: "xavier" 798 | } 799 | bias_filler { 800 | type: "constant" 801 | value: 0 802 | } 803 | } 804 | } 805 | layer { 806 | name: "conv7_2_relu" 807 | type: "ReLU" 808 | bottom: "conv7_2" 809 | top: "conv7_2" 810 | } 811 | layer { 812 | name: "conv8_1" 813 | type: "Convolution" 814 | bottom: "conv7_2" 815 | top: "conv8_1" 816 | param { 817 | lr_mult: 1 818 | decay_mult: 1 819 | } 820 | param { 821 | lr_mult: 2 822 | decay_mult: 0 823 | } 824 | convolution_param { 825 | num_output: 128 826 | pad: 0 827 | kernel_size: 1 828 | stride: 1 829 | weight_filler { 830 | type: "xavier" 831 | } 832 | bias_filler { 833 | type: "constant" 834 | value: 0 835 | } 836 | } 837 | } 838 | layer { 839 | name: "conv8_1_relu" 840 | type: "ReLU" 841 | bottom: "conv8_1" 842 | top: "conv8_1" 843 | } 844 | layer { 845 | name: "conv8_2" 846 | type: "Convolution" 847 | bottom: "conv8_1" 848 | top: "conv8_2" 849 | param { 850 | lr_mult: 1 851 | decay_mult: 1 852 | } 853 | param { 854 | lr_mult: 2 855 | decay_mult: 0 856 | } 857 | convolution_param { 858 | num_output: 256 859 | pad: 1 860 | kernel_size: 3 861 | stride: 2 862 | weight_filler { 863 | type: "xavier" 864 | } 865 | bias_filler { 866 | type: "constant" 867 | value: 0 868 | } 869 | } 870 | } 871 | layer { 872 | name: "conv8_2_relu" 873 | type: "ReLU" 874 | bottom: "conv8_2" 875 | top: "conv8_2" 876 | } 877 | layer { 878 | name: "fc7_norm" 879 | type: "Normalize" 880 | bottom: "fc7" 881 | top: "fc7_norm" 882 | norm_param { 883 | across_spatial: false 884 | scale_filler { 885 | type: "constant" 886 | value: 20 887 | } 888 | channel_shared: false 889 | } 890 | } 891 | 
layer { 892 | name: "fc7_norm_mbox_loc" 893 | type: "Convolution" 894 | bottom: "fc7_norm" 895 | top: "fc7_norm_mbox_loc" 896 | param { 897 | lr_mult: 1 898 | decay_mult: 1 899 | } 900 | param { 901 | lr_mult: 2 902 | decay_mult: 0 903 | } 904 | convolution_param { 905 | num_output: 40 906 | pad: 1 907 | kernel_size: 3 908 | stride: 1 909 | weight_filler { 910 | type: "xavier" 911 | } 912 | bias_filler { 913 | type: "constant" 914 | value: 0 915 | } 916 | } 917 | } 918 | layer { 919 | name: "fc7_norm_mbox_loc_perm" 920 | type: "Permute" 921 | bottom: "fc7_norm_mbox_loc" 922 | top: "fc7_norm_mbox_loc_perm" 923 | permute_param { 924 | order: 0 925 | order: 2 926 | order: 3 927 | order: 1 928 | } 929 | } 930 | layer { 931 | name: "fc7_norm_mbox_loc_flat" 932 | type: "Flatten" 933 | bottom: "fc7_norm_mbox_loc_perm" 934 | top: "fc7_norm_mbox_loc_flat" 935 | flatten_param { 936 | axis: 1 937 | } 938 | } 939 | layer { 940 | name: "fc7_norm_mbox_conf_new" 941 | type: "Convolution" 942 | bottom: "fc7_norm" 943 | top: "fc7_norm_mbox_conf" 944 | param { 945 | lr_mult: 1 946 | decay_mult: 1 947 | } 948 | param { 949 | lr_mult: 2 950 | decay_mult: 0 951 | } 952 | convolution_param { 953 | num_output: 60 954 | pad: 1 955 | kernel_size: 3 956 | stride: 1 957 | weight_filler { 958 | type: "xavier" 959 | } 960 | bias_filler { 961 | type: "constant" 962 | value: 0 963 | } 964 | } 965 | } 966 | layer { 967 | name: "fc7_norm_mbox_conf_perm" 968 | type: "Permute" 969 | bottom: "fc7_norm_mbox_conf" 970 | top: "fc7_norm_mbox_conf_perm" 971 | permute_param { 972 | order: 0 973 | order: 2 974 | order: 3 975 | order: 1 976 | } 977 | } 978 | layer { 979 | name: "fc7_norm_mbox_conf_flat" 980 | type: "Flatten" 981 | bottom: "fc7_norm_mbox_conf_perm" 982 | top: "fc7_norm_mbox_conf_flat" 983 | flatten_param { 984 | axis: 1 985 | } 986 | } 987 | layer { 988 | name: "fc7_norm_center_mbox_conf_new" 989 | type: "Convolution" 990 | bottom: "fc7_norm" 991 | top: "fc7_norm_center_mbox_conf" 992 | param 
{ 993 | lr_mult: 1 994 | decay_mult: 1 995 | } 996 | param { 997 | lr_mult: 2 998 | decay_mult: 0 999 | } 1000 | convolution_param { 1001 | num_output: 160 1002 | pad: 1 1003 | kernel_size: 3 1004 | stride: 1 1005 | weight_filler { 1006 | type: "xavier" 1007 | } 1008 | bias_filler { 1009 | type: "constant" 1010 | value: 0 1011 | } 1012 | } 1013 | } 1014 | layer { 1015 | name: "fc7_norm_center_mbox_conf_perm" 1016 | type: "Permute" 1017 | bottom: "fc7_norm_center_mbox_conf" 1018 | top: "fc7_norm_center_mbox_conf_perm" 1019 | permute_param { 1020 | order: 0 1021 | order: 2 1022 | order: 3 1023 | order: 1 1024 | } 1025 | } 1026 | layer { 1027 | name: "fc7_norm_mbox_center_conf_flat" 1028 | type: "Flatten" 1029 | bottom: "fc7_norm_center_mbox_conf_perm" 1030 | top: "fc7_norm_mbox_center_conf_flat" 1031 | flatten_param { 1032 | axis: 1 1033 | } 1034 | } 1035 | layer { 1036 | name: "fc7_norm_mbox_priorbox" 1037 | type: "PriorBox" 1038 | bottom: "fc7_norm" 1039 | bottom: "data" 1040 | top: "fc7_norm_mbox_priorbox" 1041 | prior_box_param { 1042 | min_size: 12 1043 | max_size: 24 1044 | aspect_ratio: 2 1045 | aspect_ratio: 3 1046 | aspect_ratio: 4 1047 | aspect_ratio: 5 1048 | flip: true 1049 | clip: false 1050 | variance: 0.1 1051 | variance: 0.1 1052 | variance: 0.2 1053 | variance: 0.2 1054 | step: 8 1055 | offset: 0.5 1056 | } 1057 | } 1058 | layer { 1059 | name: "conv6_2_mbox_loc" 1060 | type: "Convolution" 1061 | bottom: "conv6_2" 1062 | top: "conv6_2_mbox_loc" 1063 | param { 1064 | lr_mult: 1 1065 | decay_mult: 1 1066 | } 1067 | param { 1068 | lr_mult: 2 1069 | decay_mult: 0 1070 | } 1071 | convolution_param { 1072 | num_output: 40 1073 | pad: 1 1074 | kernel_size: 3 1075 | stride: 1 1076 | weight_filler { 1077 | type: "xavier" 1078 | } 1079 | bias_filler { 1080 | type: "constant" 1081 | value: 0 1082 | } 1083 | } 1084 | } 1085 | layer { 1086 | name: "conv6_2_mbox_loc_perm" 1087 | type: "Permute" 1088 | bottom: "conv6_2_mbox_loc" 1089 | top: "conv6_2_mbox_loc_perm" 
1090 | permute_param { 1091 | order: 0 1092 | order: 2 1093 | order: 3 1094 | order: 1 1095 | } 1096 | } 1097 | layer { 1098 | name: "conv6_2_mbox_loc_flat" 1099 | type: "Flatten" 1100 | bottom: "conv6_2_mbox_loc_perm" 1101 | top: "conv6_2_mbox_loc_flat" 1102 | flatten_param { 1103 | axis: 1 1104 | } 1105 | } 1106 | layer { 1107 | name: "conv6_2_mbox_conf" 1108 | type: "Convolution" 1109 | bottom: "conv6_2" 1110 | top: "conv6_2_mbox_conf" 1111 | param { 1112 | lr_mult: 1 1113 | decay_mult: 1 1114 | } 1115 | param { 1116 | lr_mult: 2 1117 | decay_mult: 0 1118 | } 1119 | convolution_param { 1120 | num_output: 60 1121 | pad: 1 1122 | kernel_size: 3 1123 | stride: 1 1124 | weight_filler { 1125 | type: "xavier" 1126 | } 1127 | bias_filler { 1128 | type: "constant" 1129 | value: 0 1130 | } 1131 | } 1132 | } 1133 | layer { 1134 | name: "conv6_2_mbox_conf_perm" 1135 | type: "Permute" 1136 | bottom: "conv6_2_mbox_conf" 1137 | top: "conv6_2_mbox_conf_perm" 1138 | permute_param { 1139 | order: 0 1140 | order: 2 1141 | order: 3 1142 | order: 1 1143 | } 1144 | } 1145 | layer { 1146 | name: "conv6_2_mbox_conf_flat" 1147 | type: "Flatten" 1148 | bottom: "conv6_2_mbox_conf_perm" 1149 | top: "conv6_2_mbox_conf_flat" 1150 | flatten_param { 1151 | axis: 1 1152 | } 1153 | } 1154 | layer { 1155 | name: "conv6_2_center_mbox_conf_new" 1156 | type: "Convolution" 1157 | bottom: "conv6_2" 1158 | top: "conv6_2_center_mbox_conf" 1159 | param { 1160 | lr_mult: 1 1161 | decay_mult: 1 1162 | } 1163 | param { 1164 | lr_mult: 2 1165 | decay_mult: 0 1166 | } 1167 | convolution_param { 1168 | num_output: 160 1169 | pad: 1 1170 | kernel_size: 3 1171 | stride: 1 1172 | weight_filler { 1173 | type: "xavier" 1174 | } 1175 | bias_filler { 1176 | type: "constant" 1177 | value: 0 1178 | } 1179 | } 1180 | } 1181 | layer { 1182 | name: "conv6_2_center_mbox_conf_perm" 1183 | type: "Permute" 1184 | bottom: "conv6_2_center_mbox_conf" 1185 | top: "conv6_2_center_mbox_conf_perm" 1186 | permute_param { 1187 | 
order: 0 1188 | order: 2 1189 | order: 3 1190 | order: 1 1191 | } 1192 | } 1193 | layer { 1194 | name: "conv6_2_center_mbox_conf_flat" 1195 | type: "Flatten" 1196 | bottom: "conv6_2_center_mbox_conf_perm" 1197 | top: "conv6_2_center_mbox_conf_flat" 1198 | flatten_param { 1199 | axis: 1 1200 | } 1201 | } 1202 | layer { 1203 | name: "conv6_2_mbox_priorbox" 1204 | type: "PriorBox" 1205 | bottom: "conv6_2" 1206 | bottom: "data" 1207 | top: "conv6_2_mbox_priorbox" 1208 | prior_box_param { 1209 | min_size: 24 1210 | max_size: 36 1211 | aspect_ratio: 2 1212 | aspect_ratio: 3 1213 | aspect_ratio: 4 1214 | aspect_ratio: 5 1215 | flip: true 1216 | clip: false 1217 | variance: 0.1 1218 | variance: 0.1 1219 | variance: 0.2 1220 | variance: 0.2 1221 | step: 8 1222 | offset: 0.5 1223 | } 1224 | } 1225 | layer { 1226 | name: "conv7_2_mbox_loc" 1227 | type: "Convolution" 1228 | bottom: "conv7_2" 1229 | top: "conv7_2_mbox_loc" 1230 | param { 1231 | lr_mult: 1 1232 | decay_mult: 1 1233 | } 1234 | param { 1235 | lr_mult: 2 1236 | decay_mult: 0 1237 | } 1238 | convolution_param { 1239 | num_output: 40 1240 | pad: 1 1241 | kernel_size: 3 1242 | stride: 1 1243 | weight_filler { 1244 | type: "xavier" 1245 | } 1246 | bias_filler { 1247 | type: "constant" 1248 | value: 0 1249 | } 1250 | } 1251 | } 1252 | layer { 1253 | name: "conv7_2_mbox_loc_perm" 1254 | type: "Permute" 1255 | bottom: "conv7_2_mbox_loc" 1256 | top: "conv7_2_mbox_loc_perm" 1257 | permute_param { 1258 | order: 0 1259 | order: 2 1260 | order: 3 1261 | order: 1 1262 | } 1263 | } 1264 | layer { 1265 | name: "conv7_2_mbox_loc_flat" 1266 | type: "Flatten" 1267 | bottom: "conv7_2_mbox_loc_perm" 1268 | top: "conv7_2_mbox_loc_flat" 1269 | flatten_param { 1270 | axis: 1 1271 | } 1272 | } 1273 | layer { 1274 | name: "conv7_2_mbox_conf" 1275 | type: "Convolution" 1276 | bottom: "conv7_2" 1277 | top: "conv7_2_mbox_conf" 1278 | param { 1279 | lr_mult: 1 1280 | decay_mult: 1 1281 | } 1282 | param { 1283 | lr_mult: 2 1284 | decay_mult: 0 
1285 | } 1286 | convolution_param { 1287 | num_output: 60 1288 | pad: 1 1289 | kernel_size: 3 1290 | stride: 1 1291 | weight_filler { 1292 | type: "xavier" 1293 | } 1294 | bias_filler { 1295 | type: "constant" 1296 | value: 0 1297 | } 1298 | } 1299 | } 1300 | layer { 1301 | name: "conv7_2_mbox_conf_perm" 1302 | type: "Permute" 1303 | bottom: "conv7_2_mbox_conf" 1304 | top: "conv7_2_mbox_conf_perm" 1305 | permute_param { 1306 | order: 0 1307 | order: 2 1308 | order: 3 1309 | order: 1 1310 | } 1311 | } 1312 | layer { 1313 | name: "conv7_2_mbox_conf_flat" 1314 | type: "Flatten" 1315 | bottom: "conv7_2_mbox_conf_perm" 1316 | top: "conv7_2_mbox_conf_flat" 1317 | flatten_param { 1318 | axis: 1 1319 | } 1320 | } 1321 | layer { 1322 | name: "conv7_2_center_mbox_conf_new" 1323 | type: "Convolution" 1324 | bottom: "conv7_2" 1325 | top: "conv7_2_center_mbox_conf" 1326 | param { 1327 | lr_mult: 1 1328 | decay_mult: 1 1329 | } 1330 | param { 1331 | lr_mult: 2 1332 | decay_mult: 0 1333 | } 1334 | convolution_param { 1335 | num_output: 160 1336 | pad: 1 1337 | kernel_size: 3 1338 | stride: 1 1339 | weight_filler { 1340 | type: "xavier" 1341 | } 1342 | bias_filler { 1343 | type: "constant" 1344 | value: 0 1345 | } 1346 | } 1347 | } 1348 | layer { 1349 | name: "conv7_2_center_mbox_conf_perm" 1350 | type: "Permute" 1351 | bottom: "conv7_2_center_mbox_conf" 1352 | top: "conv7_2_center_mbox_conf_perm" 1353 | permute_param { 1354 | order: 0 1355 | order: 2 1356 | order: 3 1357 | order: 1 1358 | } 1359 | } 1360 | layer { 1361 | name: "conv7_2_center_mbox_conf_flat" 1362 | type: "Flatten" 1363 | bottom: "conv7_2_center_mbox_conf_perm" 1364 | top: "conv7_2_center_mbox_conf_flat" 1365 | flatten_param { 1366 | axis: 1 1367 | } 1368 | } 1369 | layer { 1370 | name: "conv7_2_mbox_priorbox" 1371 | type: "PriorBox" 1372 | bottom: "conv7_2" 1373 | bottom: "data" 1374 | top: "conv7_2_mbox_priorbox" 1375 | prior_box_param { 1376 | min_size: 36 1377 | max_size: 48 1378 | aspect_ratio: 2 1379 | 
aspect_ratio: 3 1380 | aspect_ratio: 4 1381 | aspect_ratio: 5 1382 | flip: true 1383 | clip: false 1384 | variance: 0.1 1385 | variance: 0.1 1386 | variance: 0.2 1387 | variance: 0.2 1388 | step: 16 1389 | offset: 0.5 1390 | } 1391 | } 1392 | layer { 1393 | name: "conv8_2_mbox_loc" 1394 | type: "Convolution" 1395 | bottom: "conv8_2" 1396 | top: "conv8_2_mbox_loc" 1397 | param { 1398 | lr_mult: 1 1399 | decay_mult: 1 1400 | } 1401 | param { 1402 | lr_mult: 2 1403 | decay_mult: 0 1404 | } 1405 | convolution_param { 1406 | num_output: 24 1407 | pad: 1 1408 | kernel_size: 3 1409 | stride: 1 1410 | weight_filler { 1411 | type: "xavier" 1412 | } 1413 | bias_filler { 1414 | type: "constant" 1415 | value: 0 1416 | } 1417 | } 1418 | } 1419 | layer { 1420 | name: "conv8_2_mbox_loc_perm" 1421 | type: "Permute" 1422 | bottom: "conv8_2_mbox_loc" 1423 | top: "conv8_2_mbox_loc_perm" 1424 | permute_param { 1425 | order: 0 1426 | order: 2 1427 | order: 3 1428 | order: 1 1429 | } 1430 | } 1431 | layer { 1432 | name: "conv8_2_mbox_loc_flat" 1433 | type: "Flatten" 1434 | bottom: "conv8_2_mbox_loc_perm" 1435 | top: "conv8_2_mbox_loc_flat" 1436 | flatten_param { 1437 | axis: 1 1438 | } 1439 | } 1440 | layer { 1441 | name: "conv8_2_mbox_conf" 1442 | type: "Convolution" 1443 | bottom: "conv8_2" 1444 | top: "conv8_2_mbox_conf" 1445 | param { 1446 | lr_mult: 1 1447 | decay_mult: 1 1448 | } 1449 | param { 1450 | lr_mult: 2 1451 | decay_mult: 0 1452 | } 1453 | convolution_param { 1454 | num_output: 36 1455 | pad: 1 1456 | kernel_size: 3 1457 | stride: 1 1458 | weight_filler { 1459 | type: "xavier" 1460 | } 1461 | bias_filler { 1462 | type: "constant" 1463 | value: 0 1464 | } 1465 | } 1466 | } 1467 | layer { 1468 | name: "conv8_2_mbox_conf_perm" 1469 | type: "Permute" 1470 | bottom: "conv8_2_mbox_conf" 1471 | top: "conv8_2_mbox_conf_perm" 1472 | permute_param { 1473 | order: 0 1474 | order: 2 1475 | order: 3 1476 | order: 1 1477 | } 1478 | } 1479 | layer { 1480 | name: "conv8_2_mbox_conf_flat" 
1481 | type: "Flatten" 1482 | bottom: "conv8_2_mbox_conf_perm" 1483 | top: "conv8_2_mbox_conf_flat" 1484 | flatten_param { 1485 | axis: 1 1486 | } 1487 | } 1488 | layer { 1489 | name: "conv8_2_center_mbox_conf_new" 1490 | type: "Convolution" 1491 | bottom: "conv8_2" 1492 | top: "conv8_2_center_mbox_conf" 1493 | param { 1494 | lr_mult: 1 1495 | decay_mult: 1 1496 | } 1497 | param { 1498 | lr_mult: 2 1499 | decay_mult: 0 1500 | } 1501 | convolution_param { 1502 | num_output: 96 1503 | pad: 1 1504 | kernel_size: 3 1505 | stride: 1 1506 | weight_filler { 1507 | type: "xavier" 1508 | } 1509 | bias_filler { 1510 | type: "constant" 1511 | value: 0 1512 | } 1513 | } 1514 | } 1515 | layer { 1516 | name: "conv8_2_center_mbox_conf_perm" 1517 | type: "Permute" 1518 | bottom: "conv8_2_center_mbox_conf" 1519 | top: "conv8_2_center_mbox_conf_perm" 1520 | permute_param { 1521 | order: 0 1522 | order: 2 1523 | order: 3 1524 | order: 1 1525 | } 1526 | } 1527 | layer { 1528 | name: "conv8_2_center_mbox_conf_flat" 1529 | type: "Flatten" 1530 | bottom: "conv8_2_center_mbox_conf_perm" 1531 | top: "conv8_2_center_mbox_conf_flat" 1532 | flatten_param { 1533 | axis: 1 1534 | } 1535 | } 1536 | layer { 1537 | name: "conv8_2_mbox_priorbox" 1538 | type: "PriorBox" 1539 | bottom: "conv8_2" 1540 | bottom: "data" 1541 | top: "conv8_2_mbox_priorbox" 1542 | prior_box_param { 1543 | min_size: 48 1544 | max_size: 60 1545 | aspect_ratio: 2 1546 | aspect_ratio: 3 1547 | flip: true 1548 | clip: false 1549 | variance: 0.1 1550 | variance: 0.1 1551 | variance: 0.2 1552 | variance: 0.2 1553 | step: 30 1554 | offset: 0.5 1555 | } 1556 | } 1557 | layer { 1558 | name: "mbox_loc" 1559 | type: "Concat" 1560 | bottom: "fc7_norm_mbox_loc_flat" 1561 | bottom: "conv6_2_mbox_loc_flat" 1562 | bottom: "conv7_2_mbox_loc_flat" 1563 | bottom: "conv8_2_mbox_loc_flat" 1564 | top: "mbox_loc" 1565 | concat_param { 1566 | axis: 1 1567 | } 1568 | } 1569 | layer { 1570 | name: "mbox_conf" 1571 | type: "Concat" 1572 | bottom: 
"fc7_norm_mbox_conf_flat" 1573 | bottom: "conv6_2_mbox_conf_flat" 1574 | bottom: "conv7_2_mbox_conf_flat" 1575 | bottom: "conv8_2_mbox_conf_flat" 1576 | top: "mbox_conf" 1577 | concat_param { 1578 | axis: 1 1579 | } 1580 | } 1581 | layer { 1582 | name: "mbox_priorbox" 1583 | type: "Concat" 1584 | bottom: "fc7_norm_mbox_priorbox" 1585 | bottom: "conv6_2_mbox_priorbox" 1586 | bottom: "conv7_2_mbox_priorbox" 1587 | bottom: "conv8_2_mbox_priorbox" 1588 | top: "mbox_priorbox" 1589 | concat_param { 1590 | axis: 2 1591 | } 1592 | } 1593 | layer { 1594 | name: "mbox_center_conf" 1595 | type: "Concat" 1596 | bottom: "fc7_norm_mbox_center_conf_flat" 1597 | bottom: "conv6_2_center_mbox_conf_flat" 1598 | bottom: "conv7_2_center_mbox_conf_flat" 1599 | bottom: "conv8_2_center_mbox_conf_flat" 1600 | top: "mbox_center_conf" 1601 | concat_param { 1602 | axis: 1 1603 | } 1604 | } 1605 | layer { 1606 | name: "mbox_loss" 1607 | type: "MultiBoxCenterLoss" 1608 | bottom: "mbox_loc" 1609 | bottom: "mbox_conf" 1610 | bottom: "mbox_priorbox" 1611 | bottom: "label" 1612 | bottom: "mbox_center_conf" 1613 | top: "mbox_loss" 1614 | include { 1615 | phase: TRAIN 1616 | } 1617 | propagate_down: true 1618 | propagate_down: true 1619 | propagate_down: false 1620 | propagate_down: false 1621 | propagate_down: true 1622 | loss_param { 1623 | normalization: VALID 1624 | } 1625 | multibox_loss_param { 1626 | loc_loss_type: SMOOTH_L1 1627 | conf_loss_type: SOFTMAX 1628 | loc_weight: 1 1629 | num_classes: 6 1630 | share_location: true 1631 | match_type: PER_PREDICTION 1632 | overlap_threshold: 0.45 1633 | use_prior_for_matching: true 1634 | background_label_id: 0 1635 | use_difficult_gt: true 1636 | neg_pos_ratio: 3 1637 | neg_overlap: 0.1 1638 | code_type: CENTER_SIZE 1639 | ignore_cross_boundary_bbox: false 1640 | mining_type: MAX_NEGATIVE 1641 | center_loss_weight: 0.1 1642 | } 1643 | multibox_center_loss_param { 1644 | center_features: 16 1645 | } 1646 | } 1647 | 
-------------------------------------------------------------------------------- /caffe.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto2"; 2 | 3 | package caffe; 4 | 5 | // Specifies the shape (dimensions) of a Blob. 6 | message BlobShape { 7 | repeated int64 dim = 1 [packed = true]; 8 | } 9 | 10 | message BlobProto { 11 | optional BlobShape shape = 7; 12 | repeated float data = 5 [packed = true]; 13 | repeated float diff = 6 [packed = true]; 14 | repeated double double_data = 8 [packed = true]; 15 | repeated double double_diff = 9 [packed = true]; 16 | 17 | // 4D dimensions -- deprecated. Use "shape" instead. 18 | optional int32 num = 1 [default = 0]; 19 | optional int32 channels = 2 [default = 0]; 20 | optional int32 height = 3 [default = 0]; 21 | optional int32 width = 4 [default = 0]; 22 | } 23 | 24 | // The BlobProtoVector is simply a way to pass multiple BlobProto instances 25 | // around. 26 | message BlobProtoVector { 27 | repeated BlobProto blobs = 1; 28 | } 29 | 30 | message Datum { 31 | optional int32 channels = 1; 32 | optional int32 height = 2; 33 | optional int32 width = 3; 34 | // The actual image data, in bytes. 35 | optional bytes data = 4; 36 | optional int32 label = 5; 37 | // Optionally, the datum could also hold float data. 38 | repeated float float_data = 6; 39 | // If true, data contains an encoded image that needs to be decoded. 40 | optional bool encoded = 7 [default = false]; 41 | } 42 | 43 | // The label (display) name and label id. 44 | message LabelMapItem { 45 | // Both name and label are required (the schema declares them optional, so protobuf does not enforce this). 46 | optional string name = 1; 47 | optional int32 label = 2; 48 | // display_name is optional. 49 | optional string display_name = 3; 50 | } 51 | 52 | message LabelMap { 53 | repeated LabelMapItem item = 1; 54 | } 55 | 56 | // Sample a bbox in the normalized space [0, 1] with provided constraints. 57 | message Sampler { 58 | // Minimum scale of the sampled bbox. 
59 | optional float min_scale = 1 [default = 1.]; 60 | // Maximum scale of the sampled bbox. 61 | optional float max_scale = 2 [default = 1.]; 62 | 63 | // Minimum aspect ratio of the sampled bbox. 64 | optional float min_aspect_ratio = 3 [default = 1.]; 65 | // Maximum aspect ratio of the sampled bbox. 66 | optional float max_aspect_ratio = 4 [default = 1.]; 67 | } 68 | 69 | // Constraints for selecting sampled bbox. 70 | message SampleConstraint { 71 | // Minimum Jaccard overlap between sampled bbox and all bboxes in 72 | // AnnotationGroup. 73 | optional float min_jaccard_overlap = 1; 74 | // Maximum Jaccard overlap between sampled bbox and all bboxes in 75 | // AnnotationGroup. 76 | optional float max_jaccard_overlap = 2; 77 | 78 | // Minimum coverage of sampled bbox by all bboxes in AnnotationGroup. 79 | optional float min_sample_coverage = 3; 80 | // Maximum coverage of sampled bbox by all bboxes in AnnotationGroup. 81 | optional float max_sample_coverage = 4; 82 | 83 | // Minimum coverage of all bboxes in AnnotationGroup by sampled bbox. 84 | optional float min_object_coverage = 5; 85 | // Maximum coverage of all bboxes in AnnotationGroup by sampled bbox. 86 | optional float max_object_coverage = 6; 87 | } 88 | 89 | // Sample a batch of bboxes with provided constraints. 90 | message BatchSampler { 91 | // Use original image as the source for sampling. 92 | optional bool use_original_image = 1 [default = true]; 93 | 94 | // Constraints for sampling bbox. 95 | optional Sampler sampler = 2; 96 | 97 | // Constraints for determining if a sampled bbox is positive or negative. 98 | optional SampleConstraint sample_constraint = 3; 99 | 100 | // If provided, break once a certain number of samples satisfying the 101 | // sample_constraint have been found. 102 | optional uint32 max_sample = 4; 103 | 104 | // Maximum number of trials for sampling to avoid infinite loop. 105 | optional uint32 max_trials = 5 [default = 100]; 106 | } 107 | 108 | // Condition for emitting annotations. 
109 | message EmitConstraint { 110 | enum EmitType { 111 | CENTER = 0; 112 | MIN_OVERLAP = 1; 113 | } 114 | optional EmitType emit_type = 1 [default = CENTER]; 115 | // If emit_type is MIN_OVERLAP, provide the emit_overlap. 116 | optional float emit_overlap = 2; 117 | } 118 | 119 | // The normalized bounding box [0, 1] w.r.t. the input image size. 120 | message NormalizedBBox { 121 | optional float xmin = 1; 122 | optional float ymin = 2; 123 | optional float xmax = 3; 124 | optional float ymax = 4; 125 | optional int32 label = 5; 126 | optional bool difficult = 6; 127 | optional float score = 7; 128 | optional float size = 8; 129 | } 130 | 131 | // Annotation for each object instance. 132 | message Annotation { 133 | optional int32 instance_id = 1 [default = 0]; 134 | optional NormalizedBBox bbox = 2; 135 | } 136 | 137 | // Group of annotations for a particular label. 138 | message AnnotationGroup { 139 | optional int32 group_label = 1; 140 | repeated Annotation annotation = 2; 141 | } 142 | 143 | // An extension of Datum which contains "rich" annotations. 144 | message AnnotatedDatum { 145 | enum AnnotationType { 146 | BBOX = 0; 147 | } 148 | optional Datum datum = 1; 149 | // If there are "rich" annotations, specify the type of annotation. 150 | // Currently it only supports bounding box. 151 | // If there are no "rich" annotations, use label in datum instead. 152 | optional AnnotationType type = 2; 153 | // Each group contains annotation for a particular class. 154 | repeated AnnotationGroup annotation_group = 3; 155 | } 156 | 157 | message FillerParameter { 158 | // The filler type. 
159 | optional string type = 1 [default = 'constant']; 160 | optional float value = 2 [default = 0]; // the value in constant filler 161 | optional float min = 3 [default = 0]; // the min value in uniform filler 162 | optional float max = 4 [default = 1]; // the max value in uniform filler 163 | optional float mean = 5 [default = 0]; // the mean value in Gaussian filler 164 | optional float std = 6 [default = 1]; // the std value in Gaussian filler 165 | // The expected number of non-zero output weights for a given input in 166 | // Gaussian filler -- the default -1 means don't perform sparsification. 167 | optional int32 sparse = 7 [default = -1]; 168 | // Normalize the filler variance by fan_in, fan_out, or their average. 169 | // Applies to 'xavier' and 'msra' fillers. 170 | enum VarianceNorm { 171 | FAN_IN = 0; 172 | FAN_OUT = 1; 173 | AVERAGE = 2; 174 | } 175 | optional VarianceNorm variance_norm = 8 [default = FAN_IN]; 176 | } 177 | 178 | message NetParameter { 179 | optional string name = 1; // consider giving the network a name 180 | // DEPRECATED. See InputParameter. The input blobs to the network. 181 | repeated string input = 3; 182 | // DEPRECATED. See InputParameter. The shape of the input blobs. 183 | repeated BlobShape input_shape = 8; 184 | 185 | // 4D input dimensions -- deprecated. Use "input_shape" instead. 186 | // If specified, for each input blob there should be four 187 | // values specifying the num, channels, height and width of the input blob. 188 | // Thus, there should be a total of (4 * #input) numbers. 189 | repeated int32 input_dim = 4; 190 | 191 | // Whether the network will force every layer to carry out backward operation. 192 | // If set False, then whether to carry out backward is determined 193 | // automatically according to the net structure and learning rates. 194 | optional bool force_backward = 5 [default = false]; 195 | // The current "state" of the network, including the phase, level, and stage. 
196 | // Some layers may be included/excluded depending on this state and the states 197 | // specified in the layers' include and exclude fields. 198 | optional NetState state = 6; 199 | 200 | // Print debugging information about results while running Net::Forward, 201 | // Net::Backward, and Net::Update. 202 | optional bool debug_info = 7 [default = false]; 203 | 204 | // The layers that make up the net. Each of their configurations, including 205 | // connectivity and behavior, is specified as a LayerParameter. 206 | repeated LayerParameter layer = 100; // ID 100 so layers are printed last. 207 | 208 | // DEPRECATED: use 'layer' instead. 209 | repeated V1LayerParameter layers = 2; 210 | } 211 | 212 | // NOTE 213 | // Update the next available ID when you add a new SolverParameter field. 214 | // 215 | // SolverParameter next available ID: 45 (highest in use: show_per_class_result = 44) 216 | message SolverParameter { 217 | ////////////////////////////////////////////////////////////////////////////// 218 | // Specifying the train and test networks 219 | // 220 | // Exactly one train net must be specified using one of the following fields: 221 | // train_net_param, train_net, net_param, net 222 | // One or more test nets may be specified using any of the following fields: 223 | // test_net_param, test_net, net_param, net 224 | // If more than one test net field is specified (e.g., both net and 225 | // test_net are specified), they will be evaluated in the field order given 226 | // above: (1) test_net_param, (2) test_net, (3) net_param/net. 227 | // A test_iter must be specified for each test_net. 228 | // A test_level and/or a test_stage may also be specified for each test_net. 229 | ////////////////////////////////////////////////////////////////////////////// 230 | 231 | // Proto filename for the train net, possibly combined with one or more 232 | // test nets. 233 | optional string net = 24; 234 | // Inline train net param, possibly combined with one or more test nets.
235 | optional NetParameter net_param = 25; 236 | 237 | optional string train_net = 1; // Proto filename for the train net. 238 | repeated string test_net = 2; // Proto filenames for the test nets. 239 | optional NetParameter train_net_param = 21; // Inline train net params. 240 | repeated NetParameter test_net_param = 22; // Inline test net params. 241 | 242 | // The states for the train/test nets. Must be unspecified or 243 | // specified once per net. 244 | // 245 | // By default, all states will have solver = true; 246 | // train_state will have phase = TRAIN, 247 | // and all test_state's will have phase = TEST. 248 | // Other defaults are set according to the NetState defaults. 249 | optional NetState train_state = 26; 250 | repeated NetState test_state = 27; 251 | 252 | // Evaluation type. 253 | optional string eval_type = 41 [default = "classification"]; 254 | // ap_version: different ways of computing Average Precision. 255 | // Check https://sanchom.wordpress.com/tag/average-precision/ for details. 256 | // 11point: the 11-point interpolated average precision. Used in VOC2007. 257 | // MaxIntegral: maximally interpolated AP. Used in VOC2012/ILSVRC. 258 | // Integral: the natural integral of the precision-recall curve. 259 | optional string ap_version = 42 [default = "Integral"]; 260 | // If true, display per class result. 261 | optional bool show_per_class_result = 44 [default = false]; 262 | 263 | // The number of iterations for each test net. 264 | repeated int32 test_iter = 3; 265 | 266 | // The number of iterations between two testing phases. 267 | optional int32 test_interval = 4 [default = 0]; 268 | optional bool test_compute_loss = 19 [default = false]; 269 | // If true, run an initial test pass before the first iteration, 270 | // ensuring memory availability and printing the starting value of the loss. 
271 | optional bool test_initialization = 32 [default = true]; 272 | optional float base_lr = 5; // The base learning rate 273 | // the number of iterations between displaying info. If display = 0, no info 274 | // will be displayed. 275 | optional int32 display = 6; 276 | // Display the loss averaged over the last average_loss iterations 277 | optional int32 average_loss = 33 [default = 1]; 278 | optional int32 max_iter = 7; // the maximum number of iterations 279 | // accumulate gradients over `iter_size` x `batch_size` instances 280 | optional int32 iter_size = 36 [default = 1]; 281 | 282 | // The learning rate decay policy. The currently implemented learning rate 283 | // policies are as follows: 284 | // - fixed: always return base_lr. 285 | // - step: return base_lr * gamma ^ (floor(iter / step)) 286 | // - exp: return base_lr * gamma ^ iter 287 | // - inv: return base_lr * (1 + gamma * iter) ^ (- power) 288 | // - multistep: similar to step but it allows non uniform steps defined by 289 | // stepvalue 290 | // - poly: the effective learning rate follows a polynomial decay, to be 291 | // zero by the max_iter. return base_lr (1 - iter/max_iter) ^ (power) 292 | // - sigmoid: the effective learning rate follows a sigmoid decay 293 | // return base_lr ( 1/(1 + exp(-gamma * (iter - stepsize)))) 294 | // - plateau: decreases lr 295 | // if the minimum loss isn't updated for 'plateau_winsize' iters 296 | // 297 | // where base_lr, max_iter, gamma, step, stepvalue and power are defined 298 | // in the solver parameter protocol buffer, and iter is the current iteration. 299 | optional string lr_policy = 8; 300 | optional float gamma = 9; // The parameter to compute the learning rate. 301 | optional float power = 10; // The parameter to compute the learning rate. 302 | optional float momentum = 11; // The momentum value. 303 | optional float weight_decay = 12; // The weight decay.
304 | // regularization types supported: L1 and L2 305 | // controlled by weight_decay 306 | optional string regularization_type = 29 [default = "L2"]; 307 | // the stepsize for learning rate policy "step" 308 | optional int32 stepsize = 13; 309 | // the stepsize for learning rate policy "multistep" 310 | repeated int32 stepvalue = 34; 311 | // the stepsize for learning rate policy "plateau" 312 | repeated int32 plateau_winsize = 43; 313 | 314 | // Set clip_gradients to >= 0 to clip parameter gradients to that L2 norm, 315 | // whenever their actual L2 norm is larger. 316 | optional float clip_gradients = 35 [default = -1]; 317 | 318 | optional int32 snapshot = 14 [default = 0]; // The snapshot interval 319 | optional string snapshot_prefix = 15; // The prefix for the snapshot. 320 | // whether to snapshot diff in the results or not. Snapshotting diff will help 321 | // debugging but the final protocol buffer size will be much larger. 322 | optional bool snapshot_diff = 16 [default = false]; 323 | enum SnapshotFormat { 324 | HDF5 = 0; 325 | BINARYPROTO = 1; 326 | } 327 | optional SnapshotFormat snapshot_format = 37 [default = BINARYPROTO]; 328 | // the mode solver will use: 0 for CPU and 1 for GPU. Use GPU in default. 329 | enum SolverMode { 330 | CPU = 0; 331 | GPU = 1; 332 | } 333 | optional SolverMode solver_mode = 17 [default = GPU]; 334 | // the device_id will that be used in GPU mode. Use device_id = 0 in default. 335 | optional int32 device_id = 18 [default = 0]; 336 | // If non-negative, the seed with which the Solver will initialize the Caffe 337 | // random number generator -- useful for reproducible results. Otherwise, 338 | // (and by default) initialize using a seed derived from the system clock. 
339 | optional int64 random_seed = 20 [default = -1]; 340 | 341 | // type of the solver 342 | optional string type = 40 [default = "SGD"]; 343 | 344 | // numerical stability for RMSProp, AdaGrad and AdaDelta and Adam 345 | optional float delta = 31 [default = 1e-8]; 346 | // parameters for the Adam solver 347 | optional float momentum2 = 39 [default = 0.999]; 348 | 349 | // RMSProp decay value 350 | // MeanSquare(t) = rms_decay*MeanSquare(t-1) + (1-rms_decay)*SquareGradient(t) 351 | optional float rms_decay = 38 [default = 0.99]; 352 | 353 | // If true, print information about the state of the net that may help with 354 | // debugging learning problems. 355 | optional bool debug_info = 23 [default = false]; 356 | 357 | // If false, don't save a snapshot after training finishes. 358 | optional bool snapshot_after_train = 28 [default = true]; 359 | 360 | // DEPRECATED: old solver enum types, use string instead 361 | enum SolverType { 362 | SGD = 0; 363 | NESTEROV = 1; 364 | ADAGRAD = 2; 365 | RMSPROP = 3; 366 | ADADELTA = 4; 367 | ADAM = 5; 368 | } 369 | // DEPRECATED: use type instead of solver_type 370 | optional SolverType solver_type = 30 [default = SGD]; 371 | } 372 | 373 | // A message that stores the solver snapshots 374 | message SolverState { 375 | optional int32 iter = 1; // The current iteration 376 | optional string learned_net = 2; // The file that stores the learned net. 
377 | repeated BlobProto history = 3; // The history for sgd solvers 378 | optional int32 current_step = 4 [default = 0]; // The current step for learning rate 379 | optional float minimum_loss = 5 [default = 1E38]; // Historical minimum loss 380 | optional int32 iter_last_event = 6 [default = 0]; // The iteration when last lr-update or min_loss-update happened 381 | } 382 | 383 | enum Phase { 384 | TRAIN = 0; 385 | TEST = 1; 386 | } 387 | 388 | message NetState { 389 | optional Phase phase = 1 [default = TEST]; 390 | optional int32 level = 2 [default = 0]; 391 | repeated string stage = 3; 392 | } 393 | 394 | message NetStateRule { 395 | // Set phase to require the NetState have a particular phase (TRAIN or TEST) 396 | // to meet this rule. 397 | optional Phase phase = 1; 398 | 399 | // Set the minimum and/or maximum levels in which the layer should be used. 400 | // Leave undefined to meet the rule regardless of level. 401 | optional int32 min_level = 2; 402 | optional int32 max_level = 3; 403 | 404 | // Customizable sets of stages to include or exclude. 405 | // The net must have ALL of the specified stages and NONE of the specified 406 | // "not_stage"s to meet the rule. 407 | // (Use multiple NetStateRules to specify conjunctions of stages.) 408 | repeated string stage = 4; 409 | repeated string not_stage = 5; 410 | } 411 | 412 | // Specifies training parameters (multipliers on global learning constants, 413 | // and the name and other settings used for weight sharing). 414 | message ParamSpec { 415 | // The names of the parameter blobs -- useful for sharing parameters among 416 | // layers, but never required otherwise. To share a parameter between two 417 | // layers, give it a (non-empty) name. 418 | optional string name = 1; 419 | 420 | // Whether to require shared weights to have the same shape, or just the same 421 | // count -- defaults to STRICT if unspecified.
422 | optional DimCheckMode share_mode = 2; 423 | enum DimCheckMode { 424 | // STRICT (default) requires that num, channels, height, width each match. 425 | STRICT = 0; 426 | // PERMISSIVE requires only the count (num*channels*height*width) to match. 427 | PERMISSIVE = 1; 428 | } 429 | 430 | // The multiplier on the global learning rate for this parameter. 431 | optional float lr_mult = 3 [default = 1.0]; 432 | 433 | // The multiplier on the global weight decay for this parameter. 434 | optional float decay_mult = 4 [default = 1.0]; 435 | } 436 | 437 | // NOTE 438 | // Update the next available ID when you add a new LayerParameter field. 439 | // 440 | // LayerParameter next available layer-specific ID: 150 (147 = focal_loss_param and 149 = center_loss_param are in use; 148 is currently unused) 441 | message LayerParameter { 442 | optional string name = 1; // the layer name 443 | optional string type = 2; // the layer type 444 | repeated string bottom = 3; // the name of each bottom blob 445 | repeated string top = 4; // the name of each top blob 446 | 447 | // The train / test phase for computation. 448 | optional Phase phase = 10; 449 | 450 | // The amount of weight to assign each top blob in the objective. 451 | // Each layer assigns a default value, usually of either 0 or 1, 452 | // to each top blob. 453 | repeated float loss_weight = 5; 454 | 455 | // Specifies training parameters (multipliers on global learning constants, 456 | // and the name and other settings used for weight sharing). 457 | repeated ParamSpec param = 6; 458 | 459 | // The blobs containing the numeric parameters of the layer. 460 | repeated BlobProto blobs = 7; 461 | 462 | // Specifies whether to backpropagate to each bottom. If unspecified, 463 | // Caffe will automatically infer whether each input needs backpropagation 464 | // to compute parameter gradients. If set to true for some inputs, 465 | // backpropagation to those inputs is forced; if set false for some inputs, 466 | // backpropagation to those inputs is skipped.
467 | // 468 | // The size must be either 0 or equal to the number of bottoms. 469 | repeated bool propagate_down = 11; 470 | 471 | // Rules controlling whether and when a layer is included in the network, 472 | // based on the current NetState. You may specify a non-zero number of rules 473 | // to include OR exclude, but not both. If no include or exclude rules are 474 | // specified, the layer is always included. If the current NetState meets 475 | // ANY (i.e., one or more) of the specified rules, the layer is 476 | // included/excluded. 477 | repeated NetStateRule include = 8; 478 | repeated NetStateRule exclude = 9; 479 | 480 | // Parameters for data pre-processing. 481 | optional TransformationParameter transform_param = 100; 482 | 483 | // Parameters shared by loss layers. 484 | optional LossParameter loss_param = 101; 485 | 486 | // Layer type-specific parameters. 487 | // 488 | // Note: certain layers may have more than one computational engine 489 | // for their implementation. These layers include an Engine type and 490 | // engine parameter for selecting the implementation. 491 | // The default for the engine is set by the ENGINE switch at compile-time. 
492 | optional AccuracyParameter accuracy_param = 102; 493 | optional AnnotatedDataParameter annotated_data_param = 200; 494 | optional ArgMaxParameter argmax_param = 103; 495 | optional BatchNormParameter batch_norm_param = 139; 496 | optional BiasParameter bias_param = 141; 497 | optional CenterLossParameter center_loss_param = 149; 498 | optional ConcatParameter concat_param = 104; 499 | optional ContrastiveLossParameter contrastive_loss_param = 105; 500 | optional ConvolutionParameter convolution_param = 106; 501 | optional CropParameter crop_param = 144; 502 | optional DataParameter data_param = 107; 503 | optional DetectionEvaluateParameter detection_evaluate_param = 205; 504 | optional DetectionOutputParameter detection_output_param = 204; 505 | optional DropoutParameter dropout_param = 108; 506 | optional DummyDataParameter dummy_data_param = 109; 507 | optional EltwiseParameter eltwise_param = 110; 508 | optional ELUParameter elu_param = 140; 509 | optional EmbedParameter embed_param = 137; 510 | optional ExpParameter exp_param = 111; 511 | optional FlattenParameter flatten_param = 135; 512 | optional HDF5DataParameter hdf5_data_param = 112; 513 | optional HDF5OutputParameter hdf5_output_param = 113; 514 | optional HingeLossParameter hinge_loss_param = 114; 515 | optional ImageDataParameter image_data_param = 115; 516 | optional InfogainLossParameter infogain_loss_param = 116; 517 | optional InnerProductParameter inner_product_param = 117; 518 | optional InputParameter input_param = 143; 519 | optional LogParameter log_param = 134; 520 | optional LRNParameter lrn_param = 118; 521 | optional MemoryDataParameter memory_data_param = 119; 522 | optional MultiBoxLossParameter multibox_loss_param = 201; 523 | optional MultiBoxCenterLossParameter multibox_center_loss_param = 211; 524 | optional MVNParameter mvn_param = 120; 525 | optional NormalizeParameter norm_param = 206; 526 | optional ParameterParameter parameter_param = 145; 527 | optional PermuteParameter 
permute_param = 202; 528 | optional PoolingParameter pooling_param = 121; 529 | optional PowerParameter power_param = 122; 530 | optional PReLUParameter prelu_param = 131; 531 | optional PriorBoxParameter prior_box_param = 203; 532 | optional PythonParameter python_param = 130; 533 | optional RecurrentParameter recurrent_param = 146; 534 | optional ReductionParameter reduction_param = 136; 535 | optional ReLUParameter relu_param = 123; 536 | optional ReshapeParameter reshape_param = 133; 537 | optional ScaleParameter scale_param = 142; 538 | optional SigmoidParameter sigmoid_param = 124; 539 | optional SoftmaxParameter softmax_param = 125; 540 | optional SPPParameter spp_param = 132; 541 | optional SliceParameter slice_param = 126; 542 | optional TanHParameter tanh_param = 127; 543 | optional ThresholdParameter threshold_param = 128; 544 | optional TileParameter tile_param = 138; 545 | optional VideoDataParameter video_data_param = 207; 546 | optional WindowDataParameter window_data_param = 129; 547 | optional FocalLossParameter focal_loss_param = 147; 548 | } 549 | 550 | // Message that stores parameters used to apply transformation 551 | // to the data layer's data 552 | message TransformationParameter { 553 | // For data pre-processing, we can do simple scaling and subtracting the 554 | // data mean, if provided. Note that the mean subtraction is always carried 555 | // out before scaling. 556 | optional float scale = 1 [default = 1]; 557 | // Specify if we want to randomly mirror data. 558 | optional bool mirror = 2 [default = false]; 559 | // Specify if we would like to randomly crop an image. 
560 | optional uint32 crop_size = 3 [default = 0]; 561 | optional uint32 crop_h = 11 [default = 0]; 562 | optional uint32 crop_w = 12 [default = 0]; 563 | 564 | // mean_file and mean_value cannot be specified at the same time 565 | optional string mean_file = 4; 566 | // if specified can be repeated once (would subtract it from all the channels) 567 | // or can be repeated the same number of times as channels 568 | // (would subtract them from the corresponding channel) 569 | repeated float mean_value = 5; 570 | // Force the decoded image to have 3 color channels. 571 | optional bool force_color = 6 [default = false]; 572 | // Force the decoded image to have 1 color channel. 573 | optional bool force_gray = 7 [default = false]; 574 | // Resize policy 575 | optional ResizeParameter resize_param = 8; 576 | // Noise policy 577 | optional NoiseParameter noise_param = 9; 578 | // Distortion policy 579 | optional DistortionParameter distort_param = 13; 580 | // Expand policy 581 | optional ExpansionParameter expand_param = 14; 582 | // Constraint for emitting the annotation after transformation. 583 | optional EmitConstraint emit_constraint = 10; 584 | } 585 | 586 | // Message that stores parameters used by data transformer for resize policy 587 | message ResizeParameter { 588 | //Probability of using this resize policy 589 | optional float prob = 1 [default = 1]; 590 | 591 | enum Resize_mode { 592 | WARP = 1; 593 | FIT_SMALL_SIZE = 2; 594 | FIT_LARGE_SIZE_AND_PAD = 3; 595 | } 596 | optional Resize_mode resize_mode = 2 [default = WARP]; 597 | optional uint32 height = 3 [default = 0]; 598 | optional uint32 width = 4 [default = 0]; 599 | // A parameter used to update bbox in FIT_SMALL_SIZE mode.
600 | optional uint32 height_scale = 8 [default = 0]; 601 | optional uint32 width_scale = 9 [default = 0]; 602 | 603 | enum Pad_mode { 604 | CONSTANT = 1; 605 | MIRRORED = 2; 606 | REPEAT_NEAREST = 3; 607 | } 608 | // Padding mode for FIT_LARGE_SIZE_AND_PAD mode and object centering 609 | optional Pad_mode pad_mode = 5 [default = CONSTANT]; 610 | // if specified can be repeated once (would fill all the channels) 611 | // or can be repeated the same number of times as channels 612 | // (would use them for the corresponding channel) 613 | repeated float pad_value = 6; 614 | 615 | enum Interp_mode { //Same as in OpenCV 616 | LINEAR = 1; 617 | AREA = 2; 618 | NEAREST = 3; 619 | CUBIC = 4; 620 | LANCZOS4 = 5; 621 | } 622 | //interpolation for resizing 623 | repeated Interp_mode interp_mode = 7; 624 | } 625 | 626 | message SaltPepperParameter { 627 | //Percentage of pixels 628 | optional float fraction = 1 [default = 0]; 629 | repeated float value = 2; 630 | } 631 | 632 | // Message that stores parameters used by data transformer for transformation 633 | // policy 634 | message NoiseParameter { 635 | //Probability of using this noise policy 636 | optional float prob = 1 [default = 0]; 637 | // Histogram equalized 638 | optional bool hist_eq = 2 [default = false]; 639 | // Color inversion 640 | optional bool inverse = 3 [default = false]; 641 | // Grayscale 642 | optional bool decolorize = 4 [default = false]; 643 | // Gaussian blur 644 | optional bool gauss_blur = 5 [default = false]; 645 | 646 | // JPEG compression quality (-1 = no compression) 647 | optional float jpeg = 6 [default = -1]; 648 | 649 | // Posterization 650 | optional bool posterize = 7 [default = false]; 651 | 652 | // Erosion 653 | optional bool erode = 8 [default = false]; 654 | 655 | // Salt-and-pepper noise 656 | optional bool saltpepper = 9 [default = false]; 657 | 658 | optional SaltPepperParameter saltpepper_param = 10; 659 | 660 | // Local histogram equalization 661 | optional bool clahe = 
11 [default = false]; 662 | 663 | // Color space conversion 664 | optional bool convert_to_hsv = 12 [default = false]; 665 | 666 | // Color space conversion 667 | optional bool convert_to_lab = 13 [default = false]; 668 | } 669 | 670 | // Message that stores parameters used by data transformer for distortion policy 671 | message DistortionParameter { 672 | // The probability of adjusting brightness. 673 | optional float brightness_prob = 1 [default = 0.0]; 674 | // Amount to add to the pixel values within [-delta, delta]. 675 | // The possible value is within [0, 255]. Recommend 32. 676 | optional float brightness_delta = 2 [default = 0.0]; 677 | 678 | // The probability of adjusting contrast. 679 | optional float contrast_prob = 3 [default = 0.0]; 680 | // Lower bound for random contrast factor. Recommend 0.5. 681 | optional float contrast_lower = 4 [default = 0.0]; 682 | // Upper bound for random contrast factor. Recommend 1.5. 683 | optional float contrast_upper = 5 [default = 0.0]; 684 | 685 | // The probability of adjusting hue. 686 | optional float hue_prob = 6 [default = 0.0]; 687 | // Amount to add to the hue channel within [-delta, delta]. 688 | // The possible value is within [0, 180]. Recommend 36. 689 | optional float hue_delta = 7 [default = 0.0]; 690 | 691 | // The probability of adjusting saturation. 692 | optional float saturation_prob = 8 [default = 0.0]; 693 | // Lower bound for the random saturation factor. Recommend 0.5. 694 | optional float saturation_lower = 9 [default = 0.0]; 695 | // Upper bound for the random saturation factor. Recommend 1.5. 696 | optional float saturation_upper = 10 [default = 0.0]; 697 | 698 | // The probability of randomly order the image channels. 
699 | optional float random_order_prob = 11 [default = 0.0]; 700 | } 701 | 702 | // Message that stores parameters used by data transformer for expansion policy 703 | message ExpansionParameter { 704 | //Probability of using this expansion policy 705 | optional float prob = 1 [default = 1]; 706 | 707 | // The ratio to expand the image. 708 | optional float max_expand_ratio = 2 [default = 1.]; 709 | } 710 | 711 | // Message that stores parameters shared by loss layers 712 | message LossParameter { 713 | // If specified, ignore instances with the given label. 714 | optional int32 ignore_label = 1; 715 | // How to normalize the loss for loss layers that aggregate across batches, 716 | // spatial dimensions, or other dimensions. Currently only implemented in 717 | // SoftmaxWithLoss and SigmoidCrossEntropyLoss layers. 718 | enum NormalizationMode { 719 | // Divide by the number of examples in the batch times spatial dimensions. 720 | // Outputs that receive the ignore label will NOT be ignored in computing 721 | // the normalization factor. 722 | FULL = 0; 723 | // Divide by the total number of output locations that do not take the 724 | // ignore_label. If ignore_label is not set, this behaves like FULL. 725 | VALID = 1; 726 | // Divide by the batch size. 727 | BATCH_SIZE = 2; 728 | // Do not normalize the loss. 729 | NONE = 3; 730 | } 731 | // For historical reasons, the default normalization for 732 | // SigmoidCrossEntropyLoss is BATCH_SIZE and *not* VALID. 733 | optional NormalizationMode normalization = 3 [default = VALID]; 734 | // Deprecated. Ignored if normalization is specified. If normalization 735 | // is not specified, then setting this to false will be equivalent to 736 | // normalization = BATCH_SIZE to be consistent with previous behavior. 737 | optional bool normalize = 2; 738 | } 739 | 740 | // Messages that store parameters used by individual layer types follow, in 741 | // alphabetical order. 
742 | 743 | message AccuracyParameter { 744 | // When computing accuracy, count as correct by comparing the true label to 745 | // the top k scoring classes. By default, only compare to the top scoring 746 | // class (i.e. argmax). 747 | optional uint32 top_k = 1 [default = 1]; 748 | 749 | // The "label" axis of the prediction blob, whose argmax corresponds to the 750 | // predicted label -- may be negative to index from the end (e.g., -1 for the 751 | // last axis). For example, if axis == 1 and the predictions are 752 | // (N x C x H x W), the label blob is expected to contain N*H*W ground truth 753 | // labels with integer values in {0, 1, ..., C-1}. 754 | optional int32 axis = 2 [default = 1]; 755 | 756 | // If specified, ignore instances with the given label. 757 | optional int32 ignore_label = 3; 758 | } 759 | 760 | message AnnotatedDataParameter { 761 | // Define the sampler. 762 | repeated BatchSampler batch_sampler = 1; 763 | // Store label name and label id in LabelMap format. 764 | optional string label_map_file = 2; 765 | // If provided, it will replace the AnnotationType stored in each 766 | // AnnotatedDatum. 767 | optional AnnotatedDatum.AnnotationType anno_type = 3; 768 | } 769 | 770 | message ArgMaxParameter { 771 | // If true produce pairs (argmax, maxval) 772 | optional bool out_max_val = 1 [default = false]; 773 | optional uint32 top_k = 2 [default = 1]; 774 | // The axis along which to maximise -- may be negative to index from the 775 | // end (e.g., -1 for the last axis). 776 | // By default ArgMaxLayer maximizes over the flattened trailing dimensions 777 | // for each index of the first / num dimension. 778 | optional int32 axis = 3; 779 | } 780 | 781 | message ConcatParameter { 782 | // The axis along which to concatenate -- may be negative to index from the 783 | // end (e.g., -1 for the last axis). Other axes must have the 784 | // same dimension for all the bottom blobs. 
785 | // By default, ConcatLayer concatenates blobs along the "channels" axis (1). 786 | optional int32 axis = 2 [default = 1]; 787 | 788 | // DEPRECATED: alias for "axis" -- does not support negative indexing. 789 | optional uint32 concat_dim = 1 [default = 1]; 790 | } 791 | 792 | message BatchNormParameter { 793 | // If false, accumulate global mean/variance values via a moving average. If 794 | // true, use those accumulated values instead of computing mean/variance 795 | // across the batch. 796 | optional bool use_global_stats = 1; 797 | // How much does the moving average decay each iteration? 798 | optional float moving_average_fraction = 2 [default = .999]; 799 | // Small value to add to the variance estimate so that we don't divide by 800 | // zero. 801 | optional float eps = 3 [default = 1e-5]; 802 | } 803 | 804 | message BiasParameter { 805 | // The first axis of bottom[0] (the first input Blob) along which to apply 806 | // bottom[1] (the second input Blob). May be negative to index from the end 807 | // (e.g., -1 for the last axis). 808 | // 809 | // For example, if bottom[0] is 4D with shape 100x3x40x60, the output 810 | // top[0] will have the same shape, and bottom[1] may have any of the 811 | // following shapes (for the given value of axis): 812 | // (axis == 0 == -4) 100; 100x3; 100x3x40; 100x3x40x60 813 | // (axis == 1 == -3) 3; 3x40; 3x40x60 814 | // (axis == 2 == -2) 40; 40x60 815 | // (axis == 3 == -1) 60 816 | // Furthermore, bottom[1] may have the empty shape (regardless of the value of 817 | // "axis") -- a scalar bias. 818 | optional int32 axis = 1 [default = 1]; 819 | 820 | // (num_axes is ignored unless just one bottom is given and the bias is 821 | // a learned parameter of the layer. Otherwise, num_axes is determined by the 822 | // number of axes by the second bottom.) 823 | // The number of axes of the input (bottom[0]) covered by the bias 824 | // parameter, or -1 to cover all axes of bottom[0] starting from `axis`. 
825 | // Set num_axes := 0, to add a zero-axis Blob: a scalar. 826 | optional int32 num_axes = 2 [default = 1]; 827 | 828 | // (filler is ignored unless just one bottom is given and the bias is 829 | // a learned parameter of the layer.) 830 | // The initialization for the learned bias parameter. 831 | // Default is the zero (0) initialization, resulting in the BiasLayer 832 | // initially performing the identity operation. 833 | optional FillerParameter filler = 3; 834 | } 835 | 836 | message ContrastiveLossParameter { 837 | // margin for dissimilar pair 838 | optional float margin = 1 [default = 1.0]; 839 | // The first implementation of this cost did not exactly match the cost of 840 | // Hadsell et al 2006 -- using (margin - d^2) instead of (margin - d)^2. 841 | // legacy_version = false (the default) uses (margin - d)^2 as proposed in the 842 | // Hadsell paper. New models should probably use this version. 843 | // legacy_version = true uses (margin - d^2). This is kept to support / 844 | // reproduce existing models and results 845 | optional bool legacy_version = 2 [default = false]; 846 | } 847 | 848 | message ConvolutionParameter { 849 | optional uint32 num_output = 1; // The number of outputs for the layer 850 | optional bool bias_term = 2 [default = true]; // whether to have bias terms 851 | 852 | // Pad, kernel size, and stride are all given as a single value for equal 853 | // dimensions in all spatial dimensions, or once per spatial dimension. 854 | repeated uint32 pad = 3; // The padding size; defaults to 0 855 | repeated uint32 kernel_size = 4; // The kernel size 856 | repeated uint32 stride = 6; // The stride; defaults to 1 857 | // Factor used to dilate the kernel, (implicitly) zero-filling the resulting 858 | // holes. (Kernel dilation is sometimes referred to by its use in the 859 | // algorithme à trous from Holschneider et al. 1987.) 
860 | repeated uint32 dilation = 18; // The dilation; defaults to 1 861 | 862 | // For 2D convolution only, the *_h and *_w versions may also be used to 863 | // specify both spatial dimensions. 864 | optional uint32 pad_h = 9 [default = 0]; // The padding height (2D only) 865 | optional uint32 pad_w = 10 [default = 0]; // The padding width (2D only) 866 | optional uint32 kernel_h = 11; // The kernel height (2D only) 867 | optional uint32 kernel_w = 12; // The kernel width (2D only) 868 | optional uint32 stride_h = 13; // The stride height (2D only) 869 | optional uint32 stride_w = 14; // The stride width (2D only) 870 | 871 | optional uint32 group = 5 [default = 1]; // The group size for group conv 872 | 873 | optional FillerParameter weight_filler = 7; // The filler for the weight 874 | optional FillerParameter bias_filler = 8; // The filler for the bias 875 | enum Engine { 876 | DEFAULT = 0; 877 | CAFFE = 1; 878 | CUDNN = 2; 879 | } 880 | optional Engine engine = 15 [default = DEFAULT]; 881 | 882 | // The axis to interpret as "channels" when performing convolution. 883 | // Preceding dimensions are treated as independent inputs; 884 | // succeeding dimensions are treated as "spatial". 885 | // With (N, C, H, W) inputs, and axis == 1 (the default), we perform 886 | // N independent 2D convolutions, sliding C-channel (or (C/g)-channels, for 887 | // groups g>1) filters across the spatial axes (H, W) of the input. 888 | // With (N, C, D, H, W) inputs, and axis == 1, we perform 889 | // N independent 3D convolutions, sliding (C/g)-channels 890 | // filters across the spatial axes (D, H, W) of the input. 891 | optional int32 axis = 16 [default = 1]; 892 | 893 | // Whether to force use of the general ND convolution, even if a specific 894 | // implementation for blobs of the appropriate number of spatial dimensions 895 | // is available. 
(Currently, there is only a 2D-specific convolution 896 | // implementation; for input blobs with num_axes != 2, this option is 897 | // ignored and the ND implementation will be used.) 898 | optional bool force_nd_im2col = 17 [default = false]; 899 | } 900 | 901 | message CropParameter { 902 | // To crop, elements of the first bottom are selected to fit the dimensions 903 | // of the second, reference bottom. The crop is configured by 904 | // - the crop `axis` to pick the dimensions for cropping 905 | // - the crop `offset` to set the shift for all/each dimension 906 | // to align the cropped bottom with the reference bottom. 907 | // All dimensions up to but excluding `axis` are preserved, while 908 | // the dimensions including and trailing `axis` are cropped. 909 | // If only one `offset` is set, then all dimensions are offset by this amount. 910 | // Otherwise, the number of offsets must equal the number of cropped axes to 911 | // shift the crop in each dimension accordingly. 912 | // Note: standard dimensions are N,C,H,W so the default is a spatial crop, 913 | // and `axis` may be negative to index from the end (e.g., -1 for the last 914 | // axis). 915 | optional int32 axis = 1 [default = 2]; 916 | repeated uint32 offset = 2; 917 | } 918 | 919 | message DataParameter { 920 | enum DB { 921 | LEVELDB = 0; 922 | LMDB = 1; 923 | } 924 | // Specify the data source. 925 | optional string source = 1; 926 | // Specify the batch size. 927 | optional uint32 batch_size = 4; 928 | // The rand_skip variable is for the data layer to skip a few data points 929 | // to avoid all asynchronous sgd clients to start at the same point. The skip 930 | // point would be set as rand_skip * rand(0,1). Note that rand_skip should not 931 | // be larger than the number of keys in the database. 932 | // DEPRECATED. Each solver accesses a different subset of the database. 
933 | optional uint32 rand_skip = 7 [default = 0]; 934 | optional DB backend = 8 [default = LEVELDB]; 935 | // DEPRECATED. See TransformationParameter. For data pre-processing, we can do 936 | // simple scaling and subtracting the data mean, if provided. Note that the 937 | // mean subtraction is always carried out before scaling. 938 | optional float scale = 2 [default = 1]; 939 | optional string mean_file = 3; 940 | // DEPRECATED. See TransformationParameter. Specify if we would like to randomly 941 | // crop an image. 942 | optional uint32 crop_size = 5 [default = 0]; 943 | // DEPRECATED. See TransformationParameter. Specify if we want to randomly mirror 944 | // data. 945 | optional bool mirror = 6 [default = false]; 946 | // Force the encoded image to have 3 color channels 947 | optional bool force_encoded_color = 9 [default = false]; 948 | // Prefetch queue (Number of batches to prefetch to host memory, increase if 949 | // data access bandwidth varies). 950 | optional uint32 prefetch = 10 [default = 4]; 951 | } 952 | 953 | // Message that stores parameters used by DetectionEvaluateLayer 954 | message DetectionEvaluateParameter { 955 | // Number of classes that are actually predicted. Required! 956 | optional uint32 num_classes = 1; 957 | // Label id for background class. Needed for sanity check so that 958 | // background class is neither in the ground truth nor the detections. 959 | optional uint32 background_label_id = 2 [default = 0]; 960 | // Threshold for deciding true/false positive. 961 | optional float overlap_threshold = 3 [default = 0.5]; 962 | // If true, also consider difficult ground truth for evaluation. 963 | optional bool evaluate_difficult_gt = 4 [default = true]; 964 | // A file which contains a list of names and sizes with same order 965 | // of the input DB. The file is in the following format: 966 | // name height width 967 | // ... 968 | // If provided, we will scale the prediction and ground truth NormalizedBBox 969 | // for evaluation.
970 | optional string name_size_file = 5; 971 | // The resize parameter used in converting NormalizedBBox to original image. 972 | optional ResizeParameter resize_param = 6; 973 | } 974 | 975 | message NonMaximumSuppressionParameter { 976 | // Threshold to be used in nms. 977 | optional float nms_threshold = 1 [default = 0.3]; 978 | // Maximum number of results to be kept. 979 | optional int32 top_k = 2; 980 | // Parameter for adaptive nms. 981 | optional float eta = 3 [default = 1.0]; 982 | } 983 | 984 | message SaveOutputParameter { 985 | // Output directory. If not empty, we will save the results. 986 | optional string output_directory = 1; 987 | // Output name prefix. 988 | optional string output_name_prefix = 2; 989 | // Output format. 990 | // VOC - PASCAL VOC output format. 991 | // COCO - MS COCO output format. 992 | optional string output_format = 3; 993 | // If you want to output results, must also provide the following two files. 994 | // Otherwise, we will ignore saving results. 995 | // label map file. 996 | optional string label_map_file = 4; 997 | // A file which contains a list of names and sizes with same order 998 | // of the input DB. The file is in the following format: 999 | // name height width 1000 | // ... 1001 | optional string name_size_file = 5; 1002 | // Number of test images. It can be less than the lines specified in 1003 | // name_size_file. For example, when we only want to evaluate on part 1004 | // of the test images. 1005 | optional uint32 num_test_image = 6; 1006 | // The resize parameter used in saving the data. 1007 | optional ResizeParameter resize_param = 7; 1008 | } 1009 | 1010 | // Message that stores parameters used by DetectionOutputLayer 1011 | message DetectionOutputParameter { 1012 | // Number of classes to be predicted. Required! 1013 | optional uint32 num_classes = 1; 1014 | // If true, bounding boxes are shared among different classes.
1015 | optional bool share_location = 2 [default = true]; 1016 | // Background label id. If there is no background class, 1017 | // set it as -1. 1018 | optional int32 background_label_id = 3 [default = 0]; 1019 | // Parameters used for non maximum suppression. 1020 | optional NonMaximumSuppressionParameter nms_param = 4; 1021 | // Parameters used for saving detection results. 1022 | optional SaveOutputParameter save_output_param = 5; 1023 | // Type of coding method for bbox. 1024 | optional PriorBoxParameter.CodeType code_type = 6 [default = CORNER]; 1025 | // If true, variance is encoded in target; otherwise we need to adjust the 1026 | // predicted offset accordingly. 1027 | optional bool variance_encoded_in_target = 8 [default = false]; 1028 | // Number of total bboxes to be kept per image after nms step. 1029 | // -1 means keeping all bboxes after nms step. 1030 | optional int32 keep_top_k = 7 [default = -1]; 1031 | // Only consider detections whose confidences are larger than a threshold. 1032 | // If not provided, consider all boxes. 1033 | optional float confidence_threshold = 9; 1034 | // If true, visualize the detection results. 1035 | optional bool visualize = 10 [default = false]; 1036 | // The threshold used to visualize the detection results. 1037 | optional float visualize_threshold = 11; 1038 | // If provided, save outputs to video file. 1039 | optional string save_file = 12; 1040 | } 1041 | 1042 | message DropoutParameter { 1043 | optional float dropout_ratio = 1 [default = 0.5]; // dropout ratio 1044 | } 1045 | 1046 | // DummyDataLayer fills any number of arbitrarily shaped blobs with random 1047 | // (or constant) data generated by "Fillers" (see "message FillerParameter"). 1048 | message DummyDataParameter { 1049 | // This layer produces N >= 1 top blobs. DummyDataParameter must specify 1 or N 1050 | // shape fields, and 0, 1 or N data_fillers. 1051 | // 1052 | // If 0 data_fillers are specified, ConstantFiller with a value of 0 is used. 
1053 | // If 1 data_filler is specified, it is applied to all top blobs. If N are 1054 | // specified, the ith is applied to the ith top blob. 1055 | repeated FillerParameter data_filler = 1; 1056 | repeated BlobShape shape = 6; 1057 | 1058 | // 4D dimensions -- deprecated. Use "shape" instead. 1059 | repeated uint32 num = 2; 1060 | repeated uint32 channels = 3; 1061 | repeated uint32 height = 4; 1062 | repeated uint32 width = 5; 1063 | } 1064 | 1065 | message EltwiseParameter { 1066 | enum EltwiseOp { 1067 | PROD = 0; 1068 | SUM = 1; 1069 | MAX = 2; 1070 | } 1071 | optional EltwiseOp operation = 1 [default = SUM]; // element-wise operation 1072 | repeated float coeff = 2; // blob-wise coefficient for SUM operation 1073 | 1074 | // Whether to use an asymptotically slower (for >2 inputs) but stabler method 1075 | // of computing the gradient for the PROD operation. (No effect for SUM op.) 1076 | optional bool stable_prod_grad = 3 [default = true]; 1077 | } 1078 | 1079 | // Message that stores parameters used by ELULayer 1080 | message ELUParameter { 1081 | // Described in: 1082 | // Clevert, D.-A., Unterthiner, T., & Hochreiter, S. (2015). Fast and Accurate 1083 | // Deep Network Learning by Exponential Linear Units (ELUs). arXiv 1084 | optional float alpha = 1 [default = 1]; 1085 | } 1086 | 1087 | // Message that stores parameters used by EmbedLayer 1088 | message EmbedParameter { 1089 | optional uint32 num_output = 1; // The number of outputs for the layer 1090 | // The input is given as integers to be interpreted as one-hot 1091 | // vector indices with dimension input_dim. Hence input_dim should be 1092 | // 1 greater than the maximum possible input value.
1093 | optional uint32 input_dim = 2; 1094 | 1095 | optional bool bias_term = 3 [default = true]; // Whether to use a bias term 1096 | optional FillerParameter weight_filler = 4; // The filler for the weight 1097 | optional FillerParameter bias_filler = 5; // The filler for the bias 1098 | 1099 | } 1100 | 1101 | // Message that stores parameters used by ExpLayer 1102 | message ExpParameter { 1103 | // ExpLayer computes outputs y = base ^ (shift + scale * x), for base > 0. 1104 | // Or if base is set to the default (-1), base is set to e, 1105 | // so y = exp(shift + scale * x). 1106 | optional float base = 1 [default = -1.0]; 1107 | optional float scale = 2 [default = 1.0]; 1108 | optional float shift = 3 [default = 0.0]; 1109 | } 1110 | 1111 | /// Message that stores parameters used by FlattenLayer 1112 | message FlattenParameter { 1113 | // The first axis to flatten: all preceding axes are retained in the output. 1114 | // May be negative to index from the end (e.g., -1 for the last axis). 1115 | optional int32 axis = 1 [default = 1]; 1116 | 1117 | // The last axis to flatten: all following axes are retained in the output. 1118 | // May be negative to index from the end (e.g., the default -1 for the last 1119 | // axis). 1120 | optional int32 end_axis = 2 [default = -1]; 1121 | } 1122 | 1123 | // Message that stores parameters used by HDF5DataLayer 1124 | message HDF5DataParameter { 1125 | // Specify the data source. 1126 | optional string source = 1; 1127 | // Specify the batch size. 1128 | optional uint32 batch_size = 2; 1129 | 1130 | // Specify whether to shuffle the data. 1131 | // If shuffle == true, the ordering of the HDF5 files is shuffled, 1132 | // and the ordering of data within any given HDF5 file is shuffled, 1133 | // but data between different files are not interleaved; all of a file's 1134 | // data are output (in a random order) before moving onto another file. 
1135 | optional bool shuffle = 3 [default = false]; 1136 | } 1137 | 1138 | message HDF5OutputParameter { 1139 | optional string file_name = 1; 1140 | } 1141 | 1142 | message HingeLossParameter { 1143 | enum Norm { 1144 | L1 = 1; 1145 | L2 = 2; 1146 | } 1147 | // Specify the Norm to use L1 or L2 1148 | optional Norm norm = 1 [default = L1]; 1149 | } 1150 | 1151 | message ImageDataParameter { 1152 | // Specify the data source. 1153 | optional string source = 1; 1154 | // Specify the batch size. 1155 | optional uint32 batch_size = 4 [default = 1]; 1156 | // The rand_skip variable is for the data layer to skip a few data points 1157 | // to avoid all asynchronous sgd clients to start at the same point. The skip 1158 | // point would be set as rand_skip * rand(0,1). Note that rand_skip should not 1159 | // be larger than the number of keys in the database. 1160 | optional uint32 rand_skip = 7 [default = 0]; 1161 | // Whether or not ImageLayer should shuffle the list of files at every epoch. 1162 | optional bool shuffle = 8 [default = false]; 1163 | // It will also resize images if new_height or new_width are not zero. 1164 | optional uint32 new_height = 9 [default = 0]; 1165 | optional uint32 new_width = 10 [default = 0]; 1166 | // Specify if the images are color or gray 1167 | optional bool is_color = 11 [default = true]; 1168 | // DEPRECATED. See TransformationParameter. For data pre-processing, we can do 1169 | // simple scaling and subtracting the data mean, if provided. Note that the 1170 | // mean subtraction is always carried out before scaling. 1171 | optional float scale = 2 [default = 1]; 1172 | optional string mean_file = 3; 1173 | // DEPRECATED. See TransformationParameter. Specify if we would like to randomly 1174 | // crop an image. 1175 | optional uint32 crop_size = 5 [default = 0]; 1176 | // DEPRECATED. See TransformationParameter. Specify if we want to randomly mirror 1177 | // data. 
1178 | optional bool mirror = 6 [default = false]; 1179 | optional string root_folder = 12 [default = ""]; 1180 | } 1181 | 1182 | message InfogainLossParameter { 1183 | // Specify the infogain matrix source. 1184 | optional string source = 1; 1185 | } 1186 | 1187 | message InnerProductParameter { 1188 | optional uint32 num_output = 1; // The number of outputs for the layer 1189 | optional bool bias_term = 2 [default = true]; // whether to have bias terms 1190 | optional FillerParameter weight_filler = 3; // The filler for the weight 1191 | optional FillerParameter bias_filler = 4; // The filler for the bias 1192 | 1193 | // The first axis to be lumped into a single inner product computation; 1194 | // all preceding axes are retained in the output. 1195 | // May be negative to index from the end (e.g., -1 for the last axis). 1196 | optional int32 axis = 5 [default = 1]; 1197 | // Specify whether to transpose the weight matrix or not. 1198 | // If transpose == true, any operations will be performed on the transpose 1199 | // of the weight matrix. The weight matrix itself is not going to be transposed 1200 | // but rather the transfer flag of operations will be toggled accordingly. 1201 | optional bool transpose = 6 [default = false]; 1202 | } 1203 | 1204 | message InputParameter { 1205 | // This layer produces N >= 1 top blob(s) to be assigned manually. 1206 | // Define N shapes to set a shape for each top. 1207 | // Define 1 shape to set the same shape for every top. 1208 | // Define no shape to defer to reshaping manually. 1209 | repeated BlobShape shape = 1; 1210 | } 1211 | 1212 | // Message that stores parameters used by LogLayer 1213 | message LogParameter { 1214 | // LogLayer computes outputs y = log_base(shift + scale * x), for base > 0. 
1215 | // Or if base is set to the default (-1), base is set to e, 1216 | // so y = ln(shift + scale * x) = log_e(shift + scale * x) 1217 | optional float base = 1 [default = -1.0]; 1218 | optional float scale = 2 [default = 1.0]; 1219 | optional float shift = 3 [default = 0.0]; 1220 | } 1221 | 1222 | // Message that stores parameters used by LRNLayer 1223 | message LRNParameter { 1224 | optional uint32 local_size = 1 [default = 5]; 1225 | optional float alpha = 2 [default = 1.]; 1226 | optional float beta = 3 [default = 0.75]; 1227 | enum NormRegion { 1228 | ACROSS_CHANNELS = 0; 1229 | WITHIN_CHANNEL = 1; 1230 | } 1231 | optional NormRegion norm_region = 4 [default = ACROSS_CHANNELS]; 1232 | optional float k = 5 [default = 1.]; 1233 | enum Engine { 1234 | DEFAULT = 0; 1235 | CAFFE = 1; 1236 | CUDNN = 2; 1237 | } 1238 | optional Engine engine = 6 [default = DEFAULT]; 1239 | } 1240 | 1241 | message MemoryDataParameter { 1242 | optional uint32 batch_size = 1; 1243 | optional uint32 channels = 2; 1244 | optional uint32 height = 3; 1245 | optional uint32 width = 4; 1246 | } 1247 | 1248 | // Message that stores parameters used by MultiBoxLossLayer 1249 | message MultiBoxLossParameter { 1250 | // Localization loss type. 1251 | enum LocLossType { 1252 | L2 = 0; 1253 | SMOOTH_L1 = 1; 1254 | } 1255 | optional LocLossType loc_loss_type = 1 [default = SMOOTH_L1]; 1256 | // Confidence loss type. 1257 | enum ConfLossType { 1258 | SOFTMAX = 0; 1259 | LOGISTIC = 1; 1260 | } 1261 | optional ConfLossType conf_loss_type = 2 [default = SOFTMAX]; 1262 | // Weight for localization loss. 1263 | optional float loc_weight = 3 [default = 1.0]; 1264 | // Weight for center loss. 1265 | optional float center_loss_weight = 24 [default = 1.0]; 1266 | // Number of classes to be predicted. Required! 1267 | optional uint32 num_classes = 4; 1268 | // If true, bounding boxes are shared among different classes.
1269 | optional bool share_location = 5 [default = true]; 1270 | // Matching method during training. 1271 | enum MatchType { 1272 | BIPARTITE = 0; 1273 | PER_PREDICTION = 1; 1274 | } 1275 | optional MatchType match_type = 6 [default = PER_PREDICTION]; 1276 | // If match_type is PER_PREDICTION, use overlap_threshold to 1277 | // determine the extra matching bboxes. 1278 | optional float overlap_threshold = 7 [default = 0.5]; 1279 | // Use prior for matching. 1280 | optional bool use_prior_for_matching = 8 [default = true]; 1281 | // Background label id. 1282 | optional uint32 background_label_id = 9 [default = 0]; 1283 | // If true, also consider difficult ground truth. 1284 | optional bool use_difficult_gt = 10 [default = true]; 1285 | // If true, perform negative mining. 1286 | // DEPRECATED: use mining_type instead. 1287 | optional bool do_neg_mining = 11; 1288 | // The negative/positive ratio. 1289 | optional float neg_pos_ratio = 12 [default = 3.0]; 1290 | // The negative overlap upperbound for the unmatched predictions. 1291 | optional float neg_overlap = 13 [default = 0.5]; 1292 | // Type of coding method for bbox. 1293 | optional PriorBoxParameter.CodeType code_type = 14 [default = CORNER]; 1294 | // If true, encode the variance of prior box in the loc loss target instead of 1295 | // in bbox. 1296 | optional bool encode_variance_in_target = 16 [default = false]; 1297 | // If true, map all object classes to agnostic class. It is useful for learning 1298 | // objectness detector. 1299 | optional bool map_object_to_agnostic = 17 [default = false]; 1300 | // If true, ignore cross boundary bbox during matching. 1301 | // Cross boundary bbox is a bbox who is outside of the image region. 1302 | optional bool ignore_cross_boundary_bbox = 18 [default = false]; 1303 | // If true, only backpropagate on corners which are inside of the image 1304 | // region when encode_type is CORNER or CORNER_SIZE. 
1305 | optional bool bp_inside = 19 [default = false]; 1306 | // Mining type during training. 1307 | // NONE : use all negatives. 1308 | // MAX_NEGATIVE : select negatives based on the score. 1309 | // HARD_EXAMPLE : select hard examples based on "Training Region-based Object Detectors with Online Hard Example Mining", Shrivastava et al. 1310 | enum MiningType { 1311 | NONE = 0; 1312 | MAX_NEGATIVE = 1; 1313 | HARD_EXAMPLE = 2; 1314 | } 1315 | optional MiningType mining_type = 20 [default = MAX_NEGATIVE]; 1316 | // Parameters used for non maximum suppression during hard example mining. 1317 | optional NonMaximumSuppressionParameter nms_param = 21; 1318 | optional int32 sample_size = 22 [default = 64]; 1319 | optional bool use_prior_for_nms = 23 [default = false]; 1320 | } 1321 | 1322 | 1323 | 1324 | message MVNParameter { 1325 | // This parameter can be set to false to normalize mean only 1326 | optional bool normalize_variance = 1 [default = true]; 1327 | 1328 | // This parameter can be set to true to perform DNN-like MVN 1329 | optional bool across_channels = 2 [default = false]; 1330 | 1331 | // Epsilon for not dividing by zero while normalizing variance 1332 | optional float eps = 3 [default = 1e-9]; 1333 | } 1334 | 1335 | // Message that stores parameters used by NormalizeLayer 1336 | message NormalizeParameter { 1337 | optional bool across_spatial = 1 [default = true]; 1338 | // Initial value of scale. Default is 1.0 for all 1339 | optional FillerParameter scale_filler = 2; 1340 | // Whether or not scale parameters are shared across channels. 1341 | optional bool channel_shared = 3 [default = true]; 1342 | // Epsilon for not dividing by zero while normalizing variance 1343 | optional float eps = 4 [default = 1e-10]; 1344 | } 1345 | 1346 | message ParameterParameter { 1347 | optional BlobShape shape = 1; 1348 | } 1349 | 1350 | message PermuteParameter { 1351 | // The new orders of the axes of data.
Notice it should be with 1352 | // in the same range as the input data, and it starts from 0. 1353 | // Do not provide repeated order. 1354 | repeated uint32 order = 1; 1355 | } 1356 | 1357 | message PoolingParameter { 1358 | enum PoolMethod { 1359 | MAX = 0; 1360 | AVE = 1; 1361 | STOCHASTIC = 2; 1362 | } 1363 | optional PoolMethod pool = 1 [default = MAX]; // The pooling method 1364 | // Pad, kernel size, and stride are all given as a single value for equal 1365 | // dimensions in height and width or as Y, X pairs. 1366 | optional uint32 pad = 4 [default = 0]; // The padding size (equal in Y, X) 1367 | optional uint32 pad_h = 9 [default = 0]; // The padding height 1368 | optional uint32 pad_w = 10 [default = 0]; // The padding width 1369 | optional uint32 kernel_size = 2; // The kernel size (square) 1370 | optional uint32 kernel_h = 5; // The kernel height 1371 | optional uint32 kernel_w = 6; // The kernel width 1372 | optional uint32 stride = 3 [default = 1]; // The stride (equal in Y, X) 1373 | optional uint32 stride_h = 7; // The stride height 1374 | optional uint32 stride_w = 8; // The stride width 1375 | enum Engine { 1376 | DEFAULT = 0; 1377 | CAFFE = 1; 1378 | CUDNN = 2; 1379 | } 1380 | optional Engine engine = 11 [default = DEFAULT]; 1381 | // If global_pooling then it will pool over the size of the bottom by doing 1382 | // kernel_h = bottom->height and kernel_w = bottom->width 1383 | optional bool global_pooling = 12 [default = false]; 1384 | } 1385 | 1386 | message PowerParameter { 1387 | // PowerLayer computes outputs y = (shift + scale * x) ^ power. 1388 | optional float power = 1 [default = 1.0]; 1389 | optional float scale = 2 [default = 1.0]; 1390 | optional float shift = 3 [default = 0.0]; 1391 | } 1392 | 1393 | // Message that stores parameters used by PriorBoxLayer 1394 | message PriorBoxParameter { 1395 | // Encode/decode type.
1396 | enum CodeType { 1397 | CORNER = 1; 1398 | CENTER_SIZE = 2; 1399 | CORNER_SIZE = 3; 1400 | } 1401 | // Minimum box size (in pixels). Required! 1402 | repeated float min_size = 1; 1403 | // Maximum box size (in pixels). Required! 1404 | repeated float max_size = 2; 1405 | // Various aspect ratios. Duplicate ratios will be ignored. 1406 | // If none is provided, we use default ratio 1. 1407 | repeated float aspect_ratio = 3; 1408 | // If true, will flip each aspect ratio. 1409 | // For example, if there is aspect ratio "r", 1410 | // we will generate aspect ratio "1.0/r" as well. 1411 | optional bool flip = 4 [default = true]; 1412 | // If true, will clip the prior so that it is within [0, 1] 1413 | optional bool clip = 5 [default = false]; 1414 | // Variance for adjusting the prior bboxes. 1415 | repeated float variance = 6; 1416 | // By default, we calculate img_height, img_width, step_x, step_y based on 1417 | // bottom[0] (feat) and bottom[1] (img). Unless these values are explicitly 1418 | // provided. 1419 | // Explicitly provide the img_size. 1420 | optional uint32 img_size = 7; 1421 | // Either img_size or img_h/img_w should be specified; not both. 1422 | optional uint32 img_h = 8; 1423 | optional uint32 img_w = 9; 1424 | 1425 | // Explicitly provide the step size. 1426 | optional float step = 10; 1427 | // Either step or step_h/step_w should be specified; not both. 1428 | optional float step_h = 11; 1429 | optional float step_w = 12; 1430 | 1431 | // Offset to the top left corner of each cell. 1432 | optional float offset = 13 [default = 0.5]; 1433 | } 1434 | 1435 | message PythonParameter { 1436 | optional string module = 1; 1437 | optional string layer = 2; 1438 | // This value is set to the attribute `param_str` of the `PythonLayer` object 1439 | // in Python before calling the `setup()` method. This could be a number, 1440 | // string, dictionary in Python dict format, JSON, etc.
You may parse this 1441 | // string in `setup` method and use it in `forward` and `backward`. 1442 | optional string param_str = 3 [default = '']; 1443 | // Whether this PythonLayer is shared among worker solvers during data parallelism. 1444 | // If true, each worker solver sequentially run forward from this layer. 1445 | // This value should be set true if you are using it as a data layer. 1446 | optional bool share_in_parallel = 4 [default = false]; 1447 | } 1448 | 1449 | // Message that stores parameters used by RecurrentLayer 1450 | message RecurrentParameter { 1451 | // The dimension of the output (and usually hidden state) representation -- 1452 | // must be explicitly set to non-zero. 1453 | optional uint32 num_output = 1 [default = 0]; 1454 | 1455 | optional FillerParameter weight_filler = 2; // The filler for the weight 1456 | optional FillerParameter bias_filler = 3; // The filler for the bias 1457 | 1458 | // Whether to enable displaying debug_info in the unrolled recurrent net. 1459 | optional bool debug_info = 4 [default = false]; 1460 | 1461 | // Whether to add as additional inputs (bottoms) the initial hidden state 1462 | // blobs, and add as additional outputs (tops) the final timestep hidden state 1463 | // blobs. The number of additional bottom/top blobs required depends on the 1464 | // recurrent architecture -- e.g., 1 for RNNs, 2 for LSTMs. 1465 | optional bool expose_hidden = 5 [default = false]; 1466 | } 1467 | 1468 | // Message that stores parameters used by ReductionLayer 1469 | message ReductionParameter { 1470 | enum ReductionOp { 1471 | SUM = 1; 1472 | ASUM = 2; 1473 | SUMSQ = 3; 1474 | MEAN = 4; 1475 | } 1476 | 1477 | optional ReductionOp operation = 1 [default = SUM]; // reduction operation 1478 | 1479 | // The first axis to reduce to a scalar -- may be negative to index from the 1480 | // end (e.g., -1 for the last axis). 
1481 | // (Currently, only reduction along ALL "tail" axes is supported; reduction 1482 | // of axis M through N, where N < num_axes - 1, is unsupported.) 1483 | // Suppose we have an n-axis bottom Blob with shape: 1484 | // (d0, d1, d2, ..., d(m-1), dm, d(m+1), ..., d(n-1)). 1485 | // If axis == m, the output Blob will have shape 1486 | // (d0, d1, d2, ..., d(m-1)), 1487 | // and the ReductionOp operation is performed (d0 * d1 * d2 * ... * d(m-1)) 1488 | // times, each including (dm * d(m+1) * ... * d(n-1)) individual data. 1489 | // If axis == 0 (the default), the output Blob always has the empty shape 1490 | // (count 1), performing reduction across the entire input -- 1491 | // often useful for creating new loss functions. 1492 | optional int32 axis = 2 [default = 0]; 1493 | 1494 | optional float coeff = 3 [default = 1.0]; // coefficient for output 1495 | } 1496 | 1497 | // Message that stores parameters used by ReLULayer 1498 | message ReLUParameter { 1499 | // Allow non-zero slope for negative inputs to speed up optimization 1500 | // Described in: 1501 | // Maas, A. L., Hannun, A. Y., & Ng, A. Y. (2013). Rectifier nonlinearities 1502 | // improve neural network acoustic models. In ICML Workshop on Deep Learning 1503 | // for Audio, Speech, and Language Processing. 1504 | optional float negative_slope = 1 [default = 0]; 1505 | enum Engine { 1506 | DEFAULT = 0; 1507 | CAFFE = 1; 1508 | CUDNN = 2; 1509 | } 1510 | optional Engine engine = 2 [default = DEFAULT]; 1511 | } 1512 | 1513 | message ReshapeParameter { 1514 | // Specify the output dimensions. If some of the dimensions are set to 0, 1515 | // the corresponding dimension from the bottom layer is used (unchanged). 1516 | // Exactly one dimension may be set to -1, in which case its value is 1517 | // inferred from the count of the bottom blob and the remaining dimensions. 
1518 | // For example, suppose we want to reshape a 2D blob "input" with shape 2 x 8: 1519 | // 1520 | // layer { 1521 | // type: "Reshape" bottom: "input" top: "output" 1522 | // reshape_param { ... } 1523 | // } 1524 | // 1525 | // If "input" is 2D with shape 2 x 8, then the following reshape_param 1526 | // specifications are all equivalent, producing a 3D blob "output" with shape 1527 | // 2 x 2 x 4: 1528 | // 1529 | // reshape_param { shape { dim: 2 dim: 2 dim: 4 } } 1530 | // reshape_param { shape { dim: 0 dim: 2 dim: 4 } } 1531 | // reshape_param { shape { dim: 0 dim: 2 dim: -1 } } 1532 | // reshape_param { shape { dim: 0 dim:-1 dim: 4 } } 1533 | // 1534 | optional BlobShape shape = 1; 1535 | 1536 | // axis and num_axes control the portion of the bottom blob's shape that are 1537 | // replaced by (included in) the reshape. By default (axis == 0 and 1538 | // num_axes == -1), the entire bottom blob shape is included in the reshape, 1539 | // and hence the shape field must specify the entire output shape. 1540 | // 1541 | // axis may be non-zero to retain some portion of the beginning of the input 1542 | // shape (and may be negative to index from the end; e.g., -1 to begin the 1543 | // reshape after the last axis, including nothing in the reshape, 1544 | // -2 to include only the last axis, etc.). 1545 | // 1546 | // For example, suppose "input" is a 2D blob with shape 2 x 8. 1547 | // Then the following ReshapeLayer specifications are all equivalent, 1548 | // producing a blob "output" with shape 2 x 2 x 4: 1549 | // 1550 | // reshape_param { shape { dim: 2 dim: 2 dim: 4 } } 1551 | // reshape_param { shape { dim: 2 dim: 4 } axis: 1 } 1552 | // reshape_param { shape { dim: 2 dim: 4 } axis: -3 } 1553 | // 1554 | // num_axes specifies the extent of the reshape. 1555 | // If num_axes >= 0 (and axis >= 0), the reshape will be performed only on 1556 | // input axes in the range [axis, axis+num_axes]. 
1557 | // num_axes may also be -1, the default, to include all remaining axes 1558 | // (starting from axis). 1559 | // 1560 | // For example, suppose "input" is a 2D blob with shape 2 x 8. 1561 | // Then the following ReshapeLayer specifications are equivalent, 1562 | // producing a blob "output" with shape 1 x 2 x 8. 1563 | // 1564 | // reshape_param { shape { dim: 1 dim: 2 dim: 8 } } 1565 | // reshape_param { shape { dim: 1 dim: 2 } num_axes: 1 } 1566 | // reshape_param { shape { dim: 1 } num_axes: 0 } 1567 | // 1568 | // On the other hand, these would produce output blob shape 2 x 1 x 8: 1569 | // 1570 | // reshape_param { shape { dim: 2 dim: 1 dim: 8 } } 1571 | // reshape_param { shape { dim: 1 } axis: 1 num_axes: 0 } 1572 | // 1573 | optional int32 axis = 2 [default = 0]; 1574 | optional int32 num_axes = 3 [default = -1]; 1575 | } 1576 | 1577 | message ScaleParameter { 1578 | // The first axis of bottom[0] (the first input Blob) along which to apply 1579 | // bottom[1] (the second input Blob). May be negative to index from the end 1580 | // (e.g., -1 for the last axis). 1581 | // 1582 | // For example, if bottom[0] is 4D with shape 100x3x40x60, the output 1583 | // top[0] will have the same shape, and bottom[1] may have any of the 1584 | // following shapes (for the given value of axis): 1585 | // (axis == 0 == -4) 100; 100x3; 100x3x40; 100x3x40x60 1586 | // (axis == 1 == -3) 3; 3x40; 3x40x60 1587 | // (axis == 2 == -2) 40; 40x60 1588 | // (axis == 3 == -1) 60 1589 | // Furthermore, bottom[1] may have the empty shape (regardless of the value of 1590 | // "axis") -- a scalar multiplier. 1591 | optional int32 axis = 1 [default = 1]; 1592 | 1593 | // (num_axes is ignored unless just one bottom is given and the scale is 1594 | // a learned parameter of the layer. Otherwise, num_axes is determined by the 1595 | // number of axes by the second bottom.) 
1596 | // The number of axes of the input (bottom[0]) covered by the scale 1597 | // parameter, or -1 to cover all axes of bottom[0] starting from `axis`. 1598 | // Set num_axes := 0, to multiply with a zero-axis Blob: a scalar. 1599 | optional int32 num_axes = 2 [default = 1]; 1600 | 1601 | // (filler is ignored unless just one bottom is given and the scale is 1602 | // a learned parameter of the layer.) 1603 | // The initialization for the learned scale parameter. 1604 | // Default is the unit (1) initialization, resulting in the ScaleLayer 1605 | // initially performing the identity operation. 1606 | optional FillerParameter filler = 3; 1607 | 1608 | // Whether to also learn a bias (equivalent to a ScaleLayer+BiasLayer, but 1609 | // may be more efficient). Initialized with bias_filler (defaults to 0). 1610 | optional bool bias_term = 4 [default = false]; 1611 | optional FillerParameter bias_filler = 5; 1612 | } 1613 | 1614 | message SigmoidParameter { 1615 | enum Engine { 1616 | DEFAULT = 0; 1617 | CAFFE = 1; 1618 | CUDNN = 2; 1619 | } 1620 | optional Engine engine = 1 [default = DEFAULT]; 1621 | } 1622 | 1623 | message SliceParameter { 1624 | // The axis along which to slice -- may be negative to index from the end 1625 | // (e.g., -1 for the last axis). 1626 | // By default, SliceLayer slices blobs along the "channels" axis (1). 1627 | optional int32 axis = 3 [default = 1]; 1628 | repeated uint32 slice_point = 2; 1629 | 1630 | // DEPRECATED: alias for "axis" -- does not support negative indexing.
1631 | optional uint32 slice_dim = 1 [default = 1]; 1632 | } 1633 | 1634 | // Message that stores parameters used by SoftmaxLayer, SoftmaxWithLossLayer 1635 | message SoftmaxParameter { 1636 | enum Engine { 1637 | DEFAULT = 0; 1638 | CAFFE = 1; 1639 | CUDNN = 2; 1640 | } 1641 | optional Engine engine = 1 [default = DEFAULT]; 1642 | 1643 | // The axis along which to perform the softmax -- may be negative to index 1644 | // from the end (e.g., -1 for the last axis). 1645 | // Any other axes will be evaluated as independent softmaxes. 1646 | optional int32 axis = 2 [default = 1]; 1647 | } 1648 | 1649 | message TanHParameter { 1650 | enum Engine { 1651 | DEFAULT = 0; 1652 | CAFFE = 1; 1653 | CUDNN = 2; 1654 | } 1655 | optional Engine engine = 1 [default = DEFAULT]; 1656 | } 1657 | 1658 | // Message that stores parameters used by TileLayer 1659 | message TileParameter { 1660 | // The index of the axis to tile. 1661 | optional int32 axis = 1 [default = 1]; 1662 | 1663 | // The number of copies (tiles) of the blob to output. 1664 | optional int32 tiles = 2; 1665 | } 1666 | 1667 | // Message that stores parameters used by ThresholdLayer 1668 | message ThresholdParameter { 1669 | optional float threshold = 1 [default = 0]; // Strictly positive values 1670 | } 1671 | 1672 | message VideoDataParameter{ 1673 | enum VideoType { 1674 | WEBCAM = 0; 1675 | VIDEO = 1; 1676 | } 1677 | optional VideoType video_type = 1 [default = WEBCAM]; 1678 | optional int32 device_id = 2 [default = 0]; 1679 | optional string video_file = 3; 1680 | // Number of frames to be skipped before processing a frame. 1681 | optional uint32 skip_frames = 4 [default = 0]; 1682 | } 1683 | 1684 | message WindowDataParameter { 1685 | // Specify the data source. 1686 | optional string source = 1; 1687 | // For data pre-processing, we can do simple scaling and subtracting the 1688 | // data mean, if provided. Note that the mean subtraction is always carried 1689 | // out before scaling. 
1690 | optional float scale = 2 [default = 1]; 1691 | optional string mean_file = 3; 1692 | // Specify the batch size. 1693 | optional uint32 batch_size = 4; 1694 | // Specify if we would like to randomly crop an image. 1695 | optional uint32 crop_size = 5 [default = 0]; 1696 | // Specify if we want to randomly mirror data. 1697 | optional bool mirror = 6 [default = false]; 1698 | // Foreground (object) overlap threshold 1699 | optional float fg_threshold = 7 [default = 0.5]; 1700 | // Background (non-object) overlap threshold 1701 | optional float bg_threshold = 8 [default = 0.5]; 1702 | // Fraction of batch that should be foreground objects 1703 | optional float fg_fraction = 9 [default = 0.25]; 1704 | // Amount of contextual padding to add around a window 1705 | // (used only by the window_data_layer) 1706 | optional uint32 context_pad = 10 [default = 0]; 1707 | // Mode for cropping out a detection window 1708 | // warp: cropped window is warped to a fixed size and aspect ratio 1709 | // square: the tightest square around the window is cropped 1710 | optional string crop_mode = 11 [default = "warp"]; 1711 | // cache_images: will load all images in memory for faster access 1712 | optional bool cache_images = 12 [default = false]; 1713 | // append root_folder to locate images 1714 | optional string root_folder = 13 [default = ""]; 1715 | } 1716 | 1717 | message SPPParameter { 1718 | enum PoolMethod { 1719 | MAX = 0; 1720 | AVE = 1; 1721 | STOCHASTIC = 2; 1722 | } 1723 | optional uint32 pyramid_height = 1; 1724 | optional PoolMethod pool = 2 [default = MAX]; // The pooling method 1725 | enum Engine { 1726 | DEFAULT = 0; 1727 | CAFFE = 1; 1728 | CUDNN = 2; 1729 | } 1730 | optional Engine engine = 6 [default = DEFAULT]; 1731 | } 1732 | 1733 | // DEPRECATED: use LayerParameter. 
1734 | message V1LayerParameter { 1735 | repeated string bottom = 2; 1736 | repeated string top = 3; 1737 | optional string name = 4; 1738 | repeated NetStateRule include = 32; 1739 | repeated NetStateRule exclude = 33; 1740 | enum LayerType { 1741 | NONE = 0; 1742 | ABSVAL = 35; 1743 | ACCURACY = 1; 1744 | ARGMAX = 30; 1745 | BNLL = 2; 1746 | CONCAT = 3; 1747 | CONTRASTIVE_LOSS = 37; 1748 | CONVOLUTION = 4; 1749 | DATA = 5; 1750 | DECONVOLUTION = 39; 1751 | DROPOUT = 6; 1752 | DUMMY_DATA = 32; 1753 | EUCLIDEAN_LOSS = 7; 1754 | ELTWISE = 25; 1755 | EXP = 38; 1756 | FLATTEN = 8; 1757 | HDF5_DATA = 9; 1758 | HDF5_OUTPUT = 10; 1759 | HINGE_LOSS = 28; 1760 | IM2COL = 11; 1761 | IMAGE_DATA = 12; 1762 | INFOGAIN_LOSS = 13; 1763 | INNER_PRODUCT = 14; 1764 | LRN = 15; 1765 | MEMORY_DATA = 29; 1766 | MULTINOMIAL_LOGISTIC_LOSS = 16; 1767 | MVN = 34; 1768 | POOLING = 17; 1769 | POWER = 26; 1770 | RELU = 18; 1771 | SIGMOID = 19; 1772 | SIGMOID_CROSS_ENTROPY_LOSS = 27; 1773 | SILENCE = 36; 1774 | SOFTMAX = 20; 1775 | SOFTMAX_LOSS = 21; 1776 | SPLIT = 22; 1777 | SLICE = 33; 1778 | TANH = 23; 1779 | WINDOW_DATA = 24; 1780 | THRESHOLD = 31; 1781 | } 1782 | optional LayerType type = 5; 1783 | repeated BlobProto blobs = 6; 1784 | repeated string param = 1001; 1785 | repeated DimCheckMode blob_share_mode = 1002; 1786 | enum DimCheckMode { 1787 | STRICT = 0; 1788 | PERMISSIVE = 1; 1789 | } 1790 | repeated float blobs_lr = 7; 1791 | repeated float weight_decay = 8; 1792 | repeated float loss_weight = 35; 1793 | optional AccuracyParameter accuracy_param = 27; 1794 | optional ArgMaxParameter argmax_param = 23; 1795 | optional ConcatParameter concat_param = 9; 1796 | optional ContrastiveLossParameter contrastive_loss_param = 40; 1797 | optional ConvolutionParameter convolution_param = 10; 1798 | optional DataParameter data_param = 11; 1799 | optional DropoutParameter dropout_param = 12; 1800 | optional DummyDataParameter dummy_data_param = 26; 1801 | optional EltwiseParameter 
eltwise_param = 24; 1802 | optional ExpParameter exp_param = 41; 1803 | optional HDF5DataParameter hdf5_data_param = 13; 1804 | optional HDF5OutputParameter hdf5_output_param = 14; 1805 | optional HingeLossParameter hinge_loss_param = 29; 1806 | optional ImageDataParameter image_data_param = 15; 1807 | optional InfogainLossParameter infogain_loss_param = 16; 1808 | optional InnerProductParameter inner_product_param = 17; 1809 | optional LRNParameter lrn_param = 18; 1810 | optional MemoryDataParameter memory_data_param = 22; 1811 | optional MVNParameter mvn_param = 34; 1812 | optional PoolingParameter pooling_param = 19; 1813 | optional PowerParameter power_param = 21; 1814 | optional ReLUParameter relu_param = 30; 1815 | optional SigmoidParameter sigmoid_param = 38; 1816 | optional SoftmaxParameter softmax_param = 39; 1817 | optional SliceParameter slice_param = 31; 1818 | optional TanHParameter tanh_param = 37; 1819 | optional ThresholdParameter threshold_param = 25; 1820 | optional WindowDataParameter window_data_param = 20; 1821 | optional TransformationParameter transform_param = 36; 1822 | optional LossParameter loss_param = 42; 1823 | optional V0LayerParameter layer = 1; 1824 | } 1825 | 1826 | // DEPRECATED: V0LayerParameter is the old way of specifying layer parameters 1827 | // in Caffe. We keep this message type around for legacy support. 1828 | message V0LayerParameter { 1829 | optional string name = 1; // the layer name 1830 | optional string type = 2; // the string to specify the layer type 1831 | 1832 | // Parameters to specify layers with inner products. 
1833 | optional uint32 num_output = 3; // The number of outputs for the layer 1834 | optional bool biasterm = 4 [default = true]; // whether to have bias terms 1835 | optional FillerParameter weight_filler = 5; // The filler for the weight 1836 | optional FillerParameter bias_filler = 6; // The filler for the bias 1837 | 1838 | optional uint32 pad = 7 [default = 0]; // The padding size 1839 | optional uint32 kernelsize = 8; // The kernel size 1840 | optional uint32 group = 9 [default = 1]; // The group size for group conv 1841 | optional uint32 stride = 10 [default = 1]; // The stride 1842 | enum PoolMethod { 1843 | MAX = 0; 1844 | AVE = 1; 1845 | STOCHASTIC = 2; 1846 | } 1847 | optional PoolMethod pool = 11 [default = MAX]; // The pooling method 1848 | optional float dropout_ratio = 12 [default = 0.5]; // dropout ratio 1849 | 1850 | optional uint32 local_size = 13 [default = 5]; // for local response norm 1851 | optional float alpha = 14 [default = 1.]; // for local response norm 1852 | optional float beta = 15 [default = 0.75]; // for local response norm 1853 | optional float k = 22 [default = 1.]; 1854 | 1855 | // For data layers, specify the data source 1856 | optional string source = 16; 1857 | // For data pre-processing, we can do simple scaling and subtracting the 1858 | // data mean, if provided. Note that the mean subtraction is always carried 1859 | // out before scaling. 1860 | optional float scale = 17 [default = 1]; 1861 | optional string meanfile = 18; 1862 | // For data layers, specify the batch size. 1863 | optional uint32 batchsize = 19; 1864 | // For data layers, specify if we would like to randomly crop an image. 1865 | optional uint32 cropsize = 20 [default = 0]; 1866 | // For data layers, specify if we want to randomly mirror data. 
1867 | optional bool mirror = 21 [default = false]; 1868 | 1869 | // The blobs containing the numeric parameters of the layer 1870 | repeated BlobProto blobs = 50; 1871 | // The ratio that is multiplied on the global learning rate. If you want to 1872 | // set the learning ratio for one blob, you need to set it for all blobs. 1873 | repeated float blobs_lr = 51; 1874 | // The weight decay that is multiplied on the global weight decay. 1875 | repeated float weight_decay = 52; 1876 | 1877 | // The rand_skip variable is for the data layer to skip a few data points 1878 | // to avoid all asynchronous sgd clients to start at the same point. The skip 1879 | // point would be set as rand_skip * rand(0,1). Note that rand_skip should not 1880 | // be larger than the number of keys in the database. 1881 | optional uint32 rand_skip = 53 [default = 0]; 1882 | 1883 | // Fields related to detection (det_*) 1884 | // foreground (object) overlap threshold 1885 | optional float det_fg_threshold = 54 [default = 0.5]; 1886 | // background (non-object) overlap threshold 1887 | optional float det_bg_threshold = 55 [default = 0.5]; 1888 | // Fraction of batch that should be foreground objects 1889 | optional float det_fg_fraction = 56 [default = 0.25]; 1890 | 1891 | // optional bool OBSOLETE_can_clobber = 57 [default = true]; 1892 | 1893 | // Amount of contextual padding to add around a window 1894 | // (used only by the window_data_layer) 1895 | optional uint32 det_context_pad = 58 [default = 0]; 1896 | 1897 | // Mode for cropping out a detection window 1898 | // warp: cropped window is warped to a fixed size and aspect ratio 1899 | // square: the tightest square around the window is cropped 1900 | optional string det_crop_mode = 59 [default = "warp"]; 1901 | 1902 | // For ReshapeLayer, one needs to specify the new dimensions. 
1903 | optional int32 new_num = 60 [default = 0]; 1904 | optional int32 new_channels = 61 [default = 0]; 1905 | optional int32 new_height = 62 [default = 0]; 1906 | optional int32 new_width = 63 [default = 0]; 1907 | 1908 | // Whether or not ImageLayer should shuffle the list of files at every epoch. 1909 | // It will also resize images if new_height or new_width are not zero. 1910 | optional bool shuffle_images = 64 [default = false]; 1911 | 1912 | // For ConcatLayer, one needs to specify the dimension for concatenation, and 1913 | // the other dimensions must be the same for all the bottom blobs. 1914 | // By default it will concatenate blobs along the channels dimension. 1915 | optional uint32 concat_dim = 65 [default = 1]; 1916 | 1917 | optional HDF5OutputParameter hdf5_output_param = 1001; 1918 | } 1919 | 1920 | message PReLUParameter { 1921 | // Parametric ReLU described in K. He et al, Delving Deep into Rectifiers: 1922 | // Surpassing Human-Level Performance on ImageNet Classification, 2015. 1923 | 1924 | // Initial value of a_i. Default is a_i=0.25 for all i. 1925 | optional FillerParameter filler = 1; 1926 | // Whether or not slope parameters are shared across channels. 1927 | optional bool channel_shared = 2 [default = false]; 1928 | } 1929 | 1930 | message CenterLossParameter { 1931 | optional uint32 num_output = 1; // The number of outputs for the layer 1932 | optional FillerParameter center_filler = 2; // The filler for the centers 1933 | // The first axis to be lumped into a single inner product computation; 1934 | // all preceding axes are retained in the output. 1935 | // May be negative to index from the end (e.g., -1 for the last axis).
1936 | optional int32 axis = 3 [default = 1]; 1937 | } 1938 | 1939 | message FocalLossParameter { 1940 | enum Engine { 1941 | DEFAULT = 0; 1942 | CAFFE = 1; 1943 | CUDNN = 2; 1944 | } 1945 | optional Engine engine = 1 [default = DEFAULT]; 1946 | 1947 | // The axis along which to perform the softmax -- may be negative to index 1948 | // from the end (e.g., -1 for the last axis). 1949 | // Any other axes will be evaluated as independent softmaxes. 1950 | optional int32 axis = 2 [default = 1]; 1951 | optional float alpha = 3 [default = 0.25]; 1952 | optional float gamma = 4 [default = 2.0]; 1953 | } 1954 | 1955 | message MultiBoxCenterLossParameter{ 1956 | optional uint32 center_features = 1; 1957 | } --------------------------------------------------------------------------------