├── README.md ├── src ├── crbm.gdb ├── nvmatrix_kernel.cuh ├── Makefile ├── crbm_kernel.cuh ├── utils.h ├── nvmatrix.cuh ├── matrix.h ├── crbm.cuh ├── nvmatrix_kernel.cu ├── pyNVCRBM.cu ├── nvmatrix.cu ├── utils.py ├── test_nvcrbm.py ├── matrix.cpp ├── crbm_kernel.cu └── crbm.cu ├── .gitignore ├── include └── Matrix.cuh └── script └── preprocess.py /README.md: -------------------------------------------------------------------------------- 1 | Deep learning code about computer vision 2 | -------------------------------------------------------------------------------- /src/crbm.gdb: -------------------------------------------------------------------------------- 1 | set breakpoint pending on 2 | b CRBM::start 3 | run test_nvcrbm.py 4 | c 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.so 2 | *.o 3 | *.png 4 | *.dat 5 | *.tif 6 | *.pkl 7 | *.swp 8 | *.pyc 9 | tags 10 | core 11 | -------------------------------------------------------------------------------- /include/Matrix.cuh: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #define KYOTO_DATA "/home/rolexye/project/data/image_db_kyoto/kyoto.txt" 5 | 6 | using namespace std; 7 | 8 | class Matrix{ 9 | private: 10 | int _nrow, _ncol; 11 | int _nelem; 12 | float *_data; 13 | public: 14 | Matrix(int nrow, int ncol); 15 | int get_row_num(); 16 | int get_col_num(); 17 | int read_from_text(istream &fs); 18 | ~Matrix(); 19 | }; 20 | -------------------------------------------------------------------------------- /src/nvmatrix_kernel.cuh: -------------------------------------------------------------------------------- 1 | #ifndef _NVMATRIX_KERNEL_H 2 | #define _NVMATRIX_KERNEL_H 3 | #include 4 | 5 | #define NUM_THREAD_PER_ROW 128 6 | 7 | __global__ void _init_mat(float *m, float val, int len); 8 | __global__ void _copy_mat(float *m, float* target, int len); 9 | __global__ void _ele_scale(float *m, float *target, float scaler, int len); 10 | __global__ void _ele_add(float *m, float *target, float val, int len); 11 | __global__ void _mat_add(float *ma, float *mb, float *target, float sa, float sb, int len); 12 | __global__ void _mat_mul(float *ma, float *mb, float *target, int len); 13 | __global__ void _mat_sum_col(float *m, float *target,int nrow, int ncol); 14 | __global__ void _mat_sum_row(float *m, float *target,int nrow, int ncol); 15 | __global__ void _mat_sum_row_fast(float *m, float *target, int nrow, int ncol, int agg_col); 16 | 17 | #endif 18 | -------------------------------------------------------------------------------- /src/Makefile: -------------------------------------------------------------------------------- 1 | CC=g++ 2 | NVCC=nvcc 3 | INC_PATH=-I/usr/include/python2.7/ \ 4 | -I/usr/local/lib/python2.7/dist-packages/numpy/core/include/numpy/ \ 5 | -I/usr/lib/python2.7/dist-packages/numpy/core/include/numpy/ 6 | CPPFLAGS :=-g -O0 -fno-inline -std=c++0x -w -fPIC 7 | NVCCFLAGS :=-G -g -w -Xcompiler -fPIC -arch=sm_20 -Xptxas=-v 8 | CPPFLAGS += $(INC_PATH) 9 | NVCCFLAGS += $(INC_PATH) 10 | LDFLAGS=-fPIC -shared -L/usr/local/cuda/lib -L/usr/local/cuda/lib64 11 | OBJS=pyNVCRBM.o crbm.o crbm_kernel.o matrix.o nvmatrix.o nvmatrix_kernel.o 12 | 13 | nvcrbm.so : $(OBJS) 14 | $(CC) $(LDFLAGS) $^ -lcudart -lcurand -o $@ 15 | 16 | %.o: %.cpp 17 | $(CC) -c $(CPPFLAGS) $< -o $@ 18 | 19 | %.o: %.cu 20 | $(NVCC) -c $(NVCCFLAGS) $< -o $@ 21 | 22 | 
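# install / install64: copy the built nvcrbm.so extension into the Python 2.7 dist-packages directory (/usr/lib vs. /usr/local/lib layout)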
install64: 23 | cp nvcrbm.so /usr/lib/python2.7/dist-packages/nvcrbm.so 24 | 25 | install: 26 | cp nvcrbm.so /usr/local/lib/python2.7/dist-packages/nvcrbm.so 27 | 28 | clean: 29 | rm -rf *.so *.o 30 | -------------------------------------------------------------------------------- /src/crbm_kernel.cuh: -------------------------------------------------------------------------------- 1 | #ifndef _CRBM_KERNEL_H 2 | #define _CRBM_KERNEL_H 3 | 4 | #include 5 | #include 6 | 7 | #define MAX_FILETER_SIZE 8 8 | #define MAX_POOLING_RATE 3 9 | #define MAX_IMGAG_SIZE 128 10 | #define RAND_SIZE 10000 11 | 12 | __global__ void convolution_forward_kernel(float *input, 13 | float *filters, float *feature_map, float *hbias, int input_size, 14 | int channel_num, int feature_map_size, int filter_size, 15 | int filter_num, int lu_padding, float sigma); 16 | 17 | __global__ void max_pooling_kernel(float *feature_map, float *probs, float *target, 18 | int feature_map_size, int feature_map_num, int pooling_rate, 19 | float *rnd_array, int rnd_num); 20 | 21 | __global__ void convolution_backward_kernel(float *y_h, float *filters, float *vbias, 22 | float *target, float *y_v, 23 | int input_size, int lu_padding, int channel_num, int feature_map_size, 24 | int filter_num, int filter_size, float *rnd_array, int rnd_num); 25 | 26 | __global__ void compute_d_w_kernel(float *v, float *h, float *dw, bool is_init, 27 | int input_size, int lu_padding, int channel_num, int filter_num, 28 | int filter_size, int feature_map_size); 29 | 30 | #endif 31 | -------------------------------------------------------------------------------- /src/utils.h: -------------------------------------------------------------------------------- 1 | #ifndef _UITL_H 2 | #define _UITL_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #ifdef TIME_FUNC 10 | #define timeFunc(func, message) \ 11 | gettimeofday(&_start_time, NULL); \ 12 | (func); \ 13 | gettimeofday(&_end_time, NULL); \ 14 | if(_end_time.tv_sec == _start_time.tv_sec){ \ 15 | std::cout << message << " : " \ 16 | << (_end_time.tv_usec - _start_time.tv_usec) / 1000.0 \ 17 | << "ms" << std::endl; \ 18 | }else{ \ 19 | std::cout << message << " : " \ 20 | << _end_time.tv_sec - _start_time.tv_sec + \ 21 | (_end_time.tv_usec - _start_time.tv_usec) / 1000000.0 \ 22 | << "s" << std::endl; \ 23 | } 24 | #else 25 | #define timeFunc(func, message) (func); 26 | #endif 27 | 28 | inline float random_float(float low, float upper){ 29 | return (rand() * 1.0 / RAND_MAX) * (upper - low) + low; 30 | } 31 | 32 | inline bool float_equal(float a, float b, float e){ 33 | float diff = a - b; 34 | return (diff < e) && (-diff < e); 35 | } 36 | 37 | inline float logisitc(float a){ 38 | return 1.0 / (1 + exp(-a)); 39 | } 40 | 41 | #endif 42 | -------------------------------------------------------------------------------- /src/nvmatrix.cuh: -------------------------------------------------------------------------------- 1 | #ifndef _NVMATRIX_H 2 | #define _NVMATRIX_H 3 | 4 | #include "matrix.h" 5 | 6 | 7 | class NVMatrix { 8 | private: 9 | int nrow, ncol; 10 | bool trans; 11 | bool own; 12 | float *data; 13 | public: 14 | NVMatrix(Matrix&); 15 | NVMatrix(int nrow, int ncol); 16 | ~NVMatrix(); 17 | inline float& operator()(int i, int j){ 18 | return this->data[i * this->ncol + j]; 19 | } 20 | int get_row_num(); 21 | int get_col_num(); 22 | int get_ele_num(); 23 | bool get_trans(); 24 | float* get_data(); 25 | void reshape(int, int); 26 | 27 | void copyFromHost(Matrix& source); 28 | void 
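// assign(Matrix&) copies this device matrix back into a host Matrix; assign(NVMatrix&) is a device-to-device copy into the argument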
assign(Matrix&); 29 | void assign(NVMatrix&); 30 | void mat_init(float val); 31 | void mat_add(NVMatrix& m, float sb); 32 | void mat_add(NVMatrix& m, NVMatrix& target, float sa, float sb); 33 | void mat_mul(NVMatrix& m); 34 | void mat_mul(NVMatrix& m, NVMatrix& target); 35 | void ele_add(float val, NVMatrix& target); 36 | void ele_add(float val); 37 | void ele_scale(float scaler, NVMatrix& target); 38 | void ele_scale(float scaler); 39 | void mat_sum(int axis, NVMatrix &target); 40 | float ele_mean(); 41 | }; 42 | 43 | #endif 44 | -------------------------------------------------------------------------------- /src/matrix.h: -------------------------------------------------------------------------------- 1 | #ifndef _MATRIX_H 2 | #define _MATRIX_H 3 | 4 | extern "C"{ 5 | #include "Python.h" 6 | #include "arrayobject.h" 7 | } 8 | 9 | class Matrix { 10 | private: 11 | int nrow, ncol; 12 | bool trans; 13 | bool own; 14 | float *data; 15 | void _init(int, int, float*); 16 | public: 17 | Matrix(PyArrayObject *); 18 | Matrix(int nrow, int ncol, float low, float upper); 19 | Matrix(int nrow, int ncol); 20 | Matrix(Matrix&); 21 | ~Matrix(); 22 | inline float& operator()(int i, int j){ 23 | return this->data[i * this->ncol + j]; 24 | } 25 | int get_row_num(); 26 | int get_col_num(); 27 | int get_ele_num(); 28 | void reshape(int, int); 29 | bool get_trans(); 30 | float* get_data(); 31 | bool equal_value(Matrix&); 32 | bool equal_value(Matrix&, float); 33 | bool check_nan(); 34 | 35 | void assign(Matrix& target); 36 | void mat_init(float val); 37 | void mat_add(Matrix& m, float sb); 38 | void mat_add(Matrix& m, Matrix& target, float sa, float sb); 39 | void ele_scale(float); 40 | void ele_scale(float, Matrix&); 41 | void ele_add(float); 42 | void ele_add(float, Matrix&); 43 | void mat_sum(int axis, Matrix&); 44 | float ele_mean(); 45 | void mat_mul(Matrix& m, Matrix& target); 46 | void mat_mul(Matrix& m); 47 | }; 48 | 49 | #endif 50 | -------------------------------------------------------------------------------- /src/crbm.cuh: -------------------------------------------------------------------------------- 1 | #ifndef _CRBM_H 2 | #define _CRBM_H 3 | 4 | #include "matrix.h" 5 | #include "nvmatrix.cuh" 6 | #include 7 | #include 8 | #include 9 | 10 | #define CUDA_CALL(x) do { if((x) != cudaSuccess) { \ 11 | printf("Error at %s:%d\n", __FILE__, __LINE__); \ 12 | exit(1); }} while(0) 13 | 14 | class CRBM { 15 | public: 16 | float epsilon; 17 | float momentum; 18 | float l2reg; //regularization penaltiy coefficient 19 | float ph_lambda; //sparsity penaltiy coefficient 20 | float ph; //hidden layer sparsity percentage 21 | float sigma; 22 | int cur_trial; 23 | int cur_batch; 24 | int cur_image; 25 | float ferr; 26 | float sparsity; 27 | 28 | int filter_num; 29 | int filter_size; 30 | int input_num; 31 | int input_size; 32 | int channel_num; 33 | int feature_map_size; 34 | int pooling_rate; 35 | int subsample_size; 36 | int left_upper_padding, right_low_padding; 37 | 38 | Matrix *CPU_input; 39 | Matrix *CPU_filters; 40 | Matrix *CPU_vbias, *CPU_hbias; 41 | Matrix *CPU_y_h, *CPU_y_h_probs; 42 | Matrix *CPU_y_h2, *CPU_y_h2_probs; 43 | Matrix *CPU_y_p; 44 | Matrix *CPU_y_v, *CPU_y_v_probs; 45 | Matrix *CPU_d_w, *CPU_d_hbias; 46 | Matrix *CPU_d_w_pre, *CPU_d_hbias_pre; 47 | Matrix *CPU_d_hbias_tmp; 48 | Matrix *CPU_d_h_sum_tmp; 49 | 50 | void CPU_convolution_forward(float*, float*, float*, float*); 51 | void CPU_max_pooling(float*, float*, float*); 52 | void CPU_convolution_backward(float*, float*, float*, 
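// (y_h, filters, vbias, y_v_probs, y_v): reconstructs the visible layer from the hidden activations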
float*, float*); 53 | void CPU_compute_d_w(float*, float*, float*, bool); 54 | 55 | NVMatrix *GPU_input; 56 | NVMatrix *GPU_filters; 57 | NVMatrix *GPU_vbias, *GPU_hbias; 58 | NVMatrix *GPU_y_h, *GPU_y_h_probs; 59 | NVMatrix *GPU_y_h2, *GPU_y_h2_probs; 60 | NVMatrix *GPU_y_p; 61 | NVMatrix *GPU_y_v, *GPU_y_v_probs; 62 | NVMatrix *GPU_d_w, *GPU_d_hbias; 63 | NVMatrix *GPU_d_w_pre, *GPU_d_hbias_pre; 64 | NVMatrix *GPU_d_hbias_tmp; 65 | NVMatrix *GPU_d_h_sum_tmp; 66 | 67 | curandGenerator_t rnd_gen; 68 | int rnd_num; 69 | float *rnd_array; 70 | 71 | void GPU_convolution_forward(float*, float*, float*, float*); 72 | void GPU_max_pooling(float*, float*, float*); 73 | void GPU_convolution_backward(float*, float*, float*, float*, float*); 74 | void GPU_compute_d_w(float*, float*, float*, bool); 75 | 76 | CRBM(int, int, int, int, int, int, int, int, 77 | Matrix*, Matrix*, Matrix*); 78 | ~CRBM(); 79 | 80 | void start(); 81 | void run_batch(int, int, int, Matrix&); 82 | 83 | private: 84 | Matrix* filter_init(int, int, int); 85 | }; 86 | 87 | #endif 88 | -------------------------------------------------------------------------------- /src/nvmatrix_kernel.cu: -------------------------------------------------------------------------------- 1 | #include "nvmatrix_kernel.cuh" 2 | 3 | __global__ void _init_mat(float *m, float val, int len){ 4 | int tid = blockIdx.x * blockDim.x + threadIdx.x; 5 | if(tid < len){ 6 | m[tid] = val; 7 | } 8 | } 9 | 10 | __global__ void _copy_mat(float *m, float* target, int len){ 11 | int tid = blockIdx.x * blockDim.x + threadIdx.x; 12 | if(tid < len){ 13 | target[tid] = m[tid]; 14 | } 15 | } 16 | 17 | __global__ void _ele_scale(float *m, float *target, float scaler, int len){ 18 | int tid = blockIdx.x * blockDim.x + threadIdx.x; 19 | if(tid < len){ 20 | target[tid] = scaler * m[tid]; 21 | } 22 | } 23 | 24 | __global__ void _ele_add(float *m, float *target, float val, int len){ 25 | int tid = blockIdx.x * blockDim.x + threadIdx.x; 26 | if(tid < len){ 27 | target[tid] = val + m[tid]; 28 | } 29 | } 30 | 31 | __global__ void _mat_add(float *ma, float *mb, float *target, float sa, float sb, int len){ 32 | int tid = blockIdx.x * blockDim.x + threadIdx.x; 33 | if(tid < len){ 34 | target[tid] = sa * ma[tid] + sb * mb[tid]; 35 | } 36 | } 37 | 38 | __global__ void _mat_mul(float *ma, float *mb, float *target, int len){ 39 | int tid = blockIdx.x * blockDim.x + threadIdx.x; 40 | if(tid < len){ 41 | target[tid] = ma[tid] * mb[tid]; 42 | } 43 | } 44 | 45 | __global__ void _mat_sum_row_fast(float *m, float *target,int nrow, int ncol, int agg_col){ 46 | int tx = blockIdx.x * blockDim.x + threadIdx.x; 47 | 48 | __shared__ float accum[NUM_THREAD_PER_ROW]; 49 | 50 | if(tx < ncol){ 51 | accum[threadIdx.x] = m[blockIdx.y*ncol+tx]; 52 | }else{ 53 | accum[threadIdx.x] = 0.0f; 54 | } 55 | __syncthreads(); 56 | 57 | if(NUM_THREAD_PER_ROW >= 512){ 58 | if(threadIdx.x < 256) 59 | accum[threadIdx.x] += accum[threadIdx.x+256]; 60 | __syncthreads(); 61 | } 62 | 63 | if(NUM_THREAD_PER_ROW >= 256){ 64 | if(threadIdx.x < 128) 65 | accum[threadIdx.x] += accum[threadIdx.x+128]; 66 | __syncthreads(); 67 | } 68 | 69 | //NUM_THREAD_PER_ROW at least 128 70 | if(threadIdx.x < 64) 71 | accum[threadIdx.x] += accum[threadIdx.x+64]; 72 | __syncthreads(); 73 | 74 | if(threadIdx.x < 32){ 75 | accum[threadIdx.x] += accum[threadIdx.x+32]; 76 | accum[threadIdx.x] += accum[threadIdx.x+16]; 77 | accum[threadIdx.x] += accum[threadIdx.x+8]; 78 | accum[threadIdx.x] += accum[threadIdx.x+4]; 79 | accum[threadIdx.x] += 
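// unrolled tail of the shared-memory tree reduction: the remaining strides (32 down to 1) are handled by the first warp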
accum[threadIdx.x+2]; 80 | accum[threadIdx.x] += accum[threadIdx.x+1]; 81 | } 82 | target[blockIdx.y*agg_col+blockIdx.x] = accum[0]; 83 | } 84 | 85 | __global__ void _mat_sum_row(float *m, float *target,int nrow, int ncol){ 86 | int tid = blockIdx.x * blockDim.x + threadIdx.x; 87 | 88 | if(tid < nrow){ 89 | float sum = 0; 90 | for(int i = 0; i < ncol; i++){ 91 | sum += m[tid*ncol+i]; 92 | } 93 | target[tid] = sum; 94 | } 95 | } 96 | 97 | __global__ void _mat_sum_col(float *m, float *target,int nrow, int ncol){ 98 | int tid = blockIdx.x * blockDim.x + threadIdx.x; 99 | 100 | if(tid < ncol){ 101 | float sum = 0; 102 | for(int i = 0; i < nrow; i++){ 103 | sum += m[i*ncol+tid]; 104 | } 105 | target[tid] = sum; 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /src/pyNVCRBM.cu: -------------------------------------------------------------------------------- 1 | extern "C"{ 2 | #include 3 | } 4 | #include 5 | #include "matrix.h" 6 | #include "nvmatrix.cuh" 7 | #include "crbm.cuh" 8 | #include 9 | #include 10 | #include 11 | #include "utils.h" 12 | 13 | using namespace std; 14 | 15 | static CRBM* crbm = NULL; 16 | 17 | static PyArrayObject* copy_host_matrix(Matrix& src){ 18 | PyArrayObject* ret; 19 | int dims[2]; 20 | 21 | dims[0] = src.get_row_num(); 22 | dims[1] = src.get_col_num(); 23 | ret = (PyArrayObject*)PyArray_FromDims(2, dims, NPY_FLOAT); 24 | 25 | memcpy(ret->data, src.get_data(), sizeof(float) * src.get_ele_num()); 26 | 27 | return ret; 28 | } 29 | 30 | static PyObject* 31 | init(PyObject *self, PyObject *args){ 32 | PyArrayObject *pyfilter; 33 | PyArrayObject *pyinit_filter, *pyinit_hbias, *pyinit_vbias; 34 | int filter_num; 35 | int filter_size; 36 | int input_num; 37 | int input_size; 38 | int input_group_num; 39 | int pooling_rate; 40 | int left_upper_padding, right_low_padding; 41 | 42 | if(!PyArg_ParseTuple(args, "iiiiiiiiO!O!O!", 43 | &filter_num, &filter_size, 44 | &input_num, &input_size, 45 | &input_group_num, &pooling_rate, 46 | &left_upper_padding, &right_low_padding, 47 | &PyArray_Type, &pyinit_filter, 48 | &PyArray_Type, &pyinit_hbias, 49 | &PyArray_Type, &pyinit_vbias)) 50 | return NULL; 51 | 52 | Matrix init_filter(pyinit_filter); 53 | Matrix init_hbias(pyinit_hbias); 54 | Matrix init_vbias(pyinit_vbias); 55 | 56 | crbm = new CRBM(filter_num, filter_size, 57 | input_num, input_size, input_group_num, 58 | left_upper_padding, right_low_padding, 59 | pooling_rate, &init_filter, //&filter, 60 | &init_hbias, &init_vbias); 61 | 62 | pyfilter = copy_host_matrix(*crbm->CPU_filters); 63 | 64 | return PyArray_Return(pyfilter); 65 | //return Py_BuildValue("i", 0); 66 | } 67 | 68 | static PyObject* 69 | run_batch(PyObject *self, PyObject *args){ 70 | PyArrayObject *pybatch_data; 71 | int cur_trail, cur_image, cur_batch; 72 | 73 | if(!PyArg_ParseTuple(args, "iiiO!", 74 | &cur_trail, &cur_image, &cur_batch, 75 | &PyArray_Type, &pybatch_data)){ 76 | return NULL; 77 | } 78 | Matrix batch_data(pybatch_data); 79 | 80 | crbm->run_batch(cur_trail, cur_image, cur_batch, batch_data); 81 | 82 | return Py_BuildValue("i", 0); 83 | } 84 | 85 | static PyObject* 86 | get_gpu_filters(PyObject *self, PyObject *args){ 87 | PyArrayObject *pyfilter; 88 | 89 | Matrix *tmp_filter = new Matrix(*crbm->CPU_filters); 90 | crbm->GPU_filters->assign(*tmp_filter); 91 | pyfilter = copy_host_matrix(*tmp_filter); 92 | delete tmp_filter; 93 | 94 | return PyArray_Return(pyfilter); 95 | } 96 | 97 | static PyObject* 98 | get_cpu_filters(PyObject *self, PyObject 
*args){ 99 | PyArrayObject *pyfilter; 100 | 101 | pyfilter = copy_host_matrix(*crbm->CPU_filters); 102 | 103 | return PyArray_Return(pyfilter); 104 | } 105 | 106 | static PyObject* 107 | get_gpu_hbias(PyObject *self, PyObject *args){ 108 | PyArrayObject *pyhbias; 109 | 110 | Matrix *tmp_hbias = new Matrix(*crbm->CPU_hbias); 111 | crbm->GPU_hbias->assign(*tmp_hbias); 112 | pyhbias = copy_host_matrix(*tmp_hbias); 113 | delete tmp_hbias; 114 | 115 | return PyArray_Return(pyhbias); 116 | } 117 | 118 | static PyObject* 119 | print_result(PyObject *self, PyObject *args){ 120 | cout << "ferr : " << crbm->ferr / 20.0 << endl; 121 | cout << "sparsity : " << crbm->sparsity / 20.0 << endl; 122 | crbm->ferr = 0.0; 123 | crbm->sparsity = 0.0; 124 | return Py_BuildValue("i", 0); 125 | } 126 | 127 | static PyMethodDef PyNVcrbmMethods[] = { 128 | {"get_gpu_filters", get_gpu_filters, METH_VARARGS, "Get the filter weight matrix"}, 129 | {"get_cpu_filters", get_cpu_filters, METH_VARARGS, "Get the filter weight matrix"}, 130 | {"get_gpu_hbias", get_gpu_hbias, METH_VARARGS, "Get gpu hidden layer bias"}, 131 | {"print_result", print_result, METH_VARARGS, "print result"}, 132 | {"run_batch", run_batch, METH_VARARGS, "Run a batch"}, 133 | {"init", init, METH_VARARGS, "Initialize the convolutional RBM"}, 134 | {NULL, NULL, 0, NULL} 135 | }; 136 | 137 | PyMODINIT_FUNC 138 | initnvcrbm(void){ 139 | (void)Py_InitModule("nvcrbm", PyNVcrbmMethods); 140 | _import_array(); 141 | srand(1234); 142 | //srand(time(NULL)); 143 | } 144 | -------------------------------------------------------------------------------- /script/preprocess.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import numpy as np 3 | import Image 4 | from glob import glob 5 | import os 6 | import math 7 | import cPickle 8 | 9 | def crop_image(img, target): 10 | width = img.size[0] 11 | height = img.size[1] 12 | if width > height: 13 | ratio = target * 1.0 / height 14 | newWidth = int(width * ratio) 15 | newHeight = target 16 | img_ret = img.resize((newWidth, newHeight), Image.ANTIALIAS) 17 | img_ret = img_ret.crop(((newWidth - target)/2, 0, 18 | (newWidth - target)/2 + target, target)) 19 | else: 20 | ratio = target * 1.0 / width 21 | newWidth = target 22 | newHeight = int(height * ratio) 23 | img_ret = img.resize((newWidth, newHeight)) 24 | img_ret = img_ret.crop((0, (newHeight - target)/2, 25 | target, (newHeight - target)/2 + target)) 26 | return img_ret 27 | 28 | def preprocess_caltech101_image(): 29 | caltech101_image_path = "/home/rolexye/project/data/101_ObjectCategories/Faces/" 30 | for idx, img_file in enumerate(glob(os.path.join(caltech101_image_path, "*.jpg"))): 31 | img = Image.open(img_file) 32 | img = img.convert("L") 33 | img = crop_image(img, 150) 34 | img.save("../data/caltech101/faces/%d.png" % idx) 35 | 36 | def divide_kyoto_image(img_num, div_num): 37 | img = [] 38 | image_path = "../data/kyoto" 39 | for idx in xrange(img_num): 40 | img.append(Image.open("%s/%d.tif" % (image_path, idx))) 41 | 42 | width = img[0].size[0] 43 | height = img[0].size[1] 44 | 45 | for div_idx in xrange(div_num): 46 | rnd_img_idx = np.random.randint(0, img_num) 47 | x = np.random.randint(0, width - 64) 48 | y = np.random.randint(0, height - 64) 49 | div_img = img[rnd_img_idx].crop((x, y, x+64, y+64)) 50 | div_img.save("%s/divide/%d.png" % (image_path, div_idx)) 51 | 52 | def raw_image_2_pkl(image_path, image_suffix, image_count, data_path): 53 | """ 54 | import numpy as np 55 | import Image 56 | 
from glob import glob 57 | import os 58 | import math 59 | import cPickle 60 | image_path = "../data/kyoto" 61 | image_suffix = "tif" 62 | idx = 1 63 | img = Image.open("%s/%d.%s" % (image_path, idx, image_suffix)) 64 | img_data = np.asarray(img.getdata()).reshape(img.size) 65 | img_data = img_data - img_data.mean() 66 | img_data = img_data / img_data.std() 67 | width = img_data.shape[0] 68 | height = img_data.shape[1] 69 | fx, fy = np.meshgrid(np.linspace(-width/2, width/2-1, width), np.linspace(-height/2, height/2-1, height)) 70 | rho = np.sqrt(fx * fx + fy * fy) 71 | f0 = 0.4 * np.mean([width, height]) 72 | filt = rho * np.exp(- np.power(rho/f0, 4)) 73 | If = np.fft.fft2(img_data) 74 | img_data = np.real(np.fft.ifft2(If * np.fft.fftshift(filt))) 75 | img_data = img_data / img_data.std() 76 | img_data = img_data - img_data.mean() 77 | img_data = img_data / np.sqrt(np.mean(np.power(img_data, 2))) 78 | img_data = np.sqrt(0.1) * img_data 79 | """ 80 | data = [] 81 | for idx in xrange(image_count+1): 82 | try: 83 | img = Image.open("%s/%d.%s" % (image_path, idx, image_suffix)) 84 | img_data = np.asarray(img.getdata()).reshape(img.size) 85 | img_data = img_data - img_data.mean() 86 | img_data = img_data / img_data.std() 87 | width = img_data.shape[0] 88 | height = img_data.shape[1] 89 | fx, fy = np.meshgrid(np.linspace(-width/2, width/2-1, width), np.linspace(-height/2, height/2-1, height)) 90 | rho = np.sqrt(fx * fx + fy * fy) 91 | f0 = 0.4 * np.mean([width, height]) 92 | filt = rho * np.exp(- np.power(rho/f0, 4)) 93 | If = np.fft.fft2(img_data) 94 | img_data = np.real(np.fft.ifft2(If * np.fft.fftshift(filt))) 95 | img_data = img_data / img_data.std() 96 | img_data = img_data - img_data.mean() 97 | img_data = img_data / np.sqrt(np.mean(np.power(img_data, 2))) 98 | img_data = np.sqrt(0.1) * img_data 99 | data.append(img_data) 100 | except: 101 | pass 102 | cPickle.dump(data, open(data_path, "w+")) 103 | 104 | if __name__ == "__main__": 105 | #divide_kyoto_image(10, 500) 106 | #preprocess_kyoto_image() 107 | #preprocess_caltech101_image() 108 | #raw_image_2_pkl("../data/caltech101/faces", "png", 100, "../data/faces_train.pkl") 109 | raw_image_2_pkl("../data/kyoto", "tif", 10, "../data/kyoto_large_train.pkl") 110 | #raw_image_2_pkl("../data/kyoto/divide", "png", 500, "../data/kyoto_train.pkl") 111 | 112 | -------------------------------------------------------------------------------- /src/nvmatrix.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "nvmatrix.cuh" 5 | #include "nvmatrix_kernel.cuh" 6 | 7 | using namespace std; 8 | 9 | NVMatrix::NVMatrix(Matrix &m){ 10 | this->nrow = m.get_row_num(); 11 | this->ncol = m.get_col_num(); 12 | this->own = true; 13 | this->trans = m.get_trans(); 14 | cudaMalloc((void**)&this->data, nrow * ncol * sizeof(float)); 15 | cudaMemcpy(this->data, m.get_data(), nrow * ncol * sizeof(float), 16 | cudaMemcpyHostToDevice); 17 | } 18 | 19 | NVMatrix::NVMatrix(int nrow, int ncol){ 20 | this->nrow = nrow; 21 | this->ncol = ncol; 22 | this->own = true; 23 | this->trans = false; 24 | cudaMalloc((void**)&this->data, nrow * ncol * sizeof(float)); 25 | _init_mat<<>>(this->get_data(), 0.0f, nrow*ncol); 26 | cudaDeviceSynchronize(); 27 | } 28 | 29 | NVMatrix::~NVMatrix(){ 30 | cudaFree(this->data); 31 | } 32 | 33 | void NVMatrix::reshape(int nrow, int ncol){ 34 | this->nrow = nrow; 35 | this->ncol = ncol; 36 | } 37 | 38 | float* NVMatrix::get_data(){ 39 | return this->data; 40 | } 41 | 42 
| int NVMatrix::get_row_num(){ 43 | return this->nrow; 44 | } 45 | 46 | int NVMatrix::get_col_num(){ 47 | return this->ncol; 48 | } 49 | 50 | int NVMatrix::get_ele_num(){ 51 | return this->ncol * this->nrow; 52 | } 53 | 54 | bool NVMatrix::get_trans(){ 55 | return this->trans; 56 | } 57 | 58 | void NVMatrix::assign(Matrix &target){ 59 | assert(this->nrow == target.get_row_num() 60 | && this->ncol == target.get_col_num()); 61 | cudaMemcpy(target.get_data(), this->data, nrow * ncol * sizeof(float), 62 | cudaMemcpyDeviceToHost); 63 | } 64 | 65 | void NVMatrix::mat_init(float val){ 66 | _init_mat<<>>(this->get_data(), val, nrow*ncol); 67 | cudaDeviceSynchronize(); 68 | } 69 | 70 | void NVMatrix::ele_scale(float scaler, NVMatrix& target){ 71 | int len = nrow * ncol; 72 | _ele_scale<<>>(this->get_data(), target.get_data(), 73 | scaler, len); 74 | cudaDeviceSynchronize(); 75 | } 76 | 77 | void NVMatrix::ele_scale(float scaler){ 78 | ele_scale(scaler, *this); 79 | } 80 | 81 | void NVMatrix::ele_add(float val, NVMatrix& target){ 82 | int len = nrow * ncol; 83 | _ele_add<<>>(this->get_data(), target.get_data(), 84 | val, len); 85 | cudaDeviceSynchronize(); 86 | } 87 | 88 | void NVMatrix::ele_add(float val){ 89 | ele_add(val, *this); 90 | } 91 | 92 | float NVMatrix::ele_mean(){ 93 | float mean; 94 | NVMatrix *m = new NVMatrix(1, 1); 95 | int ori_nrow = nrow; 96 | int ori_ncol = ncol; 97 | this->reshape(1, nrow*ncol); 98 | this->mat_sum(0, *m); 99 | this->reshape(ori_nrow, ori_ncol); 100 | 101 | cudaMemcpy(&mean, m->get_data(), sizeof(float), 102 | cudaMemcpyDeviceToHost); 103 | delete m; 104 | 105 | mean /= get_ele_num(); 106 | return mean; 107 | } 108 | 109 | void NVMatrix::mat_sum(int axis, NVMatrix& target){ 110 | if(axis == 1){ //column sum 111 | dim3 blocks = dim3(ceil(ncol / 64.0), 1); 112 | dim3 threads = dim3(64, 1); 113 | _mat_sum_col<<>>(get_data(), target.get_data(), nrow, ncol); 114 | cudaDeviceSynchronize(); 115 | }else{ //row sum 116 | /* 117 | dim3 blocks = dim3(ceil(nrow / 64.0), 1); 118 | dim3 threads = dim3(64, 1); 119 | _mat_sum_row<<>>(get_data(), target.get_data(), nrow, ncol); 120 | */ 121 | 122 | int cur_col = ncol; 123 | NVMatrix* cur_sum_mat = this; 124 | while(cur_col > 1){ 125 | int agg_col = ceil(cur_col * 1.0 / NUM_THREAD_PER_ROW); 126 | NVMatrix* agg_sum_mat = new NVMatrix(nrow, agg_col); 127 | 128 | dim3 blocks = dim3(agg_col, nrow); 129 | dim3 threads = dim3(NUM_THREAD_PER_ROW, 1); 130 | 131 | _mat_sum_row_fast<<>>(cur_sum_mat->get_data(), agg_sum_mat->get_data(), 132 | nrow, cur_col, agg_col); 133 | 134 | if(cur_sum_mat != this) 135 | delete cur_sum_mat; 136 | 137 | cur_sum_mat = agg_sum_mat; 138 | cur_col = agg_col; 139 | } 140 | _copy_mat<<>>(cur_sum_mat->get_data(), target.get_data(), 141 | cur_sum_mat->get_ele_num()); 142 | 143 | if(cur_sum_mat != this) 144 | delete cur_sum_mat; 145 | cudaDeviceSynchronize(); 146 | } 147 | } 148 | 149 | void NVMatrix::mat_add(NVMatrix& m, float sb){ 150 | mat_add(m, *this, 1.0, sb); 151 | } 152 | 153 | void NVMatrix::mat_add(NVMatrix& m, NVMatrix& target, float sa, float sb){ 154 | _mat_add<<get_ele_num() / 64.0), 64>>>(this->get_data(), m.get_data(), 155 | target.get_data(), sa, sb, this->get_ele_num()); 156 | } 157 | 158 | void NVMatrix::mat_mul(NVMatrix& m){ 159 | mat_mul(m, *this); 160 | } 161 | 162 | void NVMatrix::mat_mul(NVMatrix& m, NVMatrix& target){ 163 | _mat_mul<<get_ele_num() / 64.0), 64>>>(this->get_data(), m.get_data(), 164 | target.get_data(), this->get_ele_num()); 165 | } 166 | 167 | void 
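// device-to-device copy of this matrix's contents into target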
NVMatrix::assign(NVMatrix& target){ 168 | _copy_mat<<>>(get_data(), target.get_data(), get_ele_num()); 169 | } 170 | 171 | void NVMatrix::copyFromHost(Matrix& source){ 172 | cudaMemcpy(this->data, source.get_data(), nrow * ncol * sizeof(float), 173 | cudaMemcpyHostToDevice); 174 | } 175 | -------------------------------------------------------------------------------- /src/utils.py: -------------------------------------------------------------------------------- 1 | """ This file contains different utility functions that are not connected 2 | in anyway to the networks presented in the tutorials, but rather help in 3 | processing the outputs into a more understandable way. 4 | 5 | For example ``tile_raster_images`` helps in generating a easy to grasp 6 | image from a set of samples or weights. 7 | """ 8 | 9 | 10 | import numpy 11 | 12 | 13 | def scale_to_unit_interval(ndar, eps=1e-8): 14 | """ Scales all values in the ndarray ndar to be between 0 and 1 """ 15 | ndar = ndar.copy() 16 | ndar -= ndar.min() 17 | ndar *= 1.0 / (ndar.max() + eps) 18 | return ndar 19 | 20 | 21 | def tile_raster_images(X, img_shape, tile_shape, tile_spacing=(0, 0), 22 | scale_rows_to_unit_interval=True, 23 | output_pixel_vals=True): 24 | """ 25 | Transform an array with one flattened image per row, into an array in 26 | which images are reshaped and layed out like tiles on a floor. 27 | 28 | This function is useful for visualizing datasets whose rows are images, 29 | and also columns of matrices for transforming those rows 30 | (such as the first layer of a neural net). 31 | 32 | :type X: a 2-D ndarray or a tuple of 4 channels, elements of which can 33 | be 2-D ndarrays or None; 34 | :param X: a 2-D array in which every row is a flattened image. 35 | 36 | :type img_shape: tuple; (height, width) 37 | :param img_shape: the original shape of each image 38 | 39 | :type tile_shape: tuple; (rows, cols) 40 | :param tile_shape: the number of images to tile (rows, cols) 41 | 42 | :param output_pixel_vals: if output should be pixel values (i.e. int8 43 | values) or floats 44 | 45 | :param scale_rows_to_unit_interval: if the values need to be scaled before 46 | being plotted to [0,1] or not 47 | 48 | 49 | :returns: array suitable for viewing as an image. 50 | (See:`PIL.Image.fromarray`.) 51 | :rtype: a 2-d array with same dtype as X. 52 | 53 | """ 54 | 55 | assert len(img_shape) == 2 56 | assert len(tile_shape) == 2 57 | assert len(tile_spacing) == 2 58 | 59 | # The expression below can be re-written in a more C style as 60 | # follows : 61 | # 62 | # out_shape = [0,0] 63 | # out_shape[0] = (img_shape[0]+tile_spacing[0])*tile_shape[0] - 64 | # tile_spacing[0] 65 | # out_shape[1] = (img_shape[1]+tile_spacing[1])*tile_shape[1] - 66 | # tile_spacing[1] 67 | out_shape = [(ishp + tsp) * tshp - tsp for ishp, tshp, tsp 68 | in zip(img_shape, tile_shape, tile_spacing)] 69 | 70 | if isinstance(X, tuple): 71 | assert len(X) == 4 72 | # Create an output numpy ndarray to store the image 73 | if output_pixel_vals: 74 | out_array = numpy.zeros((out_shape[0], out_shape[1], 4), 75 | dtype='uint8') 76 | else: 77 | out_array = numpy.zeros((out_shape[0], out_shape[1], 4), 78 | dtype=X.dtype) 79 | 80 | #colors default to 0, alpha defaults to 1 (opaque) 81 | if output_pixel_vals: 82 | channel_defaults = [0, 0, 0, 255] 83 | else: 84 | channel_defaults = [0., 0., 0., 1.] 
85 | 86 | for i in xrange(4): 87 | if X[i] is None: 88 | # if channel is None, fill it with zeros of the correct 89 | # dtype 90 | dt = out_array.dtype 91 | if output_pixel_vals: 92 | dt = 'uint8' 93 | out_array[:, :, i] = numpy.zeros(out_shape, 94 | dtype=dt) + channel_defaults[i] 95 | else: 96 | # use a recurrent call to compute the channel and store it 97 | # in the output 98 | out_array[:, :, i] = tile_raster_images( 99 | X[i], img_shape, tile_shape, tile_spacing, 100 | scale_rows_to_unit_interval, output_pixel_vals) 101 | return out_array 102 | 103 | else: 104 | # if we are dealing with only one channel 105 | H, W = img_shape 106 | Hs, Ws = tile_spacing 107 | 108 | # generate a matrix to store the output 109 | dt = X.dtype 110 | if output_pixel_vals: 111 | dt = 'uint8' 112 | out_array = numpy.zeros(out_shape, dtype=dt) 113 | 114 | for tile_row in xrange(tile_shape[0]): 115 | for tile_col in xrange(tile_shape[1]): 116 | if tile_row * tile_shape[1] + tile_col < X.shape[0]: 117 | this_x = X[tile_row * tile_shape[1] + tile_col] 118 | if scale_rows_to_unit_interval: 119 | # if we should scale values to be between 0 and 1 120 | # do this by calling the `scale_to_unit_interval` 121 | # function 122 | this_img = scale_to_unit_interval( 123 | this_x.reshape(img_shape)) 124 | else: 125 | this_img = this_x.reshape(img_shape) 126 | # add the slice to the corresponding position in the 127 | # output array 128 | c = 1 129 | if output_pixel_vals: 130 | c = 255 131 | out_array[ 132 | tile_row * (H + Hs): tile_row * (H + Hs) + H, 133 | tile_col * (W + Ws): tile_col * (W + Ws) + W 134 | ] = this_img * c 135 | return out_array 136 | -------------------------------------------------------------------------------- /src/test_nvcrbm.py: -------------------------------------------------------------------------------- 1 | import cPickle 2 | import numpy as np 3 | import utils 4 | from PIL import Image 5 | 6 | 7 | def dump_filter_image(filters, filename="filters.png"): 8 | img_array = utils.tile_raster_images(filters, (8,8), (4,8), (1,1)) 9 | Image.fromarray(img_array).save(filename) 10 | 11 | def test_load(): 12 | kyoto_data = cPickle.load(open("../data/kyoto_train.pkl", "r")) 13 | 14 | test_data = np.arange(1, 65, dtype="float32") 15 | test_data = np.tile(test_data, 10*3*64).reshape(10, 3*64*64) 16 | test_data = test_data / 64.0; 17 | #test_data = np.array(10*3*64*64*[1], dtype="float32").reshape(10, 3*64*64) 18 | #test_filter = np.array(64*[1]+ 64*[2]+ 64*[3], dtype="float32") 19 | #test_filter = np.tile(test_filter, 32).reshape(32, 3*64) 20 | 21 | 22 | init_filters = np.array(np.random.normal(size=filter_num * channel_num * 23 | filter_size*filter_size), dtype="float32") 24 | init_filters = 0.01 * init_filters.reshape(filter_num, channel_num*filter_size*filter_size) 25 | 26 | init_hbias = np.array([-1.0] * filter_num, dtype="float32").reshape(filter_num, 1) 27 | 28 | init_vbias = np.array([0.0] * channel_num, dtype="float32").reshape(channel_num, 1) 29 | 30 | libnvcrbm = __import__("nvcrbm") 31 | cur_filters = libnvcrbm.init(filter_num, filter_size, 32 | input_batch_num, input_size, channel_num, 33 | pooling_rate, left_upper_padding, right_lower_padding, 34 | init_filters, init_hbias, init_vbias) 35 | #init_filter = libnvcrbm.init(32, 8, 10, 64, 3, 2, 4, 3) 36 | 37 | batch_num = 500 38 | batch_size = 2 39 | for batch_idx in xrange(batch_num/batch_size): 40 | batch_data = kyoto_data[batch_idx*batch_size: 41 | (batch_idx+1)*batch_size] 42 | batch_data = np.asarray(batch_data).reshape(batch_size, 43 | 
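# each row holds one image flattened to channel_num * input_size * input_size values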
channel_num * input_size * input_size) 44 | libnvcrbm.run_batch(batch_data) 45 | if batch_idx % 10 == 0: 46 | cur_filters = libnvcrbm.get_filters() 47 | dump_filter_image(cur_filters, "../data/kyoto/filters/batch_%d.png" % batch_idx) 48 | 49 | def train(trial_num, image_num, filter_num, filter_size, input_size, channel_num, pooling_rate, left_upper_padding, right_lower_padding): 50 | """ 51 | import cPickle 52 | import numpy as np 53 | import utils 54 | from PIL import Image 55 | filter_num = 32 56 | filter_size = 8 57 | input_batch_num = 10 58 | input_size = 64 59 | channel_num = 1 60 | pooling_rate = 2 61 | left_upper_padding = 4 62 | right_lower_padding = 3 63 | image_num = 10 64 | imgs = cPickle.load(open("../data/kyoto_large_train.pkl", "r")) 65 | img_size = imgs[0].shape[0] 66 | 67 | for trial_idx in xrange(trial_num): 68 | for img_idx in xrange(image_num): 69 | row_idx = np.arange(0, input_size) + np.random.random_integers(img_size - 2 * filter_size - input_size) + filter_size - 1 70 | col_idx = np.arange(0, input_size) + np.random.random_integers(img_size - 2 * filter_size - input_size) + filter_size - 1 71 | """ 72 | 73 | input_batch_num = 1 74 | batch_num = 2 75 | 76 | init_filters = np.array(np.random.normal(size=filter_num * channel_num * 77 | filter_size*filter_size), dtype="float32") 78 | #init_filters = np.array([1.0] * filter_num * channel_num * filter_size * filter_size, dtype="float32") 79 | init_filters = 0.01 * init_filters.reshape(filter_num, channel_num*filter_size*filter_size) 80 | 81 | init_hbias = np.array([-0.1] * filter_num, dtype="float32").reshape(filter_num, 1) 82 | 83 | init_vbias = np.array([0.0] * channel_num, dtype="float32").reshape(channel_num, 1) 84 | 85 | libnvcrbm = __import__("nvcrbm") 86 | cur_filters = libnvcrbm.init(filter_num, filter_size, 87 | input_batch_num, input_size, channel_num, 88 | pooling_rate, left_upper_padding, right_lower_padding, 89 | init_filters, init_hbias, init_vbias) 90 | 91 | imgs = cPickle.load(open("../data/kyoto_large_train.pkl", "r")) 92 | img_size = imgs[0].shape[0] 93 | 94 | for trial_idx in xrange(trial_num): 95 | for img_idx in xrange(image_num): 96 | for batch_idx in xrange(batch_num): 97 | row_idx = np.arange(0, input_size) + np.random.random_integers(img_size - 2 * filter_size - input_size) + filter_size - 1 98 | col_idx = np.arange(0, input_size) + np.random.random_integers(img_size - 2 * filter_size - input_size) + filter_size - 1 99 | #row_idx = np.arange(0, input_size) + 200 100 | #col_idx = np.arange(0, input_size) + 200 101 | 102 | batch_data = imgs[img_idx][row_idx][:,col_idx] 103 | batch_data = batch_data - batch_data.mean() 104 | batch_data = np.asarray(batch_data.reshape(1, input_size * input_size), dtype="float32") 105 | 106 | libnvcrbm.run_batch(trial_idx, img_idx, batch_idx, batch_data) 107 | 108 | libnvcrbm.print_result() 109 | cur_filters = libnvcrbm.get_gpu_filters() 110 | dump_filter_image(cur_filters, "../data/kyoto/filters/trial_%d.png" % trial_idx) 111 | 112 | first_layer = {} 113 | first_layer["filters"] = cur_filters 114 | first_layer["bias"] = libnvcrbm.get_gpu_hbias() 115 | cPickle.dump(first_layer, open("../data/first_layer.dat", "w+")) 116 | 117 | 118 | if __name__ == "__main__": 119 | trial_num = 500 120 | filter_num = 32 121 | filter_size = 8 122 | input_batch_num = 10 123 | input_size = 64 124 | channel_num = 1 125 | pooling_rate = 2 126 | left_upper_padding = 4 127 | right_lower_padding = 3 128 | image_num = 10 129 | 130 | train(trial_num, image_num, filter_num, filter_size, 
input_size, channel_num, pooling_rate, left_upper_padding, right_lower_padding) 131 | 132 | -------------------------------------------------------------------------------- /src/matrix.cpp: -------------------------------------------------------------------------------- 1 | #include "matrix.h" 2 | #include "utils.h" 3 | #include 4 | #include 5 | #include 6 | 7 | using namespace std; 8 | 9 | Matrix::Matrix(PyArrayObject *pyarr){ 10 | this->nrow = PyArray_DIM(pyarr, 0); 11 | this->ncol = PyArray_DIM(pyarr, 1); 12 | /* 13 | if(pyarr->flags & NPY_ARRAY_C_CONTIGUOUS){ 14 | this->data = reinterpret_cast(pyarr->data); 15 | this->own = false; 16 | }else{*/ 17 | 18 | this->data = new float[this->nrow * this->ncol]; 19 | for(int i = 0; i < this->nrow; i++){ 20 | for(int j = 0; j < this->ncol; j++){ 21 | (*this)(i, j) = *reinterpret_cast(PyArray_GETPTR2(pyarr, i, j)); 22 | } 23 | } 24 | this->own = true; 25 | //} 26 | this->trans = false; 27 | } 28 | 29 | Matrix::Matrix(int nrow, int ncol, float low, float upper){ 30 | this->nrow = nrow; 31 | this->ncol = ncol; 32 | this->own = true; 33 | this->trans = false; 34 | this->data = new float[nrow * ncol]; 35 | for(int i = 0; i < nrow; i++){ 36 | for(int j = 0; j < ncol; j++){ 37 | (*this)(i, j) = random_float(low, upper); 38 | } 39 | } 40 | } 41 | 42 | Matrix::Matrix(int nrow, int ncol){ 43 | this->nrow = nrow; 44 | this->ncol = ncol; 45 | this->own = true; 46 | this->trans = false; 47 | this->data = new float[nrow * ncol]; 48 | for(int i = 0; i < nrow; i++){ 49 | for(int j = 0; j < ncol; j++){ 50 | (*this)(i, j) = 0; 51 | } 52 | } 53 | } 54 | 55 | Matrix::Matrix(Matrix& target){ 56 | this->nrow = target.nrow; 57 | this->ncol = target.ncol; 58 | this->own = true; 59 | this->trans = target.trans; 60 | this->data = new float[nrow * ncol]; 61 | memcpy(this->data, target.get_data(), nrow * ncol * sizeof(float)); 62 | } 63 | 64 | Matrix::~Matrix(){ 65 | if(this->own) 66 | delete this->data; 67 | } 68 | 69 | float* Matrix::get_data(){ 70 | return this->data; 71 | } 72 | 73 | int Matrix::get_row_num(){ 74 | return this->nrow; 75 | } 76 | 77 | int Matrix::get_col_num(){ 78 | return this->ncol; 79 | } 80 | 81 | int Matrix::get_ele_num(){ 82 | return nrow * ncol; 83 | } 84 | 85 | bool Matrix::get_trans(){ 86 | return this->trans; 87 | } 88 | 89 | void Matrix::mat_init(float val){ 90 | for(int i = 0; i < nrow; i++){ 91 | for(int j = 0; j < ncol; j++){ 92 | (*this)(i, j) = val; 93 | } 94 | } 95 | } 96 | 97 | bool Matrix::equal_value(Matrix &target){ 98 | this->equal_value(target, 1e-5); 99 | } 100 | 101 | bool Matrix::equal_value(Matrix &target, float e){ 102 | assert(this->nrow == target.get_row_num() && 103 | this->ncol == target.get_col_num()); 104 | for(int i = 0; i < this->nrow; i++) 105 | for(int j = 0; j < this->ncol; j++){ 106 | if(!float_equal((*this)(i, j), target(i,j), e)){ 107 | cout << "this(" << i << j << "):" << (*this)(i, j) << endl; 108 | cout << "target(" << i << j << "):" << target(i, j) << endl; 109 | return false; 110 | } 111 | } 112 | 113 | cout << "same" << endl; 114 | return true; 115 | } 116 | 117 | void Matrix::ele_add(float val){ 118 | ele_add(val, *this); 119 | } 120 | 121 | void Matrix::ele_add(float val, Matrix& target){ 122 | for(int i = 0; i < this->nrow; i++) 123 | for(int j = 0; j < this->ncol; j++){ 124 | target(i, j) = (*this)(i, j) + val; 125 | } 126 | } 127 | 128 | void Matrix::ele_scale(float scaler){ 129 | ele_scale(scaler, *this); 130 | } 131 | 132 | void Matrix::ele_scale(float scaler, Matrix& target){ 133 | for(int i = 0; i 
< this->nrow; i++) 134 | for(int j = 0; j < this->ncol; j++){ 135 | target(i, j) = (*this)(i, j) * scaler; 136 | } 137 | } 138 | 139 | void Matrix::mat_sum(int axis, Matrix& target){ 140 | if(axis == 0){ 141 | for(int i = 0; i < this->nrow; i++){ 142 | float sum = 0.0; 143 | for(int j = 0; j < this->ncol; j++){ 144 | sum += (*this)(i, j); 145 | } 146 | target(i, 0) = sum; 147 | } 148 | }else{ 149 | for(int i = 0; i < this->ncol; i++){ 150 | float sum = 0.0; 151 | for(int j = 0; j < this->nrow; j++){ 152 | sum += (*this)(j, i); 153 | } 154 | target(0, i) = sum; 155 | } 156 | } 157 | } 158 | 159 | void Matrix::mat_add(Matrix& m, float sb){ 160 | mat_add(m, *this, 1.0, sb); 161 | } 162 | 163 | void Matrix::mat_add(Matrix& m, Matrix& target, float sa, float sb){ 164 | for(int i = 0; i < this->nrow; i++) 165 | for(int j = 0; j < this->ncol; j++){ 166 | target(i, j) = sa * (*this)(i, j) + sb * m(i, j); 167 | } 168 | } 169 | 170 | void Matrix::assign(Matrix& target){ 171 | for(int i = 0; i < this->nrow; i++) 172 | for(int j = 0; j < this->ncol; j++){ 173 | target(i, j) = (*this)(i, j); 174 | } 175 | } 176 | 177 | float Matrix::ele_mean(){ 178 | float mean = 0.0f; 179 | for(int i = 0; i < this->nrow; i++) 180 | for(int j = 0; j < this->ncol; j++){ 181 | mean += (*this)(i, j); 182 | } 183 | mean /= get_ele_num(); 184 | return mean; 185 | } 186 | 187 | void Matrix::mat_mul(Matrix& m, Matrix& target){ 188 | for(int i = 0; i < this->nrow; i++) 189 | for(int j = 0; j < this->ncol; j++){ 190 | target(i, j) = (*this)(i, j) * m(i, j); 191 | } 192 | } 193 | 194 | void Matrix::mat_mul(Matrix& m){ 195 | mat_mul(m, *this); 196 | } 197 | 198 | void Matrix::reshape(int nrow, int ncol){ 199 | this->nrow = nrow; 200 | this->ncol = ncol; 201 | } 202 | 203 | bool Matrix::check_nan(){ 204 | for(int i = 0; i < this->nrow; i++) 205 | for(int j = 0; j < this->ncol; j++){ 206 | if(isnan((*this)(i, j))){ 207 | return false; 208 | } 209 | } 210 | return true; 211 | } 212 | -------------------------------------------------------------------------------- /src/crbm_kernel.cu: -------------------------------------------------------------------------------- 1 | #include "crbm_kernel.cuh" 2 | 3 | using namespace std; 4 | 5 | __global__ void convolution_forward_kernel(float *input, 6 | float *filters, float *feature_map, float *hbias, int input_size, 7 | int channel_num, int feature_map_size, int filter_size, 8 | int filter_num, int lu_padding, float sigma){ 9 | __shared__ float shImg[32+MAX_FILETER_SIZE-1][32+MAX_FILETER_SIZE-1]; 10 | __shared__ float shFilter[MAX_FILETER_SIZE][MAX_FILETER_SIZE]; 11 | 12 | int imgIdx = blockIdx.y / (input_size / 32); 13 | int filterIdx = blockIdx.x / (input_size / 32); 14 | int tx = blockIdx.x % (input_size / 32) * 32 + threadIdx.x; 15 | int ty = blockIdx.y % (input_size / 32) * 32 + threadIdx.y; 16 | 17 | float *target = feature_map + 18 | imgIdx * feature_map_size * feature_map_size * filter_num + 19 | feature_map_size * feature_map_size * filterIdx + 20 | ty * feature_map_size + tx; 21 | 22 | float local_target = 0.0f; 23 | 24 | for(int g = 0; g < channel_num; g++){ 25 | 26 | if(threadIdx.x < filter_size && threadIdx.y < filter_size){ 27 | shFilter[threadIdx.y][threadIdx.x] = 28 | filters[filterIdx * channel_num * filter_size * filter_size + 29 | + g * filter_size * filter_size + 30 | threadIdx.y * filter_size + threadIdx.x]; 31 | } 32 | __syncthreads(); 33 | 34 | float *img = input + imgIdx * input_size * input_size * channel_num 35 | + g * input_size * input_size; 36 | 37 | float 
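// load the 32x32 input tile plus its (MAX_FILETER_SIZE-1)-wide apron into shared memory, writing zeros for the padded border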
*shImgLoad = &shImg[threadIdx.y][threadIdx.x]; 38 | if(tx < lu_padding || ty < lu_padding){ 39 | *shImgLoad = 0; 40 | }else{ 41 | *shImgLoad = img[(ty-lu_padding) * input_size + (tx-lu_padding)]; 42 | } 43 | 44 | if(threadIdx.x < MAX_FILETER_SIZE-1){ 45 | shImgLoad = &shImg[threadIdx.y][threadIdx.x+32]; 46 | if(ty < lu_padding || (tx+32) >= (input_size+lu_padding)){ 47 | *shImgLoad = 0; 48 | }else{ 49 | *shImgLoad = img[(ty-lu_padding) * input_size + 50 | (tx+32-lu_padding)]; 51 | } 52 | } 53 | 54 | if(threadIdx.y < MAX_FILETER_SIZE-1){ 55 | shImgLoad = &shImg[threadIdx.y+32][threadIdx.x]; 56 | if(tx < lu_padding || (ty+32) >= (input_size+lu_padding)){ 57 | *shImgLoad = 0; 58 | }else{ 59 | *shImgLoad = img[(ty+32-lu_padding) * input_size + 60 | (tx-lu_padding)]; 61 | } 62 | 63 | if(threadIdx.x < MAX_FILETER_SIZE-1){ 64 | shImgLoad = &shImg[threadIdx.y+32][threadIdx.x+32]; 65 | if((ty+32) >= (input_size+lu_padding) || 66 | (tx+32) >= (input_size+lu_padding)){ 67 | *shImgLoad = 0; 68 | }else{ 69 | *shImgLoad = img[(ty+32-lu_padding) * input_size + 70 | (tx+32-lu_padding)]; 71 | } 72 | } 73 | } 74 | __syncthreads(); 75 | 76 | float *imgPtr = &shImg[threadIdx.y][threadIdx.x]; 77 | 78 | for(int i = 0; i < filter_size; i++){ 79 | for(int j = 0; j < filter_size; j++){ 80 | local_target += imgPtr[j] * shFilter[i][j]; 81 | } 82 | imgPtr += 32 + MAX_FILETER_SIZE - 1; 83 | } 84 | 85 | __syncthreads(); 86 | 87 | } 88 | 89 | local_target += hbias[filterIdx]; 90 | local_target *= __fdividef(1.0f , sigma * sigma); 91 | *target = local_target; 92 | 93 | } 94 | 95 | __global__ void max_pooling_kernel(float *feature_map, float *probs, float *target, 96 | int feature_map_size, int feature_map_num, int pooling_rate, 97 | float *rnd_array, int rnd_num){ 98 | __shared__ float shFm[16*MAX_POOLING_RATE][16*MAX_POOLING_RATE]; 99 | 100 | int imgIdx = blockIdx.y / (feature_map_size / 16 / pooling_rate); 101 | int fmIdx = blockIdx.x / (feature_map_size / 16 / pooling_rate); 102 | int tx = (blockIdx.x % (feature_map_size / pooling_rate / 16)) * 16 + threadIdx.x; 103 | int ty = (blockIdx.y % (feature_map_size / pooling_rate / 16)) * 16 + threadIdx.y; 104 | int subsample_size = feature_map_size / pooling_rate; 105 | 106 | int rnd_index = ((blockIdx.y * blockDim.y + threadIdx.y) * (blockIdx.x * blockDim.x) + threadIdx.x ) % rnd_num; 107 | float rnd = rnd_array[rnd_index]; 108 | 109 | float *fm = feature_map + imgIdx * feature_map_num * feature_map_size * feature_map_size + 110 | fmIdx * feature_map_size * feature_map_size; 111 | 112 | probs = probs + imgIdx * feature_map_num * feature_map_size * feature_map_size + 113 | fmIdx * feature_map_size * feature_map_size; 114 | 115 | target = target + imgIdx * feature_map_num * subsample_size * subsample_size + 116 | fmIdx * subsample_size * subsample_size; 117 | 118 | for(int i = 0; i < pooling_rate; i++){ 119 | for(int j = 0; j < pooling_rate; j++){ 120 | shFm[threadIdx.y*pooling_rate+i][threadIdx.x*pooling_rate+j] = 121 | fm[(ty*pooling_rate+i) * feature_map_size + (tx*pooling_rate+j)]; 122 | } 123 | } 124 | 125 | __syncthreads(); 126 | 127 | float sum = 0; 128 | for(int i = 0; i < pooling_rate; i++){ 129 | for(int j = 0; j < pooling_rate; j++){ 130 | if(shFm[threadIdx.y*pooling_rate+i][threadIdx.x*pooling_rate+j] > 50){ 131 | shFm[threadIdx.y*pooling_rate+i][threadIdx.x*pooling_rate+j] = 50.0f; 132 | } 133 | shFm[threadIdx.y*pooling_rate+i][threadIdx.x*pooling_rate+j] = 134 | __expf(shFm[threadIdx.y*pooling_rate+i][threadIdx.x*pooling_rate+j]); 135 | sum += 
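// accumulate exp(activation) over the pooling block; the block is later normalized by (1 + sum), where the extra 1 is the probability that every unit in the block stays off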
shFm[threadIdx.y*pooling_rate+i][threadIdx.x*pooling_rate+j]; 136 | } 137 | } 138 | for(int i = 0; i < pooling_rate; i++){ 139 | for(int j = 0; j < pooling_rate; j++){ 140 | shFm[threadIdx.y*pooling_rate+i][threadIdx.x*pooling_rate+j] = 141 | __fdividef(shFm[threadIdx.y*pooling_rate+i][threadIdx.x*pooling_rate+j], (1.0f + sum)); 142 | probs[(ty*pooling_rate+i) * feature_map_size + (tx*pooling_rate+j)] = 143 | shFm[threadIdx.y*pooling_rate+i][threadIdx.x*pooling_rate+j]; 144 | fm[(ty*pooling_rate+i) * feature_map_size + (tx*pooling_rate+j)] = 0; 145 | } 146 | } 147 | 148 | sum = 0; 149 | bool isStop = false; 150 | for(int i = 0; i < pooling_rate && !isStop; i++){ 151 | for(int j = 0; j < pooling_rate && !isStop; j++){ 152 | sum += shFm[threadIdx.y*pooling_rate+i][threadIdx.x*pooling_rate+j]; 153 | if(rnd < sum){ 154 | fm[(ty*pooling_rate+i) * feature_map_size + (tx*pooling_rate+j)] = 1; 155 | isStop = true; 156 | } 157 | } 158 | } 159 | if(isStop){ 160 | target[threadIdx.y*subsample_size+threadIdx.x] = 1; 161 | }else{ 162 | target[threadIdx.y*subsample_size+threadIdx.x] = 0; 163 | } 164 | } 165 | 166 | __global__ void convolution_backward_kernel(float *y_h, float *filters, float *vbias, 167 | float *target, float *y_v, 168 | int input_size, int lu_padding, int channel_num, int feature_map_size, 169 | int filter_num, int filter_size, float *rnd_array, int rnd_num){ 170 | int imgIdx = blockIdx.y / (input_size / 16); 171 | int channelIdx = blockIdx.x / (input_size / 16); 172 | int tx = (blockIdx.x % (input_size / 16)) * 16 + threadIdx.x; 173 | int ty = (blockIdx.y % (input_size / 16)) * 16 + threadIdx.y; 174 | int padding = (filter_size - 1); 175 | 176 | int rnd_index = ((blockIdx.y * blockDim.y + threadIdx.y) * (blockIdx.x * blockDim.x) + threadIdx.x ) % rnd_num; 177 | float rnd = rnd_array[rnd_index]; 178 | 179 | __shared__ float shHidden[16+2*(MAX_FILETER_SIZE-1)][16+2*(MAX_FILETER_SIZE-1)]; 180 | __shared__ float shFlipFilter[MAX_FILETER_SIZE][MAX_FILETER_SIZE]; 181 | float local_target = 0.0f; 182 | 183 | target = target + imgIdx * channel_num * input_size * input_size + 184 | channelIdx * input_size * input_size; 185 | 186 | float *target_y_v = y_v + imgIdx * channel_num * input_size * input_size + 187 | channelIdx * input_size * input_size; 188 | 189 | __syncthreads(); 190 | 191 | 192 | for(int f = 0; f < filter_num; f++){ 193 | float *cur_y_h = y_h + imgIdx * filter_num * feature_map_size * feature_map_size + 194 | f * feature_map_size * feature_map_size; 195 | 196 | float *cur_filter = filters + f * channel_num * filter_size * filter_size + 197 | channelIdx * filter_size * filter_size; 198 | 199 | if(threadIdx.x < filter_size && threadIdx.y < filter_size){ 200 | shFlipFilter[threadIdx.y][threadIdx.x] = 201 | cur_filter[(filter_size-1-threadIdx.y)*filter_size + filter_size-1-threadIdx.x]; 202 | } 203 | 204 | float *shHiddenLoad = &shHidden[threadIdx.y][threadIdx.x]; 205 | if(tx < padding || ty < padding){ 206 | *shHiddenLoad = 0; 207 | }else{ 208 | *shHiddenLoad = cur_y_h[(ty-padding) * input_size + 209 | (tx-padding)]; 210 | } 211 | 212 | if(threadIdx.x < 2 * padding){ 213 | shHiddenLoad = &shHidden[threadIdx.y][threadIdx.x+16]; 214 | if(ty < padding || (tx+16) >= (feature_map_size+padding)){ 215 | *shHiddenLoad = 0; 216 | }else{ 217 | *shHiddenLoad = cur_y_h[(ty-padding) * feature_map_size + 218 | (tx+16-padding)]; 219 | } 220 | } 221 | 222 | if(threadIdx.y < 2 * padding){ 223 | shHiddenLoad = &shHidden[threadIdx.y+16][threadIdx.x]; 224 | if(tx < padding || (ty+16) >= 
(feature_map_size+padding)){ 225 | *shHiddenLoad = 0; 226 | }else{ 227 | *shHiddenLoad = cur_y_h[(ty+16-padding) * feature_map_size + 228 | (tx-padding)]; 229 | } 230 | 231 | if(threadIdx.x < 2 * padding){ 232 | shHiddenLoad = &shHidden[threadIdx.y+16][threadIdx.x+16]; 233 | if((ty+16) >= (feature_map_size+padding) || 234 | (tx+16) >= (feature_map_size+padding)){ 235 | *shHiddenLoad = 0; 236 | }else{ 237 | *shHiddenLoad = cur_y_h[(ty+16-padding) * feature_map_size + 238 | (tx+16-padding)]; 239 | } 240 | } 241 | } 242 | 243 | __syncthreads(); 244 | 245 | for(int i = 0; i < filter_size; i++){ 246 | for(int j = 0; j < filter_size; j++){ 247 | target[ty*input_size+tx] += 248 | local_target += 249 | shHidden[threadIdx.y+i+lu_padding][threadIdx.x+j+lu_padding] * 250 | shFlipFilter[i][j]; 251 | } 252 | } 253 | 254 | __syncthreads(); 255 | } 256 | local_target += vbias[channelIdx]; 257 | //local_target = expf(-local_target); 258 | //local_target = __fdividef(1.0f , (1.0f + local_target)); 259 | if(rnd < local_target){ 260 | target_y_v[ty*input_size+tx] = 1; 261 | }else{ 262 | target_y_v[ty*input_size+tx] = 0; 263 | } 264 | target[ty*input_size+tx] = local_target; 265 | } 266 | 267 | __global__ void compute_d_w_kernel(float *v, float *h, float *dw, bool is_init, 268 | int input_size, int lu_padding, int channel_num, int filter_num, 269 | int filter_size, int feature_map_size){ 270 | 271 | int imgIdx = blockIdx.y / (feature_map_size / 32); 272 | int filterIdx = blockIdx.x / (channel_num * feature_map_size / 32); 273 | int channelIdx = (blockIdx.x % (channel_num * feature_map_size / 32)) / 274 | (feature_map_size / 32); 275 | int tx = (blockIdx.x % (channel_num * feature_map_size / 32)) % 276 | (feature_map_size / 32) *32 + threadIdx.x; 277 | int ty = (blockIdx.y % (feature_map_size / 32)) * 32 + threadIdx.y; 278 | 279 | __shared__ float shV[32+MAX_FILETER_SIZE][32+MAX_FILETER_SIZE]; 280 | __shared__ float shH[32][32]; 281 | 282 | float sign; 283 | if(is_init){ 284 | sign = 1.0f; 285 | }else{ 286 | sign = -1.0f; 287 | } 288 | 289 | v = v + imgIdx * channel_num * input_size * input_size + 290 | channelIdx * input_size * input_size; 291 | 292 | h = h + imgIdx * filter_num * feature_map_size * feature_map_size + 293 | filterIdx * feature_map_size * feature_map_size; 294 | 295 | dw = dw + filterIdx * channel_num * filter_size * filter_size + 296 | channelIdx * filter_size * filter_size; 297 | 298 | float local_dw = 0.0f; 299 | 300 | for(int loadX = 0; loadX <= 32; loadX += filter_size){ 301 | for(int loadY = 0; loadY <= 32; loadY += filter_size){ 302 | if(loadX < 32 && loadY < 32){ 303 | //TODO:feature map overflow 304 | shH[threadIdx.y+loadY][threadIdx.x+loadX] = 305 | h[(ty+loadY)*feature_map_size + (tx+loadX)]; 306 | } 307 | if((tx+loadX) < lu_padding || 308 | (ty+loadY) < lu_padding || 309 | (tx+loadX) >= (input_size+lu_padding) || 310 | (ty+loadY) >= (input_size+lu_padding)){ 311 | shV[threadIdx.y+loadY][threadIdx.x+loadX] = 0; 312 | }else{ 313 | shV[threadIdx.y+loadY][threadIdx.x+loadX] = 314 | v[(ty+loadY-lu_padding)*input_size + (tx+loadX-lu_padding)]; 315 | } 316 | } 317 | } 318 | 319 | __syncthreads(); 320 | 321 | for(int i = 0; i < 32; i++){ 322 | for(int j = 0; j < 32; j++){ 323 | local_dw += shV[threadIdx.y+i][threadIdx.x+j] * 324 | shH[i][j]; 325 | } 326 | } 327 | 328 | atomicAdd(dw + threadIdx.y*filter_size + threadIdx.x, sign * local_dw); 329 | } 330 | -------------------------------------------------------------------------------- /src/crbm.cu: 
-------------------------------------------------------------------------------- 1 | #include "matrix.h" 2 | #include "crbm.cuh" 3 | #include 4 | #include "utils.h" 5 | #include "crbm_kernel.cuh" 6 | #include 7 | 8 | using namespace std; 9 | #define CURAND_CALL(x) do { if((x)!=CURAND_STATUS_SUCCESS) { \ 10 | printf("Error at %s:%d\n",__FILE__,__LINE__);\ 11 | return EXIT_FAILURE;}} while(0) 12 | 13 | __global__ void setup_curand_kernel(curandState *state, int count){ 14 | int id = threadIdx.x + blockIdx.x * 64; 15 | if(id < count){ 16 | curand_init(1234, id, 0, &state[id]); 17 | } 18 | } 19 | 20 | void setup_curand(curandState **state, int count){ 21 | CUDA_CALL(cudaMalloc((void**)state, count * sizeof(curandState))); 22 | setup_curand_kernel<<< ceil(count/64.0), 64>>>(*state, count); 23 | } 24 | 25 | CRBM::CRBM(int filter_num, int filter_size, 26 | int input_num, int input_size, int channel_num, 27 | int left_upper_padding, int right_low_padding, 28 | int pooling_rate, 29 | Matrix *filters, Matrix *hbias, 30 | Matrix *vbias){ 31 | 32 | this->epsilon = 0.01; 33 | this->momentum = 0.5; 34 | this->l2reg = 0.01; 35 | this->ph_lambda = 5; 36 | this->ph = 0.002; 37 | this->sigma = 0.2; 38 | this->cur_trial = 0; 39 | 40 | this->filter_num = filter_num; 41 | this->filter_size = filter_size; 42 | this->input_num = input_num; 43 | this->input_size = input_size; 44 | this->pooling_rate = pooling_rate; 45 | this->channel_num = channel_num; 46 | this->left_upper_padding = left_upper_padding; 47 | this->right_low_padding = right_low_padding; 48 | this->feature_map_size = input_size + left_upper_padding + 49 | right_low_padding - filter_size + 1; 50 | this->subsample_size = feature_map_size / pooling_rate; 51 | 52 | if(filters == NULL){ 53 | this->CPU_filters = filter_init(filter_size, filter_num, channel_num); 54 | }else{ 55 | this->CPU_filters = new Matrix(*filters); 56 | } 57 | 58 | if(hbias == NULL){ 59 | this->CPU_hbias = new Matrix(filter_num, 1, -0.1f, -0.1f); 60 | }else{ 61 | this->CPU_hbias = new Matrix(*hbias); 62 | } 63 | 64 | if(vbias == NULL){ 65 | this->CPU_vbias = new Matrix(channel_num, 1); 66 | }else{ 67 | this->CPU_vbias = new Matrix(*vbias); 68 | } 69 | this->CPU_input = new Matrix(input_num, channel_num * input_size * input_size); 70 | 71 | this->CPU_y_h = new Matrix(input_num , 72 | filter_num * feature_map_size * feature_map_size); 73 | this->CPU_y_h_probs = new Matrix(input_num , 74 | filter_num * feature_map_size * feature_map_size); 75 | this->CPU_y_h2 = new Matrix(input_num , 76 | filter_num * feature_map_size * feature_map_size); 77 | this->CPU_y_h2_probs = new Matrix(input_num , 78 | filter_num * feature_map_size * feature_map_size); 79 | //filter_num * feature_map_size * feature_map_size, 1, 1); 80 | this->CPU_y_p = new Matrix(input_num, 81 | filter_num * subsample_size * subsample_size); 82 | this->CPU_y_v = new Matrix(this->CPU_input->get_row_num(), 83 | this->CPU_input->get_col_num()); 84 | this->CPU_y_v_probs = new Matrix(this->CPU_input->get_row_num(), 85 | this->CPU_input->get_col_num()); 86 | this->CPU_d_w = new Matrix(this->CPU_filters->get_row_num(), 87 | this->CPU_filters->get_col_num()); 88 | this->CPU_d_w_pre = new Matrix(this->CPU_filters->get_row_num(), 89 | this->CPU_filters->get_col_num()); 90 | this->CPU_d_hbias = new Matrix(this->CPU_hbias->get_row_num(), 91 | this->CPU_hbias->get_col_num()); 92 | this->CPU_d_hbias_pre = new Matrix(this->CPU_hbias->get_row_num(), 93 | this->CPU_hbias->get_col_num()); 94 | this->CPU_d_hbias_tmp = new 
Matrix(this->CPU_hbias->get_row_num(), 95 | this->CPU_hbias->get_col_num()); 96 | this->CPU_d_h_sum_tmp = new Matrix(1, this->CPU_y_h->get_col_num()); 97 | 98 | this->GPU_filters = new NVMatrix(*this->CPU_filters); 99 | this->GPU_hbias = new NVMatrix(*this->CPU_hbias); 100 | this->GPU_vbias = new NVMatrix(*this->CPU_vbias); 101 | this->GPU_input = new NVMatrix(*this->CPU_input); 102 | this->GPU_y_h = new NVMatrix(*this->CPU_y_h); 103 | this->GPU_y_h_probs = new NVMatrix(*this->CPU_y_h_probs); 104 | this->GPU_y_h2 = new NVMatrix(*this->CPU_y_h2); 105 | this->GPU_y_h2_probs = new NVMatrix(*this->CPU_y_h2_probs); 106 | this->GPU_y_p = new NVMatrix(*this->CPU_y_p); 107 | this->GPU_y_v = new NVMatrix(*this->CPU_y_v); 108 | this->GPU_y_v_probs = new NVMatrix(*this->CPU_y_v_probs); 109 | this->GPU_d_w = new NVMatrix(*this->CPU_d_w); 110 | this->GPU_d_w_pre = new NVMatrix(*this->CPU_d_w); 111 | this->GPU_d_hbias = new NVMatrix(*this->CPU_d_hbias); 112 | this->GPU_d_hbias_pre = new NVMatrix(*this->CPU_d_hbias); 113 | this->GPU_d_hbias_tmp = new NVMatrix(*this->CPU_d_hbias_tmp); 114 | this->GPU_d_h_sum_tmp= new NVMatrix(*this->CPU_d_h_sum_tmp); 115 | 116 | this->rnd_num = std::max(input_num * channel_num * input_size * input_size, input_num * feature_map_size * feature_map_size / (pooling_rate * pooling_rate)); 117 | curandCreateGenerator(&this->rnd_gen, CURAND_RNG_PSEUDO_DEFAULT); 118 | curandSetPseudoRandomGeneratorSeed(this->rnd_gen, 1234ULL); 119 | cudaMalloc((void **)&this->rnd_array, this->rnd_num * sizeof(float)); 120 | //setup_curand(&this->rnd_state, this->rnd_state_num); 121 | } 122 | 123 | CRBM::~CRBM(){ 124 | delete this->CPU_filters; 125 | delete this->CPU_hbias; 126 | delete this->CPU_vbias; 127 | delete this->CPU_y_h; 128 | delete this->CPU_y_h_probs; 129 | delete this->CPU_y_p; 130 | delete this->CPU_y_v; 131 | delete this->CPU_y_v_probs; 132 | delete this->CPU_d_w; 133 | delete this->CPU_d_w_pre; 134 | delete this->CPU_d_hbias; 135 | delete this->CPU_d_hbias_pre; 136 | delete this->CPU_d_hbias_tmp; 137 | delete this->CPU_d_h_sum_tmp; 138 | 139 | delete this->GPU_filters; 140 | delete this->GPU_hbias; 141 | delete this->GPU_vbias; 142 | delete this->GPU_y_h; 143 | delete this->GPU_y_h_probs; 144 | delete this->GPU_y_p; 145 | delete this->GPU_y_v; 146 | delete this->GPU_y_v_probs; 147 | delete this->GPU_d_w; 148 | delete this->GPU_d_w_pre; 149 | delete this->GPU_d_hbias; 150 | delete this->GPU_d_hbias_pre; 151 | delete this->GPU_d_hbias_tmp; 152 | delete this->GPU_d_h_sum_tmp; 153 | 154 | CUDA_CALL(cudaFree(this->rnd_array)); 155 | curandDestroyGenerator(this->rnd_gen); 156 | } 157 | 158 | Matrix* CRBM::filter_init(int filter_size, int filter_num, int channel_num){ 159 | float low = - 4 * sqrt(6.0 / (2 * filter_size * filter_size * channel_num)); 160 | float upper = -low; 161 | return new Matrix(filter_num, channel_num*filter_size*filter_size, low, upper); 162 | } 163 | 164 | void CRBM::CPU_convolution_forward(float *input, float *filter, float *target, float *hbias){ 165 | 166 | bzero(target, input_num * filter_num * feature_map_size * feature_map_size * sizeof(float)); 167 | 168 | for(int img = 0; img < input_num; img++){ 169 | for(int fil = 0; fil < filter_num; fil++){ 170 | 171 | float *curBias = hbias + fil; 172 | 173 | for(int r = 0; r < feature_map_size; r++){ 174 | for(int c = 0; c < feature_map_size; c++){ 175 | 176 | float *curFilter = filter + fil * channel_num * filter_size * filter_size; 177 | 178 | float* curTarget = target + img * filter_num * feature_map_size * 
feature_map_size + 179 | fil * feature_map_size * feature_map_size + 180 | r * feature_map_size + c; 181 | 182 | for(int k = 0; k < channel_num; k++){ 183 | 184 | float* curInput = input + img * channel_num * input_size * input_size + 185 | k * input_size * input_size + 186 | (r < left_upper_padding ? 0 : r - left_upper_padding) * input_size + 187 | (c < left_upper_padding ? 0 : c - left_upper_padding); 188 | 189 | for(int i = 0; i < filter_size; i++){ 190 | 191 | if(!((r+i) < left_upper_padding || 192 | (r+i) >= (left_upper_padding + input_size))){ 193 | 194 | int step = 0; 195 | 196 | for(int j = 0; j < filter_size; j++){ 197 | if(!((c+j) < left_upper_padding || 198 | (c+j) >= (left_upper_padding + input_size))){ 199 | *curTarget += curFilter[i*filter_size+j] * (*curInput); 200 | curInput++; 201 | step++; 202 | } 203 | } 204 | curInput += input_size - step; 205 | 206 | } 207 | } 208 | curFilter += filter_size * filter_size; 209 | } 210 | *curTarget += *curBias; 211 | *curTarget = (1.0 / (this->sigma * this->sigma)) * (*curTarget); 212 | } 213 | } 214 | } 215 | } 216 | } 217 | 218 | static int max_pooling_multinomial(float *probs, int len){ 219 | float rnd = random_float(0, 1); 220 | int i; 221 | 222 | for(i = 0; rnd > probs[i]; i++, probs[i] += probs[i-1]); 223 | 224 | return i; 225 | } 226 | 227 | void CRBM::CPU_max_pooling(float *y_h, float *y_h_probs, float *y_p){ 228 | 229 | float pooling_area[MAX_POOLING_RATE*MAX_FILETER_SIZE+1]; 230 | 231 | for(int img = 0; img < input_num; img++){ 232 | for(int fil = 0; fil < filter_num; fil++){ 233 | float *fm = y_h + 234 | img * filter_num * feature_map_size * feature_map_size + 235 | fil * feature_map_size * feature_map_size; 236 | float *probs = y_h_probs + 237 | img * filter_num * feature_map_size * feature_map_size + 238 | fil * feature_map_size * feature_map_size; 239 | float *target = y_p + 240 | img * filter_num * subsample_size * subsample_size + 241 | fil * subsample_size * subsample_size; 242 | 243 | for(int i = 0; i < feature_map_size; i += pooling_rate){ 244 | for(int j = 0; j < feature_map_size; j += pooling_rate){ 245 | 246 | float sum = 0; 247 | for(int pi = 0; pi < pooling_rate; pi++){ 248 | for(int pj = 0; pj < pooling_rate; pj++){ 249 | float *cur_fm = fm + (i+pi) * feature_map_size + (j+pj); 250 | 251 | if(*cur_fm > 50) 252 | *cur_fm = 50; 253 | 254 | *cur_fm = expf(*cur_fm); 255 | assert(!isinf(*cur_fm)); 256 | sum += *cur_fm; 257 | } 258 | } 259 | for(int pi = 0; pi < pooling_rate; pi++){ 260 | for(int pj = 0; pj < pooling_rate; pj++){ 261 | float *cur_fm = fm + (i+pi) * feature_map_size + (j+pj); 262 | float *cur_probs = probs + (i+pi) * feature_map_size + (j+pj); 263 | *cur_probs = *cur_fm / (1 + sum); 264 | pooling_area[pi*pooling_rate+pj] = *cur_probs; 265 | *cur_fm = 0; 266 | } 267 | } 268 | pooling_area[pooling_rate*pooling_rate] = 1.0/(1+sum); 269 | int pooling_idx = max_pooling_multinomial(pooling_area, 270 | pooling_rate*pooling_rate+1); 271 | if(pooling_idx == pooling_rate*pooling_rate){ 272 | target[(i/pooling_rate)*subsample_size+(j/pooling_rate)] = 0; 273 | }else{ 274 | target[(i/pooling_rate)*subsample_size+(j/pooling_rate)] = 1; 275 | int pi = pooling_idx / pooling_rate; 276 | int pj = pooling_idx % pooling_rate; 277 | fm[(i+pi) * feature_map_size + (j+pj)] = 1; 278 | } 279 | } 280 | } 281 | } 282 | } 283 | } 284 | 285 | void CRBM::CPU_convolution_backward(float *y_h, float *filters, float *vbias, 286 | float *y_v_probs, float *y_v){ 287 | float tmp_recon[MAX_IMGAG_SIZE][MAX_IMGAG_SIZE]; 288 | int padding = 
filter_size-1; 289 | int input_padding_size = feature_map_size + filter_size - 1; 290 | int lu_padding = left_upper_padding; 291 | 292 | bzero(tmp_recon, sizeof(tmp_recon)); 293 | 294 | for(int img = 0; img < input_num; img++){ 295 | for(int cha = 0; cha < channel_num; cha++){ 296 | float *target = y_v_probs + 297 | img * channel_num * input_size * input_size + 298 | cha * input_size * input_size; 299 | 300 | float *target_y_v = y_v + 301 | img * channel_num * input_size * input_size + 302 | cha * input_size * input_size; 303 | 304 | for(int fil = 0; fil < filter_num; fil++){ 305 | float *filter = filters + 306 | fil * filter_size * filter_size * channel_num + 307 | cha * filter_size * filter_size; 308 | 309 | float *fm = y_h + 310 | img * filter_num * feature_map_size * feature_map_size + 311 | fil * feature_map_size * feature_map_size; 312 | 313 | for(int r = 0; r < feature_map_size + filter_size - 1; r++){ 314 | for(int c = 0; c < feature_map_size + filter_size - 1; c++){ 315 | 316 | for(int i = r; i < r+filter_size; i++){ 317 | for(int j = c; j < c+filter_size; j++){ 318 | if(!(i < padding || j < padding || 319 | i >= (padding + feature_map_size) || 320 | j >= (padding + feature_map_size))){ 321 | tmp_recon[r][c] += 322 | fm[(i-padding)*feature_map_size + (j-padding)] * 323 | filter[(filter_size-1-(i-r))*filter_size + (filter_size-1-(j-c))]; 324 | } 325 | } 326 | } 327 | } 328 | } 329 | } 330 | 331 | for(int i = 0; i < input_size; i++){ 332 | for(int j = 0; j < input_size; j++){ 333 | target[i*input_size+j] = tmp_recon[i+lu_padding][j+lu_padding]; 334 | //target[i*input_size+j] = logisitc(tmp_recon[i+lu_padding][j+lu_padding]); 335 | //target_y_v[i*input_size+j] = 336 | // (random_float(0,1) < target[i*input_size+j]) ? 1 : 0; 337 | } 338 | } 339 | bzero(tmp_recon, sizeof(tmp_recon)); 340 | } 341 | } 342 | } 343 | 344 | /* 345 | * Split into the positive and negative phases of the CD update (dw accumulates +v*h over the data, -v*h over the reconstruction): 346 | * if is_init is true, compute the positive phase and zero dw first; 347 | * if is_init is false, compute the negative phase, i.e. dw -= new_dw. 348 | */ 349 | void CRBM::CPU_compute_d_w(float *v, float *h, float *dw, bool is_init){ 350 | 351 | float sign; 352 | int lu_padding = left_upper_padding; 353 | if(is_init){ 354 | bzero(dw, filter_num * channel_num * filter_size * filter_size * sizeof(float)); 355 | sign = 1.0f; 356 | }else{ 357 | sign = -1.0f; 358 | } 359 | 360 | for(int img = 0; img < input_num; img++){ 361 | for(int fil = 0; fil < filter_num; fil++){ 362 | 363 | float *this_h = h + img * filter_num * feature_map_size * feature_map_size + 364 | fil * feature_map_size * feature_map_size; 365 | 366 | for(int cha = 0; cha < channel_num; cha++){ 367 | 368 | float *this_v = v + img * channel_num * input_size * input_size + 369 | cha * input_size * input_size; 370 | 371 | float *this_dw = dw + fil * channel_num * filter_size * filter_size + 372 | cha * filter_size * filter_size; 373 | 374 | for(int r = 0; r < filter_size; r++){ 375 | for(int c = 0; c < filter_size; c++){ 376 | 377 | float *cur_v = this_v + (r-lu_padding) * input_size + 378 | (c-lu_padding); 379 | 380 | for(int i = 0; i < feature_map_size; i++){ 381 | for(int j = 0; j < feature_map_size; j++){ 382 | if(!((r+i) < lu_padding || 383 | (c+j) < lu_padding || 384 | (r+i) >= (lu_padding+input_size) || 385 | (c+j) >= (lu_padding+input_size))){ 386 | 387 | this_dw[r*filter_size+c] += 388 | sign * cur_v[j] * this_h[i*feature_map_size+j]; 389 | } 390 | } 391 | cur_v += input_size; 392 | } 393 | } 394 | } 395 | } 396 | } 397 | } 398 | } 399 | 400 | void CRBM::GPU_convolution_forward(float *input, float *filters,
float *y_h, float *hbias){ 401 | dim3 blocks = dim3(input_size / 32 * filter_num, input_size / 32 * input_num); 402 | dim3 threads = dim3(32, 32); 403 | convolution_forward_kernel<<<blocks, threads>>>(input, filters, y_h, 404 | hbias, input_size, channel_num, feature_map_size, filter_size, 405 | filter_num, left_upper_padding, sigma); 406 | cudaDeviceSynchronize(); 407 | } 408 | 409 | void CRBM::GPU_max_pooling(float *y_h, float *y_h_probs, float *y_p){ 410 | dim3 blocks = dim3(feature_map_size / pooling_rate / 16 * filter_num, 411 | feature_map_size / pooling_rate / 16 * input_num); 412 | dim3 threads = dim3(16, 16); 413 | curandGenerateUniform(rnd_gen, rnd_array, rnd_num); 414 | max_pooling_kernel<<<blocks, threads>>>(y_h, y_h_probs, y_p, 415 | feature_map_size, filter_num, pooling_rate, rnd_array, rnd_num); 416 | cudaDeviceSynchronize(); 417 | } 418 | 419 | void CRBM::GPU_convolution_backward(float *y_h, float *filters, float *vbias, 420 | float *y_v_probs, float *y_v){ 421 | dim3 blocks = dim3(input_size / 16 * channel_num, input_size / 16 * input_num); 422 | dim3 threads = dim3(16, 16); 423 | 424 | curandGenerateUniform(rnd_gen, rnd_array, rnd_num); 425 | convolution_backward_kernel<<<blocks, threads>>>(y_h, 426 | filters, vbias, y_v_probs, y_v, input_size, left_upper_padding, 427 | channel_num, feature_map_size, filter_num, filter_size, rnd_array, rnd_num); 428 | cudaDeviceSynchronize(); 429 | } 430 | 431 | void CRBM::GPU_compute_d_w(float *v, float *h, float *dw, bool is_init){ 432 | dim3 blocks = dim3(channel_num * filter_num * feature_map_size / 32, 433 | input_num * feature_map_size / 32); 434 | dim3 threads = dim3(filter_size, filter_size); 435 | 436 | compute_d_w_kernel<<<blocks, threads>>>(v, h, dw, is_init, input_size, left_upper_padding, 437 | channel_num, filter_num, filter_size, feature_map_size); 438 | cudaDeviceSynchronize(); 439 | } 440 | 441 | void CRBM::run_batch(int cur_trial, int cur_image, int cur_batch, Matrix& batch_data){ 442 | 443 | batch_data.assign(*this->CPU_input); 444 | this->GPU_input->copyFromHost(*this->CPU_input); 445 | 446 | if(this->cur_trial > 5) 447 | this->momentum = 0.9; 448 | if(this->cur_image != cur_image && this->sigma > 0.1) 449 | this->sigma *= 0.99; 450 | 451 | this->cur_trial = cur_trial; 452 | this->cur_image = cur_image; 453 | this->cur_batch = cur_batch; 454 | 455 | cout << "trial : " << cur_trial << " image : " << cur_image << " batch : " << cur_batch << endl; 456 | start(); 457 | } 458 | 459 | void CRBM::start(){ 460 | bool cheak_euqality = false; 461 | bool run_CPU = false; 462 | bool run_GPU = true; 463 | bool check_nan = true; 464 | 465 | struct timeval _start_time, _end_time; 466 | 467 | if(run_CPU){ 468 | /* CPU computation */ 469 | /**********************************/ 470 | timeFunc(this->CPU_convolution_forward(this->CPU_input->get_data(), 471 | this->CPU_filters->get_data(), this->CPU_y_h->get_data(), 472 | this->CPU_hbias->get_data()), "CPU convolutional forward"); 473 | 474 | if(check_nan){ 475 | assert(this->CPU_input->check_nan()); 476 | assert(this->CPU_y_h->check_nan()); 477 | } 478 | 479 | timeFunc(this->CPU_max_pooling(this->CPU_y_h->get_data(), 480 | this->CPU_y_h_probs->get_data(), this->CPU_y_p->get_data()), 481 | "CPU max pooling"); 482 | 483 | if(check_nan){ 484 | assert(this->CPU_y_h->check_nan()); 485 | assert(this->CPU_y_h_probs->check_nan()); 486 | } 487 | 488 | timeFunc(this->CPU_convolution_backward(this->CPU_y_h->get_data(), 489 | //timeFunc(this->CPU_convolution_backward(this->CPU_y_h_probs->get_data(), 490 | this->CPU_filters->get_data(), this->CPU_vbias->get_data(), 491
| this->CPU_y_v_probs->get_data(), this->CPU_y_v->get_data()), 492 | "CPU convolutional backward"); 493 | 494 | if(check_nan){ 495 | assert(this->CPU_y_v_probs->check_nan()); 496 | } 497 | 498 | timeFunc(this->CPU_convolution_forward(this->CPU_y_v_probs->get_data(), 499 | this->CPU_filters->get_data(), this->CPU_y_h2->get_data(), 500 | this->CPU_hbias->get_data()), "CPU convolutional forward"); 501 | 502 | timeFunc(this->CPU_max_pooling(this->CPU_y_h2->get_data(), 503 | this->CPU_y_h2_probs->get_data(), this->CPU_y_p->get_data()), 504 | "CPU max pooling"); 505 | 506 | timeFunc(this->CPU_compute_d_w(this->CPU_input->get_data(), 507 | this->CPU_y_h_probs->get_data(), this->CPU_d_w->get_data(), 508 | true), "CPU compute dw positive phase"); 509 | 510 | timeFunc(this->CPU_compute_d_w(this->CPU_y_v_probs->get_data(), 511 | this->CPU_y_h2_probs->get_data(), this->CPU_d_w->get_data(), 512 | false), "CPU compute dw negative phase"); 513 | 514 | this->CPU_d_w->ele_scale(1.0 / (input_num * feature_map_size * feature_map_size)); 515 | this->CPU_d_w->mat_add(*this->CPU_filters, -this->l2reg); 516 | 517 | this->CPU_y_h_probs->mat_sum(1, *this->CPU_d_h_sum_tmp); 518 | this->CPU_d_h_sum_tmp->reshape(filter_num, feature_map_size * feature_map_size); 519 | this->CPU_d_h_sum_tmp->mat_sum(0, *this->CPU_d_hbias); 520 | this->CPU_d_h_sum_tmp->reshape(1, filter_num * feature_map_size * feature_map_size); 521 | 522 | this->CPU_y_h2_probs->mat_sum(1, *this->CPU_d_h_sum_tmp); 523 | this->CPU_d_h_sum_tmp->reshape(filter_num, feature_map_size * feature_map_size); 524 | this->CPU_d_h_sum_tmp->mat_sum(0, *this->CPU_d_hbias_tmp); 525 | this->CPU_d_h_sum_tmp->reshape(1, filter_num * feature_map_size * feature_map_size); 526 | 527 | this->CPU_d_hbias->mat_add(*this->CPU_d_hbias_tmp, -1.0f); 528 | this->CPU_d_hbias->ele_scale(1.0 / (input_num * feature_map_size * feature_map_size)); 529 | 530 | this->CPU_y_h_probs->mat_sum(1, *this->CPU_d_h_sum_tmp); 531 | this->CPU_d_h_sum_tmp->reshape(filter_num, feature_map_size * feature_map_size); 532 | this->CPU_d_h_sum_tmp->mat_sum(0, *this->CPU_d_hbias_tmp); 533 | this->CPU_d_h_sum_tmp->reshape(1, filter_num * feature_map_size * feature_map_size); 534 | 535 | this->CPU_d_hbias_tmp->ele_scale(1.0 / (input_num * feature_map_size * feature_map_size)); 536 | this->CPU_d_hbias_tmp->ele_add(-this->ph); 537 | this->CPU_d_hbias_tmp->ele_scale(this->ph_lambda); 538 | this->CPU_d_hbias->mat_add(*this->CPU_d_hbias_tmp, -1.0f); 539 | 540 | this->CPU_d_w->mat_add(*this->CPU_d_w_pre, *this->CPU_d_w, epsilon, momentum); 541 | this->CPU_d_w->assign(*this->CPU_d_w_pre); 542 | this->CPU_filters->mat_add(*this->CPU_d_w, 1.0f); 543 | 544 | this->CPU_d_hbias->mat_add(*this->CPU_d_hbias_pre, *this->CPU_d_hbias, epsilon, momentum); 545 | this->CPU_d_hbias->assign(*this->CPU_d_hbias_pre); 546 | this->CPU_hbias->mat_add(*this->CPU_d_hbias, 1.0f); 547 | 548 | this->CPU_y_v_probs->mat_add(*this->CPU_input, -1.0f); 549 | this->CPU_y_v_probs->mat_mul(*this->CPU_y_v_probs); 550 | 551 | float cur_ferr = this->CPU_y_v_probs->ele_mean(); 552 | float cur_sparsity = this->CPU_y_h_probs->ele_mean(); 553 | this->ferr += cur_ferr; 554 | this->sparsity += cur_sparsity; 555 | 556 | } 557 | /**********************************/ 558 | 559 | if(run_GPU){ 560 | /* GPU computation */ 561 | /**********************************/ 562 | Matrix* tmp = new Matrix(CPU_filters->get_row_num(), this->CPU_filters->get_col_num()); 563 | 564 | timeFunc(this->GPU_convolution_forward(this->GPU_input->get_data(), 565 | 
this->GPU_filters->get_data(), this->GPU_y_h->get_data(), 566 | this->GPU_hbias->get_data()), "GPU convolutional forward"); 567 | 568 | timeFunc(this->GPU_max_pooling(this->GPU_y_h->get_data(), 569 | this->GPU_y_h_probs->get_data(), this->GPU_y_p->get_data()), 570 | "GPU max pooling"); 571 | 572 | timeFunc(this->GPU_convolution_backward(this->GPU_y_h->get_data(), 573 | //timeFunc(this->GPU_convolution_backward(this->GPU_y_h_probs->get_data(), 574 | this->GPU_filters->get_data(), this->GPU_vbias->get_data(), 575 | this->GPU_y_v_probs->get_data(), this->GPU_y_v->get_data()), 576 | "GPU convolutional backward"); 577 | 578 | timeFunc(this->GPU_convolution_forward(this->GPU_y_v_probs->get_data(), 579 | this->GPU_filters->get_data(), this->GPU_y_h2->get_data(), 580 | this->GPU_hbias->get_data()), "GPU convolutional forward"); 581 | 582 | timeFunc(this->GPU_max_pooling(this->GPU_y_h2->get_data(), 583 | this->GPU_y_h2_probs->get_data(), this->GPU_y_p->get_data()), 584 | "GPU max pooling"); 585 | 586 | this->GPU_d_w->mat_init(0.0f); 587 | timeFunc(this->GPU_compute_d_w(this->GPU_input->get_data(), 588 | this->GPU_y_h_probs->get_data(), this->GPU_d_w->get_data(), 589 | true), "GPU compute dw positive phase"); 590 | 591 | //this->GPU_d_w->assign(*tmp); 592 | 593 | timeFunc(this->GPU_compute_d_w(this->GPU_y_v_probs->get_data(), 594 | this->GPU_y_h2_probs->get_data(), this->GPU_d_w->get_data(), 595 | false), "GPU compute dw negative phase"); 596 | 597 | //this->GPU_d_w->assign(*tmp); 598 | 599 | this->GPU_d_w->ele_scale(1.0 / (input_num * feature_map_size * feature_map_size)); 600 | this->GPU_d_w->mat_add(*this->GPU_filters, -this->l2reg); 601 | 602 | this->GPU_y_h_probs->mat_sum(1, *this->GPU_d_h_sum_tmp); 603 | this->GPU_d_h_sum_tmp->reshape(filter_num, feature_map_size * feature_map_size); 604 | this->GPU_d_h_sum_tmp->mat_sum(0, *this->GPU_d_hbias); 605 | this->GPU_d_h_sum_tmp->reshape(1, filter_num * feature_map_size * feature_map_size); 606 | 607 | this->GPU_y_h2_probs->mat_sum(1, *this->GPU_d_h_sum_tmp); 608 | this->GPU_d_h_sum_tmp->reshape(filter_num, feature_map_size * feature_map_size); 609 | this->GPU_d_h_sum_tmp->mat_sum(0, *this->GPU_d_hbias_tmp); 610 | this->GPU_d_h_sum_tmp->reshape(1, filter_num * feature_map_size * feature_map_size); 611 | 612 | this->GPU_d_hbias->mat_add(*this->GPU_d_hbias_tmp, -1.0f); 613 | this->GPU_d_hbias->ele_scale(1.0 / (input_num * feature_map_size * feature_map_size)); 614 | 615 | this->GPU_y_h_probs->mat_sum(1, *this->GPU_d_h_sum_tmp); 616 | this->GPU_d_h_sum_tmp->reshape(filter_num, feature_map_size * feature_map_size); 617 | this->GPU_d_h_sum_tmp->mat_sum(0, *this->GPU_d_hbias_tmp); 618 | this->GPU_d_h_sum_tmp->reshape(1, filter_num * feature_map_size * feature_map_size); 619 | 620 | this->GPU_d_hbias_tmp->ele_scale(1.0 / (input_num * feature_map_size * feature_map_size)); 621 | this->GPU_d_hbias_tmp->ele_add(-this->ph); 622 | this->GPU_d_hbias_tmp->ele_scale(this->ph_lambda); 623 | this->GPU_d_hbias->mat_add(*this->GPU_d_hbias_tmp, -1.0f); 624 | 625 | this->GPU_d_w->mat_add(*this->GPU_d_w_pre, *this->GPU_d_w, epsilon, momentum); 626 | this->GPU_d_w->assign(*this->GPU_d_w_pre); 627 | this->GPU_filters->mat_add(*this->GPU_d_w, 1.0f); 628 | 629 | this->GPU_d_hbias->mat_add(*this->GPU_d_hbias_pre, *this->GPU_d_hbias, epsilon, momentum); 630 | this->GPU_d_hbias->assign(*this->GPU_d_hbias_pre); 631 | this->GPU_hbias->mat_add(*this->GPU_d_hbias, 1.0f); 632 | 633 | this->GPU_y_v_probs->mat_add(*this->GPU_input, -1.0f); 634 | 
this->GPU_y_v_probs->mat_mul(*this->GPU_y_v_probs); 635 | 636 | float cur_ferr = this->GPU_y_v_probs->ele_mean(); 637 | float cur_sparsity = this->GPU_y_h_probs->ele_mean(); 638 | this->ferr += cur_ferr; 639 | this->sparsity += cur_sparsity; 640 | 641 | delete tmp; 642 | } 643 | 644 | if(cheak_euqality){ 645 | /* 646 | * CPU and GPU equality test 647 | */ 648 | /*cout << "y_h : "; 649 | Matrix* tmp_y_h = new Matrix(this->CPU_y_h->get_row_num(), 650 | this->CPU_y_h->get_col_num()); 651 | this->GPU_y_h->assign(*tmp_y_h); 652 | this->CPU_y_h->equal_value(*tmp_y_h); 653 | delete tmp_y_h;*/ 654 | 655 | cout << "y_h_probs : "; 656 | Matrix* tmp_y_h_probs = new Matrix(this->CPU_y_h_probs->get_row_num(), 657 | this->CPU_y_h_probs->get_col_num()); 658 | this->GPU_y_h_probs->assign(*tmp_y_h_probs); 659 | this->CPU_y_h_probs->equal_value(*tmp_y_h_probs); 660 | delete tmp_y_h_probs; 661 | 662 | cout << "y_v_probs : "; 663 | Matrix* tmp_y_v_probs = new Matrix(this->CPU_y_v_probs->get_row_num(), 664 | this->CPU_y_v_probs->get_col_num()); 665 | this->GPU_y_v_probs->assign(*tmp_y_v_probs); 666 | this->CPU_y_v_probs->equal_value(*tmp_y_v_probs); 667 | delete tmp_y_v_probs; 668 | 669 | cout << "y_h2_probs : "; 670 | Matrix* tmp_y_h2_probs = new Matrix(this->CPU_y_h2_probs->get_row_num(), 671 | this->CPU_y_h2_probs->get_col_num()); 672 | this->GPU_y_h2_probs->assign(*tmp_y_h2_probs); 673 | this->CPU_y_h2_probs->equal_value(*tmp_y_h2_probs); 674 | delete tmp_y_h2_probs; 675 | 676 | cout << "d_w : "; 677 | Matrix* tmp_d_w = new Matrix(this->CPU_d_w->get_row_num(), 678 | this->CPU_d_w->get_col_num()); 679 | this->GPU_d_w->assign(*tmp_d_w); 680 | this->CPU_d_w->equal_value(*tmp_d_w, 1e-7); 681 | delete tmp_d_w; 682 | 683 | cout << "d_hbias : "; 684 | Matrix* tmp_d_hbias = new Matrix(this->CPU_d_hbias->get_row_num(), 685 | this->CPU_d_hbias->get_col_num()); 686 | this->GPU_d_hbias->assign(*tmp_d_hbias); 687 | this->CPU_d_hbias->equal_value(*tmp_d_hbias); 688 | delete tmp_d_hbias; 689 | 690 | cout << "d_h_sum_tmp : "; 691 | Matrix* tmp_d_h_sum_tmp = new Matrix(this->CPU_d_h_sum_tmp->get_row_num(), 692 | this->CPU_d_h_sum_tmp->get_col_num()); 693 | this->GPU_d_h_sum_tmp->assign(*tmp_d_h_sum_tmp); 694 | this->CPU_d_h_sum_tmp->equal_value(*tmp_d_h_sum_tmp); 695 | delete tmp_d_h_sum_tmp; 696 | 697 | cout << "filter : "; 698 | Matrix* tmp_filters = new Matrix(this->CPU_filters->get_row_num(), 699 | this->CPU_filters->get_col_num()); 700 | this->GPU_filters->assign(*tmp_filters); 701 | this->CPU_filters->equal_value(*tmp_filters); 702 | delete tmp_filters; 703 | 704 | cout << "hbias : "; 705 | Matrix* tmp_hbias = new Matrix(this->CPU_hbias->get_row_num(), 706 | this->CPU_hbias->get_col_num()); 707 | this->GPU_hbias->assign(*tmp_hbias); 708 | this->CPU_hbias->equal_value(*tmp_hbias); 709 | delete tmp_hbias; 710 | } 711 | } 712 | --------------------------------------------------------------------------------
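For readers working through the kernels above, the following is a small, self-contained host-side sketch of the probabilistic max-pooling rule that CPU_max_pooling (and its GPU counterpart max_pooling_kernel) implements: within each pooling_rate x pooling_rate block, hidden unit k turns on with probability exp(I_k) / (1 + sum_j exp(I_j)), and the whole block stays off (pooling unit = 0) with probability 1 / (1 + sum_j exp(I_j)). The snippet is illustrative only; it is not part of the project sources, and names such as sample_pooling_block are invented for this example.

// sketch_probabilistic_max_pooling.cpp -- standalone illustration, not part of src/
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <vector>

// Sample one pooling block given the pre-activation energies I_k.
// Returns the index of the unit that switches on, or energies.size()
// if every unit in the block stays off.
static int sample_pooling_block(const std::vector<float> &energies){
    std::vector<float> probs(energies.size() + 1);
    float sum = 0.0f;
    for(size_t k = 0; k < energies.size(); k++){
        // clip the energy at 50 before exponentiating, as CPU_max_pooling does
        probs[k] = std::exp(std::min(energies[k], 50.0f));
        sum += probs[k];
    }
    for(size_t k = 0; k < energies.size(); k++)
        probs[k] /= (1.0f + sum);                 // P(h_k = 1)
    probs[energies.size()] = 1.0f / (1.0f + sum); // P(all units in the block off)

    // draw from the resulting multinomial via its cumulative distribution
    float r = (float)std::rand() / (float)RAND_MAX;
    float cdf = 0.0f;
    for(size_t k = 0; k < probs.size(); k++){
        cdf += probs[k];
        if(r < cdf) return (int)k;
    }
    return (int)energies.size();
}

int main(){
    std::vector<float> block = {0.3f, -1.2f, 2.0f, 0.1f}; // a flattened 2x2 pooling block
    int on = sample_pooling_block(block);
    if(on == (int)block.size())
        std::printf("pooling unit off (no hidden unit fires)\n");
    else
        std::printf("hidden unit %d on -> pooling unit on\n", on);
    return 0;
}

This mirrors the per-block logic above, where the selected hidden unit is written back as 1 into y_h, y_h_probs stores the per-unit probabilities, and y_p records whether any unit in the block fired.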