├── .gitignore
├── Makefile
├── README.md
├── data
│   └── 12mer-kd
│       ├── 10fold_idx
│       │   ├── test_idx-1.txt
│       │   ├── test_idx-10.txt
│       │   ├── test_idx-2.txt
│       │   ├── test_idx-3.txt
│       │   ├── test_idx-4.txt
│       │   ├── test_idx-5.txt
│       │   ├── test_idx-6.txt
│       │   ├── test_idx-7.txt
│       │   ├── test_idx-8.txt
│       │   ├── test_idx-9.txt
│       │   ├── train_idx-1.txt
│       │   ├── train_idx-10.txt
│       │   ├── train_idx-2.txt
│       │   ├── train_idx-3.txt
│       │   ├── train_idx-4.txt
│       │   ├── train_idx-5.txt
│       │   ├── train_idx-6.txt
│       │   ├── train_idx-7.txt
│       │   ├── train_idx-8.txt
│       │   └── train_idx-9.txt
│       ├── 12mer-kd.txt
│       ├── raw-12mer-kd.csv
│       └── split_train_test.m
├── include
│   ├── config.h
│   ├── fmt.h
│   ├── matrix
│   │   ├── cuda_binary_kernel.cuh
│   │   ├── cuda_helper.h
│   │   ├── cuda_rand_kernel.cuh
│   │   ├── cuda_unary_kernel.cuh
│   │   ├── dense_matrix.h
│   │   ├── fastWalshTransform_kernel_double.cuh
│   │   ├── fastWalshTransform_kernel_float.cuh
│   │   ├── gpuhandle.h
│   │   ├── imatrix.h
│   │   ├── mat_typedef.h
│   │   ├── matrix_utils.h
│   │   ├── mkl_helper.h
│   │   ├── sp_data.h
│   │   ├── sparse_matrix.h
│   │   └── vector.h
│   ├── net
│   │   ├── abs_criterion_layer.h
│   │   ├── avg_rank_criterion_layer.h
│   │   ├── c_add_layer.h
│   │   ├── c_mul_layer.h
│   │   ├── classnll_criterion_layer.h
│   │   ├── col_slice_layer.h
│   │   ├── concat_layer.h
│   │   ├── const_scalar_param.h
│   │   ├── err_cnt_criterion_layer.h
│   │   ├── exp_layer.h
│   │   ├── gaussian_ll_layer.h
│   │   ├── general_loss_criterion_layer.h
│   │   ├── global_sum_layer.h
│   │   ├── graph_pool_param.h
│   │   ├── graph_struct.h
│   │   ├── i_act_layer.h
│   │   ├── i_criterion_layer.h
│   │   ├── i_layer.h
│   │   ├── i_param.h
│   │   ├── inner_product_layer.h
│   │   ├── input_layer.h
│   │   ├── learner.h
│   │   ├── linear_param.h
│   │   ├── log_layer.h
│   │   ├── loss_func.h
│   │   ├── max_entropy_criterion_layer.h
│   │   ├── mixture_nll_criterion_layer.h
│   │   ├── model.h
│   │   ├── mse_criterion_layer.h
│   │   ├── msg_pass_param.h
│   │   ├── multinomial_sample_layer.h
│   │   ├── mvn_diag_nll_criterion_layer.h
│   │   ├── nngraph.h
│   │   ├── param_layer.h
│   │   ├── relu_layer.h
│   │   ├── repeat_layer.h
│   │   ├── sigmoid_layer.h
│   │   ├── softmax_layer.h
│   │   ├── tanh_layer.h
│   │   └── transpose_layer.h
│   ├── nn_common.h
│   └── utils.h
├── local_run.sh
├── make_common
└── src
    ├── kernel_loopy_bp.cpp
    ├── kernel_mean_field.cpp
    ├── matrix
    │   ├── cpu_dense_matrix.cpp
    │   ├── cpu_sparse_mat.cpp
    │   ├── cpu_vector.cpp
    │   ├── gpu_dense_matrix.cu
    │   ├── gpu_sparse_mat.cu
    │   ├── gpu_vector.cu
    │   └── gpuhandle.cu
    └── net
        ├── act_layer.cpp
        ├── act_layer.cu
        ├── concat_layer.cpp
        ├── fmt.cpp
        ├── gaussian_ll_layer.cpp
        ├── global_sum_layer.cpp
        ├── graph_pool_param.cpp
        ├── graph_pool_param.cu
        ├── learner.cpp
        ├── loss_func.cpp
        ├── loss_func.cu
        ├── max_entropy_criterion_layer.cpp
        ├── mixture_nll_criterion_layer.cpp
        ├── msg_pass_param.cpp
        ├── mvn_diag_nll_criterion_layer.cpp
        └── nngraph.cpp
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | results/
3 | 
4 | # Compiled Object files
5 | *.slo
6 | *.lo
7 | *.o
8 | *.obj
9 | 
10 | # Precompiled Headers
11 | *.gch
12 | *.pch
13 | 
14 | # Compiled Dynamic libraries
15 | *.so
16 | *.dylib
17 | *.dll
18 | 
19 | # Fortran module files
20 | *.mod
21 | *.smod
22 | 
23 | # Compiled Static libraries
24 | *.lai
25 | *.la
26 | *.a
27 | *.lib
28 | 
29 | # Executables
30 | *.exe
31 | *.out
32 | *.app
33 | 
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | include make_common
2 | 
3 | build_root = build
4 | 
5 | include_dirs = $(CUDA_HOME)/include $(MKL_ROOT)/include include/matrix include/net ./include
6 | CXXFLAGS += $(addprefix -I,$(include_dirs))
7 | NVCCFLAGS += $(addprefix -I,$(include_dirs))
8 | NVCCFLAGS += 
-std=c++11 --use_fast_math 9 | 10 | cu_files = $(shell $(FIND) src/ -name "*.cu" -printf "%P\n") 11 | cpp_files = $(shell $(FIND) src/ -name "*.cpp" -printf "%P\n") 12 | cu_obj_files = $(subst .cu,.o,$(cu_files)) 13 | cxx_obj_files = $(subst .cpp,.o,$(cpp_files)) 14 | obj_build_root = $(build_root)/objs 15 | objs = $(addprefix $(obj_build_root)/cuda/,$(cu_obj_files)) $(addprefix $(obj_build_root)/cxx/,$(cxx_obj_files)) 16 | DEPS = ${objs:.o=.d} 17 | 18 | lib_dir = $(build_root)/lib 19 | net_lib = $(lib_dir)/libnet.a 20 | 21 | all: $(net_lib) build/kernel_mean_field build/kernel_loopy_bp 22 | 23 | $(net_lib): $(objs) 24 | $(dir_guard) 25 | ar rcs $@ $(objs) 26 | 27 | $(obj_build_root)/cuda/%.o: src/%.cu 28 | $(dir_guard) 29 | $(NVCC) $(NVCCFLAGS) $(CUDA_ARCH) -M $< -o ${@:.o=.d} -odir $(@D) 30 | $(NVCC) $(NVCCFLAGS) $(CUDA_ARCH) -c $< -o $@ 31 | 32 | $(obj_build_root)/cxx/%.o: src/%.cpp 33 | $(dir_guard) 34 | $(CXX) $(CXXFLAGS) -MMD -c -o $@ $(filter %.cpp, $^) 35 | 36 | build/%: src/%.cpp $(net_lib) ./include/* 37 | $(dir_guard) 38 | $(CXX) $(CXXFLAGS) -o $@ $(filter %.cpp, %.a $^) -L$(lib_dir) -lnet $(LDFLAGS) 39 | 40 | clean: 41 | rm -rf build 42 | 43 | -include $(DEPS) 44 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # sequence2vec 2 | 3 | #### Prerequisites 4 | 5 | Tested under Ubuntu 14.04 6 | 7 | 8 | ##### Download and install cuda from https://developer.nvidia.com/cuda-toolkit 9 | 10 | wget http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1404/x86_64/cuda-repo-ubuntu1404_8.0.44-1_amd64.deb 11 | sudo dpkg -i cuda-repo-ubuntu1404_8.0.44-1_amd64.deb 12 | sudo apt-get update 13 | sudo apt-get install cuda 14 | 15 | in .bashrc, add the following path (suppose you installed to the default path) 16 | 17 | export CUDA_HOME=/usr/local/cuda 18 | export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH 19 | export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH 20 | 21 | ##### Download and install intel mkl 22 | 23 | in .bashrc, add the following path 24 | 25 | source {path_to_your_intel_root/name_of_parallel_tool_box}/bin/psxevars.sh 26 | export MKL_ROOT={path_to_your_intel_root}/mkl 27 | 28 | ##### Install cppformat (now called fmtlib) 29 | 30 | check https://github.com/fmtlib/fmt for help 31 | 32 | #### Build 33 | make 34 | 35 | ##### Run Kd prediction 36 | modify the configs in local_run.sh 37 | ./local_run.sh 38 | -------------------------------------------------------------------------------- /data/12mer-kd/split_train_test.m: -------------------------------------------------------------------------------- 1 | clear; 2 | clc; 3 | 4 | T = readtable('raw-12mer-kd.csv'); 5 | 6 | total = size(T, 1); 7 | 8 | fold_size = floor(total / 10); 9 | p = randperm(total); 10 | for fold = 1 : 10 11 | test_range = (fold - 1) * fold_size + 1 : fold * fold_size; 12 | train_range = [1 : (fold - 1) * fold_size, fold * fold_size + 1 : total]; 13 | 14 | fid = fopen(sprintf('10fold_idx/test_idx-%d.txt', fold), 'w'); 15 | for i = 1 : length(test_range) 16 | fprintf(fid, '%d\n', p(test_range(i)) - 1); 17 | end 18 | fclose(fid); 19 | 20 | fid = fopen(sprintf('10fold_idx/train_idx-%d.txt', fold), 'w'); 21 | for i = 1 : length(train_range) 22 | fprintf(fid, '%d\n', p(train_range(i)) - 1); 23 | end 24 | fclose(fid); 25 | end 26 | 27 | fid = fopen('12mer-kd.txt', 'w'); 28 | fprintf(fid, '%d\n', total); 29 | for i = 1 : total 30 | fprintf(fid, '%.10f %s\n', T.kd(i), 
T.str{i}); 31 | end 32 | fclose(fid); -------------------------------------------------------------------------------- /include/config.h: -------------------------------------------------------------------------------- 1 | #ifndef CONFIG_H 2 | #define CONFIG_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | typedef float Dtype; 9 | 10 | struct cfg 11 | { 12 | static bool evaluate, rev_order; 13 | static int dev_id, iter; 14 | static int max_lv, conv_size, fp_len, kmer; 15 | static unsigned n_hidden; 16 | static Dtype scale; 17 | static unsigned batch_size; 18 | static unsigned max_epoch; 19 | static bool max_pool, global_pool; 20 | static int num_nodes; 21 | static unsigned test_interval; 22 | static unsigned report_interval; 23 | static unsigned save_interval; 24 | static int window_size; 25 | static int node_dim; 26 | static bool pad; 27 | static Dtype lr; 28 | static Dtype l2_penalty; 29 | static Dtype momentum; 30 | static const char *result_file, *train_idx_file, *test_idx_file, *string_file, *save_dir; 31 | 32 | static void LoadParams(const int argc, const char** argv) 33 | { 34 | for (int i = 1; i < argc; i += 2) 35 | { 36 | if (strcmp(argv[i], "-kmer") == 0) 37 | kmer = atoi(argv[i + 1]); 38 | if (strcmp(argv[i], "-scale") == 0) 39 | scale = atof(argv[i + 1]); 40 | if (strcmp(argv[i], "-global_pool") == 0) 41 | global_pool = (bool)atoi(argv[i + 1]); 42 | if (strcmp(argv[i], "-rev_order") == 0) 43 | rev_order = (bool)atoi(argv[i + 1]); 44 | if (strcmp(argv[i], "-eval") == 0) 45 | evaluate = (bool)atoi(argv[i + 1]); 46 | if (strcmp(argv[i], "-max_pool") == 0) 47 | max_pool = (bool)atoi(argv[i + 1]); 48 | if (strcmp(argv[i], "-pad") == 0) 49 | pad = (bool)atoi(argv[i + 1]); 50 | if (strcmp(argv[i], "-w") == 0) 51 | window_size = atoi(argv[i + 1]); 52 | if (strcmp(argv[i], "-lr") == 0) 53 | lr = atof(argv[i + 1]); 54 | if (strcmp(argv[i], "-cur_iter") == 0) 55 | iter = atoi(argv[i + 1]); 56 | if (strcmp(argv[i], "-hidden") == 0) 57 | n_hidden = atoi(argv[i + 1]); 58 | if (strcmp(argv[i], "-lv") == 0) 59 | max_lv = atoi(argv[i + 1]); 60 | if (strcmp(argv[i], "-conv") == 0) 61 | conv_size = atoi(argv[i + 1]); 62 | if (strcmp(argv[i], "-fp") == 0) 63 | fp_len = atoi(argv[i + 1]); 64 | if (strcmp(argv[i], "-b") == 0) 65 | batch_size = atoi(argv[i + 1]); 66 | if (strcmp(argv[i], "-maxe") == 0) 67 | max_epoch = atoi(argv[i + 1]); 68 | if (strcmp(argv[i], "-int_test") == 0) 69 | test_interval = atoi(argv[i + 1]); 70 | if (strcmp(argv[i], "-int_report") == 0) 71 | report_interval = atoi(argv[i + 1]); 72 | if (strcmp(argv[i], "-int_save") == 0) 73 | save_interval = atoi(argv[i + 1]); 74 | if (strcmp(argv[i], "-l2") == 0) 75 | l2_penalty = atof(argv[i + 1]); 76 | if (strcmp(argv[i], "-m") == 0) 77 | momentum = atof(argv[i + 1]); 78 | if (strcmp(argv[i], "-result") == 0) 79 | result_file = argv[i + 1]; 80 | if (strcmp(argv[i], "-svdir") == 0) 81 | save_dir = argv[i + 1]; 82 | if (strcmp(argv[i], "-string") == 0) 83 | string_file = argv[i + 1]; 84 | if (strcmp(argv[i], "-train_idx") == 0) 85 | train_idx_file = argv[i + 1]; 86 | if (strcmp(argv[i], "-test_idx") == 0) 87 | test_idx_file = argv[i + 1]; 88 | if (strcmp(argv[i], "-device") == 0) 89 | dev_id = atoi(argv[i + 1]); 90 | } 91 | 92 | if (pad) 93 | { 94 | node_dim = 1; 95 | for (int i = 0; i < window_size; ++i) 96 | node_dim *= 5; 97 | } 98 | else 99 | node_dim = 1 << (2 * window_size); 100 | 101 | std::cerr << "max_pool = " << max_pool << std::endl; 102 | std::cerr << "node_dim = " << node_dim << std::endl; 103 | std::cerr << "pad = " << 
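// To illustrate how these flags compose on the command line (hypothetical
// values; real runs set them via local_run.sh), an invocation might look like:
//
//   ./build/kernel_mean_field -string data/12mer-kd/12mer-kd.txt \
//       -train_idx data/12mer-kd/10fold_idx/train_idx-1.txt \
//       -test_idx data/12mer-kd/10fold_idx/test_idx-1.txt \
//       -lr 0.0005 -b 50 -maxe 200 -device 0
//
// Every option is a (-flag, value) pair, which is why the parsing loop above
// steps i by 2; unrecognized flags are silently ignored.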
pad << std::endl;
104 |         std::cerr << "window_size = " << window_size << std::endl;
105 |         std::cerr << "n_hidden = " << n_hidden << std::endl;
106 |         std::cerr << "global_pool = " << global_pool << std::endl;
107 |         std::cerr << "max level = " << max_lv << std::endl;
108 |         std::cerr << "conv size = " << conv_size << std::endl;
109 |         std::cerr << "fp len = " << fp_len << std::endl;
110 |         std::cerr << "batch_size = " << batch_size << std::endl;
111 |         std::cerr << "max_epoch = " << max_epoch << std::endl;
112 |         std::cerr << "test_interval = " << test_interval << std::endl;
113 |         std::cerr << "report_interval = " << report_interval << std::endl;
114 |         std::cerr << "save_interval = " << save_interval << std::endl;
115 |         std::cerr << "lr = " << lr << std::endl;
116 |         std::cerr << "l2_penalty = " << l2_penalty << std::endl;
117 |         std::cerr << "momentum = " << momentum << std::endl;
118 |         std::cerr << "init iter = " << iter << std::endl;
119 |         std::cerr << "device id = " << dev_id << std::endl;
120 |         std::cerr << "scale = " << scale << std::endl;
121 |     }
122 | };
123 | 
124 | bool cfg::global_pool = false;
125 | bool cfg::max_pool = false;
126 | bool cfg::rev_order = false;
127 | bool cfg::pad = false;
128 | bool cfg::evaluate = false;
129 | int cfg::dev_id = 0;
130 | int cfg::node_dim = 0;
131 | int cfg::iter = 0;
132 | int cfg::max_lv = 4;
133 | int cfg::kmer = 3;
134 | int cfg::conv_size = 20;
135 | int cfg::fp_len = 512;
136 | int cfg::num_nodes = 0;
137 | unsigned cfg::n_hidden = 100;
138 | unsigned cfg::batch_size = 50;
139 | unsigned cfg::max_epoch = 200;
140 | unsigned cfg::test_interval = 10000;
141 | unsigned cfg::report_interval = 100;
142 | unsigned cfg::save_interval = 50000;
143 | int cfg::window_size = 1;
144 | Dtype cfg::lr = 0.0005;
145 | Dtype cfg::l2_penalty = 0;
146 | Dtype cfg::momentum = 0;
147 | Dtype cfg::scale = 1;
148 | const char* cfg::train_idx_file = nullptr;
149 | const char* cfg::test_idx_file = nullptr;
150 | const char* cfg::string_file = nullptr;
151 | const char* cfg::result_file = nullptr;
152 | const char* cfg::save_dir = "./saved";
153 | 
154 | #endif
155 | 
--------------------------------------------------------------------------------
/include/matrix/cuda_binary_kernel.cuh:
--------------------------------------------------------------------------------
1 | #ifndef CUDA_BINARY_KERNEL_CUH
2 | #define CUDA_BINARY_KERNEL_CUH
3 | 
4 | #include
5 | #include "gpuhandle.h"
6 | 
7 | //=================================== mul ======================================
8 | 
9 | template<typename Dtype>
10 | class BinaryMul
11 | {
12 | public:
13 |     BinaryMul() {}
14 | 
15 |     __device__ inline void operator()(Dtype& dst, const Dtype& lhs)
16 |     {
17 |         dst *= lhs;
18 |     }
19 | 
20 |     __device__ inline void operator()(Dtype& dst, const Dtype& lhs, const Dtype& rhs)
21 |     {
22 |         dst = lhs * rhs;
23 |     }
24 | };
25 | 
26 | //=================================== div ======================================
27 | 
28 | template<typename Dtype>
29 | class BinaryDiv
30 | {
31 | public:
32 |     BinaryDiv() {}
33 | 
34 |     __device__ inline void operator()(Dtype& dst, const Dtype& lhs)
35 |     {
36 |         dst /= lhs;
37 |     }
38 | 
39 |     __device__ inline void operator()(Dtype& dst, const Dtype& lhs, const Dtype& rhs)
40 |     {
41 |         dst = lhs / rhs;
42 |     }
43 | };
44 | 
45 | 
46 | //=================================== call interface ======================================
47 | 
48 | template<typename Dtype, class BinaryEngine>
49 | __global__ void BinaryKernel(Dtype *dst, const Dtype *lhs, int numElements, BinaryEngine binary)
50 | {
51 |     int i = blockDim.x * blockIdx.x + threadIdx.x;
52 | 
53 | 
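// BinaryOp below launches ceil(numElements / thread_num) blocks, so the last
// block may contain threads past the end of the arrays; this bounds check
// keeps them idle. A minimal host-side sketch (d_dst and d_lhs are
// illustrative device buffers of n elements, already allocated and filled):
//
//   BinaryOp(d_dst, d_lhs, n, BinaryMul<float>(), 0u);   // d_dst[i] *= d_lhs[i]
//   BinaryOp(d_dst, d_lhs, n, BinaryDiv<float>(), 0u);   // d_dst[i] /= d_lhs[i]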
if(i < numElements) 54 | { 55 | binary(dst[i], lhs[i]); 56 | } 57 | } 58 | 59 | template 60 | void BinaryOp(Dtype *dst, const Dtype *lhs, int numElements, BinaryEngine binary, const unsigned& sid) 61 | { 62 | int thread_num = min(c_uCudaThreadNum, numElements); 63 | int blocksPerGrid = (numElements + thread_num - 1) / thread_num; 64 | BinaryKernel<<>> (dst, lhs, numElements, binary); 65 | } 66 | 67 | template 68 | __global__ void BinaryKernel(Dtype *dst, const Dtype* lhs, const Dtype *rhs, int numElements, BinaryEngine binary) 69 | { 70 | int i = blockDim.x * blockIdx.x + threadIdx.x; 71 | 72 | if(i < numElements) 73 | { 74 | binary(dst[i], lhs[i], rhs[i]); 75 | } 76 | } 77 | 78 | template 79 | void BinaryOp(Dtype *dst, const Dtype* lhs, const Dtype *rhs, int numElements, BinaryEngine binary, const unsigned& sid) 80 | { 81 | int thread_num = min(c_uCudaThreadNum, numElements); 82 | int blocksPerGrid = (numElements + thread_num - 1) / thread_num; 83 | BinaryKernel<<>> (dst, lhs, rhs, numElements, binary); 84 | } 85 | 86 | #endif -------------------------------------------------------------------------------- /include/matrix/cuda_helper.h: -------------------------------------------------------------------------------- 1 | #ifndef CUDA_HELPER_H 2 | #define CUDA_HELPER_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | __device__ inline int get_sp_row_idx(int i, int* row_ptr, int n_rows) 10 | { 11 | int l = 0, r = n_rows - 1, row; 12 | while (l <= r) 13 | { 14 | row = (l + r) / 2; 15 | if (row_ptr[row] <= i) 16 | { 17 | if (row_ptr[row + 1] > i) 18 | break; 19 | else 20 | l = row + 1; 21 | } else r = row - 1; 22 | } 23 | return row; 24 | } 25 | 26 | __device__ inline float cuda_pow(const float& x, const float& y) 27 | { 28 | return powf(x, y); 29 | } 30 | 31 | __device__ inline double cuda_pow(const double& x, const double& y) 32 | { 33 | return pow(x, y); 34 | } 35 | 36 | __device__ inline float cuda_exp(const float& src) 37 | { 38 | return expf(src); 39 | } 40 | 41 | __device__ inline double cuda_exp(const double& src) 42 | { 43 | return exp(src); 44 | } 45 | 46 | __device__ inline float cuda_log(const float& src) 47 | { 48 | return logf(src); 49 | } 50 | 51 | __device__ inline double cuda_log(const double& src) 52 | { 53 | return log(src); 54 | } 55 | 56 | inline float CudaHelper_Dot(cublasHandle_t& handle, int n, const float *x, const float* y) 57 | { 58 | float result; 59 | cublasSdot(handle, n, x, 1, y, 1, &result); 60 | return result; 61 | } 62 | 63 | inline double CudaHelper_Dot(cublasHandle_t& handle, int n, const double *x, const double* y) 64 | { 65 | double result; 66 | cublasDdot(handle, n, x, 1, y, 1, &result); 67 | return result; 68 | } 69 | 70 | inline float CudaHelper_Norm2(cublasHandle_t& handle, int n, const float *x) 71 | { 72 | float result; 73 | cublasSnrm2(handle, n, x, 1, &result); 74 | return result; 75 | } 76 | 77 | inline double CudaHelper_Norm2(cublasHandle_t& handle, int n, const double *x) 78 | { 79 | double result; 80 | cublasDnrm2(handle, n, x, 1, &result); 81 | return result; 82 | } 83 | 84 | inline void CudaHelper_Amax(cublasHandle_t& handle, int n, const float *x, int* result) 85 | { 86 | cublasIsamax(handle, n, x, 1, result); 87 | } 88 | 89 | inline void CudaHelper_Amax(cublasHandle_t& handle, int n, const double *x, int* result) 90 | { 91 | cublasIdamax(handle, n, x, 1, result); 92 | } 93 | 94 | inline float CudaHelper_Asum(cublasHandle_t& handle, int n, const float *x) 95 | { 96 | float result; 97 | cublasSasum(handle, n, x, 1, &result); 98 
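// cuBLAS returns scalar results through this pointer according to the
// handle's pointer mode; these wrappers assume the default
// CUBLAS_POINTER_MODE_HOST, so a stack address is fine. If the handle were
// switched with
//
//   cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE);
//
// the result argument would have to point into device memory instead.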
| return result; 99 | } 100 | 101 | inline double CudaHelper_Asum(cublasHandle_t& handle, int n, const double *x) 102 | { 103 | double result; 104 | cublasDasum(handle, n, x, 1, &result); 105 | return result; 106 | } 107 | 108 | inline void CudaHelper_Ger(cublasHandle_t& handle, int m, int n, const float* alpha, const float* x, const float* y, float* A) 109 | { 110 | cublasSger(handle, m, n, alpha, x, 1, y, 1, A, m); 111 | } 112 | 113 | inline void CudaHelper_Ger(cublasHandle_t& handle, int m, int n, const double* alpha, const double* x, const double* y, double* A) 114 | { 115 | cublasDger(handle, m, n, alpha, x, 1, y, 1, A, m); 116 | } 117 | 118 | inline void CudaHelper_GeMV(cublasHandle_t& handle, cublasOperation_t trans, 119 | int m, int n, 120 | const float* alpha, const float* A, int lda, 121 | const float *x, int incx, 122 | const float *beta, float* y, int incy) 123 | { 124 | cublasSgemv(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy); 125 | } 126 | 127 | inline void CudaHelper_GeMV(cublasHandle_t& handle, cublasOperation_t trans, 128 | int m, int n, 129 | const double* alpha, const double* A, int lda, 130 | const double *x, int incx, 131 | const double *beta, double* y, int incy) 132 | { 133 | cublasDgemv(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy); 134 | } 135 | 136 | inline void CudaHelper_GeaM(cublasHandle_t& handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, 137 | const float* alpha, const float* A, int lda, const float* beta, const float* B, int ldb, 138 | float* C, int ldc) 139 | { 140 | cublasSgeam(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc); 141 | } 142 | 143 | inline void CudaHelper_GeaM(cublasHandle_t& handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, 144 | const double* alpha, const double* A, int lda, const double* beta, const double* B, int ldb, 145 | double* C, int ldc) 146 | { 147 | cublasDgeam(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc); 148 | } 149 | 150 | inline void CudaHelper_Axpy(cublasHandle_t& handle, int n, const float *alpha, const float *x, float *y) 151 | { 152 | cublasSaxpy(handle, n, alpha, x, 1, y, 1); 153 | } 154 | 155 | inline void CudaHelper_Axpy(cublasHandle_t& handle, int n, const double *alpha, const double *x, double *y) 156 | { 157 | cublasDaxpy(handle, n, alpha, x, 1, y, 1); 158 | } 159 | 160 | inline void CudaHelper_SetRandNormal(curandGenerator_t& generator, float* outputPtr, size_t n, float mean, float stddev) 161 | { 162 | curandGenerateNormal(generator, outputPtr, n, mean, stddev); 163 | } 164 | 165 | inline void CudaHelper_SetRandNormal(curandGenerator_t& generator, double* outputPtr, size_t n, double mean, double stddev) 166 | { 167 | curandGenerateNormalDouble(generator, outputPtr, n, mean, stddev); 168 | } 169 | 170 | #endif -------------------------------------------------------------------------------- /include/matrix/cuda_rand_kernel.cuh: -------------------------------------------------------------------------------- 1 | #ifndef CUDA_RAND_KERNEL_CUH 2 | #define CUDA_RAND_KERNEL_CUH 3 | 4 | #include 5 | #include 6 | #include "gpuhandle.h" 7 | 8 | template 9 | class NormalRandomizer 10 | { 11 | public: 12 | NormalRandomizer(Dtype _mean, Dtype _std) : mean(_mean), std(_std) {} 13 | __device__ inline Dtype operator()(curandState_t* state) 14 | { 15 | return curand_normal(state) * std + mean; 16 | } 17 | 18 | private: 19 | Dtype mean; 20 | Dtype std; 21 | }; 22 | 23 | template 24 | class BinomialRandomizer 25 | { 26 
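// Note: despite the name, this samples a Rademacher distribution, i.e. +1 or
// -1 with equal probability (curand_uniform is uniform on (0, 1], so the 0.5
// threshold splits it evenly); it does not produce binomial counts.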
| public: 27 | BinomialRandomizer() {} 28 | __device__ inline Dtype operator()(curandState_t* state) 29 | { 30 | return curand_uniform(state) > 0.5 ? 1.0 : -1.0; 31 | } 32 | }; 33 | 34 | template 35 | class UniformRandomizer 36 | { 37 | public: 38 | UniformRandomizer(Dtype _lb, Dtype _ub) : lb(_lb), ub(_ub) {} 39 | __device__ inline Dtype operator()(curandState_t* state) 40 | { 41 | return curand_uniform(state) * (ub - lb) + lb; 42 | } 43 | 44 | private: 45 | Dtype lb; 46 | Dtype ub; 47 | }; 48 | 49 | 50 | template 51 | class ChisquareRandomizer 52 | { 53 | public: 54 | ChisquareRandomizer(Dtype _degree) : alpha(_degree / 2) {} 55 | 56 | __device__ inline Dtype operator()(curandState_t* state) 57 | { 58 | Dtype x, v, u; 59 | Dtype d = alpha - 1.0 / 3.0; 60 | Dtype c = (1.0 / 3.0) / sqrt (d); 61 | 62 | while (1){ 63 | do { 64 | x = curand_normal(state); 65 | v = 1.0 + c * x; 66 | } while (v <= 0); 67 | 68 | v = v * v * v; 69 | u = curand_uniform(state); 70 | 71 | if (u < 1 - 0.0331 * x * x * x * x) 72 | break; 73 | 74 | if (log (u) < 0.5 * x * x + d * (1 - v + log (v))) 75 | break; 76 | } 77 | // scale by 2.0 to get chisquare 78 | return 2.0 * (d * v); 79 | } 80 | 81 | private: 82 | const Dtype alpha; 83 | }; 84 | 85 | template 86 | __global__ void RandKernel(Dtype *targets, int numElements, curandState_t* state, RandEngine rnd) 87 | { 88 | const int tidx = NUM_RND_THREADS_PER_BLOCK * blockIdx.x + threadIdx.x; 89 | curandState_t localState = state[tidx]; 90 | for (int i = tidx; i < numElements; i += NUM_RND_STREAMS) 91 | { 92 | targets[i] = rnd(&localState); 93 | } 94 | state[tidx] = localState; 95 | } 96 | 97 | template 98 | void SetRand(Dtype *dst, int numElements, RandEngine rnd, const unsigned& sid) 99 | { 100 | RandKernel<<>>(dst, numElements, GPUHandle::devRandStates, rnd); 101 | } 102 | 103 | #endif -------------------------------------------------------------------------------- /include/matrix/cuda_unary_kernel.cuh: -------------------------------------------------------------------------------- 1 | #ifndef CUDA_UNARY_KERNEL_CUH 2 | #define CUDA_UNARY_KERNEL_CUH 3 | 4 | #include 5 | #include "gpuhandle.h" 6 | #include "cuda_helper.h" 7 | 8 | //=================================== power ====================================== 9 | 10 | template 11 | class UnaryPow 12 | { 13 | public: 14 | UnaryPow(Dtype _scalar) : scalar(_scalar) {} 15 | 16 | __device__ inline void operator()(Dtype& dst) 17 | { 18 | dst = cuda_pow(dst, scalar); 19 | } 20 | 21 | __device__ inline void operator()(Dtype& dst, const Dtype& src) 22 | { 23 | dst = cuda_pow(src, scalar); 24 | } 25 | 26 | private: 27 | Dtype scalar; 28 | }; 29 | 30 | //=================================== scale ====================================== 31 | 32 | template 33 | class UnaryScale 34 | { 35 | public: 36 | UnaryScale(Dtype _scalar) : scalar(_scalar) {} 37 | 38 | __device__ inline void operator()(Dtype& dst) 39 | { 40 | dst *= scalar; 41 | } 42 | 43 | __device__ inline void operator()(Dtype& dst, const Dtype& src) 44 | { 45 | dst = src * scalar; 46 | } 47 | 48 | private: 49 | Dtype scalar; 50 | }; 51 | 52 | //=================================== sqrt ====================================== 53 | 54 | template 55 | class UnarySqrt{}; 56 | 57 | template<> 58 | class UnarySqrt 59 | { 60 | public: 61 | UnarySqrt() {} 62 | __device__ inline void operator()(float& dst) 63 | { 64 | dst = sqrtf(dst); 65 | } 66 | 67 | __device__ inline void operator()(float& dst, const float& src) 68 | { 69 | dst = sqrtf(src); 70 | } 71 | }; 72 | 73 | template<> 74 | 
class UnarySqrt 75 | { 76 | public: 77 | UnarySqrt() {} 78 | __device__ inline void operator()(double& dst) 79 | { 80 | dst = sqrt(dst); 81 | } 82 | 83 | __device__ inline void operator()(double& dst, const double& src) 84 | { 85 | dst = sqrt(src); 86 | } 87 | }; 88 | 89 | //=================================== set ====================================== 90 | 91 | template 92 | class UnarySet 93 | { 94 | public: 95 | UnarySet(Dtype _scalar) : scalar(_scalar) {} 96 | 97 | __device__ inline void operator()(Dtype& dst) 98 | { 99 | dst = scalar; 100 | } 101 | 102 | private: 103 | Dtype scalar; 104 | }; 105 | 106 | //=================================== add ====================================== 107 | 108 | template 109 | class UnaryAdd 110 | { 111 | public: 112 | UnaryAdd(Dtype _scalar) : scalar(_scalar) {} 113 | 114 | __device__ inline void operator()(Dtype& dst) 115 | { 116 | dst += scalar; 117 | } 118 | 119 | __device__ inline void operator()(Dtype& dst, const Dtype& src) 120 | { 121 | dst = src + scalar; 122 | } 123 | 124 | private: 125 | Dtype scalar; 126 | }; 127 | 128 | //=================================== inv ====================================== 129 | 130 | template 131 | class UnaryInv 132 | { 133 | public: 134 | UnaryInv() {} 135 | 136 | __device__ inline void operator()(Dtype& dst) 137 | { 138 | dst = 1.0 / dst; 139 | } 140 | 141 | __device__ inline void operator()(Dtype& dst, const Dtype& src) 142 | { 143 | dst = 1.0 / src; 144 | } 145 | }; 146 | 147 | //=================================== inv_sqrt ====================================== 148 | 149 | template 150 | class UnaryInvSqrt 151 | { 152 | public: 153 | UnaryInvSqrt() {} 154 | 155 | __device__ inline void operator()(Dtype& dst) 156 | { 157 | dst = my_inv_sqrt(dst); 158 | } 159 | 160 | __device__ inline void operator()(Dtype& dst, const Dtype& src) 161 | { 162 | dst = my_inv_sqrt(src); 163 | } 164 | 165 | private: 166 | __device__ inline float my_inv_sqrt(const float& src) 167 | { 168 | return rsqrtf(src); 169 | } 170 | 171 | __device__ inline double my_inv_sqrt(const double& src) 172 | { 173 | return rsqrt(src); 174 | } 175 | }; 176 | 177 | //=================================== sin ====================================== 178 | 179 | template 180 | class UnarySin 181 | { 182 | public: 183 | UnarySin() {} 184 | 185 | __device__ inline void operator()(Dtype& dst) 186 | { 187 | dst = my_sin(dst); 188 | } 189 | 190 | __device__ inline void operator()(Dtype& dst, const Dtype& src) 191 | { 192 | dst = my_sin(src); 193 | } 194 | 195 | private: 196 | __device__ inline float my_sin(const float& src) 197 | { 198 | return sinf(src); 199 | } 200 | 201 | __device__ inline double my_sin(const double& src) 202 | { 203 | return sin(src); 204 | } 205 | }; 206 | 207 | //=================================== exp ====================================== 208 | 209 | template 210 | class UnaryExp 211 | { 212 | public: 213 | UnaryExp() {} 214 | 215 | __device__ inline void operator()(Dtype& dst) 216 | { 217 | dst = cuda_exp(dst); 218 | } 219 | 220 | __device__ inline void operator()(Dtype& dst, const Dtype& src) 221 | { 222 | dst = cuda_exp(src); 223 | } 224 | }; 225 | 226 | //=================================== log ====================================== 227 | template 228 | class UnaryLog 229 | { 230 | public: 231 | UnaryLog() {} 232 | 233 | __device__ inline void operator()(Dtype& dst) 234 | { 235 | dst = cuda_log(dst); 236 | } 237 | 238 | __device__ inline void operator()(Dtype& dst, const Dtype& src) 239 | { 240 | dst = cuda_log(src); 241 | } 242 | 
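// All functors in this header are consumed by the UnaryOp launchers defined
// at the bottom of the file. A minimal sketch, assuming d_x and d_y are
// illustrative device buffers of n floats:
//
//   UnaryOp(d_x, n, UnaryExp<float>(), 0u);        // in place:  x = exp(x)
//   UnaryOp(d_y, d_x, n, UnaryLog<float>(), 0u);   // out of place: y = log(x)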
}; 243 | 244 | //=================================== sigmoid ====================================== 245 | 246 | template 247 | class UnarySigmoid 248 | { 249 | public: 250 | UnarySigmoid() {} 251 | 252 | __device__ inline void operator()(Dtype& dst) 253 | { 254 | dst = 1.0 / (1.0 + cuda_exp(-dst)); 255 | } 256 | 257 | __device__ inline void operator()(Dtype& dst, const Dtype& src) 258 | { 259 | dst = 1.0 / (1.0 + cuda_exp(-src)); 260 | } 261 | }; 262 | 263 | //=================================== cos ====================================== 264 | 265 | template 266 | class UnaryCos 267 | { 268 | public: 269 | UnaryCos() {} 270 | 271 | __device__ inline void operator()(Dtype& dst) 272 | { 273 | dst = my_cos(dst); 274 | } 275 | 276 | __device__ inline void operator()(Dtype& dst, const Dtype& src) 277 | { 278 | dst = my_cos(src); 279 | } 280 | 281 | private: 282 | __device__ inline float my_cos(const float& src) 283 | { 284 | return cosf(src); 285 | } 286 | 287 | __device__ inline double my_cos(const double& src) 288 | { 289 | return cos(src); 290 | } 291 | }; 292 | 293 | //=================================== square ====================================== 294 | 295 | template 296 | class UnarySquare 297 | { 298 | public: 299 | UnarySquare() {} 300 | 301 | __device__ inline void operator()(Dtype& dst) 302 | { 303 | dst = dst * dst; 304 | } 305 | 306 | __device__ inline void operator()(Dtype& dst, const Dtype& src) 307 | { 308 | dst = src * src; 309 | } 310 | }; 311 | 312 | //=================================== relu ====================================== 313 | 314 | template 315 | class UnaryReLU 316 | { 317 | public: 318 | UnaryReLU() {} 319 | 320 | __device__ inline void operator()(Dtype& dst) 321 | { 322 | dst = dst > 0 ? dst : 0; 323 | } 324 | 325 | __device__ inline void operator()(Dtype& dst, const Dtype& src) 326 | { 327 | dst = src > 0 ? src : 0; 328 | } 329 | }; 330 | 331 | 332 | //=================================== call interface ====================================== 333 | 334 | template 335 | __global__ void UnaryKernel(Dtype *dst, int numElements, UnaryEngine unary) 336 | { 337 | int i = blockDim.x * blockIdx.x + threadIdx.x; 338 | 339 | if(i < numElements) 340 | { 341 | unary(dst[i]); 342 | } 343 | } 344 | 345 | template 346 | void UnaryOp(Dtype *dst, int numElements, UnaryEngine unary, const unsigned& sid) 347 | { 348 | int thread_num = min(c_uCudaThreadNum, numElements); 349 | int blocksPerGrid = (numElements + thread_num - 1) / thread_num; 350 | UnaryKernel<<>> (dst, numElements, unary); 351 | } 352 | 353 | template 354 | __global__ void UnaryKernel(Dtype *dst, Dtype* src, int numElements, UnaryEngine unary) 355 | { 356 | int i = blockDim.x * blockIdx.x + threadIdx.x; 357 | 358 | if(i < numElements) 359 | { 360 | unary(dst[i], src[i]); 361 | } 362 | } 363 | 364 | template 365 | void UnaryOp(Dtype *dst, Dtype* src, int numElements, UnaryEngine unary, const unsigned& sid) 366 | { 367 | int thread_num = min(c_uCudaThreadNum, numElements); 368 | int blocksPerGrid = (numElements + thread_num - 1) / thread_num; 369 | UnaryKernel<<>> (dst, src, numElements, unary); 370 | } 371 | 372 | #endif -------------------------------------------------------------------------------- /include/matrix/fastWalshTransform_kernel_double.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 1993-2012 NVIDIA Corporation. All rights reserved. 
3 | * 4 | * Please refer to the NVIDIA end user license agreement (EULA) associated 5 | * with this source code for terms and conditions that govern your use of 6 | * this software. Any use, reproduction, disclosure, or distribution of 7 | * this software and related documentation outside the terms of the EULA 8 | * is strictly prohibited. 9 | * 10 | */ 11 | 12 | 13 | 14 | #ifndef FWT_KERNEL_CUH_DOUBLE 15 | #define FWT_KERNEL_CUH_DOUBLE 16 | #ifndef fwt_kernel_cuh_double 17 | #define fwt_kernel_cuh_double 18 | 19 | 20 | 21 | /////////////////////////////////////////////////////////////////////////////// 22 | // Elementary(for vectors less than elementary size) in-shared memory 23 | // combined radix-2 + radix-4 Fast Walsh Transform 24 | /////////////////////////////////////////////////////////////////////////////// 25 | #define ELEMENTARY_LOG2SIZE 11 26 | 27 | __global__ void fwtBatch1Kernel(double *d_Output, double *d_Input, int log2N) 28 | { 29 | const int N = 1 << log2N; 30 | const int base = blockIdx.x << log2N; 31 | 32 | //(2 ** 11) * 4 bytes == 8KB -- maximum d_data[] size for G80 33 | extern __shared__ double d_data[]; 34 | double *d_Src = d_Input + base; 35 | double *d_Dst = d_Output + base; 36 | 37 | for (int pos = threadIdx.x; pos < N; pos += blockDim.x) 38 | { 39 | d_data[pos] = d_Src[pos]; 40 | } 41 | 42 | //Main radix-4 stages 43 | const int pos = threadIdx.x; 44 | 45 | for (int stride = N >> 2; stride > 0; stride >>= 2) 46 | { 47 | int lo = pos & (stride - 1); 48 | int i0 = ((pos - lo) << 2) + lo; 49 | int i1 = i0 + stride; 50 | int i2 = i1 + stride; 51 | int i3 = i2 + stride; 52 | 53 | __syncthreads(); 54 | double D0 = d_data[i0]; 55 | double D1 = d_data[i1]; 56 | double D2 = d_data[i2]; 57 | double D3 = d_data[i3]; 58 | 59 | double T; 60 | T = D0; 61 | D0 = D0 + D2; 62 | D2 = T - D2; 63 | T = D1; 64 | D1 = D1 + D3; 65 | D3 = T - D3; 66 | T = D0; 67 | d_data[i0] = D0 + D1; 68 | d_data[i1] = T - D1; 69 | T = D2; 70 | d_data[i2] = D2 + D3; 71 | d_data[i3] = T - D3; 72 | } 73 | 74 | //Do single radix-2 stage for odd power of two 75 | if (log2N & 1) 76 | { 77 | __syncthreads(); 78 | 79 | for (int pos = threadIdx.x; pos < N / 2; pos += blockDim.x) 80 | { 81 | int i0 = pos << 1; 82 | int i1 = i0 + 1; 83 | 84 | double D0 = d_data[i0]; 85 | double D1 = d_data[i1]; 86 | d_data[i0] = D0 + D1; 87 | d_data[i1] = D0 - D1; 88 | } 89 | } 90 | 91 | __syncthreads(); 92 | 93 | for (int pos = threadIdx.x; pos < N; pos += blockDim.x) 94 | { 95 | d_Dst[pos] = d_data[pos]; 96 | } 97 | } 98 | 99 | //////////////////////////////////////////////////////////////////////////////// 100 | // Single in-global memory radix-4 Fast Walsh Transform pass 101 | // (for strides exceeding elementary vector size) 102 | //////////////////////////////////////////////////////////////////////////////// 103 | __global__ void fwtBatch2Kernel( 104 | double *d_Output, 105 | double *d_Input, 106 | int stride 107 | ) 108 | { 109 | const int pos = blockIdx.x * blockDim.x + threadIdx.x; 110 | const int N = blockDim.x * gridDim.x * 4; 111 | 112 | double *d_Src = d_Input + blockIdx.y * N; 113 | double *d_Dst = d_Output + blockIdx.y * N; 114 | 115 | int lo = pos & (stride - 1); 116 | int i0 = ((pos - lo) << 2) + lo; 117 | int i1 = i0 + stride; 118 | int i2 = i1 + stride; 119 | int i3 = i2 + stride; 120 | 121 | double D0 = d_Src[i0]; 122 | double D1 = d_Src[i1]; 123 | double D2 = d_Src[i2]; 124 | double D3 = d_Src[i3]; 125 | 126 | double T; 127 | T = D0; 128 | D0 = D0 + D2; 129 | D2 = T - D2; 130 | T = D1; 131 | D1 = D1 + D3; 
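// Taken together, these assignments apply one radix-4 Walsh-Hadamard
// butterfly to (D0, D1, D2, D3), using T as scratch:
//
//   out0 = D0 + D2 + D1 + D3
//   out1 = D0 + D2 - D1 - D3
//   out2 = D0 - D2 + D1 - D3
//   out3 = D0 - D2 - D1 + D3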
132 | D3 = T - D3; 133 | T = D0; 134 | d_Dst[i0] = D0 + D1; 135 | d_Dst[i1] = T - D1; 136 | T = D2; 137 | d_Dst[i2] = D2 + D3; 138 | d_Dst[i3] = T - D3; 139 | } 140 | 141 | //////////////////////////////////////////////////////////////////////////////// 142 | // Put everything together: batched Fast Walsh Transform CPU front-end 143 | //////////////////////////////////////////////////////////////////////////////// 144 | void fwtBatchGPU(double *d_Data, int M, int log2N) 145 | { 146 | const int THREAD_N = 1024; 147 | 148 | int N = 1 << log2N; 149 | dim3 grid((1 << log2N) / (4 * THREAD_N), M, 1); 150 | 151 | for (; log2N > ELEMENTARY_LOG2SIZE; log2N -= 2, N >>= 2, M <<= 2) 152 | { 153 | fwtBatch2Kernel<<>>(d_Data, d_Data, N / 4); 154 | } 155 | 156 | fwtBatch1Kernel<<>>( 157 | d_Data, 158 | d_Data, 159 | log2N 160 | ); 161 | } 162 | 163 | 164 | //////////////////////////////////////////////////////////////////////////////// 165 | // Modulate two arrays 166 | //////////////////////////////////////////////////////////////////////////////// 167 | __global__ void modulateKernel(double *d_A, double *d_B, int N) 168 | { 169 | int tid = blockIdx.x * blockDim.x + threadIdx.x; 170 | int numThreads = blockDim.x * gridDim.x; 171 | double rcpN = 1.0f / (double)N; 172 | 173 | for (int pos = tid; pos < N; pos += numThreads) 174 | { 175 | d_A[pos] *= d_B[pos] * rcpN; 176 | } 177 | } 178 | 179 | //Interface to modulateKernel() 180 | void modulateGPU(double *d_A, double *d_B, int N) 181 | { 182 | modulateKernel<<<128, 256>>>(d_A, d_B, N); 183 | } 184 | 185 | 186 | 187 | #endif 188 | #endif 189 | -------------------------------------------------------------------------------- /include/matrix/fastWalshTransform_kernel_float.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 1993-2012 NVIDIA Corporation. All rights reserved. 3 | * 4 | * Please refer to the NVIDIA end user license agreement (EULA) associated 5 | * with this source code for terms and conditions that govern your use of 6 | * this software. Any use, reproduction, disclosure, or distribution of 7 | * this software and related documentation outside the terms of the EULA 8 | * is strictly prohibited. 
9 | * 10 | */ 11 | 12 | 13 | 14 | #ifndef FWT_KERNEL_CUH_FLOAT 15 | #define FWT_KERNEL_CUH_FLOAT 16 | #ifndef fwt_kernel_cuh_float 17 | #define fwt_kernel_cuh_float 18 | 19 | 20 | 21 | /////////////////////////////////////////////////////////////////////////////// 22 | // Elementary(for vectors less than elementary size) in-shared memory 23 | // combined radix-2 + radix-4 Fast Walsh Transform 24 | /////////////////////////////////////////////////////////////////////////////// 25 | #define ELEMENTARY_LOG2SIZE 11 26 | 27 | __global__ void fwtBatch1Kernel(float *d_Output, float *d_Input, int log2N) 28 | { 29 | const int N = 1 << log2N; 30 | const int base = blockIdx.x << log2N; 31 | 32 | //(2 ** 11) * 4 bytes == 8KB -- maximum s_data[] size for G80 33 | extern __shared__ float s_data[]; 34 | float *d_Src = d_Input + base; 35 | float *d_Dst = d_Output + base; 36 | 37 | for (int pos = threadIdx.x; pos < N; pos += blockDim.x) 38 | { 39 | s_data[pos] = d_Src[pos]; 40 | } 41 | 42 | //Main radix-4 stages 43 | const int pos = threadIdx.x; 44 | 45 | for (int stride = N >> 2; stride > 0; stride >>= 2) 46 | { 47 | int lo = pos & (stride - 1); 48 | int i0 = ((pos - lo) << 2) + lo; 49 | int i1 = i0 + stride; 50 | int i2 = i1 + stride; 51 | int i3 = i2 + stride; 52 | 53 | __syncthreads(); 54 | float D0 = s_data[i0]; 55 | float D1 = s_data[i1]; 56 | float D2 = s_data[i2]; 57 | float D3 = s_data[i3]; 58 | 59 | float T; 60 | T = D0; 61 | D0 = D0 + D2; 62 | D2 = T - D2; 63 | T = D1; 64 | D1 = D1 + D3; 65 | D3 = T - D3; 66 | T = D0; 67 | s_data[i0] = D0 + D1; 68 | s_data[i1] = T - D1; 69 | T = D2; 70 | s_data[i2] = D2 + D3; 71 | s_data[i3] = T - D3; 72 | } 73 | 74 | //Do single radix-2 stage for odd power of two 75 | if (log2N & 1) 76 | { 77 | __syncthreads(); 78 | 79 | for (int pos = threadIdx.x; pos < N / 2; pos += blockDim.x) 80 | { 81 | int i0 = pos << 1; 82 | int i1 = i0 + 1; 83 | 84 | float D0 = s_data[i0]; 85 | float D1 = s_data[i1]; 86 | s_data[i0] = D0 + D1; 87 | s_data[i1] = D0 - D1; 88 | } 89 | } 90 | 91 | __syncthreads(); 92 | 93 | for (int pos = threadIdx.x; pos < N; pos += blockDim.x) 94 | { 95 | d_Dst[pos] = s_data[pos]; 96 | } 97 | } 98 | 99 | //////////////////////////////////////////////////////////////////////////////// 100 | // Single in-global memory radix-4 Fast Walsh Transform pass 101 | // (for strides exceeding elementary vector size) 102 | //////////////////////////////////////////////////////////////////////////////// 103 | __global__ void fwtBatch2Kernel( 104 | float *d_Output, 105 | float *d_Input, 106 | int stride 107 | ) 108 | { 109 | const int pos = blockIdx.x * blockDim.x + threadIdx.x; 110 | const int N = blockDim.x * gridDim.x * 4; 111 | 112 | float *d_Src = d_Input + blockIdx.y * N; 113 | float *d_Dst = d_Output + blockIdx.y * N; 114 | 115 | int lo = pos & (stride - 1); 116 | int i0 = ((pos - lo) << 2) + lo; 117 | int i1 = i0 + stride; 118 | int i2 = i1 + stride; 119 | int i3 = i2 + stride; 120 | 121 | float D0 = d_Src[i0]; 122 | float D1 = d_Src[i1]; 123 | float D2 = d_Src[i2]; 124 | float D3 = d_Src[i3]; 125 | 126 | float T; 127 | T = D0; 128 | D0 = D0 + D2; 129 | D2 = T - D2; 130 | T = D1; 131 | D1 = D1 + D3; 132 | D3 = T - D3; 133 | T = D0; 134 | d_Dst[i0] = D0 + D1; 135 | d_Dst[i1] = T - D1; 136 | T = D2; 137 | d_Dst[i2] = D2 + D3; 138 | d_Dst[i3] = T - D3; 139 | } 140 | 141 | //////////////////////////////////////////////////////////////////////////////// 142 | // Put everything together: batched Fast Walsh Transform CPU front-end 143 | 
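// Usage sketch (illustrative; assumes log2N is large enough for the fixed
// launch configuration below): the transform runs in place over M batches of
// length N = 2^log2N, so the buffer must hold M * (1 << log2N) floats. The
// usual dyadic-convolution pattern with the helpers in this file is:
//
//   fwtBatchGPU(d_A, 1, log2N);          // forward transform of signal
//   fwtBatchGPU(d_B, 1, log2N);          // forward transform of kernel
//   modulateGPU(d_A, d_B, 1 << log2N);   // pointwise product; folds in 1/N
//   fwtBatchGPU(d_A, 1, log2N);          // WHT is self-inverse up to that 1/N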
//////////////////////////////////////////////////////////////////////////////// 144 | void fwtBatchGPU(float *d_Data, int M, int log2N) 145 | { 146 | const int THREAD_N = 1024; 147 | 148 | int N = 1 << log2N; 149 | dim3 grid((1 << log2N) / (4 * THREAD_N), M, 1); 150 | 151 | for (; log2N > ELEMENTARY_LOG2SIZE; log2N -= 2, N >>= 2, M <<= 2) 152 | { 153 | fwtBatch2Kernel<<>>(d_Data, d_Data, N / 4); 154 | } 155 | 156 | fwtBatch1Kernel<<>>( 157 | d_Data, 158 | d_Data, 159 | log2N 160 | ); 161 | } 162 | 163 | 164 | //////////////////////////////////////////////////////////////////////////////// 165 | // Modulate two arrays 166 | //////////////////////////////////////////////////////////////////////////////// 167 | __global__ void modulateKernel(float *d_A, float *d_B, int N) 168 | { 169 | int tid = blockIdx.x * blockDim.x + threadIdx.x; 170 | int numThreads = blockDim.x * gridDim.x; 171 | float rcpN = 1.0f / (float)N; 172 | 173 | for (int pos = tid; pos < N; pos += numThreads) 174 | { 175 | d_A[pos] *= d_B[pos] * rcpN; 176 | } 177 | } 178 | 179 | //Interface to modulateKernel() 180 | void modulateGPU(float *d_A, float *d_B, int N) 181 | { 182 | modulateKernel<<<128, 256>>>(d_A, d_B, N); 183 | } 184 | 185 | 186 | 187 | #endif 188 | #endif 189 | -------------------------------------------------------------------------------- /include/matrix/gpuhandle.h: -------------------------------------------------------------------------------- 1 | #ifndef GPUHANDLE_H 2 | #define GPUHANDLE_H 3 | 4 | #include "mat_typedef.h" 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #define NUM_RND_BLOCKS 96 13 | #define NUM_RND_THREADS_PER_BLOCK 128 14 | #define NUM_RND_STREAMS (NUM_RND_BLOCKS * NUM_RND_THREADS_PER_BLOCK) 15 | 16 | struct GPUHandle 17 | { 18 | static cudaStream_t* streams; 19 | static cublasHandle_t cublashandle; 20 | static cusparseHandle_t cusparsehandle; 21 | static curandGenerator_t curandgenerator; 22 | static unsigned int streamcnt; 23 | 24 | static void Init(int dev_id, unsigned int _streamcnt = 1U); 25 | static void Destroy(); 26 | 27 | static curandState_t* devRandStates; 28 | }; 29 | 30 | #endif 31 | -------------------------------------------------------------------------------- /include/matrix/imatrix.h: -------------------------------------------------------------------------------- 1 | #ifndef IMATRIX_H 2 | #define IMATRIX_H 3 | 4 | #include "matrix_utils.h" 5 | #include "gpuhandle.h" 6 | #include 7 | #define GPU_T(x) (x == Trans::N ? cublasOperation_t::CUBLAS_OP_N : cublasOperation_t::CUBLAS_OP_T) 8 | #define CUSP_T(x) (x == Trans::N ? CUSPARSE_OPERATION_NON_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE) 9 | #define CPU_CharT(x) (x == Trans::N ? 'N' : 'T') 10 | #define CPU_T(x) (x == Trans::N ? 
CblasNoTrans : CblasTrans) 11 | 12 | template 13 | class SparseMat; 14 | template 15 | class DenseMat; 16 | 17 | template 18 | class IMatrix 19 | { 20 | public: 21 | virtual MatType GetMatType() = 0; 22 | virtual ~IMatrix() {} 23 | 24 | virtual void Serialize(FILE* fid) 25 | { 26 | assert(fwrite(&rows, sizeof(size_t), 1, fid) == 1); 27 | assert(fwrite(&cols, sizeof(size_t), 1, fid) == 1); 28 | assert(fwrite(&count, sizeof(size_t), 1, fid) == 1); 29 | } 30 | 31 | virtual void Deserialize(FILE* fid) 32 | { 33 | assert(fread(&rows, sizeof(size_t), 1, fid) == 1); 34 | assert(fread(&cols, sizeof(size_t), 1, fid) == 1); 35 | assert(fread(&count, sizeof(size_t), 1, fid) == 1); 36 | } 37 | 38 | virtual void Print2Screen() = 0; 39 | 40 | virtual DenseMat& DenseDerived() 41 | { 42 | throw "Can not derive Dense Matrix from CSR Matrix"; 43 | } 44 | 45 | virtual const DenseMat& DenseDerived() const 46 | { 47 | throw "Can not derive Dense Matrix from CSR Matrix"; 48 | } 49 | 50 | virtual SparseMat& SparseDerived() 51 | { 52 | throw "Can not derive CSR Matrix from Dense Matrix"; 53 | } 54 | 55 | virtual const SparseMat& SparseDerived() const 56 | { 57 | throw "Can not derive CSR Matrix from Dense Matrix"; 58 | } 59 | 60 | size_t rows, cols, count; 61 | }; 62 | 63 | #endif -------------------------------------------------------------------------------- /include/matrix/mat_typedef.h: -------------------------------------------------------------------------------- 1 | #ifndef MAT_TYPEDEF_H 2 | #define MAT_TYPEDEF_H 3 | 4 | #include 5 | 6 | enum MatMode 7 | { 8 | CPU = 0, 9 | GPU = 1 10 | }; 11 | 12 | enum class Trans 13 | { 14 | N = 0, 15 | T = 1 16 | }; 17 | 18 | enum Phase 19 | { 20 | TRAIN = 0, 21 | TEST = 1 22 | }; 23 | 24 | enum MatType 25 | { 26 | DENSE, 27 | SPARSE 28 | }; 29 | 30 | #define c_uCudaThreadNum 1024 31 | 32 | const double eps = 1e-8; 33 | 34 | #endif -------------------------------------------------------------------------------- /include/matrix/matrix_utils.h: -------------------------------------------------------------------------------- 1 | #ifndef MATRIX_UTILS_H 2 | #define MATRIX_UTILS_H 3 | 4 | #include "mat_typedef.h" 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | template 13 | struct MatUtils 14 | { 15 | template 16 | static void DelArr(T*& p); 17 | 18 | template 19 | static void MallocArr(T*& p, size_t nBytes); 20 | 21 | template 22 | static void ArrSetZeros(T*& p, size_t nBytes); 23 | }; 24 | 25 | template<> 26 | template 27 | void MatUtils::ArrSetZeros(T*& p, size_t nBytes) 28 | { 29 | if (p) 30 | memset(p, 0, nBytes); 31 | } 32 | 33 | template<> 34 | template 35 | void MatUtils::DelArr(T*& p) 36 | { 37 | if (p) 38 | { 39 | delete[] p; 40 | p = nullptr; 41 | } 42 | } 43 | 44 | template<> 45 | template 46 | void MatUtils::MallocArr(T*& p, size_t nBytes) 47 | { 48 | if (nBytes) 49 | p = (T*) malloc(nBytes); 50 | else p = nullptr; 51 | } 52 | 53 | template<> 54 | template 55 | void MatUtils::DelArr(T*& p) 56 | { 57 | if (p) 58 | { 59 | cudaFree(p); 60 | p = nullptr; 61 | } 62 | } 63 | 64 | template<> 65 | template 66 | void MatUtils::MallocArr(T*& p, size_t nBytes) 67 | { 68 | if (nBytes) 69 | { 70 | cudaError_t t = cudaMalloc(&p, nBytes); 71 | assert(t != cudaErrorMemoryAllocation); 72 | } 73 | else p = nullptr; 74 | } 75 | 76 | 77 | inline void GetDims(const size_t& lhs_rows, const size_t& lhs_cols, Trans ltrans, 78 | const size_t& rhs_rows, const size_t& rhs_cols, Trans rtrans, 79 | size_t &m, size_t &n, size_t &k) 80 | { 81 | m = 
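// For C = op(A) * op(B), m and n are the rows and columns of the result and
// k is the shared inner dimension, where op(X) is X or X^T according to the
// Trans flag. E.g. a 3x4 lhs with ltrans == Trans::T contributes m = 4.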
ltrans == Trans::N ? lhs_rows : lhs_cols; 82 | n = rtrans == Trans::N ? rhs_cols : rhs_rows; 83 | k = ltrans == Trans::N ? lhs_cols : lhs_rows; 84 | assert((rtrans == Trans::N && rhs_rows == k) || (rtrans == Trans::T && rhs_cols == k)); 85 | } 86 | 87 | #endif -------------------------------------------------------------------------------- /include/matrix/mkl_helper.h: -------------------------------------------------------------------------------- 1 | #ifndef MKL_HELPER_H 2 | #define MKL_HELPER_H 3 | 4 | #include 5 | 6 | inline float MKLHelper_Dot(const MKL_INT n, const float* x, const float* y) 7 | { 8 | return cblas_sdot(n, x, 1, y, 1); 9 | } 10 | 11 | inline double MKLHelper_Dot(const MKL_INT n, const double* x, const double* y) 12 | { 13 | return cblas_ddot(n, x, 1, y, 1); 14 | } 15 | 16 | inline CBLAS_INDEX MKLHelper_Amax(const MKL_INT n, const float *x) 17 | { 18 | return cblas_isamax(n, x, 1); 19 | } 20 | 21 | inline CBLAS_INDEX MKLHelper_Amax(const MKL_INT n, const double *x) 22 | { 23 | return cblas_idamax(n, x, 1); 24 | } 25 | 26 | inline float MKLHelper_Asum(const MKL_INT n, const float *x) 27 | { 28 | return cblas_sasum(n, x, 1); 29 | } 30 | 31 | inline double MKLHelper_Asum(const MKL_INT n, const double *x) 32 | { 33 | return cblas_dasum(n, x, 1); 34 | } 35 | 36 | inline float MKLHelper_Norm2(const MKL_INT n, const float *x) 37 | { 38 | return cblas_snrm2(n, x, 1); 39 | } 40 | 41 | inline double MKLHelper_Norm2(const MKL_INT n, const double *x) 42 | { 43 | return cblas_dnrm2(n, x, 1); 44 | } 45 | 46 | inline void MKLHelper_Ger(const CBLAS_LAYOUT Layout, const MKL_INT m, const MKL_INT n, 47 | const float alpha, const float *x, const float *y, float *a) 48 | { 49 | const MKL_INT lda = Layout == CblasRowMajor ? n : m; 50 | cblas_sger(Layout, m, n, alpha, x, 1, y, 1, a, lda); 51 | } 52 | 53 | inline void MKLHelper_Ger(const CBLAS_LAYOUT Layout, const MKL_INT m, const MKL_INT n, 54 | const double alpha, const double *x, const double *y, double *a) 55 | { 56 | const MKL_INT lda = Layout == CblasRowMajor ? 
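// The leading dimension is the memory stride between consecutive rows
// (row-major) or columns (column-major) of the m x n matrix A updated by
// this rank-1 operation, hence n for CblasRowMajor and m otherwise.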
n : m; 57 | cblas_dger(Layout, m, n, alpha, x, 1, y, 1, a, lda); 58 | } 59 | 60 | inline void MKLHelper_Axpy(const MKL_INT n, const float a, const float *x, float *y) 61 | { 62 | cblas_saxpy(n, a, x, 1, y, 1); 63 | } 64 | 65 | inline void MKLHelper_Axpy(const MKL_INT n, const double a, const double *x, double *y) 66 | { 67 | cblas_daxpy(n, a, x, 1, y, 1); 68 | } 69 | 70 | inline void MKLHelper_Axpby(const MKL_INT n, const float a, const float *x, const float b, float *y) 71 | { 72 | cblas_saxpby(n, a, x, 1, b, y, 1); 73 | } 74 | 75 | inline void MKLHelper_Axpby(const MKL_INT n, const double a, const double *x, const double b, double *y) 76 | { 77 | cblas_daxpby(n, a, x, 1, b, y, 1); 78 | } 79 | 80 | inline void MKLHelper_Omatadd(char ordering, char transa, char transb, size_t m, size_t n, 81 | const float alpha, const float * A, size_t lda, 82 | const float beta, const float * B, size_t ldb, 83 | float * C, size_t ldc) 84 | { 85 | mkl_somatadd(ordering, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc); 86 | } 87 | 88 | inline void MKLHelper_Omatadd(char ordering, char transa, char transb, size_t m, size_t n, 89 | const double alpha, const double * A, size_t lda, 90 | const double beta, const double * B, size_t ldb, 91 | double * C, size_t ldc) 92 | { 93 | mkl_domatadd(ordering, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc); 94 | } 95 | 96 | inline void MKLHelper_GeMV(const CBLAS_LAYOUT Layout, const CBLAS_TRANSPOSE trans, 97 | const MKL_INT m, const MKL_INT n, 98 | const float alpha, const float *a, const MKL_INT lda, const float *x, const MKL_INT incx, 99 | const float beta, float *y, const MKL_INT incy) 100 | { 101 | cblas_sgemv(Layout, trans, m, n, alpha, a, lda, x, incx, beta, y, incy); 102 | } 103 | 104 | inline void MKLHelper_GeMV(const CBLAS_LAYOUT Layout, const CBLAS_TRANSPOSE trans, 105 | const MKL_INT m, const MKL_INT n, 106 | const double alpha, const double *a, const MKL_INT lda, const double *x, const MKL_INT incx, 107 | const double beta, double *y, const MKL_INT incy) 108 | { 109 | cblas_dgemv(Layout, trans, m, n, alpha, a, lda, x, incx, beta, y, incy); 110 | } 111 | 112 | inline void MKLHelper_GeMM(const CBLAS_LAYOUT Layout, const CBLAS_TRANSPOSE transa, const CBLAS_TRANSPOSE transb, 113 | const MKL_INT m, const MKL_INT n, const MKL_INT k, 114 | const float alpha, const float *a, const MKL_INT lda, const float *b, const MKL_INT ldb, 115 | const float beta, float *c, const MKL_INT ldc) 116 | { 117 | cblas_sgemm(Layout, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); 118 | } 119 | 120 | inline void MKLHelper_GeMM(const CBLAS_LAYOUT Layout, const CBLAS_TRANSPOSE transa, const CBLAS_TRANSPOSE transb, 121 | const MKL_INT m, const MKL_INT n, const MKL_INT k, 122 | const double alpha, const double *a, const MKL_INT lda, const double *b, const MKL_INT ldb, 123 | const double beta, double *c, const MKL_INT ldc) 124 | { 125 | cblas_dgemm(Layout, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); 126 | } 127 | 128 | inline void MKLHelper_CSRMM(char trans, MKL_INT m, MKL_INT n, MKL_INT k, float alpha, 129 | char *matdescra, float *val, MKL_INT *indx, MKL_INT *pntrb, MKL_INT *pntre, 130 | float *b, MKL_INT ldb, float beta, float *c, MKL_INT ldc) 131 | { 132 | mkl_scsrmm(&trans, &m, &n, &k, &alpha, 133 | matdescra, val, indx, pntrb, pntre, 134 | b, &ldb, &beta, c, &ldc); 135 | } 136 | 137 | inline void MKLHelper_CSRMM(char trans, MKL_INT m, MKL_INT n, MKL_INT k, double alpha, 138 | char *matdescra, double *val, MKL_INT *indx, MKL_INT 
*pntrb, MKL_INT *pntre, 139 | double *b, MKL_INT ldb, double beta, double *c, MKL_INT ldc) 140 | { 141 | mkl_dcsrmm(&trans, &m, &n, &k, &alpha, 142 | matdescra, val, indx, pntrb, pntre, 143 | b, &ldb, &beta, c, &ldc); 144 | } 145 | 146 | inline void MKLHelper_Sin(const MKL_INT n, float* a, float* y) 147 | { 148 | vsSin(n, a, y); 149 | } 150 | 151 | inline void MKLHelper_Sin(const MKL_INT n, double* a, double* y) 152 | { 153 | vdSin(n, a, y); 154 | } 155 | 156 | inline void MKLHelper_Cos(const MKL_INT n, float* a, float* y) 157 | { 158 | vsCos(n, a, y); 159 | } 160 | 161 | inline void MKLHelper_Cos(const MKL_INT n, double* a, double* y) 162 | { 163 | vdCos(n, a, y); 164 | } 165 | 166 | inline void MKLHelper_Exp(const MKL_INT n, float* a, float* y) 167 | { 168 | vsExp(n, a, y); 169 | } 170 | 171 | inline void MKLHelper_Exp(const MKL_INT n, double* a, double* y) 172 | { 173 | vdExp(n, a, y); 174 | } 175 | 176 | inline void MKLHelper_Log(const MKL_INT n, float* a, float* y) 177 | { 178 | vsLn(n, a, y); 179 | } 180 | 181 | inline void MKLHelper_Log(const MKL_INT n, double* a, double* y) 182 | { 183 | vdLn(n, a, y); 184 | } 185 | 186 | inline void MKLHelper_Mul(const MKL_INT n, float* a, float* b, float* y) 187 | { 188 | vsMul(n, a, b, y); 189 | } 190 | 191 | inline void MKLHelper_Mul(const MKL_INT n, double* a, double* b, double* y) 192 | { 193 | vdMul(n, a, b, y); 194 | } 195 | 196 | inline void MKLHelper_Div(const MKL_INT n, float* a, float* b, float* y) 197 | { 198 | vsDiv(n, a, b, y); 199 | } 200 | 201 | inline void MKLHelper_Div(const MKL_INT n, double* a, double* b, double* y) 202 | { 203 | vdDiv(n, a, b, y); 204 | } 205 | 206 | inline void MKLHelper_Sqrt(const MKL_INT n, float* a, float* y) 207 | { 208 | vsSqrt(n, a, y); 209 | } 210 | 211 | inline void MKLHelper_Sqrt(const MKL_INT n, double* a, double* y) 212 | { 213 | vdSqrt(n, a, y); 214 | } 215 | 216 | inline void MKLHelper_InvSqrt(const MKL_INT n, float* a, float* y) 217 | { 218 | vsInvSqrt(n, a, y); 219 | } 220 | 221 | inline void MKLHelper_InvSqrt(const MKL_INT n, double* a, double* y) 222 | { 223 | vdInvSqrt(n, a, y); 224 | } 225 | 226 | inline void MKLHelper_Inv(const MKL_INT n, float* a, float* y) 227 | { 228 | vsInv(n, a, y); 229 | } 230 | 231 | inline void MKLHelper_Inv(const MKL_INT n, double* a, double* y) 232 | { 233 | vdInv(n, a, y); 234 | } 235 | 236 | inline void MKLHelper_Square(const MKL_INT n, float* a, float* y) 237 | { 238 | vsSqr(n, a, y); 239 | } 240 | 241 | inline void MKLHelper_Square(const MKL_INT n, double* a, double* y) 242 | { 243 | vdSqr(n, a, y); 244 | } 245 | 246 | inline void MKLHelper_PowerX(const MKL_INT n, float* a, float b, float* y) 247 | { 248 | vsPowx(n, a, b, y); 249 | } 250 | 251 | inline void MKLHelper_PowerX(const MKL_INT n, double* a, double b, double* y) 252 | { 253 | vdPowx(n, a, b, y); 254 | } 255 | 256 | #endif -------------------------------------------------------------------------------- /include/matrix/sp_data.h: -------------------------------------------------------------------------------- 1 | #ifndef SP_DATA_H 2 | #define SP_DATA_H 3 | 4 | #include "matrix_utils.h" 5 | 6 | template 7 | class SpData 8 | { 9 | public: 10 | inline SpData() 11 | { 12 | nnz = len_ptr = nzCap = ptrCap = 0; 13 | val = nullptr; 14 | col_idx = ptr = nullptr; 15 | } 16 | 17 | inline SpData(int newNzCap, int newPtrCap) 18 | { 19 | nnz = len_ptr = 0; 20 | nzCap = newNzCap; 21 | ptrCap = newPtrCap; 22 | MatUtils::MallocArr(val, sizeof(Dtype) * nzCap); 23 | MatUtils::MallocArr(col_idx, sizeof(int) * nzCap); 24 
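// The three buffers follow the CSR convention: val and col_idx hold the
// nonzero values and their column indices (capacity nzCap >= nnz), while ptr
// holds one offset per row plus a terminator (rows + 1 entries in use). For
// example, the 2x3 matrix
//   [ 1 0 2 ]
//   [ 0 0 3 ]
// is stored as val = {1, 2, 3}, col_idx = {0, 2, 2}, ptr = {0, 2, 3}.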
| MatUtils::MallocArr(ptr, sizeof(int) * ptrCap); 25 | } 26 | 27 | void Serialize(FILE* fid) 28 | { 29 | assert(fwrite(&nnz, sizeof(int), 1, fid) == 1); 30 | assert(fwrite(&len_ptr, sizeof(int), 1, fid) == 1); 31 | assert(fwrite(&nzCap, sizeof(int), 1, fid) == 1); 32 | assert(fwrite(&ptrCap, sizeof(int), 1, fid) == 1); 33 | 34 | int *p_col_idx = col_idx, *p_ptr = ptr; 35 | Dtype* p_val = val; 36 | if (mode == GPU) 37 | { 38 | p_val = new Dtype[nzCap]; 39 | p_col_idx = new int[nzCap]; 40 | p_ptr = new int[ptrCap]; 41 | cudaMemcpy(p_val, val, sizeof(Dtype) * nzCap, cudaMemcpyDeviceToHost); 42 | cudaMemcpy(p_col_idx, col_idx, sizeof(int) * nzCap, cudaMemcpyDeviceToHost); 43 | cudaMemcpy(p_ptr, ptr, sizeof(int) * ptrCap, cudaMemcpyDeviceToHost); 44 | } 45 | assert(fwrite(p_val, sizeof(Dtype), nzCap, fid) == nzCap); 46 | assert(fwrite(p_col_idx, sizeof(int), nzCap, fid) == nzCap); 47 | assert(fwrite(p_ptr, sizeof(int), ptrCap, fid) == ptrCap); 48 | if (mode == GPU) 49 | { 50 | delete[] p_val; 51 | delete[] p_col_idx; 52 | delete[] p_ptr; 53 | } 54 | } 55 | 56 | void Deserialize(FILE* fid) 57 | { 58 | assert(fread(&nnz, sizeof(int), 1, fid) == 1); 59 | assert(fread(&len_ptr, sizeof(int), 1, fid) == 1); 60 | assert(fread(&nzCap, sizeof(int), 1, fid) == 1); 61 | assert(fread(&ptrCap, sizeof(int), 1, fid) == 1); 62 | 63 | MatUtils::DelArr(val); 64 | MatUtils::DelArr(col_idx); 65 | MatUtils::DelArr(ptr); 66 | MatUtils::MallocArr(val, sizeof(Dtype) * nzCap); 67 | MatUtils::MallocArr(col_idx, sizeof(int) * nzCap); 68 | MatUtils::MallocArr(ptr, sizeof(int) * ptrCap); 69 | 70 | int *p_col_idx = col_idx, *p_ptr = ptr; 71 | Dtype* p_val = val; 72 | if (mode == GPU) 73 | { 74 | p_val = new Dtype[nzCap]; 75 | p_col_idx = new int[nzCap]; 76 | p_ptr = new int[ptrCap]; 77 | } 78 | assert(fread(p_val, sizeof(Dtype), nzCap, fid) == nzCap); 79 | assert(fread(p_col_idx, sizeof(int), nzCap, fid) == nzCap); 80 | assert(fread(p_ptr, sizeof(int), ptrCap, fid) == ptrCap); 81 | if (mode == GPU) 82 | { 83 | cudaMemcpy(val, p_val, sizeof(Dtype) * nzCap, cudaMemcpyHostToDevice); 84 | cudaMemcpy(col_idx, p_col_idx, sizeof(int) * nzCap, cudaMemcpyHostToDevice); 85 | cudaMemcpy(ptr, p_ptr, sizeof(int) * ptrCap, cudaMemcpyHostToDevice); 86 | delete[] p_val; 87 | delete[] p_col_idx; 88 | delete[] p_ptr; 89 | } 90 | } 91 | 92 | ~SpData() 93 | { 94 | MatUtils::DelArr(val); 95 | MatUtils::DelArr(col_idx); 96 | MatUtils::DelArr(ptr); 97 | } 98 | 99 | Dtype* val; 100 | int* col_idx; 101 | int* ptr; 102 | 103 | int nnz; 104 | int len_ptr; 105 | int nzCap; 106 | int ptrCap; 107 | }; 108 | 109 | #endif -------------------------------------------------------------------------------- /include/matrix/sparse_matrix.h: -------------------------------------------------------------------------------- 1 | #ifndef SPARSE_MATRIX_H 2 | #define SPARSE_MATRIX_H 3 | 4 | #include "imatrix.h" 5 | #include "sp_data.h" 6 | #include 7 | 8 | template 9 | class SparseMat : public IMatrix 10 | { 11 | public: 12 | 13 | }; 14 | 15 | 16 | template 17 | class SparseMat : public IMatrix 18 | { 19 | public: 20 | SparseMat(); 21 | ~SparseMat(); 22 | template 23 | SparseMat(SparseMat& src) 24 | { 25 | CopyFrom(src); 26 | } 27 | SparseMat(size_t _rows, size_t cols); 28 | inline virtual MatType GetMatType() override 29 | { 30 | return SPARSE; 31 | } 32 | inline virtual SparseMat& SparseDerived() override 33 | { 34 | return *this; 35 | } 36 | 37 | inline virtual const SparseMat& SparseDerived() const override 38 | { 39 | return *this; 40 | } 41 | 42 | Dtype Asum(); 43 
| 44 | virtual void Print2Screen() override; 45 | 46 | virtual void Serialize(FILE* fid) override; 47 | virtual void Deserialize(FILE* fid) override; 48 | 49 | void Resize(size_t newRos, size_t newCols); 50 | void ResizeSp(int newNNZ, int newNPtr); 51 | 52 | void CopyFrom(SparseMat& src); 53 | void CopyFrom(SparseMat& src); 54 | 55 | std::shared_ptr< SpData > data; 56 | }; 57 | 58 | template 59 | class SparseMat : public IMatrix 60 | { 61 | public: 62 | SparseMat(); 63 | ~SparseMat(); 64 | template 65 | SparseMat(SparseMat& src) 66 | { 67 | CopyFrom(src); 68 | } 69 | SparseMat(size_t _rows, size_t _cols, unsigned _streamid = 0U); 70 | inline virtual MatType GetMatType() override 71 | { 72 | return SPARSE; 73 | } 74 | inline virtual SparseMat& SparseDerived() override 75 | { 76 | return *this; 77 | } 78 | 79 | inline virtual const SparseMat& SparseDerived() const override 80 | { 81 | return *this; 82 | } 83 | 84 | virtual void Serialize(FILE* fid) override; 85 | virtual void Deserialize(FILE* fid) override; 86 | virtual void Print2Screen() override; 87 | 88 | void Resize(size_t newRos, size_t newCols); 89 | void ResizeSp(int newNNZ, int newNPtr); 90 | 91 | Dtype Asum(); 92 | 93 | void CopyFrom(SparseMat& src); 94 | void CopyFrom(SparseMat& src); 95 | 96 | std::shared_ptr< SpData > data; 97 | unsigned int streamid; 98 | cusparseMatDescr_t descr; 99 | }; 100 | #endif -------------------------------------------------------------------------------- /include/matrix/vector.h: -------------------------------------------------------------------------------- 1 | #ifndef DENSE_VECTOR_H 2 | #define DENSE_VECTOR_H 3 | 4 | #include "imatrix.h" 5 | #include 6 | 7 | template 8 | class Vector 9 | { 10 | public: 11 | }; 12 | 13 | template 14 | class Vector 15 | { 16 | public: 17 | ~Vector(); 18 | Vector(); 19 | Vector(size_t _count); 20 | 21 | virtual void Serialize(FILE* fid) 22 | { 23 | assert(fwrite(&count, sizeof(size_t), 1, fid) == 1); 24 | assert(fwrite(&mem_size, sizeof(size_t), 1, fid) == 1); 25 | assert(fwrite(data, sizeof(Dtype), mem_size, fid) == mem_size); 26 | } 27 | 28 | virtual void Deserialize(FILE* fid) 29 | { 30 | assert(fread(&count, sizeof(size_t), 1, fid) == 1); 31 | assert(fread(&mem_size, sizeof(size_t), 1, fid) == 1); 32 | MatUtils::DelArr(data); 33 | MatUtils::MallocArr(data, sizeof(Dtype) * mem_size); 34 | assert(fread(data, sizeof(Dtype), mem_size, fid) == mem_size); 35 | } 36 | 37 | void Resize(size_t _count); 38 | void Fill(Dtype scalar); 39 | 40 | Dtype* data; 41 | size_t count, mem_size; 42 | }; 43 | 44 | template 45 | class Vector 46 | { 47 | public: 48 | ~Vector(); 49 | Vector(); 50 | Vector(size_t _count, unsigned int _streamid = 0U); 51 | 52 | virtual void Serialize(FILE* fid) 53 | { 54 | assert(fwrite(&count, sizeof(size_t), 1, fid) == 1); 55 | assert(fwrite(&mem_size, sizeof(size_t), 1, fid) == 1); 56 | Dtype* buf = new Dtype[mem_size]; 57 | cudaMemcpy(buf, data, sizeof(Dtype) * mem_size, cudaMemcpyDeviceToHost); 58 | assert(fwrite(buf, sizeof(Dtype), mem_size, fid) == mem_size); 59 | delete[] buf; 60 | } 61 | 62 | virtual void Deserialize(FILE* fid) 63 | { 64 | assert(fread(&count, sizeof(size_t), 1, fid) == 1); 65 | assert(fread(&mem_size, sizeof(size_t), 1, fid) == 1); 66 | MatUtils::DelArr(data); 67 | MatUtils::MallocArr(data, sizeof(Dtype) * mem_size); 68 | Dtype* buf = new Dtype[mem_size]; 69 | assert(fread(buf, sizeof(Dtype), mem_size, fid) == mem_size); 70 | cudaMemcpy(data, buf, sizeof(Dtype) * mem_size, cudaMemcpyHostToDevice); 71 | delete[] buf; 72 | } 73 | 74 
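// GPU (de)serialization idiom used throughout these headers: stage the bytes
// through a host buffer, then cross the device boundary with a blocking
// cudaMemcpy. A minimal standalone sketch of the same pattern (error handling
// omitted, as in the original):
//     Dtype* buf = new Dtype[mem_size];
//     assert(fread(buf, sizeof(Dtype), mem_size, fid) == mem_size);
//     cudaMemcpy(data, buf, sizeof(Dtype) * mem_size, cudaMemcpyHostToDevice);
//     delete[] buf;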
| void Resize(size_t _count); 75 | void Fill(Dtype scalar); 76 | 77 | void CopyFrom(Vector& src); 78 | 79 | Dtype* data; 80 | size_t count, mem_size; 81 | unsigned streamid; 82 | }; 83 | 84 | #endif -------------------------------------------------------------------------------- /include/net/abs_criterion_layer.h: -------------------------------------------------------------------------------- 1 | #ifndef ABS_CRITERION_LAYER_H 2 | #define ABS_CRITERION_LAYER_H 3 | 4 | #include "i_criterion_layer.h" 5 | 6 | template 7 | class ABSCriterionLayer : public ICriterionLayer 8 | { 9 | public: 10 | ABSCriterionLayer(std::string _name, PropErr _properr = PropErr::T) 11 | : ABSCriterionLayer(_name, 1.0, _properr) {} 12 | 13 | ABSCriterionLayer(std::string _name, Dtype _lambda, PropErr _properr = PropErr::T) 14 | : ICriterionLayer(_name, _lambda, _properr) 15 | { 16 | this->grad = new DenseMat(); 17 | } 18 | 19 | virtual void UpdateOutput(std::vector< ILayer* >& operands, Phase phase) override 20 | { 21 | assert(operands.size() == 2); 22 | 23 | auto& node_diff = this->grad->DenseDerived(); 24 | node_diff.GeaM(1.0, Trans::N, operands[0]->state->DenseDerived(), -1.0, Trans::N, operands[1]->state->DenseDerived()); 25 | this->loss = node_diff.Asum(); 26 | } 27 | 28 | virtual void BackPropErr(std::vector< ILayer* >& operands, unsigned cur_idx, Dtype beta) override 29 | { 30 | throw std::runtime_error("not impltemented"); 31 | } 32 | }; 33 | 34 | #endif -------------------------------------------------------------------------------- /include/net/avg_rank_criterion_layer.h: -------------------------------------------------------------------------------- 1 | #ifndef AVG_RANK_CRITERION_LAYER_H 2 | #define AVG_RANK_CRITERION_LAYER_H 3 | 4 | #include "i_criterion_layer.h" 5 | 6 | template 7 | class AvgRankCriterionLayer : public ICriterionLayer 8 | { 9 | public: 10 | AvgRankCriterionLayer(std::string _name, RankOrder _order) 11 | : ICriterionLayer(_name, PropErr::N), order(_order) {} 12 | 13 | static std::string str_type() 14 | { 15 | return "AverageRank"; 16 | } 17 | 18 | virtual void UpdateOutput(std::vector< ILayer* >& operands, Phase phase) override 19 | { 20 | auto& pred = operands[0]->state->DenseDerived(); 21 | auto& labels = operands[1]->state->SparseDerived(); 22 | 23 | this->loss = LossFunc::GetAverageRank(pred, labels, order); 24 | } 25 | 26 | virtual void BackPropErr(std::vector< ILayer* >& operands, unsigned cur_idx, Dtype beta) override 27 | { 28 | throw std::runtime_error("no grad in this layer"); 29 | } 30 | 31 | RankOrder order; 32 | }; 33 | 34 | #endif -------------------------------------------------------------------------------- /include/net/c_add_layer.h: -------------------------------------------------------------------------------- 1 | #ifndef C_ADD_LAYER_H 2 | #define C_ADD_LAYER_H 3 | 4 | #include "i_layer.h" 5 | 6 | template 7 | class CAddLayer : public ILayer 8 | { 9 | public: 10 | CAddLayer(std::string _name, PropErr _properr = PropErr::T) 11 | : ILayer(_name, _properr) 12 | { 13 | this->state = new DenseMat(); 14 | this->grad = new DenseMat(); 15 | } 16 | 17 | static std::string str_type() 18 | { 19 | return "CAdd"; 20 | } 21 | 22 | virtual void UpdateOutput(std::vector< ILayer* >& operands, Phase phase) override 23 | { 24 | assert(operands.size()); 25 | auto& cur_output = this->state->DenseDerived(); 26 | 27 | for (size_t i = 0; i < operands.size(); ++i) 28 | { 29 | auto& prev_state = operands[i]->state->DenseDerived(); 30 | 31 | if (i == 0) 32 | cur_output.CopyFrom(prev_state); 
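// CAdd forward sketch: given operands x1 .. xk, this loop computes
//     state = x1 + x2 + ... + xk
// with one CopyFrom followed by k - 1 calls to Axpy (y := a * x + y, a = 1).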
33 | else 34 | cur_output.Axpy(1.0, prev_state); 35 | } 36 | } 37 | 38 | virtual void BackPropErr(std::vector< ILayer* >& operands, unsigned cur_idx, Dtype beta) override 39 | { 40 | assert(cur_idx >= 0 && cur_idx < operands.size()); 41 | auto& cur_grad = this->grad->DenseDerived(); 42 | auto& prev_grad = operands[cur_idx]->grad->DenseDerived(); 43 | 44 | if (beta == 0) 45 | prev_grad.CopyFrom(cur_grad); 46 | else 47 | prev_grad.Axpby(1.0, cur_grad, beta); 48 | } 49 | }; 50 | 51 | #endif -------------------------------------------------------------------------------- /include/net/c_mul_layer.h: -------------------------------------------------------------------------------- 1 | #ifndef C_MUL_LAYER_H 2 | #define C_MUL_LAYER_H 3 | 4 | #include "i_layer.h" 5 | 6 | template 7 | class CMulLayer : public ILayer 8 | { 9 | public: 10 | CMulLayer(std::string _name, PropErr _properr = PropErr::T) 11 | : ILayer(_name, _properr) 12 | { 13 | this->state = new DenseMat(); 14 | this->grad = new DenseMat(); 15 | } 16 | 17 | static std::string str_type() 18 | { 19 | return "CMul"; 20 | } 21 | 22 | virtual void UpdateOutput(std::vector< ILayer* >& operands, Phase phase) override 23 | { 24 | assert(operands.size() > 1); 25 | 26 | auto& cur_output = this->state->DenseDerived(); 27 | cur_output.EleWiseMul(operands[0]->state->DenseDerived(), operands[1]->state->DenseDerived()); 28 | 29 | for (size_t i = 2; i < operands.size(); ++i) 30 | cur_output.EleWiseMul(operands[i]->state->DenseDerived()); 31 | } 32 | 33 | virtual void BackPropErr(std::vector< ILayer* >& operands, unsigned cur_idx, Dtype beta) override 34 | { 35 | assert(operands.size() > 1); 36 | 37 | auto& cur_grad = this->grad->DenseDerived(); 38 | 39 | auto& prev_grad = beta == 0 ? operands[cur_idx]->grad->DenseDerived() : buf; 40 | 41 | prev_grad.CopyFrom(cur_grad); 42 | 43 | for (size_t i = 0; i < operands.size(); ++i) 44 | if (i != cur_idx) 45 | prev_grad.EleWiseMul(operands[i]->state->DenseDerived()); 46 | 47 | if (beta != 0) 48 | operands[cur_idx]->grad->DenseDerived().Axpby(1.0, prev_grad, beta); 49 | } 50 | 51 | DenseMat buf; 52 | }; 53 | 54 | #endif -------------------------------------------------------------------------------- /include/net/classnll_criterion_layer.h: -------------------------------------------------------------------------------- 1 | #ifndef CLASSNLL_CRITERION_LAYER_H 2 | #define CLASSNLL_CRITERION_LAYER_H 3 | 4 | #include "i_criterion_layer.h" 5 | #include "dense_matrix.h" 6 | #include "sparse_matrix.h" 7 | #include "loss_func.h" 8 | 9 | template 10 | class ClassNLLCriterionLayer : public ICriterionLayer 11 | { 12 | public: 13 | ClassNLLCriterionLayer(std::string _name, bool _need_softmax, PropErr _properr = PropErr::T) 14 | : ClassNLLCriterionLayer(_name, _need_softmax, 1.0, _properr) {} 15 | 16 | ClassNLLCriterionLayer(std::string _name, bool _need_softmax, Dtype _lambda, PropErr _properr = PropErr::T) 17 | : ICriterionLayer(_name, _lambda, _properr), need_softmax(_need_softmax) 18 | { 19 | this->grad = new DenseMat(); 20 | } 21 | 22 | static std::string str_type() 23 | { 24 | return "ClassNLL"; 25 | } 26 | 27 | virtual void UpdateOutput(std::vector< ILayer* >& operands, Phase phase) override 28 | { 29 | auto& top = this->grad->DenseDerived(); 30 | top.CopyFrom(operands[0]->state->DenseDerived()); 31 | if (need_softmax) 32 | top.Softmax(); 33 | auto& labels = operands[1]->state->SparseDerived(); 34 | 35 | this->loss = LossFunc::GetLogLoss(top, labels); 36 | 37 | if (need_softmax) 38 | { 39 | top.Axpy(-1.0, labels); // calc 
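// ClassNLL gradient, softmax branch: with p = softmax(z) and one-hot labels y,
//     dL/dz = (p - y) / batch_size,
// which is exactly the Axpy(-1.0, labels) plus Scale(1.0 / rows) here. In the
// no-softmax branch, differentiating L = -sum(y .* log(p)) directly gives
//     dL/dp = -(y ./ p) / batch_size,
// matching the Inv() + EleWiseMul(labels) + Scale(-1.0 / rows) sequence in
// the else branch.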
grad 40 | top.Scale(1.0 / top.rows); // normalize by batch size 41 | } else 42 | { 43 | top.Inv(); 44 | top.EleWiseMul(labels); 45 | top.Scale(-1.0 / top.rows); // normalize by batch size 46 | } 47 | } 48 | 49 | virtual void BackPropErr(std::vector< ILayer* >& operands, unsigned cur_idx, Dtype beta) override 50 | { 51 | assert(operands.size() == 2 && cur_idx == 0); 52 | 53 | auto& prev_grad = operands[0]->grad->DenseDerived(); 54 | auto& cur_grad = this->grad->DenseDerived(); 55 | if (beta == 0) 56 | prev_grad.CopyFrom(cur_grad); 57 | else 58 | prev_grad.Axpby(1.0, cur_grad, beta); 59 | } 60 | 61 | protected: 62 | const bool need_softmax; 63 | }; 64 | 65 | #endif -------------------------------------------------------------------------------- /include/net/col_slice_layer.h: -------------------------------------------------------------------------------- 1 | #ifndef COL_SLICE_LAYER_H 2 | #define COL_SLICE_LAYER_H 3 | 4 | #include "i_layer.h" 5 | 6 | template 7 | class ColSliceLayer : public ILayer 8 | { 9 | public: 10 | ColSliceLayer(std::string _name, PropErr _properr = PropErr::T) 11 | : ILayer(_name, _properr) 12 | { 13 | this->state = new DenseMat(); 14 | this->grad = new DenseMat(); 15 | } 16 | 17 | static std::string str_type() 18 | { 19 | return "ColSlice"; 20 | } 21 | 22 | virtual void UpdateOutput(std::vector< ILayer* >& operands, Phase phase) override 23 | { 24 | assert(operands.size() >= 2); 25 | auto& cur_output = this->state->DenseDerived(); 26 | 27 | if ((int)operands.size() == 2) 28 | { 29 | int col_idx = GetColIdx(operands[1]); 30 | 31 | auto& prev_output = operands[0]->state->DenseDerived(); 32 | cur_output.GetColsFrom(prev_output, col_idx, 1); 33 | } else 34 | { 35 | throw std::runtime_error("only support single column selection"); 36 | } 37 | } 38 | 39 | virtual void BackPropErr(std::vector< ILayer* >& operands, unsigned cur_idx, Dtype beta) override 40 | { 41 | assert(cur_idx == 0); 42 | auto& cur_grad = this->grad->DenseDerived(); 43 | auto& prev_grad = operands[cur_idx]->grad->DenseDerived(); 44 | 45 | if ((int)operands.size() == 2) 46 | { 47 | int col_idx = GetColIdx(operands[1]); 48 | prev_grad.SubmatAdd(0, col_idx, cur_grad, beta); 49 | } else 50 | { 51 | throw std::runtime_error("only support single column selection"); 52 | } 53 | } 54 | 55 | protected: 56 | 57 | inline int GetColIdx(ILayer* op) 58 | { 59 | auto& col_selected = op->state->SparseDerived(); 60 | assert(col_selected.data->nnz == 1); 61 | return col_selected.data->col_idx[0]; 62 | } 63 | }; 64 | 65 | #endif -------------------------------------------------------------------------------- /include/net/concat_layer.h: -------------------------------------------------------------------------------- 1 | #ifndef CONCAT_LAYER_H 2 | #define CONCAT_LAYER_H 3 | 4 | #include "i_layer.h" 5 | 6 | template 7 | class ConcatLayer : public ILayer 8 | { 9 | public: 10 | ConcatLayer(std::string _name, PropErr _properr = PropErr::T); 11 | 12 | static std::string str_type() 13 | { 14 | return "Concat"; 15 | } 16 | 17 | virtual void UpdateOutput(std::vector< ILayer* >& operands, Phase phase) override; 18 | 19 | virtual void BackPropErr(std::vector< ILayer* >& operands, unsigned cur_idx, Dtype beta) override; 20 | 21 | protected: 22 | 23 | DenseMat buf; 24 | }; 25 | 26 | 27 | #endif -------------------------------------------------------------------------------- /include/net/const_scalar_param.h: -------------------------------------------------------------------------------- 1 | #ifndef CONST_SCALAR_PARAM 2 | #define 
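// Gradient accumulation convention shared by every BackPropErr in this
// codebase: the beta argument selects between overwriting and accumulating,
//     beta == 0  ->  prev_grad = cur_grad                    (CopyFrom)
//     beta != 0  ->  prev_grad = cur_grad + beta * prev_grad (Axpby)
// so a layer feeding several consumers receives the sum of their gradients.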
CONST_SCALAR_PARAM 3 | 4 | #include "i_param.h" 5 | 6 | template 7 | class ConstScalarParam : public IConstParam 8 | { 9 | public: 10 | ConstScalarParam(std::string _name, Dtype _a, Dtype _b) 11 | : IConstParam(_name), a(_a), b(_b) {} 12 | 13 | virtual void InitConst(void* side_info) override {} 14 | 15 | virtual void ResetOutput(const IMatrix* input, DenseMat* output) override 16 | { 17 | output->Resize(input->rows, input->cols); 18 | } 19 | 20 | virtual void UpdateOutput(IMatrix* input, DenseMat* output, Dtype beta, Phase phase) override 21 | { 22 | output->Axpby(a, input->DenseDerived(), beta); 23 | output->Add(b); 24 | } 25 | 26 | virtual void UpdateGradInput(DenseMat* gradInput, DenseMat* gradOutput, Dtype beta) override 27 | { 28 | gradInput->Axpby(a, *gradOutput, beta); 29 | } 30 | 31 | const Dtype a, b; 32 | }; 33 | 34 | #endif 35 | -------------------------------------------------------------------------------- /include/net/err_cnt_criterion_layer.h: -------------------------------------------------------------------------------- 1 | #ifndef ERR_CNT_CRITERION_LAYER_H 2 | #define ERR_CNT_CRITERION_LAYER_H 3 | 4 | #include "i_criterion_layer.h" 5 | #include "dense_matrix.h" 6 | #include "sparse_matrix.h" 7 | #include "loss_func.h" 8 | 9 | template 10 | class ErrCntCriterionLayer : public ICriterionLayer 11 | { 12 | public: 13 | ErrCntCriterionLayer(std::string _name) 14 | : ICriterionLayer(_name, PropErr::N) 15 | { 16 | 17 | } 18 | 19 | static std::string str_type() 20 | { 21 | return "ErrCnt"; 22 | } 23 | 24 | virtual void UpdateOutput(std::vector< ILayer* >& operands, Phase phase) override 25 | { 26 | auto& pred = operands[0]->state->DenseDerived(); 27 | auto& labels = operands[1]->state->SparseDerived(); 28 | this->loss = LossFunc::GetErrCnt(pred, labels); 29 | } 30 | 31 | virtual void BackPropErr(std::vector< ILayer* >& operands, unsigned cur_idx, Dtype beta) override 32 | { 33 | throw std::runtime_error("no grad in this layer"); 34 | } 35 | 36 | protected: 37 | }; 38 | 39 | #endif -------------------------------------------------------------------------------- /include/net/exp_layer.h: -------------------------------------------------------------------------------- 1 | #ifndef EXP_LAYER_H 2 | #define EXP_LAYER_H 3 | 4 | #include "i_act_layer.h" 5 | 6 | template 7 | class ExpLayer : public IActLayer 8 | { 9 | public: 10 | 11 | ExpLayer(std::string _name, WriteType _wt = WriteType::OUTPLACE, PropErr _properr = PropErr::T) 12 | : IActLayer(_name, _wt, _properr) {} 13 | 14 | static std::string str_type() 15 | { 16 | return "Exp"; 17 | } 18 | 19 | virtual void Act(DenseMat& prev_out, DenseMat& cur_out) override 20 | { 21 | cur_out.Exp(prev_out); 22 | } 23 | 24 | virtual void Derivative(DenseMat& dst, DenseMat& prev_output, 25 | DenseMat& cur_output, DenseMat& cur_grad, Dtype beta) override 26 | { 27 | assert(beta == 0); 28 | 29 | dst.CopyFrom(cur_grad); 30 | dst.EleWiseMul(cur_output); 31 | } 32 | }; 33 | 34 | #endif -------------------------------------------------------------------------------- /include/net/gaussian_ll_layer.h: -------------------------------------------------------------------------------- 1 | #ifndef GAUSSIAN_LL_LAYER_H 2 | #define GAUSSIAN_LL_LAYER_H 3 | 4 | #include "i_layer.h" 5 | 6 | template 7 | class GaussianLLLayer : public ILayer 8 | { 9 | public: 10 | GaussianLLLayer(std::string _name, PropErr _properr = PropErr::T); 11 | 12 | static std::string str_type() 13 | { 14 | return "GaussianLL"; 15 | } 16 | 17 | virtual void UpdateOutput(std::vector< ILayer* 
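// ConstScalarParam applies the fixed affine map output = a * input + b, whose
// Jacobian is just a; hence UpdateGradInput only scales the incoming gradient
// by a. Similarly, ExpLayer above exploits d/dx exp(x) = exp(x) by reusing
// cur_output in Derivative instead of recomputing the exponential.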
>& operands, Phase phase) override; 18 | virtual void BackPropErr(std::vector< ILayer* >& operands, unsigned cur_idx, Dtype beta) override; 19 | 20 | protected: 21 | 22 | DenseMat buffer, diff; 23 | }; 24 | 25 | #endif -------------------------------------------------------------------------------- /include/net/general_loss_criterion_layer.h: -------------------------------------------------------------------------------- 1 | #ifndef GENERAL_LOSS_CRITERION_LAYER_H 2 | #define GENERAL_LOSS_CRITERION_LAYER_H 3 | 4 | #include "i_criterion_layer.h" 5 | 6 | template 7 | class GeneralLossCriterionLayer : public ICriterionLayer 8 | { 9 | public: 10 | GeneralLossCriterionLayer(std::string _name, PropErr _properr = PropErr::T) 11 | : GeneralLossCriterionLayer(_name, 1.0, _properr) {} 12 | 13 | GeneralLossCriterionLayer(std::string _name, Dtype _lambda, PropErr _properr = PropErr::T) 14 | : ICriterionLayer(_name, _lambda, _properr) 15 | { 16 | 17 | } 18 | 19 | static std::string str_type() 20 | { 21 | return "GeneralLossCriterion"; 22 | } 23 | 24 | virtual void UpdateOutput(std::vector< ILayer* >& operands, Phase phase) override 25 | { 26 | assert(operands.size() == 1); 27 | this->loss = operands[0]->state->DenseDerived().Sum() * this->lambda; 28 | } 29 | 30 | virtual void BackPropErr(std::vector< ILayer* >& operands, unsigned cur_idx, Dtype beta) override 31 | { 32 | assert(operands.size() == 1 && cur_idx == 0); 33 | auto& prev_grad = operands[0]->grad->DenseDerived(); 34 | 35 | if (beta == 0) 36 | prev_grad.Fill(this->lambda / operands[0]->state->rows); 37 | else{ 38 | prev_grad.Scale(beta); 39 | prev_grad.Add(this->lambda / operands[0]->state->rows); 40 | } 41 | } 42 | }; 43 | 44 | #endif -------------------------------------------------------------------------------- /include/net/global_sum_layer.h: -------------------------------------------------------------------------------- 1 | #ifndef GLOBAL_SUM_LAYER_H 2 | #define GLOBAL_SUM_LAYER_H 3 | 4 | #include "i_layer.h" 5 | 6 | template 7 | class GlobalSumLayer : public ILayer 8 | { 9 | public: 10 | GlobalSumLayer(std::string _name, PropErr _properr = PropErr::T); 11 | 12 | static std::string str_type() 13 | { 14 | return "GlobalPool"; 15 | } 16 | 17 | virtual void UpdateOutput(std::vector< ILayer* >& operands, Phase phase) override; 18 | virtual void BackPropErr(std::vector< ILayer* >& operands, unsigned cur_idx, Dtype beta) override; 19 | 20 | protected: 21 | 22 | DenseMat buf; 23 | }; 24 | 25 | #endif -------------------------------------------------------------------------------- /include/net/graph_pool_param.h: -------------------------------------------------------------------------------- 1 | #ifndef GRAPH_POOL_PARAM_H 2 | #define GRAPH_POOL_PARAM_H 3 | 4 | #include "msg_pass_param.h" 5 | #include "graph_struct.h" 6 | #include "sparse_matrix.h" 7 | 8 | template 9 | class NodeAvgPoolParam : public IMessagePassParam 10 | { 11 | public: 12 | NodeAvgPoolParam(std::string _name) 13 | : IMessagePassParam(_name) {} 14 | 15 | protected: 16 | virtual void InitCPUWeight(GraphStruct* graph) override; 17 | }; 18 | 19 | template 20 | class NodeMaxPoolParam; 21 | 22 | template 23 | class NodeMaxPoolParam : public IConstParam 24 | { 25 | public: 26 | NodeMaxPoolParam(std::string _name) 27 | : IConstParam(_name) { max_index.clear(); } 28 | 29 | virtual void InitConst(void* side_info) override 30 | { 31 | graph = static_cast(side_info); 32 | } 33 | 34 | virtual void ResetOutput(const IMatrix* input, DenseMat* output) override 35 | { 36 | 
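// Pooling sketch (inferred from the names; the actual weights are built in
// InitCPUWeight and the corresponding .cu sources): NodeAvgPoolParam acts
// like a sparse row-averaging matrix applied with a single sparse matmul,
// while NodeMaxPoolParam records in max_index which input row attained each
// output maximum so that the backward pass can route gradients to the argmax
// rows only.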
output->Zeros(graph->num_nodes, input->cols); 37 | } 38 | 39 | virtual void UpdateOutput(IMatrix* input, DenseMat* output, Dtype beta, Phase phase) override; 40 | 41 | virtual void UpdateGradInput(DenseMat* gradInput, DenseMat* gradOutput, Dtype beta) override; 42 | protected: 43 | GraphStruct* graph; 44 | std::vector max_index; 45 | }; 46 | 47 | template 48 | class NodeMaxPoolParam : public IConstParam 49 | { 50 | public: 51 | 52 | }; 53 | 54 | #endif -------------------------------------------------------------------------------- /include/net/graph_struct.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef GRAPH_STRUCT_H 3 | #define GRAPH_STRUCT_H 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | template 11 | class LinkedTable 12 | { 13 | public: 14 | LinkedTable() 15 | { 16 | n = ncap = 0; 17 | head.clear(); 18 | } 19 | 20 | inline void AddEntry(int head_id, T content) 21 | { 22 | if (head_id >= n) 23 | { 24 | if (head_id + 1 > ncap) 25 | { 26 | ncap = std::max(ncap * 2, head_id + 1); 27 | head.resize(ncap); 28 | for (int i = n; i < head_id + 1; ++i) 29 | head[i].clear(); 30 | } 31 | n = head_id + 1; 32 | } 33 | 34 | head[head_id].push_back(content); 35 | } 36 | 37 | inline void Resize(int new_n) 38 | { 39 | if (new_n > ncap) 40 | { 41 | ncap = std::max(ncap * 2, new_n); 42 | head.resize(ncap); 43 | } 44 | n = new_n; 45 | for (size_t i = 0; i < head.size(); ++i) 46 | head[i].clear(); 47 | } 48 | 49 | int n; 50 | std::vector< std::vector > head; 51 | private: 52 | int ncap; 53 | }; 54 | 55 | class GraphStruct 56 | { 57 | public: 58 | GraphStruct() 59 | { 60 | out_edges = new LinkedTable< std::pair >(); 61 | in_edges = new LinkedTable< std::pair >(); 62 | subgraph = new LinkedTable< int >(); 63 | edge_list.clear(); 64 | } 65 | 66 | ~GraphStruct() 67 | { 68 | delete out_edges; 69 | delete in_edges; 70 | delete subgraph; 71 | } 72 | 73 | inline void AddEdge(int idx, int x, int y) 74 | { 75 | out_edges->AddEntry(x, std::pair(idx, y)); 76 | in_edges->AddEntry(y, std::pair(idx, x)); 77 | num_edges++; 78 | edge_list.push_back(std::make_pair(x, y)); 79 | assert(num_edges == edge_list.size()); 80 | assert(num_edges - 1 == (unsigned)idx); 81 | } 82 | 83 | inline void AddNode(int subg_id, int n_idx) 84 | { 85 | subgraph->AddEntry(subg_id, n_idx); 86 | } 87 | 88 | inline void Resize(unsigned _num_subgraph, unsigned _num_nodes = 0) 89 | { 90 | num_nodes = _num_nodes; 91 | num_edges = 0; 92 | edge_list.clear(); 93 | num_subgraph = _num_subgraph; 94 | 95 | in_edges->Resize(num_nodes); 96 | out_edges->Resize(num_nodes); 97 | subgraph->Resize(num_subgraph); 98 | } 99 | 100 | LinkedTable< std::pair > *out_edges, *in_edges; 101 | LinkedTable< int >* subgraph; 102 | std::vector< std::pair > edge_list; 103 | 104 | unsigned num_nodes, num_edges, num_subgraph; 105 | }; 106 | 107 | #endif -------------------------------------------------------------------------------- /include/net/i_act_layer.h: -------------------------------------------------------------------------------- 1 | #ifndef I_ACT_LAYER_H 2 | #define I_ACT_LAYER_H 3 | 4 | #include "i_layer.h" 5 | 6 | enum class WriteType 7 | { 8 | INPLACE = 0, 9 | OUTPLACE = 1 10 | }; 11 | 12 | template 13 | class IActLayer : public ILayer 14 | { 15 | public: 16 | 17 | IActLayer(std::string _name, WriteType _wt, PropErr _properr = PropErr::T) 18 | : ILayer(_name, _properr), wt(_wt) 19 | { 20 | this->state = new DenseMat(); 21 | this->grad = new DenseMat(); 22 | } 23 | 24 | virtual void 
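// GraphStruct usage sketch (hypothetical): one subgraph holding the 3-node
// chain 0 - 1 - 2, stored as two directed edges per undirected edge:
//     GraphStruct g;
//     g.Resize(1, 3);                           // 1 subgraph, 3 nodes
//     g.AddNode(0, 0); g.AddNode(0, 1); g.AddNode(0, 2);
//     g.AddEdge(0, 0, 1); g.AddEdge(1, 1, 0);
//     g.AddEdge(2, 1, 2); g.AddEdge(3, 2, 1);
// AddEdge asserts that edge indices arrive consecutively from 0, so the idx
// argument must equal the current edge count.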
UpdateOutput(std::vector< ILayer* >& operands, Phase phase) override 25 | { 26 | assert(operands.size() == 1); 27 | auto* prev_state = operands[0]->state; 28 | 29 | if (wt == WriteType::INPLACE) 30 | this->state = prev_state; 31 | else 32 | this->state->DenseDerived().Resize(prev_state->rows, prev_state->cols); 33 | 34 | Act(prev_state->DenseDerived(), this->state->DenseDerived()); 35 | } 36 | 37 | virtual void BackPropErr(std::vector< ILayer* >& operands, unsigned cur_idx, Dtype beta) override 38 | { 39 | assert(operands.size() == 1 && cur_idx == 0); 40 | 41 | auto& prev_grad = operands[0]->grad->DenseDerived(); 42 | auto& cur_grad = this->grad->DenseDerived(); 43 | auto& prev_output = operands[0]->state->DenseDerived(); 44 | auto& cur_output = this->state->DenseDerived(); 45 | 46 | Derivative(prev_grad, prev_output, cur_output, cur_grad, beta); 47 | } 48 | 49 | virtual void Act(DenseMat& prev_out, DenseMat& cur_out) = 0; 50 | virtual void Derivative(DenseMat& dst, DenseMat& prev_output, 51 | DenseMat& cur_output, DenseMat& cur_grad, Dtype beta) = 0; 52 | 53 | WriteType wt; 54 | }; 55 | 56 | #endif -------------------------------------------------------------------------------- /include/net/i_criterion_layer.h: -------------------------------------------------------------------------------- 1 | #ifndef I_CRITERION_LAYER_H 2 | #define I_CRITERION_LAYER_H 3 | 4 | #include "i_layer.h" 5 | 6 | template 7 | class ICriterionLayer : public ILayer 8 | { 9 | public: 10 | ICriterionLayer(std::string _name, PropErr _properr = PropErr::T) 11 | : ICriterionLayer(_name, 1.0, _properr) {} 12 | 13 | ICriterionLayer(std::string _name, Dtype _lambda, PropErr _properr = PropErr::T) 14 | : ILayer(_name, _properr), lambda(_lambda), loss(0.0) {} 15 | 16 | Dtype GetLoss() 17 | { 18 | return loss; 19 | } 20 | 21 | virtual bool IsSupervised() override 22 | { 23 | return true; 24 | } 25 | 26 | Dtype lambda; 27 | 28 | protected: 29 | Dtype loss; 30 | }; 31 | 32 | #endif -------------------------------------------------------------------------------- /include/net/i_layer.h: -------------------------------------------------------------------------------- 1 | #ifndef ILAYER_H 2 | #define ILAYER_H 3 | 4 | #include "imatrix.h" 5 | #include 6 | 7 | enum class PropErr 8 | { 9 | N = 0, 10 | T = 1 11 | }; 12 | 13 | template 14 | class NNGraph; 15 | 16 | template 17 | class ILayer 18 | { 19 | public: 20 | ILayer(std::string _name, PropErr _properr = PropErr::T) 21 | : name(_name), properr(_properr) 22 | { 23 | this->state = this->grad = nullptr; 24 | } 25 | 26 | virtual void UpdateOutput(std::vector< ILayer* >& operands, Phase phase) = 0; 27 | 28 | virtual void BackPropErr(std::vector< ILayer* >& operands, unsigned cur_idx, Dtype beta) = 0; 29 | 30 | virtual bool HasParam() 31 | { 32 | return false; 33 | } 34 | 35 | virtual bool IsSupervised() 36 | { 37 | return false; 38 | } 39 | 40 | std::string name; 41 | PropErr properr; 42 | IMatrix* state, *grad; 43 | }; 44 | 45 | template 46 | class IParametric 47 | { 48 | public: 49 | virtual void AccDeriv(std::vector< ILayer* >& operands, unsigned cur_idx) = 0; 50 | }; 51 | 52 | #endif -------------------------------------------------------------------------------- /include/net/i_param.h: -------------------------------------------------------------------------------- 1 | #ifndef IPARAM_H 2 | #define IPARAM_H 3 | 4 | #include "dense_matrix.h" 5 | #include 6 | #include 7 | 8 | enum class BiasOption 9 | { 10 | NONE, 11 | BIAS 12 | }; 13 | 14 | template 15 | struct PP 16 | { 17 | 
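// Custom activation sketch (hypothetical layer, not part of this repo): a
// square nonlinearity only needs Act and Derivative, e.g. for the beta == 0
// case:
//     // Act: cur_out = prev_out .* prev_out
//     cur_out.CopyFrom(prev_out); cur_out.EleWiseMul(prev_out);
//     // Derivative: dst = 2 * prev_output .* cur_grad
//     dst.CopyFrom(cur_grad); dst.EleWiseMul(prev_output); dst.Scale(2.0);
// A complete layer must also honor beta != 0, as Exp/Log do via Axpby.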
PP(){} 18 | 19 | PP(size_t rows, size_t cols) 20 | { 21 | value.Resize(rows, cols); 22 | grad.Resize(rows, cols); 23 | } 24 | 25 | DenseMat value, grad; 26 | }; 27 | 28 | template 29 | class IParam 30 | { 31 | public: 32 | IParam() {} 33 | IParam(std::string _name) 34 | : name(_name) 35 | { 36 | } 37 | 38 | virtual void Serialize(FILE* fid) = 0; 39 | 40 | virtual void Deserialize(FILE* fid) = 0; 41 | 42 | virtual bool IsDiff() = 0; 43 | 44 | virtual void UpdateOutput(IMatrix* input, DenseMat* output, Dtype beta, Phase phase) = 0; 45 | virtual void UpdateGradInput(DenseMat* gradInput, DenseMat* gradOutput, Dtype beta) = 0; 46 | 47 | virtual void ResetOutput(const IMatrix* input, DenseMat* output) = 0; 48 | 49 | std::string name; 50 | }; 51 | 52 | template 53 | class IDiffParam : public IParam 54 | { 55 | public: 56 | IDiffParam() {} 57 | IDiffParam(std::string _name) 58 | : IParam(_name) 59 | { 60 | p.clear(); 61 | } 62 | virtual bool IsDiff() override 63 | { 64 | return true; 65 | } 66 | 67 | virtual void Serialize(FILE* fid) override 68 | { 69 | for (auto it = p.begin(); it != p.end(); ++it) 70 | { 71 | it->second->value.Serialize(fid); 72 | it->second->grad.Serialize(fid); 73 | } 74 | } 75 | 76 | virtual void Deserialize(FILE* fid) override 77 | { 78 | for (auto it = p.begin(); it != p.end(); ++it) 79 | { 80 | it->second->value.Deserialize(fid); 81 | it->second->grad.Deserialize(fid); 82 | } 83 | } 84 | 85 | virtual void AccDeriv(IMatrix* input, DenseMat* gradOutput) = 0; 86 | 87 | std::map*> p; 88 | }; 89 | 90 | template 91 | class IConstParam : public IParam 92 | { 93 | public: 94 | IConstParam() {} 95 | IConstParam(std::string _name) 96 | : IParam(_name) 97 | { 98 | 99 | } 100 | 101 | virtual void Serialize(FILE* fid) override {} 102 | 103 | virtual void Deserialize(FILE* fid) override {} 104 | 105 | virtual void InitConst(void* side_info) = 0; 106 | virtual bool IsDiff() override 107 | { 108 | return false; 109 | } 110 | }; 111 | 112 | #endif -------------------------------------------------------------------------------- /include/net/inner_product_layer.h: -------------------------------------------------------------------------------- 1 | #ifndef INNER_PRODUCT_LAYER_H 2 | #define INNER_PRODUCT_LAYER_H 3 | 4 | #include "i_layer.h" 5 | 6 | template 7 | class InnerProductLayer : public ILayer 8 | { 9 | public: 10 | InnerProductLayer(std::string _name, PropErr _properr = PropErr::T) 11 | : ILayer(_name, _properr) 12 | { 13 | this->state = new DenseMat(); 14 | this->grad = new DenseMat(); 15 | } 16 | 17 | static std::string str_type() 18 | { 19 | return "InnerProduct"; 20 | } 21 | 22 | virtual void UpdateOutput(std::vector< ILayer* >& operands, Phase phase) override 23 | { 24 | assert(operands.size() == 2); 25 | 26 | auto& cur_output = this->state->DenseDerived(); 27 | buf.EleWiseMul(operands[0]->state->DenseDerived(), operands[1]->state->DenseDerived()); 28 | ones.Resize(buf.cols, 1); 29 | ones.Fill(1.0); 30 | cur_output.GeMM(buf, ones, Trans::N, Trans::N, 1.0, 0.0); 31 | } 32 | 33 | virtual void BackPropErr(std::vector< ILayer* >& operands, unsigned cur_idx, Dtype beta) override 34 | { 35 | assert(operands.size() == 2); 36 | 37 | auto& cur_grad = this->grad->DenseDerived(); 38 | auto& prev_grad = operands[cur_idx]->grad->DenseDerived(); 39 | auto& another_operand = operands[1 - cur_idx]->state->DenseDerived(); 40 | 41 | buf.MulColVec(another_operand, cur_grad); 42 | if (beta == 0) 43 | prev_grad.CopyFrom(buf); 44 | else 45 | prev_grad.Axpby(1.0, buf, beta); 46 | } 47 | 48 | DenseMat 
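// Parameter plumbing: every trainable tensor lives in a PP pair (value plus a
// same-shaped grad); an IDiffParam owns a map from name to PP*, e.g.
// LinearParam registers p["weight"] and, with BiasOption::BIAS, p["bias"].
// Serialize/Deserialize iterate that std::map, so the on-disk order is the
// lexicographic key order, identical on save and load.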
buf, ones; 49 | }; 50 | 51 | 52 | #endif -------------------------------------------------------------------------------- /include/net/input_layer.h: -------------------------------------------------------------------------------- 1 | #ifndef INPUT_LAYER_H 2 | #define INPUT_LAYER_H 3 | 4 | #include "i_layer.h" 5 | 6 | template 7 | class InputLayer : public ILayer 8 | { 9 | public: 10 | InputLayer(std::string _name) 11 | : ILayer(_name, PropErr::N) {} 12 | 13 | static std::string str_type() 14 | { 15 | return "Input"; 16 | } 17 | 18 | virtual void UpdateOutput(std::vector< ILayer* >& operands, Phase phase) override {} 19 | virtual void BackPropErr(std::vector< ILayer* >& operands, unsigned cur_idx, Dtype beta) override {} 20 | }; 21 | 22 | #endif -------------------------------------------------------------------------------- /include/net/learner.h: -------------------------------------------------------------------------------- 1 | #ifndef LEARNER_H 2 | #define LEARNER_H 3 | 4 | #include "model.h" 5 | 6 | template 7 | class ILearner 8 | { 9 | public: 10 | explicit ILearner(Model* m, Dtype _init_lr, Dtype _l2_penalty = 0) 11 | : model(m), init_lr(_init_lr), l2_penalty(_l2_penalty), cur_lr(_init_lr), clip_threshold(5), clipping_enabled(true), cur_iter(0) {} 12 | 13 | virtual void Update() = 0; 14 | 15 | Dtype ClipGradients(); 16 | 17 | Model* model; 18 | Dtype init_lr, l2_penalty, cur_lr; 19 | Dtype clip_threshold; 20 | bool clipping_enabled; 21 | int cur_iter; 22 | }; 23 | 24 | template 25 | class SGDLearner : public ILearner 26 | { 27 | public: 28 | explicit SGDLearner(Model* m, Dtype _init_lr, Dtype _l2_penalty = 0) 29 | : ILearner(m, _init_lr, _l2_penalty) {} 30 | 31 | virtual void Update() override; 32 | }; 33 | 34 | template 35 | class MomentumSGDLearner : public ILearner 36 | { 37 | public: 38 | explicit MomentumSGDLearner(Model* m, 39 | Dtype _init_lr, 40 | Dtype _momentum = 0.9, 41 | Dtype _l2_penalty = 0) 42 | : ILearner(m, _init_lr, _l2_penalty), momentum(_momentum) 43 | { 44 | acc_grad_dict.clear(); 45 | } 46 | 47 | virtual void Update() override; 48 | Dtype momentum; 49 | std::map > > acc_grad_dict; 50 | }; 51 | 52 | template 53 | class ExplicitBatchLearner : public ILearner 54 | { 55 | public: 56 | explicit ExplicitBatchLearner(Model* m, Dtype _init_lr, Dtype _l2_penalty = 0) 57 | : ILearner(m, _init_lr, _l2_penalty) 58 | { 59 | acc_grad_dict.clear(); 60 | } 61 | 62 | virtual void Update() override; 63 | void AccumulateGrad(); 64 | 65 | std::map > > acc_grad_dict; 66 | }; 67 | 68 | template 69 | class AdamLearner : public ILearner 70 | { 71 | public: 72 | explicit AdamLearner(Model* m, 73 | Dtype _init_lr, 74 | Dtype _l2_penalty = 0, 75 | Dtype _beta_1 = 0.9, 76 | Dtype _beta_2 = 0.999, 77 | Dtype _eps = 1e-8) 78 | : ILearner(m, _init_lr, _l2_penalty), beta_1(_beta_1), beta_2(_beta_2), eps(_eps) 79 | { 80 | first_moments.clear(); 81 | second_moments.clear(); 82 | } 83 | 84 | virtual void Update() override; 85 | 86 | std::map > > first_moments, second_moments; 87 | Dtype beta_1, beta_2, eps; 88 | DenseMat m_hat, v_hat; 89 | }; 90 | 91 | #endif -------------------------------------------------------------------------------- /include/net/linear_param.h: -------------------------------------------------------------------------------- 1 | #ifndef LINEAR_PARAM_H 2 | #define LINEAR_PARAM_H 3 | 4 | #include "i_param.h" 5 | 6 | template 7 | class DenseMat; 8 | 9 | template 10 | class LinearParam : public IDiffParam 11 | { 12 | public: 13 | LinearParam(FILE* fid) 14 | { 15 | 
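// Learner sketch: AdamLearner keeps per-parameter first/second moment tables;
// the standard Adam step it corresponds to (Kingma & Ba, 2015) is
//     m = beta_1 * m + (1 - beta_1) * g
//     v = beta_2 * v + (1 - beta_2) * (g .* g)
//     m_hat = m / (1 - beta_1^t),   v_hat = v / (1 - beta_2^t)
//     w -= cur_lr * m_hat ./ (sqrt(v_hat) + eps)
// with the exact variant implemented in src/net/learner.cpp.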
this->Deserialize(fid); 16 | } 17 | 18 | LinearParam(std::string _name, size_t _input_size, size_t _output_size, BiasOption _bo = BiasOption::BIAS) 19 | : LinearParam(_name, _input_size, _output_size, 0, 1.0 / sqrt(output_size), _bo) {} 20 | 21 | LinearParam(std::string _name, size_t _input_size, size_t _output_size, Dtype mean, Dtype std, BiasOption _bo = BiasOption::BIAS) 22 | : IDiffParam(_name), bo(_bo) 23 | { 24 | input_size = _input_size; 25 | output_size = _output_size; 26 | 27 | this->p["weight"] = new PP(); 28 | if (bo == BiasOption::BIAS) 29 | this->p["bias"] = new PP(); 30 | 31 | Reset(mean, std); 32 | } 33 | 34 | virtual void ResetOutput(const IMatrix* input, DenseMat* output) override 35 | { 36 | output->Zeros(input->rows, this->p["weight"]->value.cols); 37 | } 38 | 39 | virtual void UpdateOutput(IMatrix* input, DenseMat* output, Dtype beta, Phase phase) override 40 | { 41 | auto& weight = this->p["weight"]->value; 42 | 43 | if (input->GetMatType() == DENSE) 44 | output->GeMM(input->DenseDerived(), weight, Trans::N, Trans::N, 1.0, beta); 45 | else 46 | output->SparseMM(input->SparseDerived(), weight, Trans::N, Trans::N, 1.0, beta); 47 | 48 | if (bo == BiasOption::BIAS) 49 | { 50 | auto& bias = this->p["bias"]->value; 51 | output->AddRowVec(bias, 1.0); 52 | } 53 | } 54 | 55 | virtual void UpdateGradInput(DenseMat* gradInput, DenseMat* gradOutput, Dtype beta) override 56 | { 57 | gradInput->GeMM(*gradOutput, this->p["weight"]->value, Trans::N, Trans::T, 1.0, beta); 58 | } 59 | 60 | virtual void AccDeriv(IMatrix* input, DenseMat* gradOutput) override 61 | { 62 | if (input->GetMatType() == DENSE) 63 | this->p["weight"]->grad.GeMM(input->DenseDerived(), *gradOutput, Trans::T, Trans::N, 1.0, 1.0); 64 | else 65 | this->p["weight"]->grad.SparseMM(input->SparseDerived(), *gradOutput, Trans::T, Trans::N, 1.0, 1.0); 66 | 67 | if (bo == BiasOption::BIAS) 68 | { 69 | bias_multiplier.Resize(1, input->rows); 70 | bias_multiplier.Fill(1.0); 71 | this->p["bias"]->grad.GeMM(bias_multiplier, *gradOutput, Trans::N, Trans::N, 1.0, 1.0); 72 | } 73 | } 74 | 75 | virtual void Reset(Dtype mean, Dtype std) 76 | { 77 | this->p["weight"]->value.SetRandN(mean, std, input_size, output_size); 78 | this->p["weight"]->grad.Zeros(input_size, output_size); 79 | if (bo == BiasOption::BIAS) 80 | { 81 | this->p["bias"]->value.Zeros(1, output_size); 82 | this->p["bias"]->grad.Zeros(1, output_size); 83 | } 84 | } 85 | 86 | protected: 87 | BiasOption bo; 88 | size_t input_size, output_size; 89 | DenseMat bias_multiplier; 90 | }; 91 | 92 | #endif 93 | -------------------------------------------------------------------------------- /include/net/log_layer.h: -------------------------------------------------------------------------------- 1 | #ifndef LOG_LAYER_H 2 | #define LOG_LAYER_H 3 | 4 | #include "i_act_layer.h" 5 | 6 | template 7 | class LogLayer : public IActLayer 8 | { 9 | public: 10 | 11 | LogLayer(std::string _name, PropErr _properr = PropErr::T) 12 | : IActLayer(_name, WriteType::OUTPLACE, _properr) {} 13 | 14 | static std::string str_type() 15 | { 16 | return "Log"; 17 | } 18 | 19 | virtual void Act(DenseMat& prev_out, DenseMat& cur_out) override 20 | { 21 | cur_out.Log(prev_out); 22 | } 23 | 24 | virtual void Derivative(DenseMat& dst, DenseMat& prev_output, 25 | DenseMat& cur_output, DenseMat& cur_grad, Dtype beta) override 26 | { 27 | buf.CopyFrom(prev_output); 28 | buf.Inv(); 29 | buf.EleWiseMul(cur_grad); 30 | 31 | dst.Axpby(1.0, buf, beta); 32 | } 33 | 34 | protected: 35 | 36 | DenseMat buf; 37 | }; 38 
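// LinearParam recap: forward is Y = X * W (+ row-broadcast bias b), so the
// backward calls above implement
//     dX  = dY * W^T          (UpdateGradInput)
//     dW += X^T * dY          (AccDeriv, accumulated with beta = 1)
//     db += 1^T * dY          (via the ones-filled bias_multiplier)
// Caveat: the short LinearParam constructor passes 1.0 / sqrt(output_size) to
// the delegated constructor before the output_size member is initialized;
// the intended expression is presumably 1.0 / sqrt(_output_size).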
| 39 | #endif -------------------------------------------------------------------------------- /include/net/loss_func.h: -------------------------------------------------------------------------------- 1 | #ifndef LOSS_FUNC_H 2 | #define LOSS_FUNC_H 3 | 4 | #include "dense_matrix.h" 5 | #include "sparse_matrix.h" 6 | 7 | enum class RankOrder 8 | { 9 | ASCE, 10 | DESC 11 | }; 12 | 13 | template 14 | class LossFunc; 15 | 16 | template 17 | class LossFunc 18 | { 19 | public: 20 | static Dtype GetLogLoss(DenseMat& pred, SparseMat& label); 21 | static Dtype GetErrCnt(DenseMat& pred, SparseMat& label); 22 | static Dtype GetAverageRank(DenseMat& pred, SparseMat& label, RankOrder order); 23 | }; 24 | 25 | template 26 | class LossFunc 27 | { 28 | public: 29 | static Dtype GetLogLoss(DenseMat& pred, SparseMat& label); 30 | static Dtype GetErrCnt(DenseMat& pred, SparseMat& label); 31 | static Dtype GetAverageRank(DenseMat& pred, SparseMat& label, RankOrder order); 32 | 33 | private: 34 | static DenseMat buf; 35 | }; 36 | 37 | template 38 | DenseMat LossFunc::buf; 39 | 40 | 41 | 42 | 43 | #endif -------------------------------------------------------------------------------- /include/net/max_entropy_criterion_layer.h: -------------------------------------------------------------------------------- 1 | #ifndef MAX_ENTROPY_CRITERION_LAYER_H 2 | #define MAX_ENTROPY_CRITERION_LAYER_H 3 | 4 | #include "i_criterion_layer.h" 5 | 6 | template 7 | class MaxEntropyCriterionLayer : public ICriterionLayer 8 | { 9 | public: 10 | 11 | MaxEntropyCriterionLayer(std::string _name, PropErr _properr = PropErr::T) 12 | : MaxEntropyCriterionLayer(_name, 1.0, _properr) {} 13 | 14 | MaxEntropyCriterionLayer(std::string _name, Dtype _lambda, PropErr _properr = PropErr::T); 15 | 16 | static std::string str_type() 17 | { 18 | return "EntropyLoss"; 19 | } 20 | 21 | virtual void UpdateOutput(std::vector< ILayer* >& operands, Phase phase) override; 22 | 23 | virtual void BackPropErr(std::vector< ILayer* >& operands, unsigned cur_idx, Dtype beta) override; 24 | }; 25 | 26 | #endif -------------------------------------------------------------------------------- /include/net/mixture_nll_criterion_layer.h: -------------------------------------------------------------------------------- 1 | #ifndef SUM_LOSS_CRITERION_LAYER_H 2 | #define SUM_LOSS_CRITERION_LAYER_H 3 | 4 | #include "i_criterion_layer.h" 5 | 6 | template 7 | class MixtureNLLCriterionLayer : public ICriterionLayer 8 | { 9 | public: 10 | 11 | MixtureNLLCriterionLayer(std::string _name, PropErr _properr = PropErr::T) 12 | : MixtureNLLCriterionLayer(_name, 1.0, _properr) {} 13 | 14 | MixtureNLLCriterionLayer(std::string _name, Dtype _lambda, PropErr _properr = PropErr::T); 15 | 16 | virtual void UpdateOutput(std::vector< ILayer* >& operands, Phase phase) override; 17 | 18 | virtual void BackPropErr(std::vector< ILayer* >& operands, unsigned cur_idx, Dtype beta) override; 19 | }; 20 | 21 | #endif -------------------------------------------------------------------------------- /include/net/model.h: -------------------------------------------------------------------------------- 1 | #ifndef MODEL_H 2 | #define MODEL_H 3 | 4 | #include "i_param.h" 5 | #include 6 | #include 7 | #include "fmt.h" 8 | 9 | template 10 | class Model 11 | { 12 | public: 13 | 14 | Model() 15 | { 16 | flatten = false; 17 | diff_params.clear(); 18 | const_params.clear(); 19 | param_list.clear(); 20 | } 21 | 22 | inline void AddParam(IDiffParam* param) 23 | { 24 | flatten = false; 25 | 
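// LossFunc conventions (implementations in src/net/loss_func.cpp/.cu):
// GetLogLoss returns the summed negative log-likelihood -sum_i log p[i, y_i]
// with labels given as a one-hot SparseMat, GetErrCnt counts argmax
// mispredictions, and GetAverageRank reports the mean rank of the true label
// under the chosen ASCE/DESC order. Beware that mixture_nll_criterion_layer.h
// guards itself with SUM_LOSS_CRITERION_LAYER_H, a name that does not match
// the file and would silently collide with a header that legitimately owns it.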
assert(diff_params.count(param->name) == 0); 26 | diff_params[param->name] = param; 27 | all_params[param->name] = param; 28 | } 29 | 30 | inline void AddParam(IConstParam* param) 31 | { 32 | assert(const_params.count(param->name) == 0); 33 | const_params[param->name] = param; 34 | all_params[param->name] = param; 35 | } 36 | 37 | inline void SetupConstParams(std::map arg_dict) 38 | { 39 | for (auto& p : arg_dict) 40 | { 41 | assert(const_params.count(p.first)); 42 | const_params[p.first]->InitConst(p.second); 43 | } 44 | } 45 | 46 | std::map< std::string, PP* >& GetDiffParams() 47 | { 48 | if (!flatten) 49 | DiffParams2List(); 50 | return param_list; 51 | } 52 | 53 | void DiffParams2List() 54 | { 55 | param_list.clear(); 56 | for (auto& param_pair : diff_params) 57 | { 58 | for (auto& weight_pair : param_pair.second->p) 59 | { 60 | param_list[param_pair.first + "-" + weight_pair.first] = weight_pair.second; 61 | } 62 | } 63 | } 64 | void Load(std::string filename) 65 | { 66 | FILE* fid = fopen(filename.c_str(), "rb"); 67 | 68 | for (auto it = diff_params.begin(); it != diff_params.end(); ++it) 69 | it->second->Deserialize(fid); 70 | 71 | fclose(fid); 72 | } 73 | 74 | void Save(std::string filename) 75 | { 76 | FILE* fid = fopen(filename.c_str(), "wb"); 77 | 78 | for (auto it = diff_params.begin(); it != diff_params.end(); ++it) 79 | it->second->Serialize(fid); 80 | 81 | fclose(fid); 82 | } 83 | 84 | std::map< std::string, IDiffParam* > diff_params; 85 | std::map< std::string, IConstParam* > const_params; 86 | std::map< std::string, IParam*> all_params; 87 | bool flatten; 88 | std::map< std::string, PP* > param_list; 89 | }; 90 | 91 | template
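// End-to-end usage sketch (hypothetical; template arguments written as
// <CPU, float> on the assumption of the usual <mode, Dtype> parameters):
//     Model<CPU, float> model;
//     auto* fc = new LinearParam<CPU, float>("fc", 128, 1);
//     model.AddParam(fc);                      // registered under "fc"
//     AdamLearner<CPU, float> learner(&model, 1e-3);
//     // per minibatch: forward, backward, then
//     learner.Update();
//     model.Save("model.bin");                 // raw fwrite of every PP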