├── .gitignore ├── .gitmodules ├── CMakeLists.txt ├── README.md ├── dataset ├── a9a_test ├── a9a_train0 └── a9a_train1 ├── include └── thundergbm │ ├── clion_cuda.h │ ├── config.h.in │ ├── csc2r_transform.h │ ├── dataset.h │ ├── gpu_lsh.h │ ├── hist_cut.h │ ├── ins_stat.h │ ├── param.h │ ├── quantile_sketch.h │ ├── sparse_columns.h │ ├── syncarray.h │ ├── syncmem.h │ ├── thundergbm.h │ ├── tree.h │ ├── updater │ ├── exact_updater.h │ └── hist_updater.h │ └── util │ ├── common.h │ ├── cub_wrapper.h │ ├── device_lambda.cuh │ ├── log.h │ └── multi_device.h └── src ├── test ├── CMakeLists.txt ├── test_exact_updater.cu ├── test_main.cpp └── test_unifiedmem.cu └── thundergbm ├── CMakeLists.txt ├── csc2r_transform.cu ├── dataset.cu ├── gpu_lsh.cu ├── hist_cut.cpp ├── ins_stat.cu ├── quantile_sketch.cpp ├── sparse_columns.cu ├── syncmem.cpp ├── tree.cpp ├── updater ├── exact_updater.cu └── hist_updater.cu └── util ├── common.cpp └── log.cpp /.gitignore: -------------------------------------------------------------------------------- 1 | .* 2 | !.gitignore 3 | !.gitmodules 4 | !.travis.yml 5 | *build* 6 | !dataset/*.sh 7 | !dataset/test_dataset.txt 8 | html 9 | latex 10 | logs 11 | *.pyc 12 | 13 | nccl* 14 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "src/test/googletest"] 2 | path = src/test/googletest 3 | url = https://github.com/google/googletest.git 4 | [submodule "cub"] 5 | path = cub 6 | url = git@github.com:NVlabs/cub.git 7 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.1) 2 | project(thundergbm) 3 | 4 | find_package(CUDA REQUIRED QUIET) 5 | find_package(OpenMP REQUIRED QUIET) 6 | 7 | if (NOT CMAKE_BUILD_TYPE) 8 | set(CMAKE_BUILD_TYPE Release) 9 | endif () 10 | 11 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -std=c++11 -gencode arch=compute_60,code=sm_60 --expt-extended-lambda --default-stream per-thread") 12 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}" --source-in-ptx -lineinfo) 13 | 14 | if (CMAKE_VERSION VERSION_LESS "3.1") 15 | add_compile_options("-std=c++11") 16 | else () 17 | set(CMAKE_CXX_STANDARD 11) 18 | endif () 19 | 20 | if (OPENMP_FOUND) 21 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") 22 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") 23 | endif () 24 | 25 | add_definitions("-DELPP_FEATURE_PERFORMANCE_TRACKING") 26 | add_definitions("-DELPP_THREAD_SAFE") 27 | set(COMMON_INCLUDES ${PROJECT_SOURCE_DIR}/include ${CMAKE_CURRENT_BINARY_DIR} ${PROJECT_SOURCE_DIR}/cub) 28 | include_directories(${COMMON_INCLUDES}) 29 | 30 | add_subdirectory(src/thundergbm) 31 | add_subdirectory(src/test) 32 | 33 | # configuration file 34 | set(DATASET_DIR ${PROJECT_SOURCE_DIR}/dataset/) 35 | configure_file(include/thundergbm/config.h.in config.h) 36 | 37 | add_custom_target(runtest ${PROJECT_NAME}-test --gtest_filter='UpdaterTest*') 38 | add_custom_target(runtest-performance ${PROJECT_NAME}-test --gtest_filter='PerformanceTest*') 39 | 40 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | This is the code of paper [Practical Federated Gradient Boosting Decision Trees](https://arxiv.org/pdf/1911.04206.pdf). 
The implementation is based on a previous version of [ThunderGBM](https://github.com/Xtra-Computing/thundergbm.git). Only Linux-based operating systems are supported. 2 | 3 | 4 | # Installation 5 | 6 | ## Prerequisites 7 | * CMake 8 | * CUDA 9 | 10 | ## Build 11 | ``` 12 | git submodule init 13 | git submodule update 14 | mkdir build 15 | cd build 16 | cmake .. 17 | make -j 18 | ``` 19 | 20 | 21 | # Usage 22 | 23 | SimFL currently only works for binary classification tasks with labels 0 and 1, and it requires GPUs. 24 | 25 | ## Parameters 26 | 27 | ``` 28 | * -p: int, number of parties (default: 2) 29 | * -t: int, number of LSH tables (default: 40) 30 | * -b: int, number of buckets (default: 500) 31 | * -r: float, r value of the LSH function (default: 4.0) 32 | * -s: int, initial seed for LSH 33 | * -f: string, path to the dataset file 34 | * -d: int, the maximum dimension of the datasets 35 | ``` 36 | 37 | ## Datasets 38 | 39 | Please name each party's local training dataset in the format name+'_train'+party_id, e.g., `a9a_train0`, `a9a_train1`. Name the test dataset in the format name+'_test', e.g., `a9a_test`. 40 | 41 | ## Sample command 42 | Run the following under the `build` directory: 43 | 44 | ``` 45 | ./src/test/thundergbm-test -p 2 -t 30 -b 500 -r 4 -s -1 -f ../dataset/a9a/a9a -d 123 -n 50 -e 8 46 | ``` 47 | 48 | -------------------------------------------------------------------------------- /include/thundergbm/clion_cuda.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by jiashuai on 17-9-14. 3 | // 4 | 5 | #ifndef THUNDERSVM_CLION_CUDA_H 6 | #define THUNDERSVM_CLION_CUDA_H 7 | 8 | #ifdef __JETBRAINS_IDE__ 9 | 10 | #include "math.h" 11 | 12 | #define __CUDACC__ 1 13 | #define __host__ 14 | #define __device__ 15 | #define __global__ 16 | #define __noinline__ 17 | #define __forceinline__ 18 | #define __shared__ 19 | #define __constant__ 20 | #define __managed__ 21 | #define __restrict__ 22 | 23 | // CUDA Synchronization 24 | inline void __syncthreads() {}; 25 | 26 | inline void __threadfence_block() {}; 27 | 28 | inline void __threadfence() {}; 29 | 30 | inline void __threadfence_system() {}; 31 | 32 | inline int __syncthreads_count(int predicate) { return predicate; }; 33 | 34 | inline int __syncthreads_and(int predicate) { return predicate; }; 35 | 36 | inline int __syncthreads_or(int predicate) { return predicate; }; 37 | 38 | template <typename T> 39 | inline T __clz(const T val) { return val; } 40 | 41 | template <typename T> 42 | inline T __ldg(const T *address) { return *address; }; 43 | // CUDA TYPES 44 | typedef unsigned char uchar; 45 | typedef unsigned short ushort; 46 | typedef unsigned int uint; 47 | typedef unsigned long ulong; 48 | typedef unsigned long long ulonglong; 49 | typedef long long longlong; 50 | 51 | typedef struct uchar1 { 52 | uchar x; 53 | } uchar1; 54 | 55 | typedef struct uchar2 { 56 | uchar x; 57 | uchar y; 58 | } uchar2; 59 | 60 | typedef struct uchar3 { 61 | uchar x; 62 | uchar y; 63 | uchar z; 64 | } uchar3; 65 | 66 | typedef struct uchar4 { 67 | uchar x; 68 | uchar y; 69 | uchar z; 70 | uchar w; 71 | } uchar4; 72 | 73 | typedef struct char1 { 74 | char x; 75 | } char1; 76 | 77 | typedef struct char2 { 78 | char x; 79 | char y; 80 | } char2; 81 | 82 | typedef struct char3 { 83 | char x; 84 | char y; 85 | char z; 86 | } char3; 87 | 88 | typedef struct char4 { 89 | char x; 90 | char y; 91 | char z; 92 | char w; 93 | } char4; 94 | 95 | typedef struct ushort1 { 96 | ushort x; 97 | } ushort1; 98 | 99 | typedef struct ushort2 { 100 | ushort x; 101 |
ushort y; 102 | } ushort2; 103 | 104 | typedef struct ushort3 { 105 | ushort x; 106 | ushort y; 107 | ushort z; 108 | } ushort3; 109 | 110 | typedef struct ushort4 { 111 | ushort x; 112 | ushort y; 113 | ushort z; 114 | ushort w; 115 | } ushort4; 116 | 117 | typedef struct short1 { 118 | short x; 119 | } short1; 120 | 121 | typedef struct short2 { 122 | short x; 123 | short y; 124 | } short2; 125 | 126 | typedef struct short3 { 127 | short x; 128 | short y; 129 | short z; 130 | } short3; 131 | 132 | typedef struct short4 { 133 | short x; 134 | short y; 135 | short z; 136 | short w; 137 | } short4; 138 | 139 | typedef struct uint1 { 140 | uint x; 141 | } uint1; 142 | 143 | typedef struct uint2 { 144 | uint x; 145 | uint y; 146 | } uint2; 147 | 148 | typedef struct uint3 { 149 | uint x; 150 | uint y; 151 | uint z; 152 | } uint3; 153 | 154 | typedef struct uint4 { 155 | uint x; 156 | uint y; 157 | uint z; 158 | uint w; 159 | } uint4; 160 | 161 | typedef struct int1 { 162 | int x; 163 | } int1; 164 | 165 | typedef struct int2 { 166 | int x; 167 | int y; 168 | } int2; 169 | 170 | typedef struct int3 { 171 | int x; 172 | int y; 173 | int z; 174 | } int3; 175 | 176 | typedef struct int4 { 177 | int x; 178 | int y; 179 | int z; 180 | int w; 181 | } int4; 182 | 183 | typedef struct ulong1 { 184 | ulong x; 185 | } ulong1; 186 | 187 | typedef struct ulong2 { 188 | ulong x; 189 | ulong y; 190 | } ulong2; 191 | 192 | typedef struct ulong3 { 193 | ulong x; 194 | ulong y; 195 | ulong z; 196 | } ulong3; 197 | 198 | typedef struct ulong4 { 199 | ulong x; 200 | ulong y; 201 | ulong z; 202 | ulong w; 203 | } ulong4; 204 | 205 | typedef struct long1 { 206 | long x; 207 | } long1; 208 | 209 | typedef struct long2 { 210 | long x; 211 | long y; 212 | } long2; 213 | 214 | typedef struct long3 { 215 | long x; 216 | long y; 217 | long z; 218 | } long3; 219 | 220 | typedef struct long4 { 221 | long x; 222 | long y; 223 | long z; 224 | long w; 225 | } long4; 226 | 227 | typedef struct ulonglong1 { 228 | ulonglong x; 229 | } ulonglong1; 230 | 231 | typedef struct ulonglong2 { 232 | ulonglong x; 233 | ulonglong y; 234 | } ulonglong2; 235 | 236 | typedef struct ulonglong3 { 237 | ulonglong x; 238 | ulonglong y; 239 | ulonglong z; 240 | } ulonglong3; 241 | 242 | typedef struct ulonglong4 { 243 | ulonglong x; 244 | ulonglong y; 245 | ulonglong z; 246 | ulonglong w; 247 | } ulonglong4; 248 | 249 | typedef struct longlong1 { 250 | longlong x; 251 | } longlong1; 252 | 253 | typedef struct longlong2 { 254 | longlong x; 255 | longlong y; 256 | } longlong2; 257 | 258 | typedef struct float1 { 259 | float x; 260 | } float1; 261 | 262 | typedef struct float2 { 263 | float x; 264 | float y; 265 | } float2; 266 | 267 | typedef struct float3 { 268 | float x; 269 | float y; 270 | float z; 271 | } float3; 272 | 273 | typedef struct float4 { 274 | float x; 275 | float y; 276 | float z; 277 | float w; 278 | } float4; 279 | 280 | typedef struct double1 { 281 | double x; 282 | } double1; 283 | 284 | typedef struct double2 { 285 | double x; 286 | double y; 287 | } double2; 288 | 289 | typedef uint3 dim3; 290 | 291 | extern dim3 gridDim; 292 | extern uint3 blockIdx; 293 | extern dim3 blockDim; 294 | extern uint3 threadIdx; 295 | extern int warpsize; 296 | #endif 297 | #endif //THUNDERSVM_CLION_CUDA_H 298 | -------------------------------------------------------------------------------- /include/thundergbm/config.h.in: -------------------------------------------------------------------------------- 1 | #cmakedefine DATASET_DIR 
"@DATASET_DIR@" 2 | -------------------------------------------------------------------------------- /include/thundergbm/csc2r_transform.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by qinbin on 21/8/18. 3 | // 4 | 5 | 6 | #ifndef THUNDERGBM_CSC2R_TRANSFORM_H 7 | #define THUNDERGBM_CSC2R_TRANSFORM_H 8 | 9 | #include "thundergbm.h" 10 | #include "syncarray.h" 11 | #include "cusparse.h" 12 | #include "sparse_columns.h" 13 | //change csc 2 csr or csr 2 csc 14 | 15 | class Csc2r{ 16 | public: 17 | SyncArray csc_val; 18 | SyncArray csc_row_ind; 19 | SyncArray csc_col_ptr; 20 | int nnz; 21 | //for gpu 22 | void from_csr(float_type* val, int* csr_col_ind, int* csr_row_ptr, int n_instances, int n_column, int nnz); 23 | void get_cut_points_evenly(int nBin, vector& bin_id, const vector& min_fea, const vector& max_fea); 24 | // void init_bin_id_csr(const vector>> &v_columns, int n_instances); 25 | }; 26 | 27 | #endif //THUNDERGBM_CSC2R_TRANSFORM_H 28 | -------------------------------------------------------------------------------- /include/thundergbm/dataset.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by jiashuai on 18-1-17. 3 | // 4 | 5 | #ifndef THUNDERGBM_DATASET_H 6 | #define THUNDERGBM_DATASET_H 7 | 8 | #include "thundergbm.h" 9 | #include "syncarray.h" 10 | 11 | class DataSet { 12 | public: 13 | ///one feature value and corresponding index 14 | struct node { 15 | node(int index, float_type value) : index(index), value(value) {} 16 | 17 | int index; 18 | float_type value; 19 | }; 20 | 21 | ///two-dimension node vector 22 | typedef vector> node2d; 23 | 24 | ///load dataset from file 25 | void load_from_file(string file_name); 26 | 27 | void load_from_file_csr(string file_name); 28 | 29 | void load_from_file_two_dimension(string file_name); 30 | 31 | const node2d &instances() const; 32 | 33 | size_t n_features() const; 34 | 35 | size_t n_instances() const; 36 | 37 | const vector &y() const; 38 | vector> features; 39 | vector> line_num; 40 | 41 | //to do 42 | vector min_fea; 43 | vector max_fea; 44 | 45 | //suppose know the max dimension of all datasets 46 | int max_dimension = 123; 47 | 48 | void compression(); 49 | private: 50 | ///labels of instances 51 | vector y_; 52 | node2d instances_; 53 | size_t n_features_; 54 | }; 55 | 56 | 57 | #endif //THUNDERGBM_DATASET_H 58 | -------------------------------------------------------------------------------- /include/thundergbm/gpu_lsh.h: -------------------------------------------------------------------------------- 1 | #ifndef THUNDERGBM_GPU_LSH_H 2 | #define THUNDERGBM_GPU_LSH_H 3 | 4 | 5 | ////////////////////////////////////////////////////////////////////////////// 6 | /// Copyright (C) 2014 Gefu Tang . All Rights Reserved. 7 | /// 8 | /// This file is part of LSHBOX. 9 | /// 10 | /// LSHBOX is free software: you can redistribute it and/or modify it under 11 | /// the terms of the GNU General Public License as published by the Free 12 | /// Software Foundation, either version 3 of the License, or(at your option) 13 | /// any later version. 14 | /// 15 | /// LSHBOX is distributed in the hope that it will be useful, but WITHOUT 16 | /// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 17 | /// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 18 | /// more details. 19 | /// 20 | /// You should have received a copy of the GNU General Public License along 21 | /// with LSHBOX. 
If not, see <http://www.gnu.org/licenses/>. 22 | /// 23 | /// @version 0.1 24 | /// @author Gefu Tang & Zhifeng Xiao 25 | /// @date 2014.6.30 26 | ////////////////////////////////////////////////////////////////////////////// 27 | 28 | /** 29 | * @file psdlsh.h 30 | * 31 | * @brief Locality-Sensitive Hashing Scheme Based on p-Stable Distributions. 32 | */ 33 | //#pragma once 34 | 35 | #include "thundergbm.h" 36 | #include "cusparse.h" 37 | #include 38 | #include 39 | #include 40 | #include 41 | #include "syncarray.h" 42 | /** 43 | * Locality-Sensitive Hashing Scheme Based on p-Stable Distributions. 44 | * 45 | * 46 | * For more information on p-stable distribution based LSH, see the following reference. 47 | * 48 | * Mayur Datar, Nicole Immorlica, Piotr Indyk, Vahab S. Mirrokni, 49 | * Locality-sensitive hashing scheme based on p-stable distributions, 50 | * Proceedings of the twentieth annual symposium on Computational geometry, June 51 | * 08-11, 2004, Brooklyn, New York, USA. 52 | */ 53 | //template 54 | class psdLsh 55 | { 56 | public: 57 | struct Parameter 58 | { 59 | //number of buckets in a hash table 60 | unsigned n_bucket; 61 | //number of hash tables 62 | unsigned n_table; 63 | //number of dimensions 64 | unsigned n_dimension; 65 | //p_norm = 1: Cauchy distribution; = 2: Gaussian distribution 66 | unsigned p_norm; 67 | //the range r 68 | float r; 69 | //the number of parties 70 | unsigned n_comp; 71 | int seed; 72 | }; 73 | psdLsh() {} 74 | psdLsh(const Parameter &param_) 75 | { 76 | reset(param_); 77 | } 78 | ~psdLsh() {} 79 | 80 | 81 | void init(); 82 | /** 83 | * Reset the parameter setting 84 | * 85 | * @param param_ An instance of psdLsh::Parameter, which contains 86 | * the necessary parameters 87 | */ 88 | void reset(const Parameter &param_); 89 | /** 90 | * Hash the dataset. 91 | * 92 | * @param data An instance of Matrix; it is the search dataset. 93 | */ 94 | void hash(int n_instances, int n_features, int nnz, int key_offset, 95 | SyncArray &csr_val, SyncArray &csr_row_ptr, SyncArray &csr_col_ind, 96 | SyncArray &hash_values, int cid); 97 | 98 | // void query(int n_instances, int n_features, int nnz, 99 | // SyncArray &csr_val, SyncArray &csr_row_ptr, SyncArray &csr_col_ind, 100 | // vector> &buckets); 101 | 102 | /** 103 | * Insert a vector to the index. 104 | * 105 | * @param key The sequence number of the vector 106 | * @param domin The pointer to the vector 107 | */ 108 | // void insert(unsigned key, const DATATYPE *domin); 109 | /** 110 | * Query the approximate nearest neighborhoods. 111 | * 112 | * @param domin The pointer to the vector 113 | * @param scanner Top-K scanner, used to scan the approximate nearest neighborhoods 114 | */ 115 | // template 116 | // void query(const DATATYPE *domin, SCANNER &scanner); 117 | /** 118 | * Get the hash value of a vector. 119 | * 120 | * @param k The index of the hash table 121 | * @param domin The pointer to the vector 122 | * @return The hash value 123 | */ 124 | // unsigned getHashVal(unsigned k, const DATATYPE *domin); 125 | /** 126 | * Load the index from a binary file. 127 | * 128 | * @param file The path of the binary file. 129 | */ 130 | // void load(const std::string &file); 131 | /** 132 | * Save the index as a binary file. 133 | * 134 | * @param file The path of the binary file.
135 | */ 136 | // void save(const std::string &file); 137 | //private: 138 | Parameter param; 139 | SyncArray b; 140 | SyncArray a; 141 | 142 | vector v_a; 143 | vector v_b; 144 | //random number vector before distribution transform 145 | vector random_vector; 146 | 147 | 148 | // std::vector v_a; 149 | // std::vector rndBs; 150 | // std::vector > stableArray; 151 | std::vector> > > tables; 152 | // std::vector> > > tables; 153 | }; 154 | 155 | // ------------------------- implementation ------------------------- 156 | 157 | //template 158 | //void lshbox::psdLsh::hash(Matrix &data) 159 | //{ 160 | // for (unsigned i = 0; i != data.getSize(); ++i) 161 | // { 162 | // insert(i, data[i]); 163 | // ++pd; 164 | // } 165 | //} 166 | //template 167 | //void lshbox::psdLsh::insert(unsigned key, const DATATYPE *domin) 168 | //{ 169 | // for(unsigned k = 0; k < param.n_table; k++){ 170 | // unsigned hashVal = getHashVal(k, ) 171 | // } 172 | //// for (unsigned k = 0; k != param.L; ++k) 173 | //// { 174 | //// unsigned hashVal = getHashVal(k, domin); 175 | //// tables[k][hashVal].push_back(key); 176 | //// } 177 | //} 178 | //template 179 | //template 180 | //void lshbox::psdLsh::query(const DATATYPE *domin, SCANNER &scanner) 181 | //{ 182 | // scanner.reset(domin); 183 | // for (unsigned k = 0; k != param.L; ++k) 184 | // { 185 | // unsigned hashVal = getHashVal(k, domin); 186 | // if (tables[k].find(hashVal) != tables[k].end()) 187 | // { 188 | // for (std::vector::iterator iter = tables[k][hashVal].begin(); iter != tables[k][hashVal].end(); ++iter) 189 | // { 190 | // scanner(*iter); 191 | // } 192 | // } 193 | // } 194 | // scanner.topk().genTopk(); 195 | //} 196 | //template 197 | //unsigned lshbox::psdLsh::getHashVal(unsigned k, const DATATYPE *domin) 198 | //{ 199 | // float sum(0); 200 | // for (unsigned i = 0; i != param.D; ++i) 201 | // { 202 | // sum += domin[i] * stableArray[k][i]; 203 | // } 204 | // unsigned hashVal = unsigned(std::floor((sum + rndBs[k]) / param.W)) % param.M; 205 | // return hashVal; 206 | //} 207 | //template 208 | //void lshbox::psdLsh::load(const std::string &file) 209 | //{ 210 | // std::ifstream in(file, std::ios::binary); 211 | // in.read((char *)¶m.M, sizeof(unsigned)); 212 | // in.read((char *)¶m.L, sizeof(unsigned)); 213 | // in.read((char *)¶m.D, sizeof(unsigned)); 214 | // in.read((char *)¶m.W, sizeof(float)); 215 | // tables.resize(param.L); 216 | // stableArray.resize(param.L); 217 | // rndBs.resize(param.L); 218 | // in.read((char *)&rndBs[0], sizeof(float) * param.L); 219 | // for (unsigned i = 0; i != param.L; ++i) 220 | // { 221 | // stableArray[i].resize(param.D); 222 | // in.read((char *)&stableArray[i][0], sizeof(float) * param.D); 223 | // unsigned count; 224 | // in.read((char *)&count, sizeof(unsigned)); 225 | // for (unsigned j = 0; j != count; ++j) 226 | // { 227 | // unsigned target; 228 | // in.read((char *)&target, sizeof(unsigned)); 229 | // unsigned length; 230 | // in.read((char *)&length, sizeof(unsigned)); 231 | // tables[i][target].resize(length); 232 | // in.read((char *) & (tables[i][target][0]), sizeof(unsigned) * length); 233 | // } 234 | // } 235 | // in.close(); 236 | //} 237 | //template 238 | //void lshbox::psdLsh::save(const std::string &file) 239 | //{ 240 | // std::ofstream out(file, std::ios::binary); 241 | // out.write((char *)¶m.M, sizeof(unsigned)); 242 | // out.write((char *)¶m.L, sizeof(unsigned)); 243 | // out.write((char *)¶m.D, sizeof(unsigned)); 244 | // out.write((char *)¶m.W, sizeof(float)); 245 | // 
out.write((char *)&rndBs[0], sizeof(float) * param.L); 246 | // for (int i = 0; i != param.L; ++i) 247 | // { 248 | // out.write((char *)&stableArray[i][0], sizeof(float) * param.D); 249 | // unsigned count = unsigned(tables[i].size()); 250 | // out.write((char *)&count, sizeof(unsigned)); 251 | // for (std::map >::iterator iter = tables[i].begin(); iter != tables[i].end(); ++iter) 252 | // { 253 | // unsigned target = iter->first; 254 | // out.write((char *)&target, sizeof(unsigned)); 255 | // unsigned length = unsigned(iter->second.size()); 256 | // out.write((char *)&length, sizeof(unsigned)); 257 | // out.write((char *) & ((iter->second)[0]), sizeof(unsigned) * length); 258 | // } 259 | // } 260 | // out.close(); 261 | //} 262 | 263 | 264 | #endif //THUNDERGBM_GPU_LSH_H 265 | -------------------------------------------------------------------------------- /include/thundergbm/hist_cut.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by qinbin on 2018/5/9. 3 | // 4 | 5 | #ifndef THUNDERGBM_HIST_CUT_H 6 | #define THUNDERGBM_HIST_CUT_H 7 | 8 | #include "thundergbm/thundergbm.h" 9 | #include "thundergbm/dataset.h" 10 | #include "thundergbm/tree.h" 11 | #include "sparse_columns.h" 12 | 13 | class HistCut { 14 | public: 15 | //split_point[i] stores the split points of feature i 16 | //std::vector> split_points; 17 | vector cut_points; 18 | vector row_ptr; 19 | //for gpu 20 | SyncArray cut_points_val; 21 | SyncArray cut_row_ptr; 22 | 23 | HistCut() = default; 24 | 25 | HistCut(const HistCut &cut) { 26 | cut_points = cut.cut_points; 27 | row_ptr = cut.row_ptr; 28 | cut_points_val.copy_from(cut.cut_points_val); 29 | cut_row_ptr.copy_from(cut.cut_row_ptr); 30 | } 31 | 32 | void get_cut_points(SparseColumns &columns, InsStat &stats, int max_num_bins, int n_instances, int n_features); 33 | }; 34 | 35 | //store the g/h of the bins of one feature 36 | class BinStat { 37 | public: 38 | SyncArray gh_pair; 39 | //feature id 40 | int fid; 41 | //number of bins 42 | int numBin; 43 | 44 | BinStat() = default; 45 | 46 | //feature: the pointer to features that need to build hist 47 | //insId: the pointer to instance id of features 48 | void Init(HistCut &cut, InsStat &stats, int pid, float_type *f_val, int n_f_val, int *iid); 49 | //void Init(vector& cut_points, InsStat& stats,SparseColumns& columns, int fid); 50 | }; 51 | 52 | #endif //THUNDERGBM_HIST_CUT_H 53 | -------------------------------------------------------------------------------- /include/thundergbm/ins_stat.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by shijiashuai on 5/7/18. 
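The psdLsh class above only stores the parameters of the scheme; the CUDA kernel that evaluates the hashes lives in src/thundergbm/gpu_lsh.cu, which this dump does not include. As a reference for what a p-stable table computes, here is a minimal CPU sketch of one hash function in the Datar et al. scheme cited above. The dense-vector form and all names are illustrative; the repository hashes CSR rows on the GPU.

```
// Minimal sketch of one p-stable LSH table (Datar et al., 2004):
//   h(x) = floor((a . x + b) / r) mod n_bucket
// where a has i.i.d. Gaussian entries for p_norm == 2 (Cauchy for p_norm == 1)
// and b is uniform in [0, r). Dense CPU version for illustration only.
#include <cmath>
#include <random>
#include <vector>

unsigned psd_hash(const std::vector<float> &x, const std::vector<float> &a,
                  float b, float r, unsigned n_bucket) {
    float dot = 0;
    for (size_t i = 0; i < x.size(); ++i) dot += x[i] * a[i];
    long long m = static_cast<long long>(n_bucket);
    long long q = static_cast<long long>(std::floor((dot + b) / r));
    return static_cast<unsigned>(((q % m) + m) % m);   // non-negative bucket id
}

int main() {
    std::mt19937 gen(42);                                // -s seed
    std::normal_distribution<float> gauss(0.f, 1.f);     // p_norm == 2
    std::uniform_real_distribution<float> uni(0.f, 4.f); // b in [0, r), r = 4.0
    std::vector<float> a(123);                           // n_dimension (a9a: 123)
    for (auto &v : a) v = gauss(gen);
    std::vector<float> x(123, 0.f);
    x[5] = 1.f; x[42] = 1.f;                             // a sparse binary instance
    return (int) psd_hash(x, a, uni(gen), 4.f, 500);     // n_bucket = 500
}
```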
3 | // 4 | 5 | #ifndef THUNDERGBM_INS_STAT_H 6 | #define THUNDERGBM_INS_STAT_H 7 | 8 | 9 | #include "syncarray.h" 10 | 11 | struct GHPair { 12 | float_type g; 13 | float_type h; 14 | 15 | HOST_DEVICE GHPair operator+(const GHPair &rhs) const { 16 | GHPair res; 17 | res.g = this->g + rhs.g; 18 | res.h = this->h + rhs.h; 19 | return res; 20 | } 21 | 22 | HOST_DEVICE const GHPair operator-(const GHPair &rhs) const { 23 | GHPair res; 24 | res.g = this->g - rhs.g; 25 | res.h = this->h - rhs.h; 26 | return res; 27 | } 28 | 29 | HOST_DEVICE GHPair() : g(0), h(0) {}; 30 | 31 | HOST_DEVICE GHPair(float_type v) : g(v), h(v) {}; 32 | 33 | HOST_DEVICE GHPair(float_type g, float_type h) : g(g), h(h) {}; 34 | 35 | friend std::ostream &operator<<(std::ostream &os, 36 | const GHPair &p) { 37 | os << string_format("%f/%f", p.g, p.h); 38 | return os; 39 | } 40 | }; 41 | 42 | class InsStat { 43 | public: 44 | 45 | ///gradient and hessian 46 | SyncArray gh_pair; 47 | ///node id 48 | SyncArray nid; 49 | ///target value 50 | SyncArray y; 51 | ///predict value 52 | SyncArray y_predict; 53 | 54 | int n_instances; 55 | 56 | GHPair sum_gh; 57 | 58 | InsStat() = default; 59 | 60 | explicit InsStat(size_t n_instances) { 61 | resize(n_instances); 62 | } 63 | 64 | void resize(size_t n_instances); 65 | 66 | void updateGH(); 67 | 68 | void updateGH(SyncArray& is_multi); 69 | 70 | void updateGH(SyncArray& is_multi, int numP); 71 | }; 72 | 73 | #endif //THUNDERGBM_INS_STAT_H 74 | -------------------------------------------------------------------------------- /include/thundergbm/param.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by shijiashuai on 5/7/18. 3 | // 4 | 5 | #ifndef THUNDERGBM_PARAM_H 6 | #define THUNDERGBM_PARAM_H 7 | 8 | #include "thundergbm.h" 9 | 10 | struct GBMParam { 11 | int depth; 12 | int n_trees; 13 | float_type min_child_weight; 14 | float_type lambda; 15 | float_type gamma; 16 | float_type rt_eps; 17 | float_type learning_rate; 18 | string path; 19 | string test_path; 20 | bool do_exact = true; 21 | 22 | //for histogram 23 | int max_num_bin = 256; 24 | 25 | int n_device; 26 | }; 27 | #endif //THUNDERGBM_PARAM_H 28 | -------------------------------------------------------------------------------- /include/thundergbm/quantile_sketch.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by qinbin on 2018/5/9. 
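InsStat::updateGH above is implemented in src/thundergbm/ins_stat.cu, which this dump does not include. Below is a minimal sketch of what the per-instance update looks like, assuming the squared loss implied by the RMSE evaluation in the commented-out test driver near the end of this dump; the is_multi overloads presumably reweight g/h for bundled instances, which is not modeled here.

```
// Sketch: per-instance gradient/Hessian for squared loss:
//   g_i = y_predict_i - y_i,   h_i = 1.
// This mirrors the GHPair fields above; the repository's actual updateGH
// (ins_stat.cu, not shown) may differ, e.g. in its weighted overloads.
#include <vector>

struct GHPairSketch { float g, h; };

void update_gh_sketch(const std::vector<float> &y,
                      const std::vector<float> &y_predict,
                      std::vector<GHPairSketch> &gh) {
    for (size_t i = 0; i < y.size(); ++i) {
        gh[i].g = y_predict[i] - y[i];   // first-order gradient
        gh[i].h = 1.0f;                  // second derivative of squared loss
    }
}
```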
3 | // 4 | #ifndef THUNDERGBM_QUANTILE_SKETCH_H 5 | #define THUNDERGBM_QUANTILE_SKETCH_H 6 | 7 | #include "thundergbm/thundergbm.h" 8 | #include 9 | #include 10 | 11 | using std::pair; 12 | using std::tuple; 13 | using std::vector; 14 | 15 | 16 | class entry{ 17 | public: 18 | float_type val; 19 | float_type rmin; 20 | float_type rmax; 21 | float_type w; 22 | entry() {}; 23 | entry(float_type val, float_type rmin, float_type rmax, float_type w) : val(val), rmin(rmin), rmax(rmax), w(w) {}; 24 | }; 25 | 26 | class summary{ 27 | public: 28 | int entry_size; 29 | int entry_reserve_size; 30 | vector entries; 31 | summary(): entry_size(0),entry_reserve_size(0) { 32 | //entries.clear(); 33 | }; 34 | summary(int entry_size, int reserve_size): entry_size(entry_size), entry_reserve_size(reserve_size) {entries.resize(reserve_size);}; 35 | void Reserve(int size); 36 | void Prune(summary& src,int size); 37 | void Merge(summary& src1, summary& src2); 38 | void Copy(summary& src); 39 | 40 | }; 41 | 42 | class Qitem{ 43 | public: 44 | int tail; 45 | vector> data; 46 | Qitem(): tail(0) { 47 | //data.clear(); 48 | }; 49 | void GetSummary(summary& ret); 50 | }; 51 | 52 | 53 | class quanSketch{ 54 | public: 55 | int numOfLevel; 56 | int summarySize; 57 | Qitem Qentry; 58 | vector summaries; 59 | summary t_summary; //for temp 60 | void Init(int maxn, float_type eps); 61 | void Add(float_type, float_type); 62 | void GetSummary(summary& dest); 63 | quanSketch(): numOfLevel(0), summarySize(0) { 64 | //summaries.clear(); 65 | }; 66 | 67 | }; 68 | #endif //THUNDERGBM_QUANTILE_SKETCH_H 69 | -------------------------------------------------------------------------------- /include/thundergbm/sparse_columns.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by shijiashuai on 5/7/18. 3 | // 4 | 5 | #ifndef THUNDERGBM_SPARSE_COLUMNS_H 6 | #define THUNDERGBM_SPARSE_COLUMNS_H 7 | 8 | #include "thundergbm.h" 9 | #include "syncarray.h" 10 | #include "dataset.h" 11 | #include "cusparse.h" 12 | 13 | class SparseColumns {//one feature corresponding to one column 14 | public: 15 | SyncArray csc_val; 16 | SyncArray csc_row_ind; 17 | SyncArray csc_col_ptr; 18 | SyncArray csc_bin_id; 19 | int n_column; 20 | int column_offset; 21 | int nnz; 22 | 23 | void from_dataset(const DataSet &dataSet); 24 | void from_dataset_csr(const DataSet &dataset); 25 | 26 | void to_multi_devices(vector> &) const; 27 | // void get_cut_points_evenly(int nBin, vector& bin_id, const vector& min_fea, const vector& max_fea); 28 | }; 29 | #endif //THUNDERGBM_SPARSE_COLUMNS_H 30 | -------------------------------------------------------------------------------- /include/thundergbm/syncarray.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by jiashuai on 17-9-17. 
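HistCut above stores each feature's cut points contiguously, with cut_row_ptr giving per-feature offsets, and SparseColumns stores one feature per CSC column. A host-side sketch of how the two layouts combine to map a feature column to bin ids; plain pointers stand in for SyncArray data, and the binary-search binning rule is an assumption (the exact rule lives in hist_cut.cpp and hist_updater.cu, not shown here).

```
// Sketch: walk CSC column `fid` and bin each value against that feature's
// cut points. csc_col_ptr[fid]..csc_col_ptr[fid+1] delimit the column's
// nonzeros; cut_row_ptr plays the same role for cut_points_val.
#include <algorithm>

void column_to_bins(int fid,
                    const float *csc_val, const int *csc_row_ind,
                    const int *csc_col_ptr,
                    const float *cut_points_val, const int *cut_row_ptr,
                    int *bin_id) {            // output: one bin per nonzero
    const float *cut_begin = cut_points_val + cut_row_ptr[fid];
    const float *cut_end   = cut_points_val + cut_row_ptr[fid + 1];
    for (int off = csc_col_ptr[fid]; off < csc_col_ptr[fid + 1]; ++off) {
        int iid = csc_row_ind[off];  // instance (row) id owning this nonzero
        (void) iid;                  // the real code pairs the bin with iid
        // index of the first cut point >= value gives the bin
        bin_id[off] = (int)(std::lower_bound(cut_begin, cut_end,
                                             csc_val[off]) - cut_begin);
    }
}
```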
3 | // 4 | 5 | #ifndef THUNDERGBM_SYNCDATA_H 6 | #define THUNDERGBM_SYNCDATA_H 7 | 8 | #include "thundergbm.h" 9 | #include "syncmem.h" 10 | 11 | /** 12 | * @brief Wrapper of SyncMem with a type 13 | * @tparam T type of element 14 | */ 15 | template 16 | class SyncArray : public el::Loggable { 17 | public: 18 | /** 19 | * initialize class that can store given count of elements 20 | * @param count the given count 21 | */ 22 | explicit SyncArray(size_t count) : mem(new SyncMem(sizeof(T) * count)), size_(count) { 23 | } 24 | 25 | SyncArray() : mem(nullptr), size_(0) {} 26 | 27 | ~SyncArray() { delete mem; }; 28 | 29 | const T *host_data() const { 30 | to_host(); 31 | return static_cast(mem->host_data()); 32 | }; 33 | 34 | const T *device_data() const { 35 | to_device(); 36 | return static_cast(mem->device_data()); 37 | }; 38 | 39 | T *host_data() { 40 | to_host(); 41 | return static_cast(mem->host_data()); 42 | }; 43 | 44 | T *device_data() { 45 | to_device(); 46 | return static_cast(mem->device_data()); 47 | }; 48 | 49 | T *device_end() { 50 | return device_data() + size(); 51 | }; 52 | 53 | const T *device_end() const { 54 | return device_data() + size(); 55 | }; 56 | 57 | void set_host_data(T *host_ptr) { 58 | mem->set_host_data(host_ptr); 59 | } 60 | 61 | void set_device_data(T *device_ptr) { 62 | mem->set_device_data(device_ptr); 63 | } 64 | 65 | void to_host() const { 66 | mem->to_host(); 67 | } 68 | 69 | void to_device() const { 70 | mem->to_device(); 71 | } 72 | 73 | /** 74 | * copy device data. This will call to_device() implicitly. 75 | * @param source source data pointer (data can be on host or device) 76 | * @param count the count of elements 77 | */ 78 | void copy_from(const T *source, size_t count) { 79 | 80 | #ifdef USE_CUDA 81 | thunder::device_mem_copy(mem->device_data(), source, sizeof(T) * count); 82 | #else 83 | memcpy(mem->host_data(), source, sizeof(T) * count); 84 | #endif 85 | }; 86 | 87 | void copy_from(const SyncArray &source) { 88 | 89 | CHECK_EQ(size(), source.size()) << "destination and source count doesn't match"; 90 | #ifdef USE_CUDA 91 | copy_from(source.device_data(), source.size()); 92 | #else 93 | copy_from(source.host_data(), source.size()); 94 | #endif 95 | }; 96 | 97 | /** 98 | * resize to a new size. This will also clear all data. 
99 | * @param count 100 | */ 101 | void resize(size_t count) { 102 | delete mem; 103 | mem = new SyncMem(sizeof(T) * count); 104 | this->size_ = count; 105 | }; 106 | 107 | size_t mem_size() const {//number of bytes 108 | return mem->size(); 109 | } 110 | 111 | size_t size() const {//number of values 112 | return size_; 113 | } 114 | 115 | SyncMem::HEAD head() const { 116 | return mem->head(); 117 | } 118 | 119 | void log(el::base::type::ostream_t &ostream) const override { 120 | int i; 121 | ostream << "["; 122 | for (i = 0; i < size() - 1 && i < el::base::consts::kMaxLogPerContainer - 1; ++i) { 123 | // for (i = 0; i < size() - 1; ++i) { 124 | ostream << host_data()[i] << ","; 125 | } 126 | ostream << host_data()[i]; 127 | if (size() < el::base::consts::kMaxLogPerContainer - 1) { 128 | ostream << "]"; 129 | } else { 130 | ostream << "..."; 131 | } 132 | }; 133 | 134 | int get_owner_id() { 135 | return mem->get_owner_id(); 136 | } 137 | 138 | private: 139 | 140 | SyncArray(const SyncArray &); 141 | 142 | SyncArray &operator=(const SyncArray &); 143 | 144 | SyncMem *mem; 145 | size_t size_; 146 | }; 147 | 148 | #endif //THUNDERGBM_SYNCDATA_H 149 | -------------------------------------------------------------------------------- /include/thundergbm/syncmem.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by jiashuai on 17-9-16. 3 | // 4 | 5 | #ifndef THUNDERGBM_SYNCMEM_H 6 | #define THUNDERGBM_SYNCMEM_H 7 | 8 | #include "thundergbm.h" 9 | 10 | namespace thunder { 11 | inline void malloc_host(void **ptr, size_t size) { 12 | #ifdef USE_CUDA 13 | CUDA_CHECK(cudaHostAlloc(ptr, size, cudaHostAllocPortable)); 14 | #else 15 | *ptr = malloc(size); 16 | #endif 17 | } 18 | 19 | inline void free_host(void *ptr) { 20 | #ifdef USE_CUDA 21 | CUDA_CHECK(cudaFreeHost(ptr)); 22 | #else 23 | free(ptr); 24 | #endif 25 | } 26 | 27 | inline void device_mem_copy(void *dst, const void *src, size_t size) { 28 | #ifdef USE_CUDA 29 | CUDA_CHECK(cudaMemcpy(dst, src, size, cudaMemcpyDefault)); 30 | #else 31 | NO_GPU; 32 | #endif 33 | } 34 | 35 | /** 36 | * @brief Auto-synced memory for CPU and GPU 37 | */ 38 | class SyncMem { 39 | public: 40 | SyncMem(); 41 | 42 | /** 43 | * create a piece of synced memory with given size. The GPU/CPU memory will not be allocated immediately, but 44 | * allocated when it is used at first time. 
45 | * @param size the size of memory (in Bytes) 46 | */ 47 | explicit SyncMem(size_t size); 48 | 49 | ~SyncMem(); 50 | 51 | ///return raw host pointer 52 | void *host_data(); 53 | 54 | ///return raw device pointer 55 | void *device_data(); 56 | 57 | /** 58 | * set host data pointer to another host pointer, and its memory will not be managed by this class 59 | * @param data another host pointer 60 | */ 61 | void set_host_data(void *data); 62 | 63 | /** 64 | * set device data pointer to another device pointer, and its memory will not be managed by this class 65 | * @param data another device pointer 66 | */ 67 | void set_device_data(void *data); 68 | 69 | ///transfer data to host 70 | void to_host(); 71 | 72 | ///transfer data to device 73 | void to_device(); 74 | 75 | ///return the size of memory 76 | size_t size() const; 77 | 78 | ///to determine the where the newest data locates in 79 | enum HEAD { 80 | HOST, DEVICE, UNINITIALIZED 81 | }; 82 | 83 | HEAD head() const; 84 | 85 | int get_owner_id() { 86 | return device_id; 87 | } 88 | 89 | private: 90 | void *device_ptr; 91 | void *host_ptr; 92 | bool own_device_data; 93 | bool own_host_data; 94 | size_t size_; 95 | HEAD head_; 96 | int device_id; 97 | }; 98 | } 99 | using thunder::SyncMem; 100 | #endif //THUNDERGBM_SYNCMEM_H 101 | -------------------------------------------------------------------------------- /include/thundergbm/thundergbm.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by jiashuai on 18-1-16. 3 | // 4 | 5 | #ifndef THUNDERGBM_THUNDERGBM_H 6 | #define THUNDERGBM_THUNDERGBM_H 7 | 8 | #include 9 | #include 10 | 11 | #include "cuda_runtime_api.h" 12 | #include "thundergbm/util/log.h" 13 | #include "thundergbm/util/common.h" 14 | #include "thundergbm/util/multi_device.h" 15 | #include "config.h" 16 | 17 | typedef float float_type; 18 | using std::vector; 19 | using std::string; 20 | #endif //THUNDERGBM_THUNDERGBM_H 21 | -------------------------------------------------------------------------------- /include/thundergbm/tree.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by jiashuai on 18-1-18. 
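SyncMem above implements Caffe-style lazy synchronization: head() records where the freshest copy lives (HOST, DEVICE, or UNINITIALIZED), and a copy happens only when the other side is requested. A small usage sketch of the SyncArray wrapper under that model; it compiles inside this project with USE_CUDA defined, and the function name is illustrative.

```
// Usage sketch of SyncArray's lazy host/device synchronization.
// Writing through host_data() leaves the head on HOST; the first
// device_data() call then triggers one host-to-device copy, and vice versa.
#include "thundergbm/syncarray.h"

void syncarray_demo() {
    SyncArray<int> arr(4);             // no memory allocated yet (lazy)
    int *h = arr.host_data();          // allocates host memory, head = HOST
    for (int i = 0; i < 4; ++i) h[i] = i;
    const int *d = arr.device_data();  // one cudaMemcpy on first device use
    (void) d;
    arr.to_host();                     // explicit transfer back to the host
}
```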
3 | // 4 | 5 | #ifndef THUNDERGBM_TREE_H 6 | #define THUNDERGBM_TREE_H 7 | 8 | #include "thundergbm/thundergbm.h" 9 | #include "syncarray.h" 10 | #include "sstream" 11 | #include "ins_stat.h" 12 | 13 | 14 | class Tree { 15 | public: 16 | struct TreeNode { 17 | int final_id;// node id after pruning, may not equal to node index 18 | int lch_index;// index of left child 19 | int rch_index;// index of right child 20 | int parent_index;// index of parent node 21 | float_type gain;// gain of splitting this node 22 | float_type base_weight; 23 | int split_feature_id; 24 | float_type split_value; 25 | bool default_right; 26 | bool is_leaf; 27 | bool is_valid;// non-valid nodes are those that are "children" of leaf nodes 28 | bool is_pruned;// pruned after pruning 29 | 30 | GHPair sum_gh_pair; 31 | 32 | friend std::ostream &operator<<(std::ostream &os, 33 | const TreeNode &node); 34 | 35 | HOST_DEVICE void calc_weight(float_type lambda) { 36 | this->base_weight = -sum_gh_pair.g / (sum_gh_pair.h + lambda); 37 | } 38 | 39 | HOST_DEVICE bool splittable() const { 40 | return !is_leaf && is_valid; 41 | } 42 | 43 | }; 44 | 45 | explicit Tree(int depth); 46 | 47 | Tree() = default; 48 | 49 | Tree(const Tree &tree) { 50 | nodes.resize(tree.nodes.size()); 51 | nodes.copy_from(tree.nodes); 52 | } 53 | 54 | Tree &operator=(const Tree &tree) { 55 | nodes.resize(tree.nodes.size()); 56 | nodes.copy_from(tree.nodes); 57 | return *this; 58 | } 59 | 60 | void init(int depth); 61 | 62 | string dump(int depth) const; 63 | 64 | SyncArray nodes; 65 | 66 | void prune_self(float_type gamma); 67 | 68 | void shrink(float_type learning_rate); 69 | 70 | private: 71 | void preorder_traversal(int nid, int max_depth, int depth, string &s) const; 72 | 73 | int try_prune_leaf(int nid, int np, float_type gamma, vector &leaf_child_count); 74 | 75 | void reorder_nid(); 76 | }; 77 | 78 | #endif //THUNDERGBM_TREE_H 79 | -------------------------------------------------------------------------------- /include/thundergbm/updater/exact_updater.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by shijiashuai on 5/7/18. 
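TreeNode::calc_weight above fixes the optimal leaf weight at w = -G / (H + lambda). For reference, the split gain the updaters maximize is the standard XGBoost form built from the same sums; the actual kernel is in exact_updater.cu (not shown), so this helper is only a sketch of the formula, with the min_child_weight guard as an assumption (gamma is charged later by prune_self).

```
// Sketch: XGBoost-style split gain, consistent with calc_weight() above.
//   weight(G, H) = -G / (H + lambda)
//   gain = 1/2 * [ G_l^2/(H_l+lambda) + G_r^2/(H_r+lambda)
//                  - (G_l+G_r)^2/(H_l+H_r+lambda) ]
float split_gain(float g_l, float h_l, float g_r, float h_r,
                 float lambda, float min_child_weight) {
    if (h_l < min_child_weight || h_r < min_child_weight)
        return 0.0f;  // assumed guard: reject underweight children
    auto score = [lambda](float g, float h) { return g * g / (h + lambda); };
    return 0.5f * (score(g_l, h_l) + score(g_r, h_r)
                   - score(g_l + g_r, h_l + h_r));
}
```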
3 | // 4 | 5 | #ifndef THUNDERGBM_EXACT_UPDATER_H 6 | #define THUNDERGBM_EXACT_UPDATER_H 7 | 8 | #include 9 | #include 10 | #include "thrust/reduce.h" 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include "thundergbm/util/device_lambda.cuh" 19 | #include "thundergbm/gpu_lsh.h" 20 | 21 | 22 | class SplitPoint { 23 | public: 24 | float_type gain; 25 | int split_fea_id; 26 | float_type fval; 27 | int bin_id; 28 | GHPair fea_missing_gh; 29 | GHPair rch_sum_gh; 30 | bool default_right; 31 | int nid; 32 | 33 | SplitPoint() { 34 | nid = -1; 35 | split_fea_id = -1; 36 | gain = std::numeric_limits::min(); 37 | } 38 | 39 | friend std::ostream &operator<<(std::ostream &output, const SplitPoint &sp) { 40 | output << sp.gain << "/" << sp.split_fea_id << "/" << sp.nid; 41 | return output; 42 | } 43 | }; 44 | 45 | class ExactUpdater { 46 | public: 47 | explicit ExactUpdater(GBMParam ¶m) { 48 | depth = param.depth; 49 | min_child_weight = param.min_child_weight; 50 | lambda = param.lambda; 51 | gamma = param.gamma; 52 | rt_eps = param.rt_eps; 53 | n_devices = param.n_device; 54 | } 55 | 56 | 57 | void grow(Tree &tree, const vector> &v_columns, InsStat &stats); 58 | 59 | int depth; 60 | float_type min_child_weight; 61 | float_type lambda; 62 | float_type gamma; 63 | float_type rt_eps; 64 | 65 | psdLsh lsh_table; 66 | 67 | int n_devices; 68 | vector> v_stats; 69 | vector> v_trees_gpu; 70 | 71 | void init_tree(Tree &tree, const InsStat &stats); 72 | 73 | virtual void find_split(int level, const SparseColumns &columns, const Tree &tree, const InsStat &stats, 74 | SyncArray &sp); 75 | 76 | //update global best split for each node 77 | void update_tree(Tree &tree, const SyncArray &sp); 78 | 79 | virtual bool reset_ins2node_id(InsStat &stats, const Tree &tree, const SparseColumns &columns); 80 | 81 | void split_point_all_reduce(const vector> &local_sp, SyncArray &global_sp, 82 | int depth); 83 | 84 | void lsh_hash_init(unsigned table_size, unsigned num_table, unsigned num_dimension, unsigned p_norm, float r, 85 | unsigned numP, int seed); 86 | }; 87 | 88 | typedef thrust::tuple int_float; 89 | 90 | std::ostream &operator<<(std::ostream &os, const int_float &rhs); 91 | 92 | #endif //THUNDERGBM_EXACT_UPDATER_H 93 | -------------------------------------------------------------------------------- /include/thundergbm/updater/hist_updater.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by qinbin on 2018/7/6. 
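With ExactUpdater declared, one boosting round follows the pattern of the commented-out driver in src/test/test_main.cpp near the end of this dump: refresh gradients, grow a tree, prune it, then update predictions. A condensed sketch; the shared_ptr element type of v_columns is taken from that driver, and predict_in_training is assumed to exist there (it is not declared in the headers shown).

```
// Sketch of one boosting round with ExactUpdater, condensed from the
// commented-out driver in src/test/test_main.cpp.
#include <memory>
#include "thundergbm/updater/exact_updater.h"

void train_sketch(GBMParam &param,
                  vector<std::shared_ptr<SparseColumns>> &v_columns,
                  InsStat &stats, vector<Tree> &trees) {
    ExactUpdater updater(param);
    for (Tree &tree : trees) {
        stats.updateGH();                 // recompute g/h from current predictions
        updater.grow(tree, v_columns, stats);
        tree.prune_self(param.gamma);     // drop splits whose gain < gamma
        // predict_in_training(stats, tree);  // assumed helper in the test driver
    }
}
```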
3 | // 4 | 5 | #ifndef GBM_MIRROR2_HIST_UPDATER_H 6 | #define GBM_MIRROR2_HIST_UPDATER_H 7 | 8 | #include "thundergbm/updater/exact_updater.h" 9 | #include "thundergbm/hist_cut.h" 10 | #include "thundergbm/csc2r_transform.h" 11 | 12 | class HistUpdater : public ExactUpdater{ 13 | public: 14 | int max_num_bin = 64; 15 | int do_cut = 0; 16 | bool use_similar_bundle = 1; 17 | vector v_cut; 18 | vector>> bin_id; 19 | 20 | 21 | void insBundle(const vector> &v_columns, InsStat &stats); 22 | void init_bin_id(const vector> &v_columns); 23 | void init_bin_id_outside(const vector> &v_columns, SyncArray& bin_id); 24 | void init_bin_id_unsort(SparseColumns& unsort_columns, SyncArray& bin_id); 25 | void copy_bin_id(const vector> &v_columns, SyncArray& bin_id); 26 | void init_cut(const vector> &v_columns, InsStat &stats, int n_instance, 27 | SparseColumns& unsort_columns); 28 | 29 | void init_bin_id_csr(const vector>> &v_columns, int n_instances); 30 | 31 | void similar_ins_bundle(const vector> &v_columns, InsStat &stats, 32 | int& n_instances, DataSet &dataSet, SparseColumns& unsort_columns, int* iidold2new, SyncArray& is_multi); 33 | void similar_ins_bundle(const vector> &v_columns, 34 | const vector> &v_columns2, InsStat &stats, 35 | int& n_instances, DataSet &dataSet, SparseColumns& unsort_columns, int* iidold2new, SyncArray& is_multi); 36 | void similar_ins_bundle_multi(const vector>> &v_columns, 37 | int numP, InsStat &stats, int& n_instances, DataSet &dataSet, 38 | SparseColumns& unsort_columns, int* iidold2new, SyncArray& is_multi, bool is_random = 0); 39 | void similar_ins_bundle_closest(const vector>> &v_columns, int numP, 40 | InsStat &stats, int& n_instances, DataSet& dataSet, SparseColumns& unsort_columns,int* iidold2new, SyncArray& is_multi); 41 | void similar_ins_bundle(const vector>> &v_columns, int numP, 42 | vector &stats, int& n_instances, DataSet& dataSet, SparseColumns& unsort_columns, 43 | int* iidold2new, SyncArray& is_multi); 44 | 45 | 46 | void similar_ins_bundle_independent(const vector>> &v_columns, int numP, 47 | vector &stats, int& n_instances, DataSet& dataSet, SparseColumns& unsort_columns, 48 | int* iidold2new, SyncArray& is_multi, bool is_random = 0, bool weighted_gh = 1); 49 | 50 | void get_bin_ids(const SparseColumns &columns); 51 | 52 | void find_split(int level, const SparseColumns &columns, const Tree &tree, const InsStat &stats, 53 | SyncArray &sp) override; 54 | 55 | bool reset_ins2node_id(InsStat &stats, const Tree &tree, const SparseColumns &columns) override; 56 | 57 | explicit HistUpdater(GBMParam ¶m): ExactUpdater(param) {}; 58 | 59 | 60 | 61 | Csc2r bin_id_csr; 62 | 63 | }; 64 | #endif //GBM_MIRROR2_HIST_UPDATER_H 65 | -------------------------------------------------------------------------------- /include/thundergbm/util/common.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by jiashuai on 18-1-16. 
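The similar_ins_bundle family above is the SimFL-specific step: instances from different parties that repeatedly land in the same LSH buckets are treated as similar, and their gradients are combined (is_multi marks bundled instances; weighted_gh suggests weighted aggregation). The CUDA implementation in hist_updater.cu is not shown, so the following host-side sketch only illustrates the collision-counting idea; every name in it is illustrative.

```
// Schematic sketch of LSH-based similarity lookup: for a local instance,
// count how often each remote instance falls into the same bucket across
// the hash tables, and keep the most frequent one as its similar instance.
#include <unordered_map>
#include <vector>

int most_similar(int local_iid,
                 const std::vector<std::vector<unsigned>> &local_hash,   // [table][iid]
                 const std::vector<std::vector<unsigned>> &remote_hash) {
    std::unordered_map<int, int> collisions;
    for (size_t t = 0; t < local_hash.size(); ++t)
        for (int rid = 0; rid < (int) remote_hash[t].size(); ++rid)
            if (remote_hash[t][rid] == local_hash[t][local_iid])
                ++collisions[rid];
    int best = -1, best_count = 0;
    for (const auto &kv : collisions)
        if (kv.second > best_count) { best = kv.first; best_count = kv.second; }
    return best;  // -1 if the instance collides with nothing in any table
}
```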
3 | // 4 | 5 | #ifndef THUNDERGBM_COMMON_H 6 | #define THUNDERGBM_COMMON_H 7 | 8 | 9 | #include "thundergbm/thundergbm.h" 10 | 11 | #define USE_CUDA 12 | #define CUDA_CHECK(condition) \ 13 | /* Code block avoids redefinition of cudaError_t error */ \ 14 | do { \ 15 | cudaError_t error = condition; \ 16 | CHECK_EQ(error, cudaSuccess) << " " << cudaGetErrorString(error); \ 17 | } while (false) 18 | 19 | #define NO_GPU \ 20 | LOG(FATAL)<<"Cannot use GPU when compiling without GPU" 21 | 22 | //https://stackoverflow.com/questions/2342162/stdstring-formatting-like-sprintf 23 | template 24 | std::string string_format(const std::string &format, Args ... args) { 25 | size_t size = snprintf(nullptr, 0, format.c_str(), args ...) + 1; // Extra space for '\0' 26 | std::unique_ptr buf(new char[size]); 27 | snprintf(buf.get(), size, format.c_str(), args ...); 28 | return std::string(buf.get(), buf.get() + size - 1); // We don't want the '\0' inside 29 | } 30 | 31 | #define HOST_DEVICE __host__ __device__ 32 | #endif //THUNDERGBM_COMMON_H 33 | -------------------------------------------------------------------------------- /include/thundergbm/util/cub_wrapper.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by ss on 18-5-13. 3 | // 4 | 5 | #ifndef THUNDERGBM_CUB_UTIL_H 6 | #define THUNDERGBM_CUB_UTIL_H 7 | 8 | #include "thundergbm/thundergbm.h" 9 | //#include 10 | //#include 11 | #include "cub/cub.cuh" 12 | #include 13 | 14 | template 15 | void cub_sort_by_key(SyncArray &keys, SyncArray &values, bool ascending = true) { 16 | CHECK_EQ(keys.size(), values.size()) << "keys and values must have equal size"; 17 | using namespace cub; 18 | size_t num_items = keys.size(); 19 | SyncArray keys2(num_items); 20 | SyncArray values2(num_items); 21 | SyncArray temp_storage; 22 | 23 | DoubleBuffer d_keys(keys.device_data(), keys2.device_data()); 24 | DoubleBuffer d_values(values.device_data(), values2.device_data()); 25 | 26 | size_t temp_storage_bytes = 0; 27 | 28 | // Initialize device arrays 29 | if (ascending) 30 | DeviceRadixSort::SortPairs(NULL, temp_storage_bytes, d_keys, d_values, num_items); 31 | else 32 | DeviceRadixSort::SortPairsDescending(NULL, temp_storage_bytes, d_keys, d_values, num_items); 33 | temp_storage.resize(temp_storage_bytes); 34 | 35 | // Run 36 | if (ascending) 37 | DeviceRadixSort::SortPairs(temp_storage.device_data(), temp_storage_bytes, d_keys, d_values, num_items); 38 | else 39 | DeviceRadixSort::SortPairsDescending(temp_storage.device_data(), temp_storage_bytes, d_keys, d_values, 40 | num_items); 41 | 42 | CUDA_CHECK( 43 | cudaMemcpy(keys.device_data(), reinterpret_cast(d_keys.Current()), sizeof(float) * num_items, 44 | cudaMemcpyDeviceToDevice)); 45 | CUDA_CHECK(cudaMemcpy(values.device_data(), reinterpret_cast(d_values.Current()), 46 | sizeof(int) * num_items, 47 | cudaMemcpyDeviceToDevice)); 48 | } 49 | 50 | template 51 | void cub_seg_sort_by_key(SyncArray &keys, SyncArray &values, SyncArray &ptr, bool ascending = true) { 52 | CHECK_EQ(keys.size(), values.size()) << "keys and values must have equal size"; 53 | using namespace cub; 54 | size_t num_items = keys.size(); 55 | size_t num_segments = ptr.size() - 1; 56 | SyncArray keys2(num_items); 57 | SyncArray values2(num_items); 58 | SyncArray temp_storage; 59 | 60 | DoubleBuffer d_keys(keys.device_data(), keys2.device_data()); 61 | DoubleBuffer d_values(values.device_data(), values2.device_data()); 62 | 63 | size_t temp_storage_bytes = 0; 64 | 65 | // Initialize device arrays
66 | if (ascending) 67 | DeviceSegmentedRadixSort::SortPairs(NULL, temp_storage_bytes, d_keys, d_values, num_items, num_segments, 68 | ptr.device_data(), ptr.device_data() + 1); 69 | else 70 | DeviceSegmentedRadixSort::SortPairsDescending(NULL, temp_storage_bytes, d_keys, d_values, num_items, 71 | num_segments, 72 | ptr.device_data(), ptr.device_data() + 1); 73 | temp_storage.resize(temp_storage_bytes); 74 | 75 | // Run 76 | if (ascending) 77 | DeviceSegmentedRadixSort::SortPairs(temp_storage.device_data(), temp_storage_bytes, d_keys, d_values, 78 | num_items, num_segments, ptr.device_data(), 79 | ptr.device_data() + 1); 80 | else 81 | DeviceSegmentedRadixSort::SortPairsDescending(temp_storage.device_data(), temp_storage_bytes, d_keys, d_values, 82 | num_items, num_segments, ptr.device_data(), 83 | ptr.device_data() + 1); 84 | 85 | CUDA_CHECK( 86 | cudaMemcpy(keys.device_data(), reinterpret_cast(d_keys.Current()), sizeof(float) * num_items, 87 | cudaMemcpyDeviceToDevice)); 88 | CUDA_CHECK(cudaMemcpy(values.device_data(), reinterpret_cast(d_values.Current()), 89 | sizeof(int) * num_items, 90 | cudaMemcpyDeviceToDevice)); 91 | }; 92 | 93 | #endif //THUNDERGBM_CUB_UTIL_H 94 | -------------------------------------------------------------------------------- /include/thundergbm/util/device_lambda.cuh: -------------------------------------------------------------------------------- 1 | // 2 | // Created by jiashuai on 18-1-19. 3 | // 4 | 5 | #ifndef THUNDERGBM_DEVICE_LAMBDA_H 6 | #define THUNDERGBM_DEVICE_LAMBDA_H 7 | 8 | #include "thundergbm/thundergbm.h" 9 | #include "thundergbm/clion_cuda.h" 10 | 11 | template 12 | __global__ void lambda_kernel(size_t len, L lambda) { 13 | for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < len; i += blockDim.x * gridDim.x) { 14 | lambda(i); 15 | } 16 | } 17 | 18 | template 19 | __global__ void lambda_2d_sparse_kernel(const int *len2, L lambda) { 20 | int i = blockIdx.x; 21 | int begin = len2[i]; 22 | int end = len2[i + 1]; 23 | for (int j = begin + blockIdx.y * blockDim.x + threadIdx.x; j < end; j += blockDim.x * gridDim.y) { 24 | lambda(i, j); 25 | } 26 | } 27 | 28 | ///p100 has 56 MPs, using 32*56 thread blocks 29 | template 30 | void device_loop(int len, L lambda, unsigned int NUM_BLOCK = 32 *56, unsigned int BLOCK_SIZE=512) { 31 | if (len > 0) { 32 | lambda_kernel << < NUM_BLOCK, BLOCK_SIZE >> > (len, lambda); 33 | CUDA_CHECK(cudaPeekAtLastError()); 34 | } 35 | } 36 | 37 | 38 | template 39 | void device_loop_2d(int len1, const int *len2, L lambda, unsigned int NUM_BLOCK = 32 * 56, 40 | unsigned int BLOCK_SIZE = 256) { 41 | if (len1 > 0) { 42 | lambda_2d_sparse_kernel << < dim3(len1, NUM_BLOCK), BLOCK_SIZE >> > (len2, lambda); 43 | CUDA_CHECK(cudaPeekAtLastError()); 44 | } 45 | } 46 | 47 | template 48 | __global__ void lambda_2d_sparse_kernel_mod(const int mod_val, const int *len2, L lambda) { 49 | int i = blockIdx.x; 50 | int begin = len2[i%mod_val]; 51 | int end = len2[i%mod_val + 1]; 52 | for (int j = begin + blockIdx.y * blockDim.x + threadIdx.x; j < end; j += blockDim.x * gridDim.y) { 53 | lambda(i, j); 54 | } 55 | } 56 | 57 | template 58 | void device_loop_2d_mod(int len1, int mod_val, const int *len2, L lambda, unsigned int NUM_BLOCK = 32 * 56, 59 | unsigned int BLOCK_SIZE = 256) { 60 | if (len1 > 0) { 61 | lambda_2d_sparse_kernel_mod << < dim3(len1, NUM_BLOCK), BLOCK_SIZE >> > (mod_val, len2, lambda); 62 | CUDA_CHECK(cudaPeekAtLastError()); 63 | } 64 | } 65 | 66 | //template 67 | //__global__ void lambda_2d_sparse_kernel_zero(const int *len2, L 
lambda) { 68 | // int i = blockIdx.x; 69 | // int len = len2[i + 1] - len2[i]; 70 | // for (int j = blockIdx.y * blockDim.x + threadIdx.x; j < len; j += blockDim.x * gridDim.y) { 71 | // lambda(i, j); 72 | // } 73 | //} 74 | // 75 | //template 76 | //void device_loop_2d_zero(int len1, const int *len2, L lambda, unsigned int NUM_BLOCK = 32 * 56, 77 | // unsigned int BLOCK_SIZE = 256) { 78 | // if (len1 > 0) { 79 | // lambda_2d_sparse_kernel_zero << < dim3(len1, NUM_BLOCK), BLOCK_SIZE >> > (len2, lambda); 80 | // CUDA_CHECK(cudaPeekAtLastError()); 81 | // } 82 | //} 83 | #endif //THUNDERGBM_DEVICE_LAMBDA_H 84 | -------------------------------------------------------------------------------- /include/thundergbm/util/multi_device.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by ss on 18-6-18. 3 | // 4 | 5 | #ifndef THUNDERGBM_MULTI_DEVICE_H 6 | #define THUNDERGBM_MULTI_DEVICE_H 7 | //switch to specific device and do something, then switch back to the original device 8 | //FIXME make this macro into a function? 9 | #define DO_ON_DEVICE(device_id, something) \ 10 | do { \ 11 | int org_device_id = 0; \ 12 | CUDA_CHECK(cudaGetDevice(&org_device_id)); \ 13 | CUDA_CHECK(cudaSetDevice(device_id)); \ 14 | something; \ 15 | CUDA_CHECK(cudaSetDevice(org_device_id)); \ 16 | } while (false) 17 | 18 | /** 19 | * Do something on multiple devices, then switch back to the original device 20 | * 21 | * 22 | * example: 23 | * 24 | * DO_ON_MULTI_DEVICES(n_devices, [&](int device_id){ 25 | * //do_something_on_device(device_id); 26 | * }); 27 | */ 28 | 29 | template 30 | void DO_ON_MULTI_DEVICES(int n_devices, L do_something) { 31 | int org_device_id = 0; 32 | CUDA_CHECK(cudaGetDevice(&org_device_id)); 33 | #pragma omp parallel for num_threads(n_devices) 34 | for (int device_id = 0; device_id < n_devices; device_id++) { 35 | CUDA_CHECK(cudaSetDevice(device_id)); 36 | do_something(device_id); 37 | } 38 | CUDA_CHECK(cudaSetDevice(org_device_id)); 39 | 40 | } 41 | 42 | //void DO_ON_MULTI_DEVICES(int n_devices, L do_something) { 43 | // int org_device_id = 1; 44 | // CUDA_CHECK(cudaGetDevice(&org_device_id)); 45 | //#pragma omp parallel for num_threads(n_devices) 46 | // for (int device_id = 1; device_id <= n_devices; device_id++) { 47 | // CUDA_CHECK(cudaSetDevice(device_id)); 48 | // do_something(device_id); 49 | // } 50 | // CUDA_CHECK(cudaSetDevice(org_device_id)); 51 | // 52 | //} 53 | 54 | #endif //THUNDERGBM_MULTI_DEVICE_H 55 | -------------------------------------------------------------------------------- /src/test/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(googletest) 2 | 3 | include_directories(googletest/googletest/include) 4 | include_directories(googletest/googlemock/include) 5 | 6 | file(GLOB TEST_SRC *) 7 | 8 | cuda_add_executable(${PROJECT_NAME}-test ${TEST_SRC} ${COMMON_INCLUDES}) 9 | target_link_libraries(${PROJECT_NAME}-test ${PROJECT_NAME} gtest) 10 | 11 | -------------------------------------------------------------------------------- /src/test/test_main.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by jiashuai on 17-9-15. 
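device_lambda.cuh's grid-stride device_loop and multi_device.h's DO_ON_MULTI_DEVICES compose naturally: launch the same lambda loop on every GPU. A usage sketch; it needs the --expt-extended-lambda nvcc flag that the top-level CMakeLists.txt already sets, and the function name is illustrative.

```
// Usage sketch: run a grid-stride device_loop on each GPU in turn.
// DO_ON_MULTI_DEVICES restores the original device afterwards.
#include "thundergbm/syncarray.h"
#include "thundergbm/util/device_lambda.cuh"
#include "thundergbm/util/multi_device.h"

void fill_on_all_gpus(int n_devices) {
    DO_ON_MULTI_DEVICES(n_devices, [&](int device_id) {
        SyncArray<int> arr(1024);          // per-device scratch array
        int *data = arr.device_data();
        device_loop(1024, [=] __device__(int i) {
            data[i] = device_id;           // each GPU writes its own id
        });
    });
}
```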
3 | // 4 | #include "thundergbm/thundergbm.h" 5 | #include "gtest/gtest.h" 6 | 7 | //#include 8 | //#include 9 | //#include 10 | //#include 11 | 12 | 13 | //float_type compute_rmse(const InsStat &stats) { 14 | // float_type sum_error = 0; 15 | // const float_type *y_data = stats.y.host_data(); 16 | // const float_type *y_predict_data = stats.y_predict.host_data(); 17 | // for (int i = 0; i < stats.n_instances; ++i) { 18 | // float_type e = y_predict_data[i] - y_data[i]; 19 | // sum_error += e * e; 20 | // } 21 | // float_type rmse = sqrt(sum_error / stats.n_instances); 22 | // return rmse; 23 | //} 24 | 25 | 26 | 27 | int iargc; // Making arg and arv global to access within TESTs 28 | char** iargv; 29 | int main(int argc, char **argv) { 30 | ::testing::InitGoogleTest(&argc, argv); 31 | iargc = argc; 32 | iargv = argv; 33 | el::Loggers::reconfigureAllLoggers(el::ConfigurationType::Format, "%datetime %level %fbase:%line : %msg"); 34 | el::Loggers::addFlag(el::LoggingFlag::ColoredTerminalOutput); 35 | el::Loggers::addFlag(el::LoggingFlag::FixedTimeFormat); 36 | el::Loggers::reconfigureAllLoggers(el::Level::Debug, el::ConfigurationType::Enabled, "false"); 37 | el::Loggers::reconfigureAllLoggers(el::Level::Trace, el::ConfigurationType::Enabled, "false"); 38 | el::Loggers::reconfigureAllLoggers(el::Level::Info, el::ConfigurationType::Enabled, "false"); 39 | return RUN_ALL_TESTS(); 40 | 41 | // GBMParam param; 42 | // bool verbose = false; 43 | // param.depth = 6; 44 | // param.n_trees = 40; 45 | // param.min_child_weight = 1; 46 | // param.lambda = 1; 47 | // param.gamma = 1; 48 | // param.rt_eps = 1e-6; 49 | // param.do_exact = true; 50 | // param.n_device = 1; 51 | // if (!verbose) { 52 | // el::Loggers::reconfigureAllLoggers(el::Level::Debug, el::ConfigurationType::Enabled, "false"); 53 | // el::Loggers::reconfigureAllLoggers(el::Level::Trace, el::ConfigurationType::Enabled, "false"); 54 | // } 55 | // el::Loggers::reconfigureAllLoggers(el::ConfigurationType::PerformanceTracking, "true"); 56 | // DataSet dataSet; 57 | // dataSet.load_from_file(param.path); 58 | // int n_instances = dataSet.n_instances(); 59 | // InsStat stats; 60 | // vector trees; 61 | // SparseColumns columns; 62 | // columns.from_dataset(dataSet); 63 | // trees.resize(param.n_trees); 64 | // stats.resize(n_instances); 65 | // stats.y.copy_from(dataSet.y().data(), n_instances); 66 | // 67 | // int n_devices = 1; 68 | // vector> v_columns; 69 | // v_columns.resize(n_devices); 70 | // for (int i = 0; i < n_devices; i++) 71 | // v_columns[i].reset(new SparseColumns()); 72 | // columns.to_multi_devices(v_columns); 73 | // HistUpdater updater(param); 74 | // int round = 0; 75 | // float_type rmse = 0; 76 | // { 77 | // bool init_bin = 0; 78 | // for (Tree &tree:trees) { 79 | // stats.updateGH(); 80 | // //updater.insBundle(v_columns, stats); 81 | // TIMED_SCOPE(timerObj, "construct tree"); 82 | // if(!init_bin) { 83 | // updater.use_similar_bundle = 0; 84 | // { 85 | // TIMED_SCOPE(timerObj, "init cut"); 86 | // updater.init_cut(v_columns, stats, n_instances, columns); 87 | // } 88 | // if(updater.use_similar_bundle) 89 | // { 90 | // TIMED_SCOPE(timerObj, "similar ins bundle"); 91 | // updater.similar_ins_bundle(v_columns, stats, n_instances, dataSet, columns); 92 | // } 93 | // init_bin = 1; 94 | // } 95 | // 96 | // 97 | // { 98 | // TIMED_SCOPE(timerObj, "grow"); 99 | // updater.grow(tree, v_columns, stats); 100 | // } 101 | // { 102 | // TIMED_SCOPE(timerObj, "prune"); 103 | // tree.prune_self(param.gamma); 104 | // } 105 | 
// 106 | // LOG(DEBUG) << string_format("\nbooster[%d]", round) << tree.dump(param.depth); 107 | // predict_in_training(stats, tree); 108 | // //next round 109 | // round++; 110 | // 111 | // } 112 | // } 113 | // rmse = compute_rmse(stats); 114 | // LOG(INFO) << "rmse = " << rmse; 115 | // return 1; 116 | } 117 | -------------------------------------------------------------------------------- /src/test/test_unifiedmem.cu: -------------------------------------------------------------------------------- 1 | // 2 | // Created by shijiashuai on 10/7/18. 3 | // 4 | 5 | #include 6 | #include "thundergbm/syncarray.h" 7 | #include "gtest/gtest.h" 8 | #include "thrust/reduce.h" 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include "thundergbm/util/device_lambda.cuh" 16 | 17 | 18 | void kernel(int count, int *ptr) { 19 | device_loop(count, [=] __device__(int i) { ptr[i] = 1; }); 20 | } 21 | 22 | //TEST(TestUnified, test) { 23 | // int *ptr; 24 | // size_t size = (1L << 30) * 4; 25 | // size_t count = size / sizeof(int); 26 | // 27 | // using namespace thrust; 28 | // cudaMallocManaged((void **) &ptr, size); 29 | // 30 | // memset(ptr, 0, size); 31 | // { 32 | // TIMED_SCOPE(timerObj, "prefetch kernel"); 33 | // cudaMemPrefetchAsync(ptr, size, 0); 34 | // 35 | // sort(cuda::par, ptr, ptr + count); 36 | // cudaDeviceSynchronize(); 37 | // } 38 | // 39 | // cudaFree(ptr); 40 | // ptr = nullptr; 41 | // 42 | // SyncArray arr(count); 43 | // arr.to_host(); 44 | // { 45 | // TIMED_SCOPE(timerObj, "copy kernel"); 46 | // sort(cuda::par, arr.device_data(), arr.device_end()); 47 | // } 48 | //} 49 | -------------------------------------------------------------------------------- /src/thundergbm/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | #set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) 2 | #set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) 3 | #set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) 4 | 5 | file(GLOB SRC util/*.c* updater/*.c* *.c*) 6 | 7 | cuda_add_library(${PROJECT_NAME} SHARED ${SRC}) 8 | 9 | target_link_libraries(${PROJECT_NAME} ${CUDA_cusparse_LIBRARY}) 10 | -------------------------------------------------------------------------------- /src/thundergbm/csc2r_transform.cu: -------------------------------------------------------------------------------- 1 | // 2 | // Created by qinbin on 21/8/18. 
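//
// Added note (illustrative, not original source): from_csr below re-expresses
// a sparse matrix stored row-by-row (CSR) as the same matrix stored
// column-by-column (CSC) with cusparseScsr2csc. For the 2x3 matrix
//     [10  0 20]
//     [ 0 30  0]
//   CSR: val = {10, 20, 30}, col_ind = {0, 2, 1}, row_ptr = {0, 2, 3}
//   CSC: val = {10, 30, 20}, row_ind = {0, 1, 0}, col_ptr = {0, 1, 2, 3}
// The nnz values are unchanged; only their ordering and the pointer array
// differ, and the conversion runs entirely on the GPU.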
3 | // 4 | 5 | #include "thundergbm/csc2r_transform.h" 6 | 7 | void Csc2r::from_csr(float_type* csr_val, int* csr_col_ind, int* csr_row_ptr, int n_instances, int n_column, int nnz){ 8 | 9 | cusparseHandle_t handle; 10 | cusparseMatDescr_t descr; 11 | cusparseCreate(&handle); 12 | cusparseCreateMatDescr(&descr); 13 | cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO); 14 | cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL); 15 | 16 | //std::cout<<"nnz:"<nnz = nnz; 21 | 22 | cusparseScsr2csc(handle, n_instances, n_column, nnz, csr_val, csr_row_ptr, 23 | csr_col_ind, csc_val.device_data(), csc_row_ind.device_data(), csc_col_ptr.device_data(), 24 | CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO); 25 | cudaDeviceSynchronize(); 26 | 27 | cusparseDestroy(handle); 28 | cusparseDestroyMatDescr(descr); 29 | 30 | 31 | } 32 | 33 | 34 | void Csc2r::get_cut_points_evenly(int nBin, vector& bin_id, 35 | const vector& min_fea, const vector& max_fea) { 36 | float* csc_val_host = csc_val.host_data(); 37 | int* csc_row_host = csc_row_ind.host_data(); 38 | int* csc_col_host = csc_col_ptr.host_data(); 39 | for(int cid = 0; cid < csc_col_ptr.size() - 1; cid ++){ 40 | int cstart = csc_col_host[cid]; 41 | int cend = csc_col_host[cid + 1]; 42 | for(int off = cstart; off < cend; off++){ 43 | 44 | float val = csc_val_host[off]; 45 | int rid = csc_row_host[off]; 46 | // std::cout<<"rid:"< 5 | #include 6 | #include "thundergbm/dataset.h" 7 | #include "cusparse.h" 8 | #include "thrust/sort.h" 9 | #include "thrust/system/cuda/detail/par.h" 10 | #include "thundergbm/util/device_lambda.cuh" 11 | 12 | void DataSet::load_from_file(string file_name) { 13 | LOG(INFO) << "loading LIBSVM dataset from file \"" << file_name << "\""; 14 | y_.clear(); 15 | features.clear(); 16 | line_num.clear(); 17 | //instances_.clear(); 18 | n_features_ = 123; 19 | std::ifstream ifs(file_name, std::ifstream::binary); 20 | CHECK(ifs.is_open()) << "file " << file_name << " not found"; 21 | 22 | std::array buffer{}; //16M 23 | const int nthread = omp_get_max_threads(); 24 | 25 | auto find_last_line = [](char *ptr, const char *begin) { 26 | while (ptr != begin && *ptr != '\n' && *ptr != '\r') --ptr; 27 | return ptr; 28 | }; 29 | 30 | string first_line; 31 | getline(ifs, first_line); 32 | std::stringstream first_ss(first_line); 33 | int n_f_first = 0; 34 | string tuple; 35 | while(first_ss >> tuple) 36 | n_f_first++; 37 | ifs.clear(); 38 | ifs.seekg (0, std::ios::beg); 39 | 40 | int n_sum_line = 0; 41 | while (ifs) { 42 | ifs.read(buffer.data(), buffer.size()); 43 | char *head = buffer.data(); 44 | size_t size = ifs.gcount(); 45 | vector> y_thread(nthread); 46 | //vector instances_thread(nthread); 47 | 48 | vector local_feature(nthread, 0); 49 | 50 | //vector>> index_thread(nthread); 51 | vector>> feature_thread(nthread); 52 | vector>> line_thread(nthread); 53 | for(int i = 0; i < nthread; i++){ 54 | feature_thread[i].resize(n_f_first * 2); 55 | line_thread[i].resize(n_f_first * 2); 56 | } 57 | vector n_line(nthread); 58 | #pragma omp parallel num_threads(nthread) 59 | { 60 | //get working area of this thread 61 | int tid = omp_get_thread_num(); 62 | size_t nstep = (size + nthread - 1) / nthread; 63 | size_t sbegin = std::min(tid * nstep, size - 1); 64 | size_t send = std::min((tid + 1) * nstep, size - 1); 65 | char *pbegin = find_last_line(head + sbegin, head); 66 | char *pend = find_last_line(head + send, head); 67 | 68 | //move stream start position to the end of last line 69 | if (tid == nthread - 1) ifs.seekg(pend - head - send, 
std::ios_base::cur); 70 | 71 | //read instances line by line 72 | char *lbegin = pbegin; 73 | char *lend = lbegin; 74 | int lid = 0; 75 | while (lend != pend) { 76 | //get one line 77 | lend = lbegin + 1; 78 | while (lend != pend && *lend != '\n' && *lend != '\r') { 79 | ++lend; 80 | } 81 | string line(lbegin, lend); 82 | std::stringstream ss(line); 83 | 84 | //read label of an instance 85 | y_thread[tid].emplace_back(); 86 | ss >> y_thread[tid].back(); 87 | 88 | string tuple; 89 | //int fid = 0; 90 | while(ss >> tuple){ 91 | int i; 92 | float v; 93 | CHECK_EQ(sscanf(tuple.c_str(), "%d:%f", &i, &v), 2) << "read error, using [index]:[value] format"; 94 | //index_thread[tid].back().emplace_back(i); 95 | if(i > local_feature[tid]){ 96 | local_feature[tid] = i; 97 | } 98 | if(i > feature_thread[tid].size()){ 99 | feature_thread[tid].resize(i); 100 | line_thread[tid].resize(i); 101 | // min_fea.resize(i); 102 | // max_fea.resize(i); 103 | // min_fea[i - 1] = INFINITY; 104 | // max_fea[i - 1] = -INFINITY; 105 | } 106 | 107 | feature_thread[tid][i-1].emplace_back(v); 108 | line_thread[tid][i-1].emplace_back(lid); 109 | //fid++; 110 | 111 | 112 | } 113 | lid++; 114 | //read next instance 115 | lbegin = lend; 116 | 117 | } 118 | n_line[tid] = lid; 119 | } 120 | for (int i = 0; i < nthread; i++) { 121 | if (local_feature[i] > n_features_) 122 | n_features_ = local_feature[i]; 123 | } 124 | // this->features.resize(n_features_); 125 | // this->line_num.resize(n_features_); 126 | this->features.resize(max_dimension); 127 | this->line_num.resize(max_dimension); 128 | for(int i = 0; i < nthread; i++) { 129 | for(int j = 0; j < local_feature[i]; j++) { 130 | this->features[j].insert(this->features[j].end(), 131 | feature_thread[i][j].begin(), 132 | feature_thread[i][j].end()); 133 | for (int k = 0; k < line_thread[i][j].size(); k++) { 134 | line_thread[i][j][k] += n_sum_line; 135 | } 136 | this->line_num[j].insert(this->line_num[j].end(), 137 | line_thread[i][j].begin(), line_thread[i][j].end()); 138 | } 139 | n_sum_line += n_line[i]; 140 | } 141 | for (int i = 0; i < nthread; i++) { 142 | this->y_.insert(y_.end(), y_thread[i].begin(), y_thread[i].end()); 143 | } 144 | } 145 | n_features_ = max_dimension; 146 | min_fea.resize(max_dimension); 147 | max_fea.resize(max_dimension); 148 | for(int i = 0; i < max_dimension; i++){ 149 | if(features[i].size() == 0){ 150 | min_fea[i] = INFINITY; 151 | max_fea[i] = -INFINITY; 152 | } 153 | else{ 154 | min_fea[i] = *std::min_element(features[i].begin(), features[i].end()); 155 | max_fea[i] = *std::max_element(features[i].begin(), features[i].end()); 156 | } 157 | } 158 | LOG(INFO) << "#instances = " << this->n_instances() << ", #features = " << this->n_features(); 159 | } 160 | 161 | void DataSet::load_from_file_csr(string file_name) { 162 | LOG(INFO) << "loading LIBSVM dataset from file \"" << file_name << "\""; 163 | y_.clear(); 164 | instances_.clear(); 165 | n_features_ = 0; 166 | std::ifstream ifs(file_name, std::ifstream::binary); 167 | CHECK(ifs.is_open()) << "file " << file_name << " not found"; 168 | 169 | std::array buffer{}; //16M 170 | const int nthread = omp_get_max_threads(); 171 | 172 | auto find_last_line = [](char *ptr, const char *begin) { 173 | while (ptr != begin && *ptr != '\n' && *ptr != '\r') --ptr; 174 | return ptr; 175 | }; 176 | 177 | while (ifs) { 178 | ifs.read(buffer.data(), buffer.size()); 179 | char *head = buffer.data(); 180 | size_t size = ifs.gcount(); 181 | vector> y_thread(nthread); 182 | vector instances_thread(nthread); 183 | 
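        //Added note on the chunked parsing used by these loaders: the read
        //buffer is cut into nthread byte ranges and each range is snapped back
        //to the previous newline with find_last_line, so every thread parses
        //whole lines only. A minimal sketch of the boundary logic (same idea
        //as the code below):
        //  size_t nstep = (size + nthread - 1) / nthread;
        //  char *pbegin = find_last_line(head + tid * nstep, head);     //my first line
        //  char *pend = find_last_line(head + (tid + 1) * nstep, head); //next thread's first line
        //Thread-local results are concatenated in thread-id order afterwards,
        //which preserves the original instance order.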
184 | vector local_feature(nthread, 0); 185 | #pragma omp parallel num_threads(nthread) 186 | { 187 | //get working area of this thread 188 | int tid = omp_get_thread_num(); 189 | size_t nstep = (size + nthread - 1) / nthread; 190 | size_t sbegin = std::min(tid * nstep, size - 1); 191 | size_t send = std::min((tid + 1) * nstep, size - 1); 192 | char *pbegin = find_last_line(head + sbegin, head); 193 | char *pend = find_last_line(head + send, head); 194 | 195 | //move stream start position to the end of last line 196 | if (tid == nthread - 1) ifs.seekg(pend - head - send, std::ios_base::cur); 197 | 198 | //read instances line by line 199 | char *lbegin = pbegin; 200 | char *lend = lbegin; 201 | while (lend != pend) { 202 | //get one line 203 | lend = lbegin + 1; 204 | while (lend != pend && *lend != '\n' && *lend != '\r') { 205 | ++lend; 206 | } 207 | string line(lbegin, lend); 208 | std::stringstream ss(line); 209 | 210 | //read label of an instance 211 | y_thread[tid].emplace_back(); 212 | ss >> y_thread[tid].back(); 213 | 214 | //read features of an instance 215 | instances_thread[tid].emplace_back(); 216 | string tuple; 217 | while (ss >> tuple) { 218 | int i; 219 | float v; 220 | CHECK_EQ(sscanf(tuple.c_str(), "%d:%f", &i, &v), 2) << "read error, using [index]:[value] format"; 221 | instances_thread[tid].back().emplace_back(i, v); 222 | if (i > local_feature[tid]) local_feature[tid] = i; 223 | }; 224 | 225 | //read next instance 226 | lbegin = lend; 227 | } 228 | } 229 | for (int i = 0; i < nthread; i++) { 230 | if (local_feature[i] > n_features_) 231 | n_features_ = local_feature[i]; 232 | } 233 | for (int i = 0; i < nthread; i++) { 234 | this->y_.insert(y_.end(), y_thread[i].begin(), y_thread[i].end()); 235 | this->instances_.insert(instances_.end(), instances_thread[i].begin(), instances_thread[i].end()); 236 | } 237 | } 238 | LOG(INFO) << "#instances = " << this->n_instances() << ", #features = " << this->n_features(); 239 | } 240 | 241 | void DataSet::load_from_file_two_dimension(string file_name){ 242 | LOG(INFO) << "loading LIBSVM dataset from file \"" << file_name << "\""; 243 | y_.clear(); 244 | features.clear(); 245 | line_num.clear(); 246 | //instances_.clear(); 247 | n_features_ = 0; 248 | std::ifstream ifs(file_name, std::ifstream::binary); 249 | CHECK(ifs.is_open()) << "file " << file_name << " not found"; 250 | 251 | std::array buffer{}; //16M 252 | const int nthread = omp_get_max_threads(); 253 | 254 | auto find_last_line = [](char *ptr, const char *begin) { 255 | while (ptr != begin && *ptr != '\n' && *ptr != '\r') --ptr; 256 | return ptr; 257 | }; 258 | 259 | string first_line; 260 | getline(ifs, first_line); 261 | std::stringstream first_ss(first_line); 262 | int n_f_first = 0; 263 | string tuple; 264 | while(first_ss >> tuple) 265 | n_f_first++; 266 | ifs.clear(); 267 | ifs.seekg (0, std::ios::beg); 268 | 269 | int n_sum_line = 0; 270 | while (ifs) { 271 | ifs.read(buffer.data(), buffer.size()); 272 | char *head = buffer.data(); 273 | size_t size = ifs.gcount(); 274 | vector> y_thread(nthread); 275 | vector instances_thread(nthread); 276 | 277 | vector local_feature(nthread, 0); 278 | 279 | //vector>> index_thread(nthread); 280 | vector>> feature_thread(nthread); 281 | vector>> line_thread(nthread); 282 | for(int i = 0; i < nthread; i++){ 283 | feature_thread[i].resize(n_f_first * 2); 284 | line_thread[i].resize(n_f_first * 2); 285 | } 286 | vector n_line(nthread); 287 | #pragma omp parallel num_threads(nthread) 288 | { 289 | //get working area of this thread 290 | 
int tid = omp_get_thread_num(); 291 | size_t nstep = (size + nthread - 1) / nthread; 292 | size_t sbegin = std::min(tid * nstep, size - 1); 293 | size_t send = std::min((tid + 1) * nstep, size - 1); 294 | char *pbegin = find_last_line(head + sbegin, head); 295 | char *pend = find_last_line(head + send, head); 296 | 297 | //move stream start position to the end of last line 298 | if (tid == nthread - 1) ifs.seekg(pend - head - send, std::ios_base::cur); 299 | 300 | //read instances line by line 301 | char *lbegin = pbegin; 302 | char *lend = lbegin; 303 | int lid = 0; 304 | while (lend != pend) { 305 | //get one line 306 | lend = lbegin + 1; 307 | while (lend != pend && *lend != '\n' && *lend != '\r') { 308 | ++lend; 309 | } 310 | string line(lbegin, lend); 311 | std::stringstream ss(line); 312 | 313 | //read label of an instance 314 | y_thread[tid].emplace_back(); 315 | ss >> y_thread[tid].back(); 316 | 317 | //read features of an instance 318 | instances_thread[tid].emplace_back(); 319 | 320 | string tuple; 321 | //int fid = 0; 322 | while(ss >> tuple){ 323 | int i; 324 | float v; 325 | CHECK_EQ(sscanf(tuple.c_str(), "%d:%f", &i, &v), 2) << "read error, using [index]:[value] format"; 326 | instances_thread[tid].back().emplace_back(i, v); 327 | if(i > local_feature[tid]){ 328 | local_feature[tid] = i; 329 | } 330 | if(i > feature_thread[tid].size()){ 331 | feature_thread[tid].resize(i); 332 | line_thread[tid].resize(i); 333 | } 334 | 335 | feature_thread[tid][i-1].emplace_back(v); 336 | line_thread[tid][i-1].emplace_back(lid); 337 | //fid++; 338 | 339 | 340 | } 341 | lid++; 342 | //read next instance 343 | lbegin = lend; 344 | 345 | } 346 | n_line[tid] = lid; 347 | } 348 | for (int i = 0; i < nthread; i++) { 349 | if (local_feature[i] > n_features_) 350 | n_features_ = local_feature[i]; 351 | } 352 | this->features.resize(n_features_); 353 | this->line_num.resize(n_features_); 354 | for(int i = 0; i < nthread; i++) { 355 | for(int j = 0; j < local_feature[i]; j++) { 356 | this->features[j].insert(this->features[j].end(), 357 | feature_thread[i][j].begin(), 358 | feature_thread[i][j].end()); 359 | for (int k = 0; k < line_thread[i][j].size(); k++) { 360 | line_thread[i][j][k] += n_sum_line; 361 | } 362 | this->line_num[j].insert(this->line_num[j].end(), 363 | line_thread[i][j].begin(), line_thread[i][j].end()); 364 | } 365 | n_sum_line += n_line[i]; 366 | } 367 | for (int i = 0; i < nthread; i++) { 368 | this->y_.insert(y_.end(), y_thread[i].begin(), y_thread[i].end()); 369 | this->instances_.insert(instances_.end(), instances_thread[i].begin(), instances_thread[i].end()); 370 | } 371 | } 372 | LOG(INFO) << "#instances = " << this->n_instances() << ", #features = " << this->n_features(); 373 | } 374 | 375 | const DataSet::node2d &DataSet::instances() const { 376 | return this->instances_; 377 | } 378 | 379 | size_t DataSet::n_features() const { 380 | return n_features_; 381 | } 382 | 383 | size_t DataSet::n_instances() const { 384 | //return this->instances_.size(); 385 | return this->y_.size(); 386 | } 387 | 388 | const vector &DataSet::y() const { 389 | return this->y_; 390 | } 391 | 392 | void DataSet::compression() { 393 | 394 | } 395 | -------------------------------------------------------------------------------- /src/thundergbm/gpu_lsh.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include "thundergbm/gpu_lsh.h" 3 | #include "thundergbm/util/device_lambda.cuh" 4 | 5 | 6 | 7 | void psdLsh::init(){ 8 | 
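    //Background (added note, not original source): this sets up p-stable LSH.
    //Each of the param.n_table hash functions has the form
    //  h(v) = floor((a . v + b) / r)
    //where the entries of a are drawn from a 1-stable (Cauchy, p_norm == 1)
    //or 2-stable (Gaussian, p_norm == 2) distribution and b ~ U[0, r), so
    //points that are close under the corresponding Lp norm collide with high
    //probability. hash() below evaluates a . v for all tables at once as a
    //sparse-dense matrix product (cusparseScsrmm) and finally reduces the
    //bucket index modulo param.n_bucket.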
tables.resize(param.n_table); 9 | //std::cout<<"init n_table:"< ur(0, param.r); 27 | 28 | switch (param.p_norm) 29 | { 30 | case 1: //CAUCHY 31 | { 32 | 33 | for(unsigned i = 0; i < param.n_table; i++){ 34 | std::cauchy_distribution cd; 35 | for(unsigned j = 0; j < param.n_dimension; j++) 36 | v_a.push_back(cd(rng)); 37 | // random_vector.push_back(rng()); 38 | v_b.push_back(ur(rng)); 39 | } 40 | break; 41 | // for (std::vector >::iterator iter = stableArray.begin(); iter != stableArray.end(); ++iter) 42 | // { 43 | // for (unsigned i = 0; i != param.D; ++i) 44 | // { 45 | // iter->push_back(cd(rng)); 46 | // } 47 | // rndBs.push_back(ur(rng)); 48 | // } 49 | // return; 50 | } 51 | case 2: //GAUSSIAN 52 | { 53 | for(unsigned i = 0; i < param.n_table; i++){ 54 | std::normal_distribution nd; 55 | for(unsigned j = 0; j < param.n_dimension; j++) 56 | v_a.push_back(nd(rng)); 57 | // random_vector.push_back(rng()); 58 | v_b.push_back(ur(rng)); 59 | } 60 | break; 61 | 62 | // for (std::vector >::iterator iter = stableArray.begin(); iter != stableArray.end(); ++iter) 63 | // { 64 | // for (unsigned i = 0; i != param.D; ++i) 65 | // { 66 | // iter->push_back(nd(rng)); 67 | // } 68 | // rndBs.push_back(ur(rng)); 69 | // } 70 | // return; 71 | } 72 | default: 73 | { 74 | break; 75 | } 76 | } 77 | 78 | a.resize(v_a.size()); 79 | a.copy_from(v_a.data(), v_a.size()); 80 | b.resize(v_b.size()); 81 | b.copy_from(v_b.data(), v_b.size()); 82 | 83 | for(int i = 0; i < param.n_table; i++){ 84 | // tables[i].resize(param.n_bucket); 85 | for(int j = 0; j < param.n_bucket; j++) 86 | tables[i][j].resize(param.n_comp); 87 | } 88 | } 89 | 90 | void psdLsh::reset(const Parameter ¶m_) 91 | { 92 | param = param_; 93 | init(); 94 | } 95 | 96 | 97 | void psdLsh::hash(int n_instances, int n_features, int nnz, int key_offset, 98 | SyncArray &csr_val, SyncArray &csr_row_ptr, SyncArray &csr_col_ind, SyncArray &hash_values, int cid) { 99 | // cudaSetDevice(0); 100 | 101 | CHECK(n_features == param.n_dimension); 102 | cusparseHandle_t handle; 103 | cusparseMatDescr_t descr; 104 | cusparseCreate(&handle); 105 | cusparseCreateMatDescr(&descr); 106 | cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO); 107 | cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL); 108 | float one(1); 109 | float zero(0); 110 | SyncArray result(n_instances * param.n_table); 111 | float *result_device = result.device_data(); 112 | float *b_device = b.device_data(); 113 | cudaDeviceSynchronize(); 114 | cusparseScsrmm(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, n_instances, param.n_table, n_features, nnz, &one, descr, 115 | csr_val.device_data(), csr_row_ptr.device_data(), csr_col_ind.device_data(), a.device_data(), 116 | n_features, 117 | &zero, result_device, n_instances); 118 | 119 | cudaDeviceSynchronize(); 120 | 121 | int *hash_values_device = hash_values.device_data(); 122 | 123 | SyncArray r_gpu(1); 124 | r_gpu.host_data()[0] = param.r; 125 | SyncArray n_table_gpu(1); 126 | n_table_gpu.host_data()[0] = param.n_table; 127 | cudaDeviceSynchronize(); 128 | float* r_device = r_gpu.device_data(); 129 | int* n_table_device = n_table_gpu.device_data(); 130 | device_loop(result.size(), [=]__device__(int vid){ 131 | result_device[vid] = result_device[vid] + b_device[vid % n_table_device[0]]; 132 | result_device[vid] = result_device[vid] / r_device[0]; 133 | hash_values_device[vid] = __float2int_rd(result_device[vid]); 134 | // hash_values_device[vid] = (int) (floorf(result_device[vid])); 135 | }); 136 | cudaDeviceSynchronize(); 137 | int 
*hash_values_host = hash_values.host_data(); 138 | for (unsigned nid = 0; nid < n_instances; nid++) { 139 | for (unsigned tid = 0; tid < param.n_table; tid++) { 140 | hash_values_host[nid * param.n_table + tid] %= param.n_bucket; 141 | // std::cout<<"hash value"< &csr_val, SyncArray &csr_row_ptr, SyncArray &csr_col_ind, 156 | // vector>& buckets){ 157 | // CHECK(n_features == param.n_dimension); 158 | // cusparseHandle_t handle; 159 | // cusparseMatDescr_t descr; 160 | // cusparseCreate(&handle); 161 | // cusparseCreateMatDescr(&descr); 162 | // cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO); 163 | // cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL); 164 | // float one(1); 165 | // float zero(0); 166 | // SyncArray result(n_instances * param.n_table); 167 | // float *result_device = result.device_data(); 168 | // float *b_device = b.device_data(); 169 | // cusparseScsrmm(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, n_instances, param.n_table, n_features, nnz, &one, descr, 170 | // csr_val.device_data(), csr_row_ptr.device_data(), csr_col_ind.device_data(), a.device_data(), 171 | // n_features, 172 | // &zero, result_device, n_instances); 173 | // 174 | // SyncArray hash_values(n_instances * param.n_table); 175 | // int *hash_values_device = hash_values.device_data(); 176 | // int *hash_values_host = hash_values.host_data(); 177 | // device_loop(n_instances * param.n_table, [=] 178 | // __device__(int vid){ 179 | // result_device[vid] += b_device[vid % param.n_table]; 180 | // result_device[vid] /= param.r; 181 | // hash_values_device[vid] = (int) (floorf(result_device[vid])); 182 | // }); 183 | // for (unsigned nid = 0; nid < n_instances; nid++) { 184 | // for (unsigned tid = 0; tid < param.n_table; tid++) { 185 | // int bid = hash_values_host[nid * n_instances + tid] % param.n_bucket; 186 | // buckets[nid].insert(buckets[nid].end(), tables[tid][bid].begin(), tables[tid][bid].end()); 187 | // } 188 | // } 189 | //} 190 | -------------------------------------------------------------------------------- /src/thundergbm/hist_cut.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by qinbin on 2018/5/9. 
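//
// Added note: get_cut_points builds the histogram bin boundaries with a
// weighted quantile sketch, much like XGBoost's approximate split finding.
// Each feature value is inserted with its instance's hessian as the weight
// and a target error of eps = 1.0 / (max_num_bins * kFactor); the merged
// per-feature summary is then pruned back to at most max_num_bins entries,
// and the surviving values become the cut points used by the histogram
// updater.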
3 | // 4 | 5 | #include "thundergbm/hist_cut.h" 6 | #include "thundergbm/quantile_sketch.h" 7 | #include "thundergbm/syncarray.h" 8 | #include 9 | #include 10 | void HistCut::get_cut_points(SparseColumns& columns, InsStat& stats, int max_num_bins, int n_instances, int device_id){ 11 | LOG(TRACE)<<"get cut points"; 12 | int n_features = columns.n_column; 13 | std::cout<<"n_featrues:"< sketchs(n_features); 15 | const int kFactor = 8; 16 | for(int i = 0; i < n_features; i++){ 17 | sketchs[i].Init(n_instances, 1.0 / (max_num_bins * kFactor)); 18 | } 19 | float_type* val_ptr = columns.csc_val.host_data(); 20 | int* row_ptr = columns.csc_row_ind.host_data(); 21 | int* col_ptr = columns.csc_col_ptr.host_data(); 22 | auto stat_gh_ptr = stats.gh_pair.host_data(); 23 | // std::cout<<"before add"<= col_ptr[i]; j--){ 27 | float_type val = val_ptr[j]; 28 | float_type weight = stat_gh_ptr[row_ptr[j]].h; 29 | sketchs[i].Add(val, weight); 30 | } 31 | } 32 | // std::cout<<"after add"< n_summary(n_features); 34 | // summary n_summary[n_features]; 35 | // std::cout<<"before prune"<> cut_points_local; 45 | cut_points_local.resize(n_features); 46 | vector cut_points_size(n_features); 47 | for(int i = 0; i < n_features; i++) 48 | cut_points_local[i].resize(max_num_bins); 49 | #pragma omp parallel num_threads(nthread) 50 | { 51 | int tid = omp_get_thread_num(); 52 | int nstep = (n_features + nthread - 1) / nthread; 53 | int sbegin = std::min(tid * nstep, n_features); 54 | int send = std::min((tid + 1) * nstep, n_features); 55 | for(int i = sbegin; i < send; i++){ 56 | int k = 0; 57 | summary ts; 58 | ts.Reserve(max_num_bins); 59 | ts.Prune(n_summary[i], max_num_bins); 60 | if(ts.entry_size == 0) { cut_points_size[i] = 0;continue;} 61 | float_type min_val = ts.entries[0].val; 62 | //push a value that is smaller than min val 63 | cut_points_local[i][k++] = min_val - (fabsf(min_val) + 1e-5); 64 | 65 | if(ts.entry_size > 1 && ts.entry_size <= 16){ 66 | cut_points_local[i][k++] = (ts.entries[0].val + ts.entries[1].val) / 2; 67 | for(int j = 2; j < ts.entry_size; j++){ 68 | float_type mid = (ts.entries[j - 1].val + ts.entries[j].val) / 2; 69 | if(mid > cut_points_local[i][k-1]){ 70 | cut_points_local[i][k++] = mid; 71 | } 72 | } 73 | } 74 | else{ 75 | if(ts.entry_size == max_num_bins) 76 | LOG(INFO)<<"max bin"; 77 | if(ts.entry_size > 1) 78 | cut_points_local[i][k++] = ts.entries[1].val; 79 | for(int j = 2; j < ts.entry_size; j++){ 80 | float_type val = ts.entries[j].val; 81 | if(val > cut_points_local[i][k-1]){ 82 | cut_points_local[i][k++] = val; 83 | } 84 | } 85 | } 86 | 87 | /* 88 | float_type max_val = ts.entries[ts.entry_size - 1].val; 89 | if(max_val > 0){ 90 | cut_points_local[i][k++] = max_val*2 + 1e-5; 91 | } 92 | else{ 93 | cut_points_local[i][k++] = 1e-5; 94 | } 95 | */ 96 | cut_points_size[i]=k; 97 | } 98 | } 99 | for (int i = 0; i < n_features; i++) { 100 | if(cut_points_size[i] != 0) 101 | this->cut_points.insert(cut_points.end(), cut_points_local[i].begin(), cut_points_local[i].begin() + cut_points_size[i]); 102 | } 103 | this->row_ptr.push_back(0); 104 | for(int i = 0; i < n_features; i++){ 105 | this->row_ptr.push_back(cut_points_size[i]+this->row_ptr.back()); 106 | } 107 | CUDA_CHECK(cudaSetDevice(device_id)); 108 | cut_row_ptr.resize(this->row_ptr.size()); 109 | cut_row_ptr.copy_from(this->row_ptr.data(), this->row_ptr.size()); 110 | cut_points_val.resize(this->cut_points.size()); 111 | auto cut_points_val_ptr = cut_points_val.host_data(); 112 | auto cut_row_ptr_data = cut_row_ptr.host_data(); 113 
| //store each feature's cut points in descending order 114 | for(int i = 0; i < cut_row_ptr.size() - 1; i++){ //one segment per feature; iterating to size() would read past the end of cut_row_ptr 115 | int sum = cut_row_ptr_data[i] + cut_row_ptr_data[i+1] - 1; 116 | for(int j = cut_row_ptr_data[i+1] - 1; j >= cut_row_ptr_data[i]; j--) 117 | cut_points_val_ptr[j] = this->cut_points[sum - j]; 118 | } 119 | CUDA_CHECK(cudaSetDevice(device_id)); 120 | 121 | } 122 | 123 | void BinStat::Init(HistCut& cut, InsStat& stats, int pid, float_type* f_val, int n_f_val, int* iid){ 124 | this->numBin = cut.row_ptr[pid+1] - cut.row_ptr[pid]; 125 | this->gh_pair.resize(cut.row_ptr[pid+1] - cut.row_ptr[pid]); 126 | auto cbegin = cut.cut_points.begin()+ cut.row_ptr[pid]; 127 | auto cend = cut.cut_points.begin()+ cut.row_ptr[pid + 1]; 128 | for(int i = 0; i < n_f_val; i++){ 129 | float_type val = f_val[i]; 130 | float_type g = stats.gh_pair.host_data()[iid[i]].g; 131 | float_type h = stats.gh_pair.host_data()[iid[i]].h; 132 | auto off = std::upper_bound(cbegin, cend, val); 133 | if(off == cend) off = cend - 1; 134 | int bid = off - cbegin; 135 | this->gh_pair.host_data()[bid].g += g; 136 | this->gh_pair.host_data()[bid].h += h; 137 | } 138 | } 139 | 140 | 141 | //void BinStat::Init(vector& cut_points, InsStat& stats, SparseColumns& columns, int fid){ 142 | // this->fid = fid; 143 | // this->gh_pair.resize(cut_points.size()); 144 | // float_type* val_ptr = columns.csc_val.host_data(); 145 | // int* row_ptr = columns.csc_row_ind.host_data(); 146 | // int* col_ptr = columns.csc_col_ptr.host_data(); 147 | // for(int i = col_ptr[fid + 1] - 1; i >= col_ptr[fid]; i--){ 148 | // float_type val = val_ptr[i]; 149 | // float_type g = stats.gh_pair.host_data()[row_ptr[i]].g; 150 | // float_type h = stats.gh_pair.host_data()[row_ptr[i]].h; 151 | // auto cbegin = cut_points.begin(); 152 | // auto cend = cut_points.end(); 153 | // auto off = std::upper_bound(cbegin, cend, val); 154 | // if(off == cend) off = cend - 1; 155 | // this->bid = off - cbegin; 156 | // this->gh_pair.host_data()[bid].g += g; 157 | // this->gh_pair.host_data()[bid].h += h; 158 | // } 159 | //} 160 | -------------------------------------------------------------------------------- /src/thundergbm/ins_stat.cu: -------------------------------------------------------------------------------- 1 | // 2 | // Created by shijiashuai on 5/7/18. 3 | // 4 | #include "thundergbm/ins_stat.h" 5 | 6 | void InsStat::resize(size_t n_instances) { 7 | this->n_instances = n_instances; 8 | gh_pair.resize(n_instances); 9 | nid.resize(n_instances); 10 | y.resize(n_instances); 11 | y_predict.resize(n_instances); 12 | } 13 | 14 | void InsStat::updateGH() { 15 | sum_gh = GHPair(0, 0); 16 | GHPair *gh_pair_data = gh_pair.host_data(); 17 | int *nid_data = nid.host_data(); 18 | float_type *stats_y_data = y.host_data(); 19 | float_type *stats_yp_data = y_predict.host_data(); 20 | LOG(DEBUG) << y_predict; 21 | LOG(TRACE) << "initializing instance statistics"; 22 | //TODO parallel?
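    //One way the TODO above could be resolved (a sketch only, not applied
    //here): the per-instance updates are independent and sum_gh is a
    //reduction, so with scalar accumulators the loop could become
    //  float_type sum_g = 0, sum_h = 0;
    //  #pragma omp parallel for reduction(+ : sum_g, sum_h)
    //  for (int i = 0; i < n_instances; ++i) {
    //      nid_data[i] = 0;
    //      gh_pair_data[i].g = stats_yp_data[i] - stats_y_data[i];
    //      gh_pair_data[i].h = 1;
    //      sum_g += gh_pair_data[i].g;
    //      sum_h += gh_pair_data[i].h;
    //  }
    //  sum_gh = GHPair(sum_g, sum_h);
    //OpenMP cannot reduce over the GHPair struct without a user-declared
    //reduction, hence the two scalar accumulators.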
23 | for (int i = 0; i < n_instances; ++i) { 24 | nid_data[i] = 0; 25 | //TODO support other objective function 26 | gh_pair_data[i].g = stats_yp_data[i] - stats_y_data[i]; 27 | gh_pair_data[i].h = 1; 28 | sum_gh = sum_gh + gh_pair_data[i]; 29 | } 30 | } 31 | 32 | void InsStat::updateGH(SyncArray& is_multi) { 33 | sum_gh = GHPair(0, 0); 34 | GHPair *gh_pair_data = gh_pair.host_data(); 35 | int *nid_data = nid.host_data(); 36 | float_type *stats_y_data = y.host_data(); 37 | float_type *stats_yp_data = y_predict.host_data(); 38 | bool* is_multi_data = is_multi.host_data(); 39 | LOG(DEBUG) << y_predict; 40 | LOG(TRACE) << "initializing instance statistics"; 41 | //TODO parallel? 42 | for (int i = 0; i < n_instances; ++i) { 43 | nid_data[i] = 0; 44 | //TODO support other objective function 45 | if(is_multi_data[i]) { 46 | gh_pair_data[i].g = 2 * stats_yp_data[i] - stats_y_data[i]; 47 | gh_pair_data[i].h = 2; 48 | } 49 | else { 50 | gh_pair_data[i].g = stats_yp_data[i] - stats_y_data[i]; 51 | gh_pair_data[i].h = 1; 52 | } 53 | sum_gh = sum_gh + gh_pair_data[i]; 54 | } 55 | } 56 | 57 | void InsStat::updateGH(SyncArray& is_multi, int numP) { 58 | sum_gh = GHPair(0, 0); 59 | GHPair *gh_pair_data = gh_pair.host_data(); 60 | int *nid_data = nid.host_data(); 61 | float_type *stats_y_data = y.host_data(); 62 | float_type *stats_yp_data = y_predict.host_data(); 63 | bool* is_multi_data = is_multi.host_data(); 64 | LOG(DEBUG) << y_predict; 65 | LOG(TRACE) << "initializing instance statistics"; 66 | //TODO parallel? 67 | for (int i = 0; i < n_instances; ++i) { 68 | nid_data[i] = 0; 69 | //TODO support other objective function 70 | if(is_multi_data[i]) { 71 | gh_pair_data[i].g = numP * stats_yp_data[i] - stats_y_data[i]; // y is already multipled in similar_ins_bundle 72 | gh_pair_data[i].h = numP; 73 | 74 | // gh_pair_data[i].g = stats_yp_data[i] - stats_y_data[i]; 75 | // gh_pair_data[i].h = 1; 76 | } 77 | else { 78 | gh_pair_data[i].g = stats_yp_data[i] - stats_y_data[i]; 79 | gh_pair_data[i].h = 1; 80 | } 81 | sum_gh = sum_gh + gh_pair_data[i]; 82 | } 83 | } -------------------------------------------------------------------------------- /src/thundergbm/quantile_sketch.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by qinbin on 2018/5/9. 3 | // 4 | 5 | #include "thundergbm/quantile_sketch.h" 6 | #include 7 | 8 | void summary::Reserve(int size){ 9 | if(size > entries.size()){ 10 | entry_reserve_size = size; 11 | entries.resize(size); 12 | //data... 
13 | } 14 | } 15 | 16 | void summary::Copy(summary& src){ 17 | entry_size = src.entry_size; 18 | entry_reserve_size = src.entry_reserve_size; 19 | entries = src.entries; 20 | } 21 | 22 | void summary::Merge(summary& src1, summary& src2){ 23 | if(src1.entry_size == 0 && src2.entry_size == 0){ 24 | this->entry_size = 0; 25 | this->entry_reserve_size = 0; 26 | this->entries.clear(); 27 | return; 28 | } 29 | else if(src1.entry_size == 0){ 30 | this->Copy(src2); 31 | return; 32 | } 33 | else if(src2.entry_size == 0){ 34 | this->Copy(src1); 35 | return; 36 | } 37 | float_type r1 = 0; 38 | float_type r2 = 0; 39 | int i = 0, j = 0; 40 | this->Reserve(src1.entry_size + src2.entry_size); 41 | this->entry_size = 0; 42 | for(; i < src1.entry_size && j < src2.entry_size;){ 43 | float_type val1 = src1.entries[i].val; //val is float_type; compare without truncating to int 44 | float_type val2 = src2.entries[j].val; 45 | if(val1 == val2){ 46 | CHECK(this->entry_size < entry_reserve_size) << this->entry_size; 47 | this->entries[this->entry_size++] = entry(val1, 48 | src1.entries[i].rmin + src2.entries[j].rmin, 49 | src1.entries[i].rmax + src2.entries[j].rmax, 50 | src1.entries[i].w + src2.entries[j].w); 51 | r1 = src1.entries[i].rmin + src1.entries[i].w; 52 | r2 = src2.entries[j].rmin + src2.entries[j].w; 53 | i++; 54 | j++; 55 | //this->entry_size++; 56 | } 57 | else if(val1 < val2){ 58 | CHECK(this->entry_size < entry_reserve_size) << this->entry_size; 59 | this->entries[this->entry_size++]=entry(val1, 60 | src1.entries[i].rmin + r2, 61 | src1.entries[i].rmax + src2.entries[j].rmax - src2.entries[j].w, 62 | src1.entries[i].w); 63 | r1 = src1.entries[i].rmin + src1.entries[i].w; 64 | i++; 65 | //this->entry_size++; 66 | } 67 | else{ 68 | CHECK(this->entry_size < entry_reserve_size) << this->entry_size; 69 | this->entries[this->entry_size++] = entry(val2, 70 | src2.entries[j].rmin + r1, 71 | src2.entries[j].rmax + src1.entries[i].rmax - src1.entries[i].w, 72 | src2.entries[j].w); 73 | r2 = src2.entries[j].rmin + src2.entries[j].w; 74 | j++; 75 | //this->entry_size++; 76 | } 77 | } 78 | for(; i < src1.entry_size; i++){ 79 | CHECK(this->entry_size < entry_reserve_size) << this->entry_size; 80 | this->entries[this->entry_size++] = entry(src1.entries[i].val, 81 | src1.entries[i].rmin + r2, 82 | src1.entries[i].rmax + src2.entries[src2.entry_size - 1].rmax, 83 | src1.entries[i].w); 84 | //this->entry_size++; 85 | } 86 | for(; j < src2.entry_size; j++){ 87 | CHECK(this->entry_size < entry_reserve_size) << this->entry_size; 88 | this->entries[this->entry_size++] = entry(src2.entries[j].val, 89 | src2.entries[j].rmin + r1, 90 | src2.entries[j].rmax + src1.entries[src1.entry_size - 1].rmax, 91 | src2.entries[j].w); 92 | //this->entry_size++; 93 | } 94 | //this->entry_size = this->entries.size(); 95 | r1 = 0; 96 | r2 = 0; 97 | // float_type rmin_diff = 0; 98 | // float_type rmax_diff = 0; 99 | // float_type w_diff = 0; 100 | for(int i = 0; i < this->entry_size; i++){ 101 | if(this->entries[i].rmin < r1){ 102 | this->entries[i].rmin = r1; 103 | // if(r1 - this->entries[i].rmin > rmin_diff) 104 | // rmin_diff = r1 - this->entries[i].rmin; 105 | } 106 | else 107 | r1 = this->entries[i].rmin; 108 | if(this->entries[i].rmax < r2){ 109 | this->entries[i].rmax = r2; 110 | // if(r2 - this->entries[i].rmax > rmax_diff) 111 | // rmax_diff = r2 - this->entries[i].rmax; 112 | } 113 | if(this->entries[i].rmax < this->entries[i].rmin + this->entries[i].w){ 114 | this->entries[i].rmax = this->entries[i].rmin + this->entries[i].w; 115 | // if(this->entries[i].rmax - this->entries[i].rmin - 
this->entries[i].w > w_diff) 116 | // w_diff = this->entries[i].rmax - this->entries[i].rmin - this->entries[i].w; 117 | } 118 | r2 = this->entries[i].rmax; 119 | } 120 | } 121 | 122 | 123 | 124 | void summary::Prune(summary& src, int size){ 125 | if(src.entry_size <= size){ 126 | this->Copy(src); 127 | return; 128 | } 129 | float_type begin = src.entries[0].rmax; 130 | float_type End = src.entries[src.entry_size - 1].rmin; 131 | float_type range = End - begin; 132 | if(size <= 2 || range == 0.0f){ 133 | this->entry_size = 2; 134 | CHECK(1 < entry_reserve_size) << entry_reserve_size; 135 | this->entries[0] = src.entries[0]; 136 | this->entries[1] = src.entries[src.entry_size - 1]; 137 | return; 138 | } 139 | range = std::max(range, 1e-3f); 140 | int n_points = size - 2; 141 | int n_bigbin = 0; 142 | int safe_factor = 2; 143 | float_type chunk_size = safe_factor * range / n_points; 144 | float_type sum_small_range = 0; 145 | int j = 0; 146 | int i = 1; 147 | float_type r1; 148 | float_type r2; 149 | vector big_points; 150 | big_points.reserve(n_points + 1); 151 | //int last_big_point = 0; 152 | for(; i < src.entry_size - 1; i++){ 153 | CHECK(i < src.entry_reserve_size) << i; 154 | r1 = src.entries[i].rmin + src.entries[i].w; 155 | r2 = src.entries[i].rmax - src.entries[i].w; 156 | if(r1 > r2 + chunk_size){ 157 | n_bigbin++; 158 | big_points.push_back(i); 159 | // 160 | if(j != i -1) 161 | sum_small_range += r2 - (src.entries[j].rmin + src.entries[j].w); 162 | j = i; 163 | } 164 | } 165 | CHECK(n_bigbin < n_points) << "too many big bin"; 166 | int n_smallbin = n_points - n_bigbin; 167 | //r1 = src.entries[i].rmin + src.entries[i].w; 168 | r2 = src.entries[i].rmax - src.entries[i].w; 169 | if(j != src.entry_size - 2) 170 | sum_small_range += r2 - (src.entries[j].rmin + src.entries[j].w); 171 | CHECK(j < src.entry_reserve_size) << j; 172 | this->entries[0] = src.entries[0]; 173 | this->entry_size = 1; 174 | n_points -= n_bigbin; 175 | j = 0; 176 | int n_get_points = 1; 177 | //store maximum point 178 | big_points.push_back(src.entry_size - 1); 179 | for(int i = 0; i < big_points.size(); i++){ 180 | int id = big_points[i]; 181 | if(j != id -1){ 182 | CHECK(id < src.entry_reserve_size) << id; 183 | float_type r = src.entries[id].rmax - src.entries[id].w; 184 | int k = j; 185 | for(; n_get_points < n_points; n_get_points++){ 186 | float_type start = n_get_points * sum_small_range / n_points + begin; 187 | if(start >= r) 188 | break; 189 | for(; k < id; k++){ 190 | CHECK(k+1 < src.entry_reserve_size) << k+1; 191 | if(2 * start < (src.entries[k + 1].rmax + src.entries[k + 1].rmin)) 192 | break; 193 | } 194 | if(k == id) break; 195 | CHECK(k < src.entry_reserve_size) << k; 196 | if(2 * start >= src.entries[k].rmin + src.entries[k].w + src.entries[k+1].rmax - src.entries[k+1].w){ 197 | if(k != j - 1){ 198 | j = k + 1; 199 | CHECK(k < src.entry_reserve_size - 1) << k; 200 | this->entries[this->entry_size] = src.entries[k + 1]; 201 | this->entry_size++; 202 | 203 | } 204 | } 205 | else{ 206 | if(k != j){ 207 | j = k; 208 | CHECK(k < src.entry_reserve_size) << k; 209 | this->entries[this->entry_size] = src.entries[k]; 210 | this->entry_size++; 211 | } 212 | } 213 | } 214 | } 215 | //store big bin 216 | if(j != id){ 217 | CHECK(id < src.entry_reserve_size) << id; 218 | this->entries[this->entry_size] = src.entries[id]; 219 | this->entry_size++; 220 | j = id; 221 | } 222 | CHECK(j < src.entry_reserve_size) << j; 223 | begin += src.entries[j].rmin + 2 * src.entries[j].w - src.entries[j].rmax; 224 | } 225 | 
226 | if(this->entry_size == 64) LOG(INFO)<<"entry 64"; 227 | // for(int i = 1; i < src.entry_size; i++){ 228 | // if 229 | // } 230 | 231 | 232 | }; 233 | 234 | void Qitem::GetSummary(summary& ret){ 235 | //remove it if data is sorted 236 | //sort(data.begin(), data.begin() + tail); 237 | ret.entry_size = 0; 238 | float_type waccum = 0; 239 | for(int i = 0; i < tail;){ 240 | int j = i + 1; 241 | CHECK(i < data.size()) << i; 242 | float_type wt = data[i].second; 243 | for(; j < tail; j++){ 244 | CHECK(j < data.size()) << j; 245 | if(data[j].first == data[i].first) 246 | wt += data[j].second; 247 | else 248 | break; 249 | } 250 | CHECK(ret.entry_size < ret.entry_reserve_size) << ret.entry_size; 251 | ret.entries[ret.entry_size] = entry(data[i].first, waccum, waccum + wt, wt); 252 | ret.entry_size++; 253 | waccum += wt; 254 | i = j; 255 | } 256 | } 257 | 258 | void quanSketch::Init(int maxn, float_type eps){ 259 | numOfLevel = 1; 260 | while (1) { 261 | summarySize = ceil(numOfLevel / eps) + 1; 262 | int n = (1ULL << numOfLevel); 263 | if (n * summarySize >= maxn) break; 264 | ++numOfLevel; 265 | } 266 | // std::cout<<"summarySize:"<= maxn) << "invalid init parameter"; 269 | CHECK(numOfLevel <= summarySize * eps) << "invalid init parameter"; 270 | Qentry.data.clear(); 271 | Qentry.data.resize(summarySize * 2); 272 | Qentry.tail = 0; 273 | //summaries.clear(); 274 | } 275 | 276 | 277 | void quanSketch::Add(float_type value, float_type weight){ 278 | if(weight == 0.0f) return; 279 | if(Qentry.data.size() == Qentry.tail){ 280 | t_summary.Reserve(2*summarySize); 281 | Qentry.GetSummary(t_summary); 282 | Qentry.tail = 0; 283 | for(int i = 1;; i++){ 284 | if(summaries.size() < i + 1){ 285 | //Qentry.data.resize((i+1)*summarySize); 286 | summaries.resize(i + 1, summary(0, (i+1) * summarySize)); 287 | } 288 | CHECK(i < summaries.size()) << i; 289 | if(summaries[i].entry_size == 0){ 290 | summaries[i].Prune(t_summary, summarySize); 291 | break; 292 | } 293 | else{ 294 | summaries[0].Prune(t_summary, summarySize); 295 | CHECK(i < summaries.size()) << i; 296 | t_summary.Merge(summaries[0], summaries[i]); 297 | if(t_summary.entry_size > summarySize) 298 | summaries[i].entry_size = 0; 299 | else{ 300 | summaries[i].Copy(t_summary); 301 | break; 302 | } 303 | } 304 | } 305 | // this->AddT(); 306 | } 307 | CHECK(Qentry.tail < Qentry.data.size()) << Qentry.tail; 308 | if(Qentry.tail == 0 || value != Qentry.data[Qentry.tail-1].first){ 309 | CHECK(Qentry.tail < Qentry.data.size()) << Qentry.tail; 310 | Qentry.data[Qentry.tail] = std::make_pair(value, weight); 311 | Qentry.tail++; 312 | } 313 | else{ 314 | CHECK(Qentry.tail <= Qentry.data.size()) << Qentry.tail; 315 | Qentry.data[Qentry.tail-1].second += weight; 316 | } 317 | 318 | //Qentry.data.push_back(std::make_pair(value, weight)); 319 | } 320 | 321 | void quanSketch::GetSummary(summary& dest){ 322 | dest.entry_size = 0; 323 | dest.entries.clear(); 324 | if(summaries.size() == 0){ 325 | // std::cout<<"0 size"< summarySize){ 329 | t_summary.Reserve(summarySize); 330 | t_summary.Prune(dest, summarySize); 331 | dest.Copy(t_summary); 332 | } 333 | } 334 | else { 335 | // std::cout<<"not 0 size"< 5 | #include "thundergbm/sparse_columns.h" 6 | #include "thundergbm/util/device_lambda.cuh" 7 | 8 | void SparseColumns::from_dataset(const DataSet &dataset) { 9 | LOG(TRACE) << "constructing sparse columns"; 10 | n_column = dataset.n_features(); 11 | vector csc_val_vec; 12 | vector csc_row_ind_vec; 13 | vector csc_col_ptr_vec; 14 | csc_col_ptr_vec.push_back(0); 15 | 
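    //Added note: dataset.features[i] already holds the values of feature i in
    //instance order and dataset.line_num[i] the matching instance ids, so the
    //CSC arrays can be assembled by plain concatenation: csc_val/csc_row_ind
    //are the per-feature vectors laid end to end, and
    //csc_col_ptr[i + 1] = csc_col_ptr[i] + features[i].size(), i.e. a running
    //prefix sum of the column sizes.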
for (int i = 0; i < n_column; i++) { 16 | csc_val_vec.insert(csc_val_vec.end(), dataset.features[i].begin(), dataset.features[i].end()); 17 | csc_row_ind_vec.insert(csc_row_ind_vec.end(), dataset.line_num[i].begin(), dataset.line_num[i].end()); 18 | csc_col_ptr_vec.push_back(csc_col_ptr_vec.back() + dataset.features[i].size()); 19 | } 20 | nnz = csc_val_vec.size(); 21 | csc_val.resize(csc_val_vec.size()); 22 | memcpy(csc_val.host_data(), csc_val_vec.data(), sizeof(float_type) * csc_val_vec.size()); 23 | csc_row_ind.resize(csc_row_ind_vec.size()); 24 | memcpy(csc_row_ind.host_data(), csc_row_ind_vec.data(), sizeof(int) * csc_row_ind_vec.size()); 25 | csc_col_ptr.resize(csc_col_ptr_vec.size()); 26 | memcpy(csc_col_ptr.host_data(), csc_col_ptr_vec.data(), sizeof(int) * csc_col_ptr_vec.size()); 27 | cudaDeviceSynchronize();// ? 28 | } 29 | 30 | void SparseColumns::from_dataset_csr(const DataSet &dataset) { 31 | LOG(INFO) << "constructing sparse columns"; 32 | n_column = dataset.n_features(); 33 | size_t n_instances = dataset.n_instances(); 34 | const DataSet::node2d &instances = dataset.instances(); 35 | 36 | /** 37 | * construct csr matrix, then convert to csc matrix and sort columns by feature values 38 | */ 39 | vector csr_val; 40 | vector csr_col_ind;//index of each value of all the instances 41 | vector csr_row_ptr(1, 0);//the start positions of the instances 42 | 43 | LOG(INFO) << "converting libsvm sparse rows to csr matrix"; 44 | for (const auto &ins : instances) {//convert libsvm format to csr format 45 | for (const auto &j : ins) { 46 | csr_val.push_back(j.value); 47 | csr_col_ind.push_back(j.index - 1);//libSVM data format is one-based, convert to zero-based 48 | } 49 | CHECK_LE(csr_row_ptr.back() + ins.size(), INT_MAX); 50 | csr_row_ptr.push_back(csr_row_ptr.back() + ins.size()); 51 | } 52 | 53 | nnz = csr_val.size();//number of nonzer 54 | LOG(INFO) 55 | << string_format("dataset density = %.2f%% (%d feature values, ave=%d/instance, %d/feature)", 56 | (float) nnz / n_instances / n_column * 100, 57 | nnz, nnz / n_instances, nnz / n_column); 58 | 59 | LOG(INFO) << "copy csr matrix to GPU"; 60 | //three arrays (on GPU/CPU) for csr representation 61 | SyncArray val; 62 | SyncArray col_ind; 63 | SyncArray row_ptr; 64 | val.resize(csr_val.size()); 65 | col_ind.resize(csr_col_ind.size()); 66 | row_ptr.resize(csr_row_ptr.size()); 67 | 68 | //copy data to the three arrays 69 | val.copy_from(csr_val.data(), val.size()); 70 | col_ind.copy_from(csr_col_ind.data(), col_ind.size()); 71 | row_ptr.copy_from(csr_row_ptr.data(), row_ptr.size()); 72 | 73 | LOG(INFO) << "converting csr matrix to csc matrix"; 74 | cusparseHandle_t handle; 75 | cusparseMatDescr_t descr; 76 | cusparseCreate(&handle); 77 | cusparseCreateMatDescr(&descr); 78 | cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO); 79 | cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL); 80 | 81 | csc_val.resize(nnz); 82 | csc_row_ind.resize(nnz); 83 | csc_col_ptr.resize(n_column + 1); 84 | 85 | cusparseScsr2csc(handle, n_instances, n_column, nnz, val.device_data(), row_ptr.device_data(), 86 | col_ind.device_data(), csc_val.device_data(), csc_row_ind.device_data(), csc_col_ptr.device_data(), 87 | CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO); 88 | cudaDeviceSynchronize(); 89 | cusparseDestroy(handle); 90 | cusparseDestroyMatDescr(descr); 91 | } 92 | 93 | 94 | void SparseColumns::to_multi_devices(vector> &v_columns) const{ 95 | //devide data into multiple devices 96 | int cur_device_id; 97 | cudaGetDevice(&cur_device_id); 98 | 
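    //Added note: the split below is by contiguous column blocks. Each device
    //receives roughly n_column / n_device features (the last device takes the
    //remainder), copies its slice of csc_val/csc_row_ind, and then shifts its
    //csc_col_ptr entries down by first_col_start so they are zero-based again
    //within that device's sub-matrix.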
int n_device = v_columns.size(); 99 | int ave_n_columns = n_column / n_device; 100 | DO_ON_MULTI_DEVICES(n_device, [&](int device_id) { 101 | SparseColumns &columns = *v_columns[device_id]; 102 | const int *csc_col_ptr_data = csc_col_ptr.host_data(); 103 | int first_col_id = device_id * ave_n_columns; 104 | int n_column_sub = (device_id < n_device - 1) ? ave_n_columns : n_column - first_col_id; 105 | n_column_sub = (n_device == 1) ? ave_n_columns : n_column_sub; 106 | int first_col_start = csc_col_ptr_data[first_col_id]; 107 | int nnz_sub = (device_id < n_device - 1) ? 108 | (csc_col_ptr_data[(device_id + 1) * ave_n_columns] - first_col_start) : (nnz - 109 | first_col_start); 110 | nnz_sub = (n_device == 1) ? nnz : nnz_sub; 111 | 112 | columns.column_offset = first_col_id; 113 | columns.nnz = nnz_sub; 114 | columns.n_column = n_column_sub; 115 | columns.csc_val.resize(nnz_sub); 116 | columns.csc_row_ind.resize(nnz_sub); 117 | columns.csc_col_ptr.resize(n_column_sub + 1); 118 | 119 | columns.csc_val.copy_from(csc_val.host_data() + first_col_start, nnz_sub); 120 | columns.csc_row_ind.copy_from(csc_row_ind.host_data() + first_col_start, nnz_sub); 121 | columns.csc_col_ptr.copy_from(csc_col_ptr.host_data() + first_col_id, n_column_sub + 1); 122 | 123 | int *csc_col_ptr_2d_data = columns.csc_col_ptr.device_data(); 124 | 125 | 126 | //correct segment start positions 127 | device_loop(n_column_sub + 1, [=] __device__(int col_id) { 128 | csc_col_ptr_2d_data[col_id] = csc_col_ptr_2d_data[col_id] - first_col_start; 129 | }); 130 | LOG(TRACE) << "sorting feature values (multi-device)"; 131 | cub_seg_sort_by_key(columns.csc_val, columns.csc_row_ind, columns.csc_col_ptr, false); 132 | 133 | }); 134 | LOG(TRACE) << "sorting finished"; 135 | } 136 | 137 | //void SparseColumns::get_cut_points_evenly(int nBin, int max_dimension, vector& bin_id, 138 | // const vector& min_fea, const vector& max_fea) { 139 | // float* csc_val_host = csc_val.host_data(); 140 | // int* csc_row_host = csc_row_ind.host_data(); 141 | // int* csc_col_host = csc_col_ptr.host_data(); 142 | // for(int cid = 0; cid < csc_col_ptr.size() - 1; cid ++){ 143 | // cstart = csc_col_host[cid]; 144 | // cend = csc_col_host[cid + 1]; 145 | // for(int off = cstart; off < cend; off++){ 146 | // float val = csc_val_host[off]; 147 | // bin_id[off] = (int) ((val - min_fea[cid]) / (max_fea[cid] - min_fea[cid]) * nBin); 148 | // } 149 | // } 150 | //} 151 | -------------------------------------------------------------------------------- /src/thundergbm/syncmem.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by jiashuai on 17-9-16. 
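//
// Added note (usage sketch; fill_on_host, use_on_host and kernel are
// hypothetical placeholders, not real functions): SyncMem implements lazy
// host/device synchronization. head_ records where the freshest copy lives
// (UNINITIALIZED, HOST or DEVICE), and host_data()/device_data() copy across
// the bus only when the freshest copy is on the other side:
//
//   SyncMem mem(n * sizeof(float));
//   float *h = static_cast<float *>(mem.host_data());   //allocates, head_ = HOST
//   fill_on_host(h, n);                                 //mutate on the host
//   kernel<<<grid, block>>>(
//       static_cast<float *>(mem.device_data()), n);    //copies H2D, head_ = DEVICE
//   use_on_host(static_cast<float *>(mem.host_data())); //copies D2H, head_ = HOST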
3 | // 4 | 5 | #include 6 | 7 | namespace thunder { 8 | SyncMem::SyncMem() : SyncMem(0) {} 9 | 10 | SyncMem::SyncMem(size_t size) : device_ptr(nullptr), host_ptr(nullptr), size_(size), head_(UNINITIALIZED), 11 | own_device_data(false), own_host_data(false) { 12 | #ifdef USE_CUDA 13 | CUDA_CHECK(cudaGetDevice(&device_id)); 14 | #endif 15 | } 16 | 17 | SyncMem::~SyncMem() { 18 | this->head_ = UNINITIALIZED; 19 | if (host_ptr && own_host_data) { 20 | free_host(host_ptr); 21 | host_ptr = nullptr; 22 | } 23 | #ifdef USE_CUDA 24 | DO_ON_DEVICE(device_id, { 25 | if (device_ptr && own_device_data) { 26 | CUDA_CHECK(cudaFree(device_ptr)); 27 | device_ptr = nullptr; 28 | } 29 | }); 30 | #endif 31 | } 32 | 33 | void *SyncMem::host_data() { 34 | to_host(); 35 | return host_ptr; 36 | } 37 | 38 | void *SyncMem::device_data() { 39 | #ifdef USE_CUDA 40 | to_device(); 41 | #else 42 | NO_GPU; 43 | #endif 44 | return device_ptr; 45 | } 46 | 47 | size_t SyncMem::size() const { 48 | return size_; 49 | } 50 | 51 | SyncMem::HEAD SyncMem::head() const { 52 | return head_; 53 | } 54 | 55 | void SyncMem::to_host() { 56 | switch (head_) { 57 | case UNINITIALIZED: 58 | malloc_host(&host_ptr, size_); 59 | memset(host_ptr, 0, size_); 60 | head_ = HOST; 61 | own_host_data = true; 62 | break; 63 | case DEVICE: 64 | #ifdef USE_CUDA 65 | DO_ON_DEVICE(device_id, { 66 | if (nullptr == host_ptr) { 67 | CUDA_CHECK(cudaHostAlloc(&host_ptr, size_, cudaHostAllocPortable)); 68 | CUDA_CHECK(cudaMemset(host_ptr, 0, size_)); 69 | own_host_data = true; 70 | } 71 | CUDA_CHECK(cudaMemcpy(host_ptr, device_ptr, size_, cudaMemcpyDeviceToHost)); 72 | head_ = HOST; 73 | }); 74 | #else 75 | NO_GPU; 76 | #endif 77 | break; 78 | case HOST:; 79 | } 80 | } 81 | 82 | void SyncMem::to_device() { 83 | #ifdef USE_CUDA 84 | DO_ON_DEVICE(device_id, { 85 | switch (head_) { 86 | case UNINITIALIZED: 87 | CUDA_CHECK(cudaMalloc(&device_ptr, size_)); 88 | CUDA_CHECK(cudaMemset(device_ptr, 0, size_)); 89 | head_ = DEVICE; 90 | own_device_data = true; 91 | break; 92 | case HOST: 93 | if (nullptr == device_ptr) { 94 | CUDA_CHECK(cudaMalloc(&device_ptr, size_)); 95 | CUDA_CHECK(cudaMemset(device_ptr, 0, size_)); 96 | own_device_data = true; 97 | } 98 | CUDA_CHECK(cudaMemcpy(device_ptr, host_ptr, size_, cudaMemcpyHostToDevice)); 99 | head_ = DEVICE; 100 | break; 101 | case DEVICE:; 102 | } 103 | }); 104 | #else 105 | NO_GPU; 106 | #endif 107 | } 108 | 109 | void SyncMem::set_host_data(void *data) { 110 | CHECK_NOTNULL(data); 111 | if (own_host_data) { 112 | free_host(host_ptr); 113 | } 114 | host_ptr = data; 115 | own_host_data = false; 116 | head_ = HEAD::HOST; 117 | } 118 | 119 | void SyncMem::set_device_data(void *data) { 120 | #ifdef USE_CUDA 121 | DO_ON_DEVICE(device_id, { 122 | CHECK_NOTNULL(data); 123 | if (own_device_data) { 124 | CUDA_CHECK(cudaFree(device_data())); 125 | } 126 | device_ptr = data; 127 | own_device_data = false; 128 | head_ = HEAD::DEVICE; 129 | }); 130 | #else 131 | NO_GPU; 132 | #endif 133 | } 134 | } 135 | -------------------------------------------------------------------------------- /src/thundergbm/tree.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by jiashuai on 18-1-18. 
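//
// Added note: Tree stores an implicit complete binary tree in a single array.
// For a maximum depth d there are 2^(d+1) - 1 node slots, and for slot i
//   parent(i) = (i - 1) / 2,  left(i) = 2 * i + 1,  right(i) = 2 * i + 2,
// with slots in [2^d - 1, 2^(d+1) - 1) initialized as leaves. init() fills in
// these links; is_valid later marks the slots a grown tree actually uses.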
3 | // 4 | #include "thundergbm/tree.h" 5 | 6 | Tree::Tree(int depth) { 7 | init(depth); 8 | } 9 | 10 | void Tree::init(int depth) { 11 | int n_max_nodes = static_cast(pow(2, depth + 1) - 1); 12 | nodes.resize(n_max_nodes); 13 | TreeNode *node_data = nodes.host_data(); 14 | for (int i = 0; i < n_max_nodes; ++i) { 15 | node_data[i].final_id = i; 16 | node_data[i].split_feature_id = -1; 17 | node_data[i].is_valid = false; 18 | node_data[i].parent_index = (i - 1) / 2; 19 | if (i < n_max_nodes / 2) { 20 | node_data[i].is_leaf = false; 21 | node_data[i].lch_index = i * 2 + 1; 22 | node_data[i].rch_index = i * 2 + 2; 23 | } else { 24 | //leaf nodes 25 | node_data[i].is_leaf = true; 26 | node_data[i].lch_index = -1; 27 | node_data[i].rch_index = -1; 28 | } 29 | } 30 | node_data[0].parent_index = -1;//root node has no parent node 31 | } 32 | 33 | string Tree::dump(int depth) const { 34 | string s("\n"); 35 | preorder_traversal(0, depth, 0, s); 36 | return s; 37 | } 38 | 39 | void Tree::preorder_traversal(int nid, int max_depth, int depth, string &s) const { 40 | const TreeNode &node = nodes.host_data()[nid]; 41 | if (node.is_valid && !node.is_pruned) 42 | s = s + string(static_cast(depth), '\t') + 43 | (node.is_leaf ? 44 | string_format("%d:leaf=%.6g\n", node.final_id, node.base_weight) : 45 | string_format("%d:[f%d<%.6g], weight=%f, gain=%f, dr=%d\n", node.final_id, node.split_feature_id + 1, 46 | node.split_value, 47 | node.base_weight, node.gain, node.default_right)); 48 | if (depth < max_depth) { 49 | preorder_traversal(node.lch_index, max_depth, depth + 1, s); 50 | preorder_traversal(node.rch_index, max_depth, depth + 1, s); 51 | } 52 | } 53 | 54 | std::ostream &operator<<(std::ostream &os, const Tree::TreeNode &node) { 55 | os << string_format("\nnid:%d,l:%d,split_feature_id:%d,f:%f,gain:%f,r:%d,w:%f,", node.final_id, node.is_leaf, 56 | node.split_feature_id, node.split_value, node.gain, node.default_right, node.base_weight); 57 | os << "g/h:" << node.sum_gh_pair; 58 | return os; 59 | } 60 | 61 | void Tree::reorder_nid() { 62 | int nid = 0; 63 | Tree::TreeNode *nodes_data = nodes.host_data(); 64 | for (int i = 0; i < nodes.size(); ++i) { 65 | if (nodes_data[i].is_valid && !nodes_data[i].is_pruned) { 66 | nodes_data[i].final_id = nid; 67 | nid++; 68 | } 69 | } 70 | } 71 | 72 | int Tree::try_prune_leaf(int nid, int np, float_type gamma, vector &leaf_child_count) { 73 | Tree::TreeNode *nodes_data = nodes.host_data(); 74 | int p_nid = nodes_data[nid].parent_index; 75 | if (p_nid == -1) return np;// is root 76 | Tree::TreeNode &p_node = nodes_data[p_nid]; 77 | Tree::TreeNode &lch = nodes_data[p_node.lch_index]; 78 | Tree::TreeNode &rch = nodes_data[p_node.rch_index]; 79 | leaf_child_count[p_nid]++; 80 | if (leaf_child_count[p_nid] >= 2 && p_node.gain < gamma) { 81 | //do pruning 82 | //delete two children 83 | CHECK(lch.is_leaf); 84 | CHECK(rch.is_leaf); 85 | lch.is_pruned = true; 86 | rch.is_pruned = true; 87 | //make parent to leaf 88 | p_node.is_leaf = true; 89 | return try_prune_leaf(p_nid, np + 2, gamma, leaf_child_count); 90 | } else return np; 91 | } 92 | 93 | void Tree::prune_self(float_type gamma) { 94 | vector leaf_child_count(nodes.size(), 0); 95 | Tree::TreeNode *nodes_data = nodes.host_data(); 96 | int n_pruned = 0; 97 | for (int i = 0; i < nodes.size(); ++i) { 98 | if (nodes_data[i].is_leaf && nodes_data[i].is_valid) { 99 | n_pruned = try_prune_leaf(i, n_pruned, gamma, leaf_child_count); 100 | } 101 | } 102 | LOG(DEBUG) << string_format("%d nodes are pruned", n_pruned); 103 | 
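    //renumber the surviving nodes so final_id remains a dense 0..n-1 sequence after pruning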
reorder_nid(); 104 | } 105 | 106 | void Tree::shrink(float_type learning_rate){ 107 | Tree::TreeNode *nodes_data = nodes.host_data(); 108 | for (int i = 0; i < nodes.size(); ++i) { 109 | if (nodes_data[i].is_leaf && nodes_data[i].is_valid) { 110 | nodes_data[i].base_weight *= learning_rate; 111 | } 112 | } 113 | } -------------------------------------------------------------------------------- /src/thundergbm/updater/exact_updater.cu: -------------------------------------------------------------------------------- 1 | // 2 | // Created by shijiashuai on 5/7/18. 3 | // 4 | #include "thundergbm/updater/exact_updater.h" 5 | 6 | void ExactUpdater::lsh_hash_init(unsigned n_bucket, unsigned num_table, unsigned num_dimension, unsigned p_norm, float r, unsigned numP, int seed){ 7 | lsh_table.param.n_bucket = n_bucket; 8 | lsh_table.param.n_table = num_table; 9 | lsh_table.param.n_dimension = num_dimension; 10 | lsh_table.param.p_norm = p_norm; 11 | lsh_table.param.r = r; 12 | lsh_table.param.n_comp = numP; 13 | lsh_table.param.seed = seed; 14 | lsh_table.init(); 15 | } 16 | 17 | void ExactUpdater::grow(Tree &tree, const vector> &v_columns, InsStat &stats) { 18 | TIMED_SCOPE(timerObj, "grow tree"); 19 | 20 | int n_instances = stats.n_instances; 21 | int cur_device = 0; 22 | // int cur_device = param.use_gpu_id; 23 | 24 | LOG(TRACE) << "broadcast tree and stats"; 25 | v_stats.resize(n_devices); 26 | v_trees_gpu.resize(n_devices); 27 | init_tree(tree, stats); 28 | DO_ON_MULTI_DEVICES(n_devices, [&](int device_id) { 29 | //stats 30 | int n_instances = stats.n_instances; 31 | v_stats[device_id].reset(new InsStat()); 32 | InsStat &gpu_stats = *v_stats[device_id]; 33 | gpu_stats.resize(n_instances); 34 | gpu_stats.gh_pair.copy_from(stats.gh_pair.host_data(), n_instances); 35 | gpu_stats.nid.copy_from(stats.nid.host_data(), n_instances); 36 | gpu_stats.y.copy_from(stats.y.host_data(), n_instances); 37 | gpu_stats.y_predict.copy_from(stats.y_predict.host_data(), n_instances); 38 | 39 | //tree 40 | v_trees_gpu[device_id].reset(new Tree()); 41 | Tree &gpu_tree = *v_trees_gpu[device_id]; 42 | gpu_tree.nodes.resize(tree.nodes.size()); 43 | gpu_tree.nodes.copy_from(tree.nodes.host_data(), tree.nodes.size()); 44 | }); 45 | 46 | for (int i = 0; i < depth; ++i) { 47 | LOG(TRACE) << "growing tree at depth " << i; 48 | vector> local_sp(n_devices); 49 | { 50 | TIMED_SCOPE(timerObj, "find split"); 51 | DO_ON_MULTI_DEVICES(n_devices, [&](int device_id) { 52 | LOG(TRACE) << string_format("finding split on device %d", device_id); 53 | find_split(i, *v_columns[device_id], *v_trees_gpu[device_id], *v_stats[device_id], local_sp[device_id]); 54 | }); 55 | } 56 | 57 | int n_max_nodes_in_level = 1 << i;//2^i 58 | int nid_offset = (1 << i) - 1;//2^i - 1 59 | SyncArray global_sp(n_max_nodes_in_level); 60 | { 61 | TIMED_SCOPE(timerObj, "split point all reduce"); 62 | split_point_all_reduce(local_sp, global_sp, i); 63 | } 64 | 65 | //do split 66 | { 67 | TIMED_SCOPE(timerObj, "update tree"); 68 | update_tree(tree, global_sp); 69 | } 70 | //broadcast tree 71 | LOG(TRACE) << "broadcasting updated tree"; 72 | DO_ON_MULTI_DEVICES(n_devices, [&](int device_id) { 73 | v_trees_gpu[device_id]->nodes.copy_from(tree.nodes.host_data(), tree.nodes.size()); 74 | }); 75 | 76 | { 77 | vector v_has_split(n_devices); 78 | TIMED_SCOPE(timerObj, "reset ins2node id"); 79 | LOG(TRACE) << "reset ins2node id"; 80 | DO_ON_MULTI_DEVICES(n_devices, [&](int device_id) { 81 | v_has_split[device_id] = reset_ins2node_id(*v_stats[device_id], 
void ExactUpdater::split_point_all_reduce(const vector<SyncArray<SplitPoint>> &local_sp,
                                          SyncArray<SplitPoint> &global_sp, int depth) {
    //get the globally best split of each node
    int n_max_nodes_in_level = 1 << depth;//2^depth
    int nid_offset = (1 << depth) - 1;//2^depth - 1
    auto global_sp_data = global_sp.host_data();
    vector<bool> active_sp(n_max_nodes_in_level);
    for (int n = 0; n < n_max_nodes_in_level; n++) {
        global_sp_data[n].nid = n + nid_offset;
        global_sp_data[n].gain = -1.0f;
        active_sp[n] = false;
    }

    for (int device_id = 0; device_id < n_devices; device_id++) {
        auto local_sp_data = local_sp[device_id].host_data();
        for (int j = 0; j < local_sp[device_id].size(); j++) {
            int sp_nid = local_sp_data[j].nid;
            if (sp_nid == -1) continue;
            int global_pos = sp_nid - nid_offset;
            if (!active_sp[global_pos])
                global_sp_data[global_pos] = local_sp_data[j];
            else
                global_sp_data[global_pos] = (global_sp_data[global_pos].gain >= local_sp_data[j].gain)
                                             ? global_sp_data[global_pos] : local_sp_data[j];
            active_sp[global_pos] = true;
        }
    }
    //mark nodes that received no split point as inactive
    for (int n = 0; n < n_max_nodes_in_level; n++) {
        if (!active_sp[n])
            global_sp_data[n].nid = -1;
    }
    LOG(DEBUG) << "global best split point = " << global_sp;
}

void ExactUpdater::init_tree(Tree &tree, const InsStat &stats) {
    tree.init(depth);
    //init the root node
    Tree::TreeNode &root_node = tree.nodes.host_data()[0];
    root_node.sum_gh_pair = stats.sum_gh;
    root_node.is_valid = true;
    root_node.calc_weight(lambda);
    LOG(DEBUG) << "root sum gh " << root_node.sum_gh_pair;
}
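// A host-side sketch of the merge rule that split_point_all_reduce() above
// implements: each party/device proposes at most one split per node, and the
// candidate with the largest gain wins. MiniSP and merge_best are hypothetical
// names used only for illustration, guarded by the hypothetical SIMFL_EXAMPLE macro.
#ifdef SIMFL_EXAMPLE
#include <vector>
struct MiniSP {
    int nid;//-1 means "no split proposed"
    float gain;
};
inline std::vector<MiniSP> merge_best(const std::vector<std::vector<MiniSP>> &locals,
                                      int n_nodes, int nid_offset) {
    std::vector<MiniSP> global(n_nodes, MiniSP{-1, -1.0f});//inactive by default
    for (const auto &local : locals)
        for (const auto &sp : local) {
            if (sp.nid == -1) continue;
            MiniSP &g = global[sp.nid - nid_offset];
            if (g.nid == -1 || sp.gain > g.gain) g = sp;//keep the larger gain
        }
    return global;
}
#endif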
void ExactUpdater::find_split(int level, const SparseColumns &columns, const Tree &tree, const InsStat &stats,
                              SyncArray<SplitPoint> &sp) {
    int n_max_nodes_in_level = static_cast<int>(pow(2, level));
    int nid_offset = static_cast<int>(pow(2, level) - 1);
    int n_column = columns.n_column;
    int n_partition = n_column * n_max_nodes_in_level;
    int nnz = columns.nnz;
    int n_block = std::min((nnz / n_column - 1) / 256 + 1, 32 * 56);

    LOG(TRACE) << "start finding split";

    //find the best split locally
    {
        using namespace thrust;
        SyncArray<int> fvid2pid(nnz);

        {
            TIMED_SCOPE(timerObj, "fvid2pid");
            //input
            const int *nid_data = stats.nid.device_data();
            const int *iid_data = columns.csc_row_ind.device_data();

            LOG(TRACE) << "after using v_stats and columns";
            //output
            int *fvid2pid_data = fvid2pid.device_data();
            device_loop_2d(
                    n_column, columns.csc_col_ptr.device_data(),
                    [=]__device__(int col_id, int fvid) {
                        //feature value id -> instance id -> node id
                        int nid = nid_data[iid_data[fvid]];
                        int pid;
                        //if this node is a leaf node, move the feature value to the end
                        if (nid < nid_offset) pid = INT_MAX;//todo negative
                        else pid = (nid - nid_offset) * n_column + col_id;
                        fvid2pid_data[fvid] = pid;
                    },
                    n_block);
            LOG(DEBUG) << "fvid2pid " << fvid2pid;
        }

        //gather g/h pairs and do prefix sum
        int n_split;
        SyncArray<GHPair> gh_prefix_sum;
        SyncArray<GHPair> missing_gh(n_partition);
        SyncArray<int> rle_pid;
        SyncArray<float_type> rle_fval;
        {
            //get the feature value id mapping for partitions, new -> old
            SyncArray<int> fvid_new2old(nnz);
            {
                TIMED_SCOPE(timerObj, "fvid_new2old");
                sequence(cuda::par, fvid_new2old.device_data(), fvid_new2old.device_end(), 0);
                stable_sort_by_key(
                        cuda::par, fvid2pid.device_data(), fvid2pid.device_end(),
                        fvid_new2old.device_data(),
                        thrust::less<int>());
                LOG(DEBUG) << "sorted fvid2pid " << fvid2pid;
                LOG(DEBUG) << "fvid_new2old " << fvid_new2old;
            }
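            //A worked example of the RLE compression below (illustrative only,
            //assuming feature values are pre-sorted within each column): after the
            //stable sort, the (pid, fval) keys of one partition might be
            //    (0, 1.2) (0, 0.5) (0, 0.5) (1, 0.9)
            //with gh values g1, g2, g3, g4. reduce_by_key collapses runs of equal
            //keys, producing
            //    keys   : (0, 1.2) (0, 0.5) (1, 0.9)
            //    values : g1       g2+g3    g4
            //i.e. one entry per distinct feature value per partition, carrying the
            //summed gh of all instances sharing that value; inclusive_scan_by_key
            //then turns these per-value sums into per-partition prefix sums, so
            //entry i accumulates the gh of all values preceding it in its partition
            //(the candidate right child under the sorted order).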
            //do prefix sum
            {
                TIMED_SCOPE(timerObj, "do prefix sum");
                SyncArray<GHPair> rle_gh(nnz);
                SyncArray<int_float> rle_key(nnz);
                //the same feature value in the same partition has the same key
                auto key_iter = make_zip_iterator(
                        make_tuple(
                                fvid2pid.device_data(),
                                make_permutation_iterator(
                                        columns.csc_val.device_data(),
                                        fvid_new2old.device_data())));//use fvid_new2old to access csc_val
                //apply RLE compression
                n_split = reduce_by_key(
                        cuda::par,
                        key_iter, key_iter + nnz,
                        make_permutation_iterator(              //ins id -> gh pair
                                stats.gh_pair.device_data(),
                                make_permutation_iterator(      //old fvid -> ins id
                                        columns.csc_row_ind.device_data(),
                                        fvid_new2old.device_data())), //new fvid -> old fvid
                        rle_key.device_data(),
                        rle_gh.device_data()
                ).first - rle_key.device_data();
                gh_prefix_sum.resize(n_split);
                rle_pid.resize(n_split);
                rle_fval.resize(n_split);
                const auto rle_gh_data = rle_gh.device_data();
                const auto rle_key_data = rle_key.device_data();
                auto gh_prefix_sum_data = gh_prefix_sum.device_data();
                auto rle_pid_data = rle_pid.device_data();
                auto rle_fval_data = rle_fval.device_data();
                device_loop(n_split, [=]__device__(int i) {
                    gh_prefix_sum_data[i] = rle_gh_data[i];
                    rle_pid_data[i] = get<0>(rle_key_data[i]);
                    rle_fval_data[i] = get<1>(rle_key_data[i]);
                });

                inclusive_scan_by_key(
                        cuda::par,
                        rle_pid.device_data(), rle_pid.device_end(),
                        gh_prefix_sum.device_data(),
                        gh_prefix_sum.device_data());
//                LOG(DEBUG) << "gh prefix sum = " << gh_prefix_sum;
                LOG(DEBUG) << "reduced pid = " << rle_pid;
                LOG(DEBUG) << "reduced fval = " << rle_fval;
            }

            //calculate the missing-value gh of each partition
            {
                TIMED_SCOPE(timerObj, "calculate missing value");
                SyncArray<int> pid_ptr(n_partition + 1);
                counting_iterator<int> search_begin(0);
                upper_bound(cuda::par, rle_pid.device_data(), rle_pid.device_end(), search_begin,
                            search_begin + n_partition, pid_ptr.device_data() + 1);
                LOG(DEBUG) << "pid_ptr = " << pid_ptr;

                auto pid_ptr_data = pid_ptr.device_data();
                auto rle_pid_data = rle_pid.device_data();
                auto rle_fval_data = rle_fval.device_data();
                float_type rt_eps = this->rt_eps;
                device_loop(n_split, [=]__device__(int i) {
                    int pid = rle_pid_data[i];
                    if (pid == INT_MAX) return;
                    float_type f = rle_fval_data[i];
                    if ((pid_ptr_data[pid + 1] - 1) == i)//the last RLE
                        rle_fval_data[i] = (f - fabsf(rle_fval_data[pid_ptr_data[pid]]) - rt_eps);
                    else
                        //FIXME read/write collision
                        rle_fval_data[i] = (f + rle_fval_data[i + 1]) * 0.5f;
                });

                const auto gh_prefix_sum_data = gh_prefix_sum.device_data();
                const auto node_data = tree.nodes.device_data();
                auto missing_gh_data = missing_gh.device_data();
                device_loop(n_partition, [=]__device__(int pid) {
                    int nid = pid / n_column + nid_offset;
                    if (pid_ptr_data[pid + 1] != pid_ptr_data[pid])
                        missing_gh_data[pid] =
                                node_data[nid].sum_gh_pair - gh_prefix_sum_data[pid_ptr_data[pid + 1] - 1];
                });
//                LOG(DEBUG) << "missing gh = " << missing_gh;
            }
        }
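        //The gain evaluated below is the standard gradient-boosting split gain
        //(as in XGBoost, up to the constant 1/2 factor and the gamma penalty):
        //    gain = G_L^2/(H_L + lambda) + G_R^2/(H_R + lambda) - G^2/(H + lambda)
        //where (G_L, H_L) and (G_R, H_R) are the summed gradients/hessians of the
        //left and right children and G = G_L + G_R, H = H_L + H_R. For example,
        //with G_L = 2, H_L = 1, G_R = -1, H_R = 1 and lambda = 1:
        //    gain = 4/2 + 1/2 - 1/3 = 13/6 ~ 2.17 > 0,
        //so this split reduces the regularized loss. compute_gain additionally
        //requires both children to have hessian sums of at least min_child_weight.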
        //calculate the gain of each split
        SyncArray<float_type> gain(n_split);
        SyncArray<bool> default_right(n_split);
        {
            TIMED_SCOPE(timerObj, "calculate gain");
            auto compute_gain = []__device__(GHPair father, GHPair lch, GHPair rch, float_type min_child_weight,
                                             float_type lambda) -> float_type {
                if (lch.h >= min_child_weight && rch.h >= min_child_weight)
                    return (lch.g * lch.g) / (lch.h + lambda) + (rch.g * rch.g) / (rch.h + lambda) -
                           (father.g * father.g) / (father.h + lambda);
                else
                    return 0;
            };

            int *fvid2pid_data = fvid2pid.device_data();
            const Tree::TreeNode *nodes_data = tree.nodes.device_data();
            GHPair *gh_prefix_sum_data = gh_prefix_sum.device_data();
            float_type *gain_data = gain.device_data();
            bool *default_right_data = default_right.device_data();
            const auto rle_pid_data = rle_pid.device_data();
            const auto missing_gh_data = missing_gh.device_data();
            auto rle_fval_data = rle_fval.device_data();
            //local copies for the lambda expression
            float_type mcw = min_child_weight;
            float_type l = lambda;
            device_loop(n_split, [=]__device__(int i) {
                int pid = rle_pid_data[i];
                int nid0 = pid / n_column;
                int nid = nid0 + nid_offset;
                if (pid == INT_MAX) return;
                GHPair father_gh = nodes_data[nid].sum_gh_pair;
                GHPair p_missing_gh = missing_gh_data[pid];
                GHPair rch_gh = gh_prefix_sum_data[i];
                float_type max_gain = compute_gain(father_gh, father_gh - rch_gh, rch_gh, mcw, l);
                if (p_missing_gh.h > 1) {
                    rch_gh = rch_gh + p_missing_gh;
                    float_type temp_gain = compute_gain(father_gh, father_gh - rch_gh, rch_gh, mcw, l);
                    if (temp_gain > 0 && temp_gain - max_gain > 0.1) {
                        max_gain = temp_gain;
                        default_right_data[i] = true;
                    }
                }
                gain_data[i] = max_gain;
            });
            LOG(DEBUG) << "gain = " << gain;
        }

        //get the best gain and its index for each node
        SyncArray<int_float> best_idx_gain(n_max_nodes_in_level);
        int n_nodes_in_level;
        {
            TIMED_SCOPE(timerObj, "get best gain");
            auto arg_max = []__device__(const int_float &a, const int_float &b) {
                if (get<1>(a) == get<1>(b))
                    return get<0>(a) < get<0>(b) ? a : b;
                else
                    return get<1>(a) > get<1>(b) ? a : b;
            };
            auto in_same_node = [=]__device__(const int a, const int b) {
                return (a / n_column) == (b / n_column);
            };

            //reduce to get the best split of each node in this level
            SyncArray<int> key_test(n_max_nodes_in_level);
            n_nodes_in_level = reduce_by_key(
                    cuda::par,
                    rle_pid.device_data(), rle_pid.device_end(),
                    make_zip_iterator(make_tuple(counting_iterator<int>(0), gain.device_data())),
                    key_test.device_data(),//make_discard_iterator(),
                    best_idx_gain.device_data(),
                    in_same_node,
                    arg_max).second - best_idx_gain.device_data();

            LOG(DEBUG) << "#nodes in level = " << n_nodes_in_level;
            LOG(DEBUG) << "best pid = " << key_test;
            LOG(DEBUG) << "best idx & gain = " << best_idx_gain;
        }
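        //Note on the reduction above: in_same_node compares partition ids by
        //pid / n_column, so all candidate splits of the same tree node form one
        //group regardless of which feature they come from; arg_max keeps the
        //candidate with the largest gain and breaks ties toward the smaller
        //index, which makes the chosen split deterministic across runs.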
        //get split points
        const int_float *best_idx_gain_data = best_idx_gain.device_data();
        const auto rle_pid_data = rle_pid.device_data();
        GHPair *gh_prefix_sum_data = gh_prefix_sum.device_data();
        const auto rle_fval_data = rle_fval.device_data();
        const auto missing_gh_data = missing_gh.device_data();
        bool *default_right_data = default_right.device_data();

        sp.resize(n_nodes_in_level);
        auto sp_data = sp.device_data();

        int column_offset = columns.column_offset;
        device_loop(n_nodes_in_level, [=]__device__(int i) {
            int_float bst = best_idx_gain_data[i];
            float_type best_split_gain = get<1>(bst);
            int split_index = get<0>(bst);
            int pid = rle_pid_data[split_index];
            sp_data[i].split_fea_id = (pid == INT_MAX) ? -1 : (pid % n_column) + column_offset;
            sp_data[i].nid = (pid == INT_MAX) ? -1 : (pid / n_column + nid_offset);
            sp_data[i].gain = best_split_gain;
            if (pid != INT_MAX) {//avoid split_index out of bound
                sp_data[i].fval = rle_fval_data[split_index];
                sp_data[i].fea_missing_gh = missing_gh_data[pid];
                sp_data[i].default_right = default_right_data[split_index];
                sp_data[i].rch_sum_gh = gh_prefix_sum_data[split_index];
            }
        });
    }

    LOG(DEBUG) << "split points (gain/fea_id/nid): " << sp;
}

void ExactUpdater::update_tree(Tree &tree, const SyncArray<SplitPoint> &sp) {
    auto sp_data = sp.device_data();
    int n_nodes_in_level = sp.size();

    Tree::TreeNode *nodes_data = tree.nodes.device_data();
    float_type rt_eps = this->rt_eps;
    float_type lambda = this->lambda;

    device_loop(n_nodes_in_level, [=]__device__(int i) {
        float_type best_split_gain = sp_data[i].gain;
        if (best_split_gain > rt_eps) {
            //do split
            if (sp_data[i].nid == -1) return;
            int nid = sp_data[i].nid;
            Tree::TreeNode &node = nodes_data[nid];
            node.gain = best_split_gain;

            Tree::TreeNode &lch = nodes_data[node.lch_index];//left child
            Tree::TreeNode &rch = nodes_data[node.rch_index];//right child
            lch.is_valid = true;
            rch.is_valid = true;
            node.split_feature_id = sp_data[i].split_fea_id;
            GHPair p_missing_gh = sp_data[i].fea_missing_gh;
            //todo process begin
            node.split_value = sp_data[i].fval;
            rch.sum_gh_pair = sp_data[i].rch_sum_gh;
            if (sp_data[i].default_right) {
                rch.sum_gh_pair = rch.sum_gh_pair + p_missing_gh;
                node.default_right = true;
            }
            lch.sum_gh_pair = node.sum_gh_pair - rch.sum_gh_pair;
            lch.calc_weight(lambda);
            rch.calc_weight(lambda);
        } else {
            //make the node a leaf
            if (sp_data[i].nid == -1) return;
            int nid = sp_data[i].nid;
            Tree::TreeNode &node = nodes_data[nid];
            node.is_leaf = true;
            nodes_data[node.lch_index].is_valid = false;
            nodes_data[node.rch_index].is_valid = false;
        }
    });
}
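// A minimal sketch of the leaf weight that calc_weight() above is assumed to
// compute: the regularized optimum w* = -G / (H + lambda) of the second-order
// objective g*w + (h + lambda)*w^2 / 2. optimal_weight is a hypothetical name,
// guarded by the hypothetical SIMFL_EXAMPLE macro.
#ifdef SIMFL_EXAMPLE
#include <cassert>
#include <cmath>
inline float optimal_weight(float sum_g, float sum_h, float lambda) {
    return -sum_g / (sum_h + lambda);
}
inline void weight_example() {
    //with G = 2, H = 3 and lambda = 1, the optimal leaf weight is -0.5
    assert(std::fabs(optimal_weight(2.0f, 3.0f, 1.0f) - (-0.5f)) < 1e-6f);
}
#endif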
bool ExactUpdater::reset_ins2node_id(InsStat &stats, const Tree &tree, const SparseColumns &columns) {
    SyncArray<bool> has_splittable(1);
    //set the new node id of each instance
    {
        TIMED_SCOPE(timerObj, "get new node id");
        int *nid_data = stats.nid.device_data();
        const int *iid_data = columns.csc_row_ind.device_data();
        const Tree::TreeNode *nodes_data = tree.nodes.device_data();
        const int *col_ptr_data = columns.csc_col_ptr.device_data();
        const float_type *f_val_data = columns.csc_val.device_data();
        has_splittable.host_data()[0] = false;
        bool *h_s_data = has_splittable.device_data();
        int column_offset = columns.column_offset;

        int n_column = columns.n_column;
        int nnz = columns.nnz;
        int n_block = std::min((nnz / n_column - 1) / 256 + 1, 32 * 56);

        LOG(TRACE) << "update ins2node id for each fval";
        device_loop_2d(n_column, col_ptr_data,
                       [=]__device__(int col_id, int fvid) {
                           //feature value id -> instance id
                           int iid = iid_data[fvid];
                           //instance id -> node id
                           int nid = nid_data[iid];
                           //node id -> node
                           const Tree::TreeNode &node = nodes_data[nid];
                           //if the node splits on this feature
                           if (node.splittable() && node.split_feature_id == col_id + column_offset) {
                               h_s_data[0] = true;
                               if (f_val_data[fvid] < node.split_value)
                                   //goes to the left child
                                   nid_data[iid] = node.lch_index;
                               else
                                   //goes to the right child
                                   nid_data[iid] = node.rch_index;
                           }
                       }, n_block);
    }
    LOG(DEBUG) << "new nid = " << stats.nid;
//    LOG(DEBUG) << v_trees_gpu[cur_device_id].nodes;
    return has_splittable.host_data()[0];
}

std::ostream &operator<<(std::ostream &os, const int_float &rhs) {
    os << string_format("%d/%f", thrust::get<0>(rhs), thrust::get<1>(rhs));
    return os;
}
--------------------------------------------------------------------------------
/src/thundergbm/util/common.cpp:
--------------------------------------------------------------------------------
//
// Created by jiashuai on 18-1-16.
//
#include "thundergbm/thundergbm.h"
INITIALIZE_EASYLOGGINGPP
--------------------------------------------------------------------------------