├── .gitignore ├── .gitmodules ├── CMakeLists.txt ├── README.md ├── dataset ├── a9a_test ├── a9a_train0 └── a9a_train1 ├── include └── thundergbm │ ├── clion_cuda.h │ ├── config.h.in │ ├── csc2r_transform.h │ ├── dataset.h │ ├── gpu_lsh.h │ ├── hist_cut.h │ ├── ins_stat.h │ ├── param.h │ ├── quantile_sketch.h │ ├── sparse_columns.h │ ├── syncarray.h │ ├── syncmem.h │ ├── thundergbm.h │ ├── tree.h │ ├── updater │ ├── exact_updater.h │ └── hist_updater.h │ └── util │ ├── common.h │ ├── cub_wrapper.h │ ├── device_lambda.cuh │ ├── log.h │ └── multi_device.h └── src ├── test ├── CMakeLists.txt ├── test_exact_updater.cu ├── test_main.cpp └── test_unifiedmem.cu └── thundergbm ├── CMakeLists.txt ├── csc2r_transform.cu ├── dataset.cu ├── gpu_lsh.cu ├── hist_cut.cpp ├── ins_stat.cu ├── quantile_sketch.cpp ├── sparse_columns.cu ├── syncmem.cpp ├── tree.cpp ├── updater ├── exact_updater.cu └── hist_updater.cu └── util ├── common.cpp └── log.cpp /.gitignore: -------------------------------------------------------------------------------- 1 | .* 2 | !.gitignore 3 | !.gitmodules 4 | !.travis.yml 5 | *build* 6 | !dataset/*.sh 7 | !dataset/test_dataset.txt 8 | html 9 | latex 10 | logs 11 | *.pyc 12 | 13 | nccl* 14 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "src/test/googletest"] 2 | path = src/test/googletest 3 | url = https://github.com/google/googletest.git 4 | [submodule "cub"] 5 | path = cub 6 | url = git@github.com:NVlabs/cub.git 7 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.1) 2 | project(thundergbm) 3 | 4 | find_package(CUDA REQUIRED QUIET) 5 | find_package(OpenMP REQUIRED QUIET) 6 | 7 | if (NOT CMAKE_BUILD_TYPE) 8 | set(CMAKE_BUILD_TYPE Release) 9 | endif () 10 | 11 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -std=c++11 -gencode arch=compute_60,code=sm_60 --expt-extended-lambda --default-stream per-thread") 12 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}" --source-in-ptx -lineinfo) 13 | 14 | if (CMAKE_VERSION VERSION_LESS "3.1") 15 | add_compile_options("-std=c++11") 16 | else () 17 | set(CMAKE_CXX_STANDARD 11) 18 | endif () 19 | 20 | if (OPENMP_FOUND) 21 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") 22 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") 23 | endif () 24 | 25 | add_definitions("-DELPP_FEATURE_PERFORMANCE_TRACKING") 26 | add_definitions("-DELPP_THREAD_SAFE") 27 | set(COMMON_INCLUDES ${PROJECT_SOURCE_DIR}/include ${CMAKE_CURRENT_BINARY_DIR} ${PROJECT_SOURCE_DIR}/cub) 28 | include_directories(${COMMON_INCLUDES}) 29 | 30 | add_subdirectory(src/thundergbm) 31 | add_subdirectory(src/test) 32 | 33 | # configuration file 34 | set(DATASET_DIR ${PROJECT_SOURCE_DIR}/dataset/) 35 | configure_file(include/thundergbm/config.h.in config.h) 36 | 37 | add_custom_target(runtest ${PROJECT_NAME}-test --gtest_filter='UpdaterTest*') 38 | add_custom_target(runtest-performance ${PROJECT_NAME}-test --gtest_filter='PerformanceTest*') 39 | 40 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | This is the code of paper [Practical Federated Gradient Boosting Decision Trees](https://arxiv.org/pdf/1911.04206.pdf). 
The implementation is based on a previous version of [ThunderGBM](https://github.com/Xtra-Computing/thundergbm.git). Only Linux-based operating systems are supported. 2 | 3 | 4 | # Installation 5 | 6 | ## Prerequisites 7 | * CMake 8 | * CUDA 9 | 10 | ## Build 11 | ``` 12 | git submodule init 13 | git submodule update 14 | mkdir build 15 | cd build 16 | cmake .. 17 | make -j 18 | ``` 19 | 20 | 21 | # Usage 22 | 23 | SimFL currently only works for binary classification tasks with labels 0 and 1, and it requires GPUs. 24 | 25 | ## Parameters 26 | 27 | ``` 28 | * -p: int, number of parties (default: 2) 29 | * -t: int, number of LSH tables (default: 40) 30 | * -b: int, number of buckets (default: 500) 31 | * -r: float, r value of the LSH function (default: 4.0) 32 | * -s: int, initial seed for LSH 33 | * -f: string, path to the dataset file 34 | * -d: int, the maximum dimension of the datasets 35 | ``` 36 | 37 | ## Datasets 38 | 39 | Please name each party's local training dataset in the format name+'_train'+party_id, e.g., `a9a_train0`, `a9a_train1`. Name the test dataset in the format name+'_test', e.g., `a9a_test`. 40 | 41 | ## Sample command 42 | Run the following under the `build` directory: 43 | 44 | ``` 45 | ./src/test/thundergbm-test -p 2 -t 30 -b 500 -r 4 -s -1 -f ../dataset/a9a/a9a -d 123 -n 50 -e 8 46 | ``` 47 | 48 | -------------------------------------------------------------------------------- /include/thundergbm/clion_cuda.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by jiashuai on 17-9-14. 3 | // 4 | 5 | #ifndef THUNDERSVM_CLION_CUDA_H 6 | #define THUNDERSVM_CLION_CUDA_H 7 | 8 | #ifdef __JETBRAINS_IDE__ 9 | 10 | #include "math.h" 11 | 12 | #define __CUDACC__ 1 13 | #define __host__ 14 | #define __device__ 15 | #define __global__ 16 | #define __noinline__ 17 | #define __forceinline__ 18 | #define __shared__ 19 | #define __constant__ 20 | #define __managed__ 21 | #define __restrict__ 22 | 23 | // CUDA Synchronization 24 | inline void __syncthreads() {}; 25 | 26 | inline void __threadfence_block() {}; 27 | 28 | inline void __threadfence() {}; 29 | 30 | inline void __threadfence_system() {}; 31 | 32 | inline int __syncthreads_count(int predicate) { return predicate; }; 33 | 34 | inline int __syncthreads_and(int predicate) { return predicate; }; 35 | 36 | inline int __syncthreads_or(int predicate) { return predicate; }; 37 | 38 | template <typename T> 39 | inline T __clz(const T val) { return val; } 40 | 41 | template <typename T> 42 | inline T __ldg(const T *address) { return *address; }; 43 | // CUDA TYPES 44 | typedef unsigned char uchar; 45 | typedef unsigned short ushort; 46 | typedef unsigned int uint; 47 | typedef unsigned long ulong; 48 | typedef unsigned long long ulonglong; 49 | typedef long long longlong; 50 | 51 | typedef struct uchar1 { 52 | uchar x; 53 | } uchar1; 54 | 55 | typedef struct uchar2 { 56 | uchar x; 57 | uchar y; 58 | } uchar2; 59 | 60 | typedef struct uchar3 { 61 | uchar x; 62 | uchar y; 63 | uchar z; 64 | } uchar3; 65 | 66 | typedef struct uchar4 { 67 | uchar x; 68 | uchar y; 69 | uchar z; 70 | uchar w; 71 | } uchar4; 72 | 73 | typedef struct char1 { 74 | char x; 75 | } char1; 76 | 77 | typedef struct char2 { 78 | char x; 79 | char y; 80 | } char2; 81 | 82 | typedef struct char3 { 83 | char x; 84 | char y; 85 | char z; 86 | } char3; 87 | 88 | typedef struct char4 { 89 | char x; 90 | char y; 91 | char z; 92 | char w; 93 | } char4; 94 | 95 | typedef struct ushort1 { 96 | ushort x; 97 | } ushort1; 98 | 99 | typedef struct ushort2 { 100 | ushort x; 101 |
ushort y; 102 | } ushort2; 103 | 104 | typedef struct ushort3 { 105 | ushort x; 106 | ushort y; 107 | ushort z; 108 | } ushort3; 109 | 110 | typedef struct ushort4 { 111 | ushort x; 112 | ushort y; 113 | ushort z; 114 | ushort w; 115 | } ushort4; 116 | 117 | typedef struct short1 { 118 | short x; 119 | } short1; 120 | 121 | typedef struct short2 { 122 | short x; 123 | short y; 124 | } short2; 125 | 126 | typedef struct short3 { 127 | short x; 128 | short y; 129 | short z; 130 | } short3; 131 | 132 | typedef struct short4 { 133 | short x; 134 | short y; 135 | short z; 136 | short w; 137 | } short4; 138 | 139 | typedef struct uint1 { 140 | uint x; 141 | } uint1; 142 | 143 | typedef struct uint2 { 144 | uint x; 145 | uint y; 146 | } uint2; 147 | 148 | typedef struct uint3 { 149 | uint x; 150 | uint y; 151 | uint z; 152 | } uint3; 153 | 154 | typedef struct uint4 { 155 | uint x; 156 | uint y; 157 | uint z; 158 | uint w; 159 | } uint4; 160 | 161 | typedef struct int1 { 162 | int x; 163 | } int1; 164 | 165 | typedef struct int2 { 166 | int x; 167 | int y; 168 | } int2; 169 | 170 | typedef struct int3 { 171 | int x; 172 | int y; 173 | int z; 174 | } int3; 175 | 176 | typedef struct int4 { 177 | int x; 178 | int y; 179 | int z; 180 | int w; 181 | } int4; 182 | 183 | typedef struct ulong1 { 184 | ulong x; 185 | } ulong1; 186 | 187 | typedef struct ulong2 { 188 | ulong x; 189 | ulong y; 190 | } ulong2; 191 | 192 | typedef struct ulong3 { 193 | ulong x; 194 | ulong y; 195 | ulong z; 196 | } ulong3; 197 | 198 | typedef struct ulong4 { 199 | ulong x; 200 | ulong y; 201 | ulong z; 202 | ulong w; 203 | } ulong4; 204 | 205 | typedef struct long1 { 206 | long x; 207 | } long1; 208 | 209 | typedef struct long2 { 210 | long x; 211 | long y; 212 | } long2; 213 | 214 | typedef struct long3 { 215 | long x; 216 | long y; 217 | long z; 218 | } long3; 219 | 220 | typedef struct long4 { 221 | long x; 222 | long y; 223 | long z; 224 | long w; 225 | } long4; 226 | 227 | typedef struct ulonglong1 { 228 | ulonglong x; 229 | } ulonglong1; 230 | 231 | typedef struct ulonglong2 { 232 | ulonglong x; 233 | ulonglong y; 234 | } ulonglong2; 235 | 236 | typedef struct ulonglong3 { 237 | ulonglong x; 238 | ulonglong y; 239 | ulonglong z; 240 | } ulonglong3; 241 | 242 | typedef struct ulonglong4 { 243 | ulonglong x; 244 | ulonglong y; 245 | ulonglong z; 246 | ulonglong w; 247 | } ulonglong4; 248 | 249 | typedef struct longlong1 { 250 | longlong x; 251 | } longlong1; 252 | 253 | typedef struct longlong2 { 254 | longlong x; 255 | longlong y; 256 | } longlong2; 257 | 258 | typedef struct float1 { 259 | float x; 260 | } float1; 261 | 262 | typedef struct float2 { 263 | float x; 264 | float y; 265 | } float2; 266 | 267 | typedef struct float3 { 268 | float x; 269 | float y; 270 | float z; 271 | } float3; 272 | 273 | typedef struct float4 { 274 | float x; 275 | float y; 276 | float z; 277 | float w; 278 | } float4; 279 | 280 | typedef struct double1 { 281 | double x; 282 | } double1; 283 | 284 | typedef struct double2 { 285 | double x; 286 | double y; 287 | } double2; 288 | 289 | typedef uint3 dim3; 290 | 291 | extern dim3 gridDim; 292 | extern uint3 blockIdx; 293 | extern dim3 blockDim; 294 | extern uint3 threadIdx; 295 | extern int warpsize; 296 | #endif 297 | #endif //THUNDERSVM_CLION_CUDA_H 298 | -------------------------------------------------------------------------------- /include/thundergbm/config.h.in: -------------------------------------------------------------------------------- 1 | #cmakedefine DATASET_DIR 
"@DATASET_DIR@" 2 | -------------------------------------------------------------------------------- /include/thundergbm/csc2r_transform.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by qinbin on 21/8/18. 3 | // 4 | 5 | 6 | #ifndef THUNDERGBM_CSC2R_TRANSFORM_H 7 | #define THUNDERGBM_CSC2R_TRANSFORM_H 8 | 9 | #include "thundergbm.h" 10 | #include "syncarray.h" 11 | #include "cusparse.h" 12 | #include "sparse_columns.h" 13 | //change csc 2 csr or csr 2 csc 14 | 15 | class Csc2r{ 16 | public: 17 | SyncArray csc_val; 18 | SyncArray csc_row_ind; 19 | SyncArray csc_col_ptr; 20 | int nnz; 21 | //for gpu 22 | void from_csr(float_type* val, int* csr_col_ind, int* csr_row_ptr, int n_instances, int n_column, int nnz); 23 | void get_cut_points_evenly(int nBin, vector& bin_id, const vector& min_fea, const vector& max_fea); 24 | // void init_bin_id_csr(const vector>> &v_columns, int n_instances); 25 | }; 26 | 27 | #endif //THUNDERGBM_CSC2R_TRANSFORM_H 28 | -------------------------------------------------------------------------------- /include/thundergbm/dataset.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by jiashuai on 18-1-17. 3 | // 4 | 5 | #ifndef THUNDERGBM_DATASET_H 6 | #define THUNDERGBM_DATASET_H 7 | 8 | #include "thundergbm.h" 9 | #include "syncarray.h" 10 | 11 | class DataSet { 12 | public: 13 | ///one feature value and corresponding index 14 | struct node { 15 | node(int index, float_type value) : index(index), value(value) {} 16 | 17 | int index; 18 | float_type value; 19 | }; 20 | 21 | ///two-dimension node vector 22 | typedef vector> node2d; 23 | 24 | ///load dataset from file 25 | void load_from_file(string file_name); 26 | 27 | void load_from_file_csr(string file_name); 28 | 29 | void load_from_file_two_dimension(string file_name); 30 | 31 | const node2d &instances() const; 32 | 33 | size_t n_features() const; 34 | 35 | size_t n_instances() const; 36 | 37 | const vector &y() const; 38 | vector> features; 39 | vector> line_num; 40 | 41 | //to do 42 | vector min_fea; 43 | vector max_fea; 44 | 45 | //suppose know the max dimension of all datasets 46 | int max_dimension = 123; 47 | 48 | void compression(); 49 | private: 50 | ///labels of instances 51 | vector y_; 52 | node2d instances_; 53 | size_t n_features_; 54 | }; 55 | 56 | 57 | #endif //THUNDERGBM_DATASET_H 58 | -------------------------------------------------------------------------------- /include/thundergbm/gpu_lsh.h: -------------------------------------------------------------------------------- 1 | #ifndef THUNDERGBM_GPU_LSH_H 2 | #define THUNDERGBM_GPU_LSH_H 3 | 4 | 5 | ////////////////////////////////////////////////////////////////////////////// 6 | /// Copyright (C) 2014 Gefu Tang . All Rights Reserved. 7 | /// 8 | /// This file is part of LSHBOX. 9 | /// 10 | /// LSHBOX is free software: you can redistribute it and/or modify it under 11 | /// the terms of the GNU General Public License as published by the Free 12 | /// Software Foundation, either version 3 of the License, or(at your option) 13 | /// any later version. 14 | /// 15 | /// LSHBOX is distributed in the hope that it will be useful, but WITHOUT 16 | /// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 17 | /// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 18 | /// more details. 19 | /// 20 | /// You should have received a copy of the GNU General Public License along 21 | /// with LSHBOX. 
If not, see <http://www.gnu.org/licenses/>. 22 | /// 23 | /// @version 0.1 24 | /// @author Gefu Tang & Zhifeng Xiao 25 | /// @date 2014.6.30 26 | ////////////////////////////////////////////////////////////////////////////// 27 | 28 | /** 29 | * @file psdlsh.h 30 | * 31 | * @brief Locality-Sensitive Hashing Scheme Based on p-Stable Distributions. 32 | */ 33 | //#pragma once 34 | 35 | #include "thundergbm.h" 36 | #include "cusparse.h" 37 | #include 38 | #include 39 | #include 40 | #include 41 | #include "syncarray.h" 42 | /** 43 | * Locality-Sensitive Hashing Scheme Based on p-Stable Distributions. 44 | * 45 | * 46 | * For more information on p-stable distribution based LSH, see the following reference. 47 | * 48 | * Mayur Datar, Nicole Immorlica, Piotr Indyk, Vahab S. Mirrokni, 49 | * Locality-sensitive hashing scheme based on p-stable distributions, 50 | * Proceedings of the twentieth annual symposium on Computational geometry, June 51 | * 08-11, 2004, Brooklyn, New York, USA. 52 | */ 53 | //template 54 | class psdLsh 55 | { 56 | public: 57 | struct Parameter 58 | { 59 | //number of buckets in a hash table 60 | unsigned n_bucket; 61 | //number of hash tables 62 | unsigned n_table; 63 | //number of dimensions 64 | unsigned n_dimension; 65 | //p_norm = 1: Cauchy distribution; = 2: Gaussian distribution 66 | unsigned p_norm; 67 | //the range r 68 | float r; 69 | //the number of parties 70 | unsigned n_comp; 71 | int seed; 72 | }; 73 | psdLsh() {} 74 | psdLsh(const Parameter &param_) 75 | { 76 | reset(param_); 77 | } 78 | ~psdLsh() {} 79 | 80 | 81 | void init(); 82 | /** 83 | * Reset the parameter setting 84 | * 85 | * @param param_ An instance of psdLsh::Parameter, which contains 86 | * the necessary parameters 87 | */ 88 | void reset(const Parameter &param_); 89 | /** 90 | * Hash the dataset. 91 | * 92 | * @param data An instance of Matrix; it is the search dataset. 93 | */ 94 | void hash(int n_instances, int n_features, int nnz, int key_offset, 95 | SyncArray &csr_val, SyncArray &csr_row_ptr, SyncArray &csr_col_ind, 96 | SyncArray &hash_values, int cid); 97 | 98 | // void query(int n_instances, int n_features, int nnz, 99 | // SyncArray &csr_val, SyncArray &csr_row_ptr, SyncArray &csr_col_ind, 100 | // vector> &buckets); 101 | 102 | /** 103 | * Insert a vector to the index. 104 | * 105 | * @param key The sequence number of the vector 106 | * @param domin The pointer to the vector 107 | */ 108 | // void insert(unsigned key, const DATATYPE *domin); 109 | /** 110 | * Query the approximate nearest neighborhoods. 111 | * 112 | * @param domin The pointer to the vector 113 | * @param scanner Top-K scanner, used to scan the approximate nearest neighborhoods 114 | */ 115 | // template 116 | // void query(const DATATYPE *domin, SCANNER &scanner); 117 | /** 118 | * Get the hash value of a vector. 119 | * 120 | * @param k The index of the hash table 121 | * @param domin The pointer to the vector 122 | * @return The hash value 123 | */ 124 | // unsigned getHashVal(unsigned k, const DATATYPE *domin); 125 | /** 126 | * Load the index from a binary file. 127 | * 128 | * @param file The path of the binary file. 129 | */ 130 | // void load(const std::string &file); 131 | /** 132 | * Save the index as a binary file. 133 | * 134 | * @param file The path of the binary file.
135 | */ 136 | // void save(const std::string &file); 137 | //private: 138 | Parameter param; 139 | SyncArray b; 140 | SyncArray a; 141 | 142 | vector v_a; 143 | vector v_b; 144 | //random number vector before distribution transform 145 | vector random_vector; 146 | 147 | 148 | // std::vector v_a; 149 | // std::vector rndBs; 150 | // std::vector > stableArray; 151 | std::vector> > > tables; 152 | // std::vector> > > tables; 153 | }; 154 | 155 | // ------------------------- implementation ------------------------- 156 | 157 | //template 158 | //void lshbox::psdLsh::hash(Matrix &data) 159 | //{ 160 | // for (unsigned i = 0; i != data.getSize(); ++i) 161 | // { 162 | // insert(i, data[i]); 163 | // ++pd; 164 | // } 165 | //} 166 | //template 167 | //void lshbox::psdLsh::insert(unsigned key, const DATATYPE *domin) 168 | //{ 169 | // for(unsigned k = 0; k < param.n_table; k++){ 170 | // unsigned hashVal = getHashVal(k, ) 171 | // } 172 | //// for (unsigned k = 0; k != param.L; ++k) 173 | //// { 174 | //// unsigned hashVal = getHashVal(k, domin); 175 | //// tables[k][hashVal].push_back(key); 176 | //// } 177 | //} 178 | //template 179 | //template 180 | //void lshbox::psdLsh::query(const DATATYPE *domin, SCANNER &scanner) 181 | //{ 182 | // scanner.reset(domin); 183 | // for (unsigned k = 0; k != param.L; ++k) 184 | // { 185 | // unsigned hashVal = getHashVal(k, domin); 186 | // if (tables[k].find(hashVal) != tables[k].end()) 187 | // { 188 | // for (std::vector::iterator iter = tables[k][hashVal].begin(); iter != tables[k][hashVal].end(); ++iter) 189 | // { 190 | // scanner(*iter); 191 | // } 192 | // } 193 | // } 194 | // scanner.topk().genTopk(); 195 | //} 196 | //template 197 | //unsigned lshbox::psdLsh::getHashVal(unsigned k, const DATATYPE *domin) 198 | //{ 199 | // float sum(0); 200 | // for (unsigned i = 0; i != param.D; ++i) 201 | // { 202 | // sum += domin[i] * stableArray[k][i]; 203 | // } 204 | // unsigned hashVal = unsigned(std::floor((sum + rndBs[k]) / param.W)) % param.M; 205 | // return hashVal; 206 | //} 207 | //template 208 | //void lshbox::psdLsh::load(const std::string &file) 209 | //{ 210 | // std::ifstream in(file, std::ios::binary); 211 | // in.read((char *)¶m.M, sizeof(unsigned)); 212 | // in.read((char *)¶m.L, sizeof(unsigned)); 213 | // in.read((char *)¶m.D, sizeof(unsigned)); 214 | // in.read((char *)¶m.W, sizeof(float)); 215 | // tables.resize(param.L); 216 | // stableArray.resize(param.L); 217 | // rndBs.resize(param.L); 218 | // in.read((char *)&rndBs[0], sizeof(float) * param.L); 219 | // for (unsigned i = 0; i != param.L; ++i) 220 | // { 221 | // stableArray[i].resize(param.D); 222 | // in.read((char *)&stableArray[i][0], sizeof(float) * param.D); 223 | // unsigned count; 224 | // in.read((char *)&count, sizeof(unsigned)); 225 | // for (unsigned j = 0; j != count; ++j) 226 | // { 227 | // unsigned target; 228 | // in.read((char *)&target, sizeof(unsigned)); 229 | // unsigned length; 230 | // in.read((char *)&length, sizeof(unsigned)); 231 | // tables[i][target].resize(length); 232 | // in.read((char *) & (tables[i][target][0]), sizeof(unsigned) * length); 233 | // } 234 | // } 235 | // in.close(); 236 | //} 237 | //template 238 | //void lshbox::psdLsh::save(const std::string &file) 239 | //{ 240 | // std::ofstream out(file, std::ios::binary); 241 | // out.write((char *)¶m.M, sizeof(unsigned)); 242 | // out.write((char *)¶m.L, sizeof(unsigned)); 243 | // out.write((char *)¶m.D, sizeof(unsigned)); 244 | // out.write((char *)¶m.W, sizeof(float)); 245 | // 
out.write((char *)&rndBs[0], sizeof(float) * param.L); 246 | // for (int i = 0; i != param.L; ++i) 247 | // { 248 | // out.write((char *)&stableArray[i][0], sizeof(float) * param.D); 249 | // unsigned count = unsigned(tables[i].size()); 250 | // out.write((char *)&count, sizeof(unsigned)); 251 | // for (std::map >::iterator iter = tables[i].begin(); iter != tables[i].end(); ++iter) 252 | // { 253 | // unsigned target = iter->first; 254 | // out.write((char *)&target, sizeof(unsigned)); 255 | // unsigned length = unsigned(iter->second.size()); 256 | // out.write((char *)&length, sizeof(unsigned)); 257 | // out.write((char *) & ((iter->second)[0]), sizeof(unsigned) * length); 258 | // } 259 | // } 260 | // out.close(); 261 | //} 262 | 263 | 264 | #endif //THUNDERGBM_GPU_LSH_H 265 | -------------------------------------------------------------------------------- /include/thundergbm/hist_cut.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by qinbin on 2018/5/9. 3 | // 4 | 5 | #ifndef THUNDERGBM_HIST_CUT_H 6 | #define THUNDERGBM_HIST_CUT_H 7 | 8 | #include "thundergbm/thundergbm.h" 9 | #include "thundergbm/dataset.h" 10 | #include "thundergbm/tree.h" 11 | #include "sparse_columns.h" 12 | 13 | class HistCut { 14 | public: 15 | //split_point[i] stores the split points of feature i 16 | //std::vector> split_points; 17 | vector cut_points; 18 | vector row_ptr; 19 | //for gpu 20 | SyncArray cut_points_val; 21 | SyncArray cut_row_ptr; 22 | 23 | HistCut() = default; 24 | 25 | HistCut(const HistCut &cut) { 26 | cut_points = cut.cut_points; 27 | row_ptr = cut.row_ptr; 28 | cut_points_val.copy_from(cut.cut_points_val); 29 | cut_row_ptr.copy_from(cut.cut_row_ptr); 30 | } 31 | 32 | void get_cut_points(SparseColumns &columns, InsStat &stats, int max_num_bins, int n_instances, int n_features); 33 | }; 34 | 35 | //store the g/h of the bins of one feature 36 | class BinStat { 37 | public: 38 | SyncArray gh_pair; 39 | //feature id 40 | int fid; 41 | //number of bins 42 | int numBin; 43 | 44 | BinStat() = default; 45 | 46 | //feature: the pointer to features that need to build hist 47 | //insId: the pointer to instance id of features 48 | void Init(HistCut &cut, InsStat &stats, int pid, float_type *f_val, int n_f_val, int *iid); 49 | //void Init(vector& cut_points, InsStat& stats,SparseColumns& columns, int fid); 50 | }; 51 | 52 | #endif //THUNDERGBM_HIST_CUT_H 53 | -------------------------------------------------------------------------------- /include/thundergbm/ins_stat.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by shijiashuai on 5/7/18. 
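The psdLsh class above only stores the parameters of the scheme; the CUDA kernel that evaluates the hashes lives in src/thundergbm/gpu_lsh.cu, which this dump does not include. As a reference for what a p-stable table computes, here is a minimal CPU sketch of one hash function in the Datar et al. scheme cited above. The dense-vector form and all names are illustrative; the repository hashes CSR rows on the GPU.

```
// Minimal sketch of one p-stable LSH table (Datar et al., 2004):
//   h(x) = floor((a . x + b) / r) mod n_bucket
// where a has i.i.d. Gaussian entries for p_norm == 2 (Cauchy for p_norm == 1)
// and b is uniform in [0, r). Dense CPU version for illustration only.
#include <cmath>
#include <random>
#include <vector>

unsigned psd_hash(const std::vector<float> &x, const std::vector<float> &a,
                  float b, float r, unsigned n_bucket) {
    float dot = 0;
    for (size_t i = 0; i < x.size(); ++i) dot += x[i] * a[i];
    long long m = static_cast<long long>(n_bucket);
    long long q = static_cast<long long>(std::floor((dot + b) / r));
    return static_cast<unsigned>(((q % m) + m) % m);   // non-negative bucket id
}

int main() {
    std::mt19937 gen(42);                                // -s seed
    std::normal_distribution<float> gauss(0.f, 1.f);     // p_norm == 2
    std::uniform_real_distribution<float> uni(0.f, 4.f); // b in [0, r), r = 4.0
    std::vector<float> a(123);                           // n_dimension (a9a: 123)
    for (auto &v : a) v = gauss(gen);
    std::vector<float> x(123, 0.f);
    x[5] = 1.f; x[42] = 1.f;                             // a sparse binary instance
    return (int) psd_hash(x, a, uni(gen), 4.f, 500);     // n_bucket = 500
}
```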
3 | // 4 | 5 | #ifndef THUNDERGBM_INS_STAT_H 6 | #define THUNDERGBM_INS_STAT_H 7 | 8 | 9 | #include "syncarray.h" 10 | 11 | struct GHPair { 12 | float_type g; 13 | float_type h; 14 | 15 | HOST_DEVICE GHPair operator+(const GHPair &rhs) const { 16 | GHPair res; 17 | res.g = this->g + rhs.g; 18 | res.h = this->h + rhs.h; 19 | return res; 20 | } 21 | 22 | HOST_DEVICE const GHPair operator-(const GHPair &rhs) const { 23 | GHPair res; 24 | res.g = this->g - rhs.g; 25 | res.h = this->h - rhs.h; 26 | return res; 27 | } 28 | 29 | HOST_DEVICE GHPair() : g(0), h(0) {}; 30 | 31 | HOST_DEVICE GHPair(float_type v) : g(v), h(v) {}; 32 | 33 | HOST_DEVICE GHPair(float_type g, float_type h) : g(g), h(h) {}; 34 | 35 | friend std::ostream &operator<<(std::ostream &os, 36 | const GHPair &p) { 37 | os << string_format("%f/%f", p.g, p.h); 38 | return os; 39 | } 40 | }; 41 | 42 | class InsStat { 43 | public: 44 | 45 | ///gradient and hessian 46 | SyncArray gh_pair; 47 | ///node id 48 | SyncArray nid; 49 | ///target value 50 | SyncArray y; 51 | ///predict value 52 | SyncArray y_predict; 53 | 54 | int n_instances; 55 | 56 | GHPair sum_gh; 57 | 58 | InsStat() = default; 59 | 60 | explicit InsStat(size_t n_instances) { 61 | resize(n_instances); 62 | } 63 | 64 | void resize(size_t n_instances); 65 | 66 | void updateGH(); 67 | 68 | void updateGH(SyncArray& is_multi); 69 | 70 | void updateGH(SyncArray& is_multi, int numP); 71 | }; 72 | 73 | #endif //THUNDERGBM_INS_STAT_H 74 | -------------------------------------------------------------------------------- /include/thundergbm/param.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by shijiashuai on 5/7/18. 3 | // 4 | 5 | #ifndef THUNDERGBM_PARAM_H 6 | #define THUNDERGBM_PARAM_H 7 | 8 | #include "thundergbm.h" 9 | 10 | struct GBMParam { 11 | int depth; 12 | int n_trees; 13 | float_type min_child_weight; 14 | float_type lambda; 15 | float_type gamma; 16 | float_type rt_eps; 17 | float_type learning_rate; 18 | string path; 19 | string test_path; 20 | bool do_exact = true; 21 | 22 | //for histogram 23 | int max_num_bin = 256; 24 | 25 | int n_device; 26 | }; 27 | #endif //THUNDERGBM_PARAM_H 28 | -------------------------------------------------------------------------------- /include/thundergbm/quantile_sketch.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by qinbin on 2018/5/9. 
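InsStat::updateGH above is implemented in src/thundergbm/ins_stat.cu, which this dump does not include. Below is a minimal sketch of what the per-instance update looks like, assuming the squared loss implied by the RMSE evaluation in the commented-out test driver near the end of this dump; the is_multi overloads presumably reweight g/h for bundled instances, which is not modeled here.

```
// Sketch: per-instance gradient/Hessian for squared loss:
//   g_i = y_predict_i - y_i,   h_i = 1.
// This mirrors the GHPair fields above; the repository's actual updateGH
// (ins_stat.cu, not shown) may differ, e.g. in its weighted overloads.
#include <vector>

struct GHPairSketch { float g, h; };

void update_gh_sketch(const std::vector<float> &y,
                      const std::vector<float> &y_predict,
                      std::vector<GHPairSketch> &gh) {
    for (size_t i = 0; i < y.size(); ++i) {
        gh[i].g = y_predict[i] - y[i];   // first-order gradient
        gh[i].h = 1.0f;                  // second derivative of squared loss
    }
}
```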
3 | // 4 | #ifndef THUNDERGBM_QUANTILE_SKETCH_H 5 | #define THUNDERGBM_QUANTILE_SKETCH_H 6 | 7 | #include "thundergbm/thundergbm.h" 8 | #include 9 | #include 10 | 11 | using std::pair; 12 | using std::tuple; 13 | using std::vector; 14 | 15 | 16 | class entry{ 17 | public: 18 | float_type val; 19 | float_type rmin; 20 | float_type rmax; 21 | float_type w; 22 | entry() {}; 23 | entry(float_type val, float_type rmin, float_type rmax, float_type w) : val(val), rmin(rmin), rmax(rmax), w(w) {}; 24 | }; 25 | 26 | class summary{ 27 | public: 28 | int entry_size; 29 | int entry_reserve_size; 30 | vector entries; 31 | summary(): entry_size(0),entry_reserve_size(0) { 32 | //entries.clear(); 33 | }; 34 | summary(int entry_size, int reserve_size): entry_size(entry_size), entry_reserve_size(reserve_size) {entries.resize(reserve_size);}; 35 | void Reserve(int size); 36 | void Prune(summary& src,int size); 37 | void Merge(summary& src1, summary& src2); 38 | void Copy(summary& src); 39 | 40 | }; 41 | 42 | class Qitem{ 43 | public: 44 | int tail; 45 | vector> data; 46 | Qitem(): tail(0) { 47 | //data.clear(); 48 | }; 49 | void GetSummary(summary& ret); 50 | }; 51 | 52 | 53 | class quanSketch{ 54 | public: 55 | int numOfLevel; 56 | int summarySize; 57 | Qitem Qentry; 58 | vector summaries; 59 | summary t_summary; //for temp 60 | void Init(int maxn, float_type eps); 61 | void Add(float_type, float_type); 62 | void GetSummary(summary& dest); 63 | quanSketch(): numOfLevel(0), summarySize(0) { 64 | //summaries.clear(); 65 | }; 66 | 67 | }; 68 | #endif //THUNDERGBM_QUANTILE_SKETCH_H 69 | -------------------------------------------------------------------------------- /include/thundergbm/sparse_columns.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by shijiashuai on 5/7/18. 3 | // 4 | 5 | #ifndef THUNDERGBM_SPARSE_COLUMNS_H 6 | #define THUNDERGBM_SPARSE_COLUMNS_H 7 | 8 | #include "thundergbm.h" 9 | #include "syncarray.h" 10 | #include "dataset.h" 11 | #include "cusparse.h" 12 | 13 | class SparseColumns {//one feature corresponding to one column 14 | public: 15 | SyncArray csc_val; 16 | SyncArray csc_row_ind; 17 | SyncArray csc_col_ptr; 18 | SyncArray csc_bin_id; 19 | int n_column; 20 | int column_offset; 21 | int nnz; 22 | 23 | void from_dataset(const DataSet &dataSet); 24 | void from_dataset_csr(const DataSet &dataset); 25 | 26 | void to_multi_devices(vector> &) const; 27 | // void get_cut_points_evenly(int nBin, vector& bin_id, const vector& min_fea, const vector& max_fea); 28 | }; 29 | #endif //THUNDERGBM_SPARSE_COLUMNS_H 30 | -------------------------------------------------------------------------------- /include/thundergbm/syncarray.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by jiashuai on 17-9-17. 
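HistCut above stores each feature's cut points contiguously, with cut_row_ptr giving per-feature offsets, and SparseColumns stores one feature per CSC column. A host-side sketch of how the two layouts combine to map a feature column to bin ids; plain pointers stand in for SyncArray data, and the binary-search binning rule is an assumption (the exact rule lives in hist_cut.cpp and hist_updater.cu, not shown here).

```
// Sketch: walk CSC column `fid` and bin each value against that feature's
// cut points. csc_col_ptr[fid]..csc_col_ptr[fid+1] delimit the column's
// nonzeros; cut_row_ptr plays the same role for cut_points_val.
#include <algorithm>

void column_to_bins(int fid,
                    const float *csc_val, const int *csc_row_ind,
                    const int *csc_col_ptr,
                    const float *cut_points_val, const int *cut_row_ptr,
                    int *bin_id) {            // output: one bin per nonzero
    const float *cut_begin = cut_points_val + cut_row_ptr[fid];
    const float *cut_end   = cut_points_val + cut_row_ptr[fid + 1];
    for (int off = csc_col_ptr[fid]; off < csc_col_ptr[fid + 1]; ++off) {
        int iid = csc_row_ind[off];  // instance (row) id owning this nonzero
        (void) iid;                  // the real code pairs the bin with iid
        // index of the first cut point >= value gives the bin
        bin_id[off] = (int)(std::lower_bound(cut_begin, cut_end,
                                             csc_val[off]) - cut_begin);
    }
}
```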
3 | // 4 | 5 | #ifndef THUNDERGBM_SYNCDATA_H 6 | #define THUNDERGBM_SYNCDATA_H 7 | 8 | #include "thundergbm.h" 9 | #include "syncmem.h" 10 | 11 | /** 12 | * @brief Wrapper of SyncMem with a type 13 | * @tparam T type of element 14 | */ 15 | template 16 | class SyncArray : public el::Loggable { 17 | public: 18 | /** 19 | * initialize class that can store given count of elements 20 | * @param count the given count 21 | */ 22 | explicit SyncArray(size_t count) : mem(new SyncMem(sizeof(T) * count)), size_(count) { 23 | } 24 | 25 | SyncArray() : mem(nullptr), size_(0) {} 26 | 27 | ~SyncArray() { delete mem; }; 28 | 29 | const T *host_data() const { 30 | to_host(); 31 | return static_cast(mem->host_data()); 32 | }; 33 | 34 | const T *device_data() const { 35 | to_device(); 36 | return static_cast(mem->device_data()); 37 | }; 38 | 39 | T *host_data() { 40 | to_host(); 41 | return static_cast(mem->host_data()); 42 | }; 43 | 44 | T *device_data() { 45 | to_device(); 46 | return static_cast(mem->device_data()); 47 | }; 48 | 49 | T *device_end() { 50 | return device_data() + size(); 51 | }; 52 | 53 | const T *device_end() const { 54 | return device_data() + size(); 55 | }; 56 | 57 | void set_host_data(T *host_ptr) { 58 | mem->set_host_data(host_ptr); 59 | } 60 | 61 | void set_device_data(T *device_ptr) { 62 | mem->set_device_data(device_ptr); 63 | } 64 | 65 | void to_host() const { 66 | mem->to_host(); 67 | } 68 | 69 | void to_device() const { 70 | mem->to_device(); 71 | } 72 | 73 | /** 74 | * copy device data. This will call to_device() implicitly. 75 | * @param source source data pointer (data can be on host or device) 76 | * @param count the count of elements 77 | */ 78 | void copy_from(const T *source, size_t count) { 79 | 80 | #ifdef USE_CUDA 81 | thunder::device_mem_copy(mem->device_data(), source, sizeof(T) * count); 82 | #else 83 | memcpy(mem->host_data(), source, sizeof(T) * count); 84 | #endif 85 | }; 86 | 87 | void copy_from(const SyncArray &source) { 88 | 89 | CHECK_EQ(size(), source.size()) << "destination and source count doesn't match"; 90 | #ifdef USE_CUDA 91 | copy_from(source.device_data(), source.size()); 92 | #else 93 | copy_from(source.host_data(), source.size()); 94 | #endif 95 | }; 96 | 97 | /** 98 | * resize to a new size. This will also clear all data. 
99 | * @param count 100 | */ 101 | void resize(size_t count) { 102 | delete mem; 103 | mem = new SyncMem(sizeof(T) * count); 104 | this->size_ = count; 105 | }; 106 | 107 | size_t mem_size() const {//number of bytes 108 | return mem->size(); 109 | } 110 | 111 | size_t size() const {//number of values 112 | return size_; 113 | } 114 | 115 | SyncMem::HEAD head() const { 116 | return mem->head(); 117 | } 118 | 119 | void log(el::base::type::ostream_t &ostream) const override { 120 | int i; 121 | ostream << "["; 122 | for (i = 0; i < size() - 1 && i < el::base::consts::kMaxLogPerContainer - 1; ++i) { 123 | // for (i = 0; i < size() - 1; ++i) { 124 | ostream << host_data()[i] << ","; 125 | } 126 | ostream << host_data()[i]; 127 | if (size() < el::base::consts::kMaxLogPerContainer - 1) { 128 | ostream << "]"; 129 | } else { 130 | ostream << "..."; 131 | } 132 | }; 133 | 134 | int get_owner_id() { 135 | return mem->get_owner_id(); 136 | } 137 | 138 | private: 139 | 140 | SyncArray(const SyncArray &); 141 | 142 | SyncArray &operator=(const SyncArray &); 143 | 144 | SyncMem *mem; 145 | size_t size_; 146 | }; 147 | 148 | #endif //THUNDERGBM_SYNCDATA_H 149 | -------------------------------------------------------------------------------- /include/thundergbm/syncmem.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by jiashuai on 17-9-16. 3 | // 4 | 5 | #ifndef THUNDERGBM_SYNCMEM_H 6 | #define THUNDERGBM_SYNCMEM_H 7 | 8 | #include "thundergbm.h" 9 | 10 | namespace thunder { 11 | inline void malloc_host(void **ptr, size_t size) { 12 | #ifdef USE_CUDA 13 | CUDA_CHECK(cudaHostAlloc(ptr, size, cudaHostAllocPortable)); 14 | #else 15 | *ptr = malloc(size); 16 | #endif 17 | } 18 | 19 | inline void free_host(void *ptr) { 20 | #ifdef USE_CUDA 21 | CUDA_CHECK(cudaFreeHost(ptr)); 22 | #else 23 | free(ptr); 24 | #endif 25 | } 26 | 27 | inline void device_mem_copy(void *dst, const void *src, size_t size) { 28 | #ifdef USE_CUDA 29 | CUDA_CHECK(cudaMemcpy(dst, src, size, cudaMemcpyDefault)); 30 | #else 31 | NO_GPU; 32 | #endif 33 | } 34 | 35 | /** 36 | * @brief Auto-synced memory for CPU and GPU 37 | */ 38 | class SyncMem { 39 | public: 40 | SyncMem(); 41 | 42 | /** 43 | * create a piece of synced memory with given size. The GPU/CPU memory will not be allocated immediately, but 44 | * allocated when it is used at first time. 
45 | * @param size the size of memory (in Bytes) 46 | */ 47 | explicit SyncMem(size_t size); 48 | 49 | ~SyncMem(); 50 | 51 | ///return raw host pointer 52 | void *host_data(); 53 | 54 | ///return raw device pointer 55 | void *device_data(); 56 | 57 | /** 58 | * set host data pointer to another host pointer, and its memory will not be managed by this class 59 | * @param data another host pointer 60 | */ 61 | void set_host_data(void *data); 62 | 63 | /** 64 | * set device data pointer to another device pointer, and its memory will not be managed by this class 65 | * @param data another device pointer 66 | */ 67 | void set_device_data(void *data); 68 | 69 | ///transfer data to host 70 | void to_host(); 71 | 72 | ///transfer data to device 73 | void to_device(); 74 | 75 | ///return the size of memory 76 | size_t size() const; 77 | 78 | ///to determine the where the newest data locates in 79 | enum HEAD { 80 | HOST, DEVICE, UNINITIALIZED 81 | }; 82 | 83 | HEAD head() const; 84 | 85 | int get_owner_id() { 86 | return device_id; 87 | } 88 | 89 | private: 90 | void *device_ptr; 91 | void *host_ptr; 92 | bool own_device_data; 93 | bool own_host_data; 94 | size_t size_; 95 | HEAD head_; 96 | int device_id; 97 | }; 98 | } 99 | using thunder::SyncMem; 100 | #endif //THUNDERGBM_SYNCMEM_H 101 | -------------------------------------------------------------------------------- /include/thundergbm/thundergbm.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by jiashuai on 18-1-16. 3 | // 4 | 5 | #ifndef THUNDERGBM_THUNDERGBM_H 6 | #define THUNDERGBM_THUNDERGBM_H 7 | 8 | #include 9 | #include 10 | 11 | #include "cuda_runtime_api.h" 12 | #include "thundergbm/util/log.h" 13 | #include "thundergbm/util/common.h" 14 | #include "thundergbm/util/multi_device.h" 15 | #include "config.h" 16 | 17 | typedef float float_type; 18 | using std::vector; 19 | using std::string; 20 | #endif //THUNDERGBM_THUNDERGBM_H 21 | -------------------------------------------------------------------------------- /include/thundergbm/tree.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by jiashuai on 18-1-18. 
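SyncMem above implements Caffe-style lazy synchronization: head() records where the freshest copy lives (HOST, DEVICE, or UNINITIALIZED), and a copy happens only when the other side is requested. A small usage sketch of the SyncArray wrapper under that model; it compiles inside this project with USE_CUDA defined, and the function name is illustrative.

```
// Usage sketch of SyncArray's lazy host/device synchronization.
// Writing through host_data() leaves the head on HOST; the first
// device_data() call then triggers one host-to-device copy, and vice versa.
#include "thundergbm/syncarray.h"

void syncarray_demo() {
    SyncArray<int> arr(4);             // no memory allocated yet (lazy)
    int *h = arr.host_data();          // allocates host memory, head = HOST
    for (int i = 0; i < 4; ++i) h[i] = i;
    const int *d = arr.device_data();  // one cudaMemcpy on first device use
    (void) d;
    arr.to_host();                     // explicit transfer back to the host
}
```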
3 | // 4 | 5 | #ifndef THUNDERGBM_TREE_H 6 | #define THUNDERGBM_TREE_H 7 | 8 | #include "thundergbm/thundergbm.h" 9 | #include "syncarray.h" 10 | #include "sstream" 11 | #include "ins_stat.h" 12 | 13 | 14 | class Tree { 15 | public: 16 | struct TreeNode { 17 | int final_id;// node id after pruning, may not equal to node index 18 | int lch_index;// index of left child 19 | int rch_index;// index of right child 20 | int parent_index;// index of parent node 21 | float_type gain;// gain of splitting this node 22 | float_type base_weight; 23 | int split_feature_id; 24 | float_type split_value; 25 | bool default_right; 26 | bool is_leaf; 27 | bool is_valid;// non-valid nodes are those that are "children" of leaf nodes 28 | bool is_pruned;// pruned after pruning 29 | 30 | GHPair sum_gh_pair; 31 | 32 | friend std::ostream &operator<<(std::ostream &os, 33 | const TreeNode &node); 34 | 35 | HOST_DEVICE void calc_weight(float_type lambda) { 36 | this->base_weight = -sum_gh_pair.g / (sum_gh_pair.h + lambda); 37 | } 38 | 39 | HOST_DEVICE bool splittable() const { 40 | return !is_leaf && is_valid; 41 | } 42 | 43 | }; 44 | 45 | explicit Tree(int depth); 46 | 47 | Tree() = default; 48 | 49 | Tree(const Tree &tree) { 50 | nodes.resize(tree.nodes.size()); 51 | nodes.copy_from(tree.nodes); 52 | } 53 | 54 | Tree &operator=(const Tree &tree) { 55 | nodes.resize(tree.nodes.size()); 56 | nodes.copy_from(tree.nodes); 57 | return *this; 58 | } 59 | 60 | void init(int depth); 61 | 62 | string dump(int depth) const; 63 | 64 | SyncArray nodes; 65 | 66 | void prune_self(float_type gamma); 67 | 68 | void shrink(float_type learning_rate); 69 | 70 | private: 71 | void preorder_traversal(int nid, int max_depth, int depth, string &s) const; 72 | 73 | int try_prune_leaf(int nid, int np, float_type gamma, vector &leaf_child_count); 74 | 75 | void reorder_nid(); 76 | }; 77 | 78 | #endif //THUNDERGBM_TREE_H 79 | -------------------------------------------------------------------------------- /include/thundergbm/updater/exact_updater.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by shijiashuai on 5/7/18. 
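TreeNode::calc_weight above fixes the optimal leaf weight at w = -G / (H + lambda). For reference, the split gain the updaters maximize is the standard XGBoost form built from the same sums; the actual kernel is in exact_updater.cu (not shown), so this helper is only a sketch of the formula, with the min_child_weight guard as an assumption (gamma is charged later by prune_self).

```
// Sketch: XGBoost-style split gain, consistent with calc_weight() above.
//   weight(G, H) = -G / (H + lambda)
//   gain = 1/2 * [ G_l^2/(H_l+lambda) + G_r^2/(H_r+lambda)
//                  - (G_l+G_r)^2/(H_l+H_r+lambda) ]
float split_gain(float g_l, float h_l, float g_r, float h_r,
                 float lambda, float min_child_weight) {
    if (h_l < min_child_weight || h_r < min_child_weight)
        return 0.0f;  // assumed guard: reject underweight children
    auto score = [lambda](float g, float h) { return g * g / (h + lambda); };
    return 0.5f * (score(g_l, h_l) + score(g_r, h_r)
                   - score(g_l + g_r, h_l + h_r));
}
```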
3 | // 4 | 5 | #ifndef THUNDERGBM_EXACT_UPDATER_H 6 | #define THUNDERGBM_EXACT_UPDATER_H 7 | 8 | #include 9 | #include 10 | #include "thrust/reduce.h" 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include "thundergbm/util/device_lambda.cuh" 19 | #include "thundergbm/gpu_lsh.h" 20 | 21 | 22 | class SplitPoint { 23 | public: 24 | float_type gain; 25 | int split_fea_id; 26 | float_type fval; 27 | int bin_id; 28 | GHPair fea_missing_gh; 29 | GHPair rch_sum_gh; 30 | bool default_right; 31 | int nid; 32 | 33 | SplitPoint() { 34 | nid = -1; 35 | split_fea_id = -1; 36 | gain = std::numeric_limits::min(); 37 | } 38 | 39 | friend std::ostream &operator<<(std::ostream &output, const SplitPoint &sp) { 40 | output << sp.gain << "/" << sp.split_fea_id << "/" << sp.nid; 41 | return output; 42 | } 43 | }; 44 | 45 | class ExactUpdater { 46 | public: 47 | explicit ExactUpdater(GBMParam ¶m) { 48 | depth = param.depth; 49 | min_child_weight = param.min_child_weight; 50 | lambda = param.lambda; 51 | gamma = param.gamma; 52 | rt_eps = param.rt_eps; 53 | n_devices = param.n_device; 54 | } 55 | 56 | 57 | void grow(Tree &tree, const vector> &v_columns, InsStat &stats); 58 | 59 | int depth; 60 | float_type min_child_weight; 61 | float_type lambda; 62 | float_type gamma; 63 | float_type rt_eps; 64 | 65 | psdLsh lsh_table; 66 | 67 | int n_devices; 68 | vector> v_stats; 69 | vector> v_trees_gpu; 70 | 71 | void init_tree(Tree &tree, const InsStat &stats); 72 | 73 | virtual void find_split(int level, const SparseColumns &columns, const Tree &tree, const InsStat &stats, 74 | SyncArray &sp); 75 | 76 | //update global best split for each node 77 | void update_tree(Tree &tree, const SyncArray &sp); 78 | 79 | virtual bool reset_ins2node_id(InsStat &stats, const Tree &tree, const SparseColumns &columns); 80 | 81 | void split_point_all_reduce(const vector> &local_sp, SyncArray &global_sp, 82 | int depth); 83 | 84 | void lsh_hash_init(unsigned table_size, unsigned num_table, unsigned num_dimension, unsigned p_norm, float r, 85 | unsigned numP, int seed); 86 | }; 87 | 88 | typedef thrust::tuple int_float; 89 | 90 | std::ostream &operator<<(std::ostream &os, const int_float &rhs); 91 | 92 | #endif //THUNDERGBM_EXACT_UPDATER_H 93 | -------------------------------------------------------------------------------- /include/thundergbm/updater/hist_updater.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by qinbin on 2018/7/6. 
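With ExactUpdater declared, one boosting round follows the pattern of the commented-out driver in src/test/test_main.cpp near the end of this dump: refresh gradients, grow a tree, prune it, then update predictions. A condensed sketch; the shared_ptr element type of v_columns is taken from that driver, and predict_in_training is assumed to exist there (it is not declared in the headers shown).

```
// Sketch of one boosting round with ExactUpdater, condensed from the
// commented-out driver in src/test/test_main.cpp.
#include <memory>
#include "thundergbm/updater/exact_updater.h"

void train_sketch(GBMParam &param,
                  vector<std::shared_ptr<SparseColumns>> &v_columns,
                  InsStat &stats, vector<Tree> &trees) {
    ExactUpdater updater(param);
    for (Tree &tree : trees) {
        stats.updateGH();                 // recompute g/h from current predictions
        updater.grow(tree, v_columns, stats);
        tree.prune_self(param.gamma);     // drop splits whose gain < gamma
        // predict_in_training(stats, tree);  // assumed helper in the test driver
    }
}
```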
3 | // 4 | 5 | #ifndef GBM_MIRROR2_HIST_UPDATER_H 6 | #define GBM_MIRROR2_HIST_UPDATER_H 7 | 8 | #include "thundergbm/updater/exact_updater.h" 9 | #include "thundergbm/hist_cut.h" 10 | #include "thundergbm/csc2r_transform.h" 11 | 12 | class HistUpdater : public ExactUpdater{ 13 | public: 14 | int max_num_bin = 64; 15 | int do_cut = 0; 16 | bool use_similar_bundle = 1; 17 | vector v_cut; 18 | vector>> bin_id; 19 | 20 | 21 | void insBundle(const vector> &v_columns, InsStat &stats); 22 | void init_bin_id(const vector> &v_columns); 23 | void init_bin_id_outside(const vector> &v_columns, SyncArray& bin_id); 24 | void init_bin_id_unsort(SparseColumns& unsort_columns, SyncArray& bin_id); 25 | void copy_bin_id(const vector> &v_columns, SyncArray& bin_id); 26 | void init_cut(const vector> &v_columns, InsStat &stats, int n_instance, 27 | SparseColumns& unsort_columns); 28 | 29 | void init_bin_id_csr(const vector>> &v_columns, int n_instances); 30 | 31 | void similar_ins_bundle(const vector> &v_columns, InsStat &stats, 32 | int& n_instances, DataSet &dataSet, SparseColumns& unsort_columns, int* iidold2new, SyncArray& is_multi); 33 | void similar_ins_bundle(const vector> &v_columns, 34 | const vector> &v_columns2, InsStat &stats, 35 | int& n_instances, DataSet &dataSet, SparseColumns& unsort_columns, int* iidold2new, SyncArray& is_multi); 36 | void similar_ins_bundle_multi(const vector>> &v_columns, 37 | int numP, InsStat &stats, int& n_instances, DataSet &dataSet, 38 | SparseColumns& unsort_columns, int* iidold2new, SyncArray& is_multi, bool is_random = 0); 39 | void similar_ins_bundle_closest(const vector>> &v_columns, int numP, 40 | InsStat &stats, int& n_instances, DataSet& dataSet, SparseColumns& unsort_columns,int* iidold2new, SyncArray& is_multi); 41 | void similar_ins_bundle(const vector>> &v_columns, int numP, 42 | vector &stats, int& n_instances, DataSet& dataSet, SparseColumns& unsort_columns, 43 | int* iidold2new, SyncArray& is_multi); 44 | 45 | 46 | void similar_ins_bundle_independent(const vector>> &v_columns, int numP, 47 | vector &stats, int& n_instances, DataSet& dataSet, SparseColumns& unsort_columns, 48 | int* iidold2new, SyncArray& is_multi, bool is_random = 0, bool weighted_gh = 1); 49 | 50 | void get_bin_ids(const SparseColumns &columns); 51 | 52 | void find_split(int level, const SparseColumns &columns, const Tree &tree, const InsStat &stats, 53 | SyncArray &sp) override; 54 | 55 | bool reset_ins2node_id(InsStat &stats, const Tree &tree, const SparseColumns &columns) override; 56 | 57 | explicit HistUpdater(GBMParam ¶m): ExactUpdater(param) {}; 58 | 59 | 60 | 61 | Csc2r bin_id_csr; 62 | 63 | }; 64 | #endif //GBM_MIRROR2_HIST_UPDATER_H 65 | -------------------------------------------------------------------------------- /include/thundergbm/util/common.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by jiashuai on 18-1-16. 
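The similar_ins_bundle family above is the SimFL-specific step: instances from different parties that repeatedly land in the same LSH buckets are treated as similar, and their gradients are combined (is_multi marks bundled instances; weighted_gh suggests weighted aggregation). The CUDA implementation in hist_updater.cu is not shown, so the following host-side sketch only illustrates the collision-counting idea; every name in it is illustrative.

```
// Schematic sketch of LSH-based similarity lookup: for a local instance,
// count how often each remote instance falls into the same bucket across
// the hash tables, and keep the most frequent one as its similar instance.
#include <unordered_map>
#include <vector>

int most_similar(int local_iid,
                 const std::vector<std::vector<unsigned>> &local_hash,   // [table][iid]
                 const std::vector<std::vector<unsigned>> &remote_hash) {
    std::unordered_map<int, int> collisions;
    for (size_t t = 0; t < local_hash.size(); ++t)
        for (int rid = 0; rid < (int) remote_hash[t].size(); ++rid)
            if (remote_hash[t][rid] == local_hash[t][local_iid])
                ++collisions[rid];
    int best = -1, best_count = 0;
    for (const auto &kv : collisions)
        if (kv.second > best_count) { best = kv.first; best_count = kv.second; }
    return best;  // -1 if the instance collides with nothing in any table
}
```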
3 | // 4 | 5 | #ifndef THUNDERGBM_COMMON_H 6 | #define THUNDERGBM_COMMON_H 7 | 8 | 9 | #include "thundergbm/thundergbm.h" 10 | 11 | #define USE_CUDA 12 | #define CUDA_CHECK(condition) \ 13 | /* Code block avoids redefinition of cudaError_t error */ \ 14 | do { \ 15 | cudaError_t error = condition; \ 16 | CHECK_EQ(error, cudaSuccess) << " " << cudaGetErrorString(error); \ 17 | } while (false) 18 | 19 | #define NO_GPU \ 20 | LOG(FATAL)<<"Cannot use GPU when compiling without GPU" 21 | 22 | //https://stackoverflow.com/questions/2342162/stdstring-formatting-like-sprintf 23 | template 24 | std::string string_format(const std::string &format, Args ... args) { 25 | size_t size = snprintf(nullptr, 0, format.c_str(), args ...) + 1; // Extra space for '\0' 26 | std::unique_ptr buf(new char[size]); 27 | snprintf(buf.get(), size, format.c_str(), args ...); 28 | return std::string(buf.get(), buf.get() + size - 1); // We don't want the '\0' inside 29 | } 30 | 31 | #define HOST_DEVICE __host__ __device__ 32 | #endif //THUNDERGBM_COMMON_H 33 | -------------------------------------------------------------------------------- /include/thundergbm/util/cub_wrapper.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by ss on 18-5-13. 3 | // 4 | 5 | #ifndef THUNDERGBM_CUB_UTIL_H 6 | #define THUNDERGBM_CUB_UTIL_H 7 | 8 | #include "thundergbm/thundergbm.h" 9 | //#include 10 | //#include 11 | #include "cub/cub.cuh" 12 | #include 13 | 14 | template 15 | void cub_sort_by_key(SyncArray &keys, SyncArray &values, bool ascending = true) { 16 | CHECK_EQ(keys.size(), values.size()) << "keys and values must have equal size"; 17 | using namespace cub; 18 | size_t num_items = keys.size(); 19 | SyncArray keys2(num_items); 20 | SyncArray values2(num_items); 21 | SyncArray temp_storage; 22 | 23 | DoubleBuffer d_keys(keys.device_data(), keys2.device_data()); 24 | DoubleBuffer d_values(values.device_data(), values2.device_data()); 25 | 26 | size_t temp_storage_bytes = 0; 27 | 28 | // Initialize device arrays 29 | if (ascending) 30 | DeviceRadixSort::SortPairs(NULL, temp_storage_bytes, d_keys, d_values, num_items); 31 | else 32 | DeviceRadixSort::SortPairsDescending(NULL, temp_storage_bytes, d_keys, d_values, num_items); 33 | temp_storage.resize(temp_storage_bytes); 34 | 35 | // Run 36 | if (ascending) 37 | DeviceRadixSort::SortPairs(temp_storage.device_data(), temp_storage_bytes, d_keys, d_values, num_items); 38 | else 39 | DeviceRadixSort::SortPairsDescending(temp_storage.device_data(), temp_storage_bytes, d_keys, d_values, 40 | num_items); 41 | 42 | CUDA_CHECK( 43 | cudaMemcpy(keys.device_data(), reinterpret_cast(d_keys.Current()), sizeof(float) * num_items, 44 | cudaMemcpyDeviceToDevice)); 45 | CUDA_CHECK(cudaMemcpy(values.device_data(), reinterpret_cast(d_values.Current()), 46 | sizeof(int) * num_items, 47 | cudaMemcpyDeviceToDevice)); 48 | } 49 | 50 | template 51 | void cub_seg_sort_by_key(SyncArray &keys, SyncArray &values, SyncArray &ptr, bool ascending = true) { 52 | CHECK_EQ(keys.size(), values.size()) << "keys and values must have equal size"; 53 | using namespace cub; 54 | size_t num_items = keys.size(); 55 | size_t num_segments = ptr.size() - 1; 56 | SyncArray keys2(num_items); 57 | SyncArray values2(num_items); 58 | SyncArray temp_storage; 59 | 60 | DoubleBuffer d_keys(keys.device_data(), keys2.device_data()); 61 | DoubleBuffer d_values(values.device_data(), values2.device_data()); 62 | 63 | size_t temp_storage_bytes = 0; 64 | 65 | // Initialize device arrays
66 | if (ascending) 67 | DeviceSegmentedRadixSort::SortPairs(NULL, temp_storage_bytes, d_keys, d_values, num_items, num_segments, 68 | ptr.device_data(), ptr.device_data() + 1); 69 | else 70 | DeviceSegmentedRadixSort::SortPairsDescending(NULL, temp_storage_bytes, d_keys, d_values, num_items, 71 | num_segments, 72 | ptr.device_data(), ptr.device_data() + 1); 73 | temp_storage.resize(temp_storage_bytes); 74 | 75 | // Run 76 | if (ascending) 77 | DeviceSegmentedRadixSort::SortPairs(temp_storage.device_data(), temp_storage_bytes, d_keys, d_values, 78 | num_items, num_segments, ptr.device_data(), 79 | ptr.device_data() + 1); 80 | else 81 | DeviceSegmentedRadixSort::SortPairsDescending(temp_storage.device_data(), temp_storage_bytes, d_keys, d_values, 82 | num_items, num_segments, ptr.device_data(), 83 | ptr.device_data() + 1); 84 | 85 | CUDA_CHECK( 86 | cudaMemcpy(keys.device_data(), reinterpret_cast(d_keys.Current()), sizeof(float) * num_items, 87 | cudaMemcpyDeviceToDevice)); 88 | CUDA_CHECK(cudaMemcpy(values.device_data(), reinterpret_cast(d_values.Current()), 89 | sizeof(int) * num_items, 90 | cudaMemcpyDeviceToDevice)); 91 | }; 92 | 93 | #endif //THUNDERGBM_CUB_UTIL_H 94 | -------------------------------------------------------------------------------- /include/thundergbm/util/device_lambda.cuh: -------------------------------------------------------------------------------- 1 | // 2 | // Created by jiashuai on 18-1-19. 3 | // 4 | 5 | #ifndef THUNDERGBM_DEVICE_LAMBDA_H 6 | #define THUNDERGBM_DEVICE_LAMBDA_H 7 | 8 | #include "thundergbm/thundergbm.h" 9 | #include "thundergbm/clion_cuda.h" 10 | 11 | template 12 | __global__ void lambda_kernel(size_t len, L lambda) { 13 | for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < len; i += blockDim.x * gridDim.x) { 14 | lambda(i); 15 | } 16 | } 17 | 18 | template 19 | __global__ void lambda_2d_sparse_kernel(const int *len2, L lambda) { 20 | int i = blockIdx.x; 21 | int begin = len2[i]; 22 | int end = len2[i + 1]; 23 | for (int j = begin + blockIdx.y * blockDim.x + threadIdx.x; j < end; j += blockDim.x * gridDim.y) { 24 | lambda(i, j); 25 | } 26 | } 27 | 28 | ///p100 has 56 MPs, using 32*56 thread blocks 29 | template 30 | void device_loop(int len, L lambda, unsigned int NUM_BLOCK = 32 *56, unsigned int BLOCK_SIZE=512) { 31 | if (len > 0) { 32 | lambda_kernel << < NUM_BLOCK, BLOCK_SIZE >> > (len, lambda); 33 | CUDA_CHECK(cudaPeekAtLastError()); 34 | } 35 | } 36 | 37 | 38 | template 39 | void device_loop_2d(int len1, const int *len2, L lambda, unsigned int NUM_BLOCK = 32 * 56, 40 | unsigned int BLOCK_SIZE = 256) { 41 | if (len1 > 0) { 42 | lambda_2d_sparse_kernel << < dim3(len1, NUM_BLOCK), BLOCK_SIZE >> > (len2, lambda); 43 | CUDA_CHECK(cudaPeekAtLastError()); 44 | } 45 | } 46 | 47 | template 48 | __global__ void lambda_2d_sparse_kernel_mod(const int mod_val, const int *len2, L lambda) { 49 | int i = blockIdx.x; 50 | int begin = len2[i%mod_val]; 51 | int end = len2[i%mod_val + 1]; 52 | for (int j = begin + blockIdx.y * blockDim.x + threadIdx.x; j < end; j += blockDim.x * gridDim.y) { 53 | lambda(i, j); 54 | } 55 | } 56 | 57 | template 58 | void device_loop_2d_mod(int len1, int mod_val, const int *len2, L lambda, unsigned int NUM_BLOCK = 32 * 56, 59 | unsigned int BLOCK_SIZE = 256) { 60 | if (len1 > 0) { 61 | lambda_2d_sparse_kernel_mod << < dim3(len1, NUM_BLOCK), BLOCK_SIZE >> > (mod_val, len2, lambda); 62 | CUDA_CHECK(cudaPeekAtLastError()); 63 | } 64 | } 65 | 66 | //template 67 | //__global__ void lambda_2d_sparse_kernel_zero(const int *len2, L 
lambda) { 68 | // int i = blockIdx.x; 69 | // int len = len2[i + 1] - len2[i]; 70 | // for (int j = blockIdx.y * blockDim.x + threadIdx.x; j < len; j += blockDim.x * gridDim.y) { 71 | // lambda(i, j); 72 | // } 73 | //} 74 | // 75 | //template 76 | //void device_loop_2d_zero(int len1, const int *len2, L lambda, unsigned int NUM_BLOCK = 32 * 56, 77 | // unsigned int BLOCK_SIZE = 256) { 78 | // if (len1 > 0) { 79 | // lambda_2d_sparse_kernel_zero << < dim3(len1, NUM_BLOCK), BLOCK_SIZE >> > (len2, lambda); 80 | // CUDA_CHECK(cudaPeekAtLastError()); 81 | // } 82 | //} 83 | #endif //THUNDERGBM_DEVICE_LAMBDA_H 84 | -------------------------------------------------------------------------------- /include/thundergbm/util/multi_device.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by ss on 18-6-18. 3 | // 4 | 5 | #ifndef THUNDERGBM_MULTI_DEVICE_H 6 | #define THUNDERGBM_MULTI_DEVICE_H 7 | //switch to specific device and do something, then switch back to the original device 8 | //FIXME make this macro into a function? 9 | #define DO_ON_DEVICE(device_id, something) \ 10 | do { \ 11 | int org_device_id = 0; \ 12 | CUDA_CHECK(cudaGetDevice(&org_device_id)); \ 13 | CUDA_CHECK(cudaSetDevice(device_id)); \ 14 | something; \ 15 | CUDA_CHECK(cudaSetDevice(org_device_id)); \ 16 | } while (false) 17 | 18 | /** 19 | * Do something on multiple devices, then switch back to the original device 20 | * 21 | * 22 | * example: 23 | * 24 | * DO_ON_MULTI_DEVICES(n_devices, [&](int device_id){ 25 | * //do_something_on_device(device_id); 26 | * }); 27 | */ 28 | 29 | template 30 | void DO_ON_MULTI_DEVICES(int n_devices, L do_something) { 31 | int org_device_id = 0; 32 | CUDA_CHECK(cudaGetDevice(&org_device_id)); 33 | #pragma omp parallel for num_threads(n_devices) 34 | for (int device_id = 0; device_id < n_devices; device_id++) { 35 | CUDA_CHECK(cudaSetDevice(device_id)); 36 | do_something(device_id); 37 | } 38 | CUDA_CHECK(cudaSetDevice(org_device_id)); 39 | 40 | } 41 | 42 | //void DO_ON_MULTI_DEVICES(int n_devices, L do_something) { 43 | // int org_device_id = 1; 44 | // CUDA_CHECK(cudaGetDevice(&org_device_id)); 45 | //#pragma omp parallel for num_threads(n_devices) 46 | // for (int device_id = 1; device_id <= n_devices; device_id++) { 47 | // CUDA_CHECK(cudaSetDevice(device_id)); 48 | // do_something(device_id); 49 | // } 50 | // CUDA_CHECK(cudaSetDevice(org_device_id)); 51 | // 52 | //} 53 | 54 | #endif //THUNDERGBM_MULTI_DEVICE_H 55 | -------------------------------------------------------------------------------- /src/test/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(googletest) 2 | 3 | include_directories(googletest/googletest/include) 4 | include_directories(googletest/googlemock/include) 5 | 6 | file(GLOB TEST_SRC *) 7 | 8 | cuda_add_executable(${PROJECT_NAME}-test ${TEST_SRC} ${COMMON_INCLUDES}) 9 | target_link_libraries(${PROJECT_NAME}-test ${PROJECT_NAME} gtest) 10 | 11 | -------------------------------------------------------------------------------- /src/test/test_main.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by jiashuai on 17-9-15. 
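device_lambda.cuh's grid-stride device_loop and multi_device.h's DO_ON_MULTI_DEVICES compose naturally: launch the same lambda loop on every GPU. A usage sketch; it needs the --expt-extended-lambda nvcc flag that the top-level CMakeLists.txt already sets, and the function name is illustrative.

```
// Usage sketch: run a grid-stride device_loop on each GPU in turn.
// DO_ON_MULTI_DEVICES restores the original device afterwards.
#include "thundergbm/syncarray.h"
#include "thundergbm/util/device_lambda.cuh"
#include "thundergbm/util/multi_device.h"

void fill_on_all_gpus(int n_devices) {
    DO_ON_MULTI_DEVICES(n_devices, [&](int device_id) {
        SyncArray<int> arr(1024);          // per-device scratch array
        int *data = arr.device_data();
        device_loop(1024, [=] __device__(int i) {
            data[i] = device_id;           // each GPU writes its own id
        });
    });
}
```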
3 | // 4 | #include "thundergbm/thundergbm.h" 5 | #include "gtest/gtest.h" 6 | 7 | //#include 8 | //#include 9 | //#include 10 | //#include 11 | 12 | 13 | //float_type compute_rmse(const InsStat &stats) { 14 | // float_type sum_error = 0; 15 | // const float_type *y_data = stats.y.host_data(); 16 | // const float_type *y_predict_data = stats.y_predict.host_data(); 17 | // for (int i = 0; i < stats.n_instances; ++i) { 18 | // float_type e = y_predict_data[i] - y_data[i]; 19 | // sum_error += e * e; 20 | // } 21 | // float_type rmse = sqrt(sum_error / stats.n_instances); 22 | // return rmse; 23 | //} 24 | 25 | 26 | 27 | int iargc; // Making arg and arv global to access within TESTs 28 | char** iargv; 29 | int main(int argc, char **argv) { 30 | ::testing::InitGoogleTest(&argc, argv); 31 | iargc = argc; 32 | iargv = argv; 33 | el::Loggers::reconfigureAllLoggers(el::ConfigurationType::Format, "%datetime %level %fbase:%line : %msg"); 34 | el::Loggers::addFlag(el::LoggingFlag::ColoredTerminalOutput); 35 | el::Loggers::addFlag(el::LoggingFlag::FixedTimeFormat); 36 | el::Loggers::reconfigureAllLoggers(el::Level::Debug, el::ConfigurationType::Enabled, "false"); 37 | el::Loggers::reconfigureAllLoggers(el::Level::Trace, el::ConfigurationType::Enabled, "false"); 38 | el::Loggers::reconfigureAllLoggers(el::Level::Info, el::ConfigurationType::Enabled, "false"); 39 | return RUN_ALL_TESTS(); 40 | 41 | // GBMParam param; 42 | // bool verbose = false; 43 | // param.depth = 6; 44 | // param.n_trees = 40; 45 | // param.min_child_weight = 1; 46 | // param.lambda = 1; 47 | // param.gamma = 1; 48 | // param.rt_eps = 1e-6; 49 | // param.do_exact = true; 50 | // param.n_device = 1; 51 | // if (!verbose) { 52 | // el::Loggers::reconfigureAllLoggers(el::Level::Debug, el::ConfigurationType::Enabled, "false"); 53 | // el::Loggers::reconfigureAllLoggers(el::Level::Trace, el::ConfigurationType::Enabled, "false"); 54 | // } 55 | // el::Loggers::reconfigureAllLoggers(el::ConfigurationType::PerformanceTracking, "true"); 56 | // DataSet dataSet; 57 | // dataSet.load_from_file(param.path); 58 | // int n_instances = dataSet.n_instances(); 59 | // InsStat stats; 60 | // vector trees; 61 | // SparseColumns columns; 62 | // columns.from_dataset(dataSet); 63 | // trees.resize(param.n_trees); 64 | // stats.resize(n_instances); 65 | // stats.y.copy_from(dataSet.y().data(), n_instances); 66 | // 67 | // int n_devices = 1; 68 | // vector> v_columns; 69 | // v_columns.resize(n_devices); 70 | // for (int i = 0; i < n_devices; i++) 71 | // v_columns[i].reset(new SparseColumns()); 72 | // columns.to_multi_devices(v_columns); 73 | // HistUpdater updater(param); 74 | // int round = 0; 75 | // float_type rmse = 0; 76 | // { 77 | // bool init_bin = 0; 78 | // for (Tree &tree:trees) { 79 | // stats.updateGH(); 80 | // //updater.insBundle(v_columns, stats); 81 | // TIMED_SCOPE(timerObj, "construct tree"); 82 | // if(!init_bin) { 83 | // updater.use_similar_bundle = 0; 84 | // { 85 | // TIMED_SCOPE(timerObj, "init cut"); 86 | // updater.init_cut(v_columns, stats, n_instances, columns); 87 | // } 88 | // if(updater.use_similar_bundle) 89 | // { 90 | // TIMED_SCOPE(timerObj, "similar ins bundle"); 91 | // updater.similar_ins_bundle(v_columns, stats, n_instances, dataSet, columns); 92 | // } 93 | // init_bin = 1; 94 | // } 95 | // 96 | // 97 | // { 98 | // TIMED_SCOPE(timerObj, "grow"); 99 | // updater.grow(tree, v_columns, stats); 100 | // } 101 | // { 102 | // TIMED_SCOPE(timerObj, "prune"); 103 | // tree.prune_self(param.gamma); 104 | // } 105 | 
// 106 | // LOG(DEBUG) << string_format("\nbooster[%d]", round) << tree.dump(param.depth); 107 | // predict_in_training(stats, tree); 108 | // //next round 109 | // round++; 110 | // 111 | // } 112 | // } 113 | // rmse = compute_rmse(stats); 114 | // LOG(INFO) << "rmse = " << rmse; 115 | // return 1; 116 | } 117 | -------------------------------------------------------------------------------- /src/test/test_unifiedmem.cu: -------------------------------------------------------------------------------- 1 | // 2 | // Created by shijiashuai on 10/7/18. 3 | // 4 | 5 | #include 6 | #include "thundergbm/syncarray.h" 7 | #include "gtest/gtest.h" 8 | #include "thrust/reduce.h" 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include "thundergbm/util/device_lambda.cuh" 16 | 17 | 18 | void kernel(int count, int *ptr) { 19 | device_loop(count, [=] __device__(int i) { ptr[i] = 1; }); 20 | } 21 | 22 | //TEST(TestUnified, test) { 23 | // int *ptr; 24 | // size_t size = (1L << 30) * 4; 25 | // size_t count = size / sizeof(int); 26 | // 27 | // using namespace thrust; 28 | // cudaMallocManaged((void **) &ptr, size); 29 | // 30 | // memset(ptr, 0, size); 31 | // { 32 | // TIMED_SCOPE(timerObj, "prefetch kernel"); 33 | // cudaMemPrefetchAsync(ptr, size, 0); 34 | // 35 | // sort(cuda::par, ptr, ptr + count); 36 | // cudaDeviceSynchronize(); 37 | // } 38 | // 39 | // cudaFree(ptr); 40 | // ptr = nullptr; 41 | // 42 | // SyncArray arr(count); 43 | // arr.to_host(); 44 | // { 45 | // TIMED_SCOPE(timerObj, "copy kernel"); 46 | // sort(cuda::par, arr.device_data(), arr.device_end()); 47 | // } 48 | //} 49 | -------------------------------------------------------------------------------- /src/thundergbm/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | #set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) 2 | #set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) 3 | #set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) 4 | 5 | file(GLOB SRC util/*.c* updater/*.c* *.c*) 6 | 7 | cuda_add_library(${PROJECT_NAME} SHARED ${SRC}) 8 | 9 | target_link_libraries(${PROJECT_NAME} ${CUDA_cusparse_LIBRARY}) 10 | -------------------------------------------------------------------------------- /src/thundergbm/csc2r_transform.cu: -------------------------------------------------------------------------------- 1 | // 2 | // Created by qinbin on 21/8/18. 
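//
// Added note (illustrative, not original source): from_csr below re-expresses
// a sparse matrix stored row-by-row (CSR) as the same matrix stored
// column-by-column (CSC) with cusparseScsr2csc. For the 2x3 matrix
//     [10  0 20]
//     [ 0 30  0]
//   CSR: val = {10, 20, 30}, col_ind = {0, 2, 1}, row_ptr = {0, 2, 3}
//   CSC: val = {10, 30, 20}, row_ind = {0, 1, 0}, col_ptr = {0, 1, 2, 3}
// The nnz values are unchanged; only their ordering and the pointer array
// differ, and the conversion runs entirely on the GPU.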
3 | // 4 | 5 | #include "thundergbm/csc2r_transform.h" 6 | 7 | void Csc2r::from_csr(float_type* csr_val, int* csr_col_ind, int* csr_row_ptr, int n_instances, int n_column, int nnz){ 8 | 9 | cusparseHandle_t handle; 10 | cusparseMatDescr_t descr; 11 | cusparseCreate(&handle); 12 | cusparseCreateMatDescr(&descr); 13 | cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO); 14 | cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL); 15 | 16 | //std::cout<<"nnz:"<nnz = nnz; 21 | 22 | cusparseScsr2csc(handle, n_instances, n_column, nnz, csr_val, csr_row_ptr, 23 | csr_col_ind, csc_val.device_data(), csc_row_ind.device_data(), csc_col_ptr.device_data(), 24 | CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO); 25 | cudaDeviceSynchronize(); 26 | 27 | cusparseDestroy(handle); 28 | cusparseDestroyMatDescr(descr); 29 | 30 | 31 | } 32 | 33 | 34 | void Csc2r::get_cut_points_evenly(int nBin, vector& bin_id, 35 | const vector& min_fea, const vector& max_fea) { 36 | float* csc_val_host = csc_val.host_data(); 37 | int* csc_row_host = csc_row_ind.host_data(); 38 | int* csc_col_host = csc_col_ptr.host_data(); 39 | for(int cid = 0; cid < csc_col_ptr.size() - 1; cid ++){ 40 | int cstart = csc_col_host[cid]; 41 | int cend = csc_col_host[cid + 1]; 42 | for(int off = cstart; off < cend; off++){ 43 | 44 | float val = csc_val_host[off]; 45 | int rid = csc_row_host[off]; 46 | // std::cout<<"rid:"< 5 | #include 6 | #include "thundergbm/dataset.h" 7 | #include "cusparse.h" 8 | #include "thrust/sort.h" 9 | #include "thrust/system/cuda/detail/par.h" 10 | #include "thundergbm/util/device_lambda.cuh" 11 | 12 | void DataSet::load_from_file(string file_name) { 13 | LOG(INFO) << "loading LIBSVM dataset from file \"" << file_name << "\""; 14 | y_.clear(); 15 | features.clear(); 16 | line_num.clear(); 17 | //instances_.clear(); 18 | n_features_ = 123; 19 | std::ifstream ifs(file_name, std::ifstream::binary); 20 | CHECK(ifs.is_open()) << "file " << file_name << " not found"; 21 | 22 | std::array buffer{}; //16M 23 | const int nthread = omp_get_max_threads(); 24 | 25 | auto find_last_line = [](char *ptr, const char *begin) { 26 | while (ptr != begin && *ptr != '\n' && *ptr != '\r') --ptr; 27 | return ptr; 28 | }; 29 | 30 | string first_line; 31 | getline(ifs, first_line); 32 | std::stringstream first_ss(first_line); 33 | int n_f_first = 0; 34 | string tuple; 35 | while(first_ss >> tuple) 36 | n_f_first++; 37 | ifs.clear(); 38 | ifs.seekg (0, std::ios::beg); 39 | 40 | int n_sum_line = 0; 41 | while (ifs) { 42 | ifs.read(buffer.data(), buffer.size()); 43 | char *head = buffer.data(); 44 | size_t size = ifs.gcount(); 45 | vector> y_thread(nthread); 46 | //vector instances_thread(nthread); 47 | 48 | vector local_feature(nthread, 0); 49 | 50 | //vector>> index_thread(nthread); 51 | vector>> feature_thread(nthread); 52 | vector>> line_thread(nthread); 53 | for(int i = 0; i < nthread; i++){ 54 | feature_thread[i].resize(n_f_first * 2); 55 | line_thread[i].resize(n_f_first * 2); 56 | } 57 | vector n_line(nthread); 58 | #pragma omp parallel num_threads(nthread) 59 | { 60 | //get working area of this thread 61 | int tid = omp_get_thread_num(); 62 | size_t nstep = (size + nthread - 1) / nthread; 63 | size_t sbegin = std::min(tid * nstep, size - 1); 64 | size_t send = std::min((tid + 1) * nstep, size - 1); 65 | char *pbegin = find_last_line(head + sbegin, head); 66 | char *pend = find_last_line(head + send, head); 67 | 68 | //move stream start position to the end of last line 69 | if (tid == nthread - 1) ifs.seekg(pend - head - send, 
std::ios_base::cur); 70 | 71 | //read instances line by line 72 | char *lbegin = pbegin; 73 | char *lend = lbegin; 74 | int lid = 0; 75 | while (lend != pend) { 76 | //get one line 77 | lend = lbegin + 1; 78 | while (lend != pend && *lend != '\n' && *lend != '\r') { 79 | ++lend; 80 | } 81 | string line(lbegin, lend); 82 | std::stringstream ss(line); 83 | 84 | //read label of an instance 85 | y_thread[tid].emplace_back(); 86 | ss >> y_thread[tid].back(); 87 | 88 | string tuple; 89 | //int fid = 0; 90 | while(ss >> tuple){ 91 | int i; 92 | float v; 93 | CHECK_EQ(sscanf(tuple.c_str(), "%d:%f", &i, &v), 2) << "read error, using [index]:[value] format"; 94 | //index_thread[tid].back().emplace_back(i); 95 | if(i > local_feature[tid]){ 96 | local_feature[tid] = i; 97 | } 98 | if(i > feature_thread[tid].size()){ 99 | feature_thread[tid].resize(i); 100 | line_thread[tid].resize(i); 101 | // min_fea.resize(i); 102 | // max_fea.resize(i); 103 | // min_fea[i - 1] = INFINITY; 104 | // max_fea[i - 1] = -INFINITY; 105 | } 106 | 107 | feature_thread[tid][i-1].emplace_back(v); 108 | line_thread[tid][i-1].emplace_back(lid); 109 | //fid++; 110 | 111 | 112 | } 113 | lid++; 114 | //read next instance 115 | lbegin = lend; 116 | 117 | } 118 | n_line[tid] = lid; 119 | } 120 | for (int i = 0; i < nthread; i++) { 121 | if (local_feature[i] > n_features_) 122 | n_features_ = local_feature[i]; 123 | } 124 | // this->features.resize(n_features_); 125 | // this->line_num.resize(n_features_); 126 | this->features.resize(max_dimension); 127 | this->line_num.resize(max_dimension); 128 | for(int i = 0; i < nthread; i++) { 129 | for(int j = 0; j < local_feature[i]; j++) { 130 | this->features[j].insert(this->features[j].end(), 131 | feature_thread[i][j].begin(), 132 | feature_thread[i][j].end()); 133 | for (int k = 0; k < line_thread[i][j].size(); k++) { 134 | line_thread[i][j][k] += n_sum_line; 135 | } 136 | this->line_num[j].insert(this->line_num[j].end(), 137 | line_thread[i][j].begin(), line_thread[i][j].end()); 138 | } 139 | n_sum_line += n_line[i]; 140 | } 141 | for (int i = 0; i < nthread; i++) { 142 | this->y_.insert(y_.end(), y_thread[i].begin(), y_thread[i].end()); 143 | } 144 | } 145 | n_features_ = max_dimension; 146 | min_fea.resize(max_dimension); 147 | max_fea.resize(max_dimension); 148 | for(int i = 0; i < max_dimension; i++){ 149 | if(features[i].size() == 0){ 150 | min_fea[i] = INFINITY; 151 | max_fea[i] = -INFINITY; 152 | } 153 | else{ 154 | min_fea[i] = *std::min_element(features[i].begin(), features[i].end()); 155 | max_fea[i] = *std::max_element(features[i].begin(), features[i].end()); 156 | } 157 | } 158 | LOG(INFO) << "#instances = " << this->n_instances() << ", #features = " << this->n_features(); 159 | } 160 | 161 | void DataSet::load_from_file_csr(string file_name) { 162 | LOG(INFO) << "loading LIBSVM dataset from file \"" << file_name << "\""; 163 | y_.clear(); 164 | instances_.clear(); 165 | n_features_ = 0; 166 | std::ifstream ifs(file_name, std::ifstream::binary); 167 | CHECK(ifs.is_open()) << "file " << file_name << " not found"; 168 | 169 | std::array buffer{}; //16M 170 | const int nthread = omp_get_max_threads(); 171 | 172 | auto find_last_line = [](char *ptr, const char *begin) { 173 | while (ptr != begin && *ptr != '\n' && *ptr != '\r') --ptr; 174 | return ptr; 175 | }; 176 | 177 | while (ifs) { 178 | ifs.read(buffer.data(), buffer.size()); 179 | char *head = buffer.data(); 180 | size_t size = ifs.gcount(); 181 | vector> y_thread(nthread); 182 | vector instances_thread(nthread); 183 | 
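        //Added note on the chunked parsing used by these loaders: the read
        //buffer is cut into nthread byte ranges and each range is snapped back
        //to the previous newline with find_last_line, so every thread parses
        //whole lines only. A minimal sketch of the boundary logic (same idea
        //as the code below):
        //  size_t nstep = (size + nthread - 1) / nthread;
        //  char *pbegin = find_last_line(head + tid * nstep, head);     //my first line
        //  char *pend = find_last_line(head + (tid + 1) * nstep, head); //next thread's first line
        //Thread-local results are concatenated in thread-id order afterwards,
        //which preserves the original instance order.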
184 | vector local_feature(nthread, 0); 185 | #pragma omp parallel num_threads(nthread) 186 | { 187 | //get working area of this thread 188 | int tid = omp_get_thread_num(); 189 | size_t nstep = (size + nthread - 1) / nthread; 190 | size_t sbegin = std::min(tid * nstep, size - 1); 191 | size_t send = std::min((tid + 1) * nstep, size - 1); 192 | char *pbegin = find_last_line(head + sbegin, head); 193 | char *pend = find_last_line(head + send, head); 194 | 195 | //move stream start position to the end of last line 196 | if (tid == nthread - 1) ifs.seekg(pend - head - send, std::ios_base::cur); 197 | 198 | //read instances line by line 199 | char *lbegin = pbegin; 200 | char *lend = lbegin; 201 | while (lend != pend) { 202 | //get one line 203 | lend = lbegin + 1; 204 | while (lend != pend && *lend != '\n' && *lend != '\r') { 205 | ++lend; 206 | } 207 | string line(lbegin, lend); 208 | std::stringstream ss(line); 209 | 210 | //read label of an instance 211 | y_thread[tid].emplace_back(); 212 | ss >> y_thread[tid].back(); 213 | 214 | //read features of an instance 215 | instances_thread[tid].emplace_back(); 216 | string tuple; 217 | while (ss >> tuple) { 218 | int i; 219 | float v; 220 | CHECK_EQ(sscanf(tuple.c_str(), "%d:%f", &i, &v), 2) << "read error, using [index]:[value] format"; 221 | instances_thread[tid].back().emplace_back(i, v); 222 | if (i > local_feature[tid]) local_feature[tid] = i; 223 | }; 224 | 225 | //read next instance 226 | lbegin = lend; 227 | } 228 | } 229 | for (int i = 0; i < nthread; i++) { 230 | if (local_feature[i] > n_features_) 231 | n_features_ = local_feature[i]; 232 | } 233 | for (int i = 0; i < nthread; i++) { 234 | this->y_.insert(y_.end(), y_thread[i].begin(), y_thread[i].end()); 235 | this->instances_.insert(instances_.end(), instances_thread[i].begin(), instances_thread[i].end()); 236 | } 237 | } 238 | LOG(INFO) << "#instances = " << this->n_instances() << ", #features = " << this->n_features(); 239 | } 240 | 241 | void DataSet::load_from_file_two_dimension(string file_name){ 242 | LOG(INFO) << "loading LIBSVM dataset from file \"" << file_name << "\""; 243 | y_.clear(); 244 | features.clear(); 245 | line_num.clear(); 246 | //instances_.clear(); 247 | n_features_ = 0; 248 | std::ifstream ifs(file_name, std::ifstream::binary); 249 | CHECK(ifs.is_open()) << "file " << file_name << " not found"; 250 | 251 | std::array buffer{}; //16M 252 | const int nthread = omp_get_max_threads(); 253 | 254 | auto find_last_line = [](char *ptr, const char *begin) { 255 | while (ptr != begin && *ptr != '\n' && *ptr != '\r') --ptr; 256 | return ptr; 257 | }; 258 | 259 | string first_line; 260 | getline(ifs, first_line); 261 | std::stringstream first_ss(first_line); 262 | int n_f_first = 0; 263 | string tuple; 264 | while(first_ss >> tuple) 265 | n_f_first++; 266 | ifs.clear(); 267 | ifs.seekg (0, std::ios::beg); 268 | 269 | int n_sum_line = 0; 270 | while (ifs) { 271 | ifs.read(buffer.data(), buffer.size()); 272 | char *head = buffer.data(); 273 | size_t size = ifs.gcount(); 274 | vector> y_thread(nthread); 275 | vector instances_thread(nthread); 276 | 277 | vector local_feature(nthread, 0); 278 | 279 | //vector>> index_thread(nthread); 280 | vector>> feature_thread(nthread); 281 | vector>> line_thread(nthread); 282 | for(int i = 0; i < nthread; i++){ 283 | feature_thread[i].resize(n_f_first * 2); 284 | line_thread[i].resize(n_f_first * 2); 285 | } 286 | vector n_line(nthread); 287 | #pragma omp parallel num_threads(nthread) 288 | { 289 | //get working area of this thread 290 | 
int tid = omp_get_thread_num(); 291 | size_t nstep = (size + nthread - 1) / nthread; 292 | size_t sbegin = std::min(tid * nstep, size - 1); 293 | size_t send = std::min((tid + 1) * nstep, size - 1); 294 | char *pbegin = find_last_line(head + sbegin, head); 295 | char *pend = find_last_line(head + send, head); 296 | 297 | //move stream start position to the end of last line 298 | if (tid == nthread - 1) ifs.seekg(pend - head - send, std::ios_base::cur); 299 | 300 | //read instances line by line 301 | char *lbegin = pbegin; 302 | char *lend = lbegin; 303 | int lid = 0; 304 | while (lend != pend) { 305 | //get one line 306 | lend = lbegin + 1; 307 | while (lend != pend && *lend != '\n' && *lend != '\r') { 308 | ++lend; 309 | } 310 | string line(lbegin, lend); 311 | std::stringstream ss(line); 312 | 313 | //read label of an instance 314 | y_thread[tid].emplace_back(); 315 | ss >> y_thread[tid].back(); 316 | 317 | //read features of an instance 318 | instances_thread[tid].emplace_back(); 319 | 320 | string tuple; 321 | //int fid = 0; 322 | while(ss >> tuple){ 323 | int i; 324 | float v; 325 | CHECK_EQ(sscanf(tuple.c_str(), "%d:%f", &i, &v), 2) << "read error, using [index]:[value] format"; 326 | instances_thread[tid].back().emplace_back(i, v); 327 | if(i > local_feature[tid]){ 328 | local_feature[tid] = i; 329 | } 330 | if(i > feature_thread[tid].size()){ 331 | feature_thread[tid].resize(i); 332 | line_thread[tid].resize(i); 333 | } 334 | 335 | feature_thread[tid][i-1].emplace_back(v); 336 | line_thread[tid][i-1].emplace_back(lid); 337 | //fid++; 338 | 339 | 340 | } 341 | lid++; 342 | //read next instance 343 | lbegin = lend; 344 | 345 | } 346 | n_line[tid] = lid; 347 | } 348 | for (int i = 0; i < nthread; i++) { 349 | if (local_feature[i] > n_features_) 350 | n_features_ = local_feature[i]; 351 | } 352 | this->features.resize(n_features_); 353 | this->line_num.resize(n_features_); 354 | for(int i = 0; i < nthread; i++) { 355 | for(int j = 0; j < local_feature[i]; j++) { 356 | this->features[j].insert(this->features[j].end(), 357 | feature_thread[i][j].begin(), 358 | feature_thread[i][j].end()); 359 | for (int k = 0; k < line_thread[i][j].size(); k++) { 360 | line_thread[i][j][k] += n_sum_line; 361 | } 362 | this->line_num[j].insert(this->line_num[j].end(), 363 | line_thread[i][j].begin(), line_thread[i][j].end()); 364 | } 365 | n_sum_line += n_line[i]; 366 | } 367 | for (int i = 0; i < nthread; i++) { 368 | this->y_.insert(y_.end(), y_thread[i].begin(), y_thread[i].end()); 369 | this->instances_.insert(instances_.end(), instances_thread[i].begin(), instances_thread[i].end()); 370 | } 371 | } 372 | LOG(INFO) << "#instances = " << this->n_instances() << ", #features = " << this->n_features(); 373 | } 374 | 375 | const DataSet::node2d &DataSet::instances() const { 376 | return this->instances_; 377 | } 378 | 379 | size_t DataSet::n_features() const { 380 | return n_features_; 381 | } 382 | 383 | size_t DataSet::n_instances() const { 384 | //return this->instances_.size(); 385 | return this->y_.size(); 386 | } 387 | 388 | const vector &DataSet::y() const { 389 | return this->y_; 390 | } 391 | 392 | void DataSet::compression() { 393 | 394 | } 395 | -------------------------------------------------------------------------------- /src/thundergbm/gpu_lsh.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include "thundergbm/gpu_lsh.h" 3 | #include "thundergbm/util/device_lambda.cuh" 4 | 5 | 6 | 7 | void psdLsh::init(){ 8 | 
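    //Background (added note, not original source): this sets up p-stable LSH.
    //Each of the param.n_table hash functions has the form
    //  h(v) = floor((a . v + b) / r)
    //where the entries of a are drawn from a 1-stable (Cauchy, p_norm == 1)
    //or 2-stable (Gaussian, p_norm == 2) distribution and b ~ U[0, r), so
    //points that are close under the corresponding Lp norm collide with high
    //probability. hash() below evaluates a . v for all tables at once as a
    //sparse-dense matrix product (cusparseScsrmm) and finally reduces the
    //bucket index modulo param.n_bucket.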
tables.resize(param.n_table); 9 | //std::cout<<"init n_table:"< ur(0, param.r); 27 | 28 | switch (param.p_norm) 29 | { 30 | case 1: //CAUCHY 31 | { 32 | 33 | for(unsigned i = 0; i < param.n_table; i++){ 34 | std::cauchy_distribution cd; 35 | for(unsigned j = 0; j < param.n_dimension; j++) 36 | v_a.push_back(cd(rng)); 37 | // random_vector.push_back(rng()); 38 | v_b.push_back(ur(rng)); 39 | } 40 | break; 41 | // for (std::vector >::iterator iter = stableArray.begin(); iter != stableArray.end(); ++iter) 42 | // { 43 | // for (unsigned i = 0; i != param.D; ++i) 44 | // { 45 | // iter->push_back(cd(rng)); 46 | // } 47 | // rndBs.push_back(ur(rng)); 48 | // } 49 | // return; 50 | } 51 | case 2: //GAUSSIAN 52 | { 53 | for(unsigned i = 0; i < param.n_table; i++){ 54 | std::normal_distribution nd; 55 | for(unsigned j = 0; j < param.n_dimension; j++) 56 | v_a.push_back(nd(rng)); 57 | // random_vector.push_back(rng()); 58 | v_b.push_back(ur(rng)); 59 | } 60 | break; 61 | 62 | // for (std::vector >::iterator iter = stableArray.begin(); iter != stableArray.end(); ++iter) 63 | // { 64 | // for (unsigned i = 0; i != param.D; ++i) 65 | // { 66 | // iter->push_back(nd(rng)); 67 | // } 68 | // rndBs.push_back(ur(rng)); 69 | // } 70 | // return; 71 | } 72 | default: 73 | { 74 | break; 75 | } 76 | } 77 | 78 | a.resize(v_a.size()); 79 | a.copy_from(v_a.data(), v_a.size()); 80 | b.resize(v_b.size()); 81 | b.copy_from(v_b.data(), v_b.size()); 82 | 83 | for(int i = 0; i < param.n_table; i++){ 84 | // tables[i].resize(param.n_bucket); 85 | for(int j = 0; j < param.n_bucket; j++) 86 | tables[i][j].resize(param.n_comp); 87 | } 88 | } 89 | 90 | void psdLsh::reset(const Parameter ¶m_) 91 | { 92 | param = param_; 93 | init(); 94 | } 95 | 96 | 97 | void psdLsh::hash(int n_instances, int n_features, int nnz, int key_offset, 98 | SyncArray &csr_val, SyncArray &csr_row_ptr, SyncArray &csr_col_ind, SyncArray &hash_values, int cid) { 99 | // cudaSetDevice(0); 100 | 101 | CHECK(n_features == param.n_dimension); 102 | cusparseHandle_t handle; 103 | cusparseMatDescr_t descr; 104 | cusparseCreate(&handle); 105 | cusparseCreateMatDescr(&descr); 106 | cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO); 107 | cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL); 108 | float one(1); 109 | float zero(0); 110 | SyncArray result(n_instances * param.n_table); 111 | float *result_device = result.device_data(); 112 | float *b_device = b.device_data(); 113 | cudaDeviceSynchronize(); 114 | cusparseScsrmm(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, n_instances, param.n_table, n_features, nnz, &one, descr, 115 | csr_val.device_data(), csr_row_ptr.device_data(), csr_col_ind.device_data(), a.device_data(), 116 | n_features, 117 | &zero, result_device, n_instances); 118 | 119 | cudaDeviceSynchronize(); 120 | 121 | int *hash_values_device = hash_values.device_data(); 122 | 123 | SyncArray r_gpu(1); 124 | r_gpu.host_data()[0] = param.r; 125 | SyncArray n_table_gpu(1); 126 | n_table_gpu.host_data()[0] = param.n_table; 127 | cudaDeviceSynchronize(); 128 | float* r_device = r_gpu.device_data(); 129 | int* n_table_device = n_table_gpu.device_data(); 130 | device_loop(result.size(), [=]__device__(int vid){ 131 | result_device[vid] = result_device[vid] + b_device[vid % n_table_device[0]]; 132 | result_device[vid] = result_device[vid] / r_device[0]; 133 | hash_values_device[vid] = __float2int_rd(result_device[vid]); 134 | // hash_values_device[vid] = (int) (floorf(result_device[vid])); 135 | }); 136 | cudaDeviceSynchronize(); 137 | int 
*hash_values_host = hash_values.host_data(); 138 | for (unsigned nid = 0; nid < n_instances; nid++) { 139 | for (unsigned tid = 0; tid < param.n_table; tid++) { 140 | hash_values_host[nid * param.n_table + tid] %= param.n_bucket; 141 | // std::cout<<"hash value"< &csr_val, SyncArray &csr_row_ptr, SyncArray &csr_col_ind, 156 | // vector>& buckets){ 157 | // CHECK(n_features == param.n_dimension); 158 | // cusparseHandle_t handle; 159 | // cusparseMatDescr_t descr; 160 | // cusparseCreate(&handle); 161 | // cusparseCreateMatDescr(&descr); 162 | // cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO); 163 | // cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL); 164 | // float one(1); 165 | // float zero(0); 166 | // SyncArray result(n_instances * param.n_table); 167 | // float *result_device = result.device_data(); 168 | // float *b_device = b.device_data(); 169 | // cusparseScsrmm(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, n_instances, param.n_table, n_features, nnz, &one, descr, 170 | // csr_val.device_data(), csr_row_ptr.device_data(), csr_col_ind.device_data(), a.device_data(), 171 | // n_features, 172 | // &zero, result_device, n_instances); 173 | // 174 | // SyncArray hash_values(n_instances * param.n_table); 175 | // int *hash_values_device = hash_values.device_data(); 176 | // int *hash_values_host = hash_values.host_data(); 177 | // device_loop(n_instances * param.n_table, [=] 178 | // __device__(int vid){ 179 | // result_device[vid] += b_device[vid % param.n_table]; 180 | // result_device[vid] /= param.r; 181 | // hash_values_device[vid] = (int) (floorf(result_device[vid])); 182 | // }); 183 | // for (unsigned nid = 0; nid < n_instances; nid++) { 184 | // for (unsigned tid = 0; tid < param.n_table; tid++) { 185 | // int bid = hash_values_host[nid * n_instances + tid] % param.n_bucket; 186 | // buckets[nid].insert(buckets[nid].end(), tables[tid][bid].begin(), tables[tid][bid].end()); 187 | // } 188 | // } 189 | //} 190 | -------------------------------------------------------------------------------- /src/thundergbm/hist_cut.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by qinbin on 2018/5/9. 
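//
// Added note: get_cut_points builds the histogram bin boundaries with a
// weighted quantile sketch, much like XGBoost's approximate split finding.
// Each feature value is inserted with its instance's hessian as the weight
// and a target error of eps = 1.0 / (max_num_bins * kFactor); the merged
// per-feature summary is then pruned back to at most max_num_bins entries,
// and the surviving values become the cut points used by the histogram
// updater.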
3 | // 4 | 5 | #include "thundergbm/hist_cut.h" 6 | #include "thundergbm/quantile_sketch.h" 7 | #include "thundergbm/syncarray.h" 8 | #include 9 | #include 10 | void HistCut::get_cut_points(SparseColumns& columns, InsStat& stats, int max_num_bins, int n_instances, int device_id){ 11 | LOG(TRACE)<<"get cut points"; 12 | int n_features = columns.n_column; 13 | std::cout<<"n_featrues:"< sketchs(n_features); 15 | const int kFactor = 8; 16 | for(int i = 0; i < n_features; i++){ 17 | sketchs[i].Init(n_instances, 1.0 / (max_num_bins * kFactor)); 18 | } 19 | float_type* val_ptr = columns.csc_val.host_data(); 20 | int* row_ptr = columns.csc_row_ind.host_data(); 21 | int* col_ptr = columns.csc_col_ptr.host_data(); 22 | auto stat_gh_ptr = stats.gh_pair.host_data(); 23 | // std::cout<<"before add"<= col_ptr[i]; j--){ 27 | float_type val = val_ptr[j]; 28 | float_type weight = stat_gh_ptr[row_ptr[j]].h; 29 | sketchs[i].Add(val, weight); 30 | } 31 | } 32 | // std::cout<<"after add"< n_summary(n_features); 34 | // summary n_summary[n_features]; 35 | // std::cout<<"before prune"<> cut_points_local; 45 | cut_points_local.resize(n_features); 46 | vector cut_points_size(n_features); 47 | for(int i = 0; i < n_features; i++) 48 | cut_points_local[i].resize(max_num_bins); 49 | #pragma omp parallel num_threads(nthread) 50 | { 51 | int tid = omp_get_thread_num(); 52 | int nstep = (n_features + nthread - 1) / nthread; 53 | int sbegin = std::min(tid * nstep, n_features); 54 | int send = std::min((tid + 1) * nstep, n_features); 55 | for(int i = sbegin; i < send; i++){ 56 | int k = 0; 57 | summary ts; 58 | ts.Reserve(max_num_bins); 59 | ts.Prune(n_summary[i], max_num_bins); 60 | if(ts.entry_size == 0) { cut_points_size[i] = 0;continue;} 61 | float_type min_val = ts.entries[0].val; 62 | //push a value that is smaller than min val 63 | cut_points_local[i][k++] = min_val - (fabsf(min_val) + 1e-5); 64 | 65 | if(ts.entry_size > 1 && ts.entry_size <= 16){ 66 | cut_points_local[i][k++] = (ts.entries[0].val + ts.entries[1].val) / 2; 67 | for(int j = 2; j < ts.entry_size; j++){ 68 | float_type mid = (ts.entries[j - 1].val + ts.entries[j].val) / 2; 69 | if(mid > cut_points_local[i][k-1]){ 70 | cut_points_local[i][k++] = mid; 71 | } 72 | } 73 | } 74 | else{ 75 | if(ts.entry_size == max_num_bins) 76 | LOG(INFO)<<"max bin"; 77 | if(ts.entry_size > 1) 78 | cut_points_local[i][k++] = ts.entries[1].val; 79 | for(int j = 2; j < ts.entry_size; j++){ 80 | float_type val = ts.entries[j].val; 81 | if(val > cut_points_local[i][k-1]){ 82 | cut_points_local[i][k++] = val; 83 | } 84 | } 85 | } 86 | 87 | /* 88 | float_type max_val = ts.entries[ts.entry_size - 1].val; 89 | if(max_val > 0){ 90 | cut_points_local[i][k++] = max_val*2 + 1e-5; 91 | } 92 | else{ 93 | cut_points_local[i][k++] = 1e-5; 94 | } 95 | */ 96 | cut_points_size[i]=k; 97 | } 98 | } 99 | for (int i = 0; i < n_features; i++) { 100 | if(cut_points_size[i] != 0) 101 | this->cut_points.insert(cut_points.end(), cut_points_local[i].begin(), cut_points_local[i].begin() + cut_points_size[i]); 102 | } 103 | this->row_ptr.push_back(0); 104 | for(int i = 0; i < n_features; i++){ 105 | this->row_ptr.push_back(cut_points_size[i]+this->row_ptr.back()); 106 | } 107 | CUDA_CHECK(cudaSetDevice(device_id)); 108 | cut_row_ptr.resize(this->row_ptr.size()); 109 | cut_row_ptr.copy_from(this->row_ptr.data(), this->row_ptr.size()); 110 | cut_points_val.resize(this->cut_points.size()); 111 | auto cut_points_val_ptr = cut_points_val.host_data(); 112 | auto cut_row_ptr_data = cut_row_ptr.host_data(); 113 
| //store each feature's cut points in descending order 114 | for(int i = 0; i < cut_row_ptr.size() - 1; i++){ //one segment per feature; iterating to size() would read past the end of cut_row_ptr 115 | int sum = cut_row_ptr_data[i] + cut_row_ptr_data[i+1] - 1; 116 | for(int j = cut_row_ptr_data[i+1] - 1; j >= cut_row_ptr_data[i]; j--) 117 | cut_points_val_ptr[j] = this->cut_points[sum - j]; 118 | } 119 | CUDA_CHECK(cudaSetDevice(device_id)); 120 | 121 | } 122 | 123 | void BinStat::Init(HistCut& cut, InsStat& stats, int pid, float_type* f_val, int n_f_val, int* iid){ 124 | this->numBin = cut.row_ptr[pid+1] - cut.row_ptr[pid]; 125 | this->gh_pair.resize(cut.row_ptr[pid+1] - cut.row_ptr[pid]); 126 | auto cbegin = cut.cut_points.begin()+ cut.row_ptr[pid]; 127 | auto cend = cut.cut_points.begin()+ cut.row_ptr[pid + 1]; 128 | for(int i = 0; i < n_f_val; i++){ 129 | float_type val = f_val[i]; 130 | float_type g = stats.gh_pair.host_data()[iid[i]].g; 131 | float_type h = stats.gh_pair.host_data()[iid[i]].h; 132 | auto off = std::upper_bound(cbegin, cend, val); 133 | if(off == cend) off = cend - 1; 134 | int bid = off - cbegin; 135 | this->gh_pair.host_data()[bid].g += g; 136 | this->gh_pair.host_data()[bid].h += h; 137 | } 138 | } 139 | 140 | 141 | //void BinStat::Init(vector& cut_points, InsStat& stats, SparseColumns& columns, int fid){ 142 | // this->fid = fid; 143 | // this->gh_pair.resize(cut_points.size()); 144 | // float_type* val_ptr = columns.csc_val.host_data(); 145 | // int* row_ptr = columns.csc_row_ind.host_data(); 146 | // int* col_ptr = columns.csc_col_ptr.host_data(); 147 | // for(int i = col_ptr[fid + 1] - 1; i >= col_ptr[fid]; i--){ 148 | // float_type val = val_ptr[i]; 149 | // float_type g = stats.gh_pair.host_data()[row_ptr[i]].g; 150 | // float_type h = stats.gh_pair.host_data()[row_ptr[i]].h; 151 | // auto cbegin = cut_points.begin(); 152 | // auto cend = cut_points.end(); 153 | // auto off = std::upper_bound(cbegin, cend, val); 154 | // if(off == cend) off = cend - 1; 155 | // this->bid = off - cbegin; 156 | // this->gh_pair.host_data()[bid].g += g; 157 | // this->gh_pair.host_data()[bid].h += h; 158 | // } 159 | //} 160 | -------------------------------------------------------------------------------- /src/thundergbm/ins_stat.cu: -------------------------------------------------------------------------------- 1 | // 2 | // Created by shijiashuai on 5/7/18. 3 | // 4 | #include "thundergbm/ins_stat.h" 5 | 6 | void InsStat::resize(size_t n_instances) { 7 | this->n_instances = n_instances; 8 | gh_pair.resize(n_instances); 9 | nid.resize(n_instances); 10 | y.resize(n_instances); 11 | y_predict.resize(n_instances); 12 | } 13 | 14 | void InsStat::updateGH() { 15 | sum_gh = GHPair(0, 0); 16 | GHPair *gh_pair_data = gh_pair.host_data(); 17 | int *nid_data = nid.host_data(); 18 | float_type *stats_y_data = y.host_data(); 19 | float_type *stats_yp_data = y_predict.host_data(); 20 | LOG(DEBUG) << y_predict; 21 | LOG(TRACE) << "initializing instance statistics"; 22 | //TODO parallel?
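    //One way the TODO above could be resolved (a sketch only, not applied
    //here): the per-instance updates are independent and sum_gh is a
    //reduction, so with scalar accumulators the loop could become
    //  float_type sum_g = 0, sum_h = 0;
    //  #pragma omp parallel for reduction(+ : sum_g, sum_h)
    //  for (int i = 0; i < n_instances; ++i) {
    //      nid_data[i] = 0;
    //      gh_pair_data[i].g = stats_yp_data[i] - stats_y_data[i];
    //      gh_pair_data[i].h = 1;
    //      sum_g += gh_pair_data[i].g;
    //      sum_h += gh_pair_data[i].h;
    //  }
    //  sum_gh = GHPair(sum_g, sum_h);
    //OpenMP cannot reduce over the GHPair struct without a user-declared
    //reduction, hence the two scalar accumulators.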
23 | for (int i = 0; i < n_instances; ++i) { 24 | nid_data[i] = 0; 25 | //TODO support other objective function 26 | gh_pair_data[i].g = stats_yp_data[i] - stats_y_data[i]; 27 | gh_pair_data[i].h = 1; 28 | sum_gh = sum_gh + gh_pair_data[i]; 29 | } 30 | } 31 | 32 | void InsStat::updateGH(SyncArray& is_multi) { 33 | sum_gh = GHPair(0, 0); 34 | GHPair *gh_pair_data = gh_pair.host_data(); 35 | int *nid_data = nid.host_data(); 36 | float_type *stats_y_data = y.host_data(); 37 | float_type *stats_yp_data = y_predict.host_data(); 38 | bool* is_multi_data = is_multi.host_data(); 39 | LOG(DEBUG) << y_predict; 40 | LOG(TRACE) << "initializing instance statistics"; 41 | //TODO parallel? 42 | for (int i = 0; i < n_instances; ++i) { 43 | nid_data[i] = 0; 44 | //TODO support other objective function 45 | if(is_multi_data[i]) { 46 | gh_pair_data[i].g = 2 * stats_yp_data[i] - stats_y_data[i]; 47 | gh_pair_data[i].h = 2; 48 | } 49 | else { 50 | gh_pair_data[i].g = stats_yp_data[i] - stats_y_data[i]; 51 | gh_pair_data[i].h = 1; 52 | } 53 | sum_gh = sum_gh + gh_pair_data[i]; 54 | } 55 | } 56 | 57 | void InsStat::updateGH(SyncArray& is_multi, int numP) { 58 | sum_gh = GHPair(0, 0); 59 | GHPair *gh_pair_data = gh_pair.host_data(); 60 | int *nid_data = nid.host_data(); 61 | float_type *stats_y_data = y.host_data(); 62 | float_type *stats_yp_data = y_predict.host_data(); 63 | bool* is_multi_data = is_multi.host_data(); 64 | LOG(DEBUG) << y_predict; 65 | LOG(TRACE) << "initializing instance statistics"; 66 | //TODO parallel? 67 | for (int i = 0; i < n_instances; ++i) { 68 | nid_data[i] = 0; 69 | //TODO support other objective function 70 | if(is_multi_data[i]) { 71 | gh_pair_data[i].g = numP * stats_yp_data[i] - stats_y_data[i]; // y is already multipled in similar_ins_bundle 72 | gh_pair_data[i].h = numP; 73 | 74 | // gh_pair_data[i].g = stats_yp_data[i] - stats_y_data[i]; 75 | // gh_pair_data[i].h = 1; 76 | } 77 | else { 78 | gh_pair_data[i].g = stats_yp_data[i] - stats_y_data[i]; 79 | gh_pair_data[i].h = 1; 80 | } 81 | sum_gh = sum_gh + gh_pair_data[i]; 82 | } 83 | } -------------------------------------------------------------------------------- /src/thundergbm/quantile_sketch.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by qinbin on 2018/5/9. 3 | // 4 | 5 | #include "thundergbm/quantile_sketch.h" 6 | #include 7 | 8 | void summary::Reserve(int size){ 9 | if(size > entries.size()){ 10 | entry_reserve_size = size; 11 | entries.resize(size); 12 | //data... 
13 | } 14 | } 15 | 16 | void summary::Copy(summary& src){ 17 | entry_size = src.entry_size; 18 | entry_reserve_size = src.entry_reserve_size; 19 | entries = src.entries; 20 | } 21 | 22 | void summary::Merge(summary& src1, summary& src2){ 23 | if(src1.entry_size == 0 && src2.entry_size == 0){ 24 | this->entry_size = 0; 25 | this->entry_reserve_size = 0; 26 | this->entries.clear(); 27 | return; 28 | } 29 | else if(src1.entry_size == 0){ 30 | this->Copy(src2); 31 | return; 32 | } 33 | else if(src2.entry_size == 0){ 34 | this->Copy(src1); 35 | return; 36 | } 37 | float_type r1 = 0; 38 | float_type r2 = 0; 39 | int i = 0, j = 0; 40 | this->Reserve(src1.entry_size + src2.entry_size); 41 | this->entry_size = 0; 42 | for(; i < src1.entry_size && j < src2.entry_size;){ 43 | float_type val1 = src1.entries[i].val; //val is float_type; compare without truncating to int 44 | float_type val2 = src2.entries[j].val; 45 | if(val1 == val2){ 46 | CHECK(this->entry_size < entry_reserve_size) << this->entry_size; 47 | this->entries[this->entry_size++] = entry(val1, 48 | src1.entries[i].rmin + src2.entries[j].rmin, 49 | src1.entries[i].rmax + src2.entries[j].rmax, 50 | src1.entries[i].w + src2.entries[j].w); 51 | r1 = src1.entries[i].rmin + src1.entries[i].w; 52 | r2 = src2.entries[j].rmin + src2.entries[j].w; 53 | i++; 54 | j++; 55 | //this->entry_size++; 56 | } 57 | else if(val1 < val2){ 58 | CHECK(this->entry_size < entry_reserve_size) << this->entry_size; 59 | this->entries[this->entry_size++]=entry(val1, 60 | src1.entries[i].rmin + r2, 61 | src1.entries[i].rmax + src2.entries[j].rmax - src2.entries[j].w, 62 | src1.entries[i].w); 63 | r1 = src1.entries[i].rmin + src1.entries[i].w; 64 | i++; 65 | //this->entry_size++; 66 | } 67 | else{ 68 | CHECK(this->entry_size < entry_reserve_size) << this->entry_size; 69 | this->entries[this->entry_size++] = entry(val2, 70 | src2.entries[j].rmin + r1, 71 | src2.entries[j].rmax + src1.entries[i].rmax - src1.entries[i].w, 72 | src2.entries[j].w); 73 | r2 = src2.entries[j].rmin + src2.entries[j].w; 74 | j++; 75 | //this->entry_size++; 76 | } 77 | } 78 | for(; i < src1.entry_size; i++){ 79 | CHECK(this->entry_size < entry_reserve_size) << this->entry_size; 80 | this->entries[this->entry_size++] = entry(src1.entries[i].val, 81 | src1.entries[i].rmin + r2, 82 | src1.entries[i].rmax + src2.entries[src2.entry_size - 1].rmax, 83 | src1.entries[i].w); 84 | //this->entry_size++; 85 | } 86 | for(; j < src2.entry_size; j++){ 87 | CHECK(this->entry_size < entry_reserve_size) << this->entry_size; 88 | this->entries[this->entry_size++] = entry(src2.entries[j].val, 89 | src2.entries[j].rmin + r1, 90 | src2.entries[j].rmax + src1.entries[src1.entry_size - 1].rmax, 91 | src2.entries[j].w); 92 | //this->entry_size++; 93 | } 94 | //this->entry_size = this->entries.size(); 95 | r1 = 0; 96 | r2 = 0; 97 | // float_type rmin_diff = 0; 98 | // float_type rmax_diff = 0; 99 | // float_type w_diff = 0; 100 | for(int i = 0; i < this->entry_size; i++){ 101 | if(this->entries[i].rmin < r1){ 102 | this->entries[i].rmin = r1; 103 | // if(r1 - this->entries[i].rmin > rmin_diff) 104 | // rmin_diff = r1 - this->entries[i].rmin; 105 | } 106 | else 107 | r1 = this->entries[i].rmin; 108 | if(this->entries[i].rmax < r2){ 109 | this->entries[i].rmax = r2; 110 | // if(r2 - this->entries[i].rmax > rmax_diff) 111 | // rmax_diff = r2 - this->entries[i].rmax; 112 | } 113 | if(this->entries[i].rmax < this->entries[i].rmin + this->entries[i].w){ 114 | this->entries[i].rmax = this->entries[i].rmin + this->entries[i].w; 115 | // if(this->entries[i].rmax - this->entries[i].rmin - 
this->entries[i].w > w_diff) 116 | // w_diff = this->entries[i].rmax - this->entries[i].rmin - this->entries[i].w; 117 | } 118 | r2 = this->entries[i].rmax; 119 | } 120 | } 121 | 122 | 123 | 124 | void summary::Prune(summary& src, int size){ 125 | if(src.entry_size <= size){ 126 | this->Copy(src); 127 | return; 128 | } 129 | float_type begin = src.entries[0].rmax; 130 | float_type End = src.entries[src.entry_size - 1].rmin; 131 | float_type range = End - begin; 132 | if(size <= 2 || range == 0.0f){ 133 | this->entry_size = 2; 134 | CHECK(1 < entry_reserve_size) << entry_reserve_size; 135 | this->entries[0] = src.entries[0]; 136 | this->entries[1] = src.entries[src.entry_size - 1]; 137 | return; 138 | } 139 | range = std::max(range, 1e-3f); 140 | int n_points = size - 2; 141 | int n_bigbin = 0; 142 | int safe_factor = 2; 143 | float_type chunk_size = safe_factor * range / n_points; 144 | float_type sum_small_range = 0; 145 | int j = 0; 146 | int i = 1; 147 | float_type r1; 148 | float_type r2; 149 | vector big_points; 150 | big_points.reserve(n_points + 1); 151 | //int last_big_point = 0; 152 | for(; i < src.entry_size - 1; i++){ 153 | CHECK(i < src.entry_reserve_size) << i; 154 | r1 = src.entries[i].rmin + src.entries[i].w; 155 | r2 = src.entries[i].rmax - src.entries[i].w; 156 | if(r1 > r2 + chunk_size){ 157 | n_bigbin++; 158 | big_points.push_back(i); 159 | // 160 | if(j != i -1) 161 | sum_small_range += r2 - (src.entries[j].rmin + src.entries[j].w); 162 | j = i; 163 | } 164 | } 165 | CHECK(n_bigbin < n_points) << "too many big bin"; 166 | int n_smallbin = n_points - n_bigbin; 167 | //r1 = src.entries[i].rmin + src.entries[i].w; 168 | r2 = src.entries[i].rmax - src.entries[i].w; 169 | if(j != src.entry_size - 2) 170 | sum_small_range += r2 - (src.entries[j].rmin + src.entries[j].w); 171 | CHECK(j < src.entry_reserve_size) << j; 172 | this->entries[0] = src.entries[0]; 173 | this->entry_size = 1; 174 | n_points -= n_bigbin; 175 | j = 0; 176 | int n_get_points = 1; 177 | //store maximum point 178 | big_points.push_back(src.entry_size - 1); 179 | for(int i = 0; i < big_points.size(); i++){ 180 | int id = big_points[i]; 181 | if(j != id -1){ 182 | CHECK(id < src.entry_reserve_size) << id; 183 | float_type r = src.entries[id].rmax - src.entries[id].w; 184 | int k = j; 185 | for(; n_get_points < n_points; n_get_points++){ 186 | float_type start = n_get_points * sum_small_range / n_points + begin; 187 | if(start >= r) 188 | break; 189 | for(; k < id; k++){ 190 | CHECK(k+1 < src.entry_reserve_size) << k+1; 191 | if(2 * start < (src.entries[k + 1].rmax + src.entries[k + 1].rmin)) 192 | break; 193 | } 194 | if(k == id) break; 195 | CHECK(k < src.entry_reserve_size) << k; 196 | if(2 * start >= src.entries[k].rmin + src.entries[k].w + src.entries[k+1].rmax - src.entries[k+1].w){ 197 | if(k != j - 1){ 198 | j = k + 1; 199 | CHECK(k < src.entry_reserve_size - 1) << k; 200 | this->entries[this->entry_size] = src.entries[k + 1]; 201 | this->entry_size++; 202 | 203 | } 204 | } 205 | else{ 206 | if(k != j){ 207 | j = k; 208 | CHECK(k < src.entry_reserve_size) << k; 209 | this->entries[this->entry_size] = src.entries[k]; 210 | this->entry_size++; 211 | } 212 | } 213 | } 214 | } 215 | //store big bin 216 | if(j != id){ 217 | CHECK(id < src.entry_reserve_size) << id; 218 | this->entries[this->entry_size] = src.entries[id]; 219 | this->entry_size++; 220 | j = id; 221 | } 222 | CHECK(j < src.entry_reserve_size) << j; 223 | begin += src.entries[j].rmin + 2 * src.entries[j].w - src.entries[j].rmax; 224 | } 225 | 
226 | if(this->entry_size == 64) LOG(INFO)<<"entry 64"; 227 | // for(int i = 1; i < src.entry_size; i++){ 228 | // if 229 | // } 230 | 231 | 232 | }; 233 | 234 | void Qitem::GetSummary(summary& ret){ 235 | //remove it if data is sorted 236 | //sort(data.begin(), data.begin() + tail); 237 | ret.entry_size = 0; 238 | float_type waccum = 0; 239 | for(int i = 0; i < tail;){ 240 | int j = i + 1; 241 | CHECK(i < data.size()) << i; 242 | float_type wt = data[i].second; 243 | for(; j < tail; j++){ 244 | CHECK(j < data.size()) << j; 245 | if(data[j].first == data[i].first) 246 | wt += data[j].second; 247 | else 248 | break; 249 | } 250 | CHECK(ret.entry_size < ret.entry_reserve_size) << ret.entry_size; 251 | ret.entries[ret.entry_size] = entry(data[i].first, waccum, waccum + wt, wt); 252 | ret.entry_size++; 253 | waccum += wt; 254 | i = j; 255 | } 256 | } 257 | 258 | void quanSketch::Init(int maxn, float_type eps){ 259 | numOfLevel = 1; 260 | while (1) { 261 | summarySize = ceil(numOfLevel / eps) + 1; 262 | int n = (1ULL << numOfLevel); 263 | if (n * summarySize >= maxn) break; 264 | ++numOfLevel; 265 | } 266 | // std::cout<<"summarySize:"<= maxn) << "invalid init parameter"; 269 | CHECK(numOfLevel <= summarySize * eps) << "invalid init parameter"; 270 | Qentry.data.clear(); 271 | Qentry.data.resize(summarySize * 2); 272 | Qentry.tail = 0; 273 | //summaries.clear(); 274 | } 275 | 276 | 277 | void quanSketch::Add(float_type value, float_type weight){ 278 | if(weight == 0.0f) return; 279 | if(Qentry.data.size() == Qentry.tail){ 280 | t_summary.Reserve(2*summarySize); 281 | Qentry.GetSummary(t_summary); 282 | Qentry.tail = 0; 283 | for(int i = 1;; i++){ 284 | if(summaries.size() < i + 1){ 285 | //Qentry.data.resize((i+1)*summarySize); 286 | summaries.resize(i + 1, summary(0, (i+1) * summarySize)); 287 | } 288 | CHECK(i < summaries.size()) << i; 289 | if(summaries[i].entry_size == 0){ 290 | summaries[i].Prune(t_summary, summarySize); 291 | break; 292 | } 293 | else{ 294 | summaries[0].Prune(t_summary, summarySize); 295 | CHECK(i < summaries.size()) << i; 296 | t_summary.Merge(summaries[0], summaries[i]); 297 | if(t_summary.entry_size > summarySize) 298 | summaries[i].entry_size = 0; 299 | else{ 300 | summaries[i].Copy(t_summary); 301 | break; 302 | } 303 | } 304 | } 305 | // this->AddT(); 306 | } 307 | CHECK(Qentry.tail < Qentry.data.size()) << Qentry.tail; 308 | if(Qentry.tail == 0 || value != Qentry.data[Qentry.tail-1].first){ 309 | CHECK(Qentry.tail < Qentry.data.size()) << Qentry.tail; 310 | Qentry.data[Qentry.tail] = std::make_pair(value, weight); 311 | Qentry.tail++; 312 | } 313 | else{ 314 | CHECK(Qentry.tail <= Qentry.data.size()) << Qentry.tail; 315 | Qentry.data[Qentry.tail-1].second += weight; 316 | } 317 | 318 | //Qentry.data.push_back(std::make_pair(value, weight)); 319 | } 320 | 321 | void quanSketch::GetSummary(summary& dest){ 322 | dest.entry_size = 0; 323 | dest.entries.clear(); 324 | if(summaries.size() == 0){ 325 | // std::cout<<"0 size"< summarySize){ 329 | t_summary.Reserve(summarySize); 330 | t_summary.Prune(dest, summarySize); 331 | dest.Copy(t_summary); 332 | } 333 | } 334 | else { 335 | // std::cout<<"not 0 size"< 5 | #include "thundergbm/sparse_columns.h" 6 | #include "thundergbm/util/device_lambda.cuh" 7 | 8 | void SparseColumns::from_dataset(const DataSet &dataset) { 9 | LOG(TRACE) << "constructing sparse columns"; 10 | n_column = dataset.n_features(); 11 | vector csc_val_vec; 12 | vector csc_row_ind_vec; 13 | vector csc_col_ptr_vec; 14 | csc_col_ptr_vec.push_back(0); 15 | 
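    //Added note: dataset.features[i] already holds the values of feature i in
    //instance order and dataset.line_num[i] the matching instance ids, so the
    //CSC arrays can be assembled by plain concatenation: csc_val/csc_row_ind
    //are the per-feature vectors laid end to end, and
    //csc_col_ptr[i + 1] = csc_col_ptr[i] + features[i].size(), i.e. a running
    //prefix sum of the column sizes.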
for (int i = 0; i < n_column; i++) { 16 | csc_val_vec.insert(csc_val_vec.end(), dataset.features[i].begin(), dataset.features[i].end()); 17 | csc_row_ind_vec.insert(csc_row_ind_vec.end(), dataset.line_num[i].begin(), dataset.line_num[i].end()); 18 | csc_col_ptr_vec.push_back(csc_col_ptr_vec.back() + dataset.features[i].size()); 19 | } 20 | nnz = csc_val_vec.size(); 21 | csc_val.resize(csc_val_vec.size()); 22 | memcpy(csc_val.host_data(), csc_val_vec.data(), sizeof(float_type) * csc_val_vec.size()); 23 | csc_row_ind.resize(csc_row_ind_vec.size()); 24 | memcpy(csc_row_ind.host_data(), csc_row_ind_vec.data(), sizeof(int) * csc_row_ind_vec.size()); 25 | csc_col_ptr.resize(csc_col_ptr_vec.size()); 26 | memcpy(csc_col_ptr.host_data(), csc_col_ptr_vec.data(), sizeof(int) * csc_col_ptr_vec.size()); 27 | cudaDeviceSynchronize();// ? 28 | } 29 | 30 | void SparseColumns::from_dataset_csr(const DataSet &dataset) { 31 | LOG(INFO) << "constructing sparse columns"; 32 | n_column = dataset.n_features(); 33 | size_t n_instances = dataset.n_instances(); 34 | const DataSet::node2d &instances = dataset.instances(); 35 | 36 | /** 37 | * construct csr matrix, then convert to csc matrix and sort columns by feature values 38 | */ 39 | vector csr_val; 40 | vector csr_col_ind;//index of each value of all the instances 41 | vector csr_row_ptr(1, 0);//the start positions of the instances 42 | 43 | LOG(INFO) << "converting libsvm sparse rows to csr matrix"; 44 | for (const auto &ins : instances) {//convert libsvm format to csr format 45 | for (const auto &j : ins) { 46 | csr_val.push_back(j.value); 47 | csr_col_ind.push_back(j.index - 1);//libSVM data format is one-based, convert to zero-based 48 | } 49 | CHECK_LE(csr_row_ptr.back() + ins.size(), INT_MAX); 50 | csr_row_ptr.push_back(csr_row_ptr.back() + ins.size()); 51 | } 52 | 53 | nnz = csr_val.size();//number of nonzer 54 | LOG(INFO) 55 | << string_format("dataset density = %.2f%% (%d feature values, ave=%d/instance, %d/feature)", 56 | (float) nnz / n_instances / n_column * 100, 57 | nnz, nnz / n_instances, nnz / n_column); 58 | 59 | LOG(INFO) << "copy csr matrix to GPU"; 60 | //three arrays (on GPU/CPU) for csr representation 61 | SyncArray val; 62 | SyncArray col_ind; 63 | SyncArray row_ptr; 64 | val.resize(csr_val.size()); 65 | col_ind.resize(csr_col_ind.size()); 66 | row_ptr.resize(csr_row_ptr.size()); 67 | 68 | //copy data to the three arrays 69 | val.copy_from(csr_val.data(), val.size()); 70 | col_ind.copy_from(csr_col_ind.data(), col_ind.size()); 71 | row_ptr.copy_from(csr_row_ptr.data(), row_ptr.size()); 72 | 73 | LOG(INFO) << "converting csr matrix to csc matrix"; 74 | cusparseHandle_t handle; 75 | cusparseMatDescr_t descr; 76 | cusparseCreate(&handle); 77 | cusparseCreateMatDescr(&descr); 78 | cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO); 79 | cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL); 80 | 81 | csc_val.resize(nnz); 82 | csc_row_ind.resize(nnz); 83 | csc_col_ptr.resize(n_column + 1); 84 | 85 | cusparseScsr2csc(handle, n_instances, n_column, nnz, val.device_data(), row_ptr.device_data(), 86 | col_ind.device_data(), csc_val.device_data(), csc_row_ind.device_data(), csc_col_ptr.device_data(), 87 | CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO); 88 | cudaDeviceSynchronize(); 89 | cusparseDestroy(handle); 90 | cusparseDestroyMatDescr(descr); 91 | } 92 | 93 | 94 | void SparseColumns::to_multi_devices(vector> &v_columns) const{ 95 | //devide data into multiple devices 96 | int cur_device_id; 97 | cudaGetDevice(&cur_device_id); 98 | 
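    //Added note: the split below is by contiguous column blocks. Each device
    //receives roughly n_column / n_device features (the last device takes the
    //remainder), copies its slice of csc_val/csc_row_ind, and then shifts its
    //csc_col_ptr entries down by first_col_start so they are zero-based again
    //within that device's sub-matrix.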
int n_device = v_columns.size(); 99 | int ave_n_columns = n_column / n_device; 100 | DO_ON_MULTI_DEVICES(n_device, [&](int device_id) { 101 | SparseColumns &columns = *v_columns[device_id]; 102 | const int *csc_col_ptr_data = csc_col_ptr.host_data(); 103 | int first_col_id = device_id * ave_n_columns; 104 | int n_column_sub = (device_id < n_device - 1) ? ave_n_columns : n_column - first_col_id; 105 | n_column_sub = (n_device == 1) ? ave_n_columns : n_column_sub; 106 | int first_col_start = csc_col_ptr_data[first_col_id]; 107 | int nnz_sub = (device_id < n_device - 1) ? 108 | (csc_col_ptr_data[(device_id + 1) * ave_n_columns] - first_col_start) : (nnz - 109 | first_col_start); 110 | nnz_sub = (n_device == 1) ? nnz : nnz_sub; 111 | 112 | columns.column_offset = first_col_id; 113 | columns.nnz = nnz_sub; 114 | columns.n_column = n_column_sub; 115 | columns.csc_val.resize(nnz_sub); 116 | columns.csc_row_ind.resize(nnz_sub); 117 | columns.csc_col_ptr.resize(n_column_sub + 1); 118 | 119 | columns.csc_val.copy_from(csc_val.host_data() + first_col_start, nnz_sub); 120 | columns.csc_row_ind.copy_from(csc_row_ind.host_data() + first_col_start, nnz_sub); 121 | columns.csc_col_ptr.copy_from(csc_col_ptr.host_data() + first_col_id, n_column_sub + 1); 122 | 123 | int *csc_col_ptr_2d_data = columns.csc_col_ptr.device_data(); 124 | 125 | 126 | //correct segment start positions 127 | device_loop(n_column_sub + 1, [=] __device__(int col_id) { 128 | csc_col_ptr_2d_data[col_id] = csc_col_ptr_2d_data[col_id] - first_col_start; 129 | }); 130 | LOG(TRACE) << "sorting feature values (multi-device)"; 131 | cub_seg_sort_by_key(columns.csc_val, columns.csc_row_ind, columns.csc_col_ptr, false); 132 | 133 | }); 134 | LOG(TRACE) << "sorting finished"; 135 | } 136 | 137 | //void SparseColumns::get_cut_points_evenly(int nBin, int max_dimension, vector& bin_id, 138 | // const vector& min_fea, const vector& max_fea) { 139 | // float* csc_val_host = csc_val.host_data(); 140 | // int* csc_row_host = csc_row_ind.host_data(); 141 | // int* csc_col_host = csc_col_ptr.host_data(); 142 | // for(int cid = 0; cid < csc_col_ptr.size() - 1; cid ++){ 143 | // cstart = csc_col_host[cid]; 144 | // cend = csc_col_host[cid + 1]; 145 | // for(int off = cstart; off < cend; off++){ 146 | // float val = csc_val_host[off]; 147 | // bin_id[off] = (int) ((val - min_fea[cid]) / (max_fea[cid] - min_fea[cid]) * nBin); 148 | // } 149 | // } 150 | //} 151 | -------------------------------------------------------------------------------- /src/thundergbm/syncmem.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by jiashuai on 17-9-16. 
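//
// Added note (usage sketch; fill_on_host, use_on_host and kernel are
// hypothetical placeholders, not real functions): SyncMem implements lazy
// host/device synchronization. head_ records where the freshest copy lives
// (UNINITIALIZED, HOST or DEVICE), and host_data()/device_data() copy across
// the bus only when the freshest copy is on the other side:
//
//   SyncMem mem(n * sizeof(float));
//   float *h = static_cast<float *>(mem.host_data());   //allocates, head_ = HOST
//   fill_on_host(h, n);                                 //mutate on the host
//   kernel<<<grid, block>>>(
//       static_cast<float *>(mem.device_data()), n);    //copies H2D, head_ = DEVICE
//   use_on_host(static_cast<float *>(mem.host_data())); //copies D2H, head_ = HOST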
3 | // 4 | 5 | #include 6 | 7 | namespace thunder { 8 | SyncMem::SyncMem() : SyncMem(0) {} 9 | 10 | SyncMem::SyncMem(size_t size) : device_ptr(nullptr), host_ptr(nullptr), size_(size), head_(UNINITIALIZED), 11 | own_device_data(false), own_host_data(false) { 12 | #ifdef USE_CUDA 13 | CUDA_CHECK(cudaGetDevice(&device_id)); 14 | #endif 15 | } 16 | 17 | SyncMem::~SyncMem() { 18 | this->head_ = UNINITIALIZED; 19 | if (host_ptr && own_host_data) { 20 | free_host(host_ptr); 21 | host_ptr = nullptr; 22 | } 23 | #ifdef USE_CUDA 24 | DO_ON_DEVICE(device_id, { 25 | if (device_ptr && own_device_data) { 26 | CUDA_CHECK(cudaFree(device_ptr)); 27 | device_ptr = nullptr; 28 | } 29 | }); 30 | #endif 31 | } 32 | 33 | void *SyncMem::host_data() { 34 | to_host(); 35 | return host_ptr; 36 | } 37 | 38 | void *SyncMem::device_data() { 39 | #ifdef USE_CUDA 40 | to_device(); 41 | #else 42 | NO_GPU; 43 | #endif 44 | return device_ptr; 45 | } 46 | 47 | size_t SyncMem::size() const { 48 | return size_; 49 | } 50 | 51 | SyncMem::HEAD SyncMem::head() const { 52 | return head_; 53 | } 54 | 55 | void SyncMem::to_host() { 56 | switch (head_) { 57 | case UNINITIALIZED: 58 | malloc_host(&host_ptr, size_); 59 | memset(host_ptr, 0, size_); 60 | head_ = HOST; 61 | own_host_data = true; 62 | break; 63 | case DEVICE: 64 | #ifdef USE_CUDA 65 | DO_ON_DEVICE(device_id, { 66 | if (nullptr == host_ptr) { 67 | CUDA_CHECK(cudaHostAlloc(&host_ptr, size_, cudaHostAllocPortable)); 68 | CUDA_CHECK(cudaMemset(host_ptr, 0, size_)); 69 | own_host_data = true; 70 | } 71 | CUDA_CHECK(cudaMemcpy(host_ptr, device_ptr, size_, cudaMemcpyDeviceToHost)); 72 | head_ = HOST; 73 | }); 74 | #else 75 | NO_GPU; 76 | #endif 77 | break; 78 | case HOST:; 79 | } 80 | } 81 | 82 | void SyncMem::to_device() { 83 | #ifdef USE_CUDA 84 | DO_ON_DEVICE(device_id, { 85 | switch (head_) { 86 | case UNINITIALIZED: 87 | CUDA_CHECK(cudaMalloc(&device_ptr, size_)); 88 | CUDA_CHECK(cudaMemset(device_ptr, 0, size_)); 89 | head_ = DEVICE; 90 | own_device_data = true; 91 | break; 92 | case HOST: 93 | if (nullptr == device_ptr) { 94 | CUDA_CHECK(cudaMalloc(&device_ptr, size_)); 95 | CUDA_CHECK(cudaMemset(device_ptr, 0, size_)); 96 | own_device_data = true; 97 | } 98 | CUDA_CHECK(cudaMemcpy(device_ptr, host_ptr, size_, cudaMemcpyHostToDevice)); 99 | head_ = DEVICE; 100 | break; 101 | case DEVICE:; 102 | } 103 | }); 104 | #else 105 | NO_GPU; 106 | #endif 107 | } 108 | 109 | void SyncMem::set_host_data(void *data) { 110 | CHECK_NOTNULL(data); 111 | if (own_host_data) { 112 | free_host(host_ptr); 113 | } 114 | host_ptr = data; 115 | own_host_data = false; 116 | head_ = HEAD::HOST; 117 | } 118 | 119 | void SyncMem::set_device_data(void *data) { 120 | #ifdef USE_CUDA 121 | DO_ON_DEVICE(device_id, { 122 | CHECK_NOTNULL(data); 123 | if (own_device_data) { 124 | CUDA_CHECK(cudaFree(device_data())); 125 | } 126 | device_ptr = data; 127 | own_device_data = false; 128 | head_ = HEAD::DEVICE; 129 | }); 130 | #else 131 | NO_GPU; 132 | #endif 133 | } 134 | } 135 | -------------------------------------------------------------------------------- /src/thundergbm/tree.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by jiashuai on 18-1-18. 
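//
// Added note: Tree stores an implicit complete binary tree in a single array.
// For a maximum depth d there are 2^(d+1) - 1 node slots, and for slot i
//   parent(i) = (i - 1) / 2,  left(i) = 2 * i + 1,  right(i) = 2 * i + 2,
// with slots in [2^d - 1, 2^(d+1) - 1) initialized as leaves. init() fills in
// these links; is_valid later marks the slots a grown tree actually uses.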
3 | // 4 | #include "thundergbm/tree.h" 5 | 6 | Tree::Tree(int depth) { 7 | init(depth); 8 | } 9 | 10 | void Tree::init(int depth) { 11 | int n_max_nodes = static_cast(pow(2, depth + 1) - 1); 12 | nodes.resize(n_max_nodes); 13 | TreeNode *node_data = nodes.host_data(); 14 | for (int i = 0; i < n_max_nodes; ++i) { 15 | node_data[i].final_id = i; 16 | node_data[i].split_feature_id = -1; 17 | node_data[i].is_valid = false; 18 | node_data[i].parent_index = (i - 1) / 2; 19 | if (i < n_max_nodes / 2) { 20 | node_data[i].is_leaf = false; 21 | node_data[i].lch_index = i * 2 + 1; 22 | node_data[i].rch_index = i * 2 + 2; 23 | } else { 24 | //leaf nodes 25 | node_data[i].is_leaf = true; 26 | node_data[i].lch_index = -1; 27 | node_data[i].rch_index = -1; 28 | } 29 | } 30 | node_data[0].parent_index = -1;//root node has no parent node 31 | } 32 | 33 | string Tree::dump(int depth) const { 34 | string s("\n"); 35 | preorder_traversal(0, depth, 0, s); 36 | return s; 37 | } 38 | 39 | void Tree::preorder_traversal(int nid, int max_depth, int depth, string &s) const { 40 | const TreeNode &node = nodes.host_data()[nid]; 41 | if (node.is_valid && !node.is_pruned) 42 | s = s + string(static_cast(depth), '\t') + 43 | (node.is_leaf ? 44 | string_format("%d:leaf=%.6g\n", node.final_id, node.base_weight) : 45 | string_format("%d:[f%d<%.6g], weight=%f, gain=%f, dr=%d\n", node.final_id, node.split_feature_id + 1, 46 | node.split_value, 47 | node.base_weight, node.gain, node.default_right)); 48 | if (depth < max_depth) { 49 | preorder_traversal(node.lch_index, max_depth, depth + 1, s); 50 | preorder_traversal(node.rch_index, max_depth, depth + 1, s); 51 | } 52 | } 53 | 54 | std::ostream &operator<<(std::ostream &os, const Tree::TreeNode &node) { 55 | os << string_format("\nnid:%d,l:%d,split_feature_id:%d,f:%f,gain:%f,r:%d,w:%f,", node.final_id, node.is_leaf, 56 | node.split_feature_id, node.split_value, node.gain, node.default_right, node.base_weight); 57 | os << "g/h:" << node.sum_gh_pair; 58 | return os; 59 | } 60 | 61 | void Tree::reorder_nid() { 62 | int nid = 0; 63 | Tree::TreeNode *nodes_data = nodes.host_data(); 64 | for (int i = 0; i < nodes.size(); ++i) { 65 | if (nodes_data[i].is_valid && !nodes_data[i].is_pruned) { 66 | nodes_data[i].final_id = nid; 67 | nid++; 68 | } 69 | } 70 | } 71 | 72 | int Tree::try_prune_leaf(int nid, int np, float_type gamma, vector &leaf_child_count) { 73 | Tree::TreeNode *nodes_data = nodes.host_data(); 74 | int p_nid = nodes_data[nid].parent_index; 75 | if (p_nid == -1) return np;// is root 76 | Tree::TreeNode &p_node = nodes_data[p_nid]; 77 | Tree::TreeNode &lch = nodes_data[p_node.lch_index]; 78 | Tree::TreeNode &rch = nodes_data[p_node.rch_index]; 79 | leaf_child_count[p_nid]++; 80 | if (leaf_child_count[p_nid] >= 2 && p_node.gain < gamma) { 81 | //do pruning 82 | //delete two children 83 | CHECK(lch.is_leaf); 84 | CHECK(rch.is_leaf); 85 | lch.is_pruned = true; 86 | rch.is_pruned = true; 87 | //make parent to leaf 88 | p_node.is_leaf = true; 89 | return try_prune_leaf(p_nid, np + 2, gamma, leaf_child_count); 90 | } else return np; 91 | } 92 | 93 | void Tree::prune_self(float_type gamma) { 94 | vector leaf_child_count(nodes.size(), 0); 95 | Tree::TreeNode *nodes_data = nodes.host_data(); 96 | int n_pruned = 0; 97 | for (int i = 0; i < nodes.size(); ++i) { 98 | if (nodes_data[i].is_leaf && nodes_data[i].is_valid) { 99 | n_pruned = try_prune_leaf(i, n_pruned, gamma, leaf_child_count); 100 | } 101 | } 102 | LOG(DEBUG) << string_format("%d nodes are pruned", n_pruned); 103 | 
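    //renumber the surviving nodes so final_id remains a dense 0..n-1 sequence after pruning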
reorder_nid(); 104 | } 105 | 106 | void Tree::shrink(float_type learning_rate){ 107 | Tree::TreeNode *nodes_data = nodes.host_data(); 108 | for (int i = 0; i < nodes.size(); ++i) { 109 | if (nodes_data[i].is_leaf && nodes_data[i].is_valid) { 110 | nodes_data[i].base_weight *= learning_rate; 111 | } 112 | } 113 | } -------------------------------------------------------------------------------- /src/thundergbm/updater/exact_updater.cu: -------------------------------------------------------------------------------- 1 | // 2 | // Created by shijiashuai on 5/7/18. 3 | // 4 | #include "thundergbm/updater/exact_updater.h" 5 | 6 | void ExactUpdater::lsh_hash_init(unsigned n_bucket, unsigned num_table, unsigned num_dimension, unsigned p_norm, float r, unsigned numP, int seed){ 7 | lsh_table.param.n_bucket = n_bucket; 8 | lsh_table.param.n_table = num_table; 9 | lsh_table.param.n_dimension = num_dimension; 10 | lsh_table.param.p_norm = p_norm; 11 | lsh_table.param.r = r; 12 | lsh_table.param.n_comp = numP; 13 | lsh_table.param.seed = seed; 14 | lsh_table.init(); 15 | } 16 | 17 | void ExactUpdater::grow(Tree &tree, const vector> &v_columns, InsStat &stats) { 18 | TIMED_SCOPE(timerObj, "grow tree"); 19 | 20 | int n_instances = stats.n_instances; 21 | int cur_device = 0; 22 | // int cur_device = param.use_gpu_id; 23 | 24 | LOG(TRACE) << "broadcast tree and stats"; 25 | v_stats.resize(n_devices); 26 | v_trees_gpu.resize(n_devices); 27 | init_tree(tree, stats); 28 | DO_ON_MULTI_DEVICES(n_devices, [&](int device_id) { 29 | //stats 30 | int n_instances = stats.n_instances; 31 | v_stats[device_id].reset(new InsStat()); 32 | InsStat &gpu_stats = *v_stats[device_id]; 33 | gpu_stats.resize(n_instances); 34 | gpu_stats.gh_pair.copy_from(stats.gh_pair.host_data(), n_instances); 35 | gpu_stats.nid.copy_from(stats.nid.host_data(), n_instances); 36 | gpu_stats.y.copy_from(stats.y.host_data(), n_instances); 37 | gpu_stats.y_predict.copy_from(stats.y_predict.host_data(), n_instances); 38 | 39 | //tree 40 | v_trees_gpu[device_id].reset(new Tree()); 41 | Tree &gpu_tree = *v_trees_gpu[device_id]; 42 | gpu_tree.nodes.resize(tree.nodes.size()); 43 | gpu_tree.nodes.copy_from(tree.nodes.host_data(), tree.nodes.size()); 44 | }); 45 | 46 | for (int i = 0; i < depth; ++i) { 47 | LOG(TRACE) << "growing tree at depth " << i; 48 | vector> local_sp(n_devices); 49 | { 50 | TIMED_SCOPE(timerObj, "find split"); 51 | DO_ON_MULTI_DEVICES(n_devices, [&](int device_id) { 52 | LOG(TRACE) << string_format("finding split on device %d", device_id); 53 | find_split(i, *v_columns[device_id], *v_trees_gpu[device_id], *v_stats[device_id], local_sp[device_id]); 54 | }); 55 | } 56 | 57 | int n_max_nodes_in_level = 1 << i;//2^i 58 | int nid_offset = (1 << i) - 1;//2^i - 1 59 | SyncArray global_sp(n_max_nodes_in_level); 60 | { 61 | TIMED_SCOPE(timerObj, "split point all reduce"); 62 | split_point_all_reduce(local_sp, global_sp, i); 63 | } 64 | 65 | //do split 66 | { 67 | TIMED_SCOPE(timerObj, "update tree"); 68 | update_tree(tree, global_sp); 69 | } 70 | //broadcast tree 71 | LOG(TRACE) << "broadcasting updated tree"; 72 | DO_ON_MULTI_DEVICES(n_devices, [&](int device_id) { 73 | v_trees_gpu[device_id]->nodes.copy_from(tree.nodes.host_data(), tree.nodes.size()); 74 | }); 75 | 76 | { 77 | vector v_has_split(n_devices); 78 | TIMED_SCOPE(timerObj, "reset ins2node id"); 79 | LOG(TRACE) << "reset ins2node id"; 80 | DO_ON_MULTI_DEVICES(n_devices, [&](int device_id) { 81 | v_has_split[device_id] = reset_ins2node_id(*v_stats[device_id], 
void ExactUpdater::split_point_all_reduce(const vector<SyncArray<SplitPoint>> &local_sp,
                                          SyncArray<SplitPoint> &global_sp, int depth) {
    //get the globally best split of each node
    int n_max_nodes_in_level = 1 << depth;//2^depth
    int nid_offset = (1 << depth) - 1;//2^depth - 1
    auto global_sp_data = global_sp.host_data();
    vector<bool> active_sp(n_max_nodes_in_level);
    for (int n = 0; n < n_max_nodes_in_level; n++) {
        global_sp_data[n].nid = n + nid_offset;
        global_sp_data[n].gain = -1.0f;
        active_sp[n] = false;
    }

    for (int device_id = 0; device_id < n_devices; device_id++) {
        auto local_sp_data = local_sp[device_id].host_data();
        for (int j = 0; j < local_sp[device_id].size(); j++) {
            int sp_nid = local_sp_data[j].nid;
            if (sp_nid == -1) continue;
            int global_pos = sp_nid - nid_offset;
            if (!active_sp[global_pos])
                global_sp_data[global_pos] = local_sp_data[j];
            else
                global_sp_data[global_pos] = (global_sp_data[global_pos].gain >= local_sp_data[j].gain)
                                             ? global_sp_data[global_pos] : local_sp_data[j];
            active_sp[global_pos] = true;
        }
    }
    //mark nodes that received no split point as inactive
    for (int n = 0; n < n_max_nodes_in_level; n++) {
        if (!active_sp[n])
            global_sp_data[n].nid = -1;
    }
    LOG(DEBUG) << "global best split point = " << global_sp;
}

void ExactUpdater::init_tree(Tree &tree, const InsStat &stats) {
    tree.init(depth);
    //init the root node
    Tree::TreeNode &root_node = tree.nodes.host_data()[0];
    root_node.sum_gh_pair = stats.sum_gh;
    root_node.is_valid = true;
    root_node.calc_weight(lambda);
    LOG(DEBUG) << "root sum gh " << root_node.sum_gh_pair;
}
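// A host-side sketch of the merge rule that split_point_all_reduce() above
// implements: each party/device proposes at most one split per node, and the
// candidate with the largest gain wins. MiniSP and merge_best are hypothetical
// names used only for illustration, guarded by the hypothetical SIMFL_EXAMPLE macro.
#ifdef SIMFL_EXAMPLE
#include <vector>
struct MiniSP {
    int nid;//-1 means "no split proposed"
    float gain;
};
inline std::vector<MiniSP> merge_best(const std::vector<std::vector<MiniSP>> &locals,
                                      int n_nodes, int nid_offset) {
    std::vector<MiniSP> global(n_nodes, MiniSP{-1, -1.0f});//inactive by default
    for (const auto &local : locals)
        for (const auto &sp : local) {
            if (sp.nid == -1) continue;
            MiniSP &g = global[sp.nid - nid_offset];
            if (g.nid == -1 || sp.gain > g.gain) g = sp;//keep the larger gain
        }
    return global;
}
#endif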
void ExactUpdater::find_split(int level, const SparseColumns &columns, const Tree &tree, const InsStat &stats,
                              SyncArray<SplitPoint> &sp) {
    int n_max_nodes_in_level = static_cast<int>(pow(2, level));
    int nid_offset = static_cast<int>(pow(2, level) - 1);
    int n_column = columns.n_column;
    int n_partition = n_column * n_max_nodes_in_level;
    int nnz = columns.nnz;
    int n_block = std::min((nnz / n_column - 1) / 256 + 1, 32 * 56);

    LOG(TRACE) << "start finding split";

    //find the best split locally
    {
        using namespace thrust;
        SyncArray<int> fvid2pid(nnz);

        {
            TIMED_SCOPE(timerObj, "fvid2pid");
            //input
            const int *nid_data = stats.nid.device_data();
            const int *iid_data = columns.csc_row_ind.device_data();

            LOG(TRACE) << "after using v_stats and columns";
            //output
            int *fvid2pid_data = fvid2pid.device_data();
            device_loop_2d(
                    n_column, columns.csc_col_ptr.device_data(),
                    [=]__device__(int col_id, int fvid) {
                        //feature value id -> instance id -> node id
                        int nid = nid_data[iid_data[fvid]];
                        int pid;
                        //if this node is a leaf node, move the feature value to the end
                        if (nid < nid_offset) pid = INT_MAX;//todo negative
                        else pid = (nid - nid_offset) * n_column + col_id;
                        fvid2pid_data[fvid] = pid;
                    },
                    n_block);
            LOG(DEBUG) << "fvid2pid " << fvid2pid;
        }

        //gather g/h pairs and do prefix sum
        int n_split;
        SyncArray<GHPair> gh_prefix_sum;
        SyncArray<GHPair> missing_gh(n_partition);
        SyncArray<int> rle_pid;
        SyncArray<float_type> rle_fval;
        {
            //get the feature value id mapping for partitions, new -> old
            SyncArray<int> fvid_new2old(nnz);
            {
                TIMED_SCOPE(timerObj, "fvid_new2old");
                sequence(cuda::par, fvid_new2old.device_data(), fvid_new2old.device_end(), 0);
                stable_sort_by_key(
                        cuda::par, fvid2pid.device_data(), fvid2pid.device_end(),
                        fvid_new2old.device_data(),
                        thrust::less<int>());
                LOG(DEBUG) << "sorted fvid2pid " << fvid2pid;
                LOG(DEBUG) << "fvid_new2old " << fvid_new2old;
            }
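            //A worked example of the RLE compression below (illustrative only,
            //assuming feature values are pre-sorted within each column): after the
            //stable sort, the (pid, fval) keys of one partition might be
            //    (0, 1.2) (0, 0.5) (0, 0.5) (1, 0.9)
            //with gh values g1, g2, g3, g4. reduce_by_key collapses runs of equal
            //keys, producing
            //    keys   : (0, 1.2) (0, 0.5) (1, 0.9)
            //    values : g1       g2+g3    g4
            //i.e. one entry per distinct feature value per partition, carrying the
            //summed gh of all instances sharing that value; inclusive_scan_by_key
            //then turns these per-value sums into per-partition prefix sums, so
            //entry i accumulates the gh of all values preceding it in its partition
            //(the candidate right child under the sorted order).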
            //do prefix sum
            {
                TIMED_SCOPE(timerObj, "do prefix sum");
                SyncArray<GHPair> rle_gh(nnz);
                SyncArray<int_float> rle_key(nnz);
                //the same feature value in the same partition has the same key
                auto key_iter = make_zip_iterator(
                        make_tuple(
                                fvid2pid.device_data(),
                                make_permutation_iterator(
                                        columns.csc_val.device_data(),
                                        fvid_new2old.device_data())));//use fvid_new2old to access csc_val
                //apply RLE compression
                n_split = reduce_by_key(
                        cuda::par,
                        key_iter, key_iter + nnz,
                        make_permutation_iterator(              //ins id -> gh pair
                                stats.gh_pair.device_data(),
                                make_permutation_iterator(      //old fvid -> ins id
                                        columns.csc_row_ind.device_data(),
                                        fvid_new2old.device_data())), //new fvid -> old fvid
                        rle_key.device_data(),
                        rle_gh.device_data()
                ).first - rle_key.device_data();
                gh_prefix_sum.resize(n_split);
                rle_pid.resize(n_split);
                rle_fval.resize(n_split);
                const auto rle_gh_data = rle_gh.device_data();
                const auto rle_key_data = rle_key.device_data();
                auto gh_prefix_sum_data = gh_prefix_sum.device_data();
                auto rle_pid_data = rle_pid.device_data();
                auto rle_fval_data = rle_fval.device_data();
                device_loop(n_split, [=]__device__(int i) {
                    gh_prefix_sum_data[i] = rle_gh_data[i];
                    rle_pid_data[i] = get<0>(rle_key_data[i]);
                    rle_fval_data[i] = get<1>(rle_key_data[i]);
                });

                inclusive_scan_by_key(
                        cuda::par,
                        rle_pid.device_data(), rle_pid.device_end(),
                        gh_prefix_sum.device_data(),
                        gh_prefix_sum.device_data());
//                LOG(DEBUG) << "gh prefix sum = " << gh_prefix_sum;
                LOG(DEBUG) << "reduced pid = " << rle_pid;
                LOG(DEBUG) << "reduced fval = " << rle_fval;
            }

            //calculate the missing-value gh of each partition
            {
                TIMED_SCOPE(timerObj, "calculate missing value");
                SyncArray<int> pid_ptr(n_partition + 1);
                counting_iterator<int> search_begin(0);
                upper_bound(cuda::par, rle_pid.device_data(), rle_pid.device_end(), search_begin,
                            search_begin + n_partition, pid_ptr.device_data() + 1);
                LOG(DEBUG) << "pid_ptr = " << pid_ptr;

                auto pid_ptr_data = pid_ptr.device_data();
                auto rle_pid_data = rle_pid.device_data();
                auto rle_fval_data = rle_fval.device_data();
                float_type rt_eps = this->rt_eps;
                device_loop(n_split, [=]__device__(int i) {
                    int pid = rle_pid_data[i];
                    if (pid == INT_MAX) return;
                    float_type f = rle_fval_data[i];
                    if ((pid_ptr_data[pid + 1] - 1) == i)//the last RLE
                        rle_fval_data[i] = (f - fabsf(rle_fval_data[pid_ptr_data[pid]]) - rt_eps);
                    else
                        //FIXME read/write collision
                        rle_fval_data[i] = (f + rle_fval_data[i + 1]) * 0.5f;
                });

                const auto gh_prefix_sum_data = gh_prefix_sum.device_data();
                const auto node_data = tree.nodes.device_data();
                auto missing_gh_data = missing_gh.device_data();
                device_loop(n_partition, [=]__device__(int pid) {
                    int nid = pid / n_column + nid_offset;
                    if (pid_ptr_data[pid + 1] != pid_ptr_data[pid])
                        missing_gh_data[pid] =
                                node_data[nid].sum_gh_pair - gh_prefix_sum_data[pid_ptr_data[pid + 1] - 1];
                });
//                LOG(DEBUG) << "missing gh = " << missing_gh;
            }
        }
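        //The gain evaluated below is the standard gradient-boosting split gain
        //(as in XGBoost, up to the constant 1/2 factor and the gamma penalty):
        //    gain = G_L^2/(H_L + lambda) + G_R^2/(H_R + lambda) - G^2/(H + lambda)
        //where (G_L, H_L) and (G_R, H_R) are the summed gradients/hessians of the
        //left and right children and G = G_L + G_R, H = H_L + H_R. For example,
        //with G_L = 2, H_L = 1, G_R = -1, H_R = 1 and lambda = 1:
        //    gain = 4/2 + 1/2 - 1/3 = 13/6 ~ 2.17 > 0,
        //so this split reduces the regularized loss. compute_gain additionally
        //requires both children to have hessian sums of at least min_child_weight.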
        //calculate the gain of each split
        SyncArray<float_type> gain(n_split);
        SyncArray<bool> default_right(n_split);
        {
            TIMED_SCOPE(timerObj, "calculate gain");
            auto compute_gain = []__device__(GHPair father, GHPair lch, GHPair rch, float_type min_child_weight,
                                             float_type lambda) -> float_type {
                if (lch.h >= min_child_weight && rch.h >= min_child_weight)
                    return (lch.g * lch.g) / (lch.h + lambda) + (rch.g * rch.g) / (rch.h + lambda) -
                           (father.g * father.g) / (father.h + lambda);
                else
                    return 0;
            };

            int *fvid2pid_data = fvid2pid.device_data();
            const Tree::TreeNode *nodes_data = tree.nodes.device_data();
            GHPair *gh_prefix_sum_data = gh_prefix_sum.device_data();
            float_type *gain_data = gain.device_data();
            bool *default_right_data = default_right.device_data();
            const auto rle_pid_data = rle_pid.device_data();
            const auto missing_gh_data = missing_gh.device_data();
            auto rle_fval_data = rle_fval.device_data();
            //local copies for the lambda expression
            float_type mcw = min_child_weight;
            float_type l = lambda;
            device_loop(n_split, [=]__device__(int i) {
                int pid = rle_pid_data[i];
                int nid0 = pid / n_column;
                int nid = nid0 + nid_offset;
                if (pid == INT_MAX) return;
                GHPair father_gh = nodes_data[nid].sum_gh_pair;
                GHPair p_missing_gh = missing_gh_data[pid];
                GHPair rch_gh = gh_prefix_sum_data[i];
                float_type max_gain = compute_gain(father_gh, father_gh - rch_gh, rch_gh, mcw, l);
                if (p_missing_gh.h > 1) {
                    rch_gh = rch_gh + p_missing_gh;
                    float_type temp_gain = compute_gain(father_gh, father_gh - rch_gh, rch_gh, mcw, l);
                    if (temp_gain > 0 && temp_gain - max_gain > 0.1) {
                        max_gain = temp_gain;
                        default_right_data[i] = true;
                    }
                }
                gain_data[i] = max_gain;
            });
            LOG(DEBUG) << "gain = " << gain;
        }

        //get the best gain and its index for each node
        SyncArray<int_float> best_idx_gain(n_max_nodes_in_level);
        int n_nodes_in_level;
        {
            TIMED_SCOPE(timerObj, "get best gain");
            auto arg_max = []__device__(const int_float &a, const int_float &b) {
                if (get<1>(a) == get<1>(b))
                    return get<0>(a) < get<0>(b) ? a : b;
                else
                    return get<1>(a) > get<1>(b) ? a : b;
            };
            auto in_same_node = [=]__device__(const int a, const int b) {
                return (a / n_column) == (b / n_column);
            };

            //reduce to get the best split of each node in this level
            SyncArray<int> key_test(n_max_nodes_in_level);
            n_nodes_in_level = reduce_by_key(
                    cuda::par,
                    rle_pid.device_data(), rle_pid.device_end(),
                    make_zip_iterator(make_tuple(counting_iterator<int>(0), gain.device_data())),
                    key_test.device_data(),//make_discard_iterator(),
                    best_idx_gain.device_data(),
                    in_same_node,
                    arg_max).second - best_idx_gain.device_data();

            LOG(DEBUG) << "#nodes in level = " << n_nodes_in_level;
            LOG(DEBUG) << "best pid = " << key_test;
            LOG(DEBUG) << "best idx & gain = " << best_idx_gain;
        }
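        //Note on the reduction above: in_same_node compares partition ids by
        //pid / n_column, so all candidate splits of the same tree node form one
        //group regardless of which feature they come from; arg_max keeps the
        //candidate with the largest gain and breaks ties toward the smaller
        //index, which makes the chosen split deterministic across runs.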
        //get split points
        const int_float *best_idx_gain_data = best_idx_gain.device_data();
        const auto rle_pid_data = rle_pid.device_data();
        GHPair *gh_prefix_sum_data = gh_prefix_sum.device_data();
        const auto rle_fval_data = rle_fval.device_data();
        const auto missing_gh_data = missing_gh.device_data();
        bool *default_right_data = default_right.device_data();

        sp.resize(n_nodes_in_level);
        auto sp_data = sp.device_data();

        int column_offset = columns.column_offset;
        device_loop(n_nodes_in_level, [=]__device__(int i) {
            int_float bst = best_idx_gain_data[i];
            float_type best_split_gain = get<1>(bst);
            int split_index = get<0>(bst);
            int pid = rle_pid_data[split_index];
            sp_data[i].split_fea_id = (pid == INT_MAX) ? -1 : (pid % n_column) + column_offset;
            sp_data[i].nid = (pid == INT_MAX) ? -1 : (pid / n_column + nid_offset);
            sp_data[i].gain = best_split_gain;
            if (pid != INT_MAX) {//avoid split_index out of bound
                sp_data[i].fval = rle_fval_data[split_index];
                sp_data[i].fea_missing_gh = missing_gh_data[pid];
                sp_data[i].default_right = default_right_data[split_index];
                sp_data[i].rch_sum_gh = gh_prefix_sum_data[split_index];
            }
        });
    }

    LOG(DEBUG) << "split points (gain/fea_id/nid): " << sp;
}

void ExactUpdater::update_tree(Tree &tree, const SyncArray<SplitPoint> &sp) {
    auto sp_data = sp.device_data();
    int n_nodes_in_level = sp.size();

    Tree::TreeNode *nodes_data = tree.nodes.device_data();
    float_type rt_eps = this->rt_eps;
    float_type lambda = this->lambda;

    device_loop(n_nodes_in_level, [=]__device__(int i) {
        float_type best_split_gain = sp_data[i].gain;
        if (best_split_gain > rt_eps) {
            //do split
            if (sp_data[i].nid == -1) return;
            int nid = sp_data[i].nid;
            Tree::TreeNode &node = nodes_data[nid];
            node.gain = best_split_gain;

            Tree::TreeNode &lch = nodes_data[node.lch_index];//left child
            Tree::TreeNode &rch = nodes_data[node.rch_index];//right child
            lch.is_valid = true;
            rch.is_valid = true;
            node.split_feature_id = sp_data[i].split_fea_id;
            GHPair p_missing_gh = sp_data[i].fea_missing_gh;
            //todo process begin
            node.split_value = sp_data[i].fval;
            rch.sum_gh_pair = sp_data[i].rch_sum_gh;
            if (sp_data[i].default_right) {
                rch.sum_gh_pair = rch.sum_gh_pair + p_missing_gh;
                node.default_right = true;
            }
            lch.sum_gh_pair = node.sum_gh_pair - rch.sum_gh_pair;
            lch.calc_weight(lambda);
            rch.calc_weight(lambda);
        } else {
            //make the node a leaf
            if (sp_data[i].nid == -1) return;
            int nid = sp_data[i].nid;
            Tree::TreeNode &node = nodes_data[nid];
            node.is_leaf = true;
            nodes_data[node.lch_index].is_valid = false;
            nodes_data[node.rch_index].is_valid = false;
        }
    });
}
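// A minimal sketch of the leaf weight that calc_weight() above is assumed to
// compute: the regularized optimum w* = -G / (H + lambda) of the second-order
// objective g*w + (h + lambda)*w^2 / 2. optimal_weight is a hypothetical name,
// guarded by the hypothetical SIMFL_EXAMPLE macro.
#ifdef SIMFL_EXAMPLE
#include <cassert>
#include <cmath>
inline float optimal_weight(float sum_g, float sum_h, float lambda) {
    return -sum_g / (sum_h + lambda);
}
inline void weight_example() {
    //with G = 2, H = 3 and lambda = 1, the optimal leaf weight is -0.5
    assert(std::fabs(optimal_weight(2.0f, 3.0f, 1.0f) - (-0.5f)) < 1e-6f);
}
#endif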
bool ExactUpdater::reset_ins2node_id(InsStat &stats, const Tree &tree, const SparseColumns &columns) {
    SyncArray<bool> has_splittable(1);
    //set the new node id of each instance
    {
        TIMED_SCOPE(timerObj, "get new node id");
        int *nid_data = stats.nid.device_data();
        const int *iid_data = columns.csc_row_ind.device_data();
        const Tree::TreeNode *nodes_data = tree.nodes.device_data();
        const int *col_ptr_data = columns.csc_col_ptr.device_data();
        const float_type *f_val_data = columns.csc_val.device_data();
        has_splittable.host_data()[0] = false;
        bool *h_s_data = has_splittable.device_data();
        int column_offset = columns.column_offset;

        int n_column = columns.n_column;
        int nnz = columns.nnz;
        int n_block = std::min((nnz / n_column - 1) / 256 + 1, 32 * 56);

        LOG(TRACE) << "update ins2node id for each fval";
        device_loop_2d(n_column, col_ptr_data,
                       [=]__device__(int col_id, int fvid) {
                           //feature value id -> instance id
                           int iid = iid_data[fvid];
                           //instance id -> node id
                           int nid = nid_data[iid];
                           //node id -> node
                           const Tree::TreeNode &node = nodes_data[nid];
                           //if the node splits on this feature
                           if (node.splittable() && node.split_feature_id == col_id + column_offset) {
                               h_s_data[0] = true;
                               if (f_val_data[fvid] < node.split_value)
                                   //goes to the left child
                                   nid_data[iid] = node.lch_index;
                               else
                                   //goes to the right child
                                   nid_data[iid] = node.rch_index;
                           }
                       }, n_block);
    }
    LOG(DEBUG) << "new nid = " << stats.nid;
//    LOG(DEBUG) << v_trees_gpu[cur_device_id].nodes;
    return has_splittable.host_data()[0];
}

std::ostream &operator<<(std::ostream &os, const int_float &rhs) {
    os << string_format("%d/%f", thrust::get<0>(rhs), thrust::get<1>(rhs));
    return os;
}
--------------------------------------------------------------------------------
/src/thundergbm/util/common.cpp:
--------------------------------------------------------------------------------
//
// Created by jiashuai on 18-1-16.
//
#include "thundergbm/thundergbm.h"
INITIALIZE_EASYLOGGINGPP
--------------------------------------------------------------------------------