├── .gitignore ├── Makefile ├── README.md ├── build └── .gitignore ├── include ├── core │ ├── common.hpp │ ├── decision_tree.hpp │ └── traits.hpp ├── impl │ ├── binary_classification.hpp │ ├── classification.hpp │ ├── day_sharpe.hpp │ └── regression.hpp └── utility │ ├── CLI11.hpp │ └── timer.h ├── test ├── eval_day_sharpe.cpp ├── test_day_sharpe.cpp └── test_dt.cpp └── tools └── sharpe_finder.cpp /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | *.csv 3 | *.dot 4 | *.png 5 | *.bin 6 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | CC=g++ 2 | CFLAGS= 3 | LIB= 4 | UNAME_S := $(shell uname -s) 5 | ifeq ($(UNAME_S),Linux) 6 | CFLAGS += -fopenmp 7 | LIB += -lrt 8 | endif 9 | 10 | all: test/*.cpp 11 | $(CC) $(CFLAGS) -std=c++11 -O3 -msse2 -funroll-loops -I include test/test_day_sharpe.cpp -o build/test_day_sharpe -D _D2_SINGLE -D N=1000000 -D D=28 -D MD=8 -D MW=100 -D M=50000 $(LIB) 12 | $(CC) $(CFLAGS) -std=c++11 -O3 -msse2 -funroll-loops -I include test/eval_day_sharpe.cpp -o build/eval_day_sharpe -D _D2_SINGLE -D N=1000000 -D D=28 -D MD=8 -D MW=100 -D M=50000 $(LIB) 13 | $(CC) $(CFLAGS) -std=c++11 -O3 -msse2 -funroll-loops -I include test/test_dt.cpp -o build/test_dt -D _D2_SINGLE -D N=1000000 -D D=28 -D MD=24 -D MW=1000 -D M=50000 -D USE_D2_CLTYPE $(LIB) 14 | $(CC) $(CFLAGS) -std=c++11 -O3 -msse2 -funroll-loops -I include tools/sharpe_finder.cpp -o build/sharpe_finder -D _D2_SINGLE -D DIMENSION=28 -D DAYS=100 -D DAYS_TEST=100 -D MD=8 15 | 16 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Leaf-wise Induction of Decision Tree with Presorted Deque 2 | 3 | 4 | 5 | This is the proof-of-concept demo code for reproducing experiments in the arXiv note "A 
Faster Drop-in Implementation for Leaf-wise Exact Greedy Induction of Decision Tree Using Pre-sorted Deque" (https://arxiv.org/abs/1712.06989). 6 | 7 | 8 | ### Prepare sample data 9 | 10 | - download data from [HIGGS](https://archive.ics.uci.edu/ml/datasets/HIGGS) and uncompress gz file. 11 | - create training data `head -1000000 HIGGS.csv > higgs-train-1m.csv` 12 | - create testing data `tail -50000 HIGGS.csv > higgs-test.csv` 13 | 14 | 15 | ### Compile and test 16 | 17 | ``` 18 | $ make 19 | $ OMP_NUM_THREADS=28 ./build/test_dt higgs-train-1m.csv higgs-test.csv 20 | tree induction time: 1.475672 seconds 21 | training time: 2.048821 seconds 22 | nleafs: 1845 23 | test metric: FP 0.276, FN 0.317, Sensitivity 0.720, Specificity 0.687, Accuracy 0.705 24 | ``` 25 | 26 | ### Other tests on synthetic data 27 | ``` 28 | $ OMP_NUM_THREADS=12 ./build/test_dt 29 | tree induction time: 1.425927 seconds 30 | training time: 2.048105 seconds 31 | nleafs: 24 32 | test metric: FP 0.801, FN 0.000, Sensitivity 1.000, Specificity 0.985, Accuracy 0.985 33 | ``` 34 | ---- 35 | All rights reserved (2017-2023). Jianbo Ye 36 | -------------------------------------------------------------------------------- /build/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | -------------------------------------------------------------------------------- /include/core/common.hpp: -------------------------------------------------------------------------------- 1 | #ifndef _COMMON_H_ 2 | #define _COMMON_H_ 3 | 4 | #include 5 | #include 6 | 7 | namespace d2 { 8 | 9 | #define __IN__ 10 | #define __OUT__ 11 | #define __IN_OUT__ 12 | 13 | 14 | /*! \brief the float point that will be used to 15 | * store real numbers. 
*/ 16 | #ifdef _D2_DOUBLE 17 | typedef double real_t; 18 | #elif defined _D2_SINGLE 19 | typedef float real_t; 20 | #endif 21 | 22 | 23 | #ifdef _D2_DOUBLE 24 | #define _D2_SCALAR double 25 | #define _D2_FUNC(x) _d ## x 26 | #define _D2_CBLAS_FUNC(x) cblas_d ## x 27 | //#define _D2_LAPACKE_FUNC(x) d ## x 28 | #elif defined _D2_SINGLE 29 | #define _D2_SCALAR float 30 | #define _D2_FUNC(x) _s ## x 31 | #define _D2_CBLAS_FUNC(x) cblas_s ## x 32 | //#define _D2_LAPACKE_FUNC(x) s ## x 33 | #endif 34 | 35 | /*! \brief the unsigned integer type that will 36 | * be used to store index 37 | */ 38 | typedef unsigned index_t; 39 | 40 | /*! \brief the header string of log printing subject to each processer 41 | */ 42 | #ifdef RABIT_RABIT_H_ 43 | inline const std::string getLogHeader() 44 | {return std::string("@d2suite(") + std::to_string(rabit::GetRank()) + ")";} 45 | #else 46 | 47 | inline const std::string getLogHeader() { return std::string("@d2suite"); } 48 | 49 | #endif 50 | 51 | 52 | } 53 | 54 | /*! 
\brief the ad-hoc solution to barrier() api */ 55 | #ifdef RABIT_RABIT_H_ 56 | namespace rabit { 57 | inline void Barrier(){ static float a = 1; rabit::Broadcast(&a, sizeof(float), 0); } 58 | } 59 | #endif 60 | 61 | 62 | #define _D2_CLTYPE unsigned short int /* label type for classification */ 63 | #define _D2_RGTYPE real_t /* label type for regression */ 64 | 65 | #endif /* _COMMON_H_ */ 66 | -------------------------------------------------------------------------------- /include/core/decision_tree.hpp: -------------------------------------------------------------------------------- 1 | #ifndef _D2_DECISION_TREE_H_ 2 | #define _D2_DECISION_TREE_H_ 3 | 4 | #include "common.hpp" 5 | #include "utility/timer.h" 6 | #include "traits.hpp" 7 | 8 | // stl headers 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | #ifdef RABIT_RABIT_H 24 | #include 25 | #endif 26 | 27 | namespace d2 { 28 | namespace internal { 29 | 30 | struct DT { 31 | constexpr static real_t prior_weight = 0.00; 32 | }; 33 | 34 | /*! \brief base class for decision tree nodes 35 | * which includes shared functions and data members of both leaf and branch 36 | */ 37 | template 38 | class DTNode { 39 | public: 40 | YStats y_stats; 41 | 42 | typedef DTLeaf Leaf; 43 | typedef DTBranch Branch; 44 | 45 | DTNode() = default; 46 | explicit DTNode(const YStats &ys) : y_stats(ys) {} 47 | 48 | /*! \brief get pointer to the leaf node by a given sample */ 49 | virtual Leaf *getLeafNode(const real_t *X) = 0; 50 | virtual Leaf *getLeafNodeDebug(const real_t *X) = 0; 51 | 52 | virtual size_t getLeafCount() = 0; 53 | 54 | /*! \brief write data into a stream buffer */ 55 | virtual void write(std::ostream *fo) const = 0; 56 | 57 | /*! 
/*! \brief leaf node in decision tree
 *
 *  A terminal node that stores a single predicted label. All virtual
 *  traversal methods bottom out here: a leaf resolves any sample to itself.
 *
 *  NOTE(review): the template parameter list was stripped by the export;
 *  presumably `template <class YStats>` with base `DTNode<YStats>` — confirm
 *  against the original header.
 */
template 
class DTLeaf : public DTNode {
public:
  using typename DTNode::Leaf;
  using typename DTNode::Branch;
  using DTNode::hashCode;

  DTLeaf() = default;

  /*! \brief construct a leaf whose label is derived from the given Y stats */
  explicit DTLeaf(const YStats &ys) : DTNode(ys), label(ys.getLabel()) {}

  /*! \brief construct a new leaf node from a branch node
   *  (used when a branch is collapsed; copies stats, parent link and label)
   */
  explicit DTLeaf(const Branch &that) {
    this->y_stats = that.y_stats;
    //this->score = that.score;
    //this->weight = that.weight;
    //this->r = that.r;
    this->parent = that.parent;
    this->label = that.y_stats.getLabel();
  }

  /*! \brief a leaf resolves every sample to itself */
  Leaf *getLeafNode(const real_t *X) {
    return this;
  }

  Leaf *getLeafNodeDebug(const real_t *X) {
    return this;
  }

  // NOTE(review): `1.` is a double literal implicitly truncated to size_t;
  // behavior is the same as `return 1;` but the literal is misleading.
  size_t getLeafCount() { return 1.; }

  /*! \brief emit this leaf as a filled box in Graphviz dot format */
  void dotgraph(std::ostream &f) const {
    f << "node" << std::hex << hashCode()
      << std::dec << " [label=\"" << label << "\", shape=box, style=filled ]\n";
  }

  /*! \brief dot output annotated with the per-node count from node_mapper
   *  (the operator[] lookup default-inserts 0 for unseen hash codes)
   */
  void dotgraph(std::ostream &f,
                std::unordered_map &node_mapper) const {
    // NOTE(review): the export garbled this identifier to `&¬e` via an HTML
    // entity; restored to `note` here — verify against the original source.
    const size_t &note = node_mapper[hashCode()];
    f << "node" << std::hex << hashCode()
      << std::dec << " [label=\"" << label << "(" << note << ")\", shape=box, style=filled ]\n";
  }

  /*! \brief serialize label and parent index as raw bytes
   *  (format must stay in sync with read() below)
   */
  void write(std::ostream *fo) const {
    fo->write((const char *) &this->label, sizeof(typename YStats::LabelType));
    fo->write((const char *) &this->parent, sizeof(int));
  }

  /*! \brief deserialize label and parent index; byte order/width are
   *  whatever the writing platform used — not portable across ABIs
   */
  void read(std::istream *fi) {
    fi->read((char *) &this->label, sizeof(typename YStats::LabelType));
    fi->read((char *) &this->parent, sizeof(int));
  }

  typename YStats::LabelType label;  // predicted label of this terminal node
};
hashCode() << std::dec << " [label=\"x" << index << " < " << cutoff 189 | << "?\", style=filled]\n"; 190 | f << std::hex; 191 | f << "node" << hashCode() << " -> node" << left->hashCode() << " [label=\"yes\"]\n"; 192 | f << "node" << hashCode() << " -> node" << right->hashCode() << "[label=\"no\"]\n"; 193 | f << std::dec; 194 | } 195 | 196 | 197 | void write(std::ostream *fo) const { 198 | fo->write((const char *) &this->nleft, sizeof(int)); 199 | fo->write((const char *) &this->nright, sizeof(int)); 200 | fo->write((const char *) &this->index, sizeof(size_t)); 201 | fo->write((const char *) &this->cutoff, sizeof(real_t)); 202 | fo->write((const char *) &this->parent, sizeof(int)); 203 | fo->write((const char *) &this->n_leafs, sizeof(size_t)); 204 | } 205 | 206 | void read(std::istream *fi) { 207 | fi->read((char *) &this->nleft, sizeof(int)); 208 | fi->read((char *) &this->nright, sizeof(int)); 209 | fi->read((char *) &this->index, sizeof(size_t)); 210 | fi->read((char *) &this->cutoff, sizeof(real_t)); 211 | fi->read((char *) &this->parent, sizeof(int)); 212 | fi->read((char *) &this->n_leafs, sizeof(size_t)); 213 | } 214 | 215 | DTNode *left = nullptr, *right = nullptr; 216 | int nleft = -1, nright = -1; 217 | size_t index{}; 218 | real_t cutoff{}; 219 | size_t n_leafs{}; 220 | }; 221 | 222 | 223 | /*! 
\brief node assignment data structure stores 224 | * the indexes of sample data 225 | */ 226 | template 227 | struct SortedSampleDeque : public std::deque > { 228 | SortedSampleDeque() : std::deque >() {} 229 | explicit SortedSampleDeque(const size_t n) : std::deque >(n) {} 230 | }; 231 | 232 | template 233 | struct NodeAssignment { 234 | size_t *ptr; ///< index array 235 | std::vector *> sorted_samples; 236 | size_t size; ///< size of index array 237 | size_t cache_offset; ///< offset to the cache array head, aka (ptr - cache_offset) should be constant 238 | int idx_cache_index; 239 | int depth; 240 | YStats y_stats; 241 | void Initialize(size_t dim, size_t *ptr_, size_t size_, size_t cache_offset_, const YStats& y_stats_) { 242 | sorted_samples.resize(dim); 243 | ptr = ptr_; 244 | size = size_; 245 | cache_offset = cache_offset_; 246 | y_stats = y_stats_; 247 | } 248 | }; 249 | 250 | struct IndexCache { 251 | size_t index; 252 | int nleft; 253 | int nright; 254 | }; 255 | 256 | template 257 | struct Goodness { 258 | real_t score; 259 | YStats left, right; 260 | Goodness() {} 261 | Goodness(real_t score, const YStats& left, const YStats& right): 262 | score(score), left(left), right(right) {} 263 | bool operator>(const Goodness& that) { 264 | return score > that.score; 265 | } 266 | bool operator<(const Goodness& that) { 267 | return score < that.score; 268 | } 269 | }; 270 | 271 | /*! \brief the whole data structure used in building the decision trees 272 | */ 273 | 274 | template 275 | struct TreeAssignmentNode { 276 | NodeAssignment data; 277 | int parent_id_hash; // parent_index * 2 + {0: left, 1: right} 278 | int getParentId() { return (parent_id_hash < 0)? 
/*! \brief scan one feature's presorted sample deque and find the best
 *  (minimum-score) binary split for that feature.
 *
 *  \param sample_deque samples sorted ascending by x for this feature
 *  \param n            number of samples to consider
 *  \param cutoff       [out] x-threshold of the best split found
 *  \param left_count   [out] number of samples strictly on the left side
 *  \param presort      must be true; this routine only supports the presorted path
 *  \param y_stats      label statistics of the whole node, used as the baseline
 *  \return the best Goodness found; if no split beats the no-split score the
 *          returned record still carries `no_split_score` (callers compare
 *          against it to decide whether to branch)
 *
 *  NOTE(review): template parameter lists were stripped by the export
 *  (presumably <YStats, criterion, SampleType>); tokens kept as dumped.
 */
template 
Goodness best_split_ptr(SortedSampleDeque &sample_deque,
                        size_t n,
                        real_t &cutoff,
                        size_t &left_count,
                        const bool presort,
                        const YStats &y_stats) {
  assert(presort);

  // running stats for the growing left partition and shrinking right partition
  YStats y_stats_left = def::prepare::left_op(y_stats);
  YStats y_stats_right = def::prepare::right_op(y_stats);

  // baseline: the criterion score of not splitting at all
  const real_t no_split_score = criterion::op(y_stats);

  Goodness best_goodness {no_split_score, y_stats_left, y_stats_right};

  // `i` counts consumed samples and moves in lockstep with `sample`,
  // so `i < n` doubles as the not-past-the-end guard below.
  size_t i = 0;
  typename YStats::LabelType label;
  for (auto sample = sample_deque.begin(); sample != sample_deque.end();) {
    const real_t current_x = sample->x;
    typename YStats::LabelType yy = label = sample->y;
    // advance over a run of samples sharing the same x OR the same label:
    // a cut inside such a run is either invalid (equal x) or cannot improve
    // the score (equal label), so candidate cut points are only run boundaries
    while (i < n && (sample->x == current_x || yy == label)) {
      y_stats_left.updateLeft(yy);
      y_stats_right.updateRight(yy);
      i++;
      sample++;
      if (sample != sample_deque.end()) {
        yy = sample->y;
      }
    };
    if (i < n) {
      // score the candidate cut just before `sample`; lower is better
      const real_t score = YStats::template goodness_score(y_stats_left, y_stats_right);
      if (score < best_goodness.score) {
        best_goodness = Goodness(score, y_stats_left, y_stats_right);
        cutoff = sample->x;   // split test is X[index] < cutoff (see DTBranch)
        left_count = i;
      }
    }
  }

  return best_goodness;
}
| goodness[ii] = best_split_ptr( 402 | *sorted_samples, assignment.size, cutoff[ii], left_count[ii], presort, y_stats); 403 | } 404 | } 405 | // pick the best goodness 406 | auto *best_goodness = std::min_element(goodness.begin(), goodness.end()); 407 | size_t ii = best_goodness - goodness.begin(); 408 | 409 | if (dim_index >= 0) assert(best_goodness - goodness.begin() == dim_index || best_goodness->score == score); 410 | 411 | if (best_goodness->score == score || 412 | (left_count[ii] <= buf.min_leaf_weight && 413 | left_count[ii] > assignment.size - buf.min_leaf_weight)) { 414 | // if the best goodness is not good enough, a leaf node is still created 415 | auto *leaf = new DTLeaf(y_stats); 416 | 417 | return leaf; 418 | } else { 419 | // otherwise, create a branch node subject to the picked dimension/goodness 420 | auto *branch = new DTBranch(ii, cutoff[ii]); 421 | 422 | inplace_split_ptr(*assignment.sorted_samples[ii], assignment); 423 | def::finalize::op(best_goodness->left); 424 | def::finalize::op(best_goodness->right); 425 | 426 | // create branched assignment 427 | aleft.Initialize(dim, 428 | assignment.ptr, 429 | left_count[ii], 430 | assignment.cache_offset, 431 | best_goodness->left); 432 | aright.Initialize(dim, 433 | assignment.ptr + left_count[ii], 434 | assignment.size - left_count[ii], 435 | assignment.cache_offset + left_count[ii], 436 | best_goodness->right); 437 | 438 | if (presort) { 439 | #pragma omp parallel for 440 | for (size_t i = 0; i < aleft.size; ++i) { 441 | buf.sample_mask_cache[aleft.ptr[i]] = 'l'; 442 | } 443 | #pragma omp parallel for 444 | for (size_t i = 0; i < aright.size; ++i) { 445 | buf.sample_mask_cache[aright.ptr[i]] = 'r'; 446 | } 447 | 448 | #pragma omp parallel for 449 | for (size_t d = 0; d < dim; ++d) { 450 | auto &ass = assignment.sorted_samples[d]; 451 | auto &left = aleft.sorted_samples[d]; 452 | auto &right = aright.sorted_samples[d]; 453 | left = new SortedSampleDeque(); 454 | right = new SortedSampleDeque(); 455 | 
// Bit used to tag a child index as referring to the branch array rather
// than the leaf array (indices below 2^30 with bit 30 set = branch).
#define BIT_HIGH_POS 30

/*! \brief rebuild the left/right child pointers of every branch from the
 *  integer child indices (nleft/nright), and return the tree root.
 *
 *  A child index with bit BIT_HIGH_POS set refers into branch_arr (after
 *  masking the tag bit off); otherwise it indexes leaf_arr. The root is
 *  branch_arr[0] when any branch exists, else leaf_arr[0].
 *
 *  NOTE(review): the returned pointers alias elements of the two vectors;
 *  any later reallocation of leaf_arr/branch_arr invalidates the whole tree
 *  and this function must be re-run (as load()/sync() do).
 *
 *  NOTE(review): template parameter list stripped by the export
 *  (presumably <YStats>); tokens kept as dumped.
 */
template 
DTNode *
post_process_node_arr(std::vector > &leaf_arr,
                      std::vector > &branch_arr) {
  for (auto iter = branch_arr.begin(); iter < branch_arr.end(); ++iter) {
    // resolve left child: tagged -> branch, untagged -> leaf
    if (iter->nleft & 1 << BIT_HIGH_POS) {
      iter->left = &branch_arr[iter->nleft & ~(1 << BIT_HIGH_POS)];
    } else {
      iter->left = &leaf_arr[iter->nleft];
    }

    // resolve right child the same way
    if (iter->nright & 1 << BIT_HIGH_POS) {
      iter->right = &branch_arr[iter->nright & ~(1 << BIT_HIGH_POS)];
    } else {
      iter->right = &leaf_arr[iter->nright];
    }
  }
  DTNode *r;
  if (!branch_arr.empty()) {
    r = &branch_arr[0];
    // printf("%zd\n", static_cast *>(r)->nleft);
  } else {
    // degenerate tree: a single leaf is the root
    r = &leaf_arr[0];
  }
  return r;
}
idc = {index, nleft, nright}; 527 | index_arr.push_back(idc); 528 | } 529 | } else { 530 | buffer.warm_start = false; 531 | } 532 | leaf_arr.clear(); 533 | branch_arr.clear(); 534 | 535 | auto &tree_assignment_queue = buffer.tree_assignment_queue; 536 | 537 | // create index array at root node 538 | std::vector root_index(sample_size); 539 | for (size_t i = 0; i < sample_size; ++i) root_index[i] = i; 540 | // create the NodeAssignment at root node and push into stack 541 | NodeAssignment &root_assignment = assign; 542 | root_assignment.ptr = &root_index[0]; 543 | root_assignment.size = sample_size; 544 | root_assignment.cache_offset = 0; 545 | root_assignment.idx_cache_index = 0; 546 | root_assignment.depth = 1; 547 | root_assignment.y_stats = def::prepare::left_op(root_assignment.y_stats); 548 | { 549 | #pragma omp for 550 | for (size_t ii = 0; ii < root_assignment.size; ++ii) { 551 | root_assignment.y_stats.updateLeft(buffer.y[root_assignment.ptr[ii]]); 552 | } 553 | def::finalize::op(root_assignment.y_stats); 554 | } 555 | 556 | tree_assignment_queue.push({root_assignment, -1}); 557 | 558 | 559 | // start to travel a tree construction using a stack 560 | size_t nleafs = 1; 561 | auto current_sample_size_not_in_leaf = sample_size; 562 | 563 | while (!tree_assignment_queue.empty()) { 564 | // std::cout << "fetch data from the top node of stack ... 
" << std::flush; 565 | auto cur_tree = tree_assignment_queue.top(); 566 | auto cur_assignment = cur_tree.data; 567 | int cur_parent = cur_tree.getParentId(); 568 | bool cur_is_right_node = cur_tree.isRightNode(); 569 | 570 | NodeAssignment assignment_left, assignment_right; 571 | DTNode *node; 572 | if (buffer.warm_start && cur_assignment.idx_cache_index >= 0) 573 | node = build_dtnode(cur_assignment, 574 | assignment_left, 575 | assignment_right, 576 | buffer, 577 | presort, 578 | index_arr[cur_assignment.idx_cache_index].index); 579 | else 580 | node = build_dtnode(cur_assignment, 581 | assignment_left, 582 | assignment_right, 583 | buffer, 584 | presort); 585 | node->parent = cur_parent; // set parent index 586 | bool is_branch = assignment_left.ptr != nullptr && assignment_right.ptr != nullptr; 587 | if (buffer.max_nleafs > 0) { 588 | // check if the maximum number of leafs allowed is reached 589 | is_branch = is_branch && tree_assignment_queue.size() + leaf_arr.size() < buffer.max_nleafs; 590 | } 591 | tree_assignment_queue.pop(); 592 | if (is_branch) {// spanning the tree 593 | // std::cout << "branching" << std::endl; 594 | assignment_left.depth = cur_assignment.depth + 1; 595 | assignment_right.depth = cur_assignment.depth + 1; 596 | if (buffer.warm_start && cur_assignment.idx_cache_index >= 0) { 597 | assignment_left.idx_cache_index = index_arr[cur_assignment.idx_cache_index].nleft; 598 | assignment_right.idx_cache_index = index_arr[cur_assignment.idx_cache_index].nright; 599 | } else { 600 | assignment_left.idx_cache_index = -1; 601 | assignment_right.idx_cache_index = -1; 602 | } 603 | 604 | int parent_id = branch_arr.size(); 605 | int left_hash = TreeAssignmentNode::hash(parent_id, 0); 606 | int right_hash = TreeAssignmentNode::hash(parent_id, 1); 607 | tree_assignment_queue.push({assignment_left, left_hash}); 608 | tree_assignment_queue.push({assignment_right, right_hash}); 609 | branch_arr.push_back(std::move(*static_cast * > (node))); 610 | } else { 
611 | current_sample_size_not_in_leaf -= cur_assignment.size; 612 | // std::cout << "reaching a leaf (" << current_sample_size_not_in_leaf << ")" << std::endl; 613 | leaf_arr.push_back(std::move(*static_cast * > (node))); 614 | } 615 | 616 | if (cur_parent >= 0) { 617 | // set child node index 618 | auto &parent = branch_arr[cur_parent]; 619 | size_t ind = (is_branch) ? ((branch_arr.size() - 1) | 1 << BIT_HIGH_POS) : (leaf_arr.size() - 1); 620 | if (cur_is_right_node) { 621 | assert (parent.nright < 0); 622 | parent.nright = ind; 623 | } else { 624 | assert (parent.nleft < 0); 625 | parent.nleft = ind; 626 | } 627 | } 628 | } 629 | 630 | #ifdef COMPILE_PRUNING 631 | // start to pruning the constructed tree 632 | bool pruning = true; 633 | if (false) { 634 | root = post_process_node_arr(leaf_arr, branch_arr); 635 | real_t error_before_pruning = root->get_R(); 636 | real_t weight = root->weight; 637 | size_t n_leafs = root->getLeafCount(); 638 | real_t min_alpha = 0; 639 | std::cerr << getLogHeader() << "initial terminal nodes: "<< n_leafs << std::endl; 640 | while (n_leafs > 512) { 641 | // find the min(r-R) 642 | std::stack branch_ind_stack; 643 | branch_ind_stack.push(0); 644 | min_alpha = branch_arr[0].weight; 645 | int min_ind; 646 | while (!branch_ind_stack.empty()) { 647 | int ind = branch_ind_stack.top(); 648 | real_t alpha = (branch_arr[ind].r - branch_arr[ind].R) / (branch_arr[ind].n_leafs - 1); 649 | if (alpha < min_alpha) { 650 | min_alpha = alpha; 651 | min_ind = ind; 652 | } 653 | branch_ind_stack.pop(); 654 | if (branch_arr[ind].nleft & (1<* leaf = new DTLeaf(branch_arr[min_ind]); 665 | DTBranch &parent = branch_arr[branch_arr[min_ind].parent]; 666 | if (parent.nleft == (min_ind | (1<= 0) { 679 | ind = branch_arr[ind].parent; 680 | branch_arr[ind].R += (branch_arr[min_ind].r - branch_arr[min_ind].R); 681 | branch_arr[ind].n_leafs -= (branch_arr[min_ind].n_leafs - 1); 682 | } 683 | } 684 | std::cerr << getLogHeader() << "remaining terminal nodes: "<< 
/*! \brief predict labels for n samples stored row-major in X (dim values
 *  per sample), writing one label per sample into y.
 *
 *  \param X           input features, length n*dim, sample-major layout
 *  \param n           number of samples
 *  \param y           [out] predicted labels, length n
 *  \param node_mapper optional: if given, increments a per-leaf hit counter
 *                     keyed by the leaf's hashCode() (used by dotgraph
 *                     annotation); operator[] default-inserts 0 on first hit
 *
 *  Requires a fitted/loaded tree (asserts root != nullptr).
 *  NOTE(review): the unordered_map key/value types were stripped by the
 *  export (presumably <size_t, size_t> per the dotgraph overloads).
 */
void predict(const real_t *X, const size_t n, label_t *y,
             std::unordered_map *node_mapper = nullptr) const {
  const real_t *x = X;  // walking pointer, advanced by dim per sample
  assert(root);
  for (size_t i = 0; i < n; ++i, x += dim) {
    auto leaf = root->getLeafNode(x);
    if (node_mapper) {
      (*node_mapper)[leaf->hashCode()]++;
    }
    y[i] = leaf->label;
  }
};
755 | yy_[count] = y[i]; 756 | ss_[count] = sample_weight[i]; 757 | count++; 758 | } 759 | XX = XX_; 760 | yy = yy_; 761 | ss = ss_; 762 | sample_size = nz; 763 | } else { 764 | XX = X; 765 | yy = y; 766 | ss = sample_weight; 767 | sample_size = n; 768 | } 769 | 770 | buf.max_depth = max_depth; 771 | buf.min_leaf_weight = min_leaf_weight; 772 | buf.max_nleafs = max_nleafs; 773 | buf.warm_start = true; 774 | 775 | if (!presorted) { 776 | prepare_presort(XX, yy, ss, sample_size, buf, assign); 777 | presorted = true; 778 | } else { 779 | } 780 | //printf("finish presorting!\n"); 781 | 782 | double start = getRealTime(); 783 | root = build_tree(sample_size, buf, assign, leaf_arr, branch_arr, true); 784 | printf("tree induction time: %lf seconds\n", getRealTime() - start); 785 | 786 | if (sparse) { 787 | delete[] XX; 788 | delete[] yy; 789 | delete[] ss; 790 | } 791 | return 0; 792 | } 793 | 794 | inline void set_communicate(bool bval) { communicate = bval; } 795 | 796 | inline void set_max_depth(size_t depth) { max_depth = depth; } 797 | 798 | inline void set_min_leaf_weight(real_t weight) { min_leaf_weight = weight; } 799 | 800 | inline void set_max_nleafs(size_t nleafs) { max_nleafs = nleafs; } 801 | 802 | typedef internal::DTNode Node; 803 | typedef internal::DTLeaf LeafNode; 804 | typedef internal::DTBranch BranchNode; 805 | 806 | #ifdef RABIT_RABIT_H_ 807 | typedef rabit::utils::MemoryBufferStream MemoryBufferStream; 808 | /*! 
\brief synchronize between multiple processors */ 809 | void sync(size_t rank) { 810 | bool no_model = false; 811 | if (rabit::GetRank() == rank) { // check if model exists 812 | if (leaf_arr.empty()) no_model = true; 813 | } 814 | rabit::Broadcast(&no_model, sizeof(bool), rank); 815 | if (no_model) return; 816 | 817 | std::string s_model; 818 | MemoryBufferStream fs(&s_model); 819 | size_t n_leaf = leaf_arr.size(); 820 | size_t n_branch = branch_arr.size(); 821 | 822 | rabit::Broadcast(&n_leaf, sizeof(size_t), rank); 823 | rabit::Broadcast(&n_branch, sizeof(size_t), rank); 824 | if (rabit::GetRank() != rank) { 825 | leaf_arr.resize(n_leaf); 826 | branch_arr.resize(n_branch); 827 | } else if (rabit::GetRank() == rank) { 828 | save(&fs); 829 | } 830 | fs.Seek(0); 831 | rabit::Broadcast(&s_model, rank); 832 | // if (rabit::GetRank() == rank) printf("%zd: %zd\t %zd\n", rank, n_leaf, n_branch); 833 | if (rabit::GetRank() != rank) { 834 | load(&fs); 835 | // printf("%zd: load data from %zd\n", rabit::GetRank(), rank); 836 | this->root = internal::post_process_node_arr(leaf_arr, branch_arr); 837 | assert(root && !leaf_arr.empty()); 838 | } 839 | rabit::Barrier(); 840 | } 841 | #endif 842 | 843 | /*! \brief helper function that caches data to stream */ 844 | inline void save(std::ostream *fo) { 845 | size_t n_leaf = leaf_arr.size(); 846 | size_t n_branch = branch_arr.size(); 847 | fo->write((const char *) &n_leaf, sizeof(size_t)); 848 | fo->write((const char *) &n_branch, sizeof(size_t)); 849 | 850 | for (const LeafNode &leaf : leaf_arr) { 851 | leaf.write(fo); 852 | } 853 | for (const BranchNode &branch : branch_arr) { 854 | branch.write(fo); 855 | } 856 | } 857 | 858 | /*! 
\brief helper function that restores data from stream */
  inline void load(std::istream *fi) {
    size_t n_leaf;
    size_t n_branch;
    // Header: the two array lengths written by save().
    fi->read((char *) &n_leaf, sizeof(size_t));
    fi->read((char *) &n_branch, sizeof(size_t));

    leaf_arr.resize(n_leaf);
    branch_arr.resize(n_branch);

    for (LeafNode &leaf : leaf_arr) {
      leaf.read(fi);
    }
    for (BranchNode &branch : branch_arr) {
      branch.read(fi);
    }
    // Rebuild the pointer structure from the flat arrays and recover the root.
    this->root = internal::post_process_node_arr(leaf_arr, branch_arr);
    assert(root && !leaf_arr.empty());
  }

  Node *root = nullptr; // entry point of the induced tree (flat storage below)
 private:
  internal::BufferForTreeConstructor buf; // scratch state reused across fit() calls
  internal::NodeAssignment assign;        // per-sample node assignment workspace
  // NOTE(review): the element types of the two vectors below appear to have
  // been lost in this copy (angle-bracket contents stripped); presumably
  // std::vector of LeafNode / BranchNode — restore from upstream.
  std::vector leaf_arr;
  std::vector branch_arr;
  size_t max_depth = 10;
  real_t min_leaf_weight = .0;
  size_t max_nleafs = 0;
  bool presorted = false;  // presorting is done once and reused on warm starts
  bool communicate = true;

  /*! \brief build the per-dimension presorted sample sequences used by the
      exact greedy split search; fills buffer.y / buffer.sample_weight and
      assignment.sorted_samples */
  void prepare_presort(const real_t *XX, const label_t *yy, const real_t *ss,
                       const size_t sample_size,
                       internal::BufferForTreeConstructor &buffer,
                       internal::NodeAssignment &assignment) {
    buffer.y.resize(sample_size);
    buffer.sample_weight.resize(sample_size, 1.); // weight defaults to 1 when ss == NULL
    for (size_t i = 0; i < sample_size; ++i) {
      buffer.y[i] = yy[i];
    }
    if (ss)
      for (size_t i = 0; i < sample_size; ++i) buffer.sample_weight[i] = ss[i];

    assignment.sorted_samples.resize(dim);
    buffer.sample_mask_cache.resize(sample_size);
    // One presorted deque per feature dimension; dimensions are independent,
    // hence the parallel loop. NOTE(review): `new` here has no visible
    // matching delete in this span — confirm ownership elsewhere.
#pragma omp parallel for
    for (size_t k = 0; k < dim; ++k) {
      auto &sorted_samples = assignment.sorted_samples[k];
      sorted_samples = new internal::SortedSampleDeque(sample_size);
      const real_t *XXX = XX + k; // column k of row-major X, strided by dim
      for (size_t i = 0; i < sample_size; ++i, XXX += dim) {
        auto &sample = (*sorted_samples)[i];
        sample = {*XXX, yy[i], i};
        /*
        if (ss)
          sample.weight = ss[i];
        else
          sample.weight = 1.;
        */
} 919 | // presort 920 | std::sort(sorted_samples->begin(), sorted_samples->end(), internal::SortedSample::cmp); 921 | 922 | } 923 | } 924 | 925 | }; 926 | } 927 | 928 | #include "impl/classification.hpp" 929 | #include "impl/regression.hpp" 930 | #include "impl/day_sharpe.hpp" 931 | #include "impl/binary_classification.hpp" 932 | 933 | #endif /* _D2_DECISION_TREE_H_ */ 934 | -------------------------------------------------------------------------------- /include/core/traits.hpp: -------------------------------------------------------------------------------- 1 | #ifndef _TRAITS_H_ 2 | #define _TRAITS_H_ 3 | 4 | namespace d2 { 5 | 6 | namespace def { 7 | 8 | //! \brief the generic trait class for node statistics 9 | //! a Stats class should contain: 10 | // (1) the sufficient statistics for supporting the calculation of the split criteria at each node 11 | // (2) the update rules of statistics, O(1) time, when a cutoff threshold at each node splitting is moving upwards 12 | // (3) the stop criterion of node splitting specific to the sufficient statistics 13 | template 14 | struct Stats { 15 | typedef LT LabelType; 16 | 17 | virtual inline void updateLeft(LabelType y) = 0; 18 | 19 | virtual inline void updateRight(LabelType y) = 0; 20 | 21 | virtual inline LabelType getLabel() const = 0; 22 | 23 | virtual inline bool stop() const = 0; 24 | }; 25 | 26 | template 27 | struct finalize { 28 | static void op(YStats &y_stats) {} 29 | }; 30 | 31 | // inherit statistics from parent 32 | template 33 | struct prepare { 34 | static YStats left_op(const YStats &y_stats) { return YStats(); } 35 | 36 | static YStats right_op(const YStats &y_stats) { return y_stats; } 37 | }; 38 | } 39 | 40 | namespace internal { 41 | /*! 
\brief data structure for additional linked list on presort samples 42 | */ 43 | template 44 | struct SortedSample; 45 | 46 | template 47 | class DTLeaf; 48 | 49 | template 50 | class DTBranch; 51 | 52 | } 53 | } 54 | #endif /* _TRAITS_H_ */ 55 | -------------------------------------------------------------------------------- /include/impl/binary_classification.hpp: -------------------------------------------------------------------------------- 1 | #ifndef _BINARY_CLASSIFICATION_H_ 2 | #define _BINARY_CLASSIFICATION_H_ 3 | 4 | #include "core/common.hpp" 5 | #include "core/traits.hpp" 6 | 7 | #include 8 | #include 9 | #include 10 | 11 | namespace d2 { 12 | namespace def { 13 | //! \brief the Stats class for binary classification problem 14 | struct BinaryClassificationStats : public Stats<_D2_CLTYPE> { 15 | // member variables 16 | std::array histogram; 17 | real_t alpha; 18 | 19 | BinaryClassificationStats() : histogram({}), alpha(0.5) { 20 | } 21 | 22 | BinaryClassificationStats(real_t alpha) : histogram({}), alpha(alpha) { 23 | } 24 | 25 | BinaryClassificationStats(const BinaryClassificationStats &that) : histogram(that.histogram), 26 | alpha(that.alpha) { 27 | } 28 | 29 | using LabelType = Stats<_D2_CLTYPE>::LabelType; 30 | 31 | inline LabelType getLabel() const override { 32 | if (alpha * histogram[1] - (1 - alpha) * histogram[0] <= 0) { 33 | return 0; 34 | } else { 35 | return 1; 36 | } 37 | } 38 | 39 | inline void updateLeft(LabelType y) override { 40 | histogram[y]++; 41 | } 42 | 43 | inline void updateRight(LabelType y) override { 44 | histogram[y]--; 45 | } 46 | 47 | inline bool stop() const override { 48 | // todo 49 | return false; 50 | } 51 | 52 | template 53 | inline static real_t 54 | goodness_score(const BinaryClassificationStats left, const BinaryClassificationStats right) { 55 | return std::min(criterion::op(left), criterion::op(right)); 56 | } 57 | 58 | }; 59 | 60 | /*! 
\brief FN - TN */ 61 | struct fntn { 62 | static inline real_t op(const BinaryClassificationStats &y_stats) { 63 | const auto &alpha = y_stats.alpha; 64 | return std::min(alpha * y_stats.histogram[1] - (1 - alpha) * y_stats.histogram[0], (real_t) 0.); 65 | } 66 | 67 | static inline real_t unnormalized_op(const BinaryClassificationStats &y_stats) { 68 | return op(y_stats); 69 | } 70 | 71 | }; 72 | } 73 | 74 | namespace internal { 75 | 76 | template<> 77 | struct SortedSample { 78 | real_t x; 79 | _D2_CLTYPE y; 80 | size_t index; 81 | 82 | inline static bool cmp(const SortedSample &a, 83 | const SortedSample &b) { 84 | return a.x < b.x; 85 | } 86 | }; 87 | } 88 | 89 | } 90 | 91 | #endif /* _BINARY_CLASSIFICATION_H_ */ 92 | -------------------------------------------------------------------------------- /include/impl/classification.hpp: -------------------------------------------------------------------------------- 1 | #ifndef _CLASSIFICATION_H_ 2 | #define _CLASSIFICATION_H_ 3 | 4 | #include "core/common.hpp" 5 | #include "core/traits.hpp" 6 | 7 | #include 8 | #include 9 | #include 10 | 11 | namespace d2 { 12 | namespace def { 13 | //! 
\brief the Stats class for classification problems.
//! histogram[0..n_class-1] holds the per-class (weighted) count of samples on
//! this side of a candidate split; histogram.back() holds their running total,
//! so both the moving-cutoff updates and the criteria are O(1).
// NOTE(review): several template parameter lists below appear stripped in
// this copy (bare `template`, `std::array` without arguments); restore them
// from upstream before compiling.
template
struct ClassificationStats : public Stats<_D2_CLTYPE> {
  // member variables
  std::array histogram;

  ClassificationStats() : histogram({}) {
  }

  ClassificationStats(const ClassificationStats &that) : histogram(that.histogram) {
  }

  using LabelType = Stats<_D2_CLTYPE>::LabelType;

  // Majority vote over the class slots; end() - 1 excludes the running total
  // stored in the last slot.
  inline LabelType getLabel() const override {
    return std::max_element(histogram.begin(), histogram.end() - 1) - histogram.begin();
  }

  // O(1) update when a sample crosses to the left of the moving cutoff.
  inline void updateLeft(LabelType y) override {
    histogram[y]++;
    histogram.back()++;
  }

  // O(1) update when a sample leaves the right side of the moving cutoff.
  inline void updateRight(LabelType y) override {
    histogram[y]--;
    histogram.back()--;
  }

  inline bool stop() const override {
    // todo: no statistics-based early-stopping rule implemented yet
    return false;
  }

  // Size-weighted average of the split criterion over the two children.
  template
  inline static real_t
  goodness_score(const ClassificationStats left, const ClassificationStats right) {
    return
        (criterion::op(left) * left.histogram.back() + criterion::op(right) * right.histogram.back())
        / (left.histogram.back() + right.histogram.back());
  }

};

/*! \brief gini function used in make splits
 */
struct gini {
  template
  static inline real_t op(const ClassificationStats &y_stats) {
    auto &proportion = y_stats.histogram;
    real_t total_weight_sq;
    total_weight_sq = proportion.back() * proportion.back();
    //if (total_weight_sq <= 0) return 1.;

    // Gini impurity 1 - sum_i p_i^2, computed without forming each p_i.
    // NOTE(review): divides by total_weight_sq with the zero guard commented
    // out — an empty node would divide by zero; confirm callers never pass one.
    real_t gini = total_weight_sq;
    for (size_t i = 0; i < n_class; ++i)
      gini -= proportion[i] * proportion[i];
    gini /= total_weight_sq;
    return gini;
  }

  // Criterion scaled back by the node's total weight, for comparisons that
  // must not be normalized per node.
  template
  static inline real_t unnormalized_op(const ClassificationStats &y_stats) {
    return op(y_stats) * y_stats.histogram.back();
  }

  static inline real_t loss(const real_t &x) { return 1 - x; }
};

/*!
\brief entropy function used in make splits 82 | */ 83 | struct entropy { 84 | template 85 | static inline real_t op(const ClassificationStats &y_stats) { 86 | auto &proportion = y_stats.histogram; 87 | real_t total_weight; 88 | total_weight = proportion.back(); 89 | assert(total_weight > 0); 90 | 91 | real_t entropy = 0.; 92 | for (size_t i = 0; i < n_class; ++i) { 93 | if (proportion[i] > 0) { 94 | real_t p = proportion[i] / total_weight; 95 | entropy -= log(p) * p; 96 | } 97 | } 98 | 99 | return entropy; 100 | } 101 | 102 | template 103 | static inline real_t unnormalized_op(const ClassificationStats &y_stats) { 104 | return op(y_stats) * y_stats.histogram.back(); 105 | } 106 | 107 | static inline real_t loss(const real_t &x) { return -log(x); } 108 | }; 109 | 110 | 111 | } 112 | 113 | namespace internal { 114 | 115 | template 116 | struct SortedSample > { 117 | real_t x; 118 | _D2_CLTYPE y; 119 | // real_t weight; 120 | size_t index; 121 | 122 | //SortedSample *next; 123 | inline static bool cmp(const SortedSample > &a, 124 | const SortedSample > &b) { 125 | return a.x < b.x; 126 | } 127 | }; 128 | } 129 | } 130 | 131 | #endif /* _CLASSIFICATION_H_ */ 132 | -------------------------------------------------------------------------------- /include/impl/day_sharpe.hpp: -------------------------------------------------------------------------------- 1 | #ifndef _DAY_SHARPE_H_ 2 | #define _DAY_SHARPE_H_ 3 | 4 | 5 | #include "core/traits.hpp" 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | namespace d2 { 13 | namespace def { 14 | 15 | struct sharpe_stats { 16 | real_t mean; 17 | real_t std; 18 | real_t sharpe; //< negative sharpe 19 | }; 20 | 21 | template 22 | sharpe_stats _sharpe_helper(const std::array &fr) { 23 | real_t m1 = 0, m2 = 0; 24 | for (size_t i = 0; i < days; ++i) { 25 | m1 += fr[i]; 26 | m2 += fr[i] * fr[i]; 27 | } 28 | 29 | m1 = m1 / days; 30 | m2 = m2 / days; 31 | 32 | return {m1, 33 | static_cast(sqrt(m2 - m1 * m1)), 34 | 
static_cast(-m1 / (sqrt(m2 - m1 * m1) + 1E-10))}; 35 | } 36 | 37 | struct reward_date_pair { 38 | real_t reward; 39 | size_t date; 40 | 41 | bool operator==(const reward_date_pair &that) { 42 | return reward == that.reward; 43 | } 44 | }; 45 | 46 | std::ostream &operator<<(std::ostream &out, const reward_date_pair &p) { 47 | out << p.reward; 48 | return out; 49 | } 50 | 51 | template 52 | struct DaySharpeStats : Stats { 53 | size_t count; 54 | std::array forward_return; 55 | real_t best; 56 | 57 | DaySharpeStats() : count(0), forward_return({}), best(std::numeric_limits::max()) {} 58 | 59 | DaySharpeStats(const DaySharpeStats &that) : count(that.count), forward_return(that.forward_return), 60 | best(that.best) {} 61 | 62 | using Stats::LabelType; 63 | 64 | inline LabelType getLabel() const override { 65 | return {std::min(_sharpe_helper(this->forward_return).sharpe, (real_t) 0.), 66 | std::numeric_limits::max()}; 67 | } 68 | 69 | inline void updateLeft(LabelType y) override { 70 | forward_return[y.date] += y.reward; 71 | count++; 72 | } 73 | 74 | inline void updateRight(LabelType y) override { 75 | forward_return[y.date] -= y.reward; 76 | count--; 77 | } 78 | 79 | inline bool stop() const override { 80 | // todo 81 | return false; 82 | } 83 | 84 | template 85 | inline static real_t goodness_score(const DaySharpeStats left, const DaySharpeStats right) { 86 | return std::min(criterion::op(left), criterion::op(right)); 87 | } 88 | }; 89 | 90 | template 91 | struct finalize, criterion> { 92 | static void op(DaySharpeStats &y_stats) { 93 | y_stats.best = std::min(y_stats.best, criterion::op(y_stats)); 94 | } 95 | }; 96 | 97 | template 98 | struct prepare, criterion> { 99 | static DaySharpeStats left_op(const DaySharpeStats &y_stats) { 100 | DaySharpeStats left_stats; 101 | left_stats.best = y_stats.best; 102 | return left_stats; 103 | } 104 | 105 | static DaySharpeStats right_op(const DaySharpeStats &y_stats) { 106 | return y_stats; 107 | } 108 | }; 109 | 110 | struct 
sharpe { 111 | template 112 | static inline real_t op(const DaySharpeStats &y_stats) { 113 | return _sharpe_helper(y_stats.forward_return).sharpe; 114 | //return std::min(_sharpe_helper(y_stats.forward_return).sharpe, y_stats.best); 115 | } 116 | 117 | template 118 | static inline real_t unnormalized_op(const DaySharpeStats &y_stats) { 119 | return _sharpe_helper(y_stats.forward_return).mean; 120 | } 121 | 122 | }; 123 | } 124 | 125 | namespace internal { 126 | template 127 | struct SortedSample; 128 | 129 | template 130 | struct SortedSample > { 131 | real_t x; 132 | def::reward_date_pair y; 133 | // real_t weight; 134 | size_t index; 135 | 136 | //SortedSample *next; 137 | inline static bool cmp(const SortedSample > &a, 138 | const SortedSample > &b) { 139 | return a.x < b.x; 140 | } 141 | }; 142 | } 143 | } 144 | 145 | 146 | #endif /* _DAY_SHARPE_H_ */ 147 | -------------------------------------------------------------------------------- /include/impl/regression.hpp: -------------------------------------------------------------------------------- 1 | #ifndef _REGRESSION_H_ 2 | #define _REGRESSION_H_ 3 | 4 | #include "core/common.hpp" 5 | #include "core/traits.hpp" 6 | 7 | namespace d2 { 8 | //! 
\brief customizable template classes that extend the scope of decision tree implementations 9 | namespace def { 10 | struct RegressionStats : Stats<_D2_RGTYPE> { 11 | using LabelType = Stats<_D2_RGTYPE>::LabelType; 12 | size_t count; 13 | double sum; 14 | double sum_sq; 15 | 16 | 17 | RegressionStats() : count(0), sum(0), sum_sq(0) {} 18 | 19 | RegressionStats(const RegressionStats &that) : 20 | count(that.count), sum(that.sum), sum_sq(that.sum_sq) { 21 | } 22 | 23 | inline LabelType getLabel() const override { 24 | return (LabelType) sum / (LabelType) count; 25 | } 26 | 27 | inline void updateLeft(LabelType y) override { 28 | count++; 29 | sum += y; 30 | sum_sq += y * y; 31 | } 32 | 33 | inline void updateRight(LabelType y) override { 34 | count--; 35 | sum -= y; 36 | sum_sq -= y * y; 37 | } 38 | 39 | inline bool stop() const override { 40 | // todo 41 | return false; 42 | } 43 | 44 | template 45 | inline static real_t goodness_score(const RegressionStats left, const RegressionStats right) { 46 | return 47 | (criterion::op(left) * left.count + criterion::op(right) * right.count) 48 | / (left.count + right.count); 49 | } 50 | }; 51 | 52 | /*! 
\brief mean square error function 53 | */ 54 | struct mse { 55 | static inline real_t op(const RegressionStats &y_stats) { 56 | real_t mean = (real_t) y_stats.sum / (real_t) y_stats.count; 57 | return (real_t) y_stats.sum_sq / (real_t) y_stats.count - mean * mean; 58 | } 59 | 60 | static inline real_t unnormalized_op(const RegressionStats &y_stats) { 61 | return op(y_stats) * y_stats.count; 62 | } 63 | }; 64 | } 65 | 66 | namespace internal { 67 | template<> 68 | struct SortedSample { 69 | real_t x; 70 | real_t y; 71 | // real_t weight; 72 | size_t index; 73 | 74 | //SortedSample *next; 75 | inline static bool cmp(const SortedSample &a, 76 | const SortedSample &b) { 77 | return a.x < b.x; 78 | } 79 | }; 80 | } 81 | } 82 | 83 | #endif /* _REGRESSION_H_ */ 84 | -------------------------------------------------------------------------------- /include/utility/timer.h: -------------------------------------------------------------------------------- 1 | #ifndef _TIMER_H_ 2 | #define _TIMER_H_ 3 | 4 | #define BILLION 1000000000L 5 | 6 | // Timing, count in nano seconds. 7 | #if defined (_WIN32) 8 | #include 9 | 10 | inline double getRealTime() { 11 | FILETIME tm; 12 | ULONGLONG t; 13 | #if defined(NTDDI_WIN8) && NTDDI_VERSION >= NTDDI_WIN8 14 | /* Windows 8, Windows Server 2012 and later. ---------------- */ 15 | GetSystemTimePreciseAsFileTime( &tm ); 16 | #else 17 | /* Windows 2000 and later. 
---------------------------------- */ 18 | GetSystemTimeAsFileTime( &tm ); 19 | #endif 20 | t = ((ULONGLONG)tm.dwHighDateTime << 32) | (ULONGLONG)tm.dwLowDateTime; 21 | return (double) t / (double) BILLION; 22 | } 23 | 24 | 25 | #else 26 | 27 | #include 28 | 29 | #ifdef __MACH__ 30 | #include 31 | //clock_gettime is not implemented on OSX 32 | #include 33 | #include 34 | //#define CLOCK_REALTIME 0 35 | //#define CLOCK_MONOTONIC 0 36 | inline int clock_gettime(int clk_id, struct timespec* ts) { 37 | clock_serv_t cclock; 38 | mach_timespec_t mts; 39 | clk_id = 0; // something stupid to get ride of warnings 40 | host_get_clock_service(mach_host_self(), CALENDAR_CLOCK, &cclock); 41 | clock_get_time(cclock, &mts); 42 | mach_port_deallocate(mach_task_self(), cclock); 43 | ts->tv_sec = mts.tv_sec; 44 | ts->tv_nsec = mts.tv_nsec; 45 | return 0; 46 | } 47 | #endif 48 | 49 | 50 | inline double getRealTime() { 51 | struct timespec ts; 52 | clock_gettime(CLOCK_MONOTONIC, &ts); 53 | return (double) (ts.tv_sec) + (double) (ts.tv_nsec) / (double) BILLION; 54 | } 55 | 56 | #endif 57 | 58 | 59 | #endif /* _TIMER_H_ */ 60 | -------------------------------------------------------------------------------- /test/eval_day_sharpe.cpp: -------------------------------------------------------------------------------- 1 | #include "core/decision_tree.hpp" 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | using namespace d2; 9 | using namespace std; 10 | 11 | 12 | /* training sample size */ 13 | #ifndef N 14 | #define N 1000000 15 | #endif 16 | 17 | /* testing sample size */ 18 | #ifndef M 19 | #define M 0 20 | #endif 21 | 22 | /* dimension of features */ 23 | #ifndef D 24 | #define D 28 25 | #endif 26 | 27 | /* number of classes */ 28 | #ifndef NC 29 | #define NC 2 30 | #endif 31 | 32 | /* maximal depth */ 33 | #ifndef MD 34 | #define MD 8 35 | #endif 36 | 37 | /* minimal sample weight (size) */ 38 | #ifndef MW 39 | #define MW .0 40 | #endif 41 | 42 | #ifndef DAYS 43 | #define 
DAYS 100
#endif


using namespace d2::def;

// Each sample is labeled by (reward, date): the P&L it contributes and the
// day bucket it belongs to.
typedef reward_date_pair d2_label_t;

// NOTE(review): template parameter lists and some container type arguments
// below were lost in this copy (bare `template`, `std::array k`,
// `std::set orderid_set`); restore them from upstream.
template
void sample_naive_data(real_t *X, LabelType *y, real_t *w, size_t n);

template
real_t metric(LabelType *y_pred, LabelType *y_true, size_t n);


// Synthetic data: reward is +/-1 with equal probability and a random day;
// positive-reward samples get features in [0,1) while negative ones are
// shifted down by 0.1, so a tree can separate them.
template<>
void sample_naive_data(real_t *X, reward_date_pair *y, real_t *w, size_t n) {
  for (size_t i = 0; i < n; ++i) {
    y[i].reward = 2 * (rand() % 2) - 1;
    y[i].date = rand() % DAYS;
    if (((int) y[i].reward + 1) / 2) {
      for (size_t j = 0; j < D; ++j)
        X[i * D + j] = (real_t) rand() / (real_t) RAND_MAX;
    } else {
      for (size_t j = 0; j < D; ++j)
        X[i * D + j] = (real_t) rand() / (real_t) RAND_MAX - .1;
    }
    if (w) w[i] = 1.; // (real_t) rand() / (real_t) RAND_MAX;
  }
}

// Evaluation: take every trade the model predicts as active (nonzero
// predicted reward), accumulate true rewards per day, and return the
// (negative) sharpe of the resulting daily P&L — lower is better.
template<>
real_t metric(reward_date_pair *y_pred, reward_date_pair *y_true, size_t n) {
  std::array k = {};
  for (size_t i = 0; i < n; ++i)
    if (y_pred[i].reward != 0) k[y_true[i].date] += y_true[i].reward;

  return _sharpe_helper(k).sharpe;
}

// Same evaluation, but each order id is counted at most once per day and the
// resulting mean/std/sharpe are printed.
// NOTE(review): the dedup set is cleared whenever the date changes, so this
// assumes the input is grouped by date — confirm against the data file.
// Also, `date` (size_t) is compared against the int sentinel -1; this works
// through unsigned conversion but is fragile.
void metric_time(reward_date_pair *y_pred, reward_date_pair *y_true, size_t n, unsigned long long int *orderid) {
  std::array k = {};
  std::set orderid_set;
  int current_day = -1;
  for (size_t i = 0; i < n; ++i) {
    if (y_true[i].date != current_day) {
      current_day = y_true[i].date;
      orderid_set.clear();
    }
    if (orderid_set.find(orderid[i]) != orderid_set.end()) continue;

    if (y_pred[i].reward != 0) {
      k[y_true[i].date] += y_true[i].reward;
      orderid_set.insert(orderid[i]);
    }
  }

  auto stats = _sharpe_helper(k);
  std::cout << "mean: " << stats.mean << std::endl;
  std::cout << "std: " << stats.std << std::endl;
  std::cout << "sharpe: " << stats.sharpe << std::endl;
}


int main(int argc, char *argv[]) {
  assert(N >= M);
  real_t *X, *w =
NULL; 110 | d2_label_t *y, *y_pred; 111 | unsigned long long int *orderid; 112 | 113 | // prepare naive training data 114 | X = new real_t[D * N]; 115 | y = new d2_label_t[N]; 116 | //w = new real_t[N]; 117 | y_pred = new d2_label_t[N]; 118 | orderid = new unsigned long long int[N]; 119 | 120 | if (argc == 1) { 121 | sample_naive_data(X, y, w, N); 122 | } else { 123 | ifstream train_fs; 124 | train_fs.open(argv[1]); 125 | for (auto i = 0; i < N; ++i) { 126 | string line; 127 | getline(train_fs, line); 128 | istringstream ss(line); 129 | string number; 130 | 131 | getline(ss, number, ','); 132 | orderid[i] = stoll(number); 133 | 134 | getline(ss, number, ','); 135 | y[i].reward = (real_t) stof(number); 136 | 137 | getline(ss, number, ','); 138 | y[i].date = (size_t) stoi(number); 139 | 140 | for (auto j = 0; j < D; ++j) { 141 | getline(ss, number, ','); 142 | X[i * D + j] = stof(number); 143 | } 144 | } 145 | train_fs.close(); 146 | std::cout << "finished data load!" << std::endl; 147 | } 148 | 149 | auto classifier = new Decision_Tree, sharpe>(); 150 | 151 | 152 | classifier->init(); 153 | classifier->set_max_depth(MD); 154 | classifier->set_min_leaf_weight(MW); 155 | 156 | std::fstream f; 157 | f.open("tree.bin", std::fstream::in); 158 | classifier->load(&f); 159 | f.close(); 160 | 161 | std::ostringstream oss; 162 | classifier->dotgraph(oss); 163 | std::cout << oss.str() << std::endl; 164 | 165 | 166 | if (argc == 1) { 167 | sample_naive_data(X, y, w, M); 168 | classifier->predict(X, M, y_pred); 169 | // output result 170 | printf("test sharpe: %.3f\n", -metric(y_pred, y, M)); 171 | } else { 172 | classifier->predict(X, N, y_pred); 173 | metric_time(y_pred, y, N, orderid); 174 | } 175 | 176 | 177 | delete[] X; 178 | delete[] y; 179 | delete[] y_pred; 180 | delete[] orderid; 181 | 182 | return 0; 183 | } 184 | -------------------------------------------------------------------------------- /test/test_day_sharpe.cpp: 
-------------------------------------------------------------------------------- 1 | #include "core/decision_tree.hpp" 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | using namespace d2; 8 | using namespace std; 9 | 10 | 11 | /* training sample size */ 12 | #ifndef N 13 | #define N 1000000 14 | #endif 15 | 16 | /* testing sample size */ 17 | #ifndef M 18 | #define M 0 19 | #endif 20 | 21 | /* dimension of features */ 22 | #ifndef D 23 | #define D 28 24 | #endif 25 | 26 | /* number of classes */ 27 | #ifndef NC 28 | #define NC 2 29 | #endif 30 | 31 | /* maximal depth */ 32 | #ifndef MD 33 | #define MD 8 34 | #endif 35 | 36 | /* minimal sample weight (size) */ 37 | #ifndef MW 38 | #define MW .0 39 | #endif 40 | 41 | #ifndef DAYS 42 | #define DAYS 100 43 | #endif 44 | 45 | 46 | using namespace d2::def; 47 | 48 | typedef reward_date_pair d2_label_t; 49 | 50 | template 51 | void sample_naive_data(real_t *X, LabelType *y, real_t *w, size_t n); 52 | 53 | template 54 | real_t metric(LabelType *y_pred, LabelType *y_true, size_t n); 55 | 56 | 57 | template<> 58 | void sample_naive_data(real_t *X, reward_date_pair *y, real_t *w, size_t n) { 59 | for (size_t i = 0; i < n; ++i) { 60 | y[i].reward = 2 * (rand() % 2) - 1; 61 | y[i].date = rand() % DAYS; 62 | if (((int) y[i].reward + 1) / 2) { 63 | for (size_t j = 0; j < D; ++j) 64 | X[i * D + j] = (real_t) rand() / (real_t) RAND_MAX; 65 | } else { 66 | for (size_t j = 0; j < D; ++j) 67 | X[i * D + j] = (real_t) rand() / (real_t) RAND_MAX - .1; 68 | } 69 | if (w) w[i] = 1.; // (real_t) rand() / (real_t) RAND_MAX; 70 | } 71 | } 72 | 73 | template<> 74 | real_t metric(reward_date_pair *y_pred, reward_date_pair *y_true, size_t n) { 75 | std::array k = {}; 76 | for (size_t i = 0; i < n; ++i) 77 | if (y_pred[i].reward != 0) k[y_true[i].date] += y_true[i].reward; 78 | 79 | return _sharpe_helper(k).sharpe; 80 | } 81 | 82 | 83 | int main(int argc, char *argv[]) { 84 | assert(N >= M); 85 | real_t *X, *w = NULL; 86 | d2_label_t *y, 
*y_pred; 87 | 88 | // prepare naive training data 89 | X = new real_t[D * N]; 90 | y = new d2_label_t[N]; 91 | //w = new real_t[N]; 92 | y_pred = new d2_label_t[N]; 93 | 94 | if (argc == 1) { 95 | sample_naive_data(X, y, w, N); 96 | ofstream train_fs; 97 | train_fs.open("data.csv"); 98 | for (auto i = 0; i < N; ++i) { 99 | train_fs << y[i].reward; 100 | train_fs << "," << y[i].date; 101 | for (auto j = 0; j < D; ++j) 102 | train_fs << "," << X[i * D + j]; 103 | train_fs << endl; 104 | } 105 | train_fs.close(); 106 | } else { 107 | ifstream train_fs; 108 | train_fs.open(argv[1]); 109 | for (auto i = 0; i < N; ++i) { 110 | string line; 111 | getline(train_fs, line); 112 | istringstream ss(line); 113 | string number; 114 | getline(ss, number, ','); 115 | y[i].reward = (real_t) stof(number); 116 | getline(ss, number, ','); 117 | y[i].date = (size_t) stoi(number); 118 | for (auto j = 0; j < D; ++j) { 119 | getline(ss, number, ','); 120 | X[i * D + j] = stof(number); 121 | } 122 | } 123 | train_fs.close(); 124 | std::cout << "finished data load!" 
<< std::endl; 125 | } 126 | 127 | auto classifier = new Decision_Tree, sharpe>(); 128 | 129 | 130 | classifier->init(); 131 | classifier->set_max_depth(MD); 132 | classifier->set_min_leaf_weight(MW); 133 | // training 134 | double start = getRealTime(); 135 | classifier->fit(X, y, w, N); 136 | printf("training time: %lf seconds\n", getRealTime() - start); 137 | printf("nleafs: %zu \n", classifier->root->getLeafCount()); 138 | 139 | std::ostringstream oss; 140 | classifier->dotgraph(oss); 141 | std::cout << oss.str() << std::endl; 142 | 143 | if (argc == 1) { 144 | sample_naive_data(X, y, w, M); 145 | classifier->predict(X, M, y_pred); 146 | // output result 147 | printf("test sharpe: %.3f\n", -metric(y_pred, y, M)); 148 | } else { 149 | } 150 | 151 | std::fstream f; 152 | f.open("tree.bin", std::fstream::out); 153 | classifier->save(&f); 154 | f.close(); 155 | 156 | delete[] X; 157 | delete[] y; 158 | delete[] y_pred; 159 | 160 | return 0; 161 | } 162 | -------------------------------------------------------------------------------- /test/test_dt.cpp: -------------------------------------------------------------------------------- 1 | #include "core/decision_tree.hpp" 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | using namespace d2; 8 | using namespace std; 9 | 10 | 11 | /* training sample size */ 12 | #ifndef N 13 | #define N 1000000 14 | #endif 15 | 16 | /* testing sample size */ 17 | #ifndef M 18 | #define M 0 19 | #endif 20 | 21 | /* dimension of features */ 22 | #ifndef D 23 | #define D 28 24 | #endif 25 | 26 | /* number of classes */ 27 | #ifndef NC 28 | #define NC 2 29 | #endif 30 | 31 | /* maximal depth */ 32 | #ifndef MD 33 | #define MD 8 34 | #endif 35 | 36 | /* maximal number of leafs */ 37 | #ifndef ML 38 | #define ML 2000 39 | #endif 40 | 41 | /* minimal sample weight (size) */ 42 | #ifndef MW 43 | #define MW 1.0 44 | #endif 45 | 46 | #ifdef USE_D2_CLTYPE 47 | typedef _D2_CLTYPE d2_label_t; 48 | #elif defined USE_D2_RGTYPE 49 | typedef _D2_RGTYPE 
d2_label_t; 50 | #endif 51 | 52 | template 53 | void sample_naive_data(real_t *X, LabelType *y, real_t *w, size_t n) { 54 | for (size_t i = 0; i < n; ++i) { 55 | y[i] = ((real_t) rand() / (real_t) RAND_MAX) < 0.02; 56 | if (y[i]) { 57 | for (size_t j = 0; j < D; ++j) 58 | X[i * D + j] = (real_t) rand() / (real_t) RAND_MAX; 59 | } else { 60 | for (size_t j = 0; j < D; ++j) 61 | X[i * D + j] = (real_t) rand() / (real_t) RAND_MAX - 0.01; 62 | } 63 | if (w) w[i] = 1.; // (real_t) rand() / (real_t) RAND_MAX; 64 | } 65 | } 66 | 67 | 68 | struct Metric { 69 | real_t FP, FN, Sensitivity, Specificity, Accuracy; 70 | }; 71 | 72 | template 73 | Metric metric(LabelType *y_pred, LabelType *y_true, size_t n) { 74 | size_t fn = 0, fp = 0, tn = 0, tp = 0; 75 | for (size_t i = 0; i < n; ++i) { 76 | if (y_true[i] == 0) { 77 | if (y_pred[i] < 0.5) tn++; 78 | else fn++; 79 | } else { 80 | if (y_pred[i] > 0.5) tp++; 81 | else fp++; 82 | } 83 | } 84 | return {(real_t) fp / (real_t) (fp + tp), 85 | (real_t) fn / (real_t) (fn + tn), 86 | (real_t) tp / (real_t) (tp + fn), 87 | (real_t) tn / (real_t) (tn + fp), 88 | (real_t) (tp + tn) / (real_t) (tp + tn + fp + fn)}; 89 | } 90 | 91 | int main(int argc, char *argv[]) { 92 | 93 | real_t *X, *w = NULL; 94 | d2_label_t *y, *y_pred; 95 | 96 | // prepare naive training data 97 | X = new real_t[D * N]; 98 | y = new d2_label_t[N]; 99 | //w = new real_t[N]; 100 | y_pred = new d2_label_t[M]; 101 | 102 | if (argc == 1) { 103 | sample_naive_data(X, y, w, N); 104 | } else { 105 | ifstream train_fs; 106 | train_fs.open(argv[1]); 107 | for (auto i = 0; i < N; ++i) { 108 | string line; 109 | getline(train_fs, line); 110 | istringstream ss(line); 111 | string number; 112 | getline(ss, number, ','); 113 | y[i] = (d2_label_t) stof(number); 114 | for (auto j = 0; j < D; ++j) { 115 | getline(ss, number, ','); 116 | X[i * D + j] = stof(number); 117 | } 118 | } 119 | train_fs.close(); 120 | } 121 | 122 | 123 | // create classifier 124 | #if USE_D2_CLTYPE 125 | 
auto classifier = new Decision_Tree, def::gini>(); 126 | //auto classifier = new Decision_Tree(); 127 | #elif USE_D2_RGTYPE 128 | auto classifier = new Decision_Tree(); 129 | #endif 130 | 131 | classifier->init(); 132 | classifier->set_max_depth(MD); 133 | classifier->set_min_leaf_weight(MW); 134 | classifier->set_max_nleafs(ML); 135 | // training 136 | double start = getRealTime(); 137 | classifier->fit(X, y, w, N); 138 | printf("training time: %lf seconds\n", getRealTime() - start); 139 | printf("nleafs: %zu \n", classifier->root->getLeafCount()); 140 | 141 | if (argc == 1) { 142 | // prepare naive testing data 143 | sample_naive_data(X, y, w, M); 144 | classifier->predict(X, M, y_pred); 145 | 146 | } else if (argc == 3) { 147 | assert(M < N); 148 | ifstream test_fs; 149 | test_fs.open(argv[2]); 150 | for (auto i = 0; i < M; ++i) { 151 | string line; 152 | getline(test_fs, line); 153 | istringstream ss(line); 154 | string number; 155 | getline(ss, number, ','); 156 | y[i] = stof(number); 157 | for (auto j = 0; j < D; ++j) { 158 | getline(ss, number, ','); 159 | X[i * D + j] = stof(number); 160 | } 161 | } 162 | test_fs.close(); 163 | // classifier->predict_debug(X); 164 | classifier->predict(X, M, y_pred); 165 | } 166 | 167 | auto result = metric(y_pred, y, M); 168 | printf("test metric: FP %.3f, FN %.3f, Sensitivity %.3f, Specificity %.3f, Accuracy %.3f\n", 169 | result.FP, result.FN, result.Sensitivity, result.Specificity, result.Accuracy); 170 | 171 | delete[] X; 172 | delete[] y; 173 | delete[] y_pred; 174 | 175 | return 0; 176 | } 177 | -------------------------------------------------------------------------------- /tools/sharpe_finder.cpp: -------------------------------------------------------------------------------- 1 | #include "core/decision_tree.hpp" 2 | #include "utility/CLI11.hpp" 3 | 4 | #include 5 | #include 6 | #include 7 | using namespace d2; 8 | using namespace d2::def; 9 | using namespace std; 10 | 11 | typedef reward_date_pair d2_label_t; 12 
| 13 | void metric_time(reward_date_pair *y_pred, reward_date_pair *y_true, size_t n, unsigned long long int *orderid) { 14 | std::array k = {}; 15 | std::set orderid_set; 16 | int current_day = -1; 17 | for (size_t i=0; i &X, vector &y, 37 | vector &order_id, bool has_order_id = false) { 38 | ifstream train_fs; 39 | if (filename.empty()) { 40 | return -1; 41 | } 42 | 43 | train_fs.open(filename); 44 | if (!train_fs.good()) { 45 | cerr <<"Error: cannot open " << filename << endl; 46 | return -1; 47 | } 48 | string line; 49 | 50 | while (getline(train_fs, line)) { 51 | istringstream ss(line); 52 | string number; 53 | if (has_order_id) { 54 | getline(ss, number, ','); 55 | order_id.push_back(stoll(number)); 56 | } 57 | y.push_back(d2_label_t()); 58 | getline(ss, number, ','); 59 | y.back().reward = (real_t) stof(number); 60 | getline(ss, number, ','); 61 | y.back().date = (size_t) stoi(number); 62 | for (auto j=0; j &out) { 86 | // construct a stream from the string 87 | std::stringstream ss(str); 88 | 89 | std::string s; 90 | while (std::getline(ss, s, delim)) { 91 | out.push_back(s); 92 | } 93 | }; 94 | 95 | std::vector filenames; 96 | tokenize(filenames_buffer, ',', filenames); 97 | 98 | vector< Decision_Tree, sharpe> > classifiers (filenames.size()); 99 | 100 | 101 | // read test data 102 | vector X_test; 103 | vector y_test; 104 | vector orders_test; 105 | bool has_test = read_data(test_filename, X_test, y_test, orders_test, true) == 0; 106 | std::unordered_map node_mapper; 107 | 108 | 109 | for (auto i=0; i < filenames.size(); ++i) { 110 | string filename = filenames[i]; 111 | vector X; 112 | vector y; 113 | vector orders; 114 | read_data(filename, X, y, orders); 115 | 116 | vector X_reduced; 117 | vector y_reduced; 118 | 119 | size_t n = y.size(); 120 | for (size_t nn = 0; nn < n; ++ nn) { 121 | bool skip = false; 122 | for (int j=0; jgetLeafCount()); 152 | X_reduced.clear(); 153 | y_reduced.clear(); 154 | 155 | std::fstream f; 156 | string tree_file = 
"tree.bin." + to_string(i); 157 | f.open(tree_file.c_str(), std::fstream::out); 158 | classifier.save(&f); 159 | f.close(); 160 | 161 | if (has_test) { 162 | vector y_pred(y_test.size()); 163 | for (size_t nn = 0; nn < y_test.size(); ++nn) { 164 | auto &yy = y_pred[nn]; 165 | for (int j=0; j