├── .gitignore ├── CMakeLists.txt ├── Gbm.h ├── Concurrency.h ├── Concurrency.cpp ├── README.md ├── LogisticFun.h ├── Config.h ├── Config.cpp ├── GbmFun.h ├── DataSet.h ├── Tree.h ├── TreeRegressor.h ├── Gbm.cpp ├── DataSet.cpp ├── TreeRegressor.cpp ├── Train.cpp └── LICENSE /.gitignore: -------------------------------------------------------------------------------- 1 | CMakeCache.txt 2 | CMakeFiles/ 3 | CMakeLists.txt~ 4 | Makefile 5 | cmake_install.cmake 6 | train 7 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required (VERSION 3.0) 2 | project("Gbm Training") 3 | 4 | SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -std=c++11 -O2") 5 | 6 | include_directories("/usr/local/include") 7 | 8 | ADD_LIBRARY(folly STATIC IMPORTED) 9 | set_property(TARGET folly PROPERTY IMPORTED_LOCATION /usr/local/lib/libfolly.a) 10 | 11 | ADD_LIBRARY(thrift STATIC IMPORTED) 12 | set_property(TARGET thrift PROPERTY IMPORTED_LOCATION /usr/local/lib/libthrift.a) 13 | 14 | set(gflags_SHARED FALSE) 15 | set(gflags_NOTHREADS FALSE) 16 | find_package(gflags REQUIRED) 17 | 18 | ADD_LIBRARY(glog STATIC IMPORTED) 19 | set_property(TARGET glog PROPERTY IMPORTED_LOCATION /usr/local/lib/libglog.a) 20 | 21 | ADD_LIBRARY(double-conversion STATIC IMPORTED) 22 | set_property(TARGET double-conversion PROPERTY IMPORTED_LOCATION /usr/local/lib/libdouble-conversion.a) 23 | 24 | add_executable(train 25 | Concurrency.cpp 26 | Config.cpp 27 | DataSet.cpp 28 | Gbm.cpp 29 | Train.cpp 30 | TreeRegressor.cpp) 31 | 32 | target_link_libraries(train 33 | pthread 34 | double-conversion 35 | folly 36 | thrift 37 | gflags 38 | glog) 39 | -------------------------------------------------------------------------------- /Gbm.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2015,2016 Tao Xu 2 | * Licensed to the Apache 
Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | 20 | #pragma once 21 | 22 | #include 23 | #include 24 | 25 | namespace boosting { 26 | 27 | class Config; 28 | class DataSet; 29 | class GbmFun; 30 | 31 | template class TreeNode; 32 | 33 | /* Gradient boosting machine front end. It is constructed from a loss-function object (GbmFun), a DataSet and a Config, and keeps all three as const references, so the caller must keep them alive for as long as the Gbm is used. NOTE(review): the #include targets and the template argument lists (TreeNode forward declaration, std::vector, TreeNode uses) appear to have been stripped from this copy -- restore them from the original header before compiling. */ class Gbm { 34 | public: 35 | Gbm(const GbmFun& fun, 36 | const DataSet& ds, 37 | const Config& cfg); 38 | 39 | /* Fills *model and the caller-provided fimps array; presumably fimps receives one feature-importance value per training feature -- TODO confirm against Gbm.cpp. */ void getModel(std::vector*>* model, 40 | double fimps[]); 41 | 42 | private: 43 | 44 | /* Produces a TreeNode from the tree rooted at rt; the mapping itself is defined in Gbm.cpp, which is not part of this header. */ TreeNode* mapTree(const TreeNode* rt); 45 | 46 | const GbmFun& fun_; 47 | const DataSet& ds_; 48 | const Config& cfg_; 49 | }; 50 | 51 | } 52 | -------------------------------------------------------------------------------- /Concurrency.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2015,2016 Tao Xu 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. 
You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | 20 | #pragma once 21 | 22 | #include 23 | #include "thrift/concurrency/Monitor.h" 24 | #include "thrift/concurrency/PosixThreadFactory.h" 25 | #include "thrift/concurrency/ThreadManager.h" 26 | #include "gflags/gflags.h" 27 | 28 | /* Command-line flag; Concurrency::initThreadManager() only creates a thread manager when its value is greater than zero. */ DECLARE_int32(num_threads); 29 | 30 | namespace boosting { 31 | 32 | /* Holder for the thread manager shared by the whole program. threadManager stays null until initThreadManager() is called with num_threads > 0, so users must check it before dereferencing. NOTE(review): the template argument of boost::shared_ptr appears to have been stripped from this copy. */ class Concurrency { 33 | 34 | public: 35 | 36 | static boost::shared_ptr 37 | threadManager; 38 | 39 | static void initThreadManager(); 40 | 41 | }; 42 | 43 | /* Countdown helper built from an atomic counter and a thrift Monitor: init() sets the counter, decrement() lowers it by one and notifies the monitor when it reaches zero, wait() waits on the monitor and returns an int status. NOTE(review): the template argument of std::atomic appears to have been stripped from this copy; the int constructor argument suggests an integer counter -- confirm. */ class CounterMonitor { 44 | 45 | public: 46 | 47 | explicit CounterMonitor(int n) : counter_(n) {} 48 | 49 | void init(int size); 50 | 51 | void decrement(); 52 | 53 | int wait(); 54 | 55 | private: 56 | 57 | apache::thrift::concurrency::Monitor monitor_; 58 | 59 | std::atomic counter_; 60 | 61 | }; 62 | 63 | } 64 | -------------------------------------------------------------------------------- /Concurrency.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2015,2016 Tao Xu 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. 
You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | 20 | #include "Concurrency.h" 21 | 22 | DEFINE_int32(num_threads, 0, 23 | "number of threads to use in loading & evaluation"); 24 | 25 | using namespace apache::thrift::concurrency; 26 | 27 | namespace boosting { 28 | 29 | boost::shared_ptr 30 | Concurrency::threadManager = 31 | boost::shared_ptr(NULL); 32 | 33 | void Concurrency::initThreadManager() { 34 | if (FLAGS_num_threads > 0) { 35 | threadManager = ThreadManager::newSimpleThreadManager(FLAGS_num_threads); 36 | threadManager->threadFactory( 37 | boost::shared_ptr( 38 | new PosixThreadFactory)); 39 | threadManager->start(); 40 | } 41 | } 42 | 43 | void CounterMonitor::init(int size) { 44 | counter_ = size; 45 | } 46 | 47 | void CounterMonitor::decrement() { 48 | if (atomic_fetch_sub(&counter_, 1) == 1) { 49 | monitor_.notifyAll(); 50 | } 51 | } 52 | 53 | int CounterMonitor::wait() { 54 | return monitor_.waitForever(); 55 | } 56 | 57 | }; 58 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Fast & Simple implementation of GBM 2 | 3 | GBM is the generally regarded as best perform supervised learning algorithms before recent DL revolution. It is robust but not scalable. 
4 | 5 | Goal: 6 | 1) Fast (Handle 40M rows * 500 features within 10 hours) 7 | 2) Simple (The fewer lines of code, the better) <= 3000 8 | 3) Modular/Extensible for further improvements 9 | 10 | Algorithms: 11 | 1) pre-bucketing (data compression) 12 | 2) bucket sort to build histogram, then linear scan to find best split 13 | 3) hints and intelligent choice of #buckets 14 | 4) stochastic gradient boosting machine 15 | 16 | Features: 17 | 1) correctness (model + fimps) 18 | 2) deterministic randomness 19 | 3) easily extensible to a wide variety of similar algorithms: random forest, bagging, gbm, for both classification and regression methods; regression takes priority 20 | 21 | New features: 22 | 1) byte/short: two layers of storage (saves both memory and CPU) 23 | 2) taking hints based on previous fimps (top 1/3 using short, rest using byte) 24 | 25 | Parameters: 26 | 27 | m: number of trees 28 | n: number of leaves per tree 29 | r: example sampling rate 30 | s: feature sampling rate 31 | 32 | d: number of data points 33 | f: number of features 34 | 35 | k: number of buckets 36 | ml: minimum number of data points per leaf 37 | 38 | Complexity: 39 | Memory: max(f * d1 * 8, [f * d, f * d * 2)) 40 | 41 | Algorithmic: 42 | 1. Bucketization: O(f * d1 * log(d1)) 43 | 2. 
Continue reading: O(f * d2 * log(k)) 44 | 45 | 3: Single Best Split: O(f' * d' + f' * k) 46 | 4a: depth-k balanced tree: k * S 47 | 4b: single n-leaves tree: #splits: (2n - 3), O(S * n * log(n)) (roughly) 48 | 49 | D: 20M, example sampling: 4M 50 | feature sampling rate: 51 | 52 | Components: 53 | 54 | Config: (specifies data format and training parameters) 55 | DataSet: (column-wise storage, with self compression) 56 | Tree: (works on both compressed and raw data) 57 | TreeRegressor: (k-leaf regression tree) 58 | GbmFun: (functions to extend to different types of loss) 59 | Gbm: (gradient boosting machine) 60 | 61 | -------------------------------------------------------------------------------- /LogisticFun.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2015,2016 Tao Xu 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 
18 | */ 19 | 20 | #pragma once 21 | 22 | #include "GbmFun.h" 23 | 24 | namespace boosting { 25 | 26 | class LogisticFun : public GbmFun { 27 | public: 28 | double getLeafVal(const std::vector& subset, 29 | const boost::scoped_array& y) const { 30 | double wx = 0.0, wy = 0.0; 31 | for (const auto& id : subset) { 32 | double yi = y[id]; 33 | wy += yi; 34 | wx += fabs(yi) * (2.0 - fabs(yi)); 35 | } 36 | return wy / wx; 37 | } 38 | 39 | double getF0(const std::vector& y) const { 40 | double sumy = 0.0; 41 | for (const auto yi : y) { 42 | sumy += yi; 43 | } 44 | double ybar = sumy/y.size(); 45 | return 0.5 * log((1.0 + ybar)/(1.0 - ybar)); 46 | } 47 | 48 | void getGradient(const std::vector& y, 49 | const boost::scoped_array& F, 50 | boost::scoped_array& grad) const { 51 | int size = y.size(); 52 | for (int i = 0; i < size; i++) { 53 | grad[i] = 2.0 * y[i]/(1.0 + exp(2.0 * y[i] * F[i])); 54 | } 55 | } 56 | 57 | double getInitLoss(const std::vector& y) const { 58 | int posCount = 0; 59 | for (const auto yi : y) { 60 | if (yi > 0) { 61 | posCount += 1; 62 | } 63 | } 64 | return getEntropy(posCount, y.size()) * y.size(); 65 | } 66 | 67 | double getExampleLoss(const double y, const double f) const { 68 | return log(1.0 + exp(-2.0 * y * f)); 69 | } 70 | 71 | void accumulateExampleLoss(const double y, const double f) { 72 | numExamples_ += 1; 73 | if (y > 0) { 74 | posCount_ += 1; 75 | } 76 | logloss_ += getExampleLoss(y, f); 77 | } 78 | 79 | double getReduction() const { 80 | double entropy = getEntropy(posCount_, numExamples_); 81 | return 1.0 - logloss_/(entropy * numExamples_); 82 | } 83 | 84 | int getNumExamples() const { 85 | return numExamples_; 86 | } 87 | 88 | double getLoss() const { 89 | return logloss_; 90 | } 91 | 92 | private: 93 | static double getEntropy(int posCount, int numExamples) { 94 | double posProb = double(posCount)/numExamples; 95 | return -(posProb * log(posProb) + (1 - posProb) * log(1.0 - posProb)); 96 | } 97 | 98 | int numExamples_; 99 | int 
posCount_; 100 | double logloss_; 101 | }; 102 | } 103 | -------------------------------------------------------------------------------- /Config.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2015,2016 Tao Xu 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 
18 | */ 19 | 20 | #pragma once 21 | 22 | #include 23 | #include 24 | #include 25 | #include 26 | 27 | namespace boosting { 28 | 29 | enum LossFunction { 30 | L2Regression = 0, 31 | L2Logistic = 1 32 | }; 33 | 34 | // Specifying the training parameters and data format 35 | struct Config { 36 | 37 | // reads configuration file generated by 38 | // a.feed.scripts.boosting.gen_json.gen_json_file 39 | bool readConfig(const std::string& fileName); 40 | 41 | int getNumFeatures() const { 42 | return trainIdx_.size(); 43 | } 44 | 45 | int getNumTrees() const { 46 | return numTrees_; 47 | } 48 | 49 | int getNumLeaves() const { 50 | return numLeaves_; 51 | } 52 | 53 | double getLearningRate() const { 54 | return learningRate_; 55 | } 56 | 57 | double getExampleSamplingRate() const { 58 | return exampleSamplingRate_; 59 | } 60 | 61 | double getFeatureSamplingRate() const { 62 | return featureSamplingRate_; 63 | } 64 | 65 | int getTargetIdx() const { 66 | return targetIdx_; 67 | } 68 | 69 | int getCompareIdx() const { 70 | return cmpIdx_; 71 | } 72 | 73 | const std::vector& getTrainIdx() const { 74 | return trainIdx_; 75 | } 76 | 77 | bool isWeakFeature(const int fidx) const { 78 | return (std::find(weakIdx_.begin(), weakIdx_.end(), trainIdx_[fidx]) 79 | != weakIdx_.end()); 80 | } 81 | 82 | const std::string& getFeatureName(const int fidx) const { 83 | return allColumns_[trainIdx_[fidx]]; 84 | } 85 | 86 | // Returns -1 if feature is not found. 87 | int getFeatureIndex(const std::string& f) const { 88 | auto it = featureToIndexMap_.find(f); 89 | return it != featureToIndexMap_.end() ? 
it->second : -1; 90 | } 91 | 92 | const std::vector& getWeakIdx() const { 93 | return weakIdx_; 94 | } 95 | 96 | const std::vector& getEvalIdx() const { 97 | return evalIdx_; 98 | } 99 | 100 | const std::vector& getColumnNames() const { 101 | return allColumns_; 102 | } 103 | 104 | char getDelimiter() const { 105 | return delimiter_; 106 | } 107 | 108 | LossFunction getLossFunction() const { 109 | return lossFunction_; 110 | } 111 | 112 | private: 113 | 114 | int numTrees_; 115 | int numLeaves_; 116 | double exampleSamplingRate_; 117 | double featureSamplingRate_; 118 | double learningRate_; 119 | 120 | int targetIdx_; 121 | int cmpIdx_; 122 | LossFunction lossFunction_; 123 | 124 | std::vector trainIdx_; 125 | std::vector weakIdx_; 126 | std::vector evalIdx_; 127 | 128 | std::vector allColumns_; 129 | std::unordered_map featureToIndexMap_; 130 | char delimiter_; 131 | }; 132 | 133 | } 134 | -------------------------------------------------------------------------------- /Config.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2015,2016 Tao Xu 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 
18 | */ 19 | 20 | #include "Config.h" 21 | 22 | #include 23 | #include 24 | 25 | #include 26 | #include 27 | #include 28 | 29 | namespace boosting { 30 | 31 | using namespace folly; 32 | using namespace std; 33 | 34 | bool Config::readConfig(const std::string& fileName) { 35 | ifstream fs(fileName); 36 | stringstream buffer; 37 | 38 | buffer << fs.rdbuf(); 39 | 40 | try { 41 | const dynamic cfg = parseJson(buffer.str()); 42 | numTrees_ = cfg["num_trees"].asInt(); 43 | numLeaves_ = cfg["num_leaves"].asInt(); 44 | exampleSamplingRate_ = cfg["example_sampling_rate"].asDouble(); 45 | featureSamplingRate_ = cfg["feature_sampling_rate"].asDouble(); 46 | learningRate_ = cfg["learning_rate"].asDouble(); 47 | 48 | // load dictionary: indices <--> column names 49 | const dynamic& columnNames = cfg["all_columns"]; 50 | unordered_map columnIdx; 51 | int cidx = 0; 52 | for (auto it = columnNames.begin(); it != columnNames.end(); ++it) { 53 | auto columnName = it->asString(); 54 | allColumns_.emplace_back(columnName.toStdString()); 55 | CHECK(columnIdx.find(columnName) == columnIdx.end()); 56 | columnIdx[columnName] = cidx; 57 | cidx++; 58 | } 59 | 60 | targetIdx_ = columnIdx[cfg["target_column"].asString()]; 61 | 62 | auto it = cfg.find("compare_column"); 63 | cmpIdx_ = (it != cfg.items().end()) 64 | ? 
columnIdx[it->second.asString()] : -1; 65 | 66 | it = cfg.find("loss_function"); 67 | if (it != cfg.items().end() && it->second.asString() == "logistic") { 68 | lossFunction_ = L2Logistic; 69 | } else { 70 | lossFunction_ = L2Regression; 71 | } 72 | 73 | const dynamic& trainColumns = cfg["train_columns"]; 74 | for (auto it = trainColumns.begin(); it != trainColumns.end(); ++it) { 75 | featureToIndexMap_[it->asString().toStdString()] = trainIdx_.size(); 76 | trainIdx_.push_back(columnIdx.at(it->asString())); 77 | } 78 | 79 | const dynamic& weakColumns = cfg["weak_columns"]; 80 | for (auto it = weakColumns.begin(); it != weakColumns.end(); ++it) { 81 | weakIdx_.push_back(columnIdx.at(it->asString())); 82 | } 83 | 84 | const dynamic& evalColumns = cfg["eval_output_columns"]; 85 | for (auto it = evalColumns.begin(); it != evalColumns.end(); ++it) { 86 | evalIdx_.push_back(columnIdx.at(it->asString())); 87 | } 88 | 89 | const dynamic& targetColumn = cfg["target_column"]; 90 | targetIdx_ = columnIdx.at(targetColumn.asString()); 91 | 92 | const string& delimiter = cfg["delimiter"].asString().toStdString(); 93 | 94 | if (delimiter == "TAB") { 95 | delimiter_ = '\t'; 96 | } else if (delimiter == "COMMA") { 97 | delimiter_ = ','; 98 | } else if (delimiter == "CTRL-A") { 99 | delimiter_ = '\001'; 100 | } else { 101 | LOG(FATAL) << "invalid delimiter " << delimiter; 102 | return false; 103 | } 104 | } catch (const exception& ex) { 105 | LOG(FATAL) << "parse config failed: " << ex.what(); 106 | return false; 107 | } 108 | return true; 109 | } 110 | 111 | } 112 | -------------------------------------------------------------------------------- /GbmFun.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2015,2016 Tao Xu 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. 
See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | 20 | #pragma once 21 | 22 | #include 23 | #include 24 | 25 | namespace boosting { 26 | 27 | // Implementing a few simple functions is enough to extend Gbm to 28 | // different loss functions, such as l2 loss (least squares), logloss 29 | // (logistic regression), huber loss (robust regression), lambdaloss 30 | // (lambda rank), etc. 
31 | class GbmFun { 32 | public: 33 | virtual double getLeafVal(const std::vector& subset, 34 | const boost::scoped_array& y) const = 0; 35 | 36 | virtual double getF0(const std::vector& y) const = 0; 37 | 38 | virtual void getGradient(const std::vector& y, 39 | const boost::scoped_array& F, 40 | boost::scoped_array& grad) const = 0; 41 | 42 | virtual double getInitLoss(const std::vector& y) const = 0; 43 | 44 | virtual double getExampleLoss(const double y, const double f) const = 0; 45 | 46 | virtual void accumulateExampleLoss(const double y, const double f) = 0; 47 | 48 | virtual double getReduction() const = 0; 49 | 50 | virtual int getNumExamples() const = 0; 51 | 52 | virtual double getLoss() const = 0; 53 | }; 54 | 55 | 56 | class LeastSquareFun : public GbmFun { 57 | public: 58 | LeastSquareFun() : numExamples_(0), sumy_(0.0), sumy2_(0.0), l2_(0.0) { 59 | } 60 | 61 | double getLeafVal(const std::vector& subset, 62 | const boost::scoped_array& y) const { 63 | 64 | double sum = 0; 65 | for (const auto& id : subset) { 66 | sum += y[id]; 67 | } 68 | return sum/subset.size(); 69 | } 70 | 71 | double getF0(const std::vector& yvec) const { 72 | double sum = 0.0; 73 | for (const auto& y : yvec) { 74 | sum += y; 75 | } 76 | return sum/yvec.size(); 77 | } 78 | 79 | void getGradient(const std::vector& y, 80 | const boost::scoped_array& F, 81 | boost::scoped_array& grad) const { 82 | 83 | int size = y.size(); 84 | 85 | for (int i = 0; i < size; i++) { 86 | grad[i] = y[i] - F[i]; 87 | } 88 | } 89 | 90 | double getInitLoss(const std::vector& yvec) const { 91 | double sumy = 0.0; 92 | double sumy2 = 0.0; 93 | 94 | for (const auto& y : yvec) { 95 | sumy += y; 96 | sumy2 += y*y; 97 | } 98 | 99 | return sumy2 - sumy * sumy/yvec.size(); 100 | } 101 | 102 | double getExampleLoss(const double y, const double f) const { 103 | return (y - f) * (y - f); 104 | } 105 | 106 | void accumulateExampleLoss(const double y, const double f) { 107 | sumy_ += y; 108 | numExamples_ += 1; 109 | 
sumy2_ += y * y; 110 | l2_ += getExampleLoss(y, f); 111 | } 112 | 113 | double getReduction() const { 114 | return 1.0 - l2_/(sumy2_ - sumy_ * sumy_/numExamples_); 115 | } 116 | 117 | int getNumExamples() const { 118 | return numExamples_; 119 | } 120 | 121 | double getLoss() const { 122 | return l2_; 123 | } 124 | 125 | private: 126 | int numExamples_; 127 | double sumy_; 128 | double sumy2_; 129 | double l2_; 130 | }; 131 | 132 | } 133 | -------------------------------------------------------------------------------- /DataSet.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2015,2016 Tao Xu 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 
18 | */ 19 | 20 | #pragma once 21 | 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include "glog/logging.h" 28 | 29 | namespace boosting { 30 | 31 | class Config; 32 | 33 | /* Storage format of one feature column; it selects which of FeatureData's bvec / svec / fvec is in use (see FeatureData::shrink_to_fit). */ enum FeatureEncoding { 34 | EMPTY = 0, 35 | BYTE = 1, 36 | SHORT = 2, 37 | DOUBLE = 3 38 | }; 39 | 40 | // different representation of a single feature vec 41 | // compressed to byte/short for significant memory saving 42 | // and much faster splits 43 | /* NOTE(review): the template arguments of the members below (and the #include targets above) appear to have been stripped from this copy -- restore them from the original header before compiling. */ struct FeatureData { 44 | std::vector transitions; 45 | /* NOTE(review): no initializer is visible for encoding, so a default-constructed FeatureData has an indeterminate encoding until it is assigned -- confirm that DataSet.cpp sets it before any read. */ FeatureEncoding encoding; 46 | std::unique_ptr> bvec; 47 | std::unique_ptr> svec; 48 | std::unique_ptr> fvec; 49 | 50 | /* Releases spare capacity of the vector matching the current encoding; nothing happens for EMPTY. The matching pointer must be non-null. */ void shrink_to_fit() { 51 | if (encoding == BYTE) { 52 | bvec->shrink_to_fit(); 53 | } else if (encoding == SHORT) { 54 | svec->shrink_to_fit(); 55 | } else if (encoding == DOUBLE) { 56 | fvec->shrink_to_fit(); 57 | } 58 | return; 59 | } 60 | }; 61 | 62 | template class TreeNode; 63 | 64 | // in memory representation of raw data read from a list of data 65 | // files, then intelligently compress the data into the format 66 | // suitable for the boosting training process 67 | class DataSet { 68 | public: 69 | DataSet(const Config& cfg, int bucketingThresh, int examplesThresh=-1); 70 | 71 | bool addVector(const boost::scoped_array& fvec, double target); 72 | 73 | bool getRow(const std::string& line, 74 | double* target, 75 | boost::scoped_array& fvec, 76 | double* cmpValue = NULL) const; 77 | 78 | bool getEvalColumns(const std::string& line, 79 | boost::scoped_array& feval) const; 80 | 81 | int getNumExamples() const { 82 | return numExamples_; 83 | } 84 | 85 | /* Copies the stored value of every feature for example eid into fvec (which must hold numFeatures_ elements): 0 for an EMPTY column, the stored element for BYTE and SHORT columns; any other encoding aborts through CHECK. eid is not range-checked. */ void getFeatureVec(const int eid, boost::scoped_array& fvec) const { 86 | for (int i = 0; i < numFeatures_; i++) { 87 | if (features_[i].encoding == EMPTY) { 88 | fvec[i] = 0; 89 | } else if (features_[i].encoding == BYTE) { 90 | fvec[i] = (*features_[i].bvec)[eid]; 91 | } else if (features_[i].encoding == SHORT) { 92 | fvec[i] = (*features_[i].svec)[eid]; 93 | } else { 94 | CHECK(false) 
<< "invalid types"; 95 | } 96 | } 97 | } 98 | 99 | double getPrediction(TreeNode* tree, int eid) const; 100 | 101 | /* Finishes loading: runs bucketize(), then trims spare capacity of every feature column and of targets_. */ void close() { 102 | bucketize(); 103 | for (int i = 0; i < numFeatures_; i++) { 104 | auto &f = features_[i]; 105 | f.shrink_to_fit(); 106 | } 107 | 108 | targets_.shrink_to_fit(); 109 | } 110 | 111 | private: 112 | void bucketize(); 113 | 114 | const Config& cfg_; 115 | const int bucketingThresh_; 116 | const int examplesThresh_; 117 | 118 | //state of data loading process 119 | bool preBucketing_; 120 | int numExamples_; 121 | int numFeatures_; 122 | 123 | boost::scoped_array features_; 124 | std::vector targets_; 125 | 126 | friend class TreeRegressor; 127 | friend class Gbm; 128 | }; 129 | 130 | // partition subset into left and right, depending 131 | // on how the values of fvec compare to fv 132 | /* Ids with fvec[id] <= fv are appended to *left, all others to *right; neither output vector is cleared first. */ template void split(const std::vector& subset, 133 | std::vector* left, 134 | std::vector* right, 135 | const std::vector& fvec, 136 | uint16_t fv) { 137 | 138 | for (auto id : subset) { 139 | if (fvec[id] <= fv) { 140 | left->push_back(id); 141 | } else { 142 | right->push_back(id); 143 | } 144 | } 145 | } 146 | 147 | } 148 | -------------------------------------------------------------------------------- /Tree.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2015,2016 Tao Xu 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. 
You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | 20 | #pragma once 21 | 22 | #include 23 | 24 | #include "folly/json.h" 25 | #include "folly/Conv.h" 26 | #include "Config.h" 27 | 28 | namespace boosting { 29 | 30 | template 31 | class TreeNode { 32 | public: 33 | virtual double eval(const boost::scoped_array& fvec) const = 0; 34 | virtual void scale(double w) = 0; 35 | virtual folly::dynamic toJson(const Config& cfg) const = 0; 36 | virtual ~TreeNode() {} 37 | }; 38 | 39 | template 40 | class PartitionNode : public TreeNode { 41 | public: 42 | PartitionNode(int fid, T fv) 43 | : fid_(fid), fv_(fv), fvote_(0.0), 44 | left_(NULL), right_(NULL) { 45 | } 46 | 47 | TreeNode* getLeft() const { 48 | return left_; 49 | } 50 | 51 | TreeNode* getRight() const { 52 | return right_; 53 | } 54 | 55 | int getFid() const { 56 | return fid_; 57 | } 58 | 59 | T getFv() const { 60 | return fv_; 61 | } 62 | 63 | double getVote() const { 64 | return fvote_; 65 | } 66 | 67 | void setLeft(TreeNode* left) { 68 | left_ = left; 69 | } 70 | 71 | void setRight(TreeNode* right) { 72 | right_ = right; 73 | } 74 | 75 | void setVote(double fvote) { 76 | fvote_ = fvote; 77 | } 78 | 79 | double eval(const boost::scoped_array& fvec) const { 80 | if (fvec[fid_] <= fv_) { 81 | return left_->eval(fvec); 82 | } else { 83 | return right_->eval(fvec); 84 | } 85 | } 86 | 87 | void scale(double w) { 88 | fvote_ *= w; 89 | left_->scale(w); 90 | right_->scale(w); 91 | } 92 | 93 | folly::dynamic toJson(const Config& cfg) const { 94 | folly::dynamic m = folly::dynamic::object; 95 | 96 | 
m.insert("index", fid_); 97 | m.insert("value", fv_); 98 | m.insert("left", left_->toJson(cfg)); 99 | m.insert("right", right_->toJson(cfg)); 100 | m.insert("vote", fvote_); 101 | m.insert("feature", cfg.getFeatureName(fid_)); 102 | return m; 103 | } 104 | 105 | ~PartitionNode() { 106 | delete left_; 107 | delete right_; 108 | } 109 | 110 | private: 111 | int fid_; 112 | T fv_; 113 | double fvote_; 114 | 115 | TreeNode* left_; 116 | TreeNode* right_; 117 | }; 118 | 119 | template 120 | class LeafNode : public TreeNode { 121 | public: 122 | explicit LeafNode(double fvote) : fvote_(fvote) { 123 | } 124 | 125 | double eval(const boost::scoped_array& fvec) const { 126 | return fvote_; 127 | } 128 | 129 | double getVote() const { 130 | return fvote_; 131 | } 132 | 133 | void scale(double w) { 134 | fvote_ *= w; 135 | } 136 | 137 | folly::dynamic toJson(const Config& cfg) const { 138 | folly::dynamic m = folly::dynamic::object; 139 | 140 | m.insert("index", -1); 141 | m.insert("vote", fvote_); 142 | return m; 143 | } 144 | 145 | ~LeafNode() { 146 | } 147 | 148 | private: 149 | double fvote_; 150 | }; 151 | 152 | // load a regression tree from Json 153 | template 154 | TreeNode* fromJson(const folly::dynamic& obj, const Config& cfg) { 155 | const folly::dynamic* feature = nullptr; 156 | try { 157 | feature = &obj["feature"]; 158 | } catch(...) 
{ 159 | } 160 | 161 | double vote = static_cast(obj["vote"].asDouble()); 162 | 163 | if (!feature) { 164 | return new LeafNode(vote); 165 | } else { 166 | std::string featureName = feature->asString().toStdString(); 167 | int index = cfg.getFeatureIndex(featureName); 168 | CHECK_GE(index, 0) << "Failed to find " << featureName << " in config."; 169 | T value; 170 | if (obj["value"].isInt()) { 171 | value = static_cast(obj["value"].asInt()); 172 | } else { 173 | value = static_cast(obj["value"].asDouble()); 174 | } 175 | PartitionNode* rt = new PartitionNode(index, value); 176 | rt->setLeft(fromJson(obj["left"], cfg)); 177 | rt->setRight(fromJson(obj["right"], cfg)); 178 | rt->setVote(vote); 179 | return rt; 180 | } 181 | } 182 | 183 | template 184 | double predict(const std::vector*>& models, 185 | const boost::scoped_array& fvec) { 186 | 187 | double f = 0.0; 188 | for (const auto& m : models) { 189 | f += m->eval(fvec); 190 | } 191 | return f; 192 | } 193 | 194 | template 195 | double predict_vec(const std::vector*>& models, 196 | const boost::scoped_array& fvec, 197 | std::vector* score) { 198 | 199 | double f = 0.0; 200 | for (const auto& m : models) { 201 | f += m->eval(fvec); 202 | score->push_back(f); 203 | } 204 | return f; 205 | } 206 | 207 | 208 | } 209 | -------------------------------------------------------------------------------- /TreeRegressor.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2015,2016 Tao Xu 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. 
You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | 20 | #pragma once 21 | 22 | #include 23 | #include 24 | #include 25 | 26 | namespace boosting { 27 | 28 | class DataSet; 29 | template class TreeNode; 30 | class GbmFun; 31 | 32 | // Build regression trees from DataSet 33 | class TreeRegressor { 34 | public: 35 | TreeRegressor(const DataSet& ds, 36 | const boost::scoped_array& y, 37 | const GbmFun& fun); 38 | 39 | // Return the root of a regression tree with desired specifications, based on 40 | // a random sampling of the data in ds_ and a random sampling of the features. 41 | // Also set the feature importance vector (fimps) = total gains from 42 | // splitting along each feature (most entries will be 0). 
43 | TreeNode* getTree( 44 | const int numLeaves, 45 | const double exampleSamplingRate, 46 | const double featureSamplingRate, 47 | double fimps[]); 48 | 49 | ~TreeRegressor(); 50 | 51 | private: 52 | 53 | // Node in a binary regression tree, computed based on a sampling of the data 54 | // (given by subset); responsible for cleaning up subset upon destruction 55 | struct SplitNode { 56 | 57 | explicit SplitNode(const std::vector* subset); 58 | 59 | const std::vector* subset; // which subset of the data we're using 60 | int fid; // which feature to split along 61 | uint16_t fv; // value of said feature, at which to split 62 | double gain; // gain in prediction accuracy from this split 63 | bool selected; // internal node of regression tree, as opposed to leaf 64 | 65 | SplitNode* left; // left child in a regression tree 66 | SplitNode* right; // right child in a regression tree 67 | 68 | ~SplitNode() { 69 | delete subset; 70 | } 71 | }; 72 | 73 | // More than a histogram in the basic sense of the word, because our 74 | // data has two dimensions. Make buckets based on the x-dimension, 75 | // and within each bucket keep track of not only the number of 76 | // observations (as in a basic histogram), but also the sum of y-values 77 | // of those observations. 
78 | struct Histogram { 79 | const int num; // number of buckets 80 | std::vector cnt; // number of observations in each bucket 81 | std::vector sumy; // sum of y-values of those observations 82 | const int totalCnt; 83 | const double totalSum; 84 | 85 | Histogram(int n, int cnt, double sum) 86 | : num(n), 87 | cnt(num, 0), 88 | sumy(num, 0.0), 89 | totalCnt(cnt), 90 | totalSum(sum) { 91 | } 92 | }; 93 | 94 | template 95 | void buildHistogram(const std::vector& subset, 96 | const std::vector& fvec, 97 | Histogram& hist) const; 98 | 99 | // Choose the x-value such that, by splitting the data at that value, we 100 | // minimize the total sum-of-squares error 101 | static void getBestSplitFromHistogram( 102 | const TreeRegressor::Histogram& hist, 103 | int* idx, 104 | double* gain); 105 | 106 | // Based on a sampling of the data (given by *subset) and a random sampling 107 | // of features (given by featureSamplingRate), find a splitting that maximizes 108 | // prediction accuracy, unless terminal==true, in which case just return a 109 | // sentry. 110 | // Upon finish, also push to working queues (frontiers_ and allSplits_) 111 | SplitNode* getBestSplit(const std::vector* subset, 112 | double featureSamplingRate, 113 | bool terminal); 114 | 115 | // Partition split.subset into left and right according to the splitting 116 | // specified by split.fid and split.fv 117 | void splitExamples(const SplitNode& split, 118 | std::vector* left, 119 | std::vector* right); 120 | 121 | // Return root of a regression tree for data in subset with numSplits internal 122 | // nodes (i.e., numSplits+1 leaves) by greedily selecting the splits with the 123 | // biggest gain. 124 | SplitNode* getBestSplits(const std::vector* subset, 125 | const int numSplits, 126 | double featureSamplingRate); 127 | 128 | // Recursively construct a tree of ParitionNode's and LeafNode's from 129 | // a tree of SplitNode's. 
The point is that SplitNode's carry some working 130 | // data (e.g., about which data points belong to them) that we should 131 | // throw away when the computation is finished. 132 | TreeNode* getTreeHelper(SplitNode* root, double fimps[]); 133 | 134 | const DataSet& ds_; 135 | const boost::scoped_array& y_; 136 | const GbmFun& fun_; 137 | 138 | // working queue to select best numSplits splits 139 | // could replace with priority queue if necessary 140 | std::vector frontiers_; 141 | 142 | // memory management, to delete SplitNode's upon destruction 143 | std::vector allSplits_; 144 | 145 | }; 146 | 147 | template 148 | void TreeRegressor::buildHistogram(const std::vector& subset, 149 | const std::vector& fvec, 150 | Histogram& hist) const { 151 | 152 | for(auto id : subset) { 153 | const T& v = fvec[id]; 154 | 155 | hist.cnt[v] += 1; 156 | hist.sumy[v] += y_[id]; 157 | } 158 | } 159 | 160 | } 161 | -------------------------------------------------------------------------------- /Gbm.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2015,2016 Tao Xu 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 
18 | */ 19 | 20 | #include "Gbm.h" 21 | 22 | #include 23 | #include 24 | #include 25 | 26 | #include "Concurrency.h" 27 | #include "Config.h" 28 | #include "DataSet.h" 29 | #include "GbmFun.h" 30 | #include "Tree.h" 31 | #include "TreeRegressor.h" 32 | #include 33 | 34 | namespace boosting { 35 | 36 | using namespace std; 37 | 38 | Gbm::Gbm(const GbmFun& fun, const DataSet& ds, const Config& cfg) 39 | : fun_(fun), ds_(ds), cfg_(cfg) { 40 | } 41 | 42 | class ParallelEval : public apache::thrift::concurrency::Runnable { 43 | public: 44 | ParallelEval( 45 | CounterMonitor& monitor, 46 | const int numExamples, 47 | const int numFeatures, 48 | const GbmFun& fun, 49 | const std::unique_ptr>& weakModel, 50 | const DataSet& ds, 51 | const vector& targets, 52 | boost::scoped_array& F, 53 | boost::scoped_array& subLoss, 54 | const int workIdx, 55 | const int totalWorkers) 56 | : monitor_(monitor), numExamples_(numExamples), 57 | numFeatures_(numFeatures), fun_(fun), weakModel_(weakModel), 58 | ds_(ds), targets_(targets), F_(F), 59 | subLoss_(subLoss), workIdx_(workIdx), 60 | totalWorkers_(totalWorkers) { 61 | } 62 | 63 | void run() { 64 | //boost::scoped_array fvec(new uint16_t[numFeatures_]); 65 | for (int i = 0; i < numExamples_; i++) { 66 | if (i % totalWorkers_ == workIdx_) { 67 | //ds_.getFeatureVec(i, fvec); 68 | //double score = weakModel_->eval(fvec); 69 | double score = ds_.getPrediction(weakModel_.get(), i); 70 | F_[i] += score; 71 | subLoss_[workIdx_] += fun_.getExampleLoss(targets_[i], F_[i]); 72 | } 73 | } 74 | monitor_.decrement(); 75 | } 76 | 77 | private: 78 | CounterMonitor& monitor_; 79 | const int numExamples_; 80 | const int numFeatures_; 81 | const GbmFun& fun_; 82 | const std::unique_ptr>& weakModel_; 83 | const DataSet& ds_; 84 | const vector targets_; 85 | boost::scoped_array& F_; 86 | boost::scoped_array& subLoss_; 87 | const int workIdx_; 88 | const int totalWorkers_; 89 | }; 90 | 91 | void Gbm::getModel( 92 | vector*>* model, 93 | double fimps[]) { 
94 | 95 | const int numExamples = ds_.getNumExamples(); 96 | 97 | boost::scoped_array F(new double[numExamples]); 98 | boost::scoped_array y(new double[numExamples]); 99 | 100 | double f0 = fun_.getF0(ds_.targets_); 101 | for (int i = 0; i < numExamples; i++) { 102 | F[i] = f0; 103 | } 104 | 105 | model->push_back(new LeafNode(f0)); 106 | 107 | double initLoss = fun_.getInitLoss(ds_.targets_); 108 | 109 | LOG(INFO) << "init avg loss " << initLoss / numExamples; 110 | 111 | for (int it = 0; it < cfg_.getNumTrees(); it++) { 112 | 113 | LOG(INFO) << "------- iteration " << it << " -------"; 114 | 115 | fun_.getGradient(ds_.targets_, F, y); 116 | TreeRegressor regressor(ds_, y, fun_); 117 | 118 | std::unique_ptr> weakModel( 119 | regressor.getTree(cfg_.getNumLeaves(), cfg_.getExampleSamplingRate(), 120 | cfg_.getFeatureSamplingRate(), fimps)); 121 | 122 | weakModel->scale(cfg_.getLearningRate()); 123 | 124 | model->push_back(mapTree(weakModel.get())); 125 | 126 | VLOG(1) << toPrettyJson(weakModel->toJson(cfg_)); 127 | double newLoss = 0.0; 128 | 129 | if (FLAGS_num_threads > 1) { 130 | CounterMonitor monitor(FLAGS_num_threads); 131 | boost::scoped_array subLoss(new double[FLAGS_num_threads]); 132 | for (int wid = 0; wid < FLAGS_num_threads; wid++) { 133 | subLoss[wid] = 0.0; 134 | Concurrency::threadManager->add( 135 | boost::shared_ptr( 136 | new ParallelEval(monitor, numExamples, ds_.numFeatures_, 137 | fun_, weakModel, 138 | ds_, ds_.targets_, F, subLoss, 139 | wid, FLAGS_num_threads))); 140 | } 141 | monitor.wait(); 142 | 143 | for (int wid = 0; wid < FLAGS_num_threads; wid++) { 144 | newLoss += subLoss[wid]; 145 | } 146 | } else { 147 | //boost::scoped_array fvec(new uint16_t[ds_.numFeatures_]); 148 | for (int i = 0; i < numExamples; i++) { 149 | // ds_.getFeatureVec(i, fvec); 150 | // double score = weakModel->eval(fvec); 151 | double score = ds_.getPrediction(weakModel.get(), i); 152 | F[i] += score; 153 | newLoss += fun_.getExampleLoss(ds_.targets_[i], F[i]); 
154 | } 155 | } 156 | 157 | LOG(INFO) << "total avg loss " << newLoss/numExamples 158 | << " reduction: " << 1.0 - newLoss/initLoss; 159 | } 160 | } 161 | 162 | TreeNode* Gbm::mapTree(const TreeNode* rt) { 163 | const PartitionNode* pnode = 164 | dynamic_cast*>(rt); 165 | if (pnode != NULL) { 166 | int fid = pnode->getFid(); 167 | PartitionNode* newNode = new PartitionNode( 168 | fid, ds_.features_[fid].transitions[pnode->getFv()]); 169 | newNode->setVote(pnode->getVote()); 170 | newNode->setLeft(mapTree(pnode->getLeft())); 171 | newNode->setRight(mapTree(pnode->getRight())); 172 | return newNode; 173 | } else { 174 | const LeafNode* lfnode = 175 | dynamic_cast*>(rt); 176 | return new LeafNode(lfnode->getVote()); 177 | } 178 | } 179 | 180 | } 181 | -------------------------------------------------------------------------------- /DataSet.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2015,2016 Tao Xu 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 
18 | */ 19 | 20 | #include "DataSet.h" 21 | 22 | #include 23 | #include 24 | #include 25 | 26 | #include "Config.h" 27 | #include "Tree.h" 28 | #include 29 | #include 30 | #include 31 | 32 | namespace boosting { 33 | 34 | using namespace std; 35 | 36 | DataSet::DataSet(const Config& cfg, int bucketingThresh, int examplesThresh) 37 | : cfg_(cfg), bucketingThresh_(bucketingThresh), 38 | examplesThresh_(examplesThresh), 39 | preBucketing_(true), numExamples_(0), 40 | numFeatures_(cfg.getNumFeatures()), 41 | features_(new FeatureData[numFeatures_]) { 42 | 43 | for (int i = 0; i < numFeatures_; i++) { 44 | features_[i].fvec.reset(new vector()); 45 | features_[i].encoding = DOUBLE; 46 | } 47 | } 48 | 49 | bool DataSet::getEvalColumns(const std::string& line, 50 | boost::scoped_array& feval) const { 51 | vector sv; 52 | folly::split(cfg_.getDelimiter(), line, sv); 53 | const auto& evalColumns = cfg_.getEvalIdx(); 54 | 55 | for (int fid = 0; fid < evalColumns.size(); fid++) { 56 | feval[fid] = sv[evalColumns[fid]].toString(); 57 | } 58 | return true; 59 | } 60 | 61 | bool DataSet::getRow(const string& line, double* target, 62 | boost::scoped_array& fvec, 63 | double* cmpValue) const { 64 | try { 65 | vector sv; 66 | folly::split(cfg_.getDelimiter(), line, sv); 67 | 68 | if (sv.size() != cfg_.getColumnNames().size()) { 69 | LOG(ERROR) << "invalid row: unexpected number of columns" << line 70 | << ", expected " << cfg_.getColumnNames().size() 71 | << ", got " << sv.size(); 72 | return false; 73 | } 74 | const auto& trainColumns = cfg_.getTrainIdx(); 75 | 76 | for (int fid = 0; fid < trainColumns.size(); fid++) { 77 | fvec[fid] = atof(sv[trainColumns[fid]].toString().c_str()); 78 | } 79 | *target = atof(sv[cfg_.getTargetIdx()].toString().c_str()); 80 | if (cfg_.getLossFunction() == L2Logistic) { 81 | *target = (*target) > 0.0 ? 
1.0 : -1.0; 82 | } 83 | if (cfg_.getCompareIdx() != -1 && cmpValue != NULL) { 84 | *cmpValue = atof(sv[cfg_.getCompareIdx()].toString().c_str()); 85 | } 86 | 87 | } catch (...) { 88 | LOG(ERROR) << "fail to process line: " << line; 89 | return false; 90 | } 91 | return true; 92 | } 93 | 94 | //predict without explicitly creating feature vector, since it is 95 | //expensive to copy the long vector. used only in Gbm eval step. 96 | double DataSet::getPrediction(TreeNode* rt, int eid) const { 97 | const PartitionNode* pnode = 98 | dynamic_cast*>(rt); 99 | 100 | if (pnode != NULL) { 101 | const int fid = pnode->getFid(); 102 | uint16_t fv; 103 | if (features_[fid].encoding == BYTE) { 104 | fv = (*features_[fid].bvec)[eid]; 105 | } else { 106 | fv = (*features_[fid].svec)[eid]; 107 | } 108 | 109 | if (fv <= pnode->getFv()) { 110 | return getPrediction(pnode->getLeft(), eid); 111 | } else { 112 | return getPrediction(pnode->getRight(), eid); 113 | } 114 | } else { 115 | const LeafNode* lfnode = 116 | dynamic_cast*>(rt); 117 | return lfnode->getVote(); 118 | } 119 | } 120 | 121 | bool DataSet::addVector(const boost::scoped_array& fvec, 122 | double target) { 123 | if (examplesThresh_ != -1 && numExamples_ > examplesThresh_) { 124 | return false; 125 | } 126 | 127 | for (int fid = 0; fid < numFeatures_; fid++) { 128 | double val = fvec[fid]; 129 | if (preBucketing_) { 130 | features_[fid].fvec->push_back(val); 131 | } else { 132 | const auto& transitions = features_[fid].transitions; 133 | const auto& it = lower_bound(transitions.begin(), 134 | transitions.end(), 135 | val); 136 | 137 | if (features_[fid].encoding == EMPTY) { 138 | continue; 139 | } else if (features_[fid].encoding == BYTE) { 140 | (features_[fid].bvec)->push_back( 141 | static_cast(it - transitions.begin())); 142 | } else if (features_[fid].encoding == SHORT) { 143 | (features_[fid].svec)->push_back( 144 | static_cast(it - transitions.begin())); 145 | } else { 146 | LOG(INFO) << "invalid encoding after 
bucketing"; 147 | } 148 | } 149 | } 150 | targets_.push_back(target); 151 | numExamples_++; 152 | 153 | if (bucketingThresh_ != -1 && numExamples_ > bucketingThresh_ 154 | && preBucketing_) { 155 | bucketize(); 156 | } 157 | return true; 158 | } 159 | 160 | struct IdVal { 161 | int id; 162 | double val; 163 | 164 | IdVal(int i, double v) : id(i), val(v) { 165 | } 166 | }; 167 | 168 | template 169 | void fillValues(const vector& idvals, 170 | const vector& transitions, 171 | vector& vec) { 172 | int idx = 0; 173 | for (int i = 0; i < transitions.size(); i++) { 174 | while(idx <= transitions[i]) { 175 | vec[idvals[idx].id] = static_cast(i); 176 | idx++; 177 | } 178 | } 179 | while (idx < idvals.size()) { 180 | vec[idvals[idx].id] = static_cast(transitions.size()); 181 | idx++; 182 | } 183 | } 184 | 185 | template 186 | void check(const vector& vec, 187 | const vector& fvec, 188 | const vector& transitions) { 189 | 190 | CHECK(vec.size() == fvec.size()); 191 | for (int idx = 0; idx < vec.size(); idx++) { 192 | if (vec[idx] < transitions.size()) { 193 | CHECK(fvec[idx] <= transitions[vec[idx]]) 194 | << "less or equal than transition! 
"; 195 | } 196 | 197 | if (vec[idx] > 0) { 198 | CHECK(fvec[idx] > transitions[vec[idx] - 1]) 199 | << " larger than previous transition"; 200 | } 201 | } 202 | } 203 | 204 | void check(const FeatureData& fd) { 205 | if (fd.encoding == BYTE) { 206 | check(*(fd.bvec), *(fd.fvec), fd.transitions); 207 | } else if (fd.encoding == SHORT) { 208 | check(*(fd.svec), *(fd.fvec), fd.transitions); 209 | } 210 | } 211 | 212 | void Bucketize(FeatureData& fd, bool useByteEncoding) { 213 | CHECK(fd.encoding == DOUBLE) << "invalid data to bucketing"; 214 | 215 | const auto& fv = *(fd.fvec); 216 | const int num = fv.size(); 217 | 218 | vector idvals; 219 | for (int i = 0; i < num; i++) { 220 | idvals.emplace_back(i, fv[i]); 221 | } 222 | 223 | sort(idvals.begin(), idvals.end(), 224 | [](const IdVal& x, const IdVal& y) { 225 | return x.val < y.val; 226 | }); 227 | 228 | uint16_t maxValue 229 | = useByteEncoding ? numeric_limits::max() : numeric_limits::max(); 230 | 231 | const int stepSize = ceil(fv.size()/(1.0 + maxValue)); 232 | 233 | vector transitions; 234 | int i = stepSize; 235 | while (i < num) { 236 | double t = idvals[i-1].val; 237 | while (i < num && idvals[i].val == t) { 238 | i++; 239 | } 240 | if (i < num) { 241 | transitions.push_back(i-1); 242 | } 243 | i += stepSize; 244 | } 245 | 246 | CHECK(transitions.size() < maxValue) 247 | << " invalid bucketing: too many buckets"; 248 | 249 | for(int i = 0; i < transitions.size(); i++) { 250 | fd.transitions.push_back(idvals[transitions[i]].val); 251 | } 252 | 253 | bool byteEncoding = (transitions.size() < numeric_limits::max()); 254 | if (transitions.size() == 0) { 255 | fd.encoding = EMPTY; 256 | } else if (byteEncoding) { 257 | fd.encoding = BYTE; 258 | fd.bvec.reset(new vector(num)); 259 | fillValues(idvals, transitions, *(fd.bvec)); 260 | } else { 261 | fd.encoding = SHORT; 262 | fd.svec.reset(new vector(num)); 263 | fillValues(idvals, transitions, *(fd.svec)); 264 | } 265 | 266 | check(fd); 267 | 268 | // free up the 
original vector 269 | fd.fvec.reset(); 270 | } 271 | 272 | void DataSet::bucketize() { 273 | if (!preBucketing_) { 274 | return; 275 | } 276 | 277 | LOG(INFO) << "start bucketization for data compression"; 278 | int hist[4]; 279 | memset(hist, 0, sizeof(hist)); 280 | 281 | for (int i = 0; i < numFeatures_; i++) { 282 | Bucketize(features_[i], cfg_.isWeakFeature(i)); 283 | hist[features_[i].encoding]++; 284 | 285 | LOG(INFO) << "feature: " << cfg_.getFeatureName(i) 286 | << " num transitions: " << features_[i].transitions.size() 287 | << ",encoding: " << features_[i].encoding; 288 | } 289 | preBucketing_ = false; 290 | CHECK(hist[3] == 0) << "no double features after bucketing"; 291 | LOG(INFO) << "total memory saving over double: " 292 | << 1 - (hist[1] * 0.5 + hist[2])/(4.0*numFeatures_); 293 | LOG(INFO) << "additional memory saving over short: " 294 | << 1 - (hist[1] * 0.5 + hist[2])/numFeatures_; 295 | } 296 | 297 | } 298 | -------------------------------------------------------------------------------- /TreeRegressor.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2015,2016 Tao Xu 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. 
See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | 20 | #include "TreeRegressor.h" 21 | 22 | #include 23 | #include 24 | #include 25 | 26 | #include "Tree.h" 27 | #include "GbmFun.h" 28 | #include "DataSet.h" 29 | #include "gflags/gflags.h" 30 | #include "glog/logging.h" 31 | 32 | DEFINE_int32(min_leaf_examples, 256, 33 | "minimum number of data points in the leaf"); 34 | 35 | namespace boosting { 36 | 37 | using namespace std; 38 | 39 | // Return true with approximately the desired probability; 40 | // actual probability may differ by ~ 1/RAND_MAX 41 | // Not thread safe. But: 42 | // (1) we're only using this to sample some examples and features, 43 | // so we don't care (?) 44 | // (2) currently TreeRegressor isn't multithreading anyway 45 | inline bool biasedCoinFlip(double probabilityOfTrue) { 46 | return (rand() < probabilityOfTrue * RAND_MAX); 47 | } 48 | 49 | TreeRegressor::SplitNode::SplitNode(const vector* st): 50 | subset(st), fid(-1), fv(0), gain(0), selected(false), 51 | left(NULL), right(NULL) { 52 | } 53 | 54 | TreeRegressor::TreeRegressor( 55 | const DataSet& ds, 56 | const boost::scoped_array& y, 57 | const GbmFun& fun) : ds_(ds), y_(y), fun_(fun) { 58 | } 59 | 60 | TreeRegressor::~TreeRegressor() { 61 | for (SplitNode* split : allSplits_) { 62 | delete split; 63 | } 64 | } 65 | 66 | void TreeRegressor::splitExamples( 67 | const SplitNode& split, 68 | vector* left, 69 | vector* right) { 70 | 71 | const int fid = split.fid; 72 | const uint16_t fv = split.fv; 73 | 74 | auto &f = ds_.features_[fid]; 75 | 76 | if (f.encoding == BYTE) { 77 | boosting::split(*(split.subset), left, right, *(f.bvec), fv); 78 | } else { 79 | CHECK(f.encoding == SHORT); 80 | boosting::split(*(split.subset), left, right, *(f.svec), fv); 81 | } 82 | } 83 | 84 | void TreeRegressor::getBestSplitFromHistogram( 85 | const TreeRegressor::Histogram& hist, 86 | int* idx, 87 | double* gain) { 88 | 89 | // The 
loss function should really be 90 | // (sum of (y - y_mean)^2 for observations left of idx) 91 | // + (sum of (y - y_mean)^2 for observations right of idx) 92 | // By math, this equals 93 | // (sum of squares of all y-values) 94 | // - (sum of y-values on left)^2 / (number of observations on left) 95 | // - (sum of y-values on right)^2 / (number of observations on right) 96 | // Since the first term (sum of squares of all y-values) is independent 97 | // of our choice of where to split, it makes no difference, so we ignore it 98 | // in calculating loss. 99 | 100 | // loss function if we don't split at all 101 | double lossBefore = -1.0 * hist.totalSum * hist.totalSum / hist.totalCnt; 102 | 103 | int cntLeft = 0; // number of observations on or to left of idx 104 | double sumLeft = 0.0; // number of observations strictly to right of idx 105 | 106 | double bestGain = 0.0; 107 | int bestIdx = -1; // everything strictly to right of idx 108 | 109 | CHECK(hist.num >= 1); 110 | 111 | for (int i = 0; i < hist.num - 1; i++) { 112 | 113 | cntLeft += hist.cnt[i]; 114 | sumLeft += hist.sumy[i]; 115 | 116 | double sumRight = hist.totalSum - sumLeft; 117 | int cntRight = hist.totalCnt - cntLeft; 118 | 119 | if (cntLeft < FLAGS_min_leaf_examples) { 120 | continue; 121 | } 122 | if (cntRight < FLAGS_min_leaf_examples) { 123 | break; 124 | } 125 | 126 | double lossAfter = 127 | -1.0 * sumLeft * sumLeft / cntLeft 128 | - 1.0 * sumRight * sumRight / cntRight; 129 | 130 | double gain = lossBefore - lossAfter; 131 | if (gain > bestGain) { 132 | bestGain = gain; 133 | bestIdx = i; 134 | } 135 | } 136 | 137 | *idx = bestIdx; 138 | *gain = bestGain; 139 | } 140 | 141 | TreeRegressor::SplitNode* 142 | TreeRegressor::getBestSplit(const vector* subset, 143 | double featureSamplingRate, 144 | bool terminal) { 145 | 146 | SplitNode* split = new SplitNode(subset); 147 | if (terminal) { 148 | allSplits_.push_back(split); 149 | return split; 150 | } 151 | 152 | int bestFid = -1; // which feature 
to split on, -1 is invalid 153 | int bestFv = 0; // critical value of that feature 154 | 155 | // gain in prediction accuracy from that split: 156 | // initialize to 0 instead of std::numeric_limits::lowest() because, 157 | // if no split results in a positive gain, we would rather report that, than 158 | // return a valid but degenerate split 159 | double bestGain = 0.0; 160 | 161 | double totalSum = 0.0; // sum of all target values 162 | 163 | for (auto& id : *subset) { 164 | totalSum += y_[id]; 165 | } 166 | 167 | // For each of a random sampling of features, see if splitting on that 168 | // feature results in the biggest improvement so far. 169 | // TODO(tiankai): The various fid's can be processed in parallel. 170 | for (int fid = 0; fid < ds_.numFeatures_; fid++) { 171 | const auto& f = ds_.features_[fid]; 172 | 173 | if (f.encoding == EMPTY || !biasedCoinFlip(featureSamplingRate)) { 174 | continue; 175 | } 176 | 177 | Histogram hist(f.transitions.size() + 1, subset->size(), totalSum); 178 | 179 | if (f.encoding == BYTE) { 180 | buildHistogram(*subset, *(f.bvec), hist); 181 | } else { 182 | CHECK(f.encoding == SHORT); 183 | buildHistogram(*subset, *(f.svec), hist); 184 | } 185 | 186 | int fv; 187 | double gain; 188 | getBestSplitFromHistogram(hist, &fv, &gain); 189 | 190 | if (gain > bestGain) { 191 | bestFid = fid; 192 | bestFv = fv; 193 | bestGain = gain; 194 | } 195 | } 196 | split->fid = bestFid; 197 | split->fv = bestFv; 198 | split->gain = bestGain; 199 | 200 | frontiers_.push_back(split); 201 | allSplits_.push_back(split); 202 | return split; 203 | } 204 | 205 | 206 | TreeNode* TreeRegressor::getTree( 207 | const int numLeaves, 208 | const double exampleSamplingRate, 209 | const double featureSamplingRate, 210 | double fimps[]) { 211 | 212 | // randomly sample data in ds_ 213 | vector* subset = new vector(); 214 | for (int i = 0; i < ds_.getNumExamples(); i++) { 215 | if (biasedCoinFlip(exampleSamplingRate)) { 216 | subset->push_back(i); 217 | } 218 | 
} 219 | CHECK(subset->size() >= FLAGS_min_leaf_examples * numLeaves); 220 | 221 | // compute the decision tree in SplitNode's 222 | SplitNode* root = getBestSplits(subset, numLeaves - 1, featureSamplingRate); 223 | 224 | // convert the decision tree to PartitionNode's and LeafNode's 225 | return getTreeHelper(root, fimps); 226 | } 227 | 228 | TreeNode* TreeRegressor::getTreeHelper( 229 | SplitNode* split, 230 | double fimps[]) { 231 | 232 | if (split == NULL) { 233 | return NULL; 234 | } else if (!split->selected) { 235 | // leaf of decision tree 236 | double fvote = fun_.getLeafVal(*(split->subset), y_); 237 | LOG(INFO) << "leaf: " << fvote << ", #examples:" 238 | << split->subset->size(); 239 | CHECK(split->subset->size() >= FLAGS_min_leaf_examples); 240 | 241 | return new LeafNode(fvote); 242 | } else { 243 | // internal node of decision tree 244 | LOG(INFO) << "select split: " << split->fid << ":" << split->fv 245 | << " gain: " << split->gain << ", #examples:" 246 | << split->subset->size() << ", min partition: " 247 | << std::min(split->left->subset->size(), split->right->subset->size()); 248 | 249 | fimps[split->fid] += split->gain; 250 | double fvote = fun_.getLeafVal(*(split->subset), y_); 251 | PartitionNode* node = new PartitionNode(split->fid, split->fv); 252 | node->setLeft(getTreeHelper(split->left, fimps)); 253 | node->setRight(getTreeHelper(split->right, fimps)); 254 | node->setVote(fvote); 255 | 256 | return node; 257 | } 258 | } 259 | 260 | TreeRegressor::SplitNode* TreeRegressor::getBestSplits( 261 | const vector* subset, const int numSplits, double featureSamplingRate) { 262 | 263 | CHECK(subset != NULL); 264 | 265 | // Compute the root of the decision tree. 
266 | SplitNode* firstSplit = getBestSplit(subset, featureSamplingRate, false); 267 | 268 | int numSelected = 0; 269 | do { 270 | // frontiers_.size() = #leaves = #internal nodes + 1 = numSelected + 1 271 | CHECK(frontiers_.size() == numSelected+1); 272 | 273 | // Do a linear search over the leaves to find the next split with the most 274 | // gain. 275 | double bestGain = 0.0; 276 | vector::iterator best_it = frontiers_.end(); 277 | for (auto it = frontiers_.begin(); it != frontiers_.end(); it++) { 278 | if ((*it)->gain > bestGain) { 279 | bestGain = (*it)->gain; 280 | best_it = it; 281 | } 282 | } 283 | 284 | if (best_it == frontiers_.end()) { 285 | // no gain from any split 286 | break; 287 | } 288 | 289 | CHECK(bestGain > 0.0); 290 | 291 | (*best_it)->selected = true; 292 | numSelected++; 293 | SplitNode* bestSplit = *best_it; 294 | frontiers_.erase(best_it); 295 | 296 | // Now that we've selected bestSplit, expand its left and right children. 297 | vector* left = new vector(); 298 | vector* right = new vector(); 299 | 300 | splitExamples(*bestSplit, left, right); 301 | bool terminal = (numSelected == numSplits); 302 | 303 | bestSplit->left = getBestSplit(left, featureSamplingRate, terminal); 304 | bestSplit->right = getBestSplit(right, featureSamplingRate, terminal); 305 | } while (numSelected < numSplits); 306 | 307 | return firstSplit; 308 | } 309 | 310 | } 311 | -------------------------------------------------------------------------------- /Train.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2015,2016 Tao Xu 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. 
The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | 27 | #include "boost/make_shared.hpp" 28 | #include "boost/shared_ptr.hpp" 29 | #include "boost/move/unique_ptr.hpp" 30 | #include "Concurrency.h" 31 | #include "Config.h" 32 | #include "GbmFun.h" 33 | #include "Gbm.h" 34 | #include "LogisticFun.h" 35 | #include "DataSet.h" 36 | #include "Tree.h" 37 | #include "gflags/gflags.h" 38 | #include "folly/String.h" 39 | #include "folly/json.h" 40 | #include "thrift/concurrency/PosixThreadFactory.h" 41 | #include "thrift/concurrency/Thread.h" 42 | #include "thrift/concurrency/ThreadManager.h" 43 | 44 | using namespace boosting; 45 | using namespace std; 46 | 47 | DEFINE_int32(num_examples_for_bucketing, 1024*1024*5, 48 | "number of data points used for data set compression"); 49 | 50 | DEFINE_int32(random_seed, 123456789, "The random seed."); 51 | 52 | DEFINE_string(config_file, "", 53 | "file contains the configurations"); 54 | 55 | DEFINE_string(training_files, "", 56 | "comma separated list of data files for training"); 57 | 58 | DEFINE_string(testing_files, "", 59 | "comma separated list of data files for training"); 60 | 61 | DEFINE_string(eval_output_file, "", 62 | "file contains eval output:could be stdout"); 63 | 64 | DEFINE_string(model_file, "", 65 | "file contains the whole model"); 66 | 67 | 
DEFINE_bool(eval_only, false, 68 | "eval only mode"); 69 | 70 | DEFINE_bool(find_optimal_num_trees, false, 71 | "using huge data to trim number of trees"); 72 | 73 | DEFINE_int32(num_examples_for_training, -1, 74 | "number of data points used for training, " 75 | " -1 will use all available"); 76 | 77 | const int CHUNK_SIZE = 2500; // # of lines each data loading chunk may parse 78 | 79 | 80 | /** 81 | * Utility class used to parallelize dataset loading. 82 | */ 83 | class DataChunk : public apache::thrift::concurrency::Runnable { 84 | 85 | public: 86 | 87 | DataChunk(const Config& cfg, const DataSet& dataSet, 88 | CounterMonitor* monitorPtr = NULL) : 89 | cfg_(cfg), dataSet_(dataSet), monitorPtr_(monitorPtr) {} 90 | 91 | bool addLine(const string& s) { 92 | if (s.empty()) { 93 | return false; 94 | } 95 | lines_.emplace_back(s); 96 | return true; 97 | } 98 | 99 | void parseLines() { 100 | featureVectors_.reserve(lines_.size()); 101 | targets_.reserve(lines_.size()); 102 | boost::scoped_array farr(new double[cfg_.getNumFeatures()]); 103 | double target; 104 | for (const string& line : lines_) { 105 | if (dataSet_.getRow(line, &target, farr)) { 106 | targets_.push_back(target); 107 | featureVectors_.emplace_back(farr.get(), 108 | farr.get() + cfg_.getNumFeatures()); 109 | } 110 | } 111 | } 112 | 113 | void run() { 114 | parseLines(); 115 | if (monitorPtr_ != NULL) { 116 | monitorPtr_->decrement(); 117 | } 118 | } 119 | 120 | const vector>& getFeatureVectors() const { 121 | return featureVectors_; 122 | } 123 | 124 | const vector& getTargets() const { 125 | return targets_; 126 | } 127 | 128 | size_t getLineBufferSize() const { 129 | return lines_.size(); 130 | } 131 | 132 | size_t getSize() const { 133 | return featureVectors_.size(); 134 | } 135 | 136 | // Does not use class member dataset, since we might want to load into 137 | // another dataset. 
138 | size_t addToDataSet(DataSet* dataSet) const { 139 | CHECK(featureVectors_.size() == targets_.size()) 140 | << "featureVectors_ and targets_ vectors must be the same size"; 141 | boost::scoped_array farr(new double[cfg_.getNumFeatures()]); 142 | size_t size = featureVectors_.size(); 143 | for (size_t i = 0; i < size; ++i) { 144 | const auto fvec = featureVectors_[i]; 145 | copy(fvec.begin(), fvec.end(), farr.get()); 146 | if (!dataSet->addVector(farr, targets_[i])) { 147 | return i; 148 | } 149 | } 150 | return size; 151 | } 152 | 153 | private: 154 | 155 | const Config& cfg_; 156 | const DataSet& dataSet_; 157 | CounterMonitor* monitorPtr_; // for threading purposes 158 | vector lines_; 159 | vector> featureVectors_; 160 | vector targets_; 161 | 162 | }; 163 | 164 | // Divide training data file's lines into chunks, 165 | // and parse chunks concurrently if desired/possible 166 | void readIntoDataChunks(istream& in, 167 | vector>* chunks, 168 | size_t chunkSize, const Config& cfg, 169 | const DataSet& dataSet) { 170 | // Read lines, placing them into chunks 171 | CounterMonitor monitor(0); 172 | boost::shared_ptr curChunkPtr = 173 | boost::make_shared(cfg, dataSet, &monitor); 174 | string line; 175 | while (getline(in, line)) { 176 | curChunkPtr->addLine(line); 177 | if (curChunkPtr->getLineBufferSize() >= chunkSize) { 178 | // filled up current chunk, so start another one 179 | chunks->push_back(curChunkPtr); 180 | curChunkPtr = boost::make_shared(cfg, dataSet, &monitor); 181 | } 182 | } 183 | if (curChunkPtr->getLineBufferSize() > 0) { 184 | chunks->push_back(curChunkPtr); 185 | } 186 | 187 | // Parse all chunks 188 | if (FLAGS_num_threads > 0 && !chunks->empty()) { 189 | monitor.init(chunks->size()); 190 | for (auto chunkPtr : *chunks) { 191 | Concurrency::threadManager->add(chunkPtr); 192 | } 193 | monitor.wait(); 194 | } else { 195 | for (auto chunkPtr : *chunks) { 196 | chunkPtr->parseLines(); 197 | } 198 | } 199 | } 200 | 201 | // write feature 
importance vector 202 | void dumpFimps(const string& fileName, const Config& cfg, double fimps[]) { 203 | ofstream fs(fileName); 204 | for (int fid = 0; fid < cfg.getNumFeatures(); fid++) { 205 | fs << fid << '\t' << fimps[fid] << '\t' 206 | << cfg.getFeatureName(fid) << '\n'; 207 | } 208 | fs.close(); 209 | } 210 | 211 | // write Json dump of boosting model 212 | template 213 | void dumpModel(const string& fileName, 214 | const Config& cfg, 215 | const vector* >& model) { 216 | folly::dynamic m = folly::dynamic::object; 217 | folly::dynamic trees = {}; 218 | 219 | for (const auto& t : model) { 220 | trees.push_back(std::move(t->toJson(cfg))); 221 | } 222 | 223 | m.insert("trees", trees); 224 | 225 | ofstream fs(fileName); 226 | fs << toPrettyJson(m); 227 | fs.close(); 228 | } 229 | 230 | unique_ptr getGbmFun(LossFunction loss) { 231 | if (loss == L2Regression) { 232 | return unique_ptr(new LeastSquareFun()); 233 | } else { 234 | return unique_ptr(new LogisticFun()); 235 | } 236 | } 237 | 238 | int main(int argc, char **argv) { 239 | stringstream ss; 240 | for (int i = 0; i < argc; i++) { 241 | ss << argv[i] << " "; 242 | } 243 | 244 | google::SetUsageMessage("Gbm Training"); 245 | google::ParseCommandLineFlags(&argc, &argv, true); 246 | google::InitGoogleLogging(argv[0]); 247 | Concurrency::initThreadManager(); 248 | 249 | // Initialize random seed. 
250 | srand(FLAGS_random_seed); 251 | 252 | LOG(INFO) << ss.str(); 253 | 254 | Config cfg; 255 | 256 | LOG(INFO) << "loading config"; 257 | 258 | CHECK(cfg.readConfig(FLAGS_config_file)); 259 | unique_ptr pfun = getGbmFun(cfg.getLossFunction()); 260 | GbmFun& fun = *pfun; 261 | 262 | unique_ptr pCmpFun = getGbmFun(cfg.getLossFunction()); 263 | GbmFun& cmpFun = *pCmpFun; 264 | 265 | vector*> model; 266 | DataSet ds(cfg, FLAGS_num_examples_for_bucketing, 267 | FLAGS_num_examples_for_training); 268 | 269 | if (!FLAGS_eval_only) { 270 | // Compute model from training files 271 | 272 | // First, load training files 273 | vector sv; 274 | folly::split(',', FLAGS_training_files, sv); 275 | 276 | time_t start, end; 277 | time(&start); 278 | 279 | for (const auto& s : sv) { 280 | LOG(INFO) << "loading data from:" << s; 281 | 282 | ifstream fs(s.str()); 283 | vector> dataChunks; 284 | readIntoDataChunks(fs, &dataChunks, CHUNK_SIZE, cfg, ds); 285 | for (const auto chunkPtr : dataChunks) { 286 | chunkPtr->addToDataSet(&ds); 287 | } 288 | 289 | time(&end); 290 | double timespent = difftime(end, start); 291 | LOG(INFO) << "read " << ds.getNumExamples() << " examples in " 292 | << timespent << " sec" << endl; 293 | } 294 | 295 | ds.close(); 296 | 297 | // Second, train the models 298 | Gbm engine(fun, ds, cfg); 299 | double* fimps = new double[cfg.getNumFeatures()]; 300 | for (int i = 0; i < cfg.getNumFeatures(); i++) { 301 | fimps[i] = 0.0; 302 | } 303 | engine.getModel(&model, fimps); 304 | 305 | // Third, write the model files 306 | dumpFimps(FLAGS_model_file + ".fimps", cfg, fimps); 307 | dumpModel(FLAGS_model_file, cfg, model); 308 | } else { 309 | // Skip training, load previously written model 310 | 311 | LOG(INFO) << "loading model from " << FLAGS_model_file; 312 | ifstream fs(FLAGS_model_file); 313 | stringstream buffer; 314 | buffer << fs.rdbuf(); 315 | 316 | const folly::dynamic obj = folly::parseJson(buffer.str()); 317 | const int numTrees = obj["trees"].size(); 318 | 
LOG(INFO) << "num trees: " << numTrees; 319 | model.reserve(numTrees); 320 | for (int i = 0; i < numTrees; i++) { 321 | model.push_back(fromJson(obj["trees"][i], cfg)); 322 | } 323 | } 324 | 325 | if (FLAGS_testing_files != "") { 326 | ostream *os = NULL; 327 | ofstream ofs; 328 | if (FLAGS_eval_output_file != "") { 329 | if (FLAGS_eval_output_file == "stdout") { 330 | os = &cout; 331 | } else { 332 | ofs.open(FLAGS_eval_output_file); 333 | os = &ofs; 334 | } 335 | } 336 | 337 | // See how well the model performs on testing data 338 | double target, score; 339 | boost::scoped_array fvec(new double[cfg.getNumFeatures()]); 340 | int numEvalColumns = cfg.getEvalIdx().size(); 341 | boost::scoped_array feval(new string[numEvalColumns]); 342 | 343 | vector> funs; 344 | for (int i = 0; i < model.size(); i++) { 345 | funs.push_back(getGbmFun(cfg.getLossFunction())); 346 | } 347 | 348 | vector tsv; 349 | folly::split(',', FLAGS_testing_files, tsv); 350 | for (const auto& s : tsv) { 351 | LOG(INFO) << "loading data from:" << s; 352 | istream *is; 353 | fstream fs; 354 | 355 | if (s.str() == "stdin") { 356 | is = &cin; 357 | } else { 358 | fs.open(s.str()); 359 | is = &fs; 360 | } 361 | string line; 362 | vector scores; 363 | while(getline(*is, line)) { 364 | ds.getRow(line, &target, fvec, &score); 365 | double f; 366 | if (FLAGS_find_optimal_num_trees) { 367 | f = predict_vec(model, fvec, &scores); 368 | for (int i = 0; i < model.size(); i++) { 369 | funs[i]->accumulateExampleLoss(target, scores[i]); 370 | } 371 | scores.clear(); 372 | } else { 373 | f = predict(model, fvec); 374 | } 375 | 376 | if (os != NULL) { 377 | ds.getEvalColumns(line, feval); 378 | for (int i = 0; i < numEvalColumns; i++) { 379 | (*os) << feval[i] << '\t'; 380 | } 381 | (*os) << f << endl; 382 | } 383 | 384 | fun.accumulateExampleLoss(target, f); 385 | cmpFun.accumulateExampleLoss(target, score); 386 | if (fun.getNumExamples() % 1000 == 0) { 387 | LOG(INFO) << "test loss reduction: " << 
fun.getReduction() 388 | << " on num examples: " << fun.getNumExamples() 389 | << " total loss: " << fun.getLoss() 390 | << " logged score: " << score 391 | << " computed score: " << f 392 | << " cmp loss: " << cmpFun.getLoss() 393 | << " cmp reduction: " << cmpFun.getReduction(); 394 | } 395 | } 396 | } 397 | if (os != NULL) { 398 | os->flush(); 399 | } 400 | 401 | if (FLAGS_find_optimal_num_trees) { 402 | cout << model.size() << '\t'; 403 | for (int i = 0; i < model.size(); i++) { 404 | cout << funs[i]->getLoss() << '\t'; 405 | } 406 | } 407 | 408 | LOG(INFO) << fun.getNumExamples() << '\t' << fun.getReduction() << '\t' 409 | << fun.getLoss() << endl; 410 | 411 | LOG(INFO) << "test loss reduction: " << fun.getReduction() 412 | << ", cmp loss function: " << cmpFun.getReduction() 413 | << " on num examples: " << fun.getNumExamples(); 414 | 415 | } 416 | } 417 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. 
For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 
48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. 
Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | 204 | -------------------------------------------------- 205 | SOFTWARE DISTRIBUTED WITH THRIFT: 206 | 207 | The Apache Thrift software includes a number of subcomponents with 208 | separate copyright notices and license terms. Your use of the source 209 | code for the these subcomponents is subject to the terms and 210 | conditions of the following licenses. 211 | 212 | -------------------------------------------------- 213 | Portions of the following files are licensed under the MIT License: 214 | 215 | lib/erl/src/Makefile.am 216 | 217 | Please see doc/otp-base-license.txt for the full terms of this license. 218 | 219 | -------------------------------------------------- 220 | For the aclocal/ax_boost_base.m4 and contrib/fb303/aclocal/ax_boost_base.m4 components: 221 | 222 | # Copyright (c) 2007 Thomas Porschberg 223 | # 224 | # Copying and distribution of this file, with or without 225 | # modification, are permitted in any medium without royalty provided 226 | # the copyright notice and this notice are preserved. 
227 | 228 | -------------------------------------------------- 229 | For the compiler/cpp/src/thrift/md5.[ch] components: 230 | 231 | /* 232 | Copyright (C) 1999, 2000, 2002 Aladdin Enterprises. All rights reserved. 233 | 234 | This software is provided 'as-is', without any express or implied 235 | warranty. In no event will the authors be held liable for any damages 236 | arising from the use of this software. 237 | 238 | Permission is granted to anyone to use this software for any purpose, 239 | including commercial applications, and to alter it and redistribute it 240 | freely, subject to the following restrictions: 241 | 242 | 1. The origin of this software must not be misrepresented; you must not 243 | claim that you wrote the original software. If you use this software 244 | in a product, an acknowledgment in the product documentation would be 245 | appreciated but is not required. 246 | 2. Altered source versions must be plainly marked as such, and must not be 247 | misrepresented as being the original software. 248 | 3. This notice may not be removed or altered from any source distribution. 249 | 250 | L. Peter Deutsch 251 | ghost@aladdin.com 252 | 253 | */ 254 | --------------------------------------------------------------------------------