├── PyFastBDT
│   ├── __init__.py
│   ├── utility.py
│   └── FastBDT.py
├── MANIFEST.in
├── examples
│   ├── Makefile
│   ├── PurityTransformation.py
│   ├── PythonExample.py
│   ├── ugboost.py
│   ├── CPPExample.cxx
│   ├── orthogonal_discriminator.py
│   ├── performance.py
│   ├── splot.py
│   └── comparison.cxx
├── src
│   ├── test_all.cxx
│   ├── FastBDT_IO.cxx
│   ├── test_FastBDT_C_API.cxx
│   ├── FastBDT_C_API.cxx
│   ├── test_Classifier.cxx
│   ├── Classifier.cxx
│   ├── test_Performance.cxx
│   ├── test_FastBDT_IO.cxx
│   └── FastBDT.cxx
├── setup.py.in
├── include
│   ├── LinkDef.h
│   ├── FastBDT_C_API.h
│   ├── Classifier.h
│   └── FastBDT_IO.h
├── .travis.yml
├── README.md
├── CMakeLists.txt
└── files
    └── iris.txt
-------------------------------------------------------------------------------- /PyFastBDT/__init__.py: --------------------------------------------------------------------------------
1 | 
-------------------------------------------------------------------------------- /MANIFEST.in: --------------------------------------------------------------------------------
1 | include libFastBDT_CInterface.so
2 | include libFastBDT_shared.so
-------------------------------------------------------------------------------- /examples/Makefile: --------------------------------------------------------------------------------
1 | make:
2 | 	#g++ CPPExample.cxx -o CPPExample -l FastBDT_static -L ../ -I ../include/ -ggdb3
3 | 	g++ CPPExample.cxx -o CPPExample -l FastBDT_static -L ../ -I ../include/ -O3
-------------------------------------------------------------------------------- /src/test_all.cxx: --------------------------------------------------------------------------------
1 | /**
2 |  * Thomas Keck 2015
3 |  */
4 | 
5 | #include <gtest/gtest.h>
6 | 
7 | int main(int argc, char **argv) {
8 |     ::testing::InitGoogleTest(&argc, argv);
9 |     return RUN_ALL_TESTS();
10 | }
-------------------------------------------------------------------------------- /setup.py.in: --------------------------------------------------------------------------------
1 | from distutils.core import setup
2 | 
3 | setup(name='PyFastBDT',
4 |       version='${FastBDT_VERSION_MAJOR}.${FastBDT_VERSION_MINOR}',
5 |       packages=['PyFastBDT'],
6 |       package_data={'PyFastBDT': ['*.so']},
7 |       )
8 | 
-------------------------------------------------------------------------------- /include/LinkDef.h: --------------------------------------------------------------------------------
1 | #ifdef __CINT__
2 | 
3 | #pragma link off all global;
4 | #pragma link off all class;
5 | #pragma link off all function;
6 | #pragma link off all namespace;
7 | 
8 | 
9 | #pragma link C++ class TMVA::MethodFastBDT+;
10 | #pragma link C++ namespace TMVA;
11 | #pragma link C++ nestedclass;
12 | #pragma link C++ nestedtypedef;
13 | 
14 | #endif
15 | 
-------------------------------------------------------------------------------- /.travis.yml: --------------------------------------------------------------------------------
1 | sudo: required
2 | dist: trusty
3 | language: cpp
4 | notifications:
5 |   email:
6 |     on_success: change # default: change
7 |     on_failure: always # default: always
8 | compiler:
9 |   - gcc
10 | addons:
11 |   apt:
12 |     sources:
13 |     - ubuntu-toolchain-r-test
14 |     packages:
15 |     - libgtest-dev
16 |     - build-essential
17 |     - cmake
18 | before_install:
19 |   - cd /usr/src/gtest && sudo cmake . && sudo make && sudo mv libg* /usr/lib/ && cd -
20 | install:
21 |   - cmake .
22 |   - make VERBOSE=1
23 | script:
24 |   - ./unittests
25 | 
-------------------------------------------------------------------------------- /examples/PurityTransformation.py: --------------------------------------------------------------------------------
1 | from PyFastBDT import FastBDT
2 | 
3 | import pandas
4 | import numpy as np
5 | import sklearn.metrics
6 | 
7 | if __name__ == '__main__':
8 | 
9 |     data = np.arange(100000)
10 |     X = (data % 100).reshape((100000, 1))
11 |     y = (data % 2) == 1
12 | 
13 |     # With a single tree of depth 1 a plain cut on X cannot separate odd from even values;
14 |     # ordering the bins by their signal purity first makes one cut sufficient.
15 |     clf = FastBDT.Classifier(nTrees=1, depth=1, shrinkage=0.1, subsample=1.0, purityTransformation=[False]).fit(X=X, y=y)
16 |     p = clf.predict(X)
17 |     print('No Purity Transformation', sklearn.metrics.roc_auc_score(y, p))
18 | 
19 |     clf = FastBDT.Classifier(nTrees=1, depth=1, shrinkage=0.1, subsample=1.0, purityTransformation=[True]).fit(X=X, y=y)
20 |     p = clf.predict(X)
21 |     print('With Purity Transformation', sklearn.metrics.roc_auc_score(y, p))
-------------------------------------------------------------------------------- /PyFastBDT/utility.py: --------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | 
4 | def flatness(probability, feature, target, weights=None, classes=[0, 1]):
5 |     """
6 |     Calculates the flatness of a feature under cuts on a signal probability
7 |     @param probability the signal probability values
8 |     @param feature the feature values
9 |     @param target the target class values
10 |     @param weights optional weights
11 |     @param classes the classes for which the flatness is calculated
12 |     @return the mean standard deviation between the local and global cut selection efficiency
13 |     """
14 |     quantiles = list(range(101))
15 |     flatness_score = 0
16 |     for m in [target == c for c in classes]:
17 |         p = probability[m]
18 |         f = feature[m]
19 |         if weights is None:
20 |             w = None
21 |         else:
22 |             w = weights[m]
23 | 
24 |         binning_feature = np.unique(np.percentile(f, q=quantiles))
25 |         binning_probability = np.unique(np.percentile(p, q=quantiles))
26 |         hist_n, _ = np.histogramdd(np.c_[p, f],
27 |                                    bins=[binning_probability, binning_feature],
28 |                                    weights=w)
29 |         hist_inc = hist_n.sum(axis=1)
30 |         hist_inc /= hist_inc.sum(axis=0)
31 |         hist_n /= hist_n.sum(axis=0)
32 |         hist_n = hist_n.cumsum(axis=0)
33 |         hist_inc = hist_inc.cumsum(axis=0)
34 |         diff = (hist_n.T - hist_inc)**2
35 |         flatness_score += diff.sum() / (100*99)
36 |     return np.sqrt(flatness_score)
37 | 
38 | 
39 | 
40 | def auc_roc(probability, target):
41 |     # Integrates the purity over the efficiency (the ROC curve) with the trapezoidal rule
42 |     N = len(target)
43 |     T = np.sum(target)
44 |     index = np.argsort(probability)
45 |     efficiency = (T - np.cumsum(target[index])) / float(T)
46 |     purity = (T - np.cumsum(target[index])) / (N - np.cumsum(np.ones(N)))
47 |     purity = np.where(np.isnan(purity), 0, purity)
48 |     return np.abs(np.trapz(purity, efficiency))
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | # FastBDT
2 | 
3 | Stochastic Gradient Boosted Decision Trees, usable standalone and via a Python interface.
4 | 
5 | # Paper on ArXiv: http://arxiv.org/abs/1609.06119
6 | 
7 | FastBDT: A speed-optimized and cache-friendly implementation of stochastic gradient-boosted decision trees for multivariate classification
8 | 
9 | Stochastic gradient-boosted decision trees are widely employed for multivariate classification and regression tasks. This paper presents a speed-optimized and cache-friendly implementation for multivariate classification called FastBDT. FastBDT is one order of magnitude faster during the fitting-phase and application-phase, in comparison with popular implementations in software frameworks like TMVA, scikit-learn and XGBoost. The concepts used to optimize the execution time and performance studies are discussed in detail in this paper. The key ideas include: an equal-frequency binning on the input data, which allows replacing expensive floating-point with integer operations, while at the same time increasing the quality of the classification; and a cache-friendly linear access pattern to the input data, in contrast to usual implementations, which exhibit a random access pattern. FastBDT provides interfaces to C/C++ and Python. It is extensively used in the field of high energy physics by the Belle II experiment.
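The equal-frequency binning mentioned in the abstract is easy to picture. The sketch below only illustrates the idea in NumPy; it is not FastBDT's internal implementation:

    import numpy as np

    def equal_frequency_binning(feature, n_bins=256):
        # Choose the bin boundaries as quantiles, so every bin receives
        # roughly the same number of events.
        boundaries = np.percentile(feature, np.linspace(0, 100, n_bins + 1)[1:-1])
        # All subsequent tree building then operates on small integer bin
        # indices instead of floating-point feature values.
        return np.digitize(feature, boundaries)

    bin_indices = equal_frequency_binning(np.random.normal(size=10000))

Because the binning adapts to the observed feature distribution, it is also robust against outliers and against monotonic transformations of a feature.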
10 | 
11 | 
12 | # Installation
13 | 
14 | * cmake .
15 | * make
16 | * make install
17 | * make package (optional, builds rpm, deb and tgz packages)
18 | * python3 setup.py install (optional, installs the Python package)
19 | 
20 | 
21 | # Usage
22 | 
23 | Before you do anything else you should execute the unittests:
24 | * ./unittests
25 | 
26 | Usually it is more convenient to use FastBDT as a library
27 | and integrate FastBDT directly into your application using
28 | * the C++ shared/static library (see examples/CPPExample.cxx),
29 | * the C shared library,
30 | * or the Python3 library PyFastBDT/FastBDT.py (see examples/PythonExample.py).
31 | 
32 | 
33 | # Further reading
34 | This work is mostly based on the papers by Jerome H. Friedman:
35 | * https://statweb.stanford.edu/~jhf/ftp/trebst.pdf
36 | * https://statweb.stanford.edu/~jhf/ftp/stobst.pdf
37 | 
38 | FastBDT also implements the uGB technique to boost to flatness:
39 | * https://arxiv.org/abs/1410.4140
40 | 
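A minimal end-to-end example with the Python interface, condensed from examples/PythonExample.py (the toy data here is made up for illustration):

    import numpy as np
    from PyFastBDT import FastBDT

    # Toy data: only the first feature carries information about the target.
    X = np.random.normal(size=(10000, 3))
    y = X[:, 0] + 0.5 * np.random.normal(size=10000) > 0

    clf = FastBDT.Classifier(nTrees=100, depth=3, shrinkage=0.1, subsample=0.5)
    clf.fit(X=X, y=y)
    probabilities = clf.predict(X)

The C++ example can be built with the Makefile in examples/ after compiling the static library.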
-------------------------------------------------------------------------------- /include/FastBDT_C_API.h: --------------------------------------------------------------------------------
1 | /**
2 |  * Thomas Keck 2015
3 |  */
4 | 
5 | #include "FastBDT.h"
6 | #include "FastBDT_IO.h"
7 | #include "Classifier.h"
8 | 
9 | extern "C" {
10 | 
11 | void PrintVersion();
12 | 
13 | struct Expertise {
14 |     FastBDT::Classifier classifier;
15 | };
16 | 
17 | void* Create();
18 | 
19 | void SetBinning(void *ptr, unsigned int* binning, unsigned int size);
20 | void SetPurityTransformation(void *ptr, bool* purityTransformation, unsigned int size);
21 | 
22 | void SetNTrees(void *ptr, unsigned int nTrees);
23 | unsigned int GetNTrees(void *ptr);
24 | 
25 | void SetDepth(void *ptr, unsigned int depth);
26 | unsigned int GetDepth(void *ptr);
27 | 
28 | void SetNumberOfFlatnessFeatures(void *ptr, unsigned int numberOfFlatnessFeatures);
29 | unsigned int GetNumberOfFlatnessFeatures(void *ptr);
30 | 
31 | void SetSubsample(void *ptr, double subsample);
32 | double GetSubsample(void *ptr);
33 | 
34 | void SetShrinkage(void *ptr, double shrinkage);
35 | double GetShrinkage(void *ptr);
36 | 
37 | void SetFlatnessLoss(void *ptr, double flatnessLoss);
38 | double GetFlatnessLoss(void *ptr);
39 | 
40 | void SetTransform2Probability(void *ptr, bool transform2probability);
41 | bool GetTransform2Probability(void *ptr);
42 | 
43 | void SetSPlot(void *ptr, bool sPlot);
44 | bool GetSPlot(void *ptr);
45 | 
46 | void Delete(void *ptr);
47 | 
48 | void Fit(void *ptr, float *data_ptr, float *weight_ptr, bool *target_ptr, unsigned int nEvents, unsigned int nFeatures);
49 | 
50 | void Load(void* ptr, char *weightfile);
51 | 
52 | float Predict(void *ptr, float *array);
53 | 
54 | void PredictArray(void *ptr, float *array, float *result, unsigned int nEvents);
55 | 
56 | void Save(void* ptr, char *weightfile);
57 | 
58 | struct VariableRanking {
59 |     std::map<unsigned int, double> ranking;
60 | };
61 | 
62 | void* GetVariableRanking(void* ptr);
63 | 
64 | void* GetIndividualVariableRanking(void* ptr, float *array);
65 | 
66 | unsigned int ExtractNumberOfVariablesFromVariableRanking(void* ptr);
67 | 
68 | double ExtractImportanceOfVariableFromVariableRanking(void* ptr, unsigned int iFeature);
69 | 
70 | void DeleteVariableRanking(void* ptr);
71 | 
72 | }
-------------------------------------------------------------------------------- /examples/PythonExample.py: --------------------------------------------------------------------------------
1 | import sys
2 | from PyFastBDT import FastBDT
3 | 
4 | import numpy as np
5 | import sklearn.metrics
6 | 
7 | if __name__ == '__main__':
8 | 
9 |     # Create some Monte Carlo data using a multidimensional gaussian distribution
10 |     # The 0th row of the covariance matrix describes the correlation to the target variable
11 |     mean = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]
12 |     cov = [[1.0, 0.8, 0.4, 0.2, 0.1, 0.0],
13 |            [0.0, 1.0, 0.0, 0.0, 0.0, 0.0],
14 |            [0.0, 0.0, 1.0, 0.0, 0.0, 0.0],
15 |            [0.0, 0.0, 0.0, 1.0, 0.0, 0.0],
16 |            [0.0, 0.0, 0.0, 0.0, 1.0, 0.0],
17 |            [0.0, 0.0, 0.0, 0.0, 0.0, 1.0]]
18 | 
19 |     for i in range(len(mean)):
20 |         for j in range(i+1, len(mean)):
21 |             cov[j][i] = cov[i][j]
22 | 
23 |     N_train, N_test = 10000, 10000
24 |     data = np.random.multivariate_normal(mean, cov, N_train + N_test)
25 |     X_train, y_train = data[:N_train, 1:], data[:N_train, 0] > 0
26 |     X_test, y_test = data[N_train:, 1:], data[N_train:, 0] > 0
27 | 
28 |     # Train FastBDT using its Python interface, which mirrors the sklearn classifiers
29 |     clf = FastBDT.Classifier()
30 |     clf.fit(X=X_train, y=y_train)
31 |     p = clf.predict(X_test)
32 |     global_auc = sklearn.metrics.roc_auc_score(y_test, p)
33 |     print("Global AUC", global_auc)
34 | 
35 |     # Internal feature importance is calculated using the sum of the information gains
36 |     # provided by each feature over all decision trees
37 |     print("Intern Feature Importance")
38 |     print(clf.internFeatureImportance())
39 | 
40 |     # External feature importance is calculated using the drop in the area under the receiver operating characteristic curve
41 |     # when the most important feature is left out recursively
42 |     print("Extern Feature Importance")
43 |     print(clf.externFeatureImportance(X_train, y_train, None, X_test, y_test, None))
44 | 
45 |     # Individual feature importance is the sum of the information gains provided by each feature
46 |     # in the path an individual event takes through the forest
47 |     print("Individual Feature Importance")
48 |     events = [ np.array([1.0, 2.0, 3.0, 4.0, 5.0]),
49 |                np.array([2.0, 2.0, 3.0, 4.0, 5.0]),
50 |                np.array([0.0, 2.0, 3.0, 4.0, 5.0]),
51 |                np.array([1.0, 3.0, 3.0, 4.0, 5.0]),
52 |                np.array([1.0, 1.0, 3.0, 4.0, 5.0]),
53 |                np.array([1.0, 2.0, 4.0, 4.0, 5.0]),
54 |                np.array([1.0, 2.0, 2.0, 4.0, 5.0]),
55 |                np.array([1.0, 2.0, 3.0, 5.0, 5.0]),
56 |                np.array([1.0, 2.0, 3.0, 3.0, 5.0]),
57 |                np.array([1.0, 2.0, 3.0, 4.0, 6.0]),
58 |                np.array([1.0, 2.0, 3.0, 4.0, 4.0]) ]
59 | 
60 |     for event in events:
61 |         print(clf.individualFeatureImportance(event))
62 | 
63 |     # Train again, once without and once with the purity transformation for all five features
64 |     clf = FastBDT.Classifier(purityTransformation=[False, False, False, False, False], subsample=1.0)
65 |     clf.fit(X=X_train, y=y_train)
66 |     p = clf.predict(X_test)
67 |     global_auc = sklearn.metrics.roc_auc_score(y_test, p)
68 |     print("Global AUC without Purity Transformation", global_auc)
69 | 
70 |     clf = FastBDT.Classifier(purityTransformation=[True, True, True, True, True], subsample=1.0)
71 |     clf.fit(X=X_train, y=y_train)
72 |     p = clf.predict(X_test)
73 |     global_auc = sklearn.metrics.roc_auc_score(y_test, p)
74 |     print("Global AUC with Purity Transformation", global_auc)
75 | 
-------------------------------------------------------------------------------- /CMakeLists.txt: --------------------------------------------------------------------------------
1 | cmake_minimum_required (VERSION 2.8.12)
2 | 
3 | set(CMAKE_C_COMPILER gcc)
4 | set(CMAKE_CXX_COMPILER g++)
5 | 
6 | project (FastBDT)
7 | set (FastBDT_VERSION_MAJOR 5)
8 | set (FastBDT_VERSION_MINOR 2)
9 | 
10 | set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${PROJECT_SOURCE_DIR}/cmake")
11 | 
12 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -std=c++11 -Wall -Wextra -g -msse2")
13 | #set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ggdb3 -g -std=c++11 -Wall -Wextra")
14 | 
15 | configure_file (
16 |   "${PROJECT_SOURCE_DIR}/include/FastBDT.h.in"
17 |   "${PROJECT_BINARY_DIR}/include/FastBDT.h"
18 | )
19 | 
20 | include_directories("${PROJECT_SOURCE_DIR}/include/" "${PROJECT_BINARY_DIR}/include/")
21 | 
22 | set(FastBDT_SOURCES
23 |   "${PROJECT_SOURCE_DIR}/src/FastBDT.cxx"
24 |   "${PROJECT_SOURCE_DIR}/src/Classifier.cxx"
25 |   "${PROJECT_SOURCE_DIR}/src/FastBDT_IO.cxx"
26 | )
27 | 
28 | set(FastBDT_TESTS
29 |   "${PROJECT_SOURCE_DIR}/src/test_all.cxx"
30 |   "${PROJECT_SOURCE_DIR}/src/test_FastBDT.cxx"
31 |   "${PROJECT_SOURCE_DIR}/src/test_Performance.cxx"
32 |   "${PROJECT_SOURCE_DIR}/src/test_Classifier.cxx"
33 |   "${PROJECT_SOURCE_DIR}/src/test_FastBDT_IO.cxx"
34 |   "${PROJECT_SOURCE_DIR}/src/test_FastBDT_C_API.cxx"
35 | )
36 | 
37 | set(FastBDT_HEADERS
38 |   "${PROJECT_BINARY_DIR}/include/FastBDT.h"
39 |   "${PROJECT_SOURCE_DIR}/include/Classifier.h"
40 |   "${PROJECT_SOURCE_DIR}/include/FastBDT_IO.h"
41 | )
42 | 
43 | set(FastBDT_CINTERFACE
44 |   "${PROJECT_SOURCE_DIR}/src/FastBDT_C_API.cxx"
45 |   "${PROJECT_SOURCE_DIR}/include/FastBDT_C_API.h"
46 | )
47 | 
48 | set(FastBDT_Python
49 |   "${PROJECT_SOURCE_DIR}/PyFastBDT/__init__.py"
50 |   "${PROJECT_SOURCE_DIR}/PyFastBDT/FastBDT.py"
51 |   "${PROJECT_SOURCE_DIR}/PyFastBDT/utility.py"
52 | )
53 | 
54 | add_library(FastBDT_static STATIC ${FastBDT_SOURCES} ${FastBDT_HEADERS})
55 | add_library(FastBDT_CInterface SHARED ${FastBDT_CINTERFACE} ${FastBDT_SOURCES} ${FastBDT_HEADERS})
56 | target_link_libraries(FastBDT_CInterface)
57 | add_library(FastBDT_shared SHARED ${FastBDT_SOURCES} ${FastBDT_HEADERS})
58 | target_link_libraries(FastBDT_shared)
59 | 
60 | install(TARGETS FastBDT_static FastBDT_shared FastBDT_CInterface
61 |   LIBRARY DESTINATION lib
62 |   ARCHIVE DESTINATION lib
63 |   RUNTIME DESTINATION bin
64 | )
65 | 
66 | install(FILES ${FastBDT_HEADERS} DESTINATION include)
67 | 
68 | find_package(GTest)
69 | if(GTEST_FOUND)
70 |   add_executable(unittests ${FastBDT_TESTS} ${FastBDT_HEADERS} ${FastBDT_CINTERFACE})
71 |   target_link_libraries(unittests ${GTEST_BOTH_LIBRARIES} FastBDT_static pthread)
72 |   message(STATUS ${GTEST_INCLUDE_DIRS})
73 |   target_include_directories(unittests PUBLIC ${GTEST_INCLUDE_DIRS})
74 |   install(TARGETS unittests DESTINATION bin)
75 | else()
76 |   message(STATUS "Could not find gtest installation, skip building unittests.")
77 | endif()
78 | 
79 | find_program(PYTHON "python3")
80 | 
81 | if (PYTHON)
82 |   configure_file(
83 |     "${PROJECT_SOURCE_DIR}/setup.py.in"
"${PROJECT_BINARY_DIR}/setup.py" 85 | ) 86 | add_custom_target(PyFastBDT ALL DEPENDS ${FastBDT_Python} FastBDT_shared FastBDT_CInterface) 87 | 88 | add_custom_command(TARGET PyFastBDT PRE_BUILD 89 | COMMAND ${CMAKE_COMMAND} -E copy_directory "${CMAKE_SOURCE_DIR}/PyFastBDT" "${CMAKE_BINARY_DIR}/PyFastBDT" 90 | COMMAND ${CMAKE_COMMAND} -E copy $ "${PROJECT_BINARY_DIR}/PyFastBDT/" 91 | COMMAND ${CMAKE_COMMAND} -E copy $ "${PROJECT_BINARY_DIR}/PyFastBDT/" 92 | COMMAND ${PYTHON} "${CMAKE_BINARY_DIR}/setup.py" build 93 | COMMAND ${CMAKE_COMMAND} -E touch "${CMAKE_CURRENT_BINARY_DIR}/build/timestamp" 94 | ) 95 | 96 | install(CODE "execute_process(COMMAND ${PYTHON} ${PROJECT_BINARY_DIR}/setup.py install --prefix=${CMAKE_INSTALL_PREFIX})") 97 | endif() 98 | 99 | set(CPACK_PACKAGE_VERSION "${FastBDT_VERSION_MAJOR}.${FastBDT_VERSION_MINOR}") 100 | set(CPACK_GENERATOR "RPM;DEB;TGZ") 101 | set(CPACK_PACKAGE_NAME "FastBDT") 102 | set(CPACK_PACKAGE_RELEASE 1) 103 | set(CPACK_PACKAGE_CONTACT "thomas.keck2@kit.edu") 104 | set(CPACK_PACKAGE_VENDOR "Private") 105 | set(CPACK_PACKAGING_INSTALL_PREFIX ${CMAKE_INSTALL_PREFIX}) 106 | set(CPACK_PACKAGE_FILE_NAME "${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION}-${CPACK_PACKAGE_RELEASE}.${CMAKE_SYSTEM_PROCESSOR}") 107 | 108 | SET(CPACK_DEBIAN_PACKAGE_PRIORITY "optional") 109 | SET(CPACK_DEBIAN_PACKAGE_SECTION "libs") 110 | SET(CPACK_DEBIAN_ARCHITECTURE ${CMAKE_SYSTEM_PROCESSOR}) 111 | 112 | include(CPack) 113 | -------------------------------------------------------------------------------- /examples/ugboost.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from PyFastBDT import FastBDT 3 | from PyFastBDT import utility 4 | 5 | import numpy as np 6 | import numpy 7 | import numpy.linalg 8 | import sklearn.metrics 9 | 10 | import matplotlib.pyplot as plt 11 | import matplotlib as mpl 12 | 13 | 14 | def calculate_cdf_and_pdf(X): 15 | """ 16 | Calculates cdf and pdf of given sample and adds under/overflow bins 17 | @param X 1-d numpy.array 18 | """ 19 | pdf, bins = numpy.histogram(X, bins=100, density=True) 20 | cdf = numpy.cumsum(pdf * (bins - numpy.roll(bins, 1))[1:]) 21 | return numpy.hstack([0.0, cdf, 1.0]), numpy.hstack([0.0, pdf, 0.0]), bins 22 | 23 | 24 | class Prior(object): 25 | def __init__(self, signal, bckgrd): 26 | self.signal_cdf, self.signal_pdf, self.signal_bins = calculate_cdf_and_pdf(signal) 27 | self.bckgrd_cdf, self.bckgrd_pdf, self.bckgrd_bins = calculate_cdf_and_pdf(bckgrd) 28 | # Avoid numerical instabilities 29 | self.bckgrd_pdf[0] = self.bckgrd_pdf[-1] = 1 30 | self.signal_yield = len(signal) 31 | self.bckgrd_yield = len(bckgrd) 32 | 33 | def get_signal_pdf(self, X): 34 | return self.signal_pdf[numpy.digitize(X, bins=self.signal_bins)] 35 | 36 | def get_bckgrd_pdf(self, X): 37 | return self.bckgrd_pdf[numpy.digitize(X, bins=self.bckgrd_bins)] 38 | 39 | def get_signal_cdf(self, X): 40 | return self.signal_cdf[numpy.digitize(X, bins=self.signal_bins)] 41 | 42 | def get_bckgrd_cdf(self, X): 43 | return self.bckgrd_cdf[numpy.digitize(X, bins=self.bckgrd_bins)] 44 | 45 | def get_prior(self, X): 46 | return self.get_signal_pdf(X) / (self.get_signal_pdf(X) + self.get_bckgrd_pdf(X)) 47 | 48 | 49 | def combine_probabilities(p1, p2): 50 | return p1*p2 / (p1*p2 + (1-p1)*(1-p2)) 51 | 52 | 53 | def evaluation(label, X_test, y_test, p, p_prior): 54 | print(label, utility.auc_roc(p, y_test), utility.flatness(p, X_test[:, 0], y_test, classes=[0]), utility.flatness(p, X_test[:, 0], y_test, classes=[1])) 55 | 
55 |     print(label, sklearn.metrics.roc_auc_score(y_test, p))
56 |     print(label + " with prior", sklearn.metrics.roc_auc_score(y_test, combine_probabilities(p, p_prior)))
57 |     plt.scatter(X_test[y_test == 1, 0], p[y_test == 1], c='r', label=label + " (Signal)", alpha=0.2)
58 |     plt.scatter(X_test[y_test == 0, 0], p[y_test == 0], c='b', label=label + " (Background)", alpha=0.2)
59 |     plt.xlabel("Feature")
60 |     plt.ylabel("Probability")
61 |     plt.show()
62 | 
63 | 
64 | if __name__ == '__main__':
65 |     # Create some Monte Carlo data using a multidimensional gaussian distribution
66 |     # The 0th row of the covariance matrix describes the correlation to the target variable
67 |     mean = [0.5, 0.4, 0.4]
68 |     cov = [[1.0, 0.6, 0.6],
69 |            [0.0, 1.0, 0.0],
70 |            [0.0, 0.0, 1.0]]
71 | 
72 |     mean2 = [-0.5, -0.4, -0.4]
73 |     cov2 = [[1.0, 0.6, 0.6],
74 |             [0.0, 1.0, 0.0],
75 |             [0.0, 0.0, 1.0]]
76 | 
77 |     for i in range(len(mean)):
78 |         for j in range(i+1, len(mean)):
79 |             cov[j][i] = cov[i][j]
80 |             cov2[j][i] = cov2[i][j]
81 | 
82 |     N_train, N_test = 100000, 2000
83 |     data = np.random.multivariate_normal(mean2, cov2, (N_train + N_test)//2)
84 |     data2 = np.random.multivariate_normal(mean, cov, (N_train + N_test)//2)
85 |     X_train, y_train = np.r_[data[:N_train//2], data2[:N_train//2]], np.r_[np.ones(N_train//2) == 1, np.ones(N_train//2) == 0]
86 |     X_test, y_test = np.r_[data[N_train//2:], data2[N_train//2:]], np.r_[np.ones(N_test//2) == 1, np.ones(N_test//2) == 0]
87 | 
88 |     # The first variable is the one we want to be independent of our classifier output
89 |     prior = Prior(X_train[y_train == 1, 0], X_train[y_train == 0, 0])
90 |     p_prior = prior.get_prior(X_test[:, 0])
91 |     evaluation("Prior", X_test, y_test, p_prior, p_prior)
92 | 
93 |     evaluation("Random", X_test, y_test, np.random.uniform(size=N_test), p_prior)
94 | 
95 |     for i in [0.0, 1.0, 2.0, 5.0, 10.0, 20.0, 50.0, 100.0, 200.0, 500.0, 1000.0]:
96 |         p = FastBDT.Classifier(flatnessLoss=i, numberOfFlatnessFeatures=1).fit(X=np.c_[X_train[:, 1:], X_train[:, 0]], y=y_train).predict(X_test[:, 1:])
97 |         print("Flatness", i)
98 |         evaluation("UBoost", X_test, y_test, p, p_prior)
99 | 
100 |     p = FastBDT.Classifier().fit(X=X_train, y=y_train).predict(X_test)
101 |     evaluation("Full", X_test, y_test, p, p_prior)
102 | 
103 |     p = FastBDT.Classifier().fit(X=X_train[:, 1:], y=y_train).predict(X_test[:, 1:])
104 |     evaluation("Restricted", X_test, y_test, p, p_prior)
105 | 
106 | 
-------------------------------------------------------------------------------- /src/FastBDT_IO.cxx: --------------------------------------------------------------------------------
1 | /**
2 |  * Thomas Keck 2014
3 |  */
4 | 
5 | #include "FastBDT_IO.h"
6 | 
7 | #include <sstream>
8 | #include <limits>
9 | 
10 | namespace FastBDT {
11 | 
12 |   float convert_to_float_safely(std::string &input) {
13 |     float result = 0;
14 |     try {
15 |       // stof handles infinity and nan correctly but fails
16 |       // for denormalized values
17 |       result = std::stof(input);
18 |     } catch(...) {
19 |       // stringstream fails for nan and infinity but
20 |       // handles denormalized values correctly.
21 |       std::stringstream stream;
22 |       stream << input;
23 |       stream >> result;
24 |     }
25 |     return result;
26 |   }
27 | 
28 |   double convert_to_double_safely(std::string &input) {
29 |     double result = 0;
30 |     try {
31 |       // stod handles infinity and nan correctly but fails
32 |       // for denormalized values
33 |       result = std::stod(input);
34 |     } catch(...) {
35 |       // stringstream fails for nan and infinity but
36 |       // handles denormalized values correctly.
37 |       std::stringstream stream;
38 |       stream << input;
39 |       stream >> result;
40 |     }
41 |     return result;
42 |   }
43 | 
44 |   template<>
45 |   std::ostream& operator<<(std::ostream& stream, const std::vector<float> &vector) {
46 |     stream << vector.size();
47 |     stream.precision(std::numeric_limits<float>::max_digits10);
48 |     stream << std::scientific;
49 |     for(const auto &value : vector) {
50 |       stream << " " << value;
51 |     }
52 |     stream.precision(6);
53 |     stream << std::endl;
54 |     return stream;
55 |   }
56 | 
57 |   template<>
58 |   std::ostream& operator<<(std::ostream& stream, const std::vector<double> &vector) {
59 |     stream << vector.size();
60 |     stream.precision(std::numeric_limits<double>::max_digits10);
61 |     stream << std::scientific;
62 |     for(const auto &value : vector) {
63 |       stream << " " << value;
64 |     }
65 |     stream.precision(6);
66 |     stream << std::endl;
67 |     return stream;
68 |   }
69 | 
70 |   template<>
71 |   std::istream& operator>>(std::istream& stream, std::vector<float> &vector) {
72 |     unsigned int size;
73 |     stream >> size;
74 |     vector.resize(size);
75 |     for(unsigned int i = 0; i < size; ++i) {
76 |       std::string temp;
77 |       stream >> temp;
78 |       vector[i] = convert_to_float_safely(temp);
79 |     }
80 |     return stream;
81 |   }
82 | 
83 |   template<>
84 |   std::istream& operator>>(std::istream& stream, std::vector<double> &vector) {
85 |     unsigned int size;
86 |     stream >> size;
87 |     vector.resize(size);
88 |     for(unsigned int i = 0; i < size; ++i) {
89 |       std::string temp;
90 |       stream >> temp;
91 |       vector[i] = convert_to_double_safely(temp);
92 |     }
93 |     return stream;
94 |   }
95 | 
96 |   /**
97 |    * This function reads a Cut from an std::istream
98 |    * @param stream an std::istream reference
99 |    * @param cut containing read data
100 |    */
101 |   template<>
102 |   std::istream& operator>>(std::istream& stream, Cut<float> &cut) {
103 |     stream >> cut.feature;
104 | 
105 |     // Unfortunately we have to use our own conversion here to correctly parse NaN and Infinity
106 |     // because usually istream::operator>> doesn't do this!
107 |     std::string index_string;
108 |     stream >> index_string;
109 |     cut.index = convert_to_float_safely(index_string);
110 |     stream >> cut.valid;
111 |     stream >> cut.gain;
112 |     return stream;
113 |   }
114 | 
115 |   /**
116 |    * This function reads a Cut from an std::istream
117 |    * @param stream an std::istream reference
118 |    * @param cut containing read data
119 |    */
120 |   template<>
121 |   std::istream& operator>>(std::istream& stream, Cut<double> &cut) {
122 |     stream >> cut.feature;
123 | 
124 |     // Unfortunately we have to use our own conversion here to correctly parse NaN and Infinity
125 |     // because usually istream::operator>> doesn't do this!
126 |     std::string index_string;
127 |     stream >> index_string;
128 |     cut.index = convert_to_double_safely(index_string);
129 |     stream >> cut.valid;
130 |     stream >> cut.gain;
131 |     return stream;
132 |   }
133 | 
134 |   std::ostream& operator<<(std::ostream& stream, const PurityTransformation &purityTransformation) {
135 |     stream << purityTransformation.GetMapping() << std::endl;
136 |     return stream;
137 |   }
138 | 
139 |   std::istream& operator>>(std::istream& stream, PurityTransformation &purityTransformation) {
140 | 
141 |     std::vector<unsigned int> mapping;
142 |     stream >> mapping;
143 |     purityTransformation.SetMapping(mapping);
144 |     return stream;
145 |   }
146 | 
147 | }
148 | 
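The two converters above exist because the weightfile stores floating-point values as text: std::stof/std::stod parse nan and infinity but throw on denormalized values, while a stringstream accepts denormals but not nan/infinity. Together with the max_digits10 precision used by the writers, the intent is an exact text round-trip. A Python analogue of that contract, purely for illustration (not part of the library):

    # repr() plays the role of writing with max_digits10 precision,
    # float() the role of the safe converters above.
    for value in [0.1, 5e-324, float('inf'), float('-inf'), float('nan')]:
        text = repr(value)
        back = float(text)
        assert back == value or (back != back and value != value)  # nan != nan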
-------------------------------------------------------------------------------- /include/Classifier.h: --------------------------------------------------------------------------------
1 | /*
2 |  * Thomas Keck 2017
3 |  *
4 |  * Simplified sklearn interface
5 |  */
6 | 
7 | #pragma once
8 | 
9 | #include "FastBDT.h"
10 | #include "FastBDT_IO.h"
11 | 
12 | #include <map>
13 | 
14 | namespace FastBDT {
15 |   class Classifier {
16 | 
17 |     public:
18 |       /*
19 |        * Explicitly activate default/copy constructor and assign operator.
20 |        * This was a request of a user.
21 |        */
22 |       Classifier() = default;
23 |       Classifier(const Classifier&) = default;
24 |       Classifier& operator=(const Classifier &) = default;
25 | 
26 |       Classifier(std::istream& stream) {
27 | 
28 |         stream >> m_version;
29 |         stream >> m_nTrees;
30 |         stream >> m_depth;
31 |         stream >> m_binning;
32 |         stream >> m_shrinkage;
33 |         stream >> m_subsample;
34 |         stream >> m_sPlot;
35 |         stream >> m_flatnessLoss;
36 |         stream >> m_purityTransformation;
37 |         stream >> m_transform2probability;
38 |         stream >> m_featureBinning;
39 |         stream >> m_purityBinning;
40 |         stream >> m_numberOfFeatures;
41 |         stream >> m_numberOfFinalFeatures;
42 |         stream >> m_numberOfFlatnessFeatures;
43 |         stream >> m_can_use_fast_forest;
44 |         m_fast_forest = readForestFromStream<float>(stream);
45 |         m_binned_forest = readForestFromStream<unsigned int>(stream);
46 | 
47 |       }
48 | 
49 |       friend std::ostream& operator<<(std::ostream& stream, const Classifier& classifier);
50 | 
51 |       Classifier(unsigned int nTrees, unsigned int depth, std::vector<unsigned int> binning, double shrinkage = 0.1, double subsample = 1.0, bool sPlot = false, double flatnessLoss = -1.0, std::vector<bool> purityTransformation = {}, unsigned int numberOfFlatnessFeatures=0, bool transform2probability=true) :
52 |         m_nTrees(nTrees), m_depth(depth), m_binning(binning), m_shrinkage(shrinkage), m_subsample(subsample), m_sPlot(sPlot), m_flatnessLoss(flatnessLoss), m_purityTransformation(purityTransformation), m_numberOfFlatnessFeatures(numberOfFlatnessFeatures), m_transform2probability(transform2probability), m_can_use_fast_forest(true) { }
53 | 
54 |       void Print();
55 | 
56 |       unsigned int GetNTrees() const { return m_nTrees; }
57 |       void SetNTrees(unsigned int nTrees) { m_nTrees = nTrees; }
58 | 
59 |       unsigned int GetDepth() const { return m_depth; }
60 |       void SetDepth(unsigned int depth) { m_depth = depth; }
61 | 
62 |       unsigned int GetNumberOfFlatnessFeatures() const { return m_numberOfFlatnessFeatures; }
63 |       void SetNumberOfFlatnessFeatures(unsigned int numberOfFlatnessFeatures) { m_numberOfFlatnessFeatures = numberOfFlatnessFeatures; }
64 | 
65 |       unsigned int GetNFeatures() const { return m_numberOfFeatures; }
66 | 
67 |       double GetShrinkage() const { return m_shrinkage; }
68 |       void SetShrinkage(double shrinkage) { m_shrinkage = shrinkage; }
69 | 
70 |       double GetSubsample() const { return m_subsample; }
71 |       void SetSubsample(double subsample) { m_subsample = subsample; }
72 | 
73 |       bool GetSPlot() const { return m_sPlot; }
74 |       void SetSPlot(bool sPlot) { m_sPlot = sPlot; }
75 | 
76 |       bool GetTransform2Probability() const { return m_transform2probability; }
77 |       void SetTransform2Probability(bool transform2probability) { m_transform2probability = transform2probability; }
78 | 
79 |       std::vector<unsigned int> GetBinning() const { return m_binning; }
80 |       void SetBinning(std::vector<unsigned int> binning) { m_binning = binning; }
81 | 
82 |       std::vector<bool> GetPurityTransformation() const { return m_purityTransformation; }
83 |       void SetPurityTransformation(std::vector<bool> purityTransformation) { m_purityTransformation = purityTransformation; }
84 | 
85 |       double GetFlatnessLoss() const { return m_flatnessLoss; }
86 |       void SetFlatnessLoss(double flatnessLoss) { m_flatnessLoss = flatnessLoss; }
87 | 
88 |       void fit(const std::vector<std::vector<float>> &X, const std::vector<bool> &y, const std::vector<float> &w);
89 | 
90 |       float predict(const std::vector<float> &X) const;
91 | 
92 |       std::map<unsigned int, double> GetVariableRanking() const;
93 | 
94 |       std::map<unsigned int, double> GetIndividualVariableRanking(const std::vector<float> &X) const;
95 | 
96 |       std::map<unsigned int, unsigned int> GetFeatureMapping() const;
97 | 
98 |       std::map<unsigned int, double> MapRankingToOriginalFeatures(std::map<unsigned int, double> ranking) const;
99 | 
100 |     private:
101 |       unsigned int m_version = 1;
102 |       unsigned int m_nTrees = 100;
103 |       unsigned int m_depth = 3;
104 |       std::vector<unsigned int> m_binning;
105 |       double m_shrinkage = 0.1;
106 |       double m_subsample = 0.5;
107 |       bool m_sPlot = true;
108 |       double m_flatnessLoss = -1;
109 |       std::vector<bool> m_purityTransformation;
110 |       unsigned int m_numberOfFlatnessFeatures = 0;
111 |       bool m_transform2probability = true;
112 |       unsigned int m_numberOfFeatures = 0;
113 |       unsigned int m_numberOfFinalFeatures = 0;
114 |       std::vector<FeatureBinning<float>> m_featureBinning;
115 |       std::vector<PurityTransformation> m_purityBinning;
116 | 
117 |       bool m_can_use_fast_forest = true;
118 |       Forest<float> m_fast_forest;
119 |       Forest<unsigned int> m_binned_forest;
120 | 
121 |   };
122 | 
123 |   std::ostream& operator<<(std::ostream& stream, const Classifier& classifier);
124 | 
125 | }
126 | 
-------------------------------------------------------------------------------- /files/iris.txt: --------------------------------------------------------------------------------
1 | SepalLength SepalWidth PetalLength PetalWidth Class
2 | 5.1 3.5 1.4 0.1 0
3 | 4.9 3.0 1.4 0.2 0
4 | 4.7 3.2 1.3 0.2 0
5 | 4.6 3.1 1.5 0.2 0
6 | 5.0 3.6 1.4 0.2 0
7 | 5.4 3.9 1.7 0.4 0
8 | 4.6 3.4 1.4 0.3 0
9 | 5.0 3.4 1.5 0.2 0
10 | 4.4 2.9 1.4 0.2 0
11 | 4.9 3.1 1.5 0.1 0
12 | 5.4 3.7 1.5 0.2 0
13 | 4.8 3.4 1.6 0.2 0
14 | 4.8 3.0 1.4 0.1 0
15 | 4.3 3.0 1.1 0.1 0
16 | 5.8 4.0 1.2 0.2 0
17 | 5.7 4.4 1.5 0.4 0
18 | 5.4 3.9 1.3 0.4 0
19 | 5.1 3.5 1.4 0.3 0
20 | 5.7 3.8 1.7 0.3 0
21 | 5.1 3.8 1.5 0.3 0
22 | 5.4 3.4 1.7 0.2 0
23 | 5.1 3.7 1.5 0.4 0
24 | 4.6 3.6 1.0 0.2 0
25 | 5.1 3.3 1.7 0.5 0
26 | 4.8 3.4 1.9 0.2 0
27 | 5.0 3.0 1.6 0.2 0
28 | 5.0 3.4 1.6 0.4 0
29 | 5.2 3.5 1.5 0.2 0
30 | 5.2 3.4 1.4 0.2 0
31 | 4.7 3.2 1.6 0.2 0
32 | 4.8 3.1 1.6 0.2 0
33 | 5.4 3.4 1.5 0.4 0
34 | 5.2 4.1 1.5 0.1 0
35 | 5.5 4.2 1.4 0.2 0
36 | 4.9 3.1 1.5 0.2 0
37 | 5.0 3.2 1.2 0.2 0
38 | 5.5 3.5 1.3 0.2 0
39 | 4.9 3.6 1.4 0.1 0
40 | 4.4 3.0 1.3 0.2 0
41 | 5.1 3.4 1.5 0.2 0
42 | 5.0 3.5 1.3 0.3 0
43 | 4.5 2.3 1.3 0.3 0
44 | 4.4 3.2 1.3 0.2 0
45 | 5.0 3.5 1.6 0.6 0
46 | 5.1 3.8 1.9 0.4 0
47 | 4.8 3.0 1.4 0.3 0
48 | 5.1 3.8 1.6 0.2 0
49 | 4.6 3.2 1.4 0.2 0
50 | 5.3 3.7 1.5 0.2 0
51 | 5.0 3.3 1.4 0.2 0
52 | 7.0 3.2 4.7 1.4 2
53 | 6.4 3.2 4.5 1.5 2
54 | 6.9 3.1 4.9 1.5 2
55 | 5.5 2.3 4.0 1.3 2
56 | 6.5 2.8 4.6 1.5 2
57 | 5.7 2.8 4.5 1.3 2
58 | 6.3 3.3 4.7 1.6 2
59 | 4.9 2.4 3.3 1.0 2
60 | 6.6 2.9 4.6 1.3 2
61 | 5.2 2.7 3.9 1.4 2
62 | 5.0 2.0 3.5 1.0 2
63 | 5.9 3.0 4.2 1.5 2
64 | 6.0 2.2 4.0 1.0 2
65 | 6.1 2.9 4.7 1.4 2
66 | 5.6 2.9 3.6 1.3 2
67 | 6.7 3.1 4.4 1.4 2
68 | 5.6 3.0 4.5 1.5 2
69 | 5.8 2.7 4.1 1.0 2
70 | 6.2 2.2 4.5 1.5 2
71 | 5.6 2.5 3.9 1.1 2
72 | 5.9 3.2 4.8 1.8 2
73 | 6.1 2.8 4.0 1.3 2
74 | 6.3 2.5 4.9 1.5 2
75 | 6.1 2.8 4.7 1.2 2
76 | 6.4 2.9 4.3 1.3 2
77 | 6.6 3.0 4.4 1.4 2
78 | 6.8 2.8 4.8 1.4 2
79 | 6.7 3.0 5.0 1.7 2
80 | 6.0 2.9 4.5 1.5 2
81 | 5.7 2.6 3.5 1.0 2
82 | 5.5 2.4 3.8 1.1 2
83 | 5.5 2.4 3.7 1.0 2
84 | 5.8 2.7 3.9 1.2 2
85 | 6.0 2.7 5.1 1.6 2
86 | 5.4 3.0 4.5 1.5 2
87 | 6.0 3.4 4.5 1.6 2
88 | 6.7 3.1 4.7 1.5 2
89 | 6.3 2.3 4.4 1.3 2
90 | 5.6 3.0 4.1 1.3 2
91 | 5.5 2.5 4.0 1.3 2
92 | 5.5 2.6 4.4 1.2 2
93 | 6.1 3.0 4.6 1.4 2
94 | 5.8 2.6 4.0 1.2 2
95 | 5.0 2.3 3.3 1.0 2
96 | 5.6 2.7 4.2 1.3 2
97 | 5.7 3.0 4.2 1.2 2
98 | 5.7 2.9 4.2 1.3 2
99 | 6.2 2.9 4.3 1.3 2
100 | 5.1 2.5 3.0 1.1 2
101 | 5.7 2.8 4.1 1.3 2
102 | 6.3 3.3 6.0 2.5 1
103 | 5.8 2.7 5.1 1.9 1
104 | 7.1 3.0 5.9 2.1 1
105 | 6.3 2.9 5.6 1.8 1
106 | 6.5 3.0 5.8 2.2 1
107 | 7.6 3.0 6.6 2.1 1
108 | 4.9 2.5 4.5 1.7 1
109 | 7.3 2.9 6.3 1.8 1
110 | 6.7 2.5 5.8 1.8 1
111 | 7.2 3.6 6.1 2.5 1
112 | 6.5 3.2 5.1 2.0 1
113 | 6.4 2.7 5.3 1.9 1
114 | 6.8 3.0 5.5 2.1 1
115 | 5.7 2.5 5.0 2.0 1
116 | 5.8 2.8 5.1 2.4 1
117 | 6.4 3.2 5.3 2.3 1
118 | 6.5 3.0 5.5 1.8 1
119 | 7.7 3.8 6.7 2.2 1
120 | 7.7 2.6 6.9 2.3 1
121 | 6.0 2.2 5.0 1.5 1
122 | 6.9 3.2 5.7 2.3 1
123 | 5.6 2.8 4.9 2.0 1
124 | 7.7 2.8 6.7 2.0 1
125 | 6.3 2.7 4.9 1.8 1
126 | 6.7 3.3 5.7 2.1 1
127 | 7.2 3.2 6.0 1.8 1
128 | 6.2 2.8 4.8 1.8 1
129 | 6.1 3.0 4.9 1.8 1
130 | 6.4 2.8 5.6 2.1 1
131 | 7.2 3.0 5.8 1.6 1
132 | 7.4 2.8 6.1 1.9 1
133 | 7.9 3.8 6.4 2.0 1
134 | 6.4 2.8 5.6 2.2 1
135 | 6.3 2.8 5.1 1.5 1
136 | 6.1 2.6 5.6 1.4 1
137 | 7.7 3.0 6.1 2.3 1
138 | 6.3 3.4 5.6 2.4 1
139 | 6.4 3.1 5.5 1.8 1
140 | 6.0 3.0 4.8 1.8 1
141 | 6.9 3.1 5.4 2.1 1
142 | 6.7 3.1 5.6 2.4 1
143 | 6.9 3.1 5.1 2.3 1
144 | 5.8 2.7 5.1 1.9 1
145 | 6.8 3.2 5.9 2.3 1
146 | 6.7 3.3 5.7 2.5 1
147 | 6.7 3.0 5.2 2.3 1
148 | 6.3 2.5 5.0 1.9 1
149 | 6.5 3.0 5.2 2.0 1
150 | 6.2 3.4 5.4 2.3 1
151 | 5.9 3.0 5.1 1.8 1
152 | 
-------------------------------------------------------------------------------- /examples/CPPExample.cxx: --------------------------------------------------------------------------------
1 | /**
2 |  * Thomas Keck 2017
3 |  */
4 | 
5 | #include "Classifier.h"
6 | #include <iostream>
7 | #include <fstream>
8 | #include <vector>
9 | 
10 | std::vector<std::vector<float>> GetIrisX() {
11 |     std::vector<std::vector<float>> X = {{5.1,4.9,4.7,4.6,5.0,5.4,4.6,5.0,4.4,4.9,5.4,4.8,4.8,4.3,5.8,5.7,5.4,5.1,5.7,5.1,5.4,5.1,4.6,5.1,4.8,5.0,5.0,5.2,5.2,4.7,4.8,5.4,5.2,5.5,4.9,5.0,5.5,4.9,4.4,5.1,5.0,4.5,4.4,5.0,5.1,4.8,5.1,4.6,5.3,5.0,7.0,6.4,6.9,5.5,6.5,5.7,6.3,4.9,6.6,5.2,5.0,5.9,6.0,6.1,5.6,6.7,5.6,5.8,6.2,5.6,5.9,6.1,6.3,6.1,6.4,6.6,6.8,6.7,6.0,5.7,5.5,5.5,5.8,6.0,5.4,6.0,6.7,6.3,5.6,5.5,5.5,6.1,5.8,5.0,5.6,5.7,5.7,6.2,5.1,5.7,6.3,5.8,7.1,6.3,6.5,7.6,4.9,7.3,6.7,7.2,6.5,6.4,6.8,5.7,5.8,6.4,6.5,7.7,7.7,6.0,6.9,5.6,7.7,6.3,6.7,7.2,6.2,6.1,6.4,7.2,7.4,7.9,6.4,6.3,6.1,7.7,6.3,6.4,6.0,6.9,6.7,6.9,5.8,6.8,6.7,6.7,6.3,6.5,6.2,5.9}, {3.5,3.0,3.2,3.1,3.6,3.9,3.4,3.4,2.9,3.1,3.7,3.4,3.0,3.0,4.0,4.4,3.9,3.5,3.8,3.8,3.4,3.7,3.6,3.3,3.4,3.0,3.4,3.5,3.4,3.2,3.1,3.4,4.1,4.2,3.1,3.2,3.5,3.1,3.0,3.4,3.5,2.3,3.2,3.5,3.8,3.0,3.8,3.2,3.7,3.3,3.2,3.2,3.1,2.3,2.8,2.8,3.3,2.4,2.9,2.7,2.0,3.0,2.2,2.9,2.9,3.1,3.0,2.7,2.2,2.5,3.2,2.8,2.5,2.8,2.9,3.0,2.8,3.0,2.9,2.6,2.4,2.4,2.7,2.7,3.0,3.4,3.1,2.3,3.0,2.5,2.6,3.0,2.6,2.3,2.7,3.0,2.9,2.9,2.5,2.8,3.3,2.7,3.0,2.9,3.0,3.0,2.5,2.9,2.5,3.6,3.2,2.7,3.0,2.5,2.8,3.2,3.0,3.8,2.6,2.2,3.2,2.8,2.8,2.7,3.3,3.2,2.8,3.0,2.8,3.0,2.8,3.8,2.8,2.8,2.6,3.0,3.4,3.1,3.0,3.1,3.1,3.1,2.7,3.2,3.3,3.0,2.5,3.0,3.4,3.0}, {1.4,1.4,1.3,1.5,1.4,1.7,1.4,1.5,1.4,1.5,1.5,1.6,1.4,1.1,1.2,1.5,1.3,1.4,1.7,1.5,1.7,1.5,1.0,1.7,1.9,1.6,1.6,1.5,1.4,1.6,1.6,1.5,1.5,1.4,1.5,1.2,1.3,1.5,1.3,1.5,1.3,1.3,1.3,1.6,1.9,1.4,1.6,1.4,1.5,1.4,4.7,4.5,4.9,4.0,4.6,4.5,4.7,3.3,4.6,3.9,3.5,4.2,4.0,4.7,3.6,4.4,4.5,4.1,4.5,3.9,4.8,4.0,4.9,4.7,4.3,4.4,4.8,5.0,4.5,3.5,3.8,3.7,3.9,5.1,4.5,4.5,4.7,4.4,4.1,4.0,4.4,4.6,4.0,3.3,4.2,4.2,4.2,4.3,3.0,4.1,6.0,5.1,5.9,5.6,5.8,6.6,4.5,6.3,5.8,6.1,5.1,5.3,5.5,5.0,5.1,5.3,5.5,6.7,6.9,5.0,5.7,4.9,6.7,4.9,5.7,6.0,4.8,4.9,5.6,5.8,6.1,6.4,5.6,5.1,5.6,6.1,5.6,5.5,4.8,5.4,5.6,5.1,5.1,5.9,5.7,5.2,5.0,5.2,5.4,5.1}, {0.2,0.2,0.2,0.2,0.2,0.4,0.3,0.2,0.2,0.1,0.2,0.2,0.1,0.1,0.2,0.4,0.4,0.3,0.3,0.3,0.2,0.4,0.2,0.5,0.2,0.2,0.4,0.2,0.2,0.2,0.2,0.4,0.1,0.2,0.1,0.2,0.2,0.1,0.2,0.2,0.3,0.3,0.2,0.6,0.4,0.3,0.2,0.2,0.2,0.2,1.4,1.5,1.5,1.3,1.5,1.3,1.6,1.0,1.3,1.4,1.0,1.5,1.0,1.4,1.3,1.4,1.5,1.0,1.5,1.1,1.8,1.3,1.5,1.2,1.3,1.4,1.4,1.7,1.5,1.0,1.1,1.0,1.2,1.6,1.5,1.6,1.5,1.3,1.3,1.3,1.2,1.4,1.2,1.0,1.3,1.2,1.3,1.3,1.1,1.3,2.5,1.9,2.1,1.8,2.2,2.1,1.7,1.8,1.8,2.5,2.0,1.9,2.1,2.0,2.4,2.3,1.8,2.2,2.3,1.5,2.3,2.0,2.0,1.8,2.1,1.8,1.8,1.8,2.1,1.6,1.9,2.0,2.2,1.5,1.4,2.3,2.4,1.8,1.8,2.1,2.4,2.3,1.9,2.3,2.5,2.3,1.9,2.0,2.3,1.8} };
12 |     return X;
13 | }
14 | 
15 | std::vector<bool> GetIrisY() {
16 |     std::vector<bool> y = {false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true};
17 |     return y;
18 | }
19 | 
20 | std::vector<float> GetIrisW() {
21 |     std::vector<float> w = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0};
22 |     return w;
23 | }
24 | 
25 | 
26 | float GetIrisScore(const FastBDT::Classifier &classifier) {
27 |     auto X = GetIrisX();
28 |     auto y = GetIrisY();
29 |     float sum = 0;
30 |     for(unsigned int i = 0; i < y.size(); ++i) {
31 |         float p = classifier.predict({X[0][i], X[1][i], X[2][i], X[3][i]});
32 |         sum += (static_cast<float>(y[i])-p)*(static_cast<float>(y[i])-p);
33 |     }
34 |     return sum;
35 | }
36 | 
37 | int main() {
38 | 
39 |     FastBDT::Classifier classifier;
40 |     // Most of the parameters have default values and
41 |     // you don't have to set them.
42 |     classifier.SetBinning({5, 5, 5, 5}); // 2^5 bins for each feature, default is 2^8 bins per feature
43 |     classifier.SetNTrees(10); // default is 100
44 |     classifier.SetDepth(3); // default is 3
45 |     classifier.SetShrinkage(0.1); // default is 0.1
46 |     classifier.SetSubsample(0.5); // default is 0.5
47 |     classifier.SetSPlot(false); // default is false
48 |     classifier.SetPurityTransformation({false, false, false, false}); // Do not use the purity transformation for the features, default is false as well
49 |     classifier.SetNumberOfFlatnessFeatures(0); // We do not use uniform boosting here (default is 0 as well)
50 |     classifier.SetFlatnessLoss(-1); // We do not use uniform boosting here (default is -1 as well)
51 |     classifier.SetTransform2Probability(true); // Transform output to probability (default is true)
52 | 
53 |     classifier.fit(GetIrisX(), GetIrisY(), GetIrisW());
54 | 
55 |     std::cout << "Score " << GetIrisScore(classifier) << std::endl;
56 | 
57 |     std::fstream out_stream("unittest.weightfile", std::ios_base::out | std::ios_base::trunc);
58 |     out_stream << classifier << std::endl;
59 |     out_stream.close();
60 | 
61 |     classifier.Print();
62 | 
63 |     std::fstream in_stream("unittest.weightfile", std::ios_base::in);
64 |     FastBDT::Classifier classifier2(in_stream);
65 | 
66 |     std::cout << "Score " << GetIrisScore(classifier2) << std::endl;
67 | 
68 |     classifier2.Print();
69 | 
70 |     return 0;
71 | }
72 | 
-------------------------------------------------------------------------------- /src/test_FastBDT_C_API.cxx: --------------------------------------------------------------------------------
1 | /**
2 |  * Thomas Keck 2015
3 |  */
4 | 
5 | #include "FastBDT_C_API.h"
6 | 
7 | #include <gtest/gtest.h>
8 | 
9 | class CInterfaceTest : public ::testing::Test {
10 |     protected:
11 |         virtual void SetUp() {
12 |             expertise = static_cast<Expertise*>(Create());
13 |         }
14 | 
15 |         virtual void TearDown() {
16 |             Delete(expertise);
17 |         }
18 | 
19 |         Expertise *expertise;
20 | 
21 | };
22 | 
23 | TEST_F(CInterfaceTest, SetGetBinning ) {
24 | 
25 |     unsigned int binning[] = {10u, 20u};
26 |     SetBinning(expertise, binning, 2);
27 |     EXPECT_EQ(expertise->classifier.GetBinning().size(), 2u);
28 |     EXPECT_EQ(expertise->classifier.GetBinning()[0], 10u);
29 |     EXPECT_EQ(expertise->classifier.GetBinning()[1], 20u);
30 | 
31 | }
32 | 
33 | TEST_F(CInterfaceTest, SetGetPurityTransformation ) {
34 | 
35 |     bool purityTransformation[] = {true, false};
36 |     SetPurityTransformation(expertise, purityTransformation, 2);
37 |     EXPECT_EQ(expertise->classifier.GetPurityTransformation().size(), 2u);
38 |     EXPECT_EQ(expertise->classifier.GetPurityTransformation()[0], true);
39 |     EXPECT_EQ(expertise->classifier.GetPurityTransformation()[1], false);
40 | 
41 | }
42 | 
43 | TEST_F(CInterfaceTest, SetGetNTrees ) {
44 | 
45 |     SetNTrees(expertise, 200u);
46 |     EXPECT_EQ(expertise->classifier.GetNTrees(), 200u);
47 | 
48 | }
49 | 
50 | TEST_F(CInterfaceTest, SetGetSPlot ) {
51 | 
52 |     SetSPlot(expertise, false);
53 |     EXPECT_EQ(expertise->classifier.GetSPlot(), false);
54 |     SetSPlot(expertise, true);
55 |     EXPECT_EQ(expertise->classifier.GetSPlot(), true);
56 | 
57 | }
58 | 
59 | TEST_F(CInterfaceTest, SetGetTransform2Probability ) {
60 | 
61 |     SetTransform2Probability(expertise, false);
62 |     EXPECT_EQ(expertise->classifier.GetTransform2Probability(), false);
63 |     SetTransform2Probability(expertise, true);
64 |     EXPECT_EQ(expertise->classifier.GetTransform2Probability(), true);
65 | 
66 | }
67 | 
68 | TEST_F(CInterfaceTest, SetGetDepth ) {
69 | 
70 |     SetDepth(expertise, 5u);
71 |     EXPECT_EQ(expertise->classifier.GetDepth(), 5u);
72 |     SetDepth(expertise, 2u);
73 |     EXPECT_EQ(expertise->classifier.GetDepth(), 2u);
74 | 
75 | }
76 | 
77 | TEST_F(CInterfaceTest, SetGetFlatnessLossWorks ) {
78 | 
79 |     SetFlatnessLoss(expertise, 0.2);
80 |     EXPECT_DOUBLE_EQ(expertise->classifier.GetFlatnessLoss(), 0.2);
81 |     SetFlatnessLoss(expertise, 0.4);
82 |     EXPECT_DOUBLE_EQ(expertise->classifier.GetFlatnessLoss(), 0.4);
83 | 
84 | }
85 | 
86 | TEST_F(CInterfaceTest, SetGetShrinkageWorks ) {
87 | 
88 |     SetShrinkage(expertise, 0.2);
89 |     EXPECT_DOUBLE_EQ(expertise->classifier.GetShrinkage(), 0.2);
90 |     SetShrinkage(expertise, 0.4);
91 |     EXPECT_DOUBLE_EQ(expertise->classifier.GetShrinkage(), 0.4);
92 | 
93 | }
94 | 
95 | 
96 | TEST_F(CInterfaceTest, SetSubsampleWorks ) {
97 | 
98 |     SetSubsample(expertise, 0.6);
99 |     EXPECT_DOUBLE_EQ(expertise->classifier.GetSubsample(), 0.6);
100 |     SetSubsample(expertise, 0.8);
101 |     EXPECT_DOUBLE_EQ(expertise->classifier.GetSubsample(), 0.8);
102 | 
103 | }
104 | 
105 | 
106 | TEST_F(CInterfaceTest, FitAndPredictWorksWithoutWeights ) {
107 | 
108 |     // Use just one branch instead of a whole forest for testing
109 |     // We only test if the ForestBuilder is called correctly,
110 |     // the builder itself is tested elsewhere.
111 |     SetNTrees(expertise, 10u);
112 |     SetDepth(expertise, 1u);
113 |     SetSubsample(expertise, 1.0);
114 |     SetShrinkage(expertise, 1.0);
115 |     unsigned int binning[] = {2u, 2u};
116 |     SetBinning(expertise, binning, 2);
117 |     SetTransform2Probability(expertise, true);
118 |     SetNumberOfFlatnessFeatures(expertise, 0);
119 | 
120 |     float data_ptr[] = {1.0, 2.6, 1.6, 2.5, 1.1, 2.0, 1.9, 2.1, 1.6, 2.9, 1.9, 2.9, 1.5, 2.0};
121 |     bool target_ptr[] = {0, 1, 0, 1, 1, 1, 0};
122 |     Fit(expertise, data_ptr, nullptr, target_ptr, 7, 2);
123 | 
124 |     float test_ptr[] = {1.0, 2.6};
125 |     EXPECT_LE(Predict(expertise, test_ptr), 0.01);
126 | 
127 |     float test_ptr2[] = {1.6, 2.5};
128 |     EXPECT_GE(Predict(expertise, test_ptr2), 0.99);
129 | }
130 | 
131 | 
132 | TEST_F(CInterfaceTest, TrainAndAnalyseForestWorksWithSpectators ) {
133 | 
134 |     // Use just one branch instead of a whole forest for testing
135 |     // We only test if the ForestBuilder is called correctly,
136 |     // the builder itself is tested elsewhere.
137 |     SetNTrees(expertise, 10u);
138 |     SetDepth(expertise, 1u);
139 |     SetSubsample(expertise, 1.0);
140 |     SetShrinkage(expertise, 1.0);
141 |     unsigned int binning[] = {2u, 2u, 2u, 3u};
142 |     SetBinning(expertise, binning, 4);
143 |     SetTransform2Probability(expertise, true);
144 |     SetNumberOfFlatnessFeatures(expertise, 2);
145 | 
146 |     float data_ptr[] = {1.0, 2.6, 0.0, -10.0,
147 |                         1.6, 2.5, 99.0, 0.0,
148 |                         1.1, 2.0, -500.0, 12.1,
149 |                         1.9, 2.1, 0.0, 0.0,
150 |                         1.6, 2.9, 23.0, 42.0,
151 |                         1.9, 2.9, 0.0, 1.0,
152 |                         1.5, 2.0, 1.0, -1.0};
153 |     bool target_ptr[] = {0, 1, 0, 1, 1, 1, 0};
154 |     Fit(expertise, data_ptr, nullptr, target_ptr, 7, 4);
155 | 
156 |     float test_ptr[] = {1.0, 2.6};
157 |     EXPECT_LE(Predict(expertise, test_ptr), 0.03);
158 | }
159 | 
160 | TEST_F(CInterfaceTest, TrainAndAnalyseForestWorksWithWeights ) {
161 | 
162 |     // Use just one branch instead of a whole forest for testing
163 |     // We only test if the ForestBuilder is called correctly,
164 |     // the builder itself is tested elsewhere.
165 |     SetNTrees(expertise, 10u);
166 |     SetDepth(expertise, 1u);
167 |     SetSubsample(expertise, 1.0);
168 |     SetShrinkage(expertise, 1.0);
169 |     unsigned int binning[] = {2u, 2u};
170 |     SetBinning(expertise, binning, 2);
171 |     SetTransform2Probability(expertise, true);
172 |     SetNumberOfFlatnessFeatures(expertise, 0);
173 | 
174 |     float data_ptr[] = {1.0, 2.6, 1.6, 2.5, 1.1, 2.0, 1.9, 2.1, 1.6, 2.9, 1.9, 2.9, 1.5, 2.0};
175 |     float weight_ptr[] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
176 |     bool target_ptr[] = {0, 1, 0, 1, 1, 1, 0};
177 |     Fit(expertise, data_ptr, weight_ptr, target_ptr, 7, 2);
178 | 
179 |     float test_ptr[] = {1.0, 2.6};
180 |     EXPECT_LE(Predict(expertise, test_ptr), 0.01);
181 | 
182 |     float weight_ptr2[] = {2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0};
183 |     Fit(expertise, data_ptr, weight_ptr2, target_ptr, 7, 2);
184 |     EXPECT_LE(Predict(expertise, test_ptr), 0.01);
185 | 
186 |     float weight_ptr3[] = {1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0};
187 |     Fit(expertise, data_ptr, weight_ptr3, target_ptr, 7, 2);
188 |     EXPECT_LE(Predict(expertise, test_ptr), 0.03);
189 | }
-------------------------------------------------------------------------------- /src/FastBDT_C_API.cxx: --------------------------------------------------------------------------------
1 | /**
2 |  * Thomas Keck 2015
3 |  */
4 | 
5 | #include "FastBDT_C_API.h"
6 | 
7 | #include <iostream>
8 | #include <fstream>
9 | #include <new>
10 | 
11 | using namespace FastBDT;
12 | 
13 | extern "C" {
14 | 
15 | void PrintVersion() {
16 |     std::cerr << "FastBDT Version: " << FastBDT_VERSION_MAJOR << "." << FastBDT_VERSION_MINOR << std::endl;
17 | }
18 | 
19 | void* Create() {
20 |     Expertise *expertise = new(std::nothrow) Expertise;
21 |     return expertise;
22 | }
23 | 
24 | void SetBinning(void *ptr, unsigned int* binning, unsigned int size) {
25 |     reinterpret_cast<Expertise*>(ptr)->classifier.SetBinning(std::vector<unsigned int>(binning, binning + size));
26 | }
27 | 
28 | void SetPurityTransformation(void *ptr, bool* purityTransformation, unsigned int size) {
29 |     reinterpret_cast<Expertise*>(ptr)->classifier.SetPurityTransformation(std::vector<bool>(purityTransformation, purityTransformation + size));
30 | }
31 | 
32 | void SetNTrees(void *ptr, unsigned int nTrees) {
33 |     reinterpret_cast<Expertise*>(ptr)->classifier.SetNTrees(nTrees);
34 | }
35 | 
36 | unsigned int GetNTrees(void *ptr) {
37 |     return reinterpret_cast<Expertise*>(ptr)->classifier.GetNTrees();
38 | }
39 | 
40 | void SetDepth(void *ptr, unsigned int depth) {
41 |     reinterpret_cast<Expertise*>(ptr)->classifier.SetDepth(depth);
42 | }
43 | 
44 | unsigned int GetDepth(void *ptr) {
45 |     return reinterpret_cast<Expertise*>(ptr)->classifier.GetDepth();
46 | }
47 | 
48 | void SetNumberOfFlatnessFeatures(void *ptr, unsigned int numberOfFlatnessFeatures) {
49 |     reinterpret_cast<Expertise*>(ptr)->classifier.SetNumberOfFlatnessFeatures(numberOfFlatnessFeatures);
50 | }
51 | 
52 | unsigned int GetNumberOfFlatnessFeatures(void *ptr) {
53 |     return reinterpret_cast<Expertise*>(ptr)->classifier.GetNumberOfFlatnessFeatures();
54 | }
55 | 
56 | void SetSubsample(void *ptr, double subsample) {
57 |     reinterpret_cast<Expertise*>(ptr)->classifier.SetSubsample(subsample);
58 | }
59 | 
60 | double GetSubsample(void *ptr) {
61 |     return reinterpret_cast<Expertise*>(ptr)->classifier.GetSubsample();
62 | }
63 | 
64 | void SetShrinkage(void *ptr, double shrinkage) {
65 |     reinterpret_cast<Expertise*>(ptr)->classifier.SetShrinkage(shrinkage);
66 | }
67 | 
68 | double GetShrinkage(void *ptr) {
69 |     return reinterpret_cast<Expertise*>(ptr)->classifier.GetShrinkage();
70 | }
71 | 
72 | void SetFlatnessLoss(void *ptr, double flatnessLoss) {
73 |     reinterpret_cast<Expertise*>(ptr)->classifier.SetFlatnessLoss(flatnessLoss);
74 | }
75 | 
76 | double GetFlatnessLoss(void *ptr) {
77 |     return reinterpret_cast<Expertise*>(ptr)->classifier.GetFlatnessLoss();
78 | }
79 | 
80 | void SetTransform2Probability(void *ptr, bool transform2probability) {
81 |     reinterpret_cast<Expertise*>(ptr)->classifier.SetTransform2Probability(transform2probability);
82 | }
83 | 
84 | bool GetTransform2Probability(void *ptr) {
85 |     return reinterpret_cast<Expertise*>(ptr)->classifier.GetTransform2Probability();
86 | }
87 | 
88 | void SetSPlot(void *ptr, bool sPlot) {
89 |     reinterpret_cast<Expertise*>(ptr)->classifier.SetSPlot(sPlot);
90 | }
91 | 
92 | bool GetSPlot(void *ptr) {
93 |     return reinterpret_cast<Expertise*>(ptr)->classifier.GetSPlot();
94 | }
95 | 
96 | void Delete(void *ptr) {
97 |     delete reinterpret_cast<Expertise*>(ptr);
98 | }
99 | 
100 | void Fit(void *ptr, float *data_ptr, float *weight_ptr, bool *target_ptr, unsigned int nEvents, unsigned int nFeatures) {
101 |     Expertise *expertise = reinterpret_cast<Expertise*>(ptr);
102 | 
103 |     std::vector<float> w;
104 |     if(weight_ptr != nullptr)
105 |         w = std::vector<float>(weight_ptr, weight_ptr + nEvents);
106 |     else
107 |         w = std::vector<float>(nEvents, 1.0);
108 | 
109 |     std::vector<bool> y(target_ptr, target_ptr + nEvents);
110 |     std::vector<std::vector<float>> X(nFeatures);
111 |     for(unsigned int iFeature = 0; iFeature < nFeatures; ++iFeature) {
112 |         std::vector<float> temp(nEvents);
113 |         for(unsigned int iEvent = 0; iEvent < nEvents; ++iEvent) {
114 |             temp[iEvent] = data_ptr[iEvent*nFeatures + iFeature];
115 |         }
116 |         X[iFeature] = temp;
117 |     }
118 | 
119 |     expertise->classifier.fit(X, y, w);
120 | 
121 | }
122 | 
123 | void Load(void* ptr, char *weightfile) {
124 |     Expertise *expertise = reinterpret_cast<Expertise*>(ptr);
125 | 
126 |     std::fstream file(weightfile, std::ios_base::in);
127 |     if(not file)
128 |         return;
129 | 
130 |     expertise->classifier = FastBDT::Classifier(file);
131 | }
132 | 
133 | float Predict(void *ptr, float *array) {
134 |     Expertise *expertise = reinterpret_cast<Expertise*>(ptr);
135 |     return expertise->classifier.predict(std::vector<float>(array, array + expertise->classifier.GetNFeatures()));
136 | }
137 | 
138 | void PredictArray(void *ptr, float *array, float *result, unsigned int nEvents) {
139 |     Expertise *expertise = reinterpret_cast<Expertise*>(ptr);
140 |     unsigned int nFeatures = expertise->classifier.GetNFeatures();
141 |     for(unsigned int iEvent = 0; iEvent < nEvents; ++iEvent) {
142 |         result[iEvent] = expertise->classifier.predict(std::vector<float>(array + iEvent*nFeatures, array + (iEvent+1)*nFeatures));
143 |     }
144 | }
145 | 
146 | void Save(void* ptr, char *weightfile) {
147 |     Expertise *expertise = reinterpret_cast<Expertise*>(ptr);
148 | 
149 |     std::fstream file(weightfile, std::ios_base::out | std::ios_base::trunc);
150 |     file << expertise->classifier << std::endl;
151 | }
152 | 
153 | void* GetVariableRanking(void* ptr) {
154 |     Expertise *expertise = reinterpret_cast<Expertise*>(ptr);
155 |     VariableRanking *ranking = new(std::nothrow) VariableRanking;
156 |     ranking->ranking = expertise->classifier.GetVariableRanking();
157 |     return ranking;
158 | }
159 | 
160 | void* GetIndividualVariableRanking(void* ptr, float *array) {
161 |     Expertise *expertise = reinterpret_cast<Expertise*>(ptr);
162 |     VariableRanking *ranking = new(std::nothrow) VariableRanking;
163 |     ranking->ranking = expertise->classifier.GetIndividualVariableRanking(std::vector<float>(array, array + expertise->classifier.GetNFeatures()));
164 |     return ranking;
165 | }
166 | 
167 | unsigned int ExtractNumberOfVariablesFromVariableRanking(void* ptr) {
168 |     VariableRanking *ranking = reinterpret_cast<VariableRanking*>(ptr);
169 |     unsigned int max = 0;
170 |     for(auto &pair : ranking->ranking) {
171 |         if(pair.first > max) {
172 |             max = pair.first;
173 |         }
174 |     }
175 |     return max+1;
176 | }
177 | 
178 | double ExtractImportanceOfVariableFromVariableRanking(void* ptr, unsigned int iFeature) {
179 |     VariableRanking *ranking = reinterpret_cast<VariableRanking*>(ptr);
180 |     if ( ranking->ranking.find( iFeature ) == ranking->ranking.end() )
181 |         return 0.0;
182 |     return ranking->ranking[iFeature];
183 | }
184 | 
185 | void DeleteVariableRanking(void *ptr) {
186 |     delete reinterpret_cast<VariableRanking*>(ptr);
187 | }
188 | 
189 | }
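The PyFastBDT package ships libFastBDT_CInterface.so alongside FastBDT.py, i.e. the Python interface wraps exactly this C API. A stripped-down sketch of driving it by hand through ctypes (illustrative only, not copied from FastBDT.py; it assumes the shared library sits in the current directory):

    import ctypes
    import numpy as np

    lib = ctypes.cdll.LoadLibrary('./libFastBDT_CInterface.so')
    lib.Create.restype = ctypes.c_void_p
    lib.Predict.restype = ctypes.c_float

    expertise = ctypes.c_void_p(lib.Create())
    lib.SetNTrees(expertise, ctypes.c_uint(10))
    lib.SetDepth(expertise, ctypes.c_uint(3))

    # Events are passed as one flat row-major float array, targets as bools.
    X = np.array([[1.0, 2.6], [1.6, 2.5], [1.1, 2.0], [1.9, 2.1]], dtype=np.float32)
    y = np.array([False, True, False, True], dtype=np.bool_)
    lib.Fit(expertise,
            X.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
            None,  # nullptr weights -> every event gets weight 1.0 (see Fit above)
            y.ctypes.data_as(ctypes.POINTER(ctypes.c_bool)),
            ctypes.c_uint(4), ctypes.c_uint(2))

    event = np.array([1.0, 2.6], dtype=np.float32)
    print(lib.Predict(expertise, event.ctypes.data_as(ctypes.POINTER(ctypes.c_float))))
    lib.Delete(expertise)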
-------------------------------------------------------------------------------- /examples/orthogonal_discriminator.py: --------------------------------------------------------------------------------
1 | import sys
2 | from PyFastBDT import FastBDT
3 | 
4 | import numpy as np
5 | import numpy
6 | import numpy.linalg
7 | import sklearn.metrics
8 | 
9 | import matplotlib.pyplot as plt
10 | import matplotlib as mpl
11 | 
12 | def calculate_cdf_and_pdf(X):
13 |     """
14 |     Calculates cdf and pdf of given sample and adds under/overflow bins
15 |     @param X 1-d numpy.array
16 |     """
17 |     pdf, bins = numpy.histogram(X, bins=30, density=True)
18 |     cdf = numpy.cumsum(pdf * (bins - numpy.roll(bins, 1))[1:])
19 |     return numpy.hstack([0.0, cdf, 1.0]), numpy.hstack([0.0, pdf, 0.0]), bins
20 | 
21 | 
22 | def calculate_splot_weights(pdfs, yields):
23 |     """
24 |     Calculates sPlot weights using the pdfs
25 |     @param pdfs list of 1-d numpy.array with pdf values of the different components for each event
26 |     @param yields list of the yields of the different components
27 |     """
28 |     N_components = len(pdfs)
29 |     # Consistency checks
30 |     if N_components != len(yields):
31 |         raise RuntimeError("You have to provide the same number of pdfs and yields!")
32 |     if N_components < 2:
33 |         raise RuntimeError("Need at least two components!")
34 | 
35 |     # Calculate covariance matrix
36 |     inverse_covariance = numpy.zeros((N_components, N_components))
37 |     norm = sum((yields[k] * pdfs[k] for k in range(1, N_components)), yields[0] * pdfs[0])**2
38 |     for i in range(N_components):
39 |         for j in range(N_components):
40 |             inverse_covariance[i, j] = numpy.nansum(pdfs[i] * pdfs[j] / norm)
41 |     covariance = numpy.linalg.inv(inverse_covariance)
42 | 
43 |     # Return list of sPlot weights for each component
44 |     return [sum(covariance[n, k] * pdfs[k] for k in range(N_components)) /
45 |             sum(yields[k] * pdfs[k] for k in range(N_components)) for n in range(N_components)]
46 | 
47 | 
48 | class Prior(object):
49 |     def __init__(self, signal, bckgrd):
50 |         self.signal_cdf, self.signal_pdf, self.signal_bins = calculate_cdf_and_pdf(signal)
51 |         self.bckgrd_cdf, self.bckgrd_pdf, self.bckgrd_bins = calculate_cdf_and_pdf(bckgrd)
52 |         # Avoid numerical instabilities
53 |         self.bckgrd_pdf[0] = self.bckgrd_pdf[-1] = 1
54 |         self.signal_yield = len(signal)
55 |         self.bckgrd_yield = len(bckgrd)
56 | 
57 |     def get_signal_pdf(self, X):
58 |         return self.signal_pdf[numpy.digitize(X, bins=self.signal_bins)]
59 | 
60 |     def get_bckgrd_pdf(self, X):
61 |         return self.bckgrd_pdf[numpy.digitize(X, bins=self.bckgrd_bins)]
62 | 
63 |     def get_signal_cdf(self, X):
64 |         return self.signal_cdf[numpy.digitize(X, bins=self.signal_bins)]
65 | 
66 |     def get_bckgrd_cdf(self, X):
67 |         return self.bckgrd_cdf[numpy.digitize(X, bins=self.bckgrd_bins)]
68 | 
69 |     def get_prior(self, X):
70 |         return self.get_signal_pdf(X) / (self.get_signal_pdf(X) + self.get_bckgrd_pdf(X))
71 | 
72 |     def get_signal_boost_weights(self, X):
73 |         return self.get_signal_cdf(X) / self.get_bckgrd_pdf(X)
74 | 
75 |     def get_bckgrd_boost_weights(self, X):
76 |         # NOT self.get_bckgrd_cdf() here, signal and background are handled asymmetrically!
77 |         return (1.0 - self.get_signal_cdf(X)) / self.get_bckgrd_pdf(X)
78 | 
79 |     def get_boost_weights(self, X):
80 |         return numpy.r_[self.get_signal_boost_weights(X), self.get_bckgrd_boost_weights(X)]
81 | 
82 |     def get_splot_weights(self, X):
83 |         pdfs = [self.get_signal_pdf(X), self.get_bckgrd_pdf(X)]
84 |         yields = [self.signal_yield, self.bckgrd_yield]
85 |         weights = calculate_splot_weights(pdfs, yields)
86 |         return numpy.r_[weights[0], weights[1]]
87 | 
88 |     def get_uncorrelation_weights(self, X, boost_prediction):
89 |         reg_boost_prediction = boost_prediction * 0.99 + 0.005
90 |         weights = (self.get_signal_cdf(X) / reg_boost_prediction + (1.0 - self.get_signal_cdf(X)) / (1.0 - reg_boost_prediction)) / 2
91 |         return weights
92 | 
93 |     def get_aplot_weights(self, X, boost_prediction):
94 |         weights = self.get_uncorrelation_weights(X, boost_prediction)
95 |         return self.get_splot_weights(X) * numpy.r_[weights, weights]
96 | 
97 | 
98 | def combine_probabilities(p1, p2):
99 |     return p1*p2 / (p1*p2 + (1-p1)*(1-p2))
100 | 
101 | 
102 | 
103 | def activate_post_mortem_debugger():
104 |     import sys
105 | 
106 |     def info(type, value, tb):
107 |         if hasattr(sys, 'ps1') or not sys.stderr.isatty():
108 |             # we are in interactive mode or we don't have a tty-like
109 |             # device, so we call the default hook
110 |             sys.__excepthook__(type, value, tb)
111 |         else:
112 |             import traceback, pdb
113 |             # we are NOT in interactive mode, print the exception...
114 | traceback.print_exception(type, value, tb) 115 | # ...then start the debugger in post-mortem mode. 116 | pdb.post_mortem(tb) 117 | 118 | sys.excepthook = info 119 | 120 | activate_post_mortem_debugger() 121 | 122 | 123 | def evaluation(label, X_test, y_test, p, p_prior): 124 | print(label, sklearn.metrics.roc_auc_score(y_test, p)) 125 | print(label + " with prior", sklearn.metrics.roc_auc_score(y_test, combine_probabilities(p, p_prior))) 126 | plt.scatter(X_test[y_test == 1, 0], p[y_test == 1], c='r', label=label + " (Signal)", alpha=0.2) 127 | plt.scatter(X_test[y_test == 0, 0], p[y_test == 0], c='b', label=label + " (Background)", alpha=0.2) 128 | plt.xlabel("Feature") 129 | plt.ylabel("Probability") 130 | plt.show() 131 | 132 | 133 | if __name__ == '__main__': 134 | # Create some Monte Carlo data using a multidimensional Gaussian distribution 135 | # The 0th row of the covariance matrix describes the correlation to the target variable 136 | for cor in np.linspace(-0.2, 0.2, 3): 137 | print("Correlation ", cor) 138 | mean = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0] 139 | cov = [[1.0, 0.6, 0.4, 0.2, 0.1, 0.0], 140 | [0.0, 1.0, cor, cor, cor, 0.0], 141 | [0.0, 0.0, 1.0, 0.0, 0.0, 0.0], 142 | [0.0, 0.0, 0.0, 1.0, 0.0, 0.0], 143 | [0.0, 0.0, 0.0, 0.0, 1.0, 0.0], 144 | [0.0, 0.0, 0.0, 0.0, 0.0, 1.0]] 145 | 146 | for i in range(len(mean)): 147 | for j in range(i+1, len(mean)): 148 | cov[j][i] = cov[i][j] 149 | 150 | N_train, N_test = 100000, 2000 151 | data = np.random.multivariate_normal(mean, cov, N_train + N_test) 152 | X_train, y_train = data[:N_train, 1:], data[:N_train, 0] > 0 153 | X_test, y_test = data[N_train:, 1:], data[N_train:, 0] > 0 154 | 155 | # The first variable is the one we want to be independent of the classifier output 156 | prior = Prior(X_train[y_train == 1, 0], X_train[y_train == 0, 0]) 157 | p_prior = prior.get_prior(X_test[:, 0]) 158 | evaluation("Prior", X_test, y_test, p_prior, p_prior) 159 | 160 | p = FastBDT.Classifier().fit(X=X_train, y=y_train).predict(X_test) 161 | evaluation("Full", X_test, y_test, p, p_prior) 162 | 163 | p = FastBDT.Classifier().fit(X=X_train[:, 1:], y=y_train).predict(X_test[:, 1:]) 164 | evaluation("Restricted", X_test, y_test, p, p_prior) 165 | 166 | boost_p = FastBDT.Classifier().fit(X=numpy.r_[X_train[:, 1:], X_train[:, 1:]], 167 | y=numpy.r_[numpy.ones(N_train), numpy.zeros(N_train)], 168 | weights=prior.get_boost_weights(X_train[:, 0])).predict(X_train[:, 1:]) 169 | 170 | p = FastBDT.Classifier().fit(X=X_train[:, 1:], y=y_train, 171 | weights=prior.get_uncorrelation_weights(X_train[:, 0], boost_p)).predict(X_test[:, 1:]) 172 | evaluation("Uncorrelation", X_test, y_test, p, p_prior) 173 | 174 | p = FastBDT.Classifier().fit(X=numpy.r_[X_train[:, 1:], X_train[:, 1:]], 175 | y=numpy.r_[numpy.ones(N_train), numpy.zeros(N_train)], 176 | weights=prior.get_aplot_weights(X_train[:, 0], boost_p)).predict(X_test[:, 1:]) 177 | evaluation("APlot", X_test, y_test, p, p_prior) 178 | 179 | p = FastBDT.Classifier().fit(X=numpy.r_[X_train[:, 1:], X_train[:, 1:]], 180 | y=numpy.r_[numpy.ones(N_train), numpy.zeros(N_train)], 181 | weights=prior.get_splot_weights(X_train[:, 0])).predict(X_test[:, 1:]) 182 | evaluation("SPlot", X_test, y_test, p, p_prior) 183 | -------------------------------------------------------------------------------- /include/FastBDT_IO.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Thomas Keck 2014 3 | */ 4 | 5 | #pragma once 6 | #include "FastBDT.h" 7 | 8 | #include <iostream> 9 | #include <vector> 10 |
#include <string> 11 | #include <limits> 12 | 13 | namespace FastBDT { 14 | 15 | /** 16 | * Converts from string to float safely 17 | * Should behave similarly to boost::lexical_cast 18 | * but does not signal if it fails! 19 | * @param input string containing a float 20 | */ 21 | float convert_to_float_safely(std::string &input); 22 | 23 | /** 24 | * Converts from string to double safely 25 | * Should behave similarly to boost::lexical_cast 26 | * but does not signal if it fails! 27 | * @param input string containing a float 28 | */ 29 | double convert_to_double_safely(std::string &input); 30 | 31 | /** 32 | * This template saves a vector to an std::ostream 33 | * @param stream an std::ostream reference 34 | * @param vector the vector which shall be stored 35 | */ 36 | template<class T> 37 | std::ostream& operator<<(std::ostream& stream, const std::vector<T> &vector) { 38 | stream << vector.size(); 39 | for(const auto &value : vector) { 40 | stream << " " << value; 41 | } 42 | stream << std::endl; 43 | return stream; 44 | } 45 | 46 | /** 47 | * Specialize vector output operator, so it checks for nan and infinity in float/double types 48 | * Note: I know about http://www.gotw.ca/publications/mill17.htm, SFINAE, but nothing worked for me ... 49 | * so I stuck with this simple solution instead of complicated template meta programming 50 | */ 51 | template<> 52 | std::ostream& operator<<(std::ostream& stream, const std::vector<float> &vector); 53 | 54 | template<> 55 | std::ostream& operator<<(std::ostream& stream, const std::vector<double> &vector); 56 | 57 | /** 58 | * This template reads a vector from an std::istream 59 | * @param stream an std::istream reference 60 | * @param vector the vector containing read data 61 | */ 62 | template<class T> 63 | std::istream& operator>>(std::istream& stream, std::vector<T> &vector) { 64 | unsigned int size; 65 | stream >> size; 66 | vector.resize(size); 67 | for(unsigned int i = 0; i < size; ++i) { 68 | T temp; 69 | stream >> temp; 70 | vector[i] = temp; 71 | } 72 | return stream; 73 | } 74 | 75 | template<> 76 | std::istream& operator>>(std::istream& stream, std::vector<float> &vector); 77 | 78 | template<> 79 | std::istream& operator>>(std::istream& stream, std::vector<double> &vector); 80 | 81 | /** 82 | * This function saves a Cut to an std::ostream 83 | * @param stream an std::ostream reference 84 | * @param cut the cut which shall be stored 85 | */ 86 | template<class T> 87 | std::ostream& operator<<(std::ostream& stream, const Cut<T> &cut) { 88 | stream << cut.feature << std::endl; 89 | stream.precision(std::numeric_limits<T>::max_digits10); 90 | stream << std::scientific; 91 | stream << cut.index << std::endl; 92 | stream.precision(6); 93 | stream << cut.valid << std::endl; 94 | stream << cut.gain; 95 | stream << std::endl; 96 | return stream; 97 | } 98 | 99 | /** 100 | * This function reads a Cut from an std::istream 101 | * @param stream an std::istream reference 102 | * @param cut containing read data 103 | */ 104 | template<class T> 105 | std::istream& operator>>(std::istream& stream, Cut<T> &cut) { 106 | stream >> cut.feature; 107 | stream >> cut.index; 108 | stream >> cut.valid; 109 | stream >> cut.gain; 110 | return stream; 111 | } 112 | 113 | template<> 114 | std::istream& operator>>(std::istream& stream, Cut<float> &cut); 115 | 116 | template<> 117 | std::istream& operator>>(std::istream& stream, Cut<double> &cut); 118 | 119 | 120 | /** 121 | * This function saves a Tree to an std::ostream 122 | * @param stream an std::ostream reference 123 | * @param tree the tree which shall be stored 124 | */ 125 | template<class T> 126 | std::ostream& operator<<(std::ostream&
stream, const Tree<T> &tree) { 127 | const auto &cuts = tree.GetCuts(); 128 | stream << cuts.size() << std::endl; 129 | for( const auto& cut : cuts ) { 130 | stream << cut << std::endl; 131 | } 132 | stream << tree.GetBoostWeights() << std::endl; 133 | stream << tree.GetPurities() << std::endl; 134 | stream << tree.GetNEntries() << std::endl; 135 | return stream; 136 | } 137 | 138 | 139 | /** 140 | * This function reads a Tree from an std::istream 141 | * @param stream an std::istream reference 142 | * @return tree containing read data 143 | */ 144 | template<class T> 145 | Tree<T> readTreeFromStream(std::istream& stream) { 146 | unsigned int size; 147 | stream >> size; 148 | std::vector<Cut<T>> cuts(size); 149 | for(unsigned int i = 0; i < size; ++i) { 150 | stream >> cuts[i]; 151 | } 152 | 153 | std::vector<Weight> boost_weights; 154 | stream >> boost_weights; 155 | 156 | std::vector<Weight> purities; 157 | stream >> purities; 158 | 159 | std::vector<Weight> nEntries; 160 | stream >> nEntries; 161 | 162 | return Tree<T>(cuts, nEntries, purities, boost_weights); 163 | 164 | } 165 | 166 | /** 167 | * This function saves a Forest to an std::ostream 168 | * @param stream an std::ostream reference 169 | * @param forest the forest which shall be stored 170 | */ 171 | template<class T> 172 | std::ostream& operator<<(std::ostream& stream, const Forest<T> &forest) { 173 | stream << forest.GetF0() << std::endl; 174 | stream << forest.GetShrinkage() << std::endl; 175 | stream << forest.GetTransform2Probability() << std::endl; 176 | 177 | const auto &trees = forest.GetForest(); 178 | stream << trees.size() << std::endl; 179 | for(const auto& tree : trees) { 180 | stream << tree << std::endl; 181 | } 182 | 183 | return stream; 184 | } 185 | 186 | /** 187 | * This function reads a Forest from an std::istream 188 | * @param stream an std::istream reference 189 | * @return forest containing read data 190 | */ 191 | template<class T> 192 | Forest<T> readForestFromStream(std::istream& stream) { 193 | double F0; 194 | stream >> F0; 195 | 196 | double shrinkage; 197 | stream >> shrinkage; 198 | 199 | bool transform2probability; 200 | stream >> transform2probability; 201 | 202 | Forest<T> forest(shrinkage, F0, transform2probability); 203 | 204 | unsigned int size; 205 | stream >> size; 206 | 207 | for(unsigned int i = 0; i < size; ++i) { 208 | forest.AddTree(readTreeFromStream<T>(stream)); 209 | } 210 | 211 | return forest; 212 | } 213 | 214 | /** 215 | * This function saves a PurityTransformation to an std::ostream 216 | * @param stream an std::ostream reference 217 | * @param purityTransformation the purity transformation which shall be stored 218 | */ 219 | std::ostream& operator<<(std::ostream& stream, const PurityTransformation &purityTransformation); 220 | 221 | /** 222 | * This function reads a PurityTransformation from an std::istream 223 | * @param stream an std::istream reference 224 | * @param purityTransformation the purity transformation which shall be read 225 | */ 226 | std::istream& operator>>(std::istream& stream, PurityTransformation &purityTransformation); 227 | 228 | 229 | /** 230 | * This function saves a FeatureBinning to an std::ostream 231 | * @param stream an std::ostream reference 232 | * @param featureBinning the FeatureBinning which shall be stored 233 | */ 234 | template<class T> 235 | std::ostream& operator<<(std::ostream& stream, const FeatureBinning<T> &featureBinning) { 236 | 237 | stream << featureBinning.GetNLevels() << std::endl; 238 | stream << featureBinning.GetBinning() << std::endl; 239 | 240 | return stream; 241 | } 242 | 243 | /** 244 | * This function
reads a FeatureBinning from an std::istream 245 | * @param stream an std::istream reference 246 | * @preturn FeatureBinning containing read data 247 | */ 248 | template 249 | FeatureBinning readFeatureBinningFromStream(std::istream& stream) { 250 | 251 | unsigned int nLevels; 252 | stream >> nLevels; 253 | 254 | std::vector bins; 255 | stream >> bins; 256 | 257 | return FeatureBinning(nLevels, bins); 258 | 259 | } 260 | 261 | /** 262 | * Overload vector input operator, so it can read in FeatureBinnings 263 | */ 264 | template 265 | std::istream& operator>>(std::istream& stream, std::vector> &vector) { 266 | unsigned int size; 267 | stream >> size; 268 | for(unsigned int i = 0; i < size; ++i) 269 | vector.push_back(readFeatureBinningFromStream(stream)); 270 | return stream; 271 | } 272 | 273 | 274 | } 275 | -------------------------------------------------------------------------------- /src/test_Classifier.cxx: -------------------------------------------------------------------------------- 1 | /** 2 | * Thomas Keck 2017 3 | */ 4 | 5 | #include "Classifier.h" 6 | #include 7 | #include 8 | 9 | #include 10 | 11 | #include 12 | #include 13 | 14 | #include 15 | #include 16 | 17 | using namespace FastBDT; 18 | 19 | 20 | std::vector> GetIrisX() { 21 | std::vector> X = {{5.1,4.9,4.7,4.6,5.0,5.4,4.6,5.0,4.4,4.9,5.4,4.8,4.8,4.3,5.8,5.7,5.4,5.1,5.7,5.1,5.4,5.1,4.6,5.1,4.8,5.0,5.0,5.2,5.2,4.7,4.8,5.4,5.2,5.5,4.9,5.0,5.5,4.9,4.4,5.1,5.0,4.5,4.4,5.0,5.1,4.8,5.1,4.6,5.3,5.0,7.0,6.4,6.9,5.5,6.5,5.7,6.3,4.9,6.6,5.2,5.0,5.9,6.0,6.1,5.6,6.7,5.6,5.8,6.2,5.6,5.9,6.1,6.3,6.1,6.4,6.6,6.8,6.7,6.0,5.7,5.5,5.5,5.8,6.0,5.4,6.0,6.7,6.3,5.6,5.5,5.5,6.1,5.8,5.0,5.6,5.7,5.7,6.2,5.1,5.7,6.3,5.8,7.1,6.3,6.5,7.6,4.9,7.3,6.7,7.2,6.5,6.4,6.8,5.7,5.8,6.4,6.5,7.7,7.7,6.0,6.9,5.6,7.7,6.3,6.7,7.2,6.2,6.1,6.4,7.2,7.4,7.9,6.4,6.3,6.1,7.7,6.3,6.4,6.0,6.9,6.7,6.9,5.8,6.8,6.7,6.7,6.3,6.5,6.2,5.9}, {3.5,3.0,3.2,3.1,3.6,3.9,3.4,3.4,2.9,3.1,3.7,3.4,3.0,3.0,4.0,4.4,3.9,3.5,3.8,3.8,3.4,3.7,3.6,3.3,3.4,3.0,3.4,3.5,3.4,3.2,3.1,3.4,4.1,4.2,3.1,3.2,3.5,3.1,3.0,3.4,3.5,2.3,3.2,3.5,3.8,3.0,3.8,3.2,3.7,3.3,3.2,3.2,3.1,2.3,2.8,2.8,3.3,2.4,2.9,2.7,2.0,3.0,2.2,2.9,2.9,3.1,3.0,2.7,2.2,2.5,3.2,2.8,2.5,2.8,2.9,3.0,2.8,3.0,2.9,2.6,2.4,2.4,2.7,2.7,3.0,3.4,3.1,2.3,3.0,2.5,2.6,3.0,2.6,2.3,2.7,3.0,2.9,2.9,2.5,2.8,3.3,2.7,3.0,2.9,3.0,3.0,2.5,2.9,2.5,3.6,3.2,2.7,3.0,2.5,2.8,3.2,3.0,3.8,2.6,2.2,3.2,2.8,2.8,2.7,3.3,3.2,2.8,3.0,2.8,3.0,2.8,3.8,2.8,2.8,2.6,3.0,3.4,3.1,3.0,3.1,3.1,3.1,2.7,3.2,3.3,3.0,2.5,3.0,3.4,3.0}, {1.4,1.4,1.3,1.5,1.4,1.7,1.4,1.5,1.4,1.5,1.5,1.6,1.4,1.1,1.2,1.5,1.3,1.4,1.7,1.5,1.7,1.5,1.0,1.7,1.9,1.6,1.6,1.5,1.4,1.6,1.6,1.5,1.5,1.4,1.5,1.2,1.3,1.5,1.3,1.5,1.3,1.3,1.3,1.6,1.9,1.4,1.6,1.4,1.5,1.4,4.7,4.5,4.9,4.0,4.6,4.5,4.7,3.3,4.6,3.9,3.5,4.2,4.0,4.7,3.6,4.4,4.5,4.1,4.5,3.9,4.8,4.0,4.9,4.7,4.3,4.4,4.8,5.0,4.5,3.5,3.8,3.7,3.9,5.1,4.5,4.5,4.7,4.4,4.1,4.0,4.4,4.6,4.0,3.3,4.2,4.2,4.2,4.3,3.0,4.1,6.0,5.1,5.9,5.6,5.8,6.6,4.5,6.3,5.8,6.1,5.1,5.3,5.5,5.0,5.1,5.3,5.5,6.7,6.9,5.0,5.7,4.9,6.7,4.9,5.7,6.0,4.8,4.9,5.6,5.8,6.1,6.4,5.6,5.1,5.6,6.1,5.6,5.5,4.8,5.4,5.6,5.1,5.1,5.9,5.7,5.2,5.0,5.2,5.4,5.1}, 
{0.2,0.2,0.2,0.2,0.2,0.4,0.3,0.2,0.2,0.1,0.2,0.2,0.1,0.1,0.2,0.4,0.4,0.3,0.3,0.3,0.2,0.4,0.2,0.5,0.2,0.2,0.4,0.2,0.2,0.2,0.2,0.4,0.1,0.2,0.1,0.2,0.2,0.1,0.2,0.2,0.3,0.3,0.2,0.6,0.4,0.3,0.2,0.2,0.2,0.2,1.4,1.5,1.5,1.3,1.5,1.3,1.6,1.0,1.3,1.4,1.0,1.5,1.0,1.4,1.3,1.4,1.5,1.0,1.5,1.1,1.8,1.3,1.5,1.2,1.3,1.4,1.4,1.7,1.5,1.0,1.1,1.0,1.2,1.6,1.5,1.6,1.5,1.3,1.3,1.3,1.2,1.4,1.2,1.0,1.3,1.2,1.3,1.3,1.1,1.3,2.5,1.9,2.1,1.8,2.2,2.1,1.7,1.8,1.8,2.5,2.0,1.9,2.1,2.0,2.4,2.3,1.8,2.2,2.3,1.5,2.3,2.0,2.0,1.8,2.1,1.8,1.8,1.8,2.1,1.6,1.9,2.0,2.2,1.5,1.4,2.3,2.4,1.8,1.8,2.1,2.4,2.3,1.9,2.3,2.5,2.3,1.9,2.0,2.3,1.8} }; 22 | return X; 23 | } 24 | 25 | std::vector GetIrisY() { 26 | std::vector y = {false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true}; 27 | return y; 28 | } 29 | 30 | std::vector GetIrisW() { 31 | std::vector w = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}; 32 | return w; 33 | } 34 | 35 | 36 | float GetIrisScore(const Classifier &classifier) { 37 | auto X = GetIrisX(); 38 | auto y = GetIrisY(); 39 | float sum = 0; 40 | for(unsigned int i = 0; i < y.size(); ++i) { 41 | float p = classifier.predict({X[0][i], X[1][i], X[2][i], X[3][i]}); 42 | sum += (y[i]-p)*(y[i]-p); 43 | } 44 | return -sum; 45 | } 46 | 47 | class ClassifierTest : public ::testing::Test { 48 | protected: 49 | virtual void SetUp() { 50 | X = GetIrisX(); 51 | y = GetIrisY(); 52 | w = GetIrisW(); 53 | } 54 | 55 | virtual void TearDown() { 56 | } 57 | 58 | std::vector> X; 59 | std::vector y; 60 | std::vector w; 61 | 62 | }; 63 | 64 | TEST_F(ClassifierTest, SimpleClassifierWorks) { 65 | 66 | FastBDT::Classifier classifier(10, 3, {4, 4, 4, 4}); 67 | classifier.fit(X, y, w); 68 | 69 | EXPECT_GT(GetIrisScore(classifier), -7.0); 70 | EXPECT_LT(GetIrisScore(classifier), -5.0); 71 | 72 | } 73 | 74 | TEST_F(ClassifierTest, MoreTreesAreBetter) { 75 | 76 | FastBDT::Classifier classifier1(1, 1, {4, 4, 4, 4}); 77 | classifier1.fit(X, y, w); 78 | 79 | FastBDT::Classifier classifier2(4, 1, {4, 4, 4, 4}); 80 | classifier2.fit(X, y, w); 81 | 82 | FastBDT::Classifier classifier3(16, 1, {4, 4, 4, 4}); 83 | classifier3.fit(X, y, w); 84 | 85 | FastBDT::Classifier classifier4(64, 1, {4, 4, 4, 4}); 86 | classifier4.fit(X, y, w); 
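// GetIrisScore returns the negative sum of squared residuals (y - p)^2, so all
// scores are <= 0 and a larger (less negative) value means a better fit. The
// EXPECT_LT checks below therefore assert that each bigger ensemble tracks the
// training labels more closely than the previous one.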
87 | 88 | EXPECT_LT(GetIrisScore(classifier1), GetIrisScore(classifier2)); 89 | EXPECT_LT(GetIrisScore(classifier2), GetIrisScore(classifier3)); 90 | EXPECT_LT(GetIrisScore(classifier3), GetIrisScore(classifier4)); 91 | 92 | } 93 | 94 | TEST_F(ClassifierTest, DeeperTreesAreBetter) { 95 | 96 | FastBDT::Classifier classifier1(1, 1, {4, 4, 4, 4}); 97 | classifier1.fit(X, y, w); 98 | 99 | FastBDT::Classifier classifier2(1, 3, {4, 4, 4, 4}); 100 | classifier2.fit(X, y, w); 101 | 102 | FastBDT::Classifier classifier3(1, 5, {4, 4, 4, 4}); 103 | classifier3.fit(X, y, w); 104 | 105 | FastBDT::Classifier classifier4(1, 7, {4, 4, 4, 4}); 106 | classifier4.fit(X, y, w); 107 | 108 | EXPECT_LT(GetIrisScore(classifier1), GetIrisScore(classifier2)); 109 | EXPECT_LT(GetIrisScore(classifier2), GetIrisScore(classifier3)); 110 | EXPECT_LT(GetIrisScore(classifier3), GetIrisScore(classifier4)); 111 | 112 | } 113 | 114 | 115 | TEST_F(ClassifierTest, MoreBinsAreBetter) { 116 | 117 | FastBDT::Classifier classifier1(1, 3, {2, 2, 2, 2}); 118 | classifier1.fit(X, y, w); 119 | 120 | FastBDT::Classifier classifier2(1, 3, {2, 3, 2, 3}); 121 | classifier2.fit(X, y, w); 122 | 123 | FastBDT::Classifier classifier3(1, 3, {3, 4, 3, 4}); 124 | classifier3.fit(X, y, w); 125 | 126 | FastBDT::Classifier classifier4(1, 3, {4, 5, 4, 5}); 127 | classifier4.fit(X, y, w); 128 | 129 | EXPECT_LT(GetIrisScore(classifier1), GetIrisScore(classifier2)); 130 | EXPECT_LT(GetIrisScore(classifier2), GetIrisScore(classifier3)); 131 | EXPECT_LT(GetIrisScore(classifier3), GetIrisScore(classifier4)); 132 | 133 | } 134 | 135 | TEST_F(ClassifierTest, HeavilyOvertrainedBDTIsPerfect) { 136 | 137 | FastBDT::Classifier classifier(100, 10, {8, 8, 8, 8}); 138 | classifier.fit(X, y, w); 139 | 140 | EXPECT_GT(GetIrisScore(classifier), -0.01f); 141 | 142 | } 143 | 144 | TEST_F(ClassifierTest, SubsamplingChangesResult) { 145 | 146 | FastBDT::Classifier classifier1(1, 5, {4, 4, 4, 4}, 0.1, 0.5); 147 | classifier1.fit(X, y, w); 148 | 149 | FastBDT::Classifier classifier2(1, 5, {4, 4, 4, 4}, 0.1, 0.5); 150 | classifier2.fit(X, y, w); 151 | 152 | EXPECT_NE(GetIrisScore(classifier1), GetIrisScore(classifier2)); 153 | 154 | } 155 | 156 | TEST_F(ClassifierTest, GetFeatureMaping) { 157 | 158 | FastBDT::Classifier classifier(1, 5, {4, 4, 4, 4}, 0.1, 0.5); 159 | classifier.SetPurityTransformation({true, false, true, false}); 160 | classifier.fit(X, y, w); 161 | 162 | auto mapping = classifier.GetFeatureMapping(); 163 | EXPECT_EQ(mapping[0], 0u); 164 | EXPECT_EQ(mapping[1], 0u); 165 | EXPECT_EQ(mapping[2], 1u); 166 | EXPECT_EQ(mapping[3], 2u); 167 | EXPECT_EQ(mapping[4], 2u); 168 | EXPECT_EQ(mapping[5], 3u); 169 | 170 | } 171 | 172 | TEST_F(ClassifierTest, LoadAndSaveWorks) { 173 | 174 | FastBDT::Classifier classifier(10, 3, {4, 4, 4, 4}); 175 | classifier.fit(X, y, w); 176 | 177 | float score1 = GetIrisScore(classifier); 178 | 179 | std::fstream file_out("unittest.weightfile", std::ios_base::out | std::ios_base::trunc); 180 | file_out << classifier << std::endl; 181 | file_out.close(); 182 | 183 | std::fstream file_in("unittest.weightfile", std::ios_base::in); 184 | FastBDT::Classifier classifier2(file_in); 185 | file_in.close(); 186 | 187 | float score2 = GetIrisScore(classifier2); 188 | 189 | EXPECT_FLOAT_EQ(score1, score2); 190 | } 191 | 192 | -------------------------------------------------------------------------------- /src/Classifier.cxx: -------------------------------------------------------------------------------- 1 | /* 2 | * Thomas Keck 2017 3 | * 4 | * 
Simplified sklearn interface 5 | */ 6 | 7 | 8 | #include "Classifier.h" 9 | #include <iostream> 10 | 11 | namespace FastBDT { 12 | 13 | void Classifier::fit(const std::vector<std::vector<float>> &X, const std::vector<bool> &y, const std::vector<Weight> &w) { 14 | 15 | if(static_cast<int>(X.size()) - static_cast<int>(m_numberOfFlatnessFeatures) <= 0) { 16 | throw std::runtime_error("FastBDT requires at least one feature"); 17 | } 18 | m_numberOfFeatures = X.size() - m_numberOfFlatnessFeatures ; 19 | 20 | if(m_binning.size() == 0) { 21 | for(unsigned int i = 0; i < X.size(); ++i) 22 | m_binning.push_back(8); 23 | } 24 | 25 | if(m_numberOfFeatures + m_numberOfFlatnessFeatures != m_binning.size()) { 26 | throw std::runtime_error("Number of features must be equal to the number of provided binnings"); 27 | } 28 | 29 | if(m_purityTransformation.size() == 0) { 30 | for(unsigned int i = 0; i < m_binning.size() - m_numberOfFlatnessFeatures; ++i) 31 | m_purityTransformation.push_back(false); 32 | } 33 | 34 | for(auto p : m_purityTransformation) 35 | if(p) 36 | m_can_use_fast_forest = false; 37 | 38 | if(m_numberOfFeatures != m_purityTransformation.size()) { 39 | throw std::runtime_error("Number of ordinary features must be equal to the number of provided purityTransformation flags."); 40 | } 41 | 42 | unsigned int numberOfEvents = X[0].size(); 43 | if(numberOfEvents == 0) { 44 | throw std::runtime_error("FastBDT requires at least one event"); 45 | } 46 | 47 | if(numberOfEvents != y.size()) { 48 | throw std::runtime_error("Number of data-points X doesn't match the numbers of labels y"); 49 | } 50 | 51 | if(numberOfEvents != w.size()) { 52 | throw std::runtime_error("Number of data-points X doesn't match the numbers of weights w"); 53 | } 54 | 55 | m_numberOfFinalFeatures = m_numberOfFeatures; 56 | for(unsigned int iFeature = 0; iFeature < m_numberOfFeatures; ++iFeature) { 57 | auto feature = X[iFeature]; 58 | m_featureBinning.push_back(FeatureBinning<float>(m_binning[iFeature], feature)); 59 | if(m_purityTransformation[iFeature]) { 60 | m_numberOfFinalFeatures++; 61 | std::vector<unsigned int> feature(numberOfEvents); 62 | for(unsigned int iEvent = 0; iEvent < numberOfEvents; ++iEvent) { 63 | feature[iEvent] = m_featureBinning[iFeature].ValueToBin(X[iFeature][iEvent]); 64 | } 65 | m_purityBinning.push_back(PurityTransformation(m_binning[iFeature], feature, w, y)); 66 | m_binning.insert(m_binning.begin() + iFeature + 1, m_binning[iFeature]); 67 | } 68 | } 69 | 70 | for(unsigned int iFeature = 0; iFeature < m_numberOfFlatnessFeatures; ++iFeature) { 71 | auto feature = X[iFeature + m_numberOfFeatures]; 72 | m_featureBinning.push_back(FeatureBinning<float>(m_binning[iFeature + m_numberOfFinalFeatures], feature)); 73 | } 74 | 75 | EventSample eventSample(numberOfEvents, m_numberOfFinalFeatures, m_numberOfFlatnessFeatures, m_binning); 76 | std::vector<unsigned int> bins(m_numberOfFinalFeatures+m_numberOfFlatnessFeatures); 77 | 78 | for(unsigned int iEvent = 0; iEvent < numberOfEvents; ++iEvent) { 79 | unsigned int bin = 0; 80 | unsigned int pFeature = 0; 81 | for(unsigned int iFeature = 0; iFeature < m_numberOfFeatures; ++iFeature) { 82 | bins[bin] = m_featureBinning[iFeature].ValueToBin(X[iFeature][iEvent]); 83 | bin++; 84 | if(m_purityTransformation[iFeature]) { 85 | bins[bin] = m_purityBinning[pFeature].BinToPurityBin(bins[bin-1]); 86 | pFeature++; 87 | bin++; 88 | } 89 | } 90 | for(unsigned int iFeature = 0; iFeature < m_numberOfFlatnessFeatures; ++iFeature) { 91 | bins[bin] = m_featureBinning[iFeature + m_numberOfFeatures].ValueToBin(X[iFeature + m_numberOfFeatures][iEvent]); 92 | bin++; 93 | }
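// At this point "bins" holds the complete binned row for this event: each
// ordinary feature, immediately followed by its purity-transformed bin if the
// corresponding flag is set, with the binned flatness features appended at the end.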
94 | eventSample.AddEvent(bins, w[iEvent], y[iEvent] == 1); 95 | } 96 | 97 | m_featureBinning.resize(m_numberOfFeatures); 98 | 99 | ForestBuilder df(eventSample, m_nTrees, m_shrinkage, m_subsample, m_depth, m_sPlot, m_flatnessLoss); 100 | if(m_can_use_fast_forest) { 101 | Forest<float> temp_forest( df.GetShrinkage(), df.GetF0(), m_transform2probability); 102 | for( auto t : df.GetForest() ) { 103 | temp_forest.AddTree(removeFeatureBinningTransformationFromTree(t, m_featureBinning)); 104 | } 105 | m_fast_forest = temp_forest; 106 | } else { 107 | Forest<unsigned int> temp_forest(df.GetShrinkage(), df.GetF0(), m_transform2probability); 108 | for( auto t : df.GetForest() ) { 109 | temp_forest.AddTree(t); 110 | } 111 | m_binned_forest = temp_forest; 112 | } 113 | 114 | } 115 | 116 | void Classifier::Print() { 117 | 118 | std::cout << "NTrees " << m_nTrees << std::endl; 119 | std::cout << "Depth " << m_depth << std::endl; 120 | std::cout << "NumberOfFeatures " << m_numberOfFeatures << std::endl; 121 | 122 | } 123 | 124 | float Classifier::predict(const std::vector<float> &X) const { 125 | 126 | if(m_can_use_fast_forest) { 127 | return m_fast_forest.Analyse(X); 128 | } else { 129 | std::vector<unsigned int> bins(m_numberOfFinalFeatures); 130 | unsigned int bin = 0; 131 | unsigned int pFeature = 0; 132 | for(unsigned int iFeature = 0; iFeature < m_numberOfFeatures; ++iFeature) { 133 | bins[bin] = m_featureBinning[iFeature].ValueToBin(X[iFeature]); 134 | bin++; 135 | if(m_purityTransformation[iFeature]) { 136 | bins[bin] = m_purityBinning[pFeature].BinToPurityBin(bins[bin-1]); 137 | pFeature++; 138 | bin++; 139 | } 140 | } 141 | return m_binned_forest.Analyse(bins); 142 | } 143 | } 144 | 145 | std::map<unsigned int, double> Classifier::GetIndividualVariableRanking(const std::vector<float> &X) const { 146 | 147 | std::map<unsigned int, double> ranking; 148 | 149 | if(m_can_use_fast_forest) { 150 | ranking = m_fast_forest.GetIndividualVariableRanking(X); 151 | } else { 152 | std::vector<unsigned int> bins(m_numberOfFinalFeatures); 153 | unsigned int bin = 0; 154 | unsigned int pFeature = 0; 155 | for(unsigned int iFeature = 0; iFeature < m_numberOfFeatures; ++iFeature) { 156 | bins[bin] = m_featureBinning[iFeature].ValueToBin(X[iFeature]); 157 | bin++; 158 | if(m_purityTransformation[iFeature]) { 159 | bins[bin] = m_purityBinning[pFeature].BinToPurityBin(bins[bin-1]); 160 | pFeature++; 161 | bin++; 162 | } 163 | } 164 | ranking = m_binned_forest.GetIndividualVariableRanking(bins); 165 | } 166 | 167 | return MapRankingToOriginalFeatures(ranking); 168 | } 169 | 170 | std::map<unsigned int, unsigned int> Classifier::GetFeatureMapping() const { 171 | 172 | std::map<unsigned int, unsigned int> transformed2original; 173 | unsigned int transformedFeature = 0; 174 | for(unsigned int originalFeature = 0; originalFeature < m_numberOfFeatures; ++originalFeature) { 175 | transformed2original[transformedFeature] = originalFeature; 176 | if(m_purityTransformation[originalFeature]) { 177 | transformedFeature++; 178 | transformed2original[transformedFeature] = originalFeature; 179 | } 180 | transformedFeature++; 181 | } 182 | 183 | return transformed2original; 184 | 185 | } 186 | 187 | std::map<unsigned int, double> Classifier::MapRankingToOriginalFeatures(std::map<unsigned int, double> ranking) const { 188 | auto transformed2original = GetFeatureMapping(); 189 | std::map<unsigned int, double> original_ranking; 190 | for(auto &pair : ranking) { 191 | if(original_ranking.find(transformed2original[pair.first]) == original_ranking.end()) 192 | original_ranking[transformed2original[pair.first]] = 0; 193 | original_ranking[transformed2original[pair.first]] += pair.second; 194 | } 195 | return original_ranking; 196 | } 197 | 198 | 199 | std::map<unsigned int, double>
Classifier::GetVariableRanking() const { 200 | std::map<unsigned int, double> ranking; 201 | if (m_can_use_fast_forest) 202 | ranking = m_fast_forest.GetVariableRanking(); 203 | else 204 | ranking = m_binned_forest.GetVariableRanking(); 205 | return MapRankingToOriginalFeatures(ranking); 206 | } 207 | 208 | 209 | std::ostream& operator<<(std::ostream& stream, const Classifier& classifier) { 210 | 211 | stream << classifier.m_version << std::endl; 212 | stream << classifier.m_nTrees << std::endl; 213 | stream << classifier.m_depth << std::endl; 214 | stream << classifier.m_binning << std::endl; 215 | stream << classifier.m_shrinkage << std::endl; 216 | stream << classifier.m_subsample << std::endl; 217 | stream << classifier.m_sPlot << std::endl; 218 | stream << classifier.m_flatnessLoss << std::endl; 219 | stream << classifier.m_purityTransformation << std::endl; 220 | stream << classifier.m_transform2probability << std::endl; 221 | stream << classifier.m_featureBinning << std::endl; 222 | stream << classifier.m_purityBinning << std::endl; 223 | stream << classifier.m_numberOfFeatures << std::endl; 224 | stream << classifier.m_numberOfFinalFeatures << std::endl; 225 | stream << classifier.m_numberOfFlatnessFeatures << std::endl; 226 | stream << classifier.m_can_use_fast_forest << std::endl; 227 | stream << classifier.m_fast_forest << std::endl; 228 | stream << classifier.m_binned_forest << std::endl; 229 | 230 | return stream; 231 | } 232 | 233 | } 234 | -------------------------------------------------------------------------------- /src/test_Performance.cxx: -------------------------------------------------------------------------------- 1 | /** 2 | * Thomas Keck 2017 3 | */ 4 | 5 | #include "FastBDT.h" 6 | 7 | #include <gtest/gtest.h> 8 | 9 | #include <chrono> 10 | #include <random> 11 | #include <vector> 12 | #include <functional> 13 | #include <algorithm> 14 | 15 | using namespace FastBDT; 16 | 17 | class PerformanceFeatureBinningTest : public ::testing::Test { 18 | protected: 19 | virtual void SetUp() { 20 | std::default_random_engine generator; 21 | std::uniform_real_distribution<double> distribution(0.0,1.0); 22 | unsigned int N = 10000000; 23 | data.resize(N); 24 | for(unsigned int i = 0; i < N; ++i) { 25 | data[i] = distribution(generator); 26 | } 27 | } 28 | 29 | std::vector<float> data; 30 | 31 | }; 32 | 33 | 34 | TEST_F(PerformanceFeatureBinningTest, FeatureBinningScalesLinearInNumberOfDataPoints) { 35 | 36 | // This is dominated by the sorting of the numbers -> N log (N), 37 | // for our purposes we assume just N, which seems to be fine 38 | // if this unittest starts failing I have to revise this and add the factor of log(N) 39 | 40 | std::vector<unsigned int> sizes = {1000, 10000, 100000, 1000000}; 41 | std::vector<double> times; 42 | 43 | for( auto &size : sizes ) { 44 | std::vector<float> temp_data(data.begin(), data.begin() + size); 45 | std::chrono::high_resolution_clock::time_point start = std::chrono::high_resolution_clock::now(); 46 | FeatureBinning<float> binning(4, temp_data); 47 | std::chrono::high_resolution_clock::time_point stop = std::chrono::high_resolution_clock::now(); 48 | 49 | // We check something simple, so that we are sure that the compiler cannot optimize out the binning itself 50 | EXPECT_EQ(binning.GetNLevels(), 4u); 51 | 52 | std::chrono::duration<double> time = stop - start; 53 | times.push_back(time.count()); 54 | } 55 | 56 | // Check linear behaviour 57 | for(unsigned int i = 1; i < sizes.size(); ++i) { 58 | double size_ratio = sizes[i] / static_cast<double>(sizes[0]); 59 | double time_ratio = times[i] / static_cast<double>(times[0]); 60 | // We allow for deviation of factor two 61 | EXPECT_LT(time_ratio,
size_ratio * 2.0); 62 | } 63 | 64 | } 65 | 66 | 67 | TEST_F(PerformanceFeatureBinningTest, FeatureBinningScalesConstantInSmallNumberOfLayers) { 68 | 69 | // The feature binning should be dominated by the sorting of the numbers 70 | // hence it does not scale with the number of layers to first order 71 | // for large layers this will be wrong ~ #Layer > 17 72 | std::vector sizes = {2, 3, 5, 7, 11, 13, 17}; 73 | std::vector times; 74 | 75 | for( auto &size : sizes ) { 76 | std::chrono::high_resolution_clock::time_point start = std::chrono::high_resolution_clock::now(); 77 | FeatureBinning binning(size, data); 78 | std::chrono::high_resolution_clock::time_point stop = std::chrono::high_resolution_clock::now(); 79 | 80 | // We check something simple, so that we are sure that the compiler cannot optimize out the binning itself 81 | EXPECT_EQ(binning.GetNLevels(), size); 82 | 83 | std::chrono::duration time = stop - start; 84 | times.push_back(time.count()); 85 | } 86 | 87 | // Check linear behaviour 88 | // We ignore the first measurement, to avoids effects of caching 89 | for(unsigned int i = 2; i < sizes.size(); ++i) { 90 | double time_ratio = times[i] / static_cast(times[1]); 91 | EXPECT_GT(time_ratio, 0.8); 92 | EXPECT_LT(time_ratio, 1.2); 93 | } 94 | 95 | } 96 | 97 | class PerformanceTreeBuilderTest : public ::testing::Test { 98 | protected: 99 | std::default_random_engine generator; 100 | std::uniform_int_distribution distribution{0, 16}; 101 | }; 102 | 103 | TEST_F(PerformanceTreeBuilderTest, TreeBuilderScalesLinearInNumberOfDataPoints) { 104 | 105 | auto random_source = std::bind(distribution, generator); 106 | 107 | unsigned int nFeatures = 10; 108 | unsigned int nLayers = 4; 109 | 110 | std::vector sizes = {1000, 10000, 100000, 1000000, 10000000}; 111 | std::vector times; 112 | 113 | for( auto &size : sizes ) { 114 | unsigned int nDataPoints = size; 115 | std::vector row(nFeatures); 116 | std::vector binning_levels(nFeatures, 4); 117 | 118 | EventSample sample(nDataPoints, nFeatures, 0, binning_levels); 119 | for(unsigned int i = 0; i < nDataPoints; ++i) { 120 | std::generate_n(row.begin(), nFeatures, random_source); 121 | sample.AddEvent( row, 1.0, i % 2 == 0); 122 | } 123 | 124 | std::chrono::high_resolution_clock::time_point start = std::chrono::high_resolution_clock::now(); 125 | TreeBuilder dt(nLayers, sample); 126 | std::chrono::high_resolution_clock::time_point stop = std::chrono::high_resolution_clock::now(); 127 | 128 | // We check something simple, so that we are sure that the compiler cannot optimize out the binning itself 129 | const auto &purities = dt.GetPurities(); 130 | EXPECT_EQ(purities.size(), static_cast((1 << (nLayers+1)) - 1)); 131 | 132 | std::chrono::duration time = stop - start; 133 | times.push_back(time.count()); 134 | } 135 | 136 | // Check linear behaviour 137 | for(unsigned int i = 1; i < sizes.size(); ++i) { 138 | double size_ratio = sizes[i] / static_cast(sizes[0]); 139 | double time_ratio = times[i] / static_cast(times[0]); 140 | // We allow for deviation of factor two 141 | EXPECT_LT(time_ratio, size_ratio * 2.0); 142 | } 143 | 144 | 145 | } 146 | 147 | TEST_F(PerformanceTreeBuilderTest, TreeBuilderScalesLinearInNumberOfFeatures) { 148 | 149 | auto random_source = std::bind(distribution, generator); 150 | 151 | unsigned int nLayers = 4; 152 | unsigned int nDataPoints = 100000; 153 | 154 | std::vector sizes = {1, 2, 4, 8, 16, 32, 64, 128, 256, 512}; 155 | std::vector times; 156 | 157 | for( auto &size : sizes ) { 158 | unsigned int nFeatures = size; 
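// Each feature uses 4 binning levels (2^4 = 16 bins), and every event is
// filled with uniformly distributed random bin indices, so only the number
// of features varies between measurements.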
159 | std::vector row(nFeatures); 160 | std::vector binning_levels(nFeatures, 4); 161 | 162 | EventSample sample(nDataPoints, nFeatures, 0, binning_levels); 163 | for(unsigned int i = 0; i < nDataPoints; ++i) { 164 | std::generate_n(row.begin(), nFeatures, random_source); 165 | sample.AddEvent( row, 1.0, i % 2 == 0); 166 | } 167 | 168 | std::chrono::high_resolution_clock::time_point start = std::chrono::high_resolution_clock::now(); 169 | TreeBuilder dt(nLayers, sample); 170 | std::chrono::high_resolution_clock::time_point stop = std::chrono::high_resolution_clock::now(); 171 | 172 | // We check something simple, so that we are sure that the compiler cannot optimize out the binning itself 173 | const auto &purities = dt.GetPurities(); 174 | EXPECT_EQ(purities.size(), static_cast((1 << (nLayers+1)) - 1)); 175 | 176 | std::chrono::duration time = stop - start; 177 | times.push_back(time.count()); 178 | } 179 | 180 | // Check linear behaviour 181 | // We ignore the first measurement, to avoids effects of caching 182 | for(unsigned int i = 2; i < sizes.size(); ++i) { 183 | double size_ratio = sizes[i] / static_cast(sizes[1]); 184 | double time_ratio = times[i] / static_cast(times[1]); 185 | // We allow for deviation of factor two 186 | EXPECT_LT(time_ratio, size_ratio * 2.0); 187 | } 188 | } 189 | 190 | 191 | TEST_F(PerformanceTreeBuilderTest, TreeBuilderScalesLinearForSmallNumberOfLayers) { 192 | 193 | // For small numbers of layers (below 10) we should scale linear, 194 | // above the number of nodes in the deeper layers of the tree gets in the same order 195 | // of magnitude as the number of data_points and the summing of the histograms 196 | // becomes important 197 | auto random_source = std::bind(distribution, generator); 198 | 199 | unsigned int nFeatures = 10; 200 | unsigned int nDataPoints = 100000; 201 | 202 | std::vector sizes = {1, 2, 3, 5, 7, 11, 13}; 203 | std::vector times; 204 | 205 | std::vector row(nFeatures); 206 | std::vector binning_levels(nFeatures, 4); 207 | EventSample sample(nDataPoints, nFeatures, 0, binning_levels); 208 | for(unsigned int i = 0; i < nDataPoints; ++i) { 209 | std::generate_n(row.begin(), nFeatures, random_source); 210 | sample.AddEvent( row, 1.0, i % 2 == 0); 211 | } 212 | 213 | for( auto &size : sizes ) { 214 | unsigned int nLayers = size; 215 | 216 | // Reset flags, so we can use the sample multiple times 217 | auto &flags = sample.GetFlags(); 218 | for(unsigned int iEvent = 0; iEvent < nDataPoints; ++iEvent) 219 | flags.Set(iEvent, 1); 220 | 221 | std::chrono::high_resolution_clock::time_point start = std::chrono::high_resolution_clock::now(); 222 | TreeBuilder dt(nLayers, sample); 223 | std::chrono::high_resolution_clock::time_point stop = std::chrono::high_resolution_clock::now(); 224 | 225 | // We check something simple, so that we are sure that the compiler cannot optimize out the binning itself 226 | const auto &purities = dt.GetPurities(); 227 | EXPECT_EQ(purities.size(), static_cast((1 << (nLayers+1)) - 1)); 228 | 229 | std::chrono::duration time = stop - start; 230 | times.push_back(time.count()); 231 | } 232 | 233 | // Check linear behaviour 234 | // We ignore the first measurement, to avoids effects of caching 235 | for(unsigned int i = 2; i < sizes.size(); ++i) { 236 | double size_ratio = sizes[i] / static_cast(sizes[1]); 237 | double time_ratio = times[i] / static_cast(times[1]); 238 | // We allow for deviation of factor two 239 | EXPECT_LT(time_ratio, size_ratio * 2.0); 240 | } 241 | } 242 | 
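The same kind of scaling check can be sketched from Python against the PyFastBDT interface. The snippet below is only an illustrative sketch under assumptions not in the source (PyFastBDT built and importable, synthetic uniform data, a single noisy wall-clock measurement per size); it mirrors the factor-of-two tolerance used by the C++ performance tests above:

import numpy as np
from timeit import default_timer as timer
from PyFastBDT import FastBDT

sizes = [1000, 10000, 100000]
times = []
for n in sizes:
    # Synthetic data: 10 uniform features, random binary labels
    X = np.random.rand(n, 10)
    y = np.random.rand(n) > 0.5
    start = timer()
    FastBDT.Classifier(nTrees=10, depth=4).fit(X=X, y=y)
    times.append(timer() - start)

# Training time should grow roughly linearly with the number of events;
# allow the same factor-of-two slack as the C++ performance tests.
for i in range(1, len(sizes)):
    assert times[i] / times[0] < 2.0 * sizes[i] / sizes[0]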
-------------------------------------------------------------------------------- /examples/performance.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # A Python version of the performance measurement script 4 | # I didn't use this in the paper 5 | 6 | import sys 7 | sys.path.append('../FastBDT/python') 8 | sys.path.append('../xgboost/python') 9 | import numpy as np 10 | from sklearn.ensemble import GradientBoostingClassifier 11 | import FastBDT 12 | import pickle 13 | import scipy.sparse 14 | import xgboost as xgb 15 | import ROOT 16 | from ROOT import TMVA 17 | ROOT.TMVA.Tools.Instance() 18 | import array 19 | 20 | from timeit import default_timer as timer 21 | 22 | class Data(object): 23 | def __init__(self, datafile, numberOfFeatures, numberOfEvents): 24 | data = np.loadtxt(datafile, skiprows=1, dtype=np.float64) 25 | self.numberOfFeatures = numberOfFeatures 26 | self.numberOfEvents = numberOfEvents 27 | self.X = data[:numberOfEvents, :numberOfFeatures].astype(np.float64) 28 | self.y = data[:numberOfEvents, -1].astype(np.uint32) 29 | 30 | 31 | class Config(object): 32 | def __init__(self, numberOfFeatures, numberOfEvents, nTrees, depth, shrinkage, subSampling, nCutLevels): 33 | self.numberOfFeatures = numberOfFeatures 34 | self.numberOfEvents = numberOfEvents 35 | self.nTrees = nTrees 36 | self.depth = depth 37 | self.shrinkage = shrinkage 38 | self.subSampling = subSampling 39 | self.nCutLevels = nCutLevels 40 | 41 | 42 | class Result(object): 43 | def __init__(self, label, probabilities, preprocessingTime, trainingTime, testTime): 44 | self.label = label 45 | self.probabilities = probabilities 46 | self.preprocessingTime = preprocessingTime 47 | self.trainingTime = trainingTime 48 | self.testTime = testTime 49 | 50 | 51 | def writeResults(filename, results, test, config): 52 | with open(filename, 'w') as f: 53 | f.write("{c.nTrees} {c.depth} {c.shrinkage} {c.subSampling} {c.nCutLevels} {c.numberOfFeatures} {c.numberOfEvents}\n".format(c=config)) 54 | f.write(" ".join(r.label for r in results) + "\n") 55 | f.write("PreprocessingTime: " + " ".join(str(r.preprocessingTime) for r in results) + "\n") 56 | f.write("TrainingTime: " + " ".join(str(r.trainingTime) for r in results) + "\n") 57 | f.write("TestTime: " + " ".join(str(r.testTime) for r in results) + "\n") 58 | 59 | for i in range(len(test.y)): 60 | f.write(" ".join(str(r.probabilities[i]) for r in results) + " " + str(test.y[i]) + "\n") 61 | 62 | 63 | def measureFastBDT(train, test, config): 64 | preprocessing_start = timer() 65 | preprocessing_stop = timer() 66 | preprocessingTime = preprocessing_stop - preprocessing_start 67 | print('PreprocessingTime', preprocessingTime) 68 | 69 | training_start = timer() 70 | forest = FastBDT.Classifier(config.nCutLevels, config.nTrees, config.depth, config.shrinkage, config.subSampling) 71 | forest.fit(train.X, train.y) 72 | training_stop = timer() 73 | trainingTime = training_stop - training_start 74 | print('TrainingTime', trainingTime) 75 | 76 | test_start = timer() 77 | probabilities = forest.predict(test.X) 78 | test_stop = timer() 79 | testTime = test_stop - test_start 80 | print('TestTime', testTime) 81 | return Result("FastBDT", probabilities, preprocessingTime, trainingTime, testTime) 82 | 83 | 84 | def measureSKLearn(train, test, config): 85 | preprocessing_start = timer() 86 | preprocessing_stop = timer() 87 | preprocessingTime = preprocessing_stop - preprocessing_start 88 | print('PreprocessingTime',
preprocessingTime) 89 | 90 | training_start = timer() 91 | forest = GradientBoostingClassifier(n_estimators=config.nTrees, learning_rate=config.shrinkage, max_depth=config.depth, random_state=0, subsample=config.subSampling) 92 | forest.fit(train.X, train.y) 93 | training_stop = timer() 94 | trainingTime = training_stop - training_start 95 | print('TrainingTime', trainingTime) 96 | 97 | test_start = timer() 98 | probabilities = forest.predict_proba(test.X)[:, 1] 99 | test_stop = timer() 100 | testTime = test_stop - test_start 101 | print('TestTime', testTime) 102 | return Result("SKLearn", probabilities, preprocessingTime, trainingTime, testTime) 103 | 104 | 105 | def measureXGBoost(train, test, config): 106 | preprocessing_start = timer() 107 | dtrain = xgb.DMatrix(train.X, label=train.y) 108 | dtest = xgb.DMatrix(test.X, label=test.y) 109 | preprocessing_stop = timer() 110 | preprocessingTime = preprocessing_stop - preprocessing_start 111 | print('PreprocessingTime', preprocessingTime) 112 | 113 | training_start = timer() 114 | param = {'max_depth':config.depth, 'eta':config.shrinkage, 'silent':1, 'objective':'binary:logistic', 'subsample': config.subSampling, 'nthread': 1} 115 | watchlist = [(dtrain,'train')] 116 | bst = xgb.train(param, dtrain, config.nTrees, watchlist) 117 | training_stop = timer() 118 | trainingTime = training_stop - training_start 119 | print('TrainingTime', trainingTime) 120 | 121 | test_start = timer() 122 | probabilities = bst.predict(dtest) 123 | test_stop = timer() 124 | testTime = test_stop - test_start 125 | print('TestTime', testTime) 126 | return Result("XGBoost", probabilities, preprocessingTime, trainingTime, testTime) 127 | 128 | 129 | def measureTMVA(train, test, config): 130 | preprocessing_start = timer() 131 | variables = ['index', 'chiProb', 'M', 'dr', 'dz', 'E', 'p', 'pz', 'pt', 'Kid', 'piid', 'Kz', 'piz', 'Kr', 'pir', 'Kz0', 'piz0', 'pi0M', 132 | 'gamma1E', 'gamma2E', 'pipi0M', 'KpiM', 'Kpi0M', 'errM', 'KpCMS', 'pipCMS', 'pi0pCMS', 'distance', 'gamma1clusterTiming', 133 | 'gamma2clusterTiming', 'gamma1E9E25', 'gamma2E9E25', 'nTracks', 'nECLClusters', 'nKLMClusters'] 134 | variables = variables[:config.numberOfFeatures] 135 | 136 | outputFile = ROOT.TFile("temp.root", "recreate") 137 | train_tree = ROOT.TTree("train_tree", "Training Tree") 138 | test_tree = ROOT.TTree("test_tree", "Test Tree") 139 | 140 | register = {v: array.array('f', [0]) for v in variables + ['isSignal']} 141 | for v in variables + ['isSignal']: 142 | train_tree.Branch(v, register[v], v + '/F') 143 | test_tree.Branch(v, register[v], v + '/F') 144 | 145 | for i, row in enumerate(train.X): 146 | for j, v in enumerate(variables): 147 | register[v][0] = row[j] 148 | register['isSignal'][0] = float(train.y[i]) 149 | train_tree.Fill() 150 | 151 | for i, row in enumerate(test.X): 152 | for j, v in enumerate(variables): 153 | register[v][0] = row[j] 154 | register['isSignal'][0] = float(test.y[i]) 155 | test_tree.Fill() 156 | preprocessing_stop = timer() 157 | preprocessingTime = preprocessing_stop - preprocessing_start 158 | print('PreprocessingTime', preprocessingTime) 159 | 160 | training_start = timer() 161 | factory = TMVA.Factory( "TMVAClassification", outputFile, 162 | "!V:Silent:Color:DrawProgressBar:Transformations=I;D;P;G,D:AnalysisType=Classification" ) 163 | factory.SetVerbose(False) 164 | for v in variables: 165 | factory.AddVariable(v, v, "", 'F') 166 | factory.SetInputTrees(train_tree, ROOT.TCut("isSignal == 1"), ROOT.TCut("isSignal == 0")) 167 | nsig =
np.sum(train.y) 168 | nbkg = np.sum(1 - train.y) 169 | factory.PrepareTrainingAndTestTree(ROOT.TCut(""), "nTrain_Signal={}:nTrain_Background={}:SplitMode=Block:NormMode=NumEvents:!V".format(nsig, nbkg) ) 170 | 171 | factory.BookMethod( TMVA.Types.kBDT, "BDTG", 172 | "!H:!V:NTrees={}:BoostType=Grad:Shrinkage={:.2f}:UseBaggedBoost:BaggedSampleFraction={:.2f}:nCuts={}:MaxDepth={}:IgnoreNegWeightsInTraining".format(config.nTrees, config.shrinkage, config.subSampling, 2**config.nCutLevels, config.depth) ) 173 | factory.TrainAllMethods() 174 | reader = ROOT.TMVA.Reader() 175 | reader.SetVerbose(False) 176 | for v in variables: 177 | reader.AddVariable(v, register[v]) 178 | reader.BookMVA("BDTG","weights/TMVAClassification_BDTG.weights.xml") 179 | training_stop = timer() 180 | trainingTime = training_stop - training_start 181 | print('TrainingTime', trainingTime) 182 | 183 | test_start = timer() 184 | probabilities = np.zeros(len(test.y)) 185 | for i in range(test_tree.GetEntries()): 186 | test_tree.GetEvent(i) 187 | probabilities[i] = reader.EvaluateMVA("BDTG") 188 | test_stop = timer() 189 | testTime = test_stop - test_start 190 | print('TestTime', testTime) 191 | 192 | return Result("TMVA", probabilities, preprocessingTime, trainingTime, testTime) 193 | 194 | 195 | i = 0 196 | def measure(config): 197 | global i 198 | load_start = timer() 199 | train = Data('data/train.csv', config.numberOfFeatures, config.numberOfEvents) 200 | test = Data('data/test.csv', config.numberOfFeatures, config.numberOfEvents) 201 | load_stop = timer() 202 | print('Load', load_stop - load_start) 203 | 204 | start = timer() 205 | resultTMVA = measureTMVA(train, test, config) 206 | stop = timer() 207 | print('measureTMVA', stop - start) 208 | 209 | start = timer() 210 | resultFastBDT = measureFastBDT(train, test, config) 211 | stop = timer() 212 | print('measureFastBDT', stop - start) 213 | 214 | start = timer() 215 | resultSKLearn = measureSKLearn(train, test, config) 216 | stop = timer() 217 | print('measureSKLearn', stop - start) 218 | 219 | start = timer() 220 | resultXGBoost = measureXGBoost(train, test, config) 221 | stop = timer() 222 | print('measureXGBoost', stop - start) 223 | 224 | i += 1 225 | writeResults('result_{}_python.txt'.format(i), [resultFastBDT, resultXGBoost, resultSKLearn, resultTMVA], test, config) 226 | 227 | 228 | if __name__ == '__main__': 229 | 230 | config = Config(numberOfFeatures=35, numberOfEvents=50000, nTrees=100, shrinkage=0.1, depth=3, nCutLevels=8, subSampling=0.5) 231 | for i in range(35, 36): 232 | config.numberOfFeatures = i 233 | measure(config) 234 | 235 | -------------------------------------------------------------------------------- /PyFastBDT/FastBDT.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os 4 | import numpy as np 5 | 6 | import ctypes 7 | import ctypes.util 8 | c_double_p = ctypes.POINTER(ctypes.c_double) 9 | c_float_p = ctypes.POINTER(ctypes.c_float) 10 | c_bool_p = ctypes.POINTER(ctypes.c_bool) 11 | c_uint_p = ctypes.POINTER(ctypes.c_uint) 12 | 13 | FastBDT_library = ctypes.cdll.LoadLibrary(os.path.join(os.path.dirname(__file__),'libFastBDT_CInterface.so')) 14 | 15 | FastBDT_library.Create.restype = ctypes.c_void_p 16 | FastBDT_library.Delete.argtypes = [ctypes.c_void_p] 17 | 18 | FastBDT_library.Load.argtypes = [ctypes.c_void_p, ctypes.c_char_p] 19 | FastBDT_library.Save.argtypes = [ctypes.c_void_p, ctypes.c_char_p] 20 | 21 | FastBDT_library.Fit.argtypes = [ctypes.c_void_p, c_float_p,
c_float_p, c_bool_p, ctypes.c_uint, ctypes.c_uint] 22 | 23 | FastBDT_library.Predict.argtypes = [ctypes.c_void_p, c_float_p] 24 | FastBDT_library.Predict.restype = ctypes.c_float 25 | 26 | FastBDT_library.PredictArray.argtypes = [ctypes.c_void_p, c_float_p, c_float_p, ctypes.c_uint] 27 | 28 | FastBDT_library.SetSubsample.argtypes = [ctypes.c_void_p, ctypes.c_double] 29 | FastBDT_library.GetSubsample.argtypes = [ctypes.c_void_p] 30 | FastBDT_library.GetSubsample.restype = ctypes.c_double 31 | 32 | FastBDT_library.SetShrinkage.argtypes = [ctypes.c_void_p, ctypes.c_double] 33 | FastBDT_library.GetShrinkage.argtypes = [ctypes.c_void_p] 34 | FastBDT_library.GetShrinkage.restype = ctypes.c_double 35 | 36 | FastBDT_library.SetFlatnessLoss.argtypes = [ctypes.c_void_p, ctypes.c_double] 37 | FastBDT_library.GetFlatnessLoss.argtypes = [ctypes.c_void_p] 38 | FastBDT_library.GetFlatnessLoss.restype = ctypes.c_double 39 | 40 | FastBDT_library.SetNTrees.argtypes = [ctypes.c_void_p, ctypes.c_uint] 41 | FastBDT_library.GetNTrees.argtypes = [ctypes.c_void_p] 42 | FastBDT_library.GetNTrees.restype = ctypes.c_uint 43 | 44 | FastBDT_library.SetNumberOfFlatnessFeatures.argtypes = [ctypes.c_void_p, ctypes.c_uint] 45 | FastBDT_library.GetNumberOfFlatnessFeatures.argtypes = [ctypes.c_void_p] 46 | FastBDT_library.GetNumberOfFlatnessFeatures.restype = ctypes.c_uint 47 | 48 | FastBDT_library.SetBinning.argtypes = [ctypes.c_void_p, c_uint_p, ctypes.c_uint] 49 | FastBDT_library.SetPurityTransformation.argtypes = [ctypes.c_void_p, c_uint_p, ctypes.c_uint] 50 | 51 | FastBDT_library.SetDepth.argtypes = [ctypes.c_void_p, ctypes.c_uint] 52 | FastBDT_library.GetDepth.argtypes = [ctypes.c_void_p] 53 | FastBDT_library.GetDepth.restype = ctypes.c_uint 54 | 55 | FastBDT_library.SetTransform2Probability.argtypes = [ctypes.c_void_p, ctypes.c_bool] 56 | FastBDT_library.GetTransform2Probability.argtypes = [ctypes.c_void_p] 57 | FastBDT_library.GetTransform2Probability.restype = ctypes.c_bool 58 | 59 | FastBDT_library.SetSPlot.argtypes = [ctypes.c_void_p, ctypes.c_bool] 60 | FastBDT_library.GetSPlot.argtypes = [ctypes.c_void_p] 61 | FastBDT_library.GetSPlot.restype = ctypes.c_bool 62 | 63 | 64 | FastBDT_library.GetVariableRanking.argtypes = [ctypes.c_void_p] 65 | FastBDT_library.GetVariableRanking.restype = ctypes.c_void_p 66 | FastBDT_library.DeleteVariableRanking.argtypes = [ctypes.c_void_p] 67 | FastBDT_library.ExtractNumberOfVariablesFromVariableRanking.argtypes = [ctypes.c_void_p] 68 | FastBDT_library.ExtractNumberOfVariablesFromVariableRanking.restype = ctypes.c_uint 69 | FastBDT_library.ExtractImportanceOfVariableFromVariableRanking.argtypes = [ctypes.c_void_p, ctypes.c_uint] 70 | FastBDT_library.ExtractImportanceOfVariableFromVariableRanking.restype = ctypes.c_double 71 | 72 | FastBDT_library.GetIndividualVariableRanking.argtypes = [ctypes.c_void_p, c_float_p] 73 | FastBDT_library.GetIndividualVariableRanking.restype = ctypes.c_void_p 74 | 75 | 76 | def PrintVersion(): 77 | FastBDT_library.PrintVersion() 78 | 79 | 80 | def calculate_roc_auc(p, t, w=None): 81 | """ 82 | Calculates the area under the receiver operating characteristic curve (AUC ROC) 83 | @param p np.array filled with the probability output of a classifier 84 | @param t np.array filled with the target (0 or 1) 85 | """ 86 | if w is None: 87 | w = np.ones(len(t)) 88 | N = w.sum() 89 | T = np.sum(t*w) 90 | t = t*w 91 | index = np.argsort(p) 92 | efficiency = (T - np.cumsum(t[index])) / float(T) 93 | purity = (T - np.cumsum(t[index])) / (N - np.cumsum(w)) 94 | purity =
np.where(np.isnan(purity), 0, purity) 95 | return np.abs(np.trapz(purity, efficiency)) 96 | 97 | 98 | class Classifier(object): 99 | def __init__(self, binning=[], nTrees=100, depth=3, shrinkage=0.1, subsample=0.5, transform2probability=True, purityTransformation=[], sPlot=False, flatnessLoss=-1.0, numberOfFlatnessFeatures=0): 100 | """ 101 | @param binning list of numbers with the power N used for each feature binning e.g. 8 means 2^8 bins 102 | @param nTrees number of trees 103 | @param shrinkage reduction factor of each tree, lower shrinkage leads to slower but more stable convergence 104 | @param subsample the ratio of samples used for each tree 105 | @param transform2probability whether to transform the output to a probability 106 | @param purityTransformation list of bools, defines for each feature whether a purity-transformed copy of the feature should be used in addition (this will slow down the inference) 107 | @param sPlot whether the special treatment for sPlot weights is used 108 | @param flatnessLoss if bigger than 0, the strength of the flatness boost against all flatness features 109 | @param numberOfFlatnessFeatures the number of flatness features, it is assumed that the last N features are the flatness features 110 | """ 111 | self.binning = binning 112 | self.nTrees = nTrees 113 | self.depth = depth 114 | self.shrinkage = shrinkage 115 | self.subsample = subsample 116 | self.transform2probability = transform2probability 117 | self.purityTransformation = purityTransformation 118 | self.sPlot = sPlot 119 | self.flatnessLoss = flatnessLoss 120 | self.numberOfFlatnessFeatures = numberOfFlatnessFeatures 121 | self.forest = self.create_forest() 122 | 123 | def create_forest(self): 124 | forest = FastBDT_library.Create() 125 | FastBDT_library.SetBinning(forest, np.array(self.binning, dtype=np.uint32).ctypes.data_as(c_uint_p), int(len(self.binning))) 126 | FastBDT_library.SetNTrees(forest, int(self.nTrees)) 127 | FastBDT_library.SetDepth(forest, int(self.depth)) 128 | FastBDT_library.SetNumberOfFlatnessFeatures(forest, int(self.numberOfFlatnessFeatures)) 129 | FastBDT_library.SetShrinkage(forest, float(self.shrinkage)) 130 | FastBDT_library.SetSubsample(forest, float(self.subsample)) 131 | FastBDT_library.SetFlatnessLoss(forest, float(self.flatnessLoss)) 132 | FastBDT_library.SetTransform2Probability(forest, bool(self.transform2probability)) 133 | FastBDT_library.SetSPlot(forest, bool(self.sPlot)) 134 | FastBDT_library.SetPurityTransformation(forest, np.array(self.purityTransformation, dtype=np.uint32).ctypes.data_as(c_uint_p), int(len(self.purityTransformation))) 135 | return forest 136 | 137 | def fit(self, X, y, weights=None): 138 | X_temp = np.require(X, dtype=np.float32, requirements=['A', 'W', 'C', 'O']) 139 | y_temp = np.require(y, dtype=np.bool, requirements=['A', 'W', 'C', 'O']) 140 | if weights is not None: 141 | w_temp = np.require(weights, dtype=np.float32, requirements=['A', 'W', 'C', 'O']) 142 | numberOfEvents, numberOfFeatures = X_temp.shape 143 | FastBDT_library.Fit(self.forest, X_temp.ctypes.data_as(c_float_p), 144 | w_temp.ctypes.data_as(c_float_p) if weights is not None else None, 145 | y_temp.ctypes.data_as(c_bool_p), int(numberOfEvents), int(numberOfFeatures)) 146 | return self 147 | 148 | def predict(self, X): 149 | X_temp = np.require(X, dtype=np.float32, requirements=['A', 'W', 'C', 'O']) 150 | N = len(X) 151 | p = np.require(np.zeros(N), dtype=np.float32, requirements=['A', 'W', 'C', 'O']) 152 | FastBDT_library.PredictArray(self.forest, X_temp.ctypes.data_as(c_float_p), p.ctypes.data_as(c_float_p), int(X_temp.shape[0])) 153 |
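        # PredictArray fills p in place: X_temp is guaranteed C-contiguous
        # float32 by np.require above, so row i of the flat buffer is event i
        # and p[i] is its classifier response.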
return p 154 | 155 | def predict_single(self, row): 156 | return FastBDT_library.Predict(self.forest, row.ctypes.data_as(c_float_p)) 157 | 158 | def save(self, weightfile): 159 | FastBDT_library.Save(self.forest, bytes(weightfile, 'utf-8')) 160 | 161 | def load(self, weightfile): 162 | FastBDT_library.Load(self.forest, bytes(weightfile, 'utf-8')) 163 | 164 | def individualFeatureImportance(self, X): 165 | X_temp = np.require(X, dtype=np.float32, requirements=['A', 'W', 'C', 'O']) 166 | _ranking = FastBDT_library.GetIndividualVariableRanking(self.forest, X_temp.ctypes.data_as(c_float_p)) 167 | ranking = dict() 168 | for i in range(FastBDT_library.ExtractNumberOfVariablesFromVariableRanking(_ranking)): 169 | ranking[i] = FastBDT_library.ExtractImportanceOfVariableFromVariableRanking(_ranking, int(i)) 170 | FastBDT_library.DeleteVariableRanking(_ranking) 171 | return ranking 172 | 173 | def internFeatureImportance(self): 174 | _ranking = FastBDT_library.GetVariableRanking(self.forest) 175 | ranking = dict() 176 | for i in range(FastBDT_library.ExtractNumberOfVariablesFromVariableRanking(_ranking)): 177 | ranking[i] = FastBDT_library.ExtractImportanceOfVariableFromVariableRanking(_ranking, int(i)) 178 | FastBDT_library.DeleteVariableRanking(_ranking) 179 | return ranking 180 | 181 | def externFeatureImportance(self, X, y, weights=None, X_test=None, y_test=None, weights_test=None): 182 | if X_test is None: 183 | X_test = X 184 | if y_test is None: 185 | y_test = y 186 | if weights_test is None: 187 | weights_test = weights 188 | numberOfEvents, numberOfFeatures = X.shape 189 | global_auc = calculate_roc_auc(self.predict(X_test), y_test, weights_test) 190 | forest = self.forest 191 | importances = self._externFeatureImportance(list(range(numberOfFeatures)), global_auc, X, y, weights, X_test, y_test, weights_test) 192 | self.forest = forest 193 | return importances 194 | 195 | def _externFeatureImportance(self, features, global_auc, X, y, weights, X_test, y_test, weights_test): 196 | importances = dict() 197 | for i in features: 198 | remaining_features = [f for f in features if f != i] 199 | X_temp = X[:, remaining_features] 200 | X_test_temp = X_test[:, remaining_features] 201 | self.forest = self.create_forest() 202 | self.fit(X_temp, y, weights) 203 | auc = calculate_roc_auc(self.predict(X_test_temp), y_test, weights_test) 204 | FastBDT_library.Delete(self.forest) 205 | importances[i] = global_auc - auc 206 | 207 | most_important = max(importances.keys(), key=lambda x: importances[x]) 208 | remaining_features = [v for v in features if v != most_important] 209 | if len(remaining_features) == 1: 210 | return importances 211 | 212 | importances = {most_important: importances[most_important]} 213 | rest = self._externFeatureImportance(remaining_features, global_auc - importances[most_important], X, y, weights, X_test, y_test, weights_test) 214 | importances.update(rest) 215 | return importances 216 | 217 | def __del__(self): 218 | FastBDT_library.Delete(self.forest) 219 | -------------------------------------------------------------------------------- /examples/splot.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('../python/') 3 | import FastBDT 4 | 5 | import matplotlib.pyplot as plt 6 | import matplotlib as mpl 7 | import numpy 8 | import numpy.linalg 9 | import pandas 10 | import seaborn 11 | import sklearn.metrics 12 | 13 | 14 | class Prior(object): 15 | def __init__(self, signal, bckgrd): 16 | self.signal_cdf, 
self.signal_pdf, self.signal_bins = calculate_cdf_and_pdf(signal)
17 |         self.bckgrd_cdf, self.bckgrd_pdf, self.bckgrd_bins = calculate_cdf_and_pdf(bckgrd)
18 |         # Avoid numerical instabilities
19 |         self.bckgrd_pdf[0] = self.bckgrd_pdf[-1] = 1
20 |         self.signal_yield = len(signal)
21 |         self.bckgrd_yield = len(bckgrd)
22 | 
23 |     def get_signal_pdf(self, X):
24 |         return self.signal_pdf[numpy.digitize(X, bins=self.signal_bins)]
25 | 
26 |     def get_bckgrd_pdf(self, X):
27 |         return self.bckgrd_pdf[numpy.digitize(X, bins=self.bckgrd_bins)]
28 | 
29 |     def get_signal_cdf(self, X):
30 |         return self.signal_cdf[numpy.digitize(X, bins=self.signal_bins)]
31 | 
32 |     def get_bckgrd_cdf(self, X):
33 |         return self.bckgrd_cdf[numpy.digitize(X, bins=self.bckgrd_bins)]
34 | 
35 |     def get_prior(self, X):
36 |         return self.get_signal_pdf(X) / (self.get_signal_pdf(X) + self.get_bckgrd_pdf(X))
37 | 
38 |     def get_signal_boost_weights(self, X):
39 |         return self.get_signal_cdf(X) / self.get_bckgrd_pdf(X)
40 | 
41 |     def get_bckgrd_boost_weights(self, X):
42 |         # NOT self.get_bckgrd_cdf() here, signal and background are handled asymmetrically!
43 |         return (1.0 - self.get_signal_cdf(X)) / self.get_bckgrd_pdf(X)
44 | 
45 |     def get_boost_weights(self, X):
46 |         return numpy.r_[self.get_signal_boost_weights(X), self.get_bckgrd_boost_weights(X)]
47 | 
48 |     def get_splot_weights(self, X):
49 |         pdfs = [self.get_signal_pdf(X), self.get_bckgrd_pdf(X)]
50 |         yields = [self.signal_yield, self.bckgrd_yield]
51 |         weights = calculate_splot_weights(pdfs, yields)
52 |         return numpy.r_[weights[0], weights[1]]
53 | 
54 |     def get_aplot_weights(self, X, boost_prediction):
55 |         reg_boost_prediction = boost_prediction * 0.99 + 0.005
56 |         weights = (self.get_signal_cdf(X) / reg_boost_prediction + (1.0 - self.get_signal_cdf(X)) / (1.0 - reg_boost_prediction)) / 2
57 |         return self.get_splot_weights(X) * numpy.r_[weights, weights]
58 | 
59 | 
60 | def calculate_cdf_and_pdf(X):
61 |     """
62 |     Calculates cdf and pdf of a given sample and adds under/overflow bins
63 |     @param X 1-d numpy.array
64 |     """
65 |     pdf, bins = numpy.histogram(X, bins=100, density=True)
66 |     cdf = numpy.cumsum(pdf * (bins - numpy.roll(bins, 1))[1:])
67 |     return numpy.hstack([0.0, cdf, 1.0]), numpy.hstack([0.0, pdf, 0.0]), bins
68 | 
69 | 
70 | def calculate_splot_weights(pdfs, yields):
71 |     """
72 |     Calculates sPlot weights using the pdfs
73 |     @param pdfs list of 1-d numpy.array with pdf values of the different components for each event
74 |     @param yields list of the yields of the different components
75 |     """
76 |     N_components = len(pdfs)
77 |     # Consistency checks
78 |     if N_components != len(yields):
79 |         raise RuntimeError("You have to provide the same number of pdfs and yields!")
80 |     if N_components < 2:
81 |         raise RuntimeError("Need at least two components!")
82 | 
83 |     # Calculate covariance matrix
84 |     inverse_covariance = numpy.zeros((N_components, N_components))
85 |     norm = sum((yields[k] * pdfs[k] for k in range(1, N_components)), yields[0] * pdfs[0])**2
86 |     for i in range(N_components):
87 |         for j in range(N_components):
88 |             inverse_covariance[i, j] = numpy.nansum(pdfs[i] * pdfs[j] / norm)
89 |     covariance = numpy.linalg.inv(inverse_covariance)
90 |     print(inverse_covariance)
91 |     print(covariance)
92 | 
93 |     # Return list of sPlot weights for each component
94 |     return [sum(covariance[n, k] * pdfs[k] for k in range(N_components)) /
95 |             sum(yields[k] * pdfs[k] for k in range(N_components)) for n in range(N_components)]
96 | 
97 | 
98 | def calculate_score(label, train_prediction,
test_prediction, train_truth, test_truth): 99 | train_fpr, train_tpr, train_thresholds = sklearn.metrics.roc_curve(train_truth, train_prediction) 100 | train_auc = sklearn.metrics.auc(train_fpr, train_tpr) 101 | #plt.plot(train_fpr, train_tpr, label=label + ' (Train) ROC Integral = {:.3}'.format(train_auc)) 102 | test_fpr, test_tpr, test_thresholds = sklearn.metrics.roc_curve(test_truth, test_prediction) 103 | test_auc = sklearn.metrics.auc(test_fpr, test_tpr) 104 | plt.plot(test_fpr, test_tpr, lw=4, label=label + ' ROC Integral = {:.3}'.format(test_auc)) 105 | #plt.legend() 106 | #plt.show() 107 | return train_auc, test_auc 108 | 109 | 110 | def combine_probabilities(p1, p2): 111 | return p1*p2 / (p1*p2 + (1-p1)*(1-p2)) 112 | 113 | 114 | if __name__ == '__main__': 115 | train_datafile = '../files/D0_2.txt' 116 | data = pandas.DataFrame.from_csv(train_datafile, sep=' ', index_col=None) 117 | df = data[data['distance'] < 0.1] 118 | N = len(df) // 2 119 | train_df = df.iloc[:N] 120 | print("Length training data", len(train_df)) 121 | test_df = df.iloc[N:] 122 | print("Length test data", len(test_df)) 123 | #keys = ['dM', 'chiProb', 'distance', 'gamma1E', 'gamma2E', 'gamma1clusterTiming', 'gamma2clusterTiming', 'gamma1E9E25', 'gamma2E9E25', 'nTracks'] 124 | keys = ['dM', 'Kpi0M', 'KpiM', 'chiProb', 'distance', 'gamma1E', 'gamma2E', 'gamma1clusterTiming', 'gamma2clusterTiming', 'gamma1E9E25', 'gamma2E9E25', 'nTracks'] 125 | #keys = ['dM', 'Kpi0M', 'KpiM', 'chiProb', 'distance', 'gamma1E', 'gamma2E', 'gamma1clusterTiming', 'gamma2clusterTiming', 'gamma1E9E25', 'gamma2E9E25', 'nTracks', 'dMBestCandidate'] 126 | 127 | signal = train_df[train_df.isSignal == 1][keys[0]].values 128 | bckgrd = train_df[train_df.isSignal == 0][keys[0]].values 129 | prior = Prior(signal, bckgrd) 130 | splot_weights = calculate_splot_weights([prior.get_signal_pdf(train_df[keys[0]].values), prior.get_bckgrd_pdf(train_df[keys[0]].values)], [len(signal), len(bckgrd)]) 131 | 132 | full_forest = FastBDT.Classifier().fit(X=train_df[keys].values, 133 | y=train_df['isSignal'].values) 134 | 135 | ordinary_forest = FastBDT.Classifier().fit(X=train_df[keys[1:]].values, 136 | y=train_df['isSignal'].values) 137 | 138 | splot_forest = FastBDT.Classifier().fit(X=numpy.r_[train_df[keys[1:]].values, train_df[keys[1:]].values], 139 | y=numpy.r_[numpy.ones(N), numpy.zeros(N)], 140 | weights=prior.get_splot_weights(train_df[keys[0]].values)) 141 | 142 | boost_forest = FastBDT.Classifier().fit(X=numpy.r_[train_df[keys[1:]].values, train_df[keys[1:]].values], 143 | y=numpy.r_[numpy.ones(N), numpy.zeros(N)], 144 | weights=prior.get_boost_weights(train_df[keys[0]].values)) 145 | aplot_forest = FastBDT.Classifier().fit(X=numpy.r_[train_df[keys[1:]].values, train_df[keys[1:]].values], 146 | y=numpy.r_[numpy.ones(N), numpy.zeros(N)], 147 | weights=prior.get_aplot_weights(train_df[keys[0]].values, boost_forest.predict(train_df[keys[1:]].values))) 148 | 149 | 150 | # Side-Band Subtraction 151 | signal_region = (train_df.dM.abs() < 0.05) 152 | neg_signal_region = (0.24 < train_df.dM.abs()) & (train_df.dM.abs() < 0.2849) 153 | bckgrd_region = (0.258 < train_df.dM.abs()) & (train_df.dM.abs() < 0.2746) 154 | print("SignalRegion:", "Signal", (signal_region & (train_df.isSignal == 1)).sum(), "Bckgrd", (signal_region & (train_df.isSignal == 0)).sum()) 155 | print("BckgrdRegion:", "Signal", (bckgrd_region & (train_df.isSignal == 1)).sum(), "Bckgrd", (bckgrd_region & (train_df.isSignal == 0)).sum()) 156 | print("NegSignalRegion:", "Signal", 
(neg_signal_region & (train_df.isSignal == 1)).sum(), "Bckgrd", (neg_signal_region & (train_df.isSignal == 0)).sum()) 157 | 158 | side_forest = FastBDT.Classifier().fit(X=numpy.r_[train_df[signal_region][keys[1:]].values, train_df[bckgrd_region][keys[1:]].values, train_df[neg_signal_region][keys[1:]].values], 159 | y=numpy.r_[numpy.ones(signal_region.sum()), numpy.zeros(bckgrd_region.sum()), numpy.ones(neg_signal_region.sum())], 160 | weights=numpy.r_[numpy.ones(signal_region.sum()), numpy.ones(bckgrd_region.sum()), -numpy.ones(neg_signal_region.sum())]) 161 | 162 | 163 | seaborn.set(font_scale=4.5) 164 | seaborn.distplot(train_df.dM.values, bins=200, kde=False, hist_kws={'range': (-0.3, 0.3)}, label='Data') 165 | seaborn.distplot(train_df[signal_region].dM.values, bins=200, kde=False, hist_kws={'range': (-0.3, 0.3)}, label='Signal Region') 166 | seaborn.distplot(train_df[neg_signal_region].dM.values, bins=200, kde=False, hist_kws={'range': (-0.3, 0.3)}, label='Negative Signal Region') 167 | seaborn.distplot(train_df[bckgrd_region].dM.values, bins=200, kde=False, hist_kws={'range': (-0.3, 0.3)}, label='Background Region') 168 | plt.xlim((-0.3,0.3)) 169 | plt.xlabel('Reconstructed Mass - Nominal Mass') 170 | plt.legend() 171 | figure = plt.gcf() # get current figure 172 | figure.set_size_inches(24, 16) 173 | plt.savefig('sideband.png') 174 | plt.clf() 175 | 176 | seaborn.distplot(train_df.dM.values, bins=200, kde=False, hist_kws={'range': (-0.3, 0.3)}, label='Signal Fit') 177 | seaborn.distplot(train_df[train_df.isSignal == 0].dM.values, kde=False, bins=200, hist_kws={'range': (-0.3, 0.3)}, label='Background Fit') 178 | plt.xlim((-0.3,0.3)) 179 | plt.xlabel('Reconstructed Mass - Nominal Mass') 180 | plt.legend() 181 | figure = plt.gcf() # get current figure 182 | figure.set_size_inches(24, 16) 183 | plt.savefig('splot.png') 184 | plt.clf() 185 | 186 | full_prediction_train = full_forest.predict(train_df[keys].values) 187 | ordinary_prediction_train = ordinary_forest.predict(train_df[keys[1:]].values) 188 | splot_prediction_train = splot_forest.predict(train_df[keys[1:]].values) 189 | aplot_prediction_train = aplot_forest.predict(train_df[keys[1:]].values) 190 | prior_prediction_train = prior.get_prior(train_df[keys[0]].values) 191 | side_prediction_train = side_forest.predict(train_df[keys[1:]].values) 192 | ordinary_prior_prediction_train = combine_probabilities(ordinary_prediction_train, prior_prediction_train) 193 | splot_prior_prediction_train = combine_probabilities(splot_prediction_train, prior_prediction_train) 194 | aplot_prior_prediction_train = combine_probabilities(aplot_prediction_train, prior_prediction_train) 195 | side_prior_prediction_train = combine_probabilities(side_prediction_train, prior_prediction_train) 196 | truth_train = train_df['isSignal'].values 197 | 198 | full_prediction_test = full_forest.predict(test_df[keys].values) 199 | ordinary_prediction_test = ordinary_forest.predict(test_df[keys[1:]].values) 200 | splot_prediction_test = splot_forest.predict(test_df[keys[1:]].values) 201 | aplot_prediction_test = aplot_forest.predict(test_df[keys[1:]].values) 202 | prior_prediction_test = prior.get_prior(test_df[keys[0]].values) 203 | side_prediction_test = side_forest.predict(test_df[keys[1:]].values) 204 | ordinary_prior_prediction_test = combine_probabilities(ordinary_prediction_test, prior_prediction_test) 205 | splot_prior_prediction_test = combine_probabilities(splot_prediction_test, prior_prediction_test) 206 | aplot_prior_prediction_test = 
combine_probabilities(aplot_prediction_test, prior_prediction_test)
207 |     side_prior_prediction_test = combine_probabilities(side_prediction_test, prior_prediction_test)
208 |     truth_test = test_df['isSignal'].values
209 | 
210 |     seaborn.set_palette("Set1", n_colors=10, desat=.5)
211 |     trivial_prior = train_df.isSignal.mean()
212 |     #calculate_score("Trivial", numpy.ones(len(truth_train))*trivial_prior, numpy.ones(len(truth_test))*trivial_prior, truth_train, truth_test)
213 |     calculate_score("Full", full_prediction_train, full_prediction_test, truth_train, truth_test)
214 |     calculate_score("Ordinary", ordinary_prediction_train, ordinary_prediction_test, truth_train, truth_test)
215 |     calculate_score("SPlot", splot_prediction_train, splot_prediction_test, truth_train, truth_test)
216 |     calculate_score("APlot", aplot_prediction_train, aplot_prediction_test, truth_train, truth_test)
217 |     calculate_score("Sideband", side_prediction_train, side_prediction_test, truth_train, truth_test)
218 |     calculate_score("Prior", prior_prediction_train, prior_prediction_test, truth_train, truth_test)
219 |     calculate_score("OrdinaryPrior", ordinary_prior_prediction_train, ordinary_prior_prediction_test, truth_train, truth_test)
220 |     calculate_score("SPlotPrior", splot_prior_prediction_train, splot_prior_prediction_test, truth_train, truth_test)
221 |     calculate_score("APlotPrior", aplot_prior_prediction_train, aplot_prior_prediction_test, truth_train, truth_test)
222 |     calculate_score("SidePrior", side_prior_prediction_train, side_prior_prediction_test, truth_train, truth_test)
223 |     plt.xlabel('False Positive Rate (Type I Error)')
224 |     plt.ylabel('True Positive Rate (Efficiency)')
225 |     plt.ylim((0.5, 1.0))
226 |     plt.xlim((0.0, 0.5))
227 |     plt.legend(loc='lower right')
228 |     figure = plt.gcf()  # get current figure
229 |     figure.set_size_inches(24, 16)
230 |     plt.savefig('splot_sideband_roc.png')
231 |     plt.clf()
232 | 
--------------------------------------------------------------------------------
/src/test_FastBDT_IO.cxx:
--------------------------------------------------------------------------------
1 | /**
2 |  * Thomas Keck 2014
3 |  */
4 | 
5 | #include "FastBDT.h"
6 | #include "FastBDT_IO.h"
7 | 
8 | #include <gtest/gtest.h>
9 | 
10 | #include <sstream>
11 | #include <limits>
12 | 
13 | using namespace FastBDT;
14 | 
15 | 
16 | class IOTest : public ::testing::Test {
17 |     protected:
18 |         virtual void SetUp() {}
19 |         virtual void TearDown() {}
20 | 
21 | };
22 | 
23 | 
24 | template<typename T>
25 | ::testing::AssertionResult CmpHelperFloatingPointEQNanSafe(const char* expected_expression, const char* actual_expression, T expected, T actual) {
26 |     if(std::isnan(expected) and std::isnan(actual)) {
27 |         return ::testing::AssertionSuccess();
28 |     }
29 |     return ::testing::internal::CmpHelperFloatingPointEQ<T>(expected_expression, actual_expression, expected, actual);
30 | }
31 | 
32 | 
33 | #define EXPECT_FLOAT_EQ_NAN_SAFE(x, y) EXPECT_PRED_FORMAT2(CmpHelperFloatingPointEQNanSafe<float>, x, y)
34 | #define EXPECT_DOUBLE_EQ_NAN_SAFE(x, y) EXPECT_PRED_FORMAT2(CmpHelperFloatingPointEQNanSafe<double>, x, y)
35 | 
36 | 
37 | TEST_F(IOTest, IOVector) {
38 | 
39 |     std::vector<float> before = {0.0, 1.0, 2.5, 3.2, -1.4, 0.0};
40 | 
41 |     std::stringstream stream;
42 |     stream << before;
43 | 
44 |     std::vector<float> after;
45 |     stream >> after;
46 | 
47 |     EXPECT_EQ(before.size(), after.size());
48 |     for(unsigned int i = 0; i < before.size() and i < after.size(); ++i)
49 |         EXPECT_FLOAT_EQ(before[i], after[i]);
50 | 
51 | }
52 | 
53 | TEST_F(IOTest, IOUsingSpecialValuesFloat) {
54 | 
55 |     std::vector<float> before =
{std::numeric_limits<float>::lowest(),
56 |                                  std::numeric_limits<float>::denorm_min(),
57 |                                  std::numeric_limits<float>::min(),
58 |                                  std::numeric_limits<float>::max(),
59 |                                  std::numeric_limits<float>::infinity(),
60 |                                  -std::numeric_limits<float>::infinity(),
61 |                                  std::numeric_limits<float>::quiet_NaN(),
62 |                                  std::numeric_limits<float>::signaling_NaN(),
63 |                                  0.0};
64 | 
65 |     std::stringstream stream;
66 |     stream << before;
67 | 
68 |     std::vector<float> after;
69 |     stream >> after;
70 | 
71 |     EXPECT_EQ(before.size(), after.size());
72 |     for(unsigned int i = 0; i < before.size() and i < after.size(); ++i)
73 |         EXPECT_FLOAT_EQ_NAN_SAFE(before[i], after[i]);
74 | 
75 | }
76 | 
77 | TEST_F(IOTest, IOUsingSpecialValuesDouble) {
78 | 
79 |     std::vector<double> before = {std::numeric_limits<double>::lowest(),
80 |                                   std::numeric_limits<double>::denorm_min(),
81 |                                   std::numeric_limits<double>::min(),
82 |                                   std::numeric_limits<double>::max(),
83 |                                   std::numeric_limits<double>::infinity(),
84 |                                   -std::numeric_limits<double>::infinity(),
85 |                                   std::numeric_limits<double>::quiet_NaN(),
86 |                                   std::numeric_limits<double>::signaling_NaN(),
87 |                                   0.0};
88 | 
89 |     std::stringstream stream;
90 |     stream << before;
91 | 
92 |     std::vector<double> after;
93 |     stream >> after;
94 | 
95 |     EXPECT_EQ(before.size(), after.size());
96 |     for(unsigned int i = 0; i < before.size() and i < after.size(); ++i)
97 |         EXPECT_DOUBLE_EQ_NAN_SAFE(before[i], after[i]);
98 | 
99 | }
100 | 
101 | TEST_F(IOTest, IOFeatureBinning) {
102 | 
103 |     std::vector<float> binning = { 1.0f, 7.0f, 4.0f, 10.0f, 12.0f };
104 |     FeatureBinning<float> before(2, binning);
105 |     const auto &before_binning = before.GetBinning();
106 | 
107 |     std::stringstream stream;
108 |     stream << before;
109 | 
110 |     auto after = readFeatureBinningFromStream<float>(stream);
111 |     const auto &after_binning = after.GetBinning();
112 | 
113 |     EXPECT_EQ(before.GetNLevels(), after.GetNLevels());
114 |     EXPECT_EQ(before_binning.size(), after_binning.size());
115 |     for(unsigned int i = 0; i < before_binning.size() and i < after_binning.size(); ++i)
116 |         EXPECT_FLOAT_EQ_NAN_SAFE(before_binning[i], after_binning[i]);
117 | 
118 | }
119 | 
120 | TEST_F(IOTest, IOFeatureBinningVector) {
121 | 
122 |     std::vector<float> binning1 = { 1.0f, 7.0f, 4.0f, 10.0f, 12.0f };
123 |     std::vector<float> binning2 = { 6.0f, 7.0f, 2.0f, 12.0f, 12.0f };
124 |     std::vector<FeatureBinning<float>> before = {FeatureBinning<float>(2, binning1),
125 |                                                  FeatureBinning<float>(2, binning2)};
126 | 
127 |     std::stringstream stream;
128 |     stream << before;
129 | 
130 |     std::vector<FeatureBinning<float>> after;
131 |     stream >> after;
132 | 
133 |     EXPECT_EQ(before.size(), after.size());
134 |     for(unsigned int j = 0; j < before.size() and j < after.size(); ++j) {
135 | 
136 |         auto &before_featureBinning = before[j];
137 |         auto &after_featureBinning = after[j];
138 |         const auto &after_binning = after_featureBinning.GetBinning();
139 |         const auto &before_binning = before_featureBinning.GetBinning();
140 | 
141 |         EXPECT_EQ(before_featureBinning.GetNLevels(), after_featureBinning.GetNLevels());
142 |         EXPECT_EQ(before_binning.size(), after_binning.size());
143 |         for(unsigned int i = 0; i < before_binning.size() and i < after_binning.size(); ++i)
144 |             EXPECT_FLOAT_EQ_NAN_SAFE(before_binning[i], after_binning[i]);
145 | 
146 |     }
147 | 
148 | }
149 | 
150 | TEST_F(IOTest, IOCut) {
151 | 
152 |     Cut<unsigned int> before;
153 |     before.feature = 1;
154 |     before.gain = 3.4;
155 |     before.index = 5;
156 |     before.valid = true;
157 | 
158 |     std::stringstream stream;
159 |     stream << before;
160 | 
161 |     Cut<unsigned int> after;
162 |     stream >> after;
163 | 
164 |     EXPECT_EQ(before.feature, after.feature);
165 |     EXPECT_EQ(before.gain, after.gain);
166 |     EXPECT_EQ(before.index, after.index);
167 |     EXPECT_EQ(before.valid,
after.valid);
168 | 
169 | }
170 | 
171 | TEST_F(IOTest, IOCutSpecialValuesFloat) {
172 | 
173 |     std::vector<float> values = {std::numeric_limits<float>::lowest(),
174 |                                  std::numeric_limits<float>::denorm_min(),
175 |                                  std::numeric_limits<float>::min(),
176 |                                  std::numeric_limits<float>::max(),
177 |                                  std::numeric_limits<float>::infinity(),
178 |                                  -std::numeric_limits<float>::infinity(),
179 |                                  std::numeric_limits<float>::quiet_NaN(),
180 |                                  std::numeric_limits<float>::signaling_NaN(),
181 |                                  0.0};
182 | 
183 |     for(auto &f : values) {
184 |         Cut<float> before;
185 |         before.feature = 1;
186 |         before.gain = 3.4;
187 |         before.index = f;
188 |         before.valid = true;
189 | 
190 |         std::stringstream stream;
191 |         stream << before;
192 | 
193 |         Cut<float> after;
194 |         stream >> after;
195 | 
196 |         EXPECT_EQ(before.feature, after.feature);
197 |         EXPECT_FLOAT_EQ(before.gain, after.gain);
198 |         EXPECT_FLOAT_EQ_NAN_SAFE(before.index, after.index);
199 |         EXPECT_EQ(before.valid, after.valid);
200 |     }
201 | 
202 | }
203 | 
204 | TEST_F(IOTest, IOCutSpecialValuesDouble) {
205 | 
206 |     std::vector<double> values = {std::numeric_limits<double>::lowest(),
207 |                                   std::numeric_limits<double>::denorm_min(),
208 |                                   std::numeric_limits<double>::min(),
209 |                                   std::numeric_limits<double>::max(),
210 |                                   std::numeric_limits<double>::infinity(),
211 |                                   -std::numeric_limits<double>::infinity(),
212 |                                   std::numeric_limits<double>::quiet_NaN(),
213 |                                   std::numeric_limits<double>::signaling_NaN(),
214 |                                   0.0};
215 | 
216 |     for(auto &f : values) {
217 |         Cut<double> before;
218 |         before.feature = 1;
219 |         before.gain = 3.4;
220 |         before.index = f;
221 |         before.valid = true;
222 | 
223 |         std::stringstream stream;
224 |         stream << before;
225 | 
226 |         Cut<double> after;
227 |         stream >> after;
228 | 
229 |         EXPECT_EQ(before.feature, after.feature);
230 |         EXPECT_FLOAT_EQ(before.gain, after.gain);
231 |         EXPECT_DOUBLE_EQ_NAN_SAFE(before.index, after.index);
232 |         EXPECT_EQ(before.valid, after.valid);
233 |     }
234 | 
235 | }
236 | 
237 | TEST_F(IOTest, IOTree) {
238 | 
239 |     Cut<unsigned int> cut1, cut2, cut3;
240 |     cut1.feature = 0;
241 |     cut1.index = 5;
242 |     cut1.valid = true;
243 |     cut1.gain = -3.0;
244 |     cut2.feature = 1;
245 |     cut2.index = 9;
246 |     cut2.gain = 1.0;
247 |     cut2.valid = true;
248 |     cut3.feature = 0;
249 |     cut3.index = 1;
250 |     cut3.gain = 0.0;
251 |     cut3.valid = false;
252 | 
253 |     std::vector<Cut<unsigned int>> before_cuts = {cut1, cut2, cut3};
254 |     std::vector<Weight> before_nEntries = { 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0 };
255 |     std::vector<Weight> before_purities = { 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7 };
256 |     std::vector<Weight> before_boostWeights = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0};
257 |     Tree<unsigned int> before(before_cuts, before_nEntries, before_purities, before_boostWeights);
258 | 
259 |     std::stringstream stream;
260 |     stream << before;
261 | 
262 |     auto after = readTreeFromStream<unsigned int>(stream);
263 |     const auto &after_cuts = after.GetCuts();
264 |     const auto &after_purities = after.GetPurities();
265 |     const auto &after_boostWeights = after.GetBoostWeights();
266 |     const auto &after_nEntries = after.GetNEntries();
267 | 
268 |     EXPECT_EQ(before_cuts.size(), after_cuts.size());
269 |     for(unsigned int i = 0; i < before_cuts.size() and i < after_cuts.size(); ++i) {
270 |         EXPECT_FLOAT_EQ(before_cuts[i].feature, after_cuts[i].feature);
271 |         EXPECT_FLOAT_EQ(before_cuts[i].valid, after_cuts[i].valid);
272 |         EXPECT_FLOAT_EQ(before_cuts[i].index, after_cuts[i].index);
273 |         EXPECT_FLOAT_EQ(before_cuts[i].gain, after_cuts[i].gain);
274 |     }
275 | 
276 |     EXPECT_EQ(before_purities.size(), after_purities.size());
277 |     for(unsigned int i = 0; i < before_purities.size() and i < after_purities.size(); ++i)
278 |         EXPECT_FLOAT_EQ(before_purities[i], after_purities[i]);
279 | 
280 |     EXPECT_EQ(before_boostWeights.size(), after_boostWeights.size());
281 |     for(unsigned int i = 0; i < before_boostWeights.size() and i < after_boostWeights.size(); ++i)
282 |         EXPECT_FLOAT_EQ(before_boostWeights[i], after_boostWeights[i]);
283 | 
284 |     EXPECT_EQ(before_nEntries.size(), after_nEntries.size());
285 |     for(unsigned int i = 0; i < before_nEntries.size() and i < after_nEntries.size(); ++i)
286 |         EXPECT_FLOAT_EQ(before_nEntries[i], after_nEntries[i]);
287 | 
288 | }
289 | 
290 | TEST_F(IOTest, IOForest) {
291 | 
292 |     Cut<unsigned int> cut1, cut2, cut3, cut4;
293 |     cut1.feature = 0;
294 |     cut1.index = 5;
295 |     cut1.valid = true;
296 |     cut1.gain = -3.0;
297 |     cut2.feature = 1;
298 |     cut2.index = 9;
299 |     cut2.gain = 1.0;
300 |     cut2.valid = true;
301 |     cut3.feature = 0;
302 |     cut3.index = 1;
303 |     cut3.gain = 0.0;
304 |     cut3.valid = false;
305 |     cut4.feature = 2;
306 |     cut4.index = 3;
307 |     cut4.valid = true;
308 |     cut4.gain = 1.61;
309 | 
310 |     std::vector<Weight> nEntries = { 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0 };
311 | 
312 |     Forest<unsigned int> before(0.5, 1.6, true);
313 |     before.AddTree(Tree<unsigned int>({cut1, cut2, cut3}, nEntries, { 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7 }, { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}));
314 |     before.AddTree(Tree<unsigned int>({cut1, cut4, cut3}, nEntries, { 0.6, 0.2, 0.5, 0.4, 0.5, 0.6, 0.7 }, { 2.0, 2.0, 3.0, 5.0, 5.0, 6.0, 1.0}));
315 |     const auto &before_forest = before.GetForest();
316 | 
317 |     std::stringstream stream;
318 |     stream << before;
319 | 
320 |     auto after = readForestFromStream<unsigned int>(stream);
321 |     const auto &after_forest = after.GetForest();
322 | 
323 |     EXPECT_EQ(before.GetTransform2Probability(), after.GetTransform2Probability());
324 |     EXPECT_EQ(before.GetF0(), after.GetF0());
325 |     EXPECT_EQ(before.GetShrinkage(), after.GetShrinkage());
326 | 
327 |     EXPECT_EQ(before_forest.size(), after_forest.size());
328 |     for(unsigned int j = 0; j < before_forest.size() and j < after_forest.size(); ++j) {
329 | 
330 |         auto &before_tree = before_forest[j];
331 |         const auto &before_cuts = before_tree.GetCuts();
332 |         const auto &before_purities = before_tree.GetPurities();
333 |         const auto &before_boostWeights = before_tree.GetBoostWeights();
334 |         const auto &before_nEntries = before_tree.GetNEntries();
335 | 
336 |         auto &after_tree = after_forest[j];
337 |         const auto &after_cuts = after_tree.GetCuts();
338 |         const auto &after_purities = after_tree.GetPurities();
339 |         const auto &after_boostWeights = after_tree.GetBoostWeights();
340 |         const auto &after_nEntries = after_tree.GetNEntries();
341 | 
342 |         EXPECT_EQ(before_cuts.size(), after_cuts.size());
343 |         for(unsigned int i = 0; i < before_cuts.size() and i < after_cuts.size(); ++i) {
344 |             EXPECT_FLOAT_EQ(before_cuts[i].feature, after_cuts[i].feature);
345 |             EXPECT_FLOAT_EQ(before_cuts[i].valid, after_cuts[i].valid);
346 |             EXPECT_FLOAT_EQ(before_cuts[i].index, after_cuts[i].index);
347 |             EXPECT_FLOAT_EQ(before_cuts[i].gain, after_cuts[i].gain);
348 |         }
349 | 
350 |         EXPECT_EQ(before_purities.size(), after_purities.size());
351 |         for(unsigned int i = 0; i < before_purities.size() and i < after_purities.size(); ++i)
352 |             EXPECT_FLOAT_EQ(before_purities[i], after_purities[i]);
353 | 
354 |         EXPECT_EQ(before_boostWeights.size(), after_boostWeights.size());
355 |         for(unsigned int i = 0; i < before_boostWeights.size() and i < after_boostWeights.size(); ++i)
356 |             EXPECT_FLOAT_EQ(before_boostWeights[i], after_boostWeights[i]);
357 | 
358 |         EXPECT_EQ(before_nEntries.size(), after_nEntries.size());
359 |         for(unsigned int i = 0; i < before_nEntries.size() and i <
after_nEntries.size(); ++i)
360 |             EXPECT_FLOAT_EQ(before_nEntries[i], after_nEntries[i]);
361 |     }
362 | }
363 | 
--------------------------------------------------------------------------------
/src/FastBDT.cxx:
--------------------------------------------------------------------------------
1 | /**
2 |  * Thomas Keck 2014
3 |  */
4 | 
5 | #include "FastBDT.h"
6 | #include "FastBDT_IO.h"
7 | 
8 | #include <algorithm>
9 | #include <iostream>
10 | 
11 | namespace FastBDT {
12 | 
13 |   std::vector<Weight> EventWeights::GetSums(unsigned int nSignals) const {
14 | 
15 |     // Vectorizing FTW!
16 |     std::vector<Weight> sums(3, 0);
17 |     for(unsigned int i = 0; i < nSignals; ++i) {
18 |       sums[0] += boost_weights[i] * original_weights[i];
19 |       sums[2] += boost_weights[i]*boost_weights[i] * original_weights[i];
20 |     }
21 | 
22 |     for(unsigned int i = nSignals; i < original_weights.size(); ++i) {
23 |       sums[1] += boost_weights[i] * original_weights[i];
24 |       sums[2] += boost_weights[i]*boost_weights[i] * original_weights[i];
25 |     }
26 |     return sums;
27 | 
28 |   }
29 | 
30 |   EventValues::EventValues(unsigned int nEvents, unsigned int nFeatures, unsigned int nSpectators, const std::vector<unsigned int> &nLevels) : values(nEvents*(nFeatures+nSpectators), 0), nFeatures(nFeatures), nSpectators(nSpectators) {
31 | 
32 |     if(nFeatures + nSpectators != nLevels.size()) {
33 |       throw std::runtime_error("Number of features must be the same as the number of provided binning levels! " + std::to_string(nFeatures) + " + " + std::to_string(nSpectators) + " vs " + std::to_string(nLevels.size()));
34 |     }
35 | 
36 |     nBins.reserve(nLevels.size());
37 |     for(auto& nLevel : nLevels)
38 |       nBins.push_back((1 << nLevel)+1);
39 | 
40 |     nBinSums.reserve(nLevels.size()+1);
41 |     nBinSums.push_back(0);
42 |     for(auto &nBin : nBins)
43 |       nBinSums.push_back(nBinSums.back() + nBin);
44 | 
45 |   }
46 | 
47 |   void EventValues::Set(unsigned int iEvent, const std::vector<unsigned int> &features) {
48 | 
49 |     // Check if the feature vector has the correct size
50 |     if(features.size() != nFeatures + nSpectators) {
51 |       throw std::runtime_error(std::string("Promised number of features are not provided. ") + std::to_string(features.size()) + " vs " + std::to_string(nFeatures) + " + " + std::to_string(nSpectators));
52 |     }
53 | 
54 |     // Check if the feature values are in the correct range
55 |     for(unsigned int iFeature = 0; iFeature < nFeatures+nSpectators; ++iFeature) {
56 |       if( features[iFeature] > nBins[iFeature] )
57 |         throw std::runtime_error(std::string("Promised number of bins is violated. ") + std::to_string(features[iFeature]) + " vs " + std::to_string(nBins[iFeature]));
58 |     }
59 | 
60 |     // Now add the new values to the values vector.
61 |     for(unsigned int iFeature = 0; iFeature < nFeatures+nSpectators; ++iFeature) {
62 |       values[iEvent*(nFeatures+nSpectators) + iFeature] = features[iFeature];
63 |     }
64 | 
65 |   }
66 | 
67 |   void EventSample::AddEvent(const std::vector<unsigned int> &features, Weight weight, bool isSignal) {
68 | 
69 |     // First check if we have enough space for an additional event, as the number of
70 |     // events is fixed in the constructor (to avoid time-consuming reallocations).
71 |     if(nSignals + nBckgrds == nEvents) {
72 |       throw std::runtime_error(std::string("Promised maximum number of events exceeded. ") + std::to_string(nSignals) + " + " + std::to_string(nBckgrds) + " vs " + std::to_string(nEvents) );
73 |     }
74 | 
75 |     if(std::isnan(weight)) {
76 |       throw std::runtime_error("NAN values as weights are not supported!");
77 |     }
78 | 
79 |     // Now add the weight and the features at the right position of the arrays.
80 |     // To do so, we calculate the correct index of this event. If it's a signal
81 |     // event we store it right after the last signal event, starting at the 0 position.
82 |     // If it's a background event, we store it right before the last added background event,
83 |     // starting at the nEvents-1 position. We also update the weight sums and amount counts.
84 |     unsigned int index = 0;
85 |     if( isSignal ) {
86 |       index = nSignals;
87 |       ++nSignals;
88 |     } else {
89 |       index = nEvents - 1 - nBckgrds;
90 |       ++nBckgrds;
91 |     }
92 |     weights.SetOriginalWeight(index, weight);
93 |     values.Set(index, features);
94 | 
95 |   }
96 | 
97 |   Weight LossFunction(const Weight &nSignal, const Weight &nBckgrd) {
98 |     // Gini-Index x total number of events (needed to calculate information gain efficiently)!
99 |     if( nSignal <= 0 or nBckgrd <= 0 )
100 |       return 0;
101 |     return (nSignal*nBckgrd)/(nSignal+nBckgrd);
102 |     //return (nSignal*nBckgrd)/((nSignal+nBckgrd)*(nSignal+nBckgrd));
103 |   }
104 | 
105 |   CumulativeDistributions::CumulativeDistributions(const unsigned int iLayer, const EventSample &sample) {
106 | 
107 |     const auto &values = sample.GetValues();
108 |     nFeatures = values.GetNFeatures();
109 |     nNodes = (1 << iLayer);
110 |     nBins = values.GetNBins();
111 |     nBinSums = values.GetNBinSums();
112 | 
113 |     signalCDFs = CalculateCDFs(sample, 0, sample.GetNSignals());
114 |     bckgrdCDFs = CalculateCDFs(sample, sample.GetNSignals(), sample.GetNEvents());
115 | 
116 |   }
117 | 
118 |   std::vector<Weight> CumulativeDistributions::CalculateCDFs(const EventSample &sample, const unsigned int firstEvent, const unsigned int lastEvent) const {
119 | 
120 |     const auto &values = sample.GetValues();
121 |     const auto &flags = sample.GetFlags();
122 |     const auto &weights = sample.GetWeights();
123 | 
124 |     std::vector<Weight> bins( nNodes*nBinSums[nFeatures] );
125 | 
126 |     // Fill Cut-PDFs for all nodes in this layer and for every feature
127 |     for(unsigned int iEvent = firstEvent; iEvent < lastEvent; ++iEvent) {
128 |       if( flags.Get(iEvent) < static_cast<int>(nNodes) )
129 |         continue;
130 |       const unsigned int index = (flags.Get(iEvent)-nNodes)*nBinSums[nFeatures];
131 |       for(unsigned int iFeature = 0; iFeature < nFeatures; ++iFeature ) {
132 |         const unsigned int subindex = nBinSums[iFeature] + values.Get(iEvent,iFeature);
133 |         bins[index+subindex] += weights.GetOriginalWeight(iEvent) * (weights.GetBoostWeight(iEvent) + weights.GetFlatnessWeight(iEvent));
134 |       }
135 |     }
136 | 
137 |     // Sum up Cut-PDFs to cumulative Cut-PDFs
138 |     for(unsigned int iNode = 0; iNode < nNodes; ++iNode) {
139 |       for(unsigned int iFeature = 0; iFeature < nFeatures; ++iFeature) {
140 |         // Start at 2, this ignores the NaN bin at 0!
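        // Layout reminder: bins[] is one flat array, and the value for
        // (node, feature, bin) lives at iNode*nBinSums[nFeatures] + nBinSums[iFeature] + iBin;
        // the in-place prefix sum below turns each per-feature histogram into a CDF.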
141 |         for(unsigned int iBin = 2; iBin < nBins[iFeature]; ++iBin) {
142 |           unsigned int index = iNode*nBinSums[nFeatures] + nBinSums[iFeature] + iBin;
143 |           bins[index] += bins[index-1];
144 |         }
145 |       }
146 |     }
147 | 
148 |     return bins;
149 |   }
150 | 
151 |   Cut<unsigned int> Node::CalculateBestCut(const CumulativeDistributions &CDFs) const {
152 | 
153 |     Cut<unsigned int> cut;
154 | 
155 |     const unsigned int nFeatures = CDFs.GetNFeatures();
156 |     const auto& nBins = CDFs.GetNBins();
157 | 
158 |     Weight currentLoss = LossFunction(signal, bckgrd);
159 |     if( currentLoss == 0 )
160 |       return cut;
161 | 
162 |     // Loop over all features and cuts and sum up signal and background histograms to cumulative histograms
163 |     for(unsigned int iFeature = 0; iFeature < nFeatures; ++iFeature) {
164 |       // Start at 2, this ignores the NaN bin at 0
165 |       for(unsigned int iCut = 2; iCut < nBins[iFeature]; ++iCut) {
166 |         Weight s = CDFs.GetSignal(iNode, iFeature, iCut-1);
167 |         Weight b = CDFs.GetBckgrd(iNode, iFeature, iCut-1);
168 |         Weight currentGain = currentLoss - LossFunction( signal-s, bckgrd-b ) - LossFunction( s, b );
169 | 
170 |         if( cut.gain <= currentGain ) {
171 |           cut.gain = currentGain;
172 |           cut.feature = iFeature;
173 |           cut.index = iCut;
174 |           cut.valid = true;
175 |         }
176 |       }
177 |     }
178 | 
179 |     return cut;
180 | 
181 |   }
182 | 
183 |   void Node::AddSignalWeight(Weight weight, Weight original_weight) {
184 |     if(original_weight == 0)
185 |       return;
186 |     signal += weight * original_weight;
187 |     square += weight*weight * original_weight;
188 |   }
189 | 
190 | 
191 |   void Node::AddBckgrdWeight(Weight weight, Weight original_weight) {
192 |     if(original_weight == 0)
193 |       return;
194 |     bckgrd += weight * original_weight;
195 |     square += weight*weight * original_weight;
196 |   }
197 | 
198 |   void Node::SetWeights(std::vector<Weight> weights) {
199 |     signal = weights[0];
200 |     bckgrd = weights[1];
201 |     square = weights[2];
202 |   }
203 | 
204 |   Weight Node::GetBoostWeight() const {
205 | 
206 |     Weight denominator = (2*(signal+bckgrd)-square);
207 |     if( denominator == 0 ) {
208 |       if(signal == bckgrd)
209 |         return 0;
210 |       if(signal > bckgrd)
211 |         return 999.0;
212 |       else
213 |         return -999.0;
214 |     }
215 |     Weight value = (signal - bckgrd)/denominator;
216 |     if( value > 999.0 or value < -999.0 ) {
217 |       if(signal > bckgrd)
218 |         return 999.0;
219 |       else
220 |         return -999.0;
221 |     }
222 |     return value;
223 | 
224 |   }
225 | 
226 |   void Node::Print() const {
227 |     std::cout << "Node: " << iNode << std::endl;
228 |     std::cout << "Layer: " << iLayer << std::endl;
229 |     std::cout << "Signal: " << signal << std::endl;
230 |     std::cout << "Bckgrd: " << bckgrd << std::endl;
231 |     std::cout << "Square: " << square << std::endl;
232 |   }
233 | 
234 |   /**
235 |    * In bin-space NaN is marked by bin 0
236 |    */
237 |   template<>
238 |   bool is_nan(const unsigned int &value) {
239 |     return value == 0;
240 |   }
241 | 
242 | 
243 |   TreeBuilder::TreeBuilder(unsigned int nLayers, EventSample &sample) : nLayers(nLayers) {
244 | 
245 |     const unsigned int nNodes = 1 << nLayers;
246 |     cuts.resize(nNodes - 1);
247 | 
248 |     for(unsigned int iLayer = 0; iLayer <= nLayers; ++iLayer) {
249 |       for(unsigned int iNode = 0; iNode < static_cast<unsigned int>(1 << iLayer); ++iNode) {
250 |         nodes.push_back( Node(iLayer, iNode) );
251 |       }
252 |     }
253 | 
254 |     // The flag of every event is used for two things:
255 |     // Firstly, a flag > 0 determines the node which holds this event at the moment;
256 |     // the trees are enumerated from top to bottom, from left to right, starting at 1.
257 |     // Secondly, a flag <= 0 disables this event, so it isn't used.
258 |     //   flag == 0 means disabled by stochastic bagging
259 |     //   flag < 0 means disabled due to a missing value, where -flag is the node the event belongs to
260 |     // Initially all events which are not disabled get the flag 1.
261 |     //
262 |     // All the flags of the enabled events are set to 1 by the DecisionForest
263 |     // prepareEventSample method. So there's no need to do this here again.
264 | 
265 |     // The number of signal and bckgrd events at the root node is given by the total
266 |     // number of signal and background events in the sample.
267 |     const auto sums = sample.GetWeights().GetSums(sample.GetNSignals());
268 |     nodes[0].SetWeights(sums);
269 | 
270 |     // The training of the tree is done level by level. So we iterate over the levels of the tree
271 |     // and create histograms for signal and background events for different cuts, nodes and features.
272 |     for(unsigned int iLayer = 0; iLayer < nLayers; ++iLayer) {
273 | 
274 |       CumulativeDistributions CDFs(iLayer, sample);
275 |       UpdateCuts(CDFs, iLayer);
276 |       UpdateFlags(sample);
277 |       UpdateEvents(sample, iLayer);
278 | 
279 |     }
280 | 
281 |   }
282 | 
283 |   void TreeBuilder::UpdateCuts(const CumulativeDistributions &CDFs, unsigned int iLayer) {
284 | 
285 |     for(auto &node : nodes) {
286 |       if( node.IsInLayer(iLayer) ) {
287 |         cuts[ node.GetPosition() ] = node.CalculateBestCut(CDFs);
288 |       }
289 |     }
290 |   }
291 | 
292 |   void TreeBuilder::UpdateFlags(EventSample &sample) {
293 | 
294 |     auto &flags = sample.GetFlags();
295 |     const auto &values = sample.GetValues();
296 |     // Iterate over all events and update their flags according to the cuts of the next level.
297 |     for(unsigned int iEvent = 0; iEvent < sample.GetNEvents(); ++iEvent) {
298 | 
299 |       const int flag = flags.Get(iEvent);
300 |       if( flag <= 0)
301 |         continue;
302 |       auto &cut = cuts[flag-1];
303 |       if( not cut.valid )
304 |         continue;
305 | 
306 |       const unsigned int index = values.Get(iEvent, cut.feature );
307 |       // If the value is NaN we throw out the event, but remember its current node using a negative flag!
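      // Example: an event sitting at node 5 whose cut feature is NaN gets flag -5;
      // it stops moving down the tree, but std::abs(flag) still recovers the node it stopped at.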
308 |       if( index == 0 ) {
309 |         flags.Set(iEvent, -flag);
310 |       } else if( index < cut.index ) {
311 |         flags.Set(iEvent, flag * 2);
312 |       } else {
313 |         flags.Set(iEvent, flag * 2 + 1);
314 |       }
315 |     }
316 |   }
317 | 
318 |   void TreeBuilder::UpdateEvents(const EventSample &sample, unsigned int iLayer) {
319 | 
320 |     const unsigned int nNodes = (1 << iLayer);
321 |     const auto &weights = sample.GetWeights();
322 |     const auto &flags = sample.GetFlags();
323 | 
324 |     for(unsigned int iEvent = 0; iEvent < sample.GetNSignals(); ++iEvent) {
325 |       const int flag = flags.Get(iEvent);
326 |       if( flag >= static_cast<int>(nNodes) ) {
327 |         nodes[flag-1].AddSignalWeight( weights.GetBoostWeight(iEvent), weights.GetOriginalWeight(iEvent) );
328 |       }
329 |     }
330 |     for(unsigned int iEvent = sample.GetNSignals(); iEvent < sample.GetNEvents(); ++iEvent) {
331 |       const int flag = flags.Get(iEvent);
332 |       if( flag >= static_cast<int>(nNodes) ) {
333 |         nodes[flag-1].AddBckgrdWeight( weights.GetBoostWeight(iEvent), weights.GetOriginalWeight(iEvent) );
334 |       }
335 |     }
336 | 
337 |   }
338 | 
339 | 
340 |   void TreeBuilder::Print() const {
341 | 
342 |     std::cout << "Start Printing Tree" << std::endl;
343 | 
344 |     for(auto &node : nodes) {
345 |       node.Print();
346 |       std::cout << std::endl;
347 |     }
348 | 
349 |     for(auto &cut : cuts) {
350 |       std::cout << "Index: " << cut.index << std::endl;
351 |       std::cout << "Feature: " << cut.feature << std::endl;
352 |       std::cout << "Gain: " << cut.gain << std::endl;
353 |       std::cout << "Valid: " << cut.valid << std::endl;
354 |       std::cout << std::endl;
355 |     }
356 | 
357 |     std::cout << "Finished Printing Tree" << std::endl;
358 |   }
359 | 
360 |   ForestBuilder::ForestBuilder(EventSample &sample, unsigned int nTrees, double shrinkage, double randRatio, unsigned int nLayersPerTree, bool sPlot, double flatnessLoss) : shrinkage(shrinkage), flatnessLoss(flatnessLoss) {
361 | 
362 |     auto &weights = sample.GetWeights();
363 |     sums = weights.GetSums(sample.GetNSignals());
364 |     // Calculating the initial F value from the proportion of signal and background events in the sample
365 |     double average = (sums[0] - sums[1])/(sums[0] + sums[1]);
366 |     F0 = 0.5*std::log((1+average)/(1-average));
367 | 
368 |     // Apply F0 to the original_weights because F0 is not a boost_weight; otherwise the prior probability
369 |     // for events with missing values would be wrong.
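    // With S = sums[0] and B = sums[1] this is just F0 = 0.5*log(S/B), since
    // (1+average)/(1-average) = (2S/(S+B)) / (2B/(S+B)) = S/B. The factors below
    // (2B/(S+B) for signal, 2S/(S+B) for background) equalise the effective class weights.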
370 |     if (F0 != 0.0) {
371 |       const unsigned int nEvents = sample.GetNEvents();
372 |       const unsigned int nSignals = sample.GetNSignals();
373 |       for(unsigned int iEvent = 0; iEvent < nSignals; ++iEvent)
374 |         weights.SetOriginalWeight(iEvent, 2.0 * sums[1] / (sums[0] + sums[1]) * weights.GetOriginalWeight(iEvent));
375 |       for(unsigned int iEvent = nSignals; iEvent < nEvents; ++iEvent)
376 |         weights.SetOriginalWeight(iEvent, 2.0 * sums[0] / (sums[0] + sums[1]) * weights.GetOriginalWeight(iEvent));
377 |     }
378 | 
379 |     // Resize the FCache to the number of events, and initialise it with the initial value 0.0
380 |     // (not F0, because F0 is already absorbed into the original_weights)
381 |     FCache.resize(sample.GetNEvents(), 0.0);
382 | 
383 |     // Reserve enough space for the boost_weights and trees, to avoid reallocations
384 |     forest.reserve(nTrees);
385 | 
386 |     // Reserve enough space for binned uniform spectators
387 |     if(flatnessLoss > 0) {
388 |       const auto &values = sample.GetValues();
389 |       auto nFeatures = values.GetNFeatures();
390 |       auto nSpectators = values.GetNSpectators();
391 |       auto &nBins = values.GetNBins();
392 |       const unsigned int nEvents = sample.GetNEvents();
393 |       const unsigned int nSignals = sample.GetNSignals();
394 | 
395 |       signal_event_index_sorted_by_F.resize(nSignals);
396 |       bckgrd_event_index_sorted_by_F.resize(nEvents - nSignals);
397 | 
398 |       uniform_bin_weight_signal.resize(nSpectators);
399 |       uniform_bin_weight_bckgrd.resize(nSpectators);
400 |       weight_below_current_F_per_uniform_bin.resize(nSpectators);
401 |       for(unsigned int iSpectator = 0; iSpectator < nSpectators; ++iSpectator) {
402 |         uniform_bin_weight_signal[iSpectator].resize(nBins[nFeatures + iSpectator], 0.0);
403 |         uniform_bin_weight_bckgrd[iSpectator].resize(nBins[nFeatures + iSpectator], 0.0);
404 |         weight_below_current_F_per_uniform_bin[iSpectator].resize(nBins[nFeatures + iSpectator], 0.0);
405 |       }
406 | 
407 |       for(unsigned int iEvent = 0; iEvent < nEvents; ++iEvent) {
408 |         for(unsigned int iSpectator = 0; iSpectator < nSpectators; ++iSpectator) {
409 |           const uint64_t uniformBin = values.GetSpectator(iEvent, iSpectator);
410 |           if (iEvent < nSignals)
411 |             uniform_bin_weight_signal[iSpectator][uniformBin] += weights.GetOriginalWeight(iEvent);
412 |           else
413 |             uniform_bin_weight_bckgrd[iSpectator][uniformBin] += weights.GetOriginalWeight(iEvent);
414 |         }
415 |       }
416 |       for(unsigned int iSpectator = 0; iSpectator < nSpectators; ++iSpectator) {
417 |         for(uint64_t iUniformBin = 0; iUniformBin < uniform_bin_weight_signal[iSpectator].size(); ++iUniformBin) {
418 |           uniform_bin_weight_signal[iSpectator][iUniformBin] /= sums[0];
419 |         }
420 |         for(uint64_t iUniformBin = 0; iUniformBin < uniform_bin_weight_bckgrd[iSpectator].size(); ++iUniformBin) {
421 |           uniform_bin_weight_bckgrd[iSpectator][iUniformBin] /= sums[1];
422 |         }
423 |       }
424 |     }
425 | 
426 |     // Now train the nTrees trees!
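    // Each iteration: recompute the per-event boost weights from the cached F values,
    // optionally add the flatness penalty, redraw the stochastic subsample, and fit
    // one more depth-limited tree on the reweighted sample.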
427 |     for(unsigned int iTree = 0; iTree < nTrees; ++iTree) {
428 | 
429 |       // Update the event weights according to their F value
430 |       updateEventWeights(sample);
431 | 
432 |       // Add flatness loss terms
433 |       if(flatnessLoss > 0 and iTree > 0)
434 |         updateEventWeightsWithFlatnessPenalty(sample);
435 | 
436 |       // Prepare the flags of the events
437 |       prepareEventSample( sample, randRatio, sPlot );
438 | 
439 |       // Create and train a new tree on the sample
440 |       TreeBuilder builder(nLayersPerTree, sample);
441 |       if(builder.IsValid()) {
442 |         forest.push_back( Tree<unsigned int>( builder.GetCuts(), builder.GetNEntries(), builder.GetPurities(), builder.GetBoostWeights() ) );
443 |       } else {
444 |         std::cerr << "Terminated boosting at tree " << iTree << " out of " << nTrees << std::endl;
445 |         std::cerr << "Because the last tree was not valid, meaning it couldn't find an optimal cut." << std::endl;
446 |         std::cerr << "This can happen if you do a large number of boosting steps." << std::endl;
447 |         break;
448 |       }
449 |     }
450 | 
451 |   }
452 | 
453 |   void ForestBuilder::prepareEventSample(EventSample &sample, double randRatio, bool sPlot) {
454 | 
455 |     // Draw a random sample if stochastic gradient boost is used:
456 |     // draw a random number in [0,1) and compare it to the given ratio. If bigger, disable this event by flagging it with 0.
457 |     // If smaller, set the flag to 1. This is important! If the flags are != 1, the DecisionTree algorithm will fail.
458 |     const unsigned int nEvents = sample.GetNEvents();
459 |     auto &flags = sample.GetFlags();
460 |     if( randRatio < 1.0 and sPlot) {
461 |       // For an sPlot training it is important to always take signal and background pairs together into the training!
462 |       for(unsigned int iEvent = 0; iEvent < nEvents / 2 + 1; ++iEvent) {
463 |         int use = (static_cast<double>(rand())/static_cast<double>(RAND_MAX) > randRatio ) ? 0 : 1;
464 |         flags.Set(iEvent, use);
465 |         unsigned int jEvent = static_cast<unsigned int>(static_cast<int>(nEvents) - static_cast<int>(iEvent) - 1);
466 |         flags.Set(jEvent, use);
467 |       }
468 |     } else if( randRatio < 1.0) {
469 |       for(unsigned int iEvent = 0; iEvent < nEvents; ++iEvent)
470 |         flags.Set(iEvent, ( static_cast<double>(rand())/static_cast<double>(RAND_MAX) > randRatio ) ?
0 : 1 ); 471 | } else { 472 | for(unsigned int iEvent = 0; iEvent < nEvents; ++iEvent) 473 | flags.Set(iEvent, 1); 474 | } 475 | 476 | } 477 | 478 | void ForestBuilder::updateEventWeights(EventSample &eventSample) { 479 | 480 | const unsigned int nEvents = eventSample.GetNEvents(); 481 | const unsigned int nSignals = eventSample.GetNSignals(); 482 | 483 | const auto &flags = eventSample.GetFlags(); 484 | const auto &values = eventSample.GetValues(); 485 | auto &weights = eventSample.GetWeights(); 486 | 487 | // Loop over all events and update FCache 488 | // If the event wasn't disabled, we can use the flag directly to determine the node of this event 489 | // If not we have to calculate the node to which this event belongs 490 | if( forest.size() > 0 ) { 491 | for(unsigned int iEvent = 0; iEvent < nEvents; ++iEvent) { 492 | if( flags.Get(iEvent) != 0) 493 | FCache[iEvent] += shrinkage*forest.back().GetBoostWeight( std::abs(flags.Get(iEvent)) - 1); 494 | else 495 | FCache[iEvent] += shrinkage*forest.back().GetBoostWeight( forest.back().ValueToNode(&values.Get(iEvent)) ); 496 | } 497 | } 498 | 499 | for(unsigned int iEvent = 0; iEvent < nSignals; ++iEvent) 500 | weights.SetBoostWeight(iEvent, 2.0/(1.0+std::exp(2.0*FCache[iEvent]))); 501 | for(unsigned int iEvent = nSignals; iEvent < nEvents; ++iEvent) 502 | weights.SetBoostWeight(iEvent, 2.0/(1.0+std::exp(-2.0*FCache[iEvent]))); 503 | 504 | } 505 | 506 | void ForestBuilder::updateEventWeightsWithFlatnessPenalty(EventSample &eventSample) { 507 | 508 | const unsigned int nEvents = eventSample.GetNEvents(); 509 | const unsigned int nSignals = eventSample.GetNSignals(); 510 | 511 | const auto &values = eventSample.GetValues(); 512 | auto &weights = eventSample.GetWeights(); 513 | 514 | auto nSpectators = values.GetNSpectators(); 515 | 516 | // Sort events in order of increasing F Value 517 | for(unsigned int iEvent = 0; iEvent < nSignals; ++iEvent) { 518 | signal_event_index_sorted_by_F[iEvent] = {FCache[iEvent], iEvent}; 519 | } 520 | for(unsigned int iEvent = 0; iEvent < nEvents-nSignals; ++iEvent) { 521 | bckgrd_event_index_sorted_by_F[iEvent] = {-FCache[iEvent+nSignals], iEvent+nSignals}; 522 | } 523 | 524 | { 525 | auto first = signal_event_index_sorted_by_F.begin(); 526 | auto last = signal_event_index_sorted_by_F.end(); 527 | std::sort(first, last, compareWithIndex); 528 | } 529 | 530 | { 531 | auto first = bckgrd_event_index_sorted_by_F.begin(); 532 | auto last = bckgrd_event_index_sorted_by_F.end(); 533 | std::sort(first, last, compareWithIndex); 534 | } 535 | 536 | double global_weight_below_current_F = 0; 537 | for(unsigned int iIndex = 0; iIndex < signal_event_index_sorted_by_F.size(); ++iIndex) { 538 | unsigned int iEvent = signal_event_index_sorted_by_F[iIndex].index; 539 | 540 | global_weight_below_current_F += weights.GetOriginalWeight(iEvent); 541 | double F = global_weight_below_current_F / sums[0]; 542 | double fw = 0.0; 543 | 544 | for(unsigned int iSpectator = 0; iSpectator < nSpectators; ++iSpectator) { 545 | const uint64_t uniformBin = values.GetSpectator(iEvent, iSpectator); 546 | weight_below_current_F_per_uniform_bin[iSpectator][uniformBin] += weights.GetOriginalWeight(iEvent); 547 | double F_bin = weight_below_current_F_per_uniform_bin[iSpectator][uniformBin] / (uniform_bin_weight_signal[iSpectator][uniformBin] * sums[0]); 548 | 549 | fw += (F_bin - F); 550 | } 551 | fw *= flatnessLoss; 552 | weights.SetFlatnessWeight(iEvent, fw); 553 | 554 | } 555 | 556 | for(unsigned int iSpectator = 0; iSpectator < nSpectators; 
++iSpectator) {
557 |       for(uint64_t iUniformBin = 0; iUniformBin < weight_below_current_F_per_uniform_bin[iSpectator].size(); ++iUniformBin) {
558 |         weight_below_current_F_per_uniform_bin[iSpectator][iUniformBin] = 0.0;
559 |       }
560 |     }
561 | 
562 |     global_weight_below_current_F = 0;
563 | 
564 |     for(unsigned int iIndex = 0; iIndex < bckgrd_event_index_sorted_by_F.size(); ++iIndex) {
565 |       unsigned int iEvent = bckgrd_event_index_sorted_by_F[iIndex].index;
566 | 
567 |       global_weight_below_current_F += weights.GetOriginalWeight(iEvent);
568 |       double F = global_weight_below_current_F / sums[1];
569 |       double fw = 0.0;
570 | 
571 |       for(unsigned int iSpectator = 0; iSpectator < nSpectators; ++iSpectator) {
572 |         const uint64_t uniformBin = values.GetSpectator(iEvent, iSpectator);
573 |         weight_below_current_F_per_uniform_bin[iSpectator][uniformBin] += weights.GetOriginalWeight(iEvent);
574 |         double F_bin = weight_below_current_F_per_uniform_bin[iSpectator][uniformBin] / (uniform_bin_weight_bckgrd[iSpectator][uniformBin] * sums[1]);
575 | 
576 |         fw += (F_bin - F);
577 |       }
578 |       fw *= flatnessLoss;
579 |       weights.SetFlatnessWeight(iEvent, fw);
580 | 
581 |     }
582 | 
583 |     for(unsigned int iSpectator = 0; iSpectator < nSpectators; ++iSpectator) {
584 |       for(uint64_t iUniformBin = 0; iUniformBin < weight_below_current_F_per_uniform_bin[iSpectator].size(); ++iUniformBin) {
585 |         weight_below_current_F_per_uniform_bin[iSpectator][iUniformBin] = 0.0;
586 |       }
587 |     }
588 | 
589 |   }
590 | 
591 | }
592 | 
--------------------------------------------------------------------------------
/examples/comparison.cxx:
--------------------------------------------------------------------------------
1 | /**
2 |  * Thomas Keck 2017
3 |  *
4 |  * Performance comparison code.
5 |  *
6 |  * This file measures the runtime of the fitting and application phase for different hyper-parameter settings for
7 |  *  - FastBDT
8 |  *  - XGBoost
9 |  *  - TMVA
10 |  *  - SKLearn
11 |  * All methods are accessed via C++ to ensure optimal performance.
12 |  * In the case of XGBoost and SKLearn this is rather unusual, but it is faster than using Python.
13 |  * I wouldn't recommend using XGBoost and SKLearn in the way shown below in your daily work,
14 |  * it is rather error-prone, and I only do this to ensure fair conditions between the contestants.
15 |  *
16 |  * Compiling this code is complicated because it involves all the different frameworks.
17 |  * You have to install FastBDT and XGBoost from github, ROOT, and sklearn using pip3; as well as the python3.5 headers and libraries for your distribution.
18 |  *
19 |  * I compile in the following way:
20 |  * g++ comparison.cxx -o comparison -O3
21 |  *     -L ../FastBDT/ -I ../FastBDT/include/ -lFastBDT_shared
22 |  *     -I ../xgboost/rabit/include/ -I ../xgboost/dmlc-core/include/ -L ../xgboost/rabit/lib/ -L ../xgboost/dmlc-core/ -I ../xgboost/include/ -L ../xgboost/lib/ -l xgboost
23 |  *     `root-config --cflags --libs` -lTMVA -lMLP -lXMLIO
24 |  *     -lpython3.5
25 |  *
26 |  * And execute it like this:
27 |  * LD_LIBRARY_PATH=$PATH_TO_XGBOOST/lib/:$PATH_TO_FASTBDT/:$LD_LIBRARY_PATH ./comparison 10
28 |  *
29 |  * The executable takes a command line argument, which I use to call the executable multiple times
30 |  * with different hyper-parameter configurations.
31 |  *
32 |  * The code reads its data from data/train.csv and data/test.csv.
33 |  * These files must contain the appropriate amount of features (separated by whitespaces) and events (separated by linebreaks).
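 * For example, with three features a row could look like "0.12 3.4 -1.0 1"
 * (feature values first, integer truth label last).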
34 |  * The last column must be an integer with the truth information (1 for signal and 0 for background).
35 |  *
36 |  * The results of the measurements are written to files result_$id_cpp.txt, which contain the runtime of the preprocessing (preparation of the data),
37 |  * fitting (fitting the classifier) and application (inference on independent test data using the classifier), as well as the output probabilities
38 |  * for the test dataset for each event (where the last column contains the truth variable).
39 |  */
40 | 
41 | #include "FastBDT.h"
42 | 
43 | #include "xgboost/c_api.h"
44 | #include "xgboost/data.h"
45 | 
46 | #include
47 | #include
48 | #include
49 | #include
50 | 
51 | #include
52 | #include
53 | #include
54 | #include
55 | 
56 | #include
57 | #include
58 | #include
59 | 
60 | #include
61 | #include
62 | #include
63 | #include
64 | #include
65 | #include
66 | #include
67 | 
68 | class Data {
69 |   public:
70 |     Data(std::string datafile, unsigned int _numberOfFeatures, unsigned int _numberOfEvents) : numberOfFeatures(_numberOfFeatures), numberOfEvents(_numberOfEvents) {
71 | 
72 |       X.reserve(numberOfEvents);
73 |       y.reserve(numberOfEvents);
74 | 
75 |       std::fstream fs (datafile, std::fstream::in);
76 |       std::string line;
77 | 
78 |       // Skip Header
79 |       std::getline(fs, line);
80 | 
81 |       unsigned int iEvent = 0;
82 |       while(std::getline(fs, line)) {
83 | 
84 |         std::istringstream sin(line);
85 |         std::vector<float> row;
86 |         float value = 0;
87 |         unsigned int iFeature = 0;
88 |         while(sin >> value) {
89 |           if(iFeature < numberOfFeatures)
90 |             row.push_back(value);
91 |           ++iFeature;
92 |         }
93 |         X.push_back(row);
94 |         y.push_back(static_cast<unsigned int>(value));
95 | 
96 |         ++iEvent;
97 |         if(iEvent >= numberOfEvents) {
98 |           break;
99 |         }
100 |       }
101 | 
102 |       std::cout << "Loaded " << iEvent << " Events" << std::endl;
103 |     }
104 | 
105 |     unsigned int numberOfFeatures = 0;
106 |     unsigned int numberOfEvents = 0;
107 |     std::vector<std::vector<float>> X;
108 |     std::vector<unsigned int> y;
109 | };
110 | 
111 | 
112 | struct Config {
113 |   unsigned int nTrees;
114 |   unsigned int depth;
115 |   double shrinkage;
116 |   double subSampling;
117 |   // Only TMVA and FastBDT
118 |   unsigned int nCutLevels;
119 |   unsigned int numberOfFeatures;
120 |   unsigned int numberOfEvents;
121 | };
122 | 
123 | struct Result {
124 | 
125 |   std::string label;
126 |   std::vector<float> probabilities;
127 |   std::chrono::duration<double> preprocessingTime;
128 |   std::chrono::duration<double> trainingTime;
129 |   std::chrono::duration<double> testTime;
130 | };
131 | 
132 | 
133 | void writeResults(std::string filename, const std::vector<Result> &results, const Data& test, const Config& config) {
134 | 
135 |   std::fstream str(filename, std::fstream::out);
136 |   str << config.nTrees << " " << config.depth << " " << config.shrinkage << " " << config.subSampling << " " << config.nCutLevels << " " << config.numberOfFeatures << " " << config.numberOfEvents << std::endl;
137 | 
138 |   str << "Labels: ";
139 |   for(auto &r : results) {
140 |     str << r.label << " ";
141 |   }
142 |   str << std::endl;
143 | 
144 |   str << "PreprocessingTime: ";
145 |   for(auto &r : results) {
146 |     str << r.preprocessingTime.count() << " ";
147 |   }
148 |   str << std::endl;
149 | 
150 |   str << "TrainingTime: ";
151 |   for(auto &r : results) {
152 |     str << r.trainingTime.count() << " ";
153 |   }
154 |   str << std::endl;
155 | 
156 |   str << "TestTime: ";
157 |   for(auto &r : results) {
158 |     str << r.testTime.count() << " ";
159 |   }
160 |   str << std::endl;
161 | 
162 |   for(unsigned int iEvent = 0; iEvent < config.numberOfEvents; ++iEvent) {
163 |     for(auto &r : results) {
164 |       str << r.probabilities[iEvent] << " ";
165 |     }
166 |     str << test.y[iEvent] << std::endl;
167 |   }
168 | 
169 | }
170 | 
171 | Result measureSKLearn(const Data& train, const Data& test, const Config& config) {
172 | 
173 |   Result result;
174 |   result.label = "SKLearn";
175 | 
176 |   std::chrono::high_resolution_clock::time_point preprocessingTime1 = std::chrono::high_resolution_clock::now();
177 |   PyObject* cls = PyUnicode_FromString((char*)"GradientBoostingClassifier");
178 |   PyObject* fit = PyUnicode_FromString((char*)"fit");
179 |   PyObject* predict = PyUnicode_FromString((char*)"predict_proba");
180 |   PyObject* pModule = PyImport_ImportModule("sklearn.ensemble");
181 | 
182 |   PyObject* loss = PyUnicode_FromString((char*)"deviance");
183 |   PyObject* learning_rate = PyFloat_FromDouble(static_cast<double>(config.shrinkage));
184 |   PyObject* n_estimators = PyLong_FromLong(static_cast<long>(config.nTrees));
185 |   PyObject* subsample = PyFloat_FromDouble(static_cast<double>(config.subSampling));
186 |   PyObject* criterion = PyUnicode_FromString((char*)"friedman_mse");
187 |   PyObject* min_samples_split = PyLong_FromLong(static_cast<long>(2));
188 |   PyObject* min_samples_leaf = PyLong_FromLong(static_cast<long>(1));
189 |   PyObject* min_weight_fraction_leaf = PyFloat_FromDouble(static_cast<double>(0.0));
190 |   PyObject* max_depth = PyLong_FromLong(static_cast<long>(config.depth));
191 |   PyObject* forest = PyObject_CallMethodObjArgs(pModule, cls, loss, learning_rate, n_estimators, subsample, criterion, min_samples_split, min_samples_leaf, min_weight_fraction_leaf, max_depth, NULL);
192 |   Py_DECREF(loss);
193 |   Py_DECREF(learning_rate);
194 |   Py_DECREF(n_estimators);
195 |   Py_DECREF(subsample);
  Py_DECREF(criterion);
196 |   Py_DECREF(min_samples_split);
197 |   Py_DECREF(min_samples_leaf);
198 |   Py_DECREF(min_weight_fraction_leaf);
199 |   Py_DECREF(max_depth);
200 | 
201 |   float *X = new float[train.numberOfEvents*train.numberOfFeatures];
202 |   float *y = new float[train.numberOfEvents];
203 |   for(unsigned int iEvent = 0; iEvent < train.numberOfEvents; ++iEvent) {
204 |     for(unsigned int iFeature = 0; iFeature < train.numberOfFeatures; ++iFeature)
205 |       X[iEvent*train.numberOfFeatures + iFeature] = train.X[iEvent][iFeature];
206 |     y[iEvent] = static_cast<float>(train.y[iEvent]);
207 |   }
208 |   long dimensions_X[2] = {train.numberOfEvents, train.numberOfFeatures};
209 |   long dimensions_y[1] = {train.numberOfEvents};
210 |   PyObject* ndarray_X = PyArray_SimpleNewFromData(2, dimensions_X, NPY_FLOAT32, X);
211 |   PyObject* ndarray_y = PyArray_SimpleNewFromData(1, dimensions_y, NPY_FLOAT32, y);
212 | 
213 |   std::chrono::high_resolution_clock::time_point preprocessingTime2 = std::chrono::high_resolution_clock::now();
214 |   result.preprocessingTime = preprocessingTime2 - preprocessingTime1;
215 |   std::cout << "PreprocessingTime " << result.preprocessingTime.count() << std::endl;
216 | 
217 |   std::chrono::high_resolution_clock::time_point trainingTime1 = std::chrono::high_resolution_clock::now();
218 |   PyObject *x = PyObject_CallMethodObjArgs(forest, fit, ndarray_X, ndarray_y, NULL);
219 |   std::chrono::high_resolution_clock::time_point trainingTime2 = std::chrono::high_resolution_clock::now();
220 |   result.trainingTime = trainingTime2 - trainingTime1;
221 |   std::cout << "TrainingTime " << result.trainingTime.count() << std::endl;
222 | 
223 |   result.probabilities.resize(test.numberOfEvents);
224 | 
225 |   std::chrono::high_resolution_clock::time_point testTime1 = std::chrono::high_resolution_clock::now();
226 |   float *X_test = new float[test.numberOfEvents*test.numberOfFeatures];
227 |   for(unsigned int
Result measureFastBDT(const Data& train, const Data& test, const Config& config) {

    Result result;
    result.label = "FastBDT";

    std::chrono::high_resolution_clock::time_point preprocessingTime1 = std::chrono::high_resolution_clock::now();
    // Equal statistics binning
    std::vector<FastBDT::FeatureBinning<float>> featureBinnings(train.numberOfFeatures);
    std::vector<float> feature(train.numberOfEvents);
    for(unsigned int iFeature = 0; iFeature < train.numberOfFeatures; ++iFeature) {
        for(unsigned int iEvent = 0; iEvent < train.numberOfEvents; ++iEvent)
            feature[iEvent] = train.X[iEvent][iFeature];
        featureBinnings[iFeature] = FastBDT::FeatureBinning<float>(config.nCutLevels, feature);
    }

    // Fill event sample
    FastBDT::EventSample eventSample(train.numberOfEvents, train.numberOfFeatures, 0, std::vector<unsigned int>(train.numberOfFeatures, config.nCutLevels));
    std::vector<unsigned int> bins(train.numberOfFeatures);
    for(unsigned int iEvent = 0; iEvent < train.numberOfEvents; ++iEvent) {
        for(unsigned int iFeature = 0; iFeature < train.numberOfFeatures; ++iFeature)
            bins[iFeature] = featureBinnings[iFeature].ValueToBin( train.X[iEvent][iFeature] );
        eventSample.AddEvent(bins, 1.0, train.y[iEvent] == 1);
    }
    std::chrono::high_resolution_clock::time_point preprocessingTime2 = std::chrono::high_resolution_clock::now();
    result.preprocessingTime = preprocessingTime2 - preprocessingTime1;
    std::cout << "PreprocessingTime " << result.preprocessingTime.count() << std::endl;

    std::chrono::high_resolution_clock::time_point trainingTime1 = std::chrono::high_resolution_clock::now();
    // Train classifier using training data
    FastBDT::ForestBuilder dt(eventSample, config.nTrees, config.shrinkage, config.subSampling, config.depth);
    FastBDT::Forest<float> forest( dt.GetShrinkage(), dt.GetF0(), false);
    for( auto t : dt.GetForest() )
        forest.AddTree(FastBDT::removeFeatureBinningTransformationFromTree(t, featureBinnings));
    std::chrono::high_resolution_clock::time_point trainingTime2 = std::chrono::high_resolution_clock::now();
    result.trainingTime = trainingTime2 - trainingTime1;
    std::cout << "TrainingTime " << result.trainingTime.count() << std::endl;

    result.probabilities.resize(test.numberOfEvents);

    // Apply classifier on test data
    std::chrono::high_resolution_clock::time_point testTime1 = std::chrono::high_resolution_clock::now();
    for(unsigned int iEvent = 0; iEvent < test.numberOfEvents; ++iEvent) {
        result.probabilities[iEvent] = forest.Analyse(test.X[iEvent]);
    }
    std::chrono::high_resolution_clock::time_point testTime2 = std::chrono::high_resolution_clock::now();
    result.testTime = testTime2 - testTime1;
    std::cout << "TestTime " << result.testTime.count() << std::endl;
    return result;

}
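// Benchmark of TMVA's gradient-boosted BDT (BDTG). The training events are
// copied into signal and background TTrees, the method is booked with the
// shared hyper-parameters, and a TMVA::Reader evaluates the trained weights
// on the test sample.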
Result measureTMVA(const Data& train, const Data& test, const Config& config) {

    Result result;
    result.label = "TMVA";

    std::chrono::high_resolution_clock::time_point preprocessingTime1 = std::chrono::high_resolution_clock::now();
    TMVA::Tools::Instance();
    TFile classFile("TMVA.root", "RECREATE");
    classFile.cd();
    TMVA::DataLoader data_loader("TMVAClassification");
    TMVA::Factory factory("TMVAClassification", &classFile, "!V:Silent:Color:DrawProgressBar:AnalysisType=Classification");

    std::vector<std::string> variables = {"M", "p", "pt", "pz", "phi", "daughter__bo0__cm__spp__bc", "daughter__bo0__cm__sppz__bc", "daughter__bo0__cm__sppt__bc", "daughter__bo0__cm__spphi__bc", "daughter__bo1__cm__spp__bc", "daughter__bo1__cm__sppz__bc", "daughter__bo1__cm__sppt__bc", "daughter__bo1__cm__spphi__bc", "daughter__bo2__cm__spp__bc", "daughter__bo2__cm__sppz__bc", "daughter__bo2__cm__sppt__bc", "daughter__bo2__cm__spphi__bc", "chiProb", "dr", "dz", "dphi", "daughter__bo0__cm__spdr__bc", "daughter__bo1__cm__spdr__bc", "daughter__bo0__cm__spdz__bc", "daughter__bo1__cm__spdz__bc", "daughter__bo0__cm__spdphi__bc", "daughter__bo1__cm__spdphi__bc", "daughter__bo0__cm__spchiProb__bc", "daughter__bo1__cm__spchiProb__bc", "daughter__bo2__cm__spchiProb__bc", "daughter__bo0__cm__spKid__bc", "daughter__bo0__cm__sppiid__bc", "daughter__bo1__cm__spKid__bc", "daughter__bo1__cm__sppiid__bc", "daughterAngle__bo0__cm__sp1__bc", "daughterAngle__bo0__cm__sp2__bc", "daughterAngle__bo1__cm__sp2__bc", "daughter__bo2__cm__spdaughter__bo0__cm__spE__bc__bc", "daughter__bo2__cm__spdaughter__bo1__cm__spE__bc__bc", "daughter__bo2__cm__spdaughter__bo0__cm__spclusterTiming__bc__bc", "daughter__bo2__cm__spdaughter__bo1__cm__spclusterTiming__bc__bc", "daughter__bo2__cm__spdaughter__bo0__cm__spclusterE9E25__bc__bc", "daughter__bo2__cm__spdaughter__bo1__cm__spclusterE9E25__bc__bc", "daughter__bo2__cm__spdaughter__bo0__cm__spminC2HDist__bc__bc", "daughter__bo2__cm__spdaughter__bo1__cm__spminC2HDist__bc__bc", "daughterInvariantMass__bo0__cm__sp1__bc", "daughterInvariantMass__bo0__cm__sp2__bc", "daughterInvariantMass__bo1__cm__sp2__bc"};

    std::vector<float> vec(train.numberOfFeatures);
    for(unsigned int iFeature = 0; iFeature < train.numberOfFeatures; ++iFeature) {
        data_loader.AddVariable(variables[iFeature].c_str());
    }

    TTree *signal_tree = new TTree("signal_tree", "signal_tree");
    TTree *background_tree = new TTree("background_tree", "background_tree");

    for(unsigned int iFeature = 0; iFeature < train.numberOfFeatures; ++iFeature) {
        signal_tree->Branch(variables[iFeature].c_str(), &vec[iFeature]);
        background_tree->Branch(variables[iFeature].c_str(), &vec[iFeature]);
    }

    unsigned int nsig = 0;
    unsigned int nbkg = 0;
    for(unsigned int iEvent = 0; iEvent < train.numberOfEvents; ++iEvent) {
        for(unsigned int iFeature = 0; iFeature < train.numberOfFeatures; ++iFeature) {
            vec[iFeature] = train.X[iEvent][iFeature];
        }
        if(train.y[iEvent] == 1) {
            ++nsig;
            signal_tree->Fill();
        } else {
            ++nbkg;
            background_tree->Fill();
        }
    }

    data_loader.AddSignalTree(signal_tree);
    data_loader.AddBackgroundTree(background_tree);

    data_loader.PrepareTrainingAndTestTree("", std::string("nTrain_Signal=") + std::to_string(nsig) + std::string(":nTrain_Background=") + std::to_string(nbkg) + std::string(":SplitMode=Block:!V"));
    factory.BookMethod(&data_loader, TMVA::Types::kBDT, "BDTG", std::string("!H:!V:NTrees=") + std::to_string(config.nTrees) + std::string(":BoostType=Grad:Shrinkage=") + std::to_string(config.shrinkage) + std::string(":UseBaggedBoost:BaggedSampleFraction=") + std::to_string(config.subSampling) + std::string(":nCuts=") + std::to_string(1 << config.nCutLevels) + std::string(":MaxDepth=") + std::to_string(config.depth) + std::string(":IgnoreNegWeightsInTraining"));

    TMVA::Reader *reader = new TMVA::Reader("!Color:!Silent");
    for(unsigned int iFeature = 0; iFeature < train.numberOfFeatures; ++iFeature) {
        reader->AddVariable(variables[iFeature].c_str(), &vec[iFeature]);
    }

    std::chrono::high_resolution_clock::time_point preprocessingTime2 = std::chrono::high_resolution_clock::now();
    result.preprocessingTime = preprocessingTime2 - preprocessingTime1;
    std::cout << "PreprocessingTime " << result.preprocessingTime.count() << std::endl;

    std::chrono::high_resolution_clock::time_point trainingTime1 = std::chrono::high_resolution_clock::now();
    factory.TrainAllMethods();
    std::chrono::high_resolution_clock::time_point trainingTime2 = std::chrono::high_resolution_clock::now();
    result.trainingTime = trainingTime2 - trainingTime1;
    std::cout << "TrainingTime " << result.trainingTime.count() << std::endl;

    //factory.TestAllMethods();
    //factory.EvaluateAllMethods();

    reader->BookMVA("BDTG", "TMVAClassification/weights/TMVAClassification_BDTG.weights.xml");
    result.probabilities.resize(test.numberOfEvents);

    // Apply classifier on test data
    std::chrono::high_resolution_clock::time_point testTime1 = std::chrono::high_resolution_clock::now();
    for(unsigned int iEvent = 0; iEvent < test.numberOfEvents; ++iEvent) {
        for(unsigned int iFeature = 0; iFeature < test.numberOfFeatures; ++iFeature) {
            vec[iFeature] = test.X[iEvent][iFeature];
        }
        // BDTG returns a score in [-1, 1]; map it to [0, 1] for comparability
        result.probabilities[iEvent] = (reader->EvaluateMVA("BDTG") + 1)*0.5;
    }
    std::chrono::high_resolution_clock::time_point testTime2 = std::chrono::high_resolution_clock::now();
    result.testTime = testTime2 - testTime1;
    std::cout << "TestTime " << result.testTime.count() << std::endl;

    delete reader;
    delete signal_tree;
    delete background_tree;

    return result;

}
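// Benchmark of XGBoost via its C API. The training data is packed into a
// DMatrix, one boosting iteration is run per tree, and XGBoosterPredict
// yields the signal probabilities directly (binary:logistic objective).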
Result measureXGBoost(const Data& train, const Data& test, const Config& config) {

    Result result;
    result.label = "XGBoost";

    std::chrono::high_resolution_clock::time_point preprocessingTime1 = std::chrono::high_resolution_clock::now();
    // Create XGDMatrix
    float *matrix = new float[train.numberOfEvents*train.numberOfFeatures];
    for(unsigned int iEvent = 0; iEvent < train.numberOfEvents; ++iEvent)
        for(unsigned int iFeature = 0; iFeature < train.numberOfFeatures; ++iFeature)
            matrix[iEvent*train.numberOfFeatures + iFeature] = train.X[iEvent][iFeature];

    DMatrixHandle dmatrix;
    XGDMatrixCreateFromMat(matrix, train.numberOfEvents, train.numberOfFeatures, NAN, &dmatrix);
    delete[] matrix;

    XGDMatrixSetUIntInfo(dmatrix, "label", train.y.data(), train.numberOfEvents);

    std::chrono::high_resolution_clock::time_point preprocessingTime2 = std::chrono::high_resolution_clock::now();
    result.preprocessingTime = preprocessingTime2 - preprocessingTime1;
    std::cout << "PreprocessingTime " << result.preprocessingTime.count() << std::endl;

    std::chrono::high_resolution_clock::time_point trainingTime1 = std::chrono::high_resolution_clock::now();
    BoosterHandle booster;
    XGBoosterCreate(&dmatrix, 1, &booster);
    XGBoosterSetParam(booster, "max_depth", std::to_string(config.depth).c_str());
    XGBoosterSetParam(booster, "eta", std::to_string(config.shrinkage).c_str());
    XGBoosterSetParam(booster, "silent", std::to_string(1).c_str());
    XGBoosterSetParam(booster, "subsample", std::to_string(config.subSampling).c_str());
    XGBoosterSetParam(booster, "nthread", std::to_string(1).c_str());
    XGBoosterSetParam(booster, "objective", "binary:logistic");
    XGBoosterSetParam(booster, "tree_method", "hist");

    // Train classifier using training data
    for(unsigned int iBoost = 0; iBoost < config.nTrees; ++iBoost) {
        XGBoosterUpdateOneIter(booster, iBoost, dmatrix);
    }

    std::chrono::high_resolution_clock::time_point trainingTime2 = std::chrono::high_resolution_clock::now();
    result.trainingTime = trainingTime2 - trainingTime1;
    std::cout << "TrainingTime " << result.trainingTime.count() << std::endl;

    result.probabilities.resize(test.numberOfEvents);

    // Apply classifier on test data
    std::chrono::high_resolution_clock::time_point testTime1 = std::chrono::high_resolution_clock::now();
    float *test_matrix = new float[test.numberOfEvents*test.numberOfFeatures];
    for(unsigned int iEvent = 0; iEvent < test.numberOfEvents; ++iEvent)
        for(unsigned int iFeature = 0; iFeature < test.numberOfFeatures; ++iFeature)
            test_matrix[iEvent*test.numberOfFeatures + iFeature] = test.X[iEvent][iFeature];
    DMatrixHandle test_dmatrix;
    XGDMatrixCreateFromMat(test_matrix, test.numberOfEvents, test.numberOfFeatures, NAN, &test_dmatrix);
    delete[] test_matrix;
    bst_ulong out_len;
    const float *out_result;
    XGBoosterPredict(booster, test_dmatrix, 0, 0, &out_len, &out_result);
    for(unsigned int iEvent = 0; iEvent < test.numberOfEvents; ++iEvent) {
        result.probabilities[iEvent] = out_result[iEvent];
    }
    std::chrono::high_resolution_clock::time_point testTime2 = std::chrono::high_resolution_clock::now();
    result.testTime = testTime2 - testTime1;
    std::cout << "TestTime " << result.testTime.count() << std::endl;

    XGBoosterFree(booster);
    XGDMatrixFree(dmatrix);
    XGDMatrixFree(test_dmatrix);

    return result;

}
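// Loads the training and test CSV files once, then repeats the full set of
// measurements five times and writes one result file per iteration, so the
// timings can be compared across repetitions.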
void measure(Config &config, unsigned int id) {

    std::chrono::high_resolution_clock::time_point loadTime1 = std::chrono::high_resolution_clock::now();
    Data train("data/train.csv", config.numberOfFeatures, config.numberOfEvents);
    Data test("data/test.csv", config.numberOfFeatures, config.numberOfEvents);
    std::chrono::high_resolution_clock::time_point loadTime2 = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> loadTime = loadTime2 - loadTime1;
    std::cout << "LoadTime " << loadTime.count() << std::endl;

    // Repeat each measurement 5 times
    for(unsigned int i = 0; i < 5; ++i) {

        std::chrono::high_resolution_clock::time_point measureSKLearnTime1 = std::chrono::high_resolution_clock::now();
        Result resultSKLearn = measureSKLearn(train, test, config);
        std::chrono::high_resolution_clock::time_point measureSKLearnTime2 = std::chrono::high_resolution_clock::now();
        std::chrono::duration<double> measureSKLearnTime = measureSKLearnTime2 - measureSKLearnTime1;
        std::cout << "MeasureSKLearnTime " << measureSKLearnTime.count() << std::endl;

        std::chrono::high_resolution_clock::time_point measureTMVATime1 = std::chrono::high_resolution_clock::now();
        Result resultTMVA = measureTMVA(train, test, config);
        std::chrono::high_resolution_clock::time_point measureTMVATime2 = std::chrono::high_resolution_clock::now();
        std::chrono::duration<double> measureTMVATime = measureTMVATime2 - measureTMVATime1;
        std::cout << "MeasureTMVATime " << measureTMVATime.count() << std::endl;

        std::chrono::high_resolution_clock::time_point measureXGBoostTime1 = std::chrono::high_resolution_clock::now();
        Result resultXGBoost = measureXGBoost(train, test, config);
        std::chrono::high_resolution_clock::time_point measureXGBoostTime2 = std::chrono::high_resolution_clock::now();
        std::chrono::duration<double> measureXGBoostTime = measureXGBoostTime2 - measureXGBoostTime1;
        std::cout << "MeasureXGBoostTime " << measureXGBoostTime.count() << std::endl;

        std::chrono::high_resolution_clock::time_point measureFastBDTTime1 = std::chrono::high_resolution_clock::now();
        Result resultFastBDT = measureFastBDT(train, test, config);
        std::chrono::high_resolution_clock::time_point measureFastBDTTime2 = std::chrono::high_resolution_clock::now();
        std::chrono::duration<double> measureFastBDTTime = measureFastBDTTime2 - measureFastBDTTime1;
        std::cout << "MeasureFastBDTTime " << measureFastBDTTime.count() << std::endl;

        writeResults(std::string("result_") + std::to_string(id+i) + std::string("_cpp.txt"), {resultFastBDT, resultXGBoost, resultSKLearn, resultTMVA}, test, config);
    }
}
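// Usage: <comparison-binary> <id>
// The id determines the names of the result files (and, if one of the
// commented-out lines below is enabled, the hyper-parameter scan point).
// Expects data/train.csv and data/test.csv relative to the working directory.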
int main(int argc, char *argv[]) {

    Py_Initialize();
    import_array();

    Config config;
    config.nTrees = 100;
    config.depth = 3;
    config.shrinkage = 0.1;
    config.subSampling = 0.5;
    config.nCutLevels = 8;
    config.numberOfEvents = 800000;
    config.numberOfFeatures = 40;

    if(argc < 2) {
        std::cerr << "Missing required argument: id" << std::endl;
        return 1;
    }
    unsigned int id = atoi(argv[1]);

    // Here you can choose different hyper-parameters depending on the passed id
    //config.nTrees = id*10;
    //config.numberOfEvents = 500000 >> (id - 1);
    //config.numberOfFeatures = id;
    //config.depth = id;
    //config.subSampling = 0.1*id;

    measure(config, id*10);

    Py_Finalize();

}
--------------------------------------------------------------------------------