├── PyFastBDT
│   ├── __init__.py
│   ├── utility.py
│   └── FastBDT.py
├── MANIFEST.in
├── examples
│   ├── Makefile
│   ├── PurityTransformation.py
│   ├── PythonExample.py
│   ├── ugboost.py
│   ├── CPPExample.cxx
│   ├── orthogonal_discriminator.py
│   ├── performance.py
│   ├── splot.py
│   └── comparison.cxx
├── src
│   ├── test_all.cxx
│   ├── FastBDT_IO.cxx
│   ├── test_FastBDT_C_API.cxx
│   ├── FastBDT_C_API.cxx
│   ├── test_Classifier.cxx
│   ├── Classifier.cxx
│   ├── test_Performance.cxx
│   ├── test_FastBDT_IO.cxx
│   └── FastBDT.cxx
├── setup.py.in
├── include
│   ├── LinkDef.h
│   ├── FastBDT_C_API.h
│   ├── Classifier.h
│   └── FastBDT_IO.h
├── .travis.yml
├── README.md
├── CMakeLists.txt
└── files
    └── iris.txt
-------------------------------------------------------------------------------- /PyFastBDT/__init__.py: --------------------------------------------------------------------------------
1 | 
-------------------------------------------------------------------------------- /MANIFEST.in: --------------------------------------------------------------------------------
1 | include libFastBDT_CInterface.so
2 | include libFastBDT_shared.so
-------------------------------------------------------------------------------- /examples/Makefile: --------------------------------------------------------------------------------
1 | make:
2 | 	#g++ CPPExample.cxx -o CPPExample -l FastBDT_static -L ../ -I ../include/ -ggdb3
3 | 	g++ CPPExample.cxx -o CPPExample -l FastBDT_static -L ../ -I ../include/ -O3
-------------------------------------------------------------------------------- /src/test_all.cxx: --------------------------------------------------------------------------------
1 | /**
2 |  * Thomas Keck 2015
3 |  */
4 | 
5 | #include <gtest/gtest.h>
6 | 
7 | int main(int argc, char **argv) {
8 |     ::testing::InitGoogleTest(&argc, argv);
9 |     return RUN_ALL_TESTS();
10 | }
-------------------------------------------------------------------------------- /setup.py.in: --------------------------------------------------------------------------------
1 | from distutils.core import setup
2 | 
3 | setup(name='PyFastBDT',
4 |       version='${FastBDT_VERSION_MAJOR}.${FastBDT_VERSION_MINOR}',
5 |       packages=['PyFastBDT'],
6 |       package_data={'PyFastBDT': ['*.so']},
7 |       )
8 | 
-------------------------------------------------------------------------------- /include/LinkDef.h: --------------------------------------------------------------------------------
1 | #ifdef __CINT__
2 | 
3 | #pragma link off all global;
4 | #pragma link off all class;
5 | #pragma link off all function;
6 | #pragma link off all namespace;
7 | 
8 | 
9 | #pragma link C++ class TMVA::MethodFastBDT+;
10 | #pragma link C++ namespace TMVA;
11 | #pragma link C++ nestedclass;
12 | #pragma link C++ nestedtypedef;
13 | 
14 | #endif
15 | 
-------------------------------------------------------------------------------- /.travis.yml: --------------------------------------------------------------------------------
1 | sudo: required
2 | dist: trusty
3 | language: cpp
4 | notifications:
5 |   email:
6 |     on_success: change # default: change
7 |     on_failure: always # default: always
8 | compiler:
9 |   - gcc
10 | addons:
11 |   apt:
12 |     sources:
13 |     - ubuntu-toolchain-r-test
14 |     packages:
15 |     - libgtest-dev
16 |     - build-essential
17 |     - cmake
18 | before_install:
19 |   - cd /usr/src/gtest && sudo cmake . && sudo make && sudo mv libg* /usr/lib/ && cd -
20 | install:
21 |   - cmake .
22 |   - make VERBOSE=1
23 | script:
24 |   - ./unittests
25 | 
-------------------------------------------------------------------------------- /examples/PurityTransformation.py: --------------------------------------------------------------------------------
1 | from PyFastBDT import FastBDT
2 | 
3 | import pandas
4 | import numpy as np
5 | import sklearn.metrics
6 | 
7 | if __name__ == '__main__':
8 | 
9 |     data = np.arange(100000)
10 |     X = (data % 100).reshape((100000, 1))
11 |     y = (data % 2) == 1
12 | 
13 |     # With a single tree of depth 1 a plain cut on X cannot separate odd from even values;
14 |     # ordering the bins by their signal purity first makes one cut sufficient.
15 |     clf = FastBDT.Classifier(nTrees=1, depth=1, shrinkage=0.1, subsample=1.0, purityTransformation=[False]).fit(X=X, y=y)
16 |     p = clf.predict(X)
17 |     print('No Purity Transformation', sklearn.metrics.roc_auc_score(y, p))
18 | 
19 |     clf = FastBDT.Classifier(nTrees=1, depth=1, shrinkage=0.1, subsample=1.0, purityTransformation=[True]).fit(X=X, y=y)
20 |     p = clf.predict(X)
21 |     print('With Purity Transformation', sklearn.metrics.roc_auc_score(y, p))
-------------------------------------------------------------------------------- /PyFastBDT/utility.py: --------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | 
4 | def flatness(probability, feature, target, weights=None, classes=[0, 1]):
5 |     """
6 |     Calculates the flatness of a feature under cuts on a signal probability
7 |     @param probability the signal probability values
8 |     @param feature the feature values
9 |     @param target the target class values
10 |     @param weights optional weights
11 |     @param classes the classes for which the flatness is calculated
12 |     @return the mean standard deviation between the local and global cut selection efficiency
13 |     """
14 |     quantiles = list(range(101))
15 |     flatness_score = 0
16 |     for m in [target == c for c in classes]:
17 |         p = probability[m]
18 |         f = feature[m]
19 |         if weights is None:
20 |             w = None
21 |         else:
22 |             w = weights[m]
23 | 
24 |         binning_feature = np.unique(np.percentile(f, q=quantiles))
25 |         binning_probability = np.unique(np.percentile(p, q=quantiles))
26 |         hist_n, _ = np.histogramdd(np.c_[p, f],
27 |                                    bins=[binning_probability, binning_feature],
28 |                                    weights=w)
29 |         hist_inc = hist_n.sum(axis=1)
30 |         hist_inc /= hist_inc.sum(axis=0)
31 |         hist_n /= hist_n.sum(axis=0)
32 |         hist_n = hist_n.cumsum(axis=0)
33 |         hist_inc = hist_inc.cumsum(axis=0)
34 |         diff = (hist_n.T - hist_inc)**2
35 |         flatness_score += diff.sum() / (100*99)
36 |     return np.sqrt(flatness_score)
37 | 
38 | 
39 | 
40 | def auc_roc(probability, target):
41 |     # Integrates the purity over the efficiency (the ROC curve) with the trapezoidal rule
42 |     N = len(target)
43 |     T = np.sum(target)
44 |     index = np.argsort(probability)
45 |     efficiency = (T - np.cumsum(target[index])) / float(T)
46 |     purity = (T - np.cumsum(target[index])) / (N - np.cumsum(np.ones(N)))
47 |     purity = np.where(np.isnan(purity), 0, purity)
48 |     return np.abs(np.trapz(purity, efficiency))
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | # FastBDT
2 | 
3 | Stochastic Gradient Boosted Decision Trees, usable standalone and via a Python interface.
4 | 
5 | # Paper on ArXiv: http://arxiv.org/abs/1609.06119
6 | 
7 | FastBDT: A speed-optimized and cache-friendly implementation of stochastic gradient-boosted decision trees for multivariate classification
8 | 
9 | Stochastic gradient-boosted decision trees are widely employed for multivariate classification and regression tasks. This paper presents a speed-optimized and cache-friendly implementation for multivariate classification called FastBDT. FastBDT is one order of magnitude faster during the fitting-phase and application-phase, in comparison with popular implementations in software frameworks like TMVA, scikit-learn and XGBoost. The concepts used to optimize the execution time and performance studies are discussed in detail in this paper. The key ideas include: an equal-frequency binning on the input data, which allows replacing expensive floating-point with integer operations, while at the same time increasing the quality of the classification; and a cache-friendly linear access pattern to the input data, in contrast to usual implementations, which exhibit a random access pattern. FastBDT provides interfaces to C/C++ and Python. It is extensively used in the field of high energy physics by the Belle II experiment.
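The equal-frequency binning mentioned in the abstract is easy to picture. The sketch below only illustrates the idea in NumPy; it is not FastBDT's internal implementation:

    import numpy as np

    def equal_frequency_binning(feature, n_bins=256):
        # Choose the bin boundaries as quantiles, so every bin receives
        # roughly the same number of events.
        boundaries = np.percentile(feature, np.linspace(0, 100, n_bins + 1)[1:-1])
        # All subsequent tree building then operates on small integer bin
        # indices instead of floating-point feature values.
        return np.digitize(feature, boundaries)

    bin_indices = equal_frequency_binning(np.random.normal(size=10000))

Because the binning adapts to the observed feature distribution, it is also robust against outliers and against monotonic transformations of a feature.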
10 | 
11 | 
12 | # Installation
13 | 
14 | * cmake .
15 | * make
16 | * make install
17 | * make package (optional, builds rpm, deb and tgz packages)
18 | * python3 setup.py install (optional, installs the Python package)
19 | 
20 | 
21 | # Usage
22 | 
23 | Before you do anything else you should execute the unittests:
24 | * ./unittests
25 | 
26 | Usually it is more convenient to use FastBDT as a library
27 | and integrate FastBDT directly into your application using
28 | * the C++ shared/static library (see examples/CPPExample.cxx),
29 | * the C shared library,
30 | * or the Python3 library PyFastBDT/FastBDT.py (see examples/PythonExample.py).
31 | 
32 | 
33 | # Further reading
34 | This work is mostly based on the papers by Jerome H. Friedman:
35 | * https://statweb.stanford.edu/~jhf/ftp/trebst.pdf
36 | * https://statweb.stanford.edu/~jhf/ftp/stobst.pdf
37 | 
38 | FastBDT also implements the uGB technique to boost to flatness:
39 | * https://arxiv.org/abs/1410.4140
40 | 
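A minimal end-to-end example with the Python interface, condensed from examples/PythonExample.py (the toy data here is made up for illustration):

    import numpy as np
    from PyFastBDT import FastBDT

    # Toy data: only the first feature carries information about the target.
    X = np.random.normal(size=(10000, 3))
    y = X[:, 0] + 0.5 * np.random.normal(size=10000) > 0

    clf = FastBDT.Classifier(nTrees=100, depth=3, shrinkage=0.1, subsample=0.5)
    clf.fit(X=X, y=y)
    probabilities = clf.predict(X)

The C++ example can be built with the Makefile in examples/ after compiling the static library.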
-------------------------------------------------------------------------------- /include/FastBDT_C_API.h: --------------------------------------------------------------------------------
1 | /**
2 |  * Thomas Keck 2015
3 |  */
4 | 
5 | #include "FastBDT.h"
6 | #include "FastBDT_IO.h"
7 | #include "Classifier.h"
8 | 
9 | extern "C" {
10 | 
11 | void PrintVersion();
12 | 
13 | struct Expertise {
14 |     FastBDT::Classifier classifier;
15 | };
16 | 
17 | void* Create();
18 | 
19 | void SetBinning(void *ptr, unsigned int* binning, unsigned int size);
20 | void SetPurityTransformation(void *ptr, bool* purityTransformation, unsigned int size);
21 | 
22 | void SetNTrees(void *ptr, unsigned int nTrees);
23 | unsigned int GetNTrees(void *ptr);
24 | 
25 | void SetDepth(void *ptr, unsigned int depth);
26 | unsigned int GetDepth(void *ptr);
27 | 
28 | void SetNumberOfFlatnessFeatures(void *ptr, unsigned int numberOfFlatnessFeatures);
29 | unsigned int GetNumberOfFlatnessFeatures(void *ptr);
30 | 
31 | void SetSubsample(void *ptr, double subsample);
32 | double GetSubsample(void *ptr);
33 | 
34 | void SetShrinkage(void *ptr, double shrinkage);
35 | double GetShrinkage(void *ptr);
36 | 
37 | void SetFlatnessLoss(void *ptr, double flatnessLoss);
38 | double GetFlatnessLoss(void *ptr);
39 | 
40 | void SetTransform2Probability(void *ptr, bool transform2probability);
41 | bool GetTransform2Probability(void *ptr);
42 | 
43 | void SetSPlot(void *ptr, bool sPlot);
44 | bool GetSPlot(void *ptr);
45 | 
46 | void Delete(void *ptr);
47 | 
48 | void Fit(void *ptr, float *data_ptr, float *weight_ptr, bool *target_ptr, unsigned int nEvents, unsigned int nFeatures);
49 | 
50 | void Load(void* ptr, char *weightfile);
51 | 
52 | float Predict(void *ptr, float *array);
53 | 
54 | void PredictArray(void *ptr, float *array, float *result, unsigned int nEvents);
55 | 
56 | void Save(void* ptr, char *weightfile);
57 | 
58 | struct VariableRanking {
59 |     std::map<unsigned int, double> ranking;
60 | };
61 | 
62 | void* GetVariableRanking(void* ptr);
63 | 
64 | void* GetIndividualVariableRanking(void* ptr, float *array);
65 | 
66 | unsigned int ExtractNumberOfVariablesFromVariableRanking(void* ptr);
67 | 
68 | double ExtractImportanceOfVariableFromVariableRanking(void* ptr, unsigned int iFeature);
69 | 
70 | void DeleteVariableRanking(void* ptr);
71 | 
72 | }
-------------------------------------------------------------------------------- /examples/PythonExample.py: --------------------------------------------------------------------------------
1 | import sys
2 | from PyFastBDT import FastBDT
3 | 
4 | import numpy as np
5 | import sklearn.metrics
6 | 
7 | if __name__ == '__main__':
8 | 
9 |     # Create some Monte Carlo data using a multidimensional gaussian distribution
10 |     # The 0th row of the covariance matrix describes the correlation to the target variable
11 |     mean = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]
12 |     cov = [[1.0, 0.8, 0.4, 0.2, 0.1, 0.0],
13 |            [0.0, 1.0, 0.0, 0.0, 0.0, 0.0],
14 |            [0.0, 0.0, 1.0, 0.0, 0.0, 0.0],
15 |            [0.0, 0.0, 0.0, 1.0, 0.0, 0.0],
16 |            [0.0, 0.0, 0.0, 0.0, 1.0, 0.0],
17 |            [0.0, 0.0, 0.0, 0.0, 0.0, 1.0]]
18 | 
19 |     for i in range(len(mean)):
20 |         for j in range(i+1, len(mean)):
21 |             cov[j][i] = cov[i][j]
22 | 
23 |     N_train, N_test = 10000, 10000
24 |     data = np.random.multivariate_normal(mean, cov, N_train + N_test)
25 |     X_train, y_train = data[:N_train, 1:], data[:N_train, 0] > 0
26 |     X_test, y_test = data[N_train:, 1:], data[N_train:, 0] > 0
27 | 
28 |     # Train FastBDT using its Python interface, which mirrors the sklearn classifiers
29 |     clf = FastBDT.Classifier()
30 |     clf.fit(X=X_train, y=y_train)
31 |     p = clf.predict(X_test)
32 |     global_auc = sklearn.metrics.roc_auc_score(y_test, p)
33 |     print("Global AUC", global_auc)
34 | 
35 |     # Internal feature importance is calculated using the sum of the information gains
36 |     # provided by each feature over all decision trees
37 |     print("Intern Feature Importance")
38 |     print(clf.internFeatureImportance())
39 | 
40 |     # External feature importance is calculated using the drop in the area under the receiver operating characteristic curve
41 |     # when the most important feature is left out recursively
42 |     print("Extern Feature Importance")
43 |     print(clf.externFeatureImportance(X_train, y_train, None, X_test, y_test, None))
44 | 
45 |     # Individual feature importance is the sum of the information gains provided by each feature
46 |     # in the path an individual event takes through the forest
47 |     print("Individual Feature Importance")
48 |     events = [ np.array([1.0, 2.0, 3.0, 4.0, 5.0]),
49 |                np.array([2.0, 2.0, 3.0, 4.0, 5.0]),
50 |                np.array([0.0, 2.0, 3.0, 4.0, 5.0]),
51 |                np.array([1.0, 3.0, 3.0, 4.0, 5.0]),
52 |                np.array([1.0, 1.0, 3.0, 4.0, 5.0]),
53 |                np.array([1.0, 2.0, 4.0, 4.0, 5.0]),
54 |                np.array([1.0, 2.0, 2.0, 4.0, 5.0]),
55 |                np.array([1.0, 2.0, 3.0, 5.0, 5.0]),
56 |                np.array([1.0, 2.0, 3.0, 3.0, 5.0]),
57 |                np.array([1.0, 2.0, 3.0, 4.0, 6.0]),
58 |                np.array([1.0, 2.0, 3.0, 4.0, 4.0]) ]
59 | 
60 |     for event in events:
61 |         print(clf.individualFeatureImportance(event))
62 | 
63 |     # Train again, once without and once with the purity transformation for all five features
64 |     clf = FastBDT.Classifier(purityTransformation=[False, False, False, False, False], subsample=1.0)
65 |     clf.fit(X=X_train, y=y_train)
66 |     p = clf.predict(X_test)
67 |     global_auc = sklearn.metrics.roc_auc_score(y_test, p)
68 |     print("Global AUC without Purity Transformation", global_auc)
69 | 
70 |     clf = FastBDT.Classifier(purityTransformation=[True, True, True, True, True], subsample=1.0)
71 |     clf.fit(X=X_train, y=y_train)
72 |     p = clf.predict(X_test)
73 |     global_auc = sklearn.metrics.roc_auc_score(y_test, p)
74 |     print("Global AUC with Purity Transformation", global_auc)
75 | 
-------------------------------------------------------------------------------- /CMakeLists.txt: --------------------------------------------------------------------------------
1 | cmake_minimum_required (VERSION 2.8.12)
2 | 
3 | set(CMAKE_C_COMPILER gcc)
4 | set(CMAKE_CXX_COMPILER g++)
5 | 
6 | project (FastBDT)
7 | set (FastBDT_VERSION_MAJOR 5)
8 | set (FastBDT_VERSION_MINOR 2)
9 | 
10 | set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${PROJECT_SOURCE_DIR}/cmake")
11 | 
12 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -std=c++11 -Wall -Wextra -g -msse2")
13 | #set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ggdb3 -g -std=c++11 -Wall -Wextra")
14 | 
15 | configure_file (
16 |   "${PROJECT_SOURCE_DIR}/include/FastBDT.h.in"
17 |   "${PROJECT_BINARY_DIR}/include/FastBDT.h"
18 | )
19 | 
20 | include_directories("${PROJECT_SOURCE_DIR}/include/" "${PROJECT_BINARY_DIR}/include/")
21 | 
22 | set(FastBDT_SOURCES
23 |   "${PROJECT_SOURCE_DIR}/src/FastBDT.cxx"
24 |   "${PROJECT_SOURCE_DIR}/src/Classifier.cxx"
25 |   "${PROJECT_SOURCE_DIR}/src/FastBDT_IO.cxx"
26 | )
27 | 
28 | set(FastBDT_TESTS
29 |   "${PROJECT_SOURCE_DIR}/src/test_all.cxx"
30 |   "${PROJECT_SOURCE_DIR}/src/test_FastBDT.cxx"
31 |   "${PROJECT_SOURCE_DIR}/src/test_Performance.cxx"
32 |   "${PROJECT_SOURCE_DIR}/src/test_Classifier.cxx"
33 |   "${PROJECT_SOURCE_DIR}/src/test_FastBDT_IO.cxx"
34 |   "${PROJECT_SOURCE_DIR}/src/test_FastBDT_C_API.cxx"
35 | )
36 | 
37 | set(FastBDT_HEADERS
38 |   "${PROJECT_BINARY_DIR}/include/FastBDT.h"
39 |   "${PROJECT_SOURCE_DIR}/include/Classifier.h"
40 |   "${PROJECT_SOURCE_DIR}/include/FastBDT_IO.h"
41 | )
42 | 
43 | set(FastBDT_CINTERFACE
44 |   "${PROJECT_SOURCE_DIR}/src/FastBDT_C_API.cxx"
45 |   "${PROJECT_SOURCE_DIR}/include/FastBDT_C_API.h"
46 | )
47 | 
48 | set(FastBDT_Python
49 |   "${PROJECT_SOURCE_DIR}/PyFastBDT/__init__.py"
50 |   "${PROJECT_SOURCE_DIR}/PyFastBDT/FastBDT.py"
51 |   "${PROJECT_SOURCE_DIR}/PyFastBDT/utility.py"
52 | )
53 | 
54 | add_library(FastBDT_static STATIC ${FastBDT_SOURCES} ${FastBDT_HEADERS})
55 | add_library(FastBDT_CInterface SHARED ${FastBDT_CINTERFACE} ${FastBDT_SOURCES} ${FastBDT_HEADERS})
56 | target_link_libraries(FastBDT_CInterface)
57 | add_library(FastBDT_shared SHARED ${FastBDT_SOURCES} ${FastBDT_HEADERS})
58 | target_link_libraries(FastBDT_shared)
59 | 
60 | install(TARGETS FastBDT_static FastBDT_shared FastBDT_CInterface
61 |   LIBRARY DESTINATION lib
62 |   ARCHIVE DESTINATION lib
63 |   RUNTIME DESTINATION bin
64 | )
65 | 
66 | install(FILES ${FastBDT_HEADERS} DESTINATION include)
67 | 
68 | find_package(GTest)
69 | if(GTEST_FOUND)
70 |   add_executable(unittests ${FastBDT_TESTS} ${FastBDT_HEADERS} ${FastBDT_CINTERFACE})
71 |   target_link_libraries(unittests ${GTEST_BOTH_LIBRARIES} FastBDT_static pthread)
72 |   message(STATUS ${GTEST_INCLUDE_DIRS})
73 |   target_include_directories(unittests PUBLIC ${GTEST_INCLUDE_DIRS})
74 |   install(TARGETS unittests DESTINATION bin)
75 | else()
76 |   message(STATUS "Could not find gtest installation, skip building unittests.")
77 | endif()
78 | 
79 | find_program(PYTHON "python3")
80 | 
81 | if (PYTHON)
82 |   configure_file(
83 |     "${PROJECT_SOURCE_DIR}/setup.py.in"
"${PROJECT_BINARY_DIR}/setup.py" 85 | ) 86 | add_custom_target(PyFastBDT ALL DEPENDS ${FastBDT_Python} FastBDT_shared FastBDT_CInterface) 87 | 88 | add_custom_command(TARGET PyFastBDT PRE_BUILD 89 | COMMAND ${CMAKE_COMMAND} -E copy_directory "${CMAKE_SOURCE_DIR}/PyFastBDT" "${CMAKE_BINARY_DIR}/PyFastBDT" 90 | COMMAND ${CMAKE_COMMAND} -E copy $ "${PROJECT_BINARY_DIR}/PyFastBDT/" 91 | COMMAND ${CMAKE_COMMAND} -E copy $ "${PROJECT_BINARY_DIR}/PyFastBDT/" 92 | COMMAND ${PYTHON} "${CMAKE_BINARY_DIR}/setup.py" build 93 | COMMAND ${CMAKE_COMMAND} -E touch "${CMAKE_CURRENT_BINARY_DIR}/build/timestamp" 94 | ) 95 | 96 | install(CODE "execute_process(COMMAND ${PYTHON} ${PROJECT_BINARY_DIR}/setup.py install --prefix=${CMAKE_INSTALL_PREFIX})") 97 | endif() 98 | 99 | set(CPACK_PACKAGE_VERSION "${FastBDT_VERSION_MAJOR}.${FastBDT_VERSION_MINOR}") 100 | set(CPACK_GENERATOR "RPM;DEB;TGZ") 101 | set(CPACK_PACKAGE_NAME "FastBDT") 102 | set(CPACK_PACKAGE_RELEASE 1) 103 | set(CPACK_PACKAGE_CONTACT "thomas.keck2@kit.edu") 104 | set(CPACK_PACKAGE_VENDOR "Private") 105 | set(CPACK_PACKAGING_INSTALL_PREFIX ${CMAKE_INSTALL_PREFIX}) 106 | set(CPACK_PACKAGE_FILE_NAME "${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION}-${CPACK_PACKAGE_RELEASE}.${CMAKE_SYSTEM_PROCESSOR}") 107 | 108 | SET(CPACK_DEBIAN_PACKAGE_PRIORITY "optional") 109 | SET(CPACK_DEBIAN_PACKAGE_SECTION "libs") 110 | SET(CPACK_DEBIAN_ARCHITECTURE ${CMAKE_SYSTEM_PROCESSOR}) 111 | 112 | include(CPack) 113 | -------------------------------------------------------------------------------- /examples/ugboost.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from PyFastBDT import FastBDT 3 | from PyFastBDT import utility 4 | 5 | import numpy as np 6 | import numpy 7 | import numpy.linalg 8 | import sklearn.metrics 9 | 10 | import matplotlib.pyplot as plt 11 | import matplotlib as mpl 12 | 13 | 14 | def calculate_cdf_and_pdf(X): 15 | """ 16 | Calculates cdf and pdf of given sample and adds under/overflow bins 17 | @param X 1-d numpy.array 18 | """ 19 | pdf, bins = numpy.histogram(X, bins=100, density=True) 20 | cdf = numpy.cumsum(pdf * (bins - numpy.roll(bins, 1))[1:]) 21 | return numpy.hstack([0.0, cdf, 1.0]), numpy.hstack([0.0, pdf, 0.0]), bins 22 | 23 | 24 | class Prior(object): 25 | def __init__(self, signal, bckgrd): 26 | self.signal_cdf, self.signal_pdf, self.signal_bins = calculate_cdf_and_pdf(signal) 27 | self.bckgrd_cdf, self.bckgrd_pdf, self.bckgrd_bins = calculate_cdf_and_pdf(bckgrd) 28 | # Avoid numerical instabilities 29 | self.bckgrd_pdf[0] = self.bckgrd_pdf[-1] = 1 30 | self.signal_yield = len(signal) 31 | self.bckgrd_yield = len(bckgrd) 32 | 33 | def get_signal_pdf(self, X): 34 | return self.signal_pdf[numpy.digitize(X, bins=self.signal_bins)] 35 | 36 | def get_bckgrd_pdf(self, X): 37 | return self.bckgrd_pdf[numpy.digitize(X, bins=self.bckgrd_bins)] 38 | 39 | def get_signal_cdf(self, X): 40 | return self.signal_cdf[numpy.digitize(X, bins=self.signal_bins)] 41 | 42 | def get_bckgrd_cdf(self, X): 43 | return self.bckgrd_cdf[numpy.digitize(X, bins=self.bckgrd_bins)] 44 | 45 | def get_prior(self, X): 46 | return self.get_signal_pdf(X) / (self.get_signal_pdf(X) + self.get_bckgrd_pdf(X)) 47 | 48 | 49 | def combine_probabilities(p1, p2): 50 | return p1*p2 / (p1*p2 + (1-p1)*(1-p2)) 51 | 52 | 53 | def evaluation(label, X_test, y_test, p, p_prior): 54 | print(label, utility.auc_roc(p, y_test), utility.flatness(p, X_test[:, 0], y_test, classes=[0]), utility.flatness(p, X_test[:, 0], y_test, classes=[1])) 55 | 
55 |     print(label, sklearn.metrics.roc_auc_score(y_test, p))
56 |     print(label + " with prior", sklearn.metrics.roc_auc_score(y_test, combine_probabilities(p, p_prior)))
57 |     plt.scatter(X_test[y_test == 1, 0], p[y_test == 1], c='r', label=label + " (Signal)", alpha=0.2)
58 |     plt.scatter(X_test[y_test == 0, 0], p[y_test == 0], c='b', label=label + " (Background)", alpha=0.2)
59 |     plt.xlabel("Feature")
60 |     plt.ylabel("Probability")
61 |     plt.show()
62 | 
63 | 
64 | if __name__ == '__main__':
65 |     # Create some Monte Carlo data using a multidimensional gaussian distribution
66 |     # The 0th row of the covariance matrix describes the correlation to the target variable
67 |     mean = [0.5, 0.4, 0.4]
68 |     cov = [[1.0, 0.6, 0.6],
69 |            [0.0, 1.0, 0.0],
70 |            [0.0, 0.0, 1.0]]
71 | 
72 |     mean2 = [-0.5, -0.4, -0.4]
73 |     cov2 = [[1.0, 0.6, 0.6],
74 |             [0.0, 1.0, 0.0],
75 |             [0.0, 0.0, 1.0]]
76 | 
77 |     for i in range(len(mean)):
78 |         for j in range(i+1, len(mean)):
79 |             cov[j][i] = cov[i][j]
80 |             cov2[j][i] = cov2[i][j]
81 | 
82 |     N_train, N_test = 100000, 2000
83 |     data = np.random.multivariate_normal(mean2, cov2, (N_train + N_test)//2)
84 |     data2 = np.random.multivariate_normal(mean, cov, (N_train + N_test)//2)
85 |     X_train, y_train = np.r_[data[:N_train//2], data2[:N_train//2]], np.r_[np.ones(N_train//2) == 1, np.ones(N_train//2) == 0]
86 |     X_test, y_test = np.r_[data[N_train//2:], data2[N_train//2:]], np.r_[np.ones(N_test//2) == 1, np.ones(N_test//2) == 0]
87 | 
88 |     # The first variable is the one we want to be independent of our classifier output
89 |     prior = Prior(X_train[y_train == 1, 0], X_train[y_train == 0, 0])
90 |     p_prior = prior.get_prior(X_test[:, 0])
91 |     evaluation("Prior", X_test, y_test, p_prior, p_prior)
92 | 
93 |     evaluation("Random", X_test, y_test, np.random.uniform(size=N_test), p_prior)
94 | 
95 |     for i in [0.0, 1.0, 2.0, 5.0, 10.0, 20.0, 50.0, 100.0, 200.0, 500.0, 1000.0]:
96 |         p = FastBDT.Classifier(flatnessLoss=i, numberOfFlatnessFeatures=1).fit(X=np.c_[X_train[:, 1:], X_train[:, 0]], y=y_train).predict(X_test[:, 1:])
97 |         print("Flatness", i)
98 |         evaluation("UBoost", X_test, y_test, p, p_prior)
99 | 
100 |     p = FastBDT.Classifier().fit(X=X_train, y=y_train).predict(X_test)
101 |     evaluation("Full", X_test, y_test, p, p_prior)
102 | 
103 |     p = FastBDT.Classifier().fit(X=X_train[:, 1:], y=y_train).predict(X_test[:, 1:])
104 |     evaluation("Restricted", X_test, y_test, p, p_prior)
105 | 
106 | 
-------------------------------------------------------------------------------- /src/FastBDT_IO.cxx: --------------------------------------------------------------------------------
1 | /**
2 |  * Thomas Keck 2014
3 |  */
4 | 
5 | #include "FastBDT_IO.h"
6 | 
7 | #include <sstream>
8 | #include <limits>
9 | 
10 | namespace FastBDT {
11 | 
12 |   float convert_to_float_safely(std::string &input) {
13 |     float result = 0;
14 |     try {
15 |       // stof handles infinity and nan correctly but fails
16 |       // for denormalized values
17 |       result = std::stof(input);
18 |     } catch(...) {
19 |       // stringstream fails for nan and infinity but
20 |       // handles denormalized values correctly.
21 |       std::stringstream stream;
22 |       stream << input;
23 |       stream >> result;
24 |     }
25 |     return result;
26 |   }
27 | 
28 |   double convert_to_double_safely(std::string &input) {
29 |     double result = 0;
30 |     try {
31 |       // stod handles infinity and nan correctly but fails
32 |       // for denormalized values
33 |       result = std::stod(input);
34 |     } catch(...) {
35 |       // stringstream fails for nan and infinity but
36 |       // handles denormalized values correctly.
37 |       std::stringstream stream;
38 |       stream << input;
39 |       stream >> result;
40 |     }
41 |     return result;
42 |   }
43 | 
44 |   template<>
45 |   std::ostream& operator<<(std::ostream& stream, const std::vector<float> &vector) {
46 |     stream << vector.size();
47 |     stream.precision(std::numeric_limits<float>::max_digits10);
48 |     stream << std::scientific;
49 |     for(const auto &value : vector) {
50 |       stream << " " << value;
51 |     }
52 |     stream.precision(6);
53 |     stream << std::endl;
54 |     return stream;
55 |   }
56 | 
57 |   template<>
58 |   std::ostream& operator<<(std::ostream& stream, const std::vector<double> &vector) {
59 |     stream << vector.size();
60 |     stream.precision(std::numeric_limits<double>::max_digits10);
61 |     stream << std::scientific;
62 |     for(const auto &value : vector) {
63 |       stream << " " << value;
64 |     }
65 |     stream.precision(6);
66 |     stream << std::endl;
67 |     return stream;
68 |   }
69 | 
70 |   template<>
71 |   std::istream& operator>>(std::istream& stream, std::vector<float> &vector) {
72 |     unsigned int size;
73 |     stream >> size;
74 |     vector.resize(size);
75 |     for(unsigned int i = 0; i < size; ++i) {
76 |       std::string temp;
77 |       stream >> temp;
78 |       vector[i] = convert_to_float_safely(temp);
79 |     }
80 |     return stream;
81 |   }
82 | 
83 |   template<>
84 |   std::istream& operator>>(std::istream& stream, std::vector<double> &vector) {
85 |     unsigned int size;
86 |     stream >> size;
87 |     vector.resize(size);
88 |     for(unsigned int i = 0; i < size; ++i) {
89 |       std::string temp;
90 |       stream >> temp;
91 |       vector[i] = convert_to_double_safely(temp);
92 |     }
93 |     return stream;
94 |   }
95 | 
96 |   /**
97 |    * This function reads a Cut from an std::istream
98 |    * @param stream an std::istream reference
99 |    * @param cut containing read data
100 |    */
101 |   template<>
102 |   std::istream& operator>>(std::istream& stream, Cut<float> &cut) {
103 |     stream >> cut.feature;
104 | 
105 |     // Unfortunately we have to use our own conversion here to correctly parse NaN and Infinity
106 |     // because usually istream::operator>> doesn't do this!
107 |     std::string index_string;
108 |     stream >> index_string;
109 |     cut.index = convert_to_float_safely(index_string);
110 |     stream >> cut.valid;
111 |     stream >> cut.gain;
112 |     return stream;
113 |   }
114 | 
115 |   /**
116 |    * This function reads a Cut from an std::istream
117 |    * @param stream an std::istream reference
118 |    * @param cut containing read data
119 |    */
120 |   template<>
121 |   std::istream& operator>>(std::istream& stream, Cut<double> &cut) {
122 |     stream >> cut.feature;
123 | 
124 |     // Unfortunately we have to use our own conversion here to correctly parse NaN and Infinity
125 |     // because usually istream::operator>> doesn't do this!
126 |     std::string index_string;
127 |     stream >> index_string;
128 |     cut.index = convert_to_double_safely(index_string);
129 |     stream >> cut.valid;
130 |     stream >> cut.gain;
131 |     return stream;
132 |   }
133 | 
134 |   std::ostream& operator<<(std::ostream& stream, const PurityTransformation &purityTransformation) {
135 |     stream << purityTransformation.GetMapping() << std::endl;
136 |     return stream;
137 |   }
138 | 
139 |   std::istream& operator>>(std::istream& stream, PurityTransformation &purityTransformation) {
140 | 
141 |     std::vector<unsigned int> mapping;
142 |     stream >> mapping;
143 |     purityTransformation.SetMapping(mapping);
144 |     return stream;
145 |   }
146 | 
147 | }
148 | 
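The two converters above exist because the weightfile stores floating-point values as text: std::stof/std::stod parse nan and infinity but throw on denormalized values, while a stringstream accepts denormals but not nan/infinity. Together with the max_digits10 precision used by the writers, the intent is an exact text round-trip. A Python analogue of that contract, purely for illustration (not part of the library):

    # repr() plays the role of writing with max_digits10 precision,
    # float() the role of the safe converters above.
    for value in [0.1, 5e-324, float('inf'), float('-inf'), float('nan')]:
        text = repr(value)
        back = float(text)
        assert back == value or (back != back and value != value)  # nan != nan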
-------------------------------------------------------------------------------- /include/Classifier.h: --------------------------------------------------------------------------------
1 | /*
2 |  * Thomas Keck 2017
3 |  *
4 |  * Simplified sklearn interface
5 |  */
6 | 
7 | #pragma once
8 | 
9 | #include "FastBDT.h"
10 | #include "FastBDT_IO.h"
11 | 
12 | #include <map>
13 | 
14 | namespace FastBDT {
15 |   class Classifier {
16 | 
17 |     public:
18 |       /*
19 |        * Explicitly activate default/copy constructor and assign operator.
20 |        * This was a request of a user.
21 |        */
22 |       Classifier() = default;
23 |       Classifier(const Classifier&) = default;
24 |       Classifier& operator=(const Classifier &) = default;
25 | 
26 |       Classifier(std::istream& stream) {
27 | 
28 |         stream >> m_version;
29 |         stream >> m_nTrees;
30 |         stream >> m_depth;
31 |         stream >> m_binning;
32 |         stream >> m_shrinkage;
33 |         stream >> m_subsample;
34 |         stream >> m_sPlot;
35 |         stream >> m_flatnessLoss;
36 |         stream >> m_purityTransformation;
37 |         stream >> m_transform2probability;
38 |         stream >> m_featureBinning;
39 |         stream >> m_purityBinning;
40 |         stream >> m_numberOfFeatures;
41 |         stream >> m_numberOfFinalFeatures;
42 |         stream >> m_numberOfFlatnessFeatures;
43 |         stream >> m_can_use_fast_forest;
44 |         m_fast_forest = readForestFromStream<float>(stream);
45 |         m_binned_forest = readForestFromStream<unsigned int>(stream);
46 | 
47 |       }
48 | 
49 |       friend std::ostream& operator<<(std::ostream& stream, const Classifier& classifier);
50 | 
51 |       Classifier(unsigned int nTrees, unsigned int depth, std::vector<unsigned int> binning, double shrinkage = 0.1, double subsample = 1.0, bool sPlot = false, double flatnessLoss = -1.0, std::vector<bool> purityTransformation = {}, unsigned int numberOfFlatnessFeatures=0, bool transform2probability=true) :
52 |         m_nTrees(nTrees), m_depth(depth), m_binning(binning), m_shrinkage(shrinkage), m_subsample(subsample), m_sPlot(sPlot), m_flatnessLoss(flatnessLoss), m_purityTransformation(purityTransformation), m_numberOfFlatnessFeatures(numberOfFlatnessFeatures), m_transform2probability(transform2probability), m_can_use_fast_forest(true) { }
53 | 
54 |       void Print();
55 | 
56 |       unsigned int GetNTrees() const { return m_nTrees; }
57 |       void SetNTrees(unsigned int nTrees) { m_nTrees = nTrees; }
58 | 
59 |       unsigned int GetDepth() const { return m_depth; }
60 |       void SetDepth(unsigned int depth) { m_depth = depth; }
61 | 
62 |       unsigned int GetNumberOfFlatnessFeatures() const { return m_numberOfFlatnessFeatures; }
63 |       void SetNumberOfFlatnessFeatures(unsigned int numberOfFlatnessFeatures) { m_numberOfFlatnessFeatures = numberOfFlatnessFeatures; }
64 | 
65 |       unsigned int GetNFeatures() const { return m_numberOfFeatures; }
66 | 
67 |       double GetShrinkage() const { return m_shrinkage; }
68 |       void SetShrinkage(double shrinkage) { m_shrinkage = shrinkage; }
69 | 
70 |       double GetSubsample() const { return m_subsample; }
71 |       void SetSubsample(double subsample) { m_subsample = subsample; }
72 | 
73 |       bool GetSPlot() const { return m_sPlot; }
74 |       void SetSPlot(bool sPlot) { m_sPlot = sPlot; }
75 | 
76 |       bool GetTransform2Probability() const { return m_transform2probability; }
77 |       void SetTransform2Probability(bool transform2probability) { m_transform2probability = transform2probability; }
78 | 
79 |       std::vector<unsigned int> GetBinning() const { return m_binning; }
80 |       void SetBinning(std::vector<unsigned int> binning) { m_binning = binning; }
81 | 
82 |       std::vector<bool> GetPurityTransformation() const { return m_purityTransformation; }
83 |       void SetPurityTransformation(std::vector<bool> purityTransformation) { m_purityTransformation = purityTransformation; }
84 | 
85 |       double GetFlatnessLoss() const { return m_flatnessLoss; }
86 |       void SetFlatnessLoss(double flatnessLoss) { m_flatnessLoss = flatnessLoss; }
87 | 
88 |       void fit(const std::vector<std::vector<float>> &X, const std::vector<bool> &y, const std::vector<float> &w);
89 | 
90 |       float predict(const std::vector<float> &X) const;
91 | 
92 |       std::map<unsigned int, double> GetVariableRanking() const;
93 | 
94 |       std::map<unsigned int, double> GetIndividualVariableRanking(const std::vector<float> &X) const;
95 | 
96 |       std::map<unsigned int, unsigned int> GetFeatureMapping() const;
97 | 
98 |       std::map<unsigned int, double> MapRankingToOriginalFeatures(std::map<unsigned int, double> ranking) const;
99 | 
100 |     private:
101 |       unsigned int m_version = 1;
102 |       unsigned int m_nTrees = 100;
103 |       unsigned int m_depth = 3;
104 |       std::vector<unsigned int> m_binning;
105 |       double m_shrinkage = 0.1;
106 |       double m_subsample = 0.5;
107 |       bool m_sPlot = true;
108 |       double m_flatnessLoss = -1;
109 |       std::vector<bool> m_purityTransformation;
110 |       unsigned int m_numberOfFlatnessFeatures = 0;
111 |       bool m_transform2probability = true;
112 |       unsigned int m_numberOfFeatures = 0;
113 |       unsigned int m_numberOfFinalFeatures = 0;
114 |       std::vector<FeatureBinning<float>> m_featureBinning;
115 |       std::vector<PurityTransformation> m_purityBinning;
116 | 
117 |       bool m_can_use_fast_forest = true;
118 |       Forest<float> m_fast_forest;
119 |       Forest<unsigned int> m_binned_forest;
120 | 
121 |   };
122 | 
123 |   std::ostream& operator<<(std::ostream& stream, const Classifier& classifier);
124 | 
125 | }
126 | 
-------------------------------------------------------------------------------- /files/iris.txt: --------------------------------------------------------------------------------
1 | SepalLength SepalWidth PetalLength PetalWidth Class
2 | 5.1 3.5 1.4 0.1 0
3 | 4.9 3.0 1.4 0.2 0
4 | 4.7 3.2 1.3 0.2 0
5 | 4.6 3.1 1.5 0.2 0
6 | 5.0 3.6 1.4 0.2 0
7 | 5.4 3.9 1.7 0.4 0
8 | 4.6 3.4 1.4 0.3 0
9 | 5.0 3.4 1.5 0.2 0
10 | 4.4 2.9 1.4 0.2 0
11 | 4.9 3.1 1.5 0.1 0
12 | 5.4 3.7 1.5 0.2 0
13 | 4.8 3.4 1.6 0.2 0
14 | 4.8 3.0 1.4 0.1 0
15 | 4.3 3.0 1.1 0.1 0
16 | 5.8 4.0 1.2 0.2 0
17 | 5.7 4.4 1.5 0.4 0
18 | 5.4 3.9 1.3 0.4 0
19 | 5.1 3.5 1.4 0.3 0
20 | 5.7 3.8 1.7 0.3 0
21 | 5.1 3.8 1.5 0.3 0
22 | 5.4 3.4 1.7 0.2 0
23 | 5.1 3.7 1.5 0.4 0
24 | 4.6 3.6 1.0 0.2 0
25 | 5.1 3.3 1.7 0.5 0
26 | 4.8 3.4 1.9 0.2 0
27 | 5.0 3.0 1.6 0.2 0
28 | 5.0 3.4 1.6 0.4 0
29 | 5.2 3.5 1.5 0.2 0
30 | 5.2 3.4 1.4 0.2 0
31 | 4.7 3.2 1.6 0.2 0
32 | 4.8 3.1 1.6 0.2 0
33 | 5.4 3.4 1.5 0.4 0
34 | 5.2 4.1 1.5 0.1 0
35 | 5.5 4.2 1.4 0.2 0
36 | 4.9 3.1 1.5 0.2 0
37 | 5.0 3.2 1.2 0.2 0
38 | 5.5 3.5 1.3 0.2 0
39 | 4.9 3.6 1.4 0.1 0
40 | 4.4 3.0 1.3 0.2 0
41 | 5.1 3.4 1.5 0.2 0
42 | 5.0 3.5 1.3 0.3 0
43 | 4.5 2.3 1.3 0.3 0
44 | 4.4 3.2 1.3 0.2 0
45 | 5.0 3.5 1.6 0.6 0
46 | 5.1 3.8 1.9 0.4 0
47 | 4.8 3.0 1.4 0.3 0
48 | 5.1 3.8 1.6 0.2 0
49 | 4.6 3.2 1.4 0.2 0
50 | 5.3 3.7 1.5 0.2 0
51 | 5.0 3.3 1.4 0.2 0
52 | 7.0 3.2 4.7 1.4 2
53 | 6.4 3.2 4.5 1.5 2
54 | 6.9 3.1 4.9 1.5 2
55 | 5.5 2.3 4.0 1.3 2
56 | 6.5 2.8 4.6 1.5 2
57 | 5.7 2.8 4.5 1.3 2
58 | 6.3 3.3 4.7 1.6 2
59 | 4.9 2.4 3.3 1.0 2
60 | 6.6 2.9 4.6 1.3 2
61 | 5.2 2.7 3.9 1.4 2
62 | 5.0 2.0 3.5 1.0 2
63 | 5.9 3.0 4.2 1.5 2
64 | 6.0 2.2 4.0 1.0 2
65 | 6.1 2.9 4.7 1.4 2
66 | 5.6 2.9 3.6 1.3 2
67 | 6.7 3.1 4.4 1.4 2
68 | 5.6 3.0 4.5 1.5 2
69 | 5.8 2.7 4.1 1.0 2
70 | 6.2 2.2 4.5 1.5 2
71 | 5.6 2.5 3.9 1.1 2
72 | 5.9 3.2 4.8 1.8 2
73 | 6.1 2.8 4.0 1.3 2
74 | 6.3 2.5 4.9 1.5 2
75 | 6.1 2.8 4.7 1.2 2
76 | 6.4 2.9 4.3 1.3 2
77 | 6.6 3.0 4.4 1.4 2
78 | 6.8 2.8 4.8 1.4 2
79 | 6.7 3.0 5.0 1.7 2
80 | 6.0 2.9 4.5 1.5 2
81 | 5.7 2.6 3.5 1.0 2
82 | 5.5 2.4 3.8 1.1 2
83 | 5.5 2.4 3.7 1.0 2
84 | 5.8 2.7 3.9 1.2 2
85 | 6.0 2.7 5.1 1.6 2
86 | 5.4 3.0 4.5 1.5 2
87 | 6.0 3.4 4.5 1.6 2
88 | 6.7 3.1 4.7 1.5 2
89 | 6.3 2.3 4.4 1.3 2
90 | 5.6 3.0 4.1 1.3 2
91 | 5.5 2.5 4.0 1.3 2
92 | 5.5 2.6 4.4 1.2 2
93 | 6.1 3.0 4.6 1.4 2
94 | 5.8 2.6 4.0 1.2 2
95 | 5.0 2.3 3.3 1.0 2
96 | 5.6 2.7 4.2 1.3 2
97 | 5.7 3.0 4.2 1.2 2
98 | 5.7 2.9 4.2 1.3 2
99 | 6.2 2.9 4.3 1.3 2
100 | 5.1 2.5 3.0 1.1 2
101 | 5.7 2.8 4.1 1.3 2
102 | 6.3 3.3 6.0 2.5 1
103 | 5.8 2.7 5.1 1.9 1
104 | 7.1 3.0 5.9 2.1 1
105 | 6.3 2.9 5.6 1.8 1
106 | 6.5 3.0 5.8 2.2 1
107 | 7.6 3.0 6.6 2.1 1
108 | 4.9 2.5 4.5 1.7 1
109 | 7.3 2.9 6.3 1.8 1
110 | 6.7 2.5 5.8 1.8 1
111 | 7.2 3.6 6.1 2.5 1
112 | 6.5 3.2 5.1 2.0 1
113 | 6.4 2.7 5.3 1.9 1
114 | 6.8 3.0 5.5 2.1 1
115 | 5.7 2.5 5.0 2.0 1
116 | 5.8 2.8 5.1 2.4 1
117 | 6.4 3.2 5.3 2.3 1
118 | 6.5 3.0 5.5 1.8 1
119 | 7.7 3.8 6.7 2.2 1
120 | 7.7 2.6 6.9 2.3 1
121 | 6.0 2.2 5.0 1.5 1
122 | 6.9 3.2 5.7 2.3 1
123 | 5.6 2.8 4.9 2.0 1
124 | 7.7 2.8 6.7 2.0 1
125 | 6.3 2.7 4.9 1.8 1
126 | 6.7 3.3 5.7 2.1 1
127 | 7.2 3.2 6.0 1.8 1
128 | 6.2 2.8 4.8 1.8 1
129 | 6.1 3.0 4.9 1.8 1
130 | 6.4 2.8 5.6 2.1 1
131 | 7.2 3.0 5.8 1.6 1
132 | 7.4 2.8 6.1 1.9 1
133 | 7.9 3.8 6.4 2.0 1
134 | 6.4 2.8 5.6 2.2 1
135 | 6.3 2.8 5.1 1.5 1
136 | 6.1 2.6 5.6 1.4 1
137 | 7.7 3.0 6.1 2.3 1
138 | 6.3 3.4 5.6 2.4 1
139 | 6.4 3.1 5.5 1.8 1
140 | 6.0 3.0 4.8 1.8 1
141 | 6.9 3.1 5.4 2.1 1
142 | 6.7 3.1 5.6 2.4 1
143 | 6.9 3.1 5.1 2.3 1
144 | 5.8 2.7 5.1 1.9 1
145 | 6.8 3.2 5.9 2.3 1
146 | 6.7 3.3 5.7 2.5 1
147 | 6.7 3.0 5.2 2.3 1
148 | 6.3 2.5 5.0 1.9 1
149 | 6.5 3.0 5.2 2.0 1
150 | 6.2 3.4 5.4 2.3 1
151 | 5.9 3.0 5.1 1.8 1
152 | 
-------------------------------------------------------------------------------- /examples/CPPExample.cxx: --------------------------------------------------------------------------------
1 | /**
2 |  * Thomas Keck 2017
3 |  */
4 | 
5 | #include "Classifier.h"
6 | #include <iostream>
7 | #include <fstream>
8 | #include <vector>
9 | 
10 | std::vector<std::vector<float>> GetIrisX() {
11 |     std::vector<std::vector<float>> X = {{5.1,4.9,4.7,4.6,5.0,5.4,4.6,5.0,4.4,4.9,5.4,4.8,4.8,4.3,5.8,5.7,5.4,5.1,5.7,5.1,5.4,5.1,4.6,5.1,4.8,5.0,5.0,5.2,5.2,4.7,4.8,5.4,5.2,5.5,4.9,5.0,5.5,4.9,4.4,5.1,5.0,4.5,4.4,5.0,5.1,4.8,5.1,4.6,5.3,5.0,7.0,6.4,6.9,5.5,6.5,5.7,6.3,4.9,6.6,5.2,5.0,5.9,6.0,6.1,5.6,6.7,5.6,5.8,6.2,5.6,5.9,6.1,6.3,6.1,6.4,6.6,6.8,6.7,6.0,5.7,5.5,5.5,5.8,6.0,5.4,6.0,6.7,6.3,5.6,5.5,5.5,6.1,5.8,5.0,5.6,5.7,5.7,6.2,5.1,5.7,6.3,5.8,7.1,6.3,6.5,7.6,4.9,7.3,6.7,7.2,6.5,6.4,6.8,5.7,5.8,6.4,6.5,7.7,7.7,6.0,6.9,5.6,7.7,6.3,6.7,7.2,6.2,6.1,6.4,7.2,7.4,7.9,6.4,6.3,6.1,7.7,6.3,6.4,6.0,6.9,6.7,6.9,5.8,6.8,6.7,6.7,6.3,6.5,6.2,5.9}, {3.5,3.0,3.2,3.1,3.6,3.9,3.4,3.4,2.9,3.1,3.7,3.4,3.0,3.0,4.0,4.4,3.9,3.5,3.8,3.8,3.4,3.7,3.6,3.3,3.4,3.0,3.4,3.5,3.4,3.2,3.1,3.4,4.1,4.2,3.1,3.2,3.5,3.1,3.0,3.4,3.5,2.3,3.2,3.5,3.8,3.0,3.8,3.2,3.7,3.3,3.2,3.2,3.1,2.3,2.8,2.8,3.3,2.4,2.9,2.7,2.0,3.0,2.2,2.9,2.9,3.1,3.0,2.7,2.2,2.5,3.2,2.8,2.5,2.8,2.9,3.0,2.8,3.0,2.9,2.6,2.4,2.4,2.7,2.7,3.0,3.4,3.1,2.3,3.0,2.5,2.6,3.0,2.6,2.3,2.7,3.0,2.9,2.9,2.5,2.8,3.3,2.7,3.0,2.9,3.0,3.0,2.5,2.9,2.5,3.6,3.2,2.7,3.0,2.5,2.8,3.2,3.0,3.8,2.6,2.2,3.2,2.8,2.8,2.7,3.3,3.2,2.8,3.0,2.8,3.0,2.8,3.8,2.8,2.8,2.6,3.0,3.4,3.1,3.0,3.1,3.1,3.1,2.7,3.2,3.3,3.0,2.5,3.0,3.4,3.0}, {1.4,1.4,1.3,1.5,1.4,1.7,1.4,1.5,1.4,1.5,1.5,1.6,1.4,1.1,1.2,1.5,1.3,1.4,1.7,1.5,1.7,1.5,1.0,1.7,1.9,1.6,1.6,1.5,1.4,1.6,1.6,1.5,1.5,1.4,1.5,1.2,1.3,1.5,1.3,1.5,1.3,1.3,1.3,1.6,1.9,1.4,1.6,1.4,1.5,1.4,4.7,4.5,4.9,4.0,4.6,4.5,4.7,3.3,4.6,3.9,3.5,4.2,4.0,4.7,3.6,4.4,4.5,4.1,4.5,3.9,4.8,4.0,4.9,4.7,4.3,4.4,4.8,5.0,4.5,3.5,3.8,3.7,3.9,5.1,4.5,4.5,4.7,4.4,4.1,4.0,4.4,4.6,4.0,3.3,4.2,4.2,4.2,4.3,3.0,4.1,6.0,5.1,5.9,5.6,5.8,6.6,4.5,6.3,5.8,6.1,5.1,5.3,5.5,5.0,5.1,5.3,5.5,6.7,6.9,5.0,5.7,4.9,6.7,4.9,5.7,6.0,4.8,4.9,5.6,5.8,6.1,6.4,5.6,5.1,5.6,6.1,5.6,5.5,4.8,5.4,5.6,5.1,5.1,5.9,5.7,5.2,5.0,5.2,5.4,5.1}, {0.2,0.2,0.2,0.2,0.2,0.4,0.3,0.2,0.2,0.1,0.2,0.2,0.1,0.1,0.2,0.4,0.4,0.3,0.3,0.3,0.2,0.4,0.2,0.5,0.2,0.2,0.4,0.2,0.2,0.2,0.2,0.4,0.1,0.2,0.1,0.2,0.2,0.1,0.2,0.2,0.3,0.3,0.2,0.6,0.4,0.3,0.2,0.2,0.2,0.2,1.4,1.5,1.5,1.3,1.5,1.3,1.6,1.0,1.3,1.4,1.0,1.5,1.0,1.4,1.3,1.4,1.5,1.0,1.5,1.1,1.8,1.3,1.5,1.2,1.3,1.4,1.4,1.7,1.5,1.0,1.1,1.0,1.2,1.6,1.5,1.6,1.5,1.3,1.3,1.3,1.2,1.4,1.2,1.0,1.3,1.2,1.3,1.3,1.1,1.3,2.5,1.9,2.1,1.8,2.2,2.1,1.7,1.8,1.8,2.5,2.0,1.9,2.1,2.0,2.4,2.3,1.8,2.2,2.3,1.5,2.3,2.0,2.0,1.8,2.1,1.8,1.8,1.8,2.1,1.6,1.9,2.0,2.2,1.5,1.4,2.3,2.4,1.8,1.8,2.1,2.4,2.3,1.9,2.3,2.5,2.3,1.9,2.0,2.3,1.8} };
12 |     return X;
13 | }
14 | 
15 | std::vector<bool> GetIrisY() {
16 |     std::vector<bool> y = {false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true};
17 |     return y;
18 | }
19 | 
20 | std::vector<float> GetIrisW() {
21 |     std::vector<float> w = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0};
22 |     return w;
23 | }
24 | 
25 | 
26 | float GetIrisScore(const FastBDT::Classifier &classifier) {
27 |     auto X = GetIrisX();
28 |     auto y = GetIrisY();
29 |     float sum = 0;
30 |     for(unsigned int i = 0; i < y.size(); ++i) {
31 |         float p = classifier.predict({X[0][i], X[1][i], X[2][i], X[3][i]});
32 |         sum += (static_cast<float>(y[i])-p)*(static_cast<float>(y[i])-p);
33 |     }
34 |     return sum;
35 | }
36 | 
37 | int main() {
38 | 
39 |     FastBDT::Classifier classifier;
40 |     // Most of the parameters have default values and
41 |     // you don't have to set them.
42 |     classifier.SetBinning({5, 5, 5, 5}); // 2^5 bins for each feature, default is 2^8 bins per feature
43 |     classifier.SetNTrees(10); // default is 100
44 |     classifier.SetDepth(3); // default is 3
45 |     classifier.SetShrinkage(0.1); // default is 0.1
46 |     classifier.SetSubsample(0.5); // default is 0.5
47 |     classifier.SetSPlot(false); // default is false
48 |     classifier.SetPurityTransformation({false, false, false, false}); // Do not use the purity transformation for the features, default is false as well
49 |     classifier.SetNumberOfFlatnessFeatures(0); // We do not use uniform boosting here (default is 0 as well)
50 |     classifier.SetFlatnessLoss(-1); // We do not use uniform boosting here (default is -1 as well)
51 |     classifier.SetTransform2Probability(true); // Transform output to probability (default is true)
52 | 
53 |     classifier.fit(GetIrisX(), GetIrisY(), GetIrisW());
54 | 
55 |     std::cout << "Score " << GetIrisScore(classifier) << std::endl;
56 | 
57 |     std::fstream out_stream("unittest.weightfile", std::ios_base::out | std::ios_base::trunc);
58 |     out_stream << classifier << std::endl;
59 |     out_stream.close();
60 | 
61 |     classifier.Print();
62 | 
63 |     std::fstream in_stream("unittest.weightfile", std::ios_base::in);
64 |     FastBDT::Classifier classifier2(in_stream);
65 | 
66 |     std::cout << "Score " << GetIrisScore(classifier2) << std::endl;
67 | 
68 |     classifier2.Print();
69 | 
70 |     return 0;
71 | }
72 | 
-------------------------------------------------------------------------------- /src/test_FastBDT_C_API.cxx: --------------------------------------------------------------------------------
1 | /**
2 |  * Thomas Keck 2015
3 |  */
4 | 
5 | #include "FastBDT_C_API.h"
6 | 
7 | #include <gtest/gtest.h>
8 | 
9 | class CInterfaceTest : public ::testing::Test {
10 |     protected:
11 |         virtual void SetUp() {
12 |             expertise = static_cast<Expertise*>(Create());
13 |         }
14 | 
15 |         virtual void TearDown() {
16 |             Delete(expertise);
17 |         }
18 | 
19 |         Expertise *expertise;
20 | 
21 | };
22 | 
23 | TEST_F(CInterfaceTest, SetGetBinning ) {
24 | 
25 |     unsigned int binning[] = {10u, 20u};
26 |     SetBinning(expertise, binning, 2);
27 |     EXPECT_EQ(expertise->classifier.GetBinning().size(), 2u);
28 |     EXPECT_EQ(expertise->classifier.GetBinning()[0], 10u);
29 |     EXPECT_EQ(expertise->classifier.GetBinning()[1], 20u);
30 | 
31 | }
32 | 
33 | TEST_F(CInterfaceTest, SetGetPurityTransformation ) {
34 | 
35 |     bool purityTransformation[] = {true, false};
36 |     SetPurityTransformation(expertise, purityTransformation, 2);
37 |     EXPECT_EQ(expertise->classifier.GetPurityTransformation().size(), 2u);
38 |     EXPECT_EQ(expertise->classifier.GetPurityTransformation()[0], true);
39 |     EXPECT_EQ(expertise->classifier.GetPurityTransformation()[1], false);
40 | 
41 | }
42 | 
43 | TEST_F(CInterfaceTest, SetGetNTrees ) {
44 | 
45 |     SetNTrees(expertise, 200u);
46 |     EXPECT_EQ(expertise->classifier.GetNTrees(), 200u);
47 | 
48 | }
49 | 
50 | TEST_F(CInterfaceTest, SetGetSPlot ) {
51 | 
52 |     SetSPlot(expertise, false);
53 |     EXPECT_EQ(expertise->classifier.GetSPlot(), false);
54 |     SetSPlot(expertise, true);
55 |     EXPECT_EQ(expertise->classifier.GetSPlot(), true);
56 | 
57 | }
58 | 
59 | TEST_F(CInterfaceTest, SetGetTransform2Probability ) {
60 | 
61 |     SetTransform2Probability(expertise, false);
62 |     EXPECT_EQ(expertise->classifier.GetTransform2Probability(), false);
63 |     SetTransform2Probability(expertise, true);
64 |     EXPECT_EQ(expertise->classifier.GetTransform2Probability(), true);
65 | 
66 | }
67 | 
68 | TEST_F(CInterfaceTest, SetGetDepth ) {
69 | 
70 |     SetDepth(expertise, 5u);
71 |     EXPECT_EQ(expertise->classifier.GetDepth(), 5u);
72 |     SetDepth(expertise, 2u);
73 |     EXPECT_EQ(expertise->classifier.GetDepth(), 2u);
74 | 
75 | }
76 | 
77 | TEST_F(CInterfaceTest, SetGetFlatnessLossWorks ) {
78 | 
79 |     SetFlatnessLoss(expertise, 0.2);
80 |     EXPECT_DOUBLE_EQ(expertise->classifier.GetFlatnessLoss(), 0.2);
81 |     SetFlatnessLoss(expertise, 0.4);
82 |     EXPECT_DOUBLE_EQ(expertise->classifier.GetFlatnessLoss(), 0.4);
83 | 
84 | }
85 | 
86 | TEST_F(CInterfaceTest, SetGetShrinkageWorks ) {
87 | 
88 |     SetShrinkage(expertise, 0.2);
89 |     EXPECT_DOUBLE_EQ(expertise->classifier.GetShrinkage(), 0.2);
90 |     SetShrinkage(expertise, 0.4);
91 |     EXPECT_DOUBLE_EQ(expertise->classifier.GetShrinkage(), 0.4);
92 | 
93 | }
94 | 
95 | 
96 | TEST_F(CInterfaceTest, SetSubsampleWorks ) {
97 | 
98 |     SetSubsample(expertise, 0.6);
99 |     EXPECT_DOUBLE_EQ(expertise->classifier.GetSubsample(), 0.6);
100 |     SetSubsample(expertise, 0.8);
101 |     EXPECT_DOUBLE_EQ(expertise->classifier.GetSubsample(), 0.8);
102 | 
103 | }
104 | 
105 | 
106 | TEST_F(CInterfaceTest, FitAndPredictWorksWithoutWeights ) {
107 | 
108 |     // Use just one branch instead of a whole forest for testing
109 |     // We only test if the ForestBuilder is called correctly,
110 |     // the builder itself is tested elsewhere.
111 |     SetNTrees(expertise, 10u);
112 |     SetDepth(expertise, 1u);
113 |     SetSubsample(expertise, 1.0);
114 |     SetShrinkage(expertise, 1.0);
115 |     unsigned int binning[] = {2u, 2u};
116 |     SetBinning(expertise, binning, 2);
117 |     SetTransform2Probability(expertise, true);
118 |     SetNumberOfFlatnessFeatures(expertise, 0);
119 | 
120 |     float data_ptr[] = {1.0, 2.6, 1.6, 2.5, 1.1, 2.0, 1.9, 2.1, 1.6, 2.9, 1.9, 2.9, 1.5, 2.0};
121 |     bool target_ptr[] = {0, 1, 0, 1, 1, 1, 0};
122 |     Fit(expertise, data_ptr, nullptr, target_ptr, 7, 2);
123 | 
124 |     float test_ptr[] = {1.0, 2.6};
125 |     EXPECT_LE(Predict(expertise, test_ptr), 0.01);
126 | 
127 |     float test_ptr2[] = {1.6, 2.5};
128 |     EXPECT_GE(Predict(expertise, test_ptr2), 0.99);
129 | }
130 | 
131 | 
132 | TEST_F(CInterfaceTest, TrainAndAnalyseForestWorksWithSpectators ) {
133 | 
134 |     // Use just one branch instead of a whole forest for testing
135 |     // We only test if the ForestBuilder is called correctly,
136 |     // the builder itself is tested elsewhere.
137 |     SetNTrees(expertise, 10u);
138 |     SetDepth(expertise, 1u);
139 |     SetSubsample(expertise, 1.0);
140 |     SetShrinkage(expertise, 1.0);
141 |     unsigned int binning[] = {2u, 2u, 2u, 3u};
142 |     SetBinning(expertise, binning, 4);
143 |     SetTransform2Probability(expertise, true);
144 |     SetNumberOfFlatnessFeatures(expertise, 2);
145 | 
146 |     float data_ptr[] = {1.0, 2.6, 0.0, -10.0,
147 |                         1.6, 2.5, 99.0, 0.0,
148 |                         1.1, 2.0, -500.0, 12.1,
149 |                         1.9, 2.1, 0.0, 0.0,
150 |                         1.6, 2.9, 23.0, 42.0,
151 |                         1.9, 2.9, 0.0, 1.0,
152 |                         1.5, 2.0, 1.0, -1.0};
153 |     bool target_ptr[] = {0, 1, 0, 1, 1, 1, 0};
154 |     Fit(expertise, data_ptr, nullptr, target_ptr, 7, 4);
155 | 
156 |     float test_ptr[] = {1.0, 2.6};
157 |     EXPECT_LE(Predict(expertise, test_ptr), 0.03);
158 | }
159 | 
160 | TEST_F(CInterfaceTest, TrainAndAnalyseForestWorksWithWeights ) {
161 | 
162 |     // Use just one branch instead of a whole forest for testing
163 |     // We only test if the ForestBuilder is called correctly,
164 |     // the builder itself is tested elsewhere.
165 |     SetNTrees(expertise, 10u);
166 |     SetDepth(expertise, 1u);
167 |     SetSubsample(expertise, 1.0);
168 |     SetShrinkage(expertise, 1.0);
169 |     unsigned int binning[] = {2u, 2u};
170 |     SetBinning(expertise, binning, 2);
171 |     SetTransform2Probability(expertise, true);
172 |     SetNumberOfFlatnessFeatures(expertise, 0);
173 | 
174 |     float data_ptr[] = {1.0, 2.6, 1.6, 2.5, 1.1, 2.0, 1.9, 2.1, 1.6, 2.9, 1.9, 2.9, 1.5, 2.0};
175 |     float weight_ptr[] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
176 |     bool target_ptr[] = {0, 1, 0, 1, 1, 1, 0};
177 |     Fit(expertise, data_ptr, weight_ptr, target_ptr, 7, 2);
178 | 
179 |     float test_ptr[] = {1.0, 2.6};
180 |     EXPECT_LE(Predict(expertise, test_ptr), 0.01);
181 | 
182 |     float weight_ptr2[] = {2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0};
183 |     Fit(expertise, data_ptr, weight_ptr2, target_ptr, 7, 2);
184 |     EXPECT_LE(Predict(expertise, test_ptr), 0.01);
185 | 
186 |     float weight_ptr3[] = {1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0};
187 |     Fit(expertise, data_ptr, weight_ptr3, target_ptr, 7, 2);
188 |     EXPECT_LE(Predict(expertise, test_ptr), 0.03);
189 | }
-------------------------------------------------------------------------------- /src/FastBDT_C_API.cxx: --------------------------------------------------------------------------------
1 | /**
2 |  * Thomas Keck 2015
3 |  */
4 | 
5 | #include "FastBDT_C_API.h"
6 | 
7 | #include <iostream>
8 | #include <fstream>
9 | #include <new>
10 | 
11 | using namespace FastBDT;
12 | 
13 | extern "C" {
14 | 
15 | void PrintVersion() {
16 |     std::cerr << "FastBDT Version: " << FastBDT_VERSION_MAJOR << "." << FastBDT_VERSION_MINOR << std::endl;
17 | }
18 | 
19 | void* Create() {
20 |     Expertise *expertise = new(std::nothrow) Expertise;
21 |     return expertise;
22 | }
23 | 
24 | void SetBinning(void *ptr, unsigned int* binning, unsigned int size) {
25 |     reinterpret_cast<Expertise*>(ptr)->classifier.SetBinning(std::vector<unsigned int>(binning, binning + size));
26 | }
27 | 
28 | void SetPurityTransformation(void *ptr, bool* purityTransformation, unsigned int size) {
29 |     reinterpret_cast<Expertise*>(ptr)->classifier.SetPurityTransformation(std::vector<bool>(purityTransformation, purityTransformation + size));
30 | }
31 | 
32 | void SetNTrees(void *ptr, unsigned int nTrees) {
33 |     reinterpret_cast<Expertise*>(ptr)->classifier.SetNTrees(nTrees);
34 | }
35 | 
36 | unsigned int GetNTrees(void *ptr) {
37 |     return reinterpret_cast<Expertise*>(ptr)->classifier.GetNTrees();
38 | }
39 | 
40 | void SetDepth(void *ptr, unsigned int depth) {
41 |     reinterpret_cast<Expertise*>(ptr)->classifier.SetDepth(depth);
42 | }
43 | 
44 | unsigned int GetDepth(void *ptr) {
45 |     return reinterpret_cast<Expertise*>(ptr)->classifier.GetDepth();
46 | }
47 | 
48 | void SetNumberOfFlatnessFeatures(void *ptr, unsigned int numberOfFlatnessFeatures) {
49 |     reinterpret_cast<Expertise*>(ptr)->classifier.SetNumberOfFlatnessFeatures(numberOfFlatnessFeatures);
50 | }
51 | 
52 | unsigned int GetNumberOfFlatnessFeatures(void *ptr) {
53 |     return reinterpret_cast<Expertise*>(ptr)->classifier.GetNumberOfFlatnessFeatures();
54 | }
55 | 
56 | void SetSubsample(void *ptr, double subsample) {
57 |     reinterpret_cast<Expertise*>(ptr)->classifier.SetSubsample(subsample);
58 | }
59 | 
60 | double GetSubsample(void *ptr) {
61 |     return reinterpret_cast<Expertise*>(ptr)->classifier.GetSubsample();
62 | }
63 | 
64 | void SetShrinkage(void *ptr, double shrinkage) {
65 |     reinterpret_cast<Expertise*>(ptr)->classifier.SetShrinkage(shrinkage);
66 | }
67 | 
68 | double GetShrinkage(void *ptr) {
69 |     return reinterpret_cast<Expertise*>(ptr)->classifier.GetShrinkage();
70 | }
71 | 
72 | void SetFlatnessLoss(void *ptr, double flatnessLoss) {
73 |     reinterpret_cast<Expertise*>(ptr)->classifier.SetFlatnessLoss(flatnessLoss);
74 | }
75 | 
76 | double GetFlatnessLoss(void *ptr) {
77 |     return reinterpret_cast<Expertise*>(ptr)->classifier.GetFlatnessLoss();
78 | }
79 | 
80 | void SetTransform2Probability(void *ptr, bool transform2probability) {
81 |     reinterpret_cast<Expertise*>(ptr)->classifier.SetTransform2Probability(transform2probability);
82 | }
83 | 
84 | bool GetTransform2Probability(void *ptr) {
85 |     return reinterpret_cast<Expertise*>(ptr)->classifier.GetTransform2Probability();
86 | }
87 | 
88 | void SetSPlot(void *ptr, bool sPlot) {
89 |     reinterpret_cast<Expertise*>(ptr)->classifier.SetSPlot(sPlot);
90 | }
91 | 
92 | bool GetSPlot(void *ptr) {
93 |     return reinterpret_cast<Expertise*>(ptr)->classifier.GetSPlot();
94 | }
95 | 
96 | void Delete(void *ptr) {
97 |     delete reinterpret_cast<Expertise*>(ptr);
98 | }
99 | 
100 | void Fit(void *ptr, float *data_ptr, float *weight_ptr, bool *target_ptr, unsigned int nEvents, unsigned int nFeatures) {
101 |     Expertise *expertise = reinterpret_cast<Expertise*>(ptr);
102 | 
103 |     std::vector<float> w;
104 |     if(weight_ptr != nullptr)
105 |         w = std::vector<float>(weight_ptr, weight_ptr + nEvents);
106 |     else
107 |         w = std::vector<float>(nEvents, 1.0);
108 | 
109 |     std::vector<bool> y(target_ptr, target_ptr + nEvents);
110 |     std::vector<std::vector<float>> X(nFeatures);
111 |     for(unsigned int iFeature = 0; iFeature < nFeatures; ++iFeature) {
112 |         std::vector<float> temp(nEvents);
113 |         for(unsigned int iEvent = 0; iEvent < nEvents; ++iEvent) {
114 |             temp[iEvent] = data_ptr[iEvent*nFeatures + iFeature];
115 |         }
116 |         X[iFeature] = temp;
117 |     }
118 | 
119 |     expertise->classifier.fit(X, y, w);
120 | 
121 | }
122 | 
123 | void Load(void* ptr, char *weightfile) {
124 |     Expertise *expertise = reinterpret_cast<Expertise*>(ptr);
125 | 
126 |     std::fstream file(weightfile, std::ios_base::in);
127 |     if(not file)
128 |         return;
129 | 
130 |     expertise->classifier = FastBDT::Classifier(file);
131 | }
132 | 
133 | float Predict(void *ptr, float *array) {
134 |     Expertise *expertise = reinterpret_cast<Expertise*>(ptr);
135 |     return expertise->classifier.predict(std::vector<float>(array, array + expertise->classifier.GetNFeatures()));
136 | }
137 | 
138 | void PredictArray(void *ptr, float *array, float *result, unsigned int nEvents) {
139 |     Expertise *expertise = reinterpret_cast<Expertise*>(ptr);
140 |     unsigned int nFeatures = expertise->classifier.GetNFeatures();
141 |     for(unsigned int iEvent = 0; iEvent < nEvents; ++iEvent) {
142 |         result[iEvent] = expertise->classifier.predict(std::vector<float>(array + iEvent*nFeatures, array + (iEvent+1)*nFeatures));
143 |     }
144 | }
145 | 
146 | void Save(void* ptr, char *weightfile) {
147 |     Expertise *expertise = reinterpret_cast<Expertise*>(ptr);
148 | 
149 |     std::fstream file(weightfile, std::ios_base::out | std::ios_base::trunc);
150 |     file << expertise->classifier << std::endl;
151 | }
152 | 
153 | void* GetVariableRanking(void* ptr) {
154 |     Expertise *expertise = reinterpret_cast<Expertise*>(ptr);
155 |     VariableRanking *ranking = new(std::nothrow) VariableRanking;
156 |     ranking->ranking = expertise->classifier.GetVariableRanking();
157 |     return ranking;
158 | }
159 | 
160 | void* GetIndividualVariableRanking(void* ptr, float *array) {
161 |     Expertise *expertise = reinterpret_cast<Expertise*>(ptr);
162 |     VariableRanking *ranking = new(std::nothrow) VariableRanking;
163 |     ranking->ranking = expertise->classifier.GetIndividualVariableRanking(std::vector<float>(array, array + expertise->classifier.GetNFeatures()));
164 |     return ranking;
165 | }
166 | 
167 | unsigned int ExtractNumberOfVariablesFromVariableRanking(void* ptr) {
168 |     VariableRanking *ranking = reinterpret_cast<VariableRanking*>(ptr);
169 |     unsigned int max = 0;
170 |     for(auto &pair : ranking->ranking) {
171 |         if(pair.first > max) {
172 |             max = pair.first;
173 |         }
174 |     }
175 |     return max+1;
176 | }
177 | 
178 | double ExtractImportanceOfVariableFromVariableRanking(void* ptr, unsigned int iFeature) {
179 |     VariableRanking *ranking = reinterpret_cast<VariableRanking*>(ptr);
180 |     if ( ranking->ranking.find( iFeature ) == ranking->ranking.end() )
181 |         return 0.0;
182 |     return ranking->ranking[iFeature];
183 | }
184 | 
185 | void DeleteVariableRanking(void *ptr) {
186 |     delete reinterpret_cast<VariableRanking*>(ptr);
187 | }
188 | 
189 | }
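The PyFastBDT package ships libFastBDT_CInterface.so alongside FastBDT.py, i.e. the Python interface wraps exactly this C API. A stripped-down sketch of driving it by hand through ctypes (illustrative only, not copied from FastBDT.py; it assumes the shared library sits in the current directory):

    import ctypes
    import numpy as np

    lib = ctypes.cdll.LoadLibrary('./libFastBDT_CInterface.so')
    lib.Create.restype = ctypes.c_void_p
    lib.Predict.restype = ctypes.c_float

    expertise = ctypes.c_void_p(lib.Create())
    lib.SetNTrees(expertise, ctypes.c_uint(10))
    lib.SetDepth(expertise, ctypes.c_uint(3))

    # Events are passed as one flat row-major float array, targets as bools.
    X = np.array([[1.0, 2.6], [1.6, 2.5], [1.1, 2.0], [1.9, 2.1]], dtype=np.float32)
    y = np.array([False, True, False, True], dtype=np.bool_)
    lib.Fit(expertise,
            X.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
            None,  # nullptr weights -> every event gets weight 1.0 (see Fit above)
            y.ctypes.data_as(ctypes.POINTER(ctypes.c_bool)),
            ctypes.c_uint(4), ctypes.c_uint(2))

    event = np.array([1.0, 2.6], dtype=np.float32)
    print(lib.Predict(expertise, event.ctypes.data_as(ctypes.POINTER(ctypes.c_float))))
    lib.Delete(expertise)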
-------------------------------------------------------------------------------- /examples/orthogonal_discriminator.py: --------------------------------------------------------------------------------
1 | import sys
2 | from PyFastBDT import FastBDT
3 | 
4 | import numpy as np
5 | import numpy
6 | import numpy.linalg
7 | import sklearn.metrics
8 | 
9 | import matplotlib.pyplot as plt
10 | import matplotlib as mpl
11 | 
12 | def calculate_cdf_and_pdf(X):
13 |     """
14 |     Calculates cdf and pdf of given sample and adds under/overflow bins
15 |     @param X 1-d numpy.array
16 |     """
17 |     pdf, bins = numpy.histogram(X, bins=30, density=True)
18 |     cdf = numpy.cumsum(pdf * (bins - numpy.roll(bins, 1))[1:])
19 |     return numpy.hstack([0.0, cdf, 1.0]), numpy.hstack([0.0, pdf, 0.0]), bins
20 | 
21 | 
22 | def calculate_splot_weights(pdfs, yields):
23 |     """
24 |     Calculates sPlot weights using the pdfs
25 |     @param pdfs list of 1-d numpy.array with pdf values of the different components for each event
26 |     @param yields list of the yields of the different components
27 |     """
28 |     N_components = len(pdfs)
29 |     # Consistency checks
30 |     if N_components != len(yields):
31 |         raise RuntimeError("You have to provide the same number of pdfs and yields!")
32 |     if N_components < 2:
33 |         raise RuntimeError("Need at least two components!")
34 | 
35 |     # Calculate covariance matrix
36 |     inverse_covariance = numpy.zeros((N_components, N_components))
37 |     norm = sum((yields[k] * pdfs[k] for k in range(1, N_components)), yields[0] * pdfs[0])**2
38 |     for i in range(N_components):
39 |         for j in range(N_components):
40 |             inverse_covariance[i, j] = numpy.nansum(pdfs[i] * pdfs[j] / norm)
41 |     covariance = numpy.linalg.inv(inverse_covariance)
42 | 
43 |     # Return list of sPlot weights for each component
44 |     return [sum(covariance[n, k] * pdfs[k] for k in range(N_components)) /
45 |             sum(yields[k] * pdfs[k] for k in range(N_components)) for n in range(N_components)]
46 | 
47 | 
48 | class Prior(object):
49 |     def __init__(self, signal, bckgrd):
50 |         self.signal_cdf, self.signal_pdf, self.signal_bins = calculate_cdf_and_pdf(signal)
51 |         self.bckgrd_cdf, self.bckgrd_pdf, self.bckgrd_bins = calculate_cdf_and_pdf(bckgrd)
52 |         # Avoid numerical instabilities
53 |         self.bckgrd_pdf[0] = self.bckgrd_pdf[-1] = 1
54 |         self.signal_yield = len(signal)
55 |         self.bckgrd_yield = len(bckgrd)
56 | 
57 |     def get_signal_pdf(self, X):
58 |         return self.signal_pdf[numpy.digitize(X, bins=self.signal_bins)]
59 | 
60 |     def get_bckgrd_pdf(self, X):
61 |         return self.bckgrd_pdf[numpy.digitize(X, bins=self.bckgrd_bins)]
62 | 
63 |     def get_signal_cdf(self, X):
64 |         return self.signal_cdf[numpy.digitize(X, bins=self.signal_bins)]
65 | 
66 |     def get_bckgrd_cdf(self, X):
67 |         return self.bckgrd_cdf[numpy.digitize(X, bins=self.bckgrd_bins)]
68 | 
69 |     def get_prior(self, X):
70 |         return self.get_signal_pdf(X) / (self.get_signal_pdf(X) + self.get_bckgrd_pdf(X))
71 | 
72 |     def get_signal_boost_weights(self, X):
73 |         return self.get_signal_cdf(X) / self.get_bckgrd_pdf(X)
74 | 
75 |     def get_bckgrd_boost_weights(self, X):
76 |         # NOT self.get_bckgrd_cdf() here, signal and background are handled asymmetrically!
77 |         return (1.0 - self.get_signal_cdf(X)) / self.get_bckgrd_pdf(X)
78 | 
79 |     def get_boost_weights(self, X):
80 |         return numpy.r_[self.get_signal_boost_weights(X), self.get_bckgrd_boost_weights(X)]
81 | 
82 |     def get_splot_weights(self, X):
83 |         pdfs = [self.get_signal_pdf(X), self.get_bckgrd_pdf(X)]
84 |         yields = [self.signal_yield, self.bckgrd_yield]
85 |         weights = calculate_splot_weights(pdfs, yields)
86 |         return numpy.r_[weights[0], weights[1]]
87 | 
88 |     def get_uncorrelation_weights(self, X, boost_prediction):
89 |         reg_boost_prediction = boost_prediction * 0.99 + 0.005
90 |         weights = (self.get_signal_cdf(X) / reg_boost_prediction + (1.0 - self.get_signal_cdf(X)) / (1.0 - reg_boost_prediction)) / 2
91 |         return weights
92 | 
93 |     def get_aplot_weights(self, X, boost_prediction):
94 |         weights = self.get_uncorrelation_weights(X, boost_prediction)
95 |         return self.get_splot_weights(X) * numpy.r_[weights, weights]
96 | 
97 | 
98 | def combine_probabilities(p1, p2):
99 |     return p1*p2 / (p1*p2 + (1-p1)*(1-p2))
100 | 
101 | 
102 | 
103 | def activate_post_mortem_debugger():
104 |     import sys
105 | 
106 |     def info(type, value, tb):
107 |         if hasattr(sys, 'ps1') or not sys.stderr.isatty():
108 |             # we are in interactive mode or we don't have a tty-like
109 |             # device, so we call the default hook
110 |             sys.__excepthook__(type, value, tb)
111 |         else:
112 |             import traceback, pdb
113 |             # we are NOT in interactive mode, print the exception...
114 | traceback.print_exception(type, value, tb) 115 | # ...then start the debugger in post-mortem mode. 116 | pdb.post_mortem(tb) 117 | 118 | sys.excepthook = info 119 | 120 | activate_post_mortem_debugger() 121 | 122 | 123 | def evaluation(label, X_test, y_test, p, p_prior): 124 | print(label, sklearn.metrics.roc_auc_score(y_test, p)) 125 | print(label + " with prior", sklearn.metrics.roc_auc_score(y_test, combine_probabilities(p, p_prior))) 126 | plt.scatter(X_test[y_test == 1, 0], p[y_test == 1], c='r', label=label + " (Signal)", alpha=0.2) 127 | plt.scatter(X_test[y_test == 0, 0], p[y_test == 0], c='b', label=label + " (Background)", alpha=0.2) 128 | plt.xlabel("Feature") 129 | plt.ylabel("Probability") 130 | plt.show() 131 | 132 | 133 | if __name__ == '__main__': 134 | # Create some Monte Carlo data using a multidimensional Gaussian distribution 135 | # The 0th row of the covariance matrix describes the correlation to the target variable 136 | for cor in np.linspace(-0.2, 0.2, 3): 137 | print("Correlation ", cor) 138 | mean = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0] 139 | cov = [[1.0, 0.6, 0.4, 0.2, 0.1, 0.0], 140 | [0.0, 1.0, cor, cor, cor, 0.0], 141 | [0.0, 0.0, 1.0, 0.0, 0.0, 0.0], 142 | [0.0, 0.0, 0.0, 1.0, 0.0, 0.0], 143 | [0.0, 0.0, 0.0, 0.0, 1.0, 0.0], 144 | [0.0, 0.0, 0.0, 0.0, 0.0, 1.0]] 145 | 146 | for i in range(len(mean)): 147 | for j in range(i+1, len(mean)): 148 | cov[j][i] = cov[i][j] 149 | 150 | N_train, N_test = 100000, 2000 151 | data = np.random.multivariate_normal(mean, cov, N_train + N_test) 152 | X_train, y_train = data[:N_train, 1:], data[:N_train, 0] > 0 153 | X_test, y_test = data[N_train:, 1:], data[N_train:, 0] > 0 154 | 155 | # The first variable is the one we want to be independent of the classifier output 156 | prior = Prior(X_train[y_train == 1, 0], X_train[y_train == 0, 0]) 157 | p_prior = prior.get_prior(X_test[:, 0]) 158 | evaluation("Prior", X_test, y_test, p_prior, p_prior) 159 | 160 | p = FastBDT.Classifier().fit(X=X_train, y=y_train).predict(X_test) 161 | evaluation("Full", X_test, y_test, p, p_prior) 162 | 163 | p = FastBDT.Classifier().fit(X=X_train[:, 1:], y=y_train).predict(X_test[:, 1:]) 164 | evaluation("Restricted", X_test, y_test, p, p_prior) 165 | 166 | boost_p = FastBDT.Classifier().fit(X=numpy.r_[X_train[:, 1:], X_train[:, 1:]], 167 | y=numpy.r_[numpy.ones(N_train), numpy.zeros(N_train)], 168 | weights=prior.get_boost_weights(X_train[:, 0])).predict(X_train[:, 1:]) 169 | 170 | p = FastBDT.Classifier().fit(X=X_train[:, 1:], y=y_train, 171 | weights=prior.get_uncorrelation_weights(X_train[:, 0], boost_p)).predict(X_test[:, 1:]) 172 | evaluation("Uncorrelation", X_test, y_test, p, p_prior) 173 | 174 | p = FastBDT.Classifier().fit(X=numpy.r_[X_train[:, 1:], X_train[:, 1:]], 175 | y=numpy.r_[numpy.ones(N_train), numpy.zeros(N_train)], 176 | weights=prior.get_aplot_weights(X_train[:, 0], boost_p)).predict(X_test[:, 1:]) 177 | evaluation("APlot", X_test, y_test, p, p_prior) 178 | 179 | p = FastBDT.Classifier().fit(X=numpy.r_[X_train[:, 1:], X_train[:, 1:]], 180 | y=numpy.r_[numpy.ones(N_train), numpy.zeros(N_train)], 181 | weights=prior.get_splot_weights(X_train[:, 0])).predict(X_test[:, 1:]) 182 | evaluation("SPlot", X_test, y_test, p, p_prior) 183 | -------------------------------------------------------------------------------- /include/FastBDT_IO.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Thomas Keck 2014 3 | */ 4 | 5 | #pragma once 6 | #include "FastBDT.h" 7 | 8 | #include <iostream> 9 | #include <vector> 10 |
#include <string> 11 | #include <limits> 12 | 13 | namespace FastBDT { 14 | 15 | /** 16 | * Converts from string to float safely 17 | * Should behave similarly to boost::lexical_cast 18 | * but does not signal if it fails! 19 | * @param input string containing a float 20 | */ 21 | float convert_to_float_safely(std::string &input); 22 | 23 | /** 24 | * Converts from string to double safely 25 | * Should behave similarly to boost::lexical_cast 26 | * but does not signal if it fails! 27 | * @param input string containing a float 28 | */ 29 | double convert_to_double_safely(std::string &input); 30 | 31 | /** 32 | * This template saves a vector to an std::ostream 33 | * @param stream an std::ostream reference 34 | * @param vector the vector which shall be stored 35 | */ 36 | template<class T> 37 | std::ostream& operator<<(std::ostream& stream, const std::vector<T> &vector) { 38 | stream << vector.size(); 39 | for(const auto &value : vector) { 40 | stream << " " << value; 41 | } 42 | stream << std::endl; 43 | return stream; 44 | } 45 | 46 | /** 47 | * Specialize vector output operator, so it checks for nan and infinity in float/double types 48 | * Note: I know about http://www.gotw.ca/publications/mill17.htm, SFINAE, but nothing worked for me ... 49 | * so I stuck with this simple solution instead of complicated template meta programming 50 | */ 51 | template<> 52 | std::ostream& operator<<(std::ostream& stream, const std::vector<float> &vector); 53 | 54 | template<> 55 | std::ostream& operator<<(std::ostream& stream, const std::vector<double> &vector); 56 | 57 | /** 58 | * This template reads a vector from an std::istream 59 | * @param stream an std::istream reference 60 | * @param vector the vector containing read data 61 | */ 62 | template<class T> 63 | std::istream& operator>>(std::istream& stream, std::vector<T> &vector) { 64 | unsigned int size; 65 | stream >> size; 66 | vector.resize(size); 67 | for(unsigned int i = 0; i < size; ++i) { 68 | T temp; 69 | stream >> temp; 70 | vector[i] = temp; 71 | } 72 | return stream; 73 | } 74 | 75 | template<> 76 | std::istream& operator>>(std::istream& stream, std::vector<float> &vector); 77 | 78 | template<> 79 | std::istream& operator>>(std::istream& stream, std::vector<double> &vector); 80 | 81 | /** 82 | * This function saves a Cut to an std::ostream 83 | * @param stream an std::ostream reference 84 | * @param cut the cut which shall be stored 85 | */ 86 | template<class T> 87 | std::ostream& operator<<(std::ostream& stream, const Cut<T> &cut) { 88 | stream << cut.feature << std::endl; 89 | stream.precision(std::numeric_limits<T>::max_digits10); 90 | stream << std::scientific; 91 | stream << cut.index << std::endl; 92 | stream.precision(6); 93 | stream << cut.valid << std::endl; 94 | stream << cut.gain; 95 | stream << std::endl; 96 | return stream; 97 | } 98 | 99 | /** 100 | * This function reads a Cut from an std::istream 101 | * @param stream an std::istream reference 102 | * @param cut containing read data 103 | */ 104 | template<class T> 105 | std::istream& operator>>(std::istream& stream, Cut<T> &cut) { 106 | stream >> cut.feature; 107 | stream >> cut.index; 108 | stream >> cut.valid; 109 | stream >> cut.gain; 110 | return stream; 111 | } 112 | 113 | template<> 114 | std::istream& operator>>(std::istream& stream, Cut<float> &cut); 115 | 116 | template<> 117 | std::istream& operator>>(std::istream& stream, Cut<double> &cut); 118 | 119 | 120 | /** 121 | * This function saves a Tree to an std::ostream 122 | * @param stream an std::ostream reference 123 | * @param tree the tree which shall be stored 124 | */ 125 | template<class T> 126 | std::ostream& operator<<(std::ostream&
stream, const Tree<T> &tree) { 127 | const auto &cuts = tree.GetCuts(); 128 | stream << cuts.size() << std::endl; 129 | for( const auto& cut : cuts ) { 130 | stream << cut << std::endl; 131 | } 132 | stream << tree.GetBoostWeights() << std::endl; 133 | stream << tree.GetPurities() << std::endl; 134 | stream << tree.GetNEntries() << std::endl; 135 | return stream; 136 | } 137 | 138 | 139 | /** 140 | * This function reads a Tree from an std::istream 141 | * @param stream an std::istream reference 142 | * @return tree containing read data 143 | */ 144 | template<class T> 145 | Tree<T> readTreeFromStream(std::istream& stream) { 146 | unsigned int size; 147 | stream >> size; 148 | std::vector<Cut<T>> cuts(size); 149 | for(unsigned int i = 0; i < size; ++i) { 150 | stream >> cuts[i]; 151 | } 152 | 153 | std::vector<Weight> boost_weights; 154 | stream >> boost_weights; 155 | 156 | std::vector<Weight> purities; 157 | stream >> purities; 158 | 159 | std::vector<Weight> nEntries; 160 | stream >> nEntries; 161 | 162 | return Tree<T>(cuts, nEntries, purities, boost_weights); 163 | 164 | } 165 | 166 | /** 167 | * This function saves a Forest to an std::ostream 168 | * @param stream an std::ostream reference 169 | * @param forest the forest which shall be stored 170 | */ 171 | template<class T> 172 | std::ostream& operator<<(std::ostream& stream, const Forest<T> &forest) { 173 | stream << forest.GetF0() << std::endl; 174 | stream << forest.GetShrinkage() << std::endl; 175 | stream << forest.GetTransform2Probability() << std::endl; 176 | 177 | const auto &trees = forest.GetForest(); 178 | stream << trees.size() << std::endl; 179 | for(const auto& tree : trees) { 180 | stream << tree << std::endl; 181 | } 182 | 183 | return stream; 184 | } 185 | 186 | /** 187 | * This function reads a Forest from an std::istream 188 | * @param stream an std::istream reference 189 | * @return forest containing read data 190 | */ 191 | template<class T> 192 | Forest<T> readForestFromStream(std::istream& stream) { 193 | double F0; 194 | stream >> F0; 195 | 196 | double shrinkage; 197 | stream >> shrinkage; 198 | 199 | bool transform2probability; 200 | stream >> transform2probability; 201 | 202 | Forest<T> forest(shrinkage, F0, transform2probability); 203 | 204 | unsigned int size; 205 | stream >> size; 206 | 207 | for(unsigned int i = 0; i < size; ++i) { 208 | forest.AddTree(readTreeFromStream<T>(stream)); 209 | } 210 | 211 | return forest; 212 | } 213 | 214 | /** 215 | * This function saves a PurityTransformation to an std::ostream 216 | * @param stream an std::ostream reference 217 | * @param purityTransformation the purity transformation which shall be stored 218 | */ 219 | std::ostream& operator<<(std::ostream& stream, const PurityTransformation &purityTransformation); 220 | 221 | /** 222 | * This function reads a PurityTransformation from an std::istream 223 | * @param stream an std::istream reference 224 | * @param purityTransformation the purity transformation which shall be read 225 | */ 226 | std::istream& operator>>(std::istream& stream, PurityTransformation &purityTransformation); 227 | 228 | 229 | /** 230 | * This function saves a FeatureBinning to an std::ostream 231 | * @param stream an std::ostream reference 232 | * @param featureBinning the FeatureBinning which shall be stored 233 | */ 234 | template<class T> 235 | std::ostream& operator<<(std::ostream& stream, const FeatureBinning<T> &featureBinning) { 236 | 237 | stream << featureBinning.GetNLevels() << std::endl; 238 | stream << featureBinning.GetBinning() << std::endl; 239 | 240 | return stream; 241 | } 242 | 243 | /** 244 | * This function
reads a FeatureBinning from an std::istream 245 | * @param stream an std::istream reference 246 | * @preturn FeatureBinning containing read data 247 | */ 248 | template 249 | FeatureBinning readFeatureBinningFromStream(std::istream& stream) { 250 | 251 | unsigned int nLevels; 252 | stream >> nLevels; 253 | 254 | std::vector bins; 255 | stream >> bins; 256 | 257 | return FeatureBinning(nLevels, bins); 258 | 259 | } 260 | 261 | /** 262 | * Overload vector input operator, so it can read in FeatureBinnings 263 | */ 264 | template 265 | std::istream& operator>>(std::istream& stream, std::vector> &vector) { 266 | unsigned int size; 267 | stream >> size; 268 | for(unsigned int i = 0; i < size; ++i) 269 | vector.push_back(readFeatureBinningFromStream(stream)); 270 | return stream; 271 | } 272 | 273 | 274 | } 275 | -------------------------------------------------------------------------------- /src/test_Classifier.cxx: -------------------------------------------------------------------------------- 1 | /** 2 | * Thomas Keck 2017 3 | */ 4 | 5 | #include "Classifier.h" 6 | #include 7 | #include 8 | 9 | #include 10 | 11 | #include 12 | #include 13 | 14 | #include 15 | #include 16 | 17 | using namespace FastBDT; 18 | 19 | 20 | std::vector> GetIrisX() { 21 | std::vector> X = {{5.1,4.9,4.7,4.6,5.0,5.4,4.6,5.0,4.4,4.9,5.4,4.8,4.8,4.3,5.8,5.7,5.4,5.1,5.7,5.1,5.4,5.1,4.6,5.1,4.8,5.0,5.0,5.2,5.2,4.7,4.8,5.4,5.2,5.5,4.9,5.0,5.5,4.9,4.4,5.1,5.0,4.5,4.4,5.0,5.1,4.8,5.1,4.6,5.3,5.0,7.0,6.4,6.9,5.5,6.5,5.7,6.3,4.9,6.6,5.2,5.0,5.9,6.0,6.1,5.6,6.7,5.6,5.8,6.2,5.6,5.9,6.1,6.3,6.1,6.4,6.6,6.8,6.7,6.0,5.7,5.5,5.5,5.8,6.0,5.4,6.0,6.7,6.3,5.6,5.5,5.5,6.1,5.8,5.0,5.6,5.7,5.7,6.2,5.1,5.7,6.3,5.8,7.1,6.3,6.5,7.6,4.9,7.3,6.7,7.2,6.5,6.4,6.8,5.7,5.8,6.4,6.5,7.7,7.7,6.0,6.9,5.6,7.7,6.3,6.7,7.2,6.2,6.1,6.4,7.2,7.4,7.9,6.4,6.3,6.1,7.7,6.3,6.4,6.0,6.9,6.7,6.9,5.8,6.8,6.7,6.7,6.3,6.5,6.2,5.9}, {3.5,3.0,3.2,3.1,3.6,3.9,3.4,3.4,2.9,3.1,3.7,3.4,3.0,3.0,4.0,4.4,3.9,3.5,3.8,3.8,3.4,3.7,3.6,3.3,3.4,3.0,3.4,3.5,3.4,3.2,3.1,3.4,4.1,4.2,3.1,3.2,3.5,3.1,3.0,3.4,3.5,2.3,3.2,3.5,3.8,3.0,3.8,3.2,3.7,3.3,3.2,3.2,3.1,2.3,2.8,2.8,3.3,2.4,2.9,2.7,2.0,3.0,2.2,2.9,2.9,3.1,3.0,2.7,2.2,2.5,3.2,2.8,2.5,2.8,2.9,3.0,2.8,3.0,2.9,2.6,2.4,2.4,2.7,2.7,3.0,3.4,3.1,2.3,3.0,2.5,2.6,3.0,2.6,2.3,2.7,3.0,2.9,2.9,2.5,2.8,3.3,2.7,3.0,2.9,3.0,3.0,2.5,2.9,2.5,3.6,3.2,2.7,3.0,2.5,2.8,3.2,3.0,3.8,2.6,2.2,3.2,2.8,2.8,2.7,3.3,3.2,2.8,3.0,2.8,3.0,2.8,3.8,2.8,2.8,2.6,3.0,3.4,3.1,3.0,3.1,3.1,3.1,2.7,3.2,3.3,3.0,2.5,3.0,3.4,3.0}, {1.4,1.4,1.3,1.5,1.4,1.7,1.4,1.5,1.4,1.5,1.5,1.6,1.4,1.1,1.2,1.5,1.3,1.4,1.7,1.5,1.7,1.5,1.0,1.7,1.9,1.6,1.6,1.5,1.4,1.6,1.6,1.5,1.5,1.4,1.5,1.2,1.3,1.5,1.3,1.5,1.3,1.3,1.3,1.6,1.9,1.4,1.6,1.4,1.5,1.4,4.7,4.5,4.9,4.0,4.6,4.5,4.7,3.3,4.6,3.9,3.5,4.2,4.0,4.7,3.6,4.4,4.5,4.1,4.5,3.9,4.8,4.0,4.9,4.7,4.3,4.4,4.8,5.0,4.5,3.5,3.8,3.7,3.9,5.1,4.5,4.5,4.7,4.4,4.1,4.0,4.4,4.6,4.0,3.3,4.2,4.2,4.2,4.3,3.0,4.1,6.0,5.1,5.9,5.6,5.8,6.6,4.5,6.3,5.8,6.1,5.1,5.3,5.5,5.0,5.1,5.3,5.5,6.7,6.9,5.0,5.7,4.9,6.7,4.9,5.7,6.0,4.8,4.9,5.6,5.8,6.1,6.4,5.6,5.1,5.6,6.1,5.6,5.5,4.8,5.4,5.6,5.1,5.1,5.9,5.7,5.2,5.0,5.2,5.4,5.1}, 
{0.2,0.2,0.2,0.2,0.2,0.4,0.3,0.2,0.2,0.1,0.2,0.2,0.1,0.1,0.2,0.4,0.4,0.3,0.3,0.3,0.2,0.4,0.2,0.5,0.2,0.2,0.4,0.2,0.2,0.2,0.2,0.4,0.1,0.2,0.1,0.2,0.2,0.1,0.2,0.2,0.3,0.3,0.2,0.6,0.4,0.3,0.2,0.2,0.2,0.2,1.4,1.5,1.5,1.3,1.5,1.3,1.6,1.0,1.3,1.4,1.0,1.5,1.0,1.4,1.3,1.4,1.5,1.0,1.5,1.1,1.8,1.3,1.5,1.2,1.3,1.4,1.4,1.7,1.5,1.0,1.1,1.0,1.2,1.6,1.5,1.6,1.5,1.3,1.3,1.3,1.2,1.4,1.2,1.0,1.3,1.2,1.3,1.3,1.1,1.3,2.5,1.9,2.1,1.8,2.2,2.1,1.7,1.8,1.8,2.5,2.0,1.9,2.1,2.0,2.4,2.3,1.8,2.2,2.3,1.5,2.3,2.0,2.0,1.8,2.1,1.8,1.8,1.8,2.1,1.6,1.9,2.0,2.2,1.5,1.4,2.3,2.4,1.8,1.8,2.1,2.4,2.3,1.9,2.3,2.5,2.3,1.9,2.0,2.3,1.8} }; 22 | return X; 23 | } 24 | 25 | std::vector GetIrisY() { 26 | std::vector y = {false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true}; 27 | return y; 28 | } 29 | 30 | std::vector GetIrisW() { 31 | std::vector w = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}; 32 | return w; 33 | } 34 | 35 | 36 | float GetIrisScore(const Classifier &classifier) { 37 | auto X = GetIrisX(); 38 | auto y = GetIrisY(); 39 | float sum = 0; 40 | for(unsigned int i = 0; i < y.size(); ++i) { 41 | float p = classifier.predict({X[0][i], X[1][i], X[2][i], X[3][i]}); 42 | sum += (y[i]-p)*(y[i]-p); 43 | } 44 | return -sum; 45 | } 46 | 47 | class ClassifierTest : public ::testing::Test { 48 | protected: 49 | virtual void SetUp() { 50 | X = GetIrisX(); 51 | y = GetIrisY(); 52 | w = GetIrisW(); 53 | } 54 | 55 | virtual void TearDown() { 56 | } 57 | 58 | std::vector> X; 59 | std::vector y; 60 | std::vector w; 61 | 62 | }; 63 | 64 | TEST_F(ClassifierTest, SimpleClassifierWorks) { 65 | 66 | FastBDT::Classifier classifier(10, 3, {4, 4, 4, 4}); 67 | classifier.fit(X, y, w); 68 | 69 | EXPECT_GT(GetIrisScore(classifier), -7.0); 70 | EXPECT_LT(GetIrisScore(classifier), -5.0); 71 | 72 | } 73 | 74 | TEST_F(ClassifierTest, MoreTreesAreBetter) { 75 | 76 | FastBDT::Classifier classifier1(1, 1, {4, 4, 4, 4}); 77 | classifier1.fit(X, y, w); 78 | 79 | FastBDT::Classifier classifier2(4, 1, {4, 4, 4, 4}); 80 | classifier2.fit(X, y, w); 81 | 82 | FastBDT::Classifier classifier3(16, 1, {4, 4, 4, 4}); 83 | classifier3.fit(X, y, w); 84 | 85 | FastBDT::Classifier classifier4(64, 1, {4, 4, 4, 4}); 86 | classifier4.fit(X, y, w); 
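// GetIrisScore returns the negative sum of squared residuals (y - p)^2, so all
// scores are <= 0 and a larger (less negative) value means a better fit. The
// EXPECT_LT checks below therefore assert that each bigger ensemble tracks the
// training labels more closely than the previous one.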
87 | 88 | EXPECT_LT(GetIrisScore(classifier1), GetIrisScore(classifier2)); 89 | EXPECT_LT(GetIrisScore(classifier2), GetIrisScore(classifier3)); 90 | EXPECT_LT(GetIrisScore(classifier3), GetIrisScore(classifier4)); 91 | 92 | } 93 | 94 | TEST_F(ClassifierTest, DeeperTreesAreBetter) { 95 | 96 | FastBDT::Classifier classifier1(1, 1, {4, 4, 4, 4}); 97 | classifier1.fit(X, y, w); 98 | 99 | FastBDT::Classifier classifier2(1, 3, {4, 4, 4, 4}); 100 | classifier2.fit(X, y, w); 101 | 102 | FastBDT::Classifier classifier3(1, 5, {4, 4, 4, 4}); 103 | classifier3.fit(X, y, w); 104 | 105 | FastBDT::Classifier classifier4(1, 7, {4, 4, 4, 4}); 106 | classifier4.fit(X, y, w); 107 | 108 | EXPECT_LT(GetIrisScore(classifier1), GetIrisScore(classifier2)); 109 | EXPECT_LT(GetIrisScore(classifier2), GetIrisScore(classifier3)); 110 | EXPECT_LT(GetIrisScore(classifier3), GetIrisScore(classifier4)); 111 | 112 | } 113 | 114 | 115 | TEST_F(ClassifierTest, MoreBinsAreBetter) { 116 | 117 | FastBDT::Classifier classifier1(1, 3, {2, 2, 2, 2}); 118 | classifier1.fit(X, y, w); 119 | 120 | FastBDT::Classifier classifier2(1, 3, {2, 3, 2, 3}); 121 | classifier2.fit(X, y, w); 122 | 123 | FastBDT::Classifier classifier3(1, 3, {3, 4, 3, 4}); 124 | classifier3.fit(X, y, w); 125 | 126 | FastBDT::Classifier classifier4(1, 3, {4, 5, 4, 5}); 127 | classifier4.fit(X, y, w); 128 | 129 | EXPECT_LT(GetIrisScore(classifier1), GetIrisScore(classifier2)); 130 | EXPECT_LT(GetIrisScore(classifier2), GetIrisScore(classifier3)); 131 | EXPECT_LT(GetIrisScore(classifier3), GetIrisScore(classifier4)); 132 | 133 | } 134 | 135 | TEST_F(ClassifierTest, HeavilyOvertrainedBDTIsPerfect) { 136 | 137 | FastBDT::Classifier classifier(100, 10, {8, 8, 8, 8}); 138 | classifier.fit(X, y, w); 139 | 140 | EXPECT_GT(GetIrisScore(classifier), -0.01f); 141 | 142 | } 143 | 144 | TEST_F(ClassifierTest, SubsamplingChangesResult) { 145 | 146 | FastBDT::Classifier classifier1(1, 5, {4, 4, 4, 4}, 0.1, 0.5); 147 | classifier1.fit(X, y, w); 148 | 149 | FastBDT::Classifier classifier2(1, 5, {4, 4, 4, 4}, 0.1, 0.5); 150 | classifier2.fit(X, y, w); 151 | 152 | EXPECT_NE(GetIrisScore(classifier1), GetIrisScore(classifier2)); 153 | 154 | } 155 | 156 | TEST_F(ClassifierTest, GetFeatureMaping) { 157 | 158 | FastBDT::Classifier classifier(1, 5, {4, 4, 4, 4}, 0.1, 0.5); 159 | classifier.SetPurityTransformation({true, false, true, false}); 160 | classifier.fit(X, y, w); 161 | 162 | auto mapping = classifier.GetFeatureMapping(); 163 | EXPECT_EQ(mapping[0], 0u); 164 | EXPECT_EQ(mapping[1], 0u); 165 | EXPECT_EQ(mapping[2], 1u); 166 | EXPECT_EQ(mapping[3], 2u); 167 | EXPECT_EQ(mapping[4], 2u); 168 | EXPECT_EQ(mapping[5], 3u); 169 | 170 | } 171 | 172 | TEST_F(ClassifierTest, LoadAndSaveWorks) { 173 | 174 | FastBDT::Classifier classifier(10, 3, {4, 4, 4, 4}); 175 | classifier.fit(X, y, w); 176 | 177 | float score1 = GetIrisScore(classifier); 178 | 179 | std::fstream file_out("unittest.weightfile", std::ios_base::out | std::ios_base::trunc); 180 | file_out << classifier << std::endl; 181 | file_out.close(); 182 | 183 | std::fstream file_in("unittest.weightfile", std::ios_base::in); 184 | FastBDT::Classifier classifier2(file_in); 185 | file_in.close(); 186 | 187 | float score2 = GetIrisScore(classifier2); 188 | 189 | EXPECT_FLOAT_EQ(score1, score2); 190 | } 191 | 192 | -------------------------------------------------------------------------------- /src/Classifier.cxx: -------------------------------------------------------------------------------- 1 | /* 2 | * Thomas Keck 2017 3 | * 4 | * 
Simplified sklearn interface 5 | */ 6 | 7 | 8 | #include "Classifier.h" 9 | #include <iostream> 10 | 11 | namespace FastBDT { 12 | 13 | void Classifier::fit(const std::vector<std::vector<float>> &X, const std::vector<bool> &y, const std::vector<Weight> &w) { 14 | 15 | if(static_cast<int>(X.size()) - static_cast<int>(m_numberOfFlatnessFeatures) <= 0) { 16 | throw std::runtime_error("FastBDT requires at least one feature"); 17 | } 18 | m_numberOfFeatures = X.size() - m_numberOfFlatnessFeatures ; 19 | 20 | if(m_binning.size() == 0) { 21 | for(unsigned int i = 0; i < X.size(); ++i) 22 | m_binning.push_back(8); 23 | } 24 | 25 | if(m_numberOfFeatures + m_numberOfFlatnessFeatures != m_binning.size()) { 26 | throw std::runtime_error("Number of features must be equal to the number of provided binnings"); 27 | } 28 | 29 | if(m_purityTransformation.size() == 0) { 30 | for(unsigned int i = 0; i < m_binning.size() - m_numberOfFlatnessFeatures; ++i) 31 | m_purityTransformation.push_back(false); 32 | } 33 | 34 | for(auto p : m_purityTransformation) 35 | if(p) 36 | m_can_use_fast_forest = false; 37 | 38 | if(m_numberOfFeatures != m_purityTransformation.size()) { 39 | throw std::runtime_error("Number of ordinary features must be equal to the number of provided purityTransformation flags."); 40 | } 41 | 42 | unsigned int numberOfEvents = X[0].size(); 43 | if(numberOfEvents == 0) { 44 | throw std::runtime_error("FastBDT requires at least one event"); 45 | } 46 | 47 | if(numberOfEvents != y.size()) { 48 | throw std::runtime_error("Number of data-points X doesn't match the numbers of labels y"); 49 | } 50 | 51 | if(numberOfEvents != w.size()) { 52 | throw std::runtime_error("Number of data-points X doesn't match the numbers of weights w"); 53 | } 54 | 55 | m_numberOfFinalFeatures = m_numberOfFeatures; 56 | for(unsigned int iFeature = 0; iFeature < m_numberOfFeatures; ++iFeature) { 57 | auto feature = X[iFeature]; 58 | m_featureBinning.push_back(FeatureBinning<float>(m_binning[iFeature], feature)); 59 | if(m_purityTransformation[iFeature]) { 60 | m_numberOfFinalFeatures++; 61 | std::vector<unsigned int> feature(numberOfEvents); 62 | for(unsigned int iEvent = 0; iEvent < numberOfEvents; ++iEvent) { 63 | feature[iEvent] = m_featureBinning[iFeature].ValueToBin(X[iFeature][iEvent]); 64 | } 65 | m_purityBinning.push_back(PurityTransformation(m_binning[iFeature], feature, w, y)); 66 | m_binning.insert(m_binning.begin() + iFeature + 1, m_binning[iFeature]); 67 | } 68 | } 69 | 70 | for(unsigned int iFeature = 0; iFeature < m_numberOfFlatnessFeatures; ++iFeature) { 71 | auto feature = X[iFeature + m_numberOfFeatures]; 72 | m_featureBinning.push_back(FeatureBinning<float>(m_binning[iFeature + m_numberOfFinalFeatures], feature)); 73 | } 74 | 75 | EventSample eventSample(numberOfEvents, m_numberOfFinalFeatures, m_numberOfFlatnessFeatures, m_binning); 76 | std::vector<unsigned int> bins(m_numberOfFinalFeatures+m_numberOfFlatnessFeatures); 77 | 78 | for(unsigned int iEvent = 0; iEvent < numberOfEvents; ++iEvent) { 79 | unsigned int bin = 0; 80 | unsigned int pFeature = 0; 81 | for(unsigned int iFeature = 0; iFeature < m_numberOfFeatures; ++iFeature) { 82 | bins[bin] = m_featureBinning[iFeature].ValueToBin(X[iFeature][iEvent]); 83 | bin++; 84 | if(m_purityTransformation[iFeature]) { 85 | bins[bin] = m_purityBinning[pFeature].BinToPurityBin(bins[bin-1]); 86 | pFeature++; 87 | bin++; 88 | } 89 | } 90 | for(unsigned int iFeature = 0; iFeature < m_numberOfFlatnessFeatures; ++iFeature) { 91 | bins[bin] = m_featureBinning[iFeature + m_numberOfFeatures].ValueToBin(X[iFeature + m_numberOfFeatures][iEvent]); 92 | bin++; 93 | }
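// At this point "bins" holds the complete binned row for this event: each
// ordinary feature, immediately followed by its purity-transformed bin if the
// corresponding flag is set, with the binned flatness features appended at the end.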
94 | eventSample.AddEvent(bins, w[iEvent], y[iEvent] == 1); 95 | } 96 | 97 | m_featureBinning.resize(m_numberOfFeatures); 98 | 99 | ForestBuilder df(eventSample, m_nTrees, m_shrinkage, m_subsample, m_depth, m_sPlot, m_flatnessLoss); 100 | if(m_can_use_fast_forest) { 101 | Forest<float> temp_forest( df.GetShrinkage(), df.GetF0(), m_transform2probability); 102 | for( auto t : df.GetForest() ) { 103 | temp_forest.AddTree(removeFeatureBinningTransformationFromTree(t, m_featureBinning)); 104 | } 105 | m_fast_forest = temp_forest; 106 | } else { 107 | Forest<unsigned int> temp_forest(df.GetShrinkage(), df.GetF0(), m_transform2probability); 108 | for( auto t : df.GetForest() ) { 109 | temp_forest.AddTree(t); 110 | } 111 | m_binned_forest = temp_forest; 112 | } 113 | 114 | } 115 | 116 | void Classifier::Print() { 117 | 118 | std::cout << "NTrees " << m_nTrees << std::endl; 119 | std::cout << "Depth " << m_depth << std::endl; 120 | std::cout << "NumberOfFeatures " << m_numberOfFeatures << std::endl; 121 | 122 | } 123 | 124 | float Classifier::predict(const std::vector<float> &X) const { 125 | 126 | if(m_can_use_fast_forest) { 127 | return m_fast_forest.Analyse(X); 128 | } else { 129 | std::vector<unsigned int> bins(m_numberOfFinalFeatures); 130 | unsigned int bin = 0; 131 | unsigned int pFeature = 0; 132 | for(unsigned int iFeature = 0; iFeature < m_numberOfFeatures; ++iFeature) { 133 | bins[bin] = m_featureBinning[iFeature].ValueToBin(X[iFeature]); 134 | bin++; 135 | if(m_purityTransformation[iFeature]) { 136 | bins[bin] = m_purityBinning[pFeature].BinToPurityBin(bins[bin-1]); 137 | pFeature++; 138 | bin++; 139 | } 140 | } 141 | return m_binned_forest.Analyse(bins); 142 | } 143 | } 144 | 145 | std::map<unsigned int, double> Classifier::GetIndividualVariableRanking(const std::vector<float> &X) const { 146 | 147 | std::map<unsigned int, double> ranking; 148 | 149 | if(m_can_use_fast_forest) { 150 | ranking = m_fast_forest.GetIndividualVariableRanking(X); 151 | } else { 152 | std::vector<unsigned int> bins(m_numberOfFinalFeatures); 153 | unsigned int bin = 0; 154 | unsigned int pFeature = 0; 155 | for(unsigned int iFeature = 0; iFeature < m_numberOfFeatures; ++iFeature) { 156 | bins[bin] = m_featureBinning[iFeature].ValueToBin(X[iFeature]); 157 | bin++; 158 | if(m_purityTransformation[iFeature]) { 159 | bins[bin] = m_purityBinning[pFeature].BinToPurityBin(bins[bin-1]); 160 | pFeature++; 161 | bin++; 162 | } 163 | } 164 | ranking = m_binned_forest.GetIndividualVariableRanking(bins); 165 | } 166 | 167 | return MapRankingToOriginalFeatures(ranking); 168 | } 169 | 170 | std::map<unsigned int, unsigned int> Classifier::GetFeatureMapping() const { 171 | 172 | std::map<unsigned int, unsigned int> transformed2original; 173 | unsigned int transformedFeature = 0; 174 | for(unsigned int originalFeature = 0; originalFeature < m_numberOfFeatures; ++originalFeature) { 175 | transformed2original[transformedFeature] = originalFeature; 176 | if(m_purityTransformation[originalFeature]) { 177 | transformedFeature++; 178 | transformed2original[transformedFeature] = originalFeature; 179 | } 180 | transformedFeature++; 181 | } 182 | 183 | return transformed2original; 184 | 185 | } 186 | 187 | std::map<unsigned int, double> Classifier::MapRankingToOriginalFeatures(std::map<unsigned int, double> ranking) const { 188 | auto transformed2original = GetFeatureMapping(); 189 | std::map<unsigned int, double> original_ranking; 190 | for(auto &pair : ranking) { 191 | if(original_ranking.find(transformed2original[pair.first]) == original_ranking.end()) 192 | original_ranking[transformed2original[pair.first]] = 0; 193 | original_ranking[transformed2original[pair.first]] += pair.second; 194 | } 195 | return original_ranking; 196 | } 197 | 198 | 199 | std::map<unsigned int, double>
Classifier::GetVariableRanking() const { 200 | std::map<unsigned int, double> ranking; 201 | if (m_can_use_fast_forest) 202 | ranking = m_fast_forest.GetVariableRanking(); 203 | else 204 | ranking = m_binned_forest.GetVariableRanking(); 205 | return MapRankingToOriginalFeatures(ranking); 206 | } 207 | 208 | 209 | std::ostream& operator<<(std::ostream& stream, const Classifier& classifier) { 210 | 211 | stream << classifier.m_version << std::endl; 212 | stream << classifier.m_nTrees << std::endl; 213 | stream << classifier.m_depth << std::endl; 214 | stream << classifier.m_binning << std::endl; 215 | stream << classifier.m_shrinkage << std::endl; 216 | stream << classifier.m_subsample << std::endl; 217 | stream << classifier.m_sPlot << std::endl; 218 | stream << classifier.m_flatnessLoss << std::endl; 219 | stream << classifier.m_purityTransformation << std::endl; 220 | stream << classifier.m_transform2probability << std::endl; 221 | stream << classifier.m_featureBinning << std::endl; 222 | stream << classifier.m_purityBinning << std::endl; 223 | stream << classifier.m_numberOfFeatures << std::endl; 224 | stream << classifier.m_numberOfFinalFeatures << std::endl; 225 | stream << classifier.m_numberOfFlatnessFeatures << std::endl; 226 | stream << classifier.m_can_use_fast_forest << std::endl; 227 | stream << classifier.m_fast_forest << std::endl; 228 | stream << classifier.m_binned_forest << std::endl; 229 | 230 | return stream; 231 | } 232 | 233 | } 234 | -------------------------------------------------------------------------------- /src/test_Performance.cxx: -------------------------------------------------------------------------------- 1 | /** 2 | * Thomas Keck 2017 3 | */ 4 | 5 | #include "FastBDT.h" 6 | 7 | #include <gtest/gtest.h> 8 | 9 | #include <chrono> 10 | #include <random> 11 | #include <vector> 12 | #include <functional> 13 | #include <algorithm> 14 | 15 | using namespace FastBDT; 16 | 17 | class PerformanceFeatureBinningTest : public ::testing::Test { 18 | protected: 19 | virtual void SetUp() { 20 | std::default_random_engine generator; 21 | std::uniform_real_distribution<double> distribution(0.0,1.0); 22 | unsigned int N = 10000000; 23 | data.resize(N); 24 | for(unsigned int i = 0; i < N; ++i) { 25 | data[i] = distribution(generator); 26 | } 27 | } 28 | 29 | std::vector<float> data; 30 | 31 | }; 32 | 33 | 34 | TEST_F(PerformanceFeatureBinningTest, FeatureBinningScalesLinearInNumberOfDataPoints) { 35 | 36 | // This is dominated by the sorting of the numbers -> N log (N), 37 | // for our purposes we assume just N, which seems to be fine 38 | // if this unittest starts failing I have to revise this and add the factor of log(N) 39 | 40 | std::vector<unsigned int> sizes = {1000, 10000, 100000, 1000000}; 41 | std::vector<double> times; 42 | 43 | for( auto &size : sizes ) { 44 | std::vector<float> temp_data(data.begin(), data.begin() + size); 45 | std::chrono::high_resolution_clock::time_point start = std::chrono::high_resolution_clock::now(); 46 | FeatureBinning<float> binning(4, temp_data); 47 | std::chrono::high_resolution_clock::time_point stop = std::chrono::high_resolution_clock::now(); 48 | 49 | // We check something simple, so that we are sure that the compiler cannot optimize out the binning itself 50 | EXPECT_EQ(binning.GetNLevels(), 4u); 51 | 52 | std::chrono::duration<double> time = stop - start; 53 | times.push_back(time.count()); 54 | } 55 | 56 | // Check linear behaviour 57 | for(unsigned int i = 1; i < sizes.size(); ++i) { 58 | double size_ratio = sizes[i] / static_cast<double>(sizes[0]); 59 | double time_ratio = times[i] / static_cast<double>(times[0]); 60 | // We allow for deviation of factor two 61 | EXPECT_LT(time_ratio,
size_ratio * 2.0); 62 | } 63 | 64 | } 65 | 66 | 67 | TEST_F(PerformanceFeatureBinningTest, FeatureBinningScalesConstantInSmallNumberOfLayers) { 68 | 69 | // The feature binning should be dominated by the sorting of the numbers 70 | // hence it does not scale with the number of layers to first order 71 | // for large layers this will be wrong ~ #Layer > 17 72 | std::vector sizes = {2, 3, 5, 7, 11, 13, 17}; 73 | std::vector times; 74 | 75 | for( auto &size : sizes ) { 76 | std::chrono::high_resolution_clock::time_point start = std::chrono::high_resolution_clock::now(); 77 | FeatureBinning binning(size, data); 78 | std::chrono::high_resolution_clock::time_point stop = std::chrono::high_resolution_clock::now(); 79 | 80 | // We check something simple, so that we are sure that the compiler cannot optimize out the binning itself 81 | EXPECT_EQ(binning.GetNLevels(), size); 82 | 83 | std::chrono::duration time = stop - start; 84 | times.push_back(time.count()); 85 | } 86 | 87 | // Check linear behaviour 88 | // We ignore the first measurement, to avoids effects of caching 89 | for(unsigned int i = 2; i < sizes.size(); ++i) { 90 | double time_ratio = times[i] / static_cast(times[1]); 91 | EXPECT_GT(time_ratio, 0.8); 92 | EXPECT_LT(time_ratio, 1.2); 93 | } 94 | 95 | } 96 | 97 | class PerformanceTreeBuilderTest : public ::testing::Test { 98 | protected: 99 | std::default_random_engine generator; 100 | std::uniform_int_distribution distribution{0, 16}; 101 | }; 102 | 103 | TEST_F(PerformanceTreeBuilderTest, TreeBuilderScalesLinearInNumberOfDataPoints) { 104 | 105 | auto random_source = std::bind(distribution, generator); 106 | 107 | unsigned int nFeatures = 10; 108 | unsigned int nLayers = 4; 109 | 110 | std::vector sizes = {1000, 10000, 100000, 1000000, 10000000}; 111 | std::vector times; 112 | 113 | for( auto &size : sizes ) { 114 | unsigned int nDataPoints = size; 115 | std::vector row(nFeatures); 116 | std::vector binning_levels(nFeatures, 4); 117 | 118 | EventSample sample(nDataPoints, nFeatures, 0, binning_levels); 119 | for(unsigned int i = 0; i < nDataPoints; ++i) { 120 | std::generate_n(row.begin(), nFeatures, random_source); 121 | sample.AddEvent( row, 1.0, i % 2 == 0); 122 | } 123 | 124 | std::chrono::high_resolution_clock::time_point start = std::chrono::high_resolution_clock::now(); 125 | TreeBuilder dt(nLayers, sample); 126 | std::chrono::high_resolution_clock::time_point stop = std::chrono::high_resolution_clock::now(); 127 | 128 | // We check something simple, so that we are sure that the compiler cannot optimize out the binning itself 129 | const auto &purities = dt.GetPurities(); 130 | EXPECT_EQ(purities.size(), static_cast((1 << (nLayers+1)) - 1)); 131 | 132 | std::chrono::duration time = stop - start; 133 | times.push_back(time.count()); 134 | } 135 | 136 | // Check linear behaviour 137 | for(unsigned int i = 1; i < sizes.size(); ++i) { 138 | double size_ratio = sizes[i] / static_cast(sizes[0]); 139 | double time_ratio = times[i] / static_cast(times[0]); 140 | // We allow for deviation of factor two 141 | EXPECT_LT(time_ratio, size_ratio * 2.0); 142 | } 143 | 144 | 145 | } 146 | 147 | TEST_F(PerformanceTreeBuilderTest, TreeBuilderScalesLinearInNumberOfFeatures) { 148 | 149 | auto random_source = std::bind(distribution, generator); 150 | 151 | unsigned int nLayers = 4; 152 | unsigned int nDataPoints = 100000; 153 | 154 | std::vector sizes = {1, 2, 4, 8, 16, 32, 64, 128, 256, 512}; 155 | std::vector times; 156 | 157 | for( auto &size : sizes ) { 158 | unsigned int nFeatures = size; 
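// Each feature uses 4 binning levels (2^4 = 16 bins), and every event is
// filled with uniformly distributed random bin indices, so only the number
// of features varies between measurements.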
159 | std::vector row(nFeatures); 160 | std::vector binning_levels(nFeatures, 4); 161 | 162 | EventSample sample(nDataPoints, nFeatures, 0, binning_levels); 163 | for(unsigned int i = 0; i < nDataPoints; ++i) { 164 | std::generate_n(row.begin(), nFeatures, random_source); 165 | sample.AddEvent( row, 1.0, i % 2 == 0); 166 | } 167 | 168 | std::chrono::high_resolution_clock::time_point start = std::chrono::high_resolution_clock::now(); 169 | TreeBuilder dt(nLayers, sample); 170 | std::chrono::high_resolution_clock::time_point stop = std::chrono::high_resolution_clock::now(); 171 | 172 | // We check something simple, so that we are sure that the compiler cannot optimize out the binning itself 173 | const auto &purities = dt.GetPurities(); 174 | EXPECT_EQ(purities.size(), static_cast((1 << (nLayers+1)) - 1)); 175 | 176 | std::chrono::duration time = stop - start; 177 | times.push_back(time.count()); 178 | } 179 | 180 | // Check linear behaviour 181 | // We ignore the first measurement, to avoids effects of caching 182 | for(unsigned int i = 2; i < sizes.size(); ++i) { 183 | double size_ratio = sizes[i] / static_cast(sizes[1]); 184 | double time_ratio = times[i] / static_cast(times[1]); 185 | // We allow for deviation of factor two 186 | EXPECT_LT(time_ratio, size_ratio * 2.0); 187 | } 188 | } 189 | 190 | 191 | TEST_F(PerformanceTreeBuilderTest, TreeBuilderScalesLinearForSmallNumberOfLayers) { 192 | 193 | // For small numbers of layers (below 10) we should scale linear, 194 | // above the number of nodes in the deeper layers of the tree gets in the same order 195 | // of magnitude as the number of data_points and the summing of the histograms 196 | // becomes important 197 | auto random_source = std::bind(distribution, generator); 198 | 199 | unsigned int nFeatures = 10; 200 | unsigned int nDataPoints = 100000; 201 | 202 | std::vector sizes = {1, 2, 3, 5, 7, 11, 13}; 203 | std::vector times; 204 | 205 | std::vector row(nFeatures); 206 | std::vector binning_levels(nFeatures, 4); 207 | EventSample sample(nDataPoints, nFeatures, 0, binning_levels); 208 | for(unsigned int i = 0; i < nDataPoints; ++i) { 209 | std::generate_n(row.begin(), nFeatures, random_source); 210 | sample.AddEvent( row, 1.0, i % 2 == 0); 211 | } 212 | 213 | for( auto &size : sizes ) { 214 | unsigned int nLayers = size; 215 | 216 | // Reset flags, so we can use the sample multiple times 217 | auto &flags = sample.GetFlags(); 218 | for(unsigned int iEvent = 0; iEvent < nDataPoints; ++iEvent) 219 | flags.Set(iEvent, 1); 220 | 221 | std::chrono::high_resolution_clock::time_point start = std::chrono::high_resolution_clock::now(); 222 | TreeBuilder dt(nLayers, sample); 223 | std::chrono::high_resolution_clock::time_point stop = std::chrono::high_resolution_clock::now(); 224 | 225 | // We check something simple, so that we are sure that the compiler cannot optimize out the binning itself 226 | const auto &purities = dt.GetPurities(); 227 | EXPECT_EQ(purities.size(), static_cast((1 << (nLayers+1)) - 1)); 228 | 229 | std::chrono::duration time = stop - start; 230 | times.push_back(time.count()); 231 | } 232 | 233 | // Check linear behaviour 234 | // We ignore the first measurement, to avoids effects of caching 235 | for(unsigned int i = 2; i < sizes.size(); ++i) { 236 | double size_ratio = sizes[i] / static_cast(sizes[1]); 237 | double time_ratio = times[i] / static_cast(times[1]); 238 | // We allow for deviation of factor two 239 | EXPECT_LT(time_ratio, size_ratio * 2.0); 240 | } 241 | } 242 | 
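The same kind of scaling check can be sketched from Python against the PyFastBDT interface. The snippet below is only an illustrative sketch under assumptions not in the source (PyFastBDT built and importable, synthetic uniform data, a single noisy wall-clock measurement per size); it mirrors the factor-of-two tolerance used by the C++ performance tests above:

import numpy as np
from timeit import default_timer as timer
from PyFastBDT import FastBDT

sizes = [1000, 10000, 100000]
times = []
for n in sizes:
    # Synthetic data: 10 uniform features, random binary labels
    X = np.random.rand(n, 10)
    y = np.random.rand(n) > 0.5
    start = timer()
    FastBDT.Classifier(nTrees=10, depth=4).fit(X=X, y=y)
    times.append(timer() - start)

# Training time should grow roughly linearly with the number of events;
# allow the same factor-of-two slack as the C++ performance tests.
for i in range(1, len(sizes)):
    assert times[i] / times[0] < 2.0 * sizes[i] / sizes[0]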
-------------------------------------------------------------------------------- /examples/performance.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # A Python version of the performance measurement script 4 | # I didn't use this in the paper 5 | 6 | import sys 7 | sys.path.append('../FastBDT/python') 8 | sys.path.append('../xgboost/python') 9 | import numpy as np 10 | from sklearn.ensemble import GradientBoostingClassifier 11 | import FastBDT 12 | import pickle 13 | import scipy.sparse 14 | import xgboost as xgb 15 | import ROOT 16 | from ROOT import TMVA 17 | ROOT.TMVA.Tools.Instance() 18 | import array 19 | 20 | from timeit import default_timer as timer 21 | 22 | class Data(object): 23 | def __init__(self, datafile, numberOfFeatures, numberOfEvents): 24 | data = np.loadtxt(datafile, skiprows=1, dtype=np.float64) 25 | self.numberOfFeatures = numberOfFeatures 26 | self.numberOfEvents = numberOfEvents 27 | self.X = data[:numberOfEvents, :numberOfFeatures].astype(np.float64) 28 | self.y = data[:numberOfEvents, -1].astype(np.uint32) 29 | 30 | 31 | class Config(object): 32 | def __init__(self, numberOfFeatures, numberOfEvents, nTrees, depth, shrinkage, subSampling, nCutLevels): 33 | self.numberOfFeatures = numberOfFeatures 34 | self.numberOfEvents = numberOfEvents 35 | self.nTrees = nTrees 36 | self.depth = depth 37 | self.shrinkage = shrinkage 38 | self.subSampling = subSampling 39 | self.nCutLevels = nCutLevels 40 | 41 | 42 | class Result(object): 43 | def __init__(self, label, probabilities, preprocessingTime, trainingTime, testTime): 44 | self.label = label 45 | self.probabilities = probabilities 46 | self.preprocessingTime = preprocessingTime 47 | self.trainingTime = trainingTime 48 | self.testTime = testTime 49 | 50 | 51 | def writeResults(filename, results, test, config): 52 | with open(filename, 'w') as f: 53 | f.write("{c.nTrees} {c.depth} {c.shrinkage} {c.subSampling} {c.nCutLevels} {c.numberOfFeatures} {c.numberOfEvents}\n".format(c=config)) 54 | f.write(" ".join(r.label for r in results) + "\n") 55 | f.write("PreprocessingTime: " + " ".join(str(r.preprocessingTime) for r in results) + "\n") 56 | f.write("TrainingTime: " + " ".join(str(r.trainingTime) for r in results) + "\n") 57 | f.write("TestTime: " + " ".join(str(r.testTime) for r in results) + "\n") 58 | 59 | for i in range(len(test.y)): 60 | f.write(" ".join(str(r.probabilities[i]) for r in results) + " " + str(test.y[i]) + "\n") 61 | 62 | 63 | def measureFastBDT(train, test, config): 64 | preprocessing_start = timer() 65 | preprocessing_stop = timer() 66 | preprocessingTime = preprocessing_stop - preprocessing_start 67 | print('PreprocessingTime', preprocessingTime) 68 | 69 | training_start = timer() 70 | forest = FastBDT.Classifier(config.nCutLevels, config.nTrees, config.depth, config.shrinkage, config.subSampling) 71 | forest.fit(train.X, train.y) 72 | training_stop = timer() 73 | trainingTime = training_stop - training_start 74 | print('TrainingTime', trainingTime) 75 | 76 | test_start = timer() 77 | probabilities = forest.predict(test.X) 78 | test_stop = timer() 79 | testTime = test_stop - test_start 80 | print('TestTime', testTime) 81 | return Result("FastBDT", probabilities, preprocessingTime, trainingTime, testTime) 82 | 83 | 84 | def measureSKLearn(train, test, config): 85 | preprocessing_start = timer() 86 | preprocessing_stop = timer() 87 | preprocessingTime = preprocessing_stop - preprocessing_start 88 | print('PreprocessingTime',
preprocessingTime) 89 | 90 | training_start = timer() 91 | forest = GradientBoostingClassifier(n_estimators=config.nTrees, learning_rate=config.shrinkage, max_depth=config.depth, random_state=0, subsample=config.subSampling) 92 | forest.fit(train.X, train.y) 93 | training_stop = timer() 94 | trainingTime = training_stop - training_start 95 | print('TrainingTime', trainingTime) 96 | 97 | test_start = timer() 98 | probabilities = forest.predict_proba(test.X)[:, 1] 99 | test_stop = timer() 100 | testTime = test_stop - test_start 101 | print('TestTime', testTime) 102 | return Result("SKLearn", probabilities, preprocessingTime, trainingTime, testTime) 103 | 104 | 105 | def measureXGBoost(train, test, config): 106 | preprocessing_start = timer() 107 | dtrain = xgb.DMatrix(train.X, label=train.y) 108 | dtest = xgb.DMatrix(test.X, label=test.y) 109 | preprocessing_stop = timer() 110 | preprocessingTime = preprocessing_stop - preprocessing_start 111 | print('PreprocessingTime', preprocessingTime) 112 | 113 | training_start = timer() 114 | param = {'max_depth':config.depth, 'eta':config.shrinkage, 'silent':1, 'objective':'binary:logistic', 'subsample': config.subSampling, 'nthread': 1} 115 | watchlist = [(dtrain,'train')] 116 | bst = xgb.train(param, dtrain, config.nTrees, watchlist) 117 | training_stop = timer() 118 | trainingTime = training_stop - training_start 119 | print('TrainingTime', trainingTime) 120 | 121 | test_start = timer() 122 | probabilities = bst.predict(dtest) 123 | test_stop = timer() 124 | testTime = test_stop - test_start 125 | print('TestTime', testTime) 126 | return Result("XGBoost", probabilities, preprocessingTime, trainingTime, testTime) 127 | 128 | 129 | def measureTMVA(train, test, config): 130 | preprocessing_start = timer() 131 | variables = ['index', 'chiProb', 'M', 'dr', 'dz', 'E', 'p', 'pz', 'pt', 'Kid', 'piid', 'Kz', 'piz', 'Kr', 'pir', 'Kz0', 'piz0', 'pi0M', 132 | 'gamma1E', 'gamma2E', 'pipi0M', 'KpiM', 'Kpi0M', 'errM', 'KpCMS', 'pipCMS', 'pi0pCMS', 'distance', 'gamma1clusterTiming', 133 | 'gamma2clusterTiming', 'gamma1E9E25', 'gamma2E9E25', 'nTracks', 'nECLClusters', 'nKLMClusters'] 134 | variables = variables[:config.numberOfFeatures] 135 | 136 | outputFile = ROOT.TFile("temp.root", "recreate") 137 | train_tree = ROOT.TTree("train_tree", "Training Tree") 138 | test_tree = ROOT.TTree("test_tree", "Test Tree") 139 | 140 | register = {v: array.array('f', [0]) for v in variables + ['isSignal']} 141 | for v in variables + ['isSignal']: 142 | train_tree.Branch(v, register[v], v + '/F') 143 | test_tree.Branch(v, register[v], v + '/F') 144 | 145 | for i, row in enumerate(train.X): 146 | for j, v in enumerate(variables): 147 | register[v][0] = row[j] 148 | register['isSignal'][0] = float(train.y[i]) 149 | train_tree.Fill() 150 | 151 | for i, row in enumerate(test.X): 152 | for j, v in enumerate(variables): 153 | register[v][0] = row[j] 154 | register['isSignal'][0] = float(test.y[i]) 155 | test_tree.Fill() 156 | preprocessing_stop = timer() 157 | preprocessingTime = preprocessing_stop - preprocessing_start 158 | print('PreprocessingTime', preprocessingTime) 159 | 160 | training_start = timer() 161 | factory = TMVA.Factory( "TMVAClassification", outputFile, 162 | "!V:Silent:Color:DrawProgressBar:Transformations=I;D;P;G,D:AnalysisType=Classification" ) 163 | factory.SetVerbose(False) 164 | for v in variables: 165 | factory.AddVariable(v, v, "", 'F') 166 | factory.SetInputTrees(train_tree, ROOT.TCut("isSignal == 1"), ROOT.TCut("isSignal == 0")) 167 | nsig =
np.sum(train.y) 168 | nbkg = np.sum(1 - train.y) 169 | factory.PrepareTrainingAndTestTree(ROOT.TCut(""), "nTrain_Signal={}:nTrain_Background={}:SplitMode=Block:NormMode=NumEvents:!V".format(nsig, nbkg) ) 170 | 171 | factory.BookMethod( TMVA.Types.kBDT, "BDTG", 172 | "!H:!V:NTrees={}:BoostType=Grad:Shrinkage={:.2f}:UseBaggedBoost:BaggedSampleFraction={:.2f}:nCuts={}:MaxDepth={}:IgnoreNegWeightsInTraining".format(config.nTrees, config.shrinkage, config.subSampling, 2**config.nCutLevels, config.depth) ) 173 | factory.TrainAllMethods() 174 | reader = ROOT.TMVA.Reader() 175 | reader.SetVerbose(False) 176 | for v in variables: 177 | reader.AddVariable(v, register[v]) 178 | reader.BookMVA("BDTG","weights/TMVAClassification_BDTG.weights.xml") 179 | training_stop = timer() 180 | trainingTime = training_stop - training_start 181 | print('TrainingTime', trainingTime) 182 | 183 | test_start = timer() 184 | probabilities = np.zeros(len(test.y)) 185 | for i in range(test_tree.GetEntries()): 186 | test_tree.GetEvent(i) 187 | probabilities[i] = reader.EvaluateMVA("BDTG") 188 | test_stop = timer() 189 | testTime = test_stop - test_start 190 | print('TestTime', testTime) 191 | 192 | return Result("TMVA", probabilities, preprocessingTime, trainingTime, testTime) 193 | 194 | 195 | i = 0 196 | def measure(config): 197 | global i 198 | load_start = timer() 199 | train = Data('data/train.csv', config.numberOfFeatures, config.numberOfEvents) 200 | test = Data('data/test.csv', config.numberOfFeatures, config.numberOfEvents) 201 | load_stop = timer() 202 | print('Load', load_stop - load_start) 203 | 204 | start = timer() 205 | resultTMVA = measureTMVA(train, test, config) 206 | stop = timer() 207 | print('measureTMVA', stop - start) 208 | 209 | start = timer() 210 | resultFastBDT = measureFastBDT(train, test, config) 211 | stop = timer() 212 | print('measureFastBDT', stop - start) 213 | 214 | start = timer() 215 | resultSKLearn = measureSKLearn(train, test, config) 216 | stop = timer() 217 | print('measureSKLearn', stop - start) 218 | 219 | start = timer() 220 | resultXGBoost = measureXGBoost(train, test, config) 221 | stop = timer() 222 | print('measureXGBoost', stop - start) 223 | 224 | i += 1 225 | writeResults('result_{}_python.txt'.format(i), [resultFastBDT, resultXGBoost, resultSKLearn, resultTMVA], test, config) 226 | 227 | 228 | if __name__ == '__main__': 229 | 230 | config = Config(numberOfFeatures=35, numberOfEvents=50000, nTrees=100, shrinkage=0.1, depth=3, nCutLevels=8, subSampling=0.5) 231 | for i in range(35, 36): 232 | config.numberOfFeatures = i 233 | measure(config) 234 | 235 | -------------------------------------------------------------------------------- /PyFastBDT/FastBDT.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os 4 | import numpy as np 5 | 6 | import ctypes 7 | import ctypes.util 8 | c_double_p = ctypes.POINTER(ctypes.c_double) 9 | c_float_p = ctypes.POINTER(ctypes.c_float) 10 | c_bool_p = ctypes.POINTER(ctypes.c_bool) 11 | c_uint_p = ctypes.POINTER(ctypes.c_uint) 12 | 13 | FastBDT_library = ctypes.cdll.LoadLibrary(os.path.join(os.path.dirname(__file__),'libFastBDT_CInterface.so')) 14 | 15 | FastBDT_library.Create.restype = ctypes.c_void_p 16 | FastBDT_library.Delete.argtypes = [ctypes.c_void_p] 17 | 18 | FastBDT_library.Load.argtypes = [ctypes.c_void_p, ctypes.c_char_p] 19 | FastBDT_library.Save.argtypes = [ctypes.c_void_p, ctypes.c_char_p] 20 | 21 | FastBDT_library.Fit.argtypes = [ctypes.c_void_p, c_float_p,
c_float_p, c_bool_p, ctypes.c_uint, ctypes.c_uint] 22 | 23 | FastBDT_library.Predict.argtypes = [ctypes.c_void_p, c_float_p] 24 | FastBDT_library.Predict.restype = ctypes.c_float 25 | 26 | FastBDT_library.PredictArray.argtypes = [ctypes.c_void_p, c_float_p, c_float_p, ctypes.c_uint] 27 | 28 | FastBDT_library.SetSubsample.argtypes = [ctypes.c_void_p, ctypes.c_double] 29 | FastBDT_library.GetSubsample.argtypes = [ctypes.c_void_p] 30 | FastBDT_library.GetSubsample.restype = ctypes.c_double 31 | 32 | FastBDT_library.SetShrinkage.argtypes = [ctypes.c_void_p, ctypes.c_double] 33 | FastBDT_library.GetShrinkage.argtypes = [ctypes.c_void_p] 34 | FastBDT_library.GetShrinkage.restype = ctypes.c_double 35 | 36 | FastBDT_library.SetFlatnessLoss.argtypes = [ctypes.c_void_p, ctypes.c_double] 37 | FastBDT_library.GetFlatnessLoss.argtypes = [ctypes.c_void_p] 38 | FastBDT_library.GetFlatnessLoss.restype = ctypes.c_double 39 | 40 | FastBDT_library.SetNTrees.argtypes = [ctypes.c_void_p, ctypes.c_uint] 41 | FastBDT_library.GetNTrees.argtypes = [ctypes.c_void_p] 42 | FastBDT_library.GetNTrees.restype = ctypes.c_uint 43 | 44 | FastBDT_library.SetNumberOfFlatnessFeatures.argtypes = [ctypes.c_void_p, ctypes.c_uint] 45 | FastBDT_library.GetNumberOfFlatnessFeatures.argtypes = [ctypes.c_void_p] 46 | FastBDT_library.GetNumberOfFlatnessFeatures.restype = ctypes.c_uint 47 | 48 | FastBDT_library.SetBinning.argtypes = [ctypes.c_void_p, c_uint_p, ctypes.c_uint] 49 | FastBDT_library.SetPurityTransformation.argtypes = [ctypes.c_void_p, c_uint_p, ctypes.c_uint] 50 | 51 | FastBDT_library.SetDepth.argtypes = [ctypes.c_void_p, ctypes.c_uint] 52 | FastBDT_library.GetDepth.argtypes = [ctypes.c_void_p] 53 | FastBDT_library.GetDepth.restype = ctypes.c_uint 54 | 55 | FastBDT_library.SetTransform2Probability.argtypes = [ctypes.c_void_p, ctypes.c_bool] 56 | FastBDT_library.GetTransform2Probability.argtypes = [ctypes.c_void_p] 57 | FastBDT_library.GetTransform2Probability.restype = ctypes.c_bool 58 | 59 | FastBDT_library.SetSPlot.argtypes = [ctypes.c_void_p, ctypes.c_bool] 60 | FastBDT_library.GetSPlot.argtypes = [ctypes.c_void_p] 61 | FastBDT_library.GetSPlot.restype = ctypes.c_bool 62 | 63 | 64 | FastBDT_library.GetVariableRanking.argtypes = [ctypes.c_void_p] 65 | FastBDT_library.GetVariableRanking.restype = ctypes.c_void_p 66 | FastBDT_library.DeleteVariableRanking.argtypes = [ctypes.c_void_p] 67 | FastBDT_library.ExtractNumberOfVariablesFromVariableRanking.argtypes = [ctypes.c_void_p] 68 | FastBDT_library.ExtractNumberOfVariablesFromVariableRanking.restype = ctypes.c_uint 69 | FastBDT_library.ExtractImportanceOfVariableFromVariableRanking.argtypes = [ctypes.c_void_p, ctypes.c_uint] 70 | FastBDT_library.ExtractImportanceOfVariableFromVariableRanking.restype = ctypes.c_double 71 | 72 | FastBDT_library.GetIndividualVariableRanking.argtypes = [ctypes.c_void_p, c_float_p] 73 | FastBDT_library.GetIndividualVariableRanking.restype = ctypes.c_void_p 74 | 75 | 76 | def PrintVersion(): 77 | FastBDT_library.PrintVersion() 78 | 79 | 80 | def calculate_roc_auc(p, t, w=None): 81 | """ 82 | Calculates the area under the receiver operating characteristic curve (AUC ROC) 83 | @param p np.array filled with the probability output of a classifier 84 | @param t np.array filled with the target (0 or 1) 85 | """ 86 | if w is None: 87 | w = np.ones(len(t)) 88 | N = w.sum() 89 | T = np.sum(t*w) 90 | t = t*w 91 | index = np.argsort(p) 92 | efficiency = (T - np.cumsum(t[index])) / float(T) 93 | purity = (T - np.cumsum(t[index])) / (N - np.cumsum(w)) 94 | purity =
np.where(np.isnan(purity), 0, purity) 95 | return np.abs(np.trapz(purity, efficiency)) 96 | 97 | 98 | class Classifier(object): 99 | def __init__(self, binning=[], nTrees=100, depth=3, shrinkage=0.1, subsample=0.5, transform2probability=True, purityTransformation=[], sPlot=False, flatnessLoss=-1.0, numberOfFlatnessFeatures=0): 100 | """ 101 | @param binning list of numbers with the power N used for each feature binning e.g. 8 means 2^8 bins 102 | @param nTrees number of trees 103 | @param shrinkage reduction factor of each tree, lower shrinkage leads to slower but more stable convergence 104 | @param subsample the ratio of samples used for each tree 105 | @param transform2probability whether to transform the output to a probability 106 | @param purityTransformation list of bools, defines for each feature whether a purity-transformed copy of the feature should be used in addition (this will slow down the inference) 107 | @param sPlot whether the special treatment for sPlot weights is used 108 | @param flatnessLoss if bigger than 0, the strength of the flatness boost against all flatness features 109 | @param numberOfFlatnessFeatures the number of flatness features, it is assumed that the last N features are the flatness features 110 | """ 111 | self.binning = binning 112 | self.nTrees = nTrees 113 | self.depth = depth 114 | self.shrinkage = shrinkage 115 | self.subsample = subsample 116 | self.transform2probability = transform2probability 117 | self.purityTransformation = purityTransformation 118 | self.sPlot = sPlot 119 | self.flatnessLoss = flatnessLoss 120 | self.numberOfFlatnessFeatures = numberOfFlatnessFeatures 121 | self.forest = self.create_forest() 122 | 123 | def create_forest(self): 124 | forest = FastBDT_library.Create() 125 | FastBDT_library.SetBinning(forest, np.array(self.binning, dtype=np.uint32).ctypes.data_as(c_uint_p), int(len(self.binning))) 126 | FastBDT_library.SetNTrees(forest, int(self.nTrees)) 127 | FastBDT_library.SetDepth(forest, int(self.depth)) 128 | FastBDT_library.SetNumberOfFlatnessFeatures(forest, int(self.numberOfFlatnessFeatures)) 129 | FastBDT_library.SetShrinkage(forest, float(self.shrinkage)) 130 | FastBDT_library.SetSubsample(forest, float(self.subsample)) 131 | FastBDT_library.SetFlatnessLoss(forest, float(self.flatnessLoss)) 132 | FastBDT_library.SetTransform2Probability(forest, bool(self.transform2probability)) 133 | FastBDT_library.SetSPlot(forest, bool(self.sPlot)) 134 | FastBDT_library.SetPurityTransformation(forest, np.array(self.purityTransformation, dtype=np.uint32).ctypes.data_as(c_uint_p), int(len(self.purityTransformation))) 135 | return forest 136 | 137 | def fit(self, X, y, weights=None): 138 | X_temp = np.require(X, dtype=np.float32, requirements=['A', 'W', 'C', 'O']) 139 | y_temp = np.require(y, dtype=np.bool, requirements=['A', 'W', 'C', 'O']) 140 | if weights is not None: 141 | w_temp = np.require(weights, dtype=np.float32, requirements=['A', 'W', 'C', 'O']) 142 | numberOfEvents, numberOfFeatures = X_temp.shape 143 | FastBDT_library.Fit(self.forest, X_temp.ctypes.data_as(c_float_p), 144 | w_temp.ctypes.data_as(c_float_p) if weights is not None else None, 145 | y_temp.ctypes.data_as(c_bool_p), int(numberOfEvents), int(numberOfFeatures)) 146 | return self 147 | 148 | def predict(self, X): 149 | X_temp = np.require(X, dtype=np.float32, requirements=['A', 'W', 'C', 'O']) 150 | N = len(X) 151 | p = np.require(np.zeros(N), dtype=np.float32, requirements=['A', 'W', 'C', 'O']) 152 | FastBDT_library.PredictArray(self.forest, X_temp.ctypes.data_as(c_float_p), p.ctypes.data_as(c_float_p), int(X_temp.shape[0])) 153 |
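        # PredictArray fills p in place: X_temp is guaranteed C-contiguous
        # float32 by np.require above, so row i of the flat buffer is event i
        # and p[i] is its classifier response.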
return p 154 | 155 | def predict_single(self, row): 156 | return FastBDT_library.Predict(self.forest, row.ctypes.data_as(c_float_p)) 157 | 158 | def save(self, weightfile): 159 | FastBDT_library.Save(self.forest, bytes(weightfile, 'utf-8')) 160 | 161 | def load(self, weightfile): 162 | FastBDT_library.Load(self.forest, bytes(weightfile, 'utf-8')) 163 | 164 | def individualFeatureImportance(self, X): 165 | X_temp = np.require(X, dtype=np.float32, requirements=['A', 'W', 'C', 'O']) 166 | _ranking = FastBDT_library.GetIndividualVariableRanking(self.forest, X_temp.ctypes.data_as(c_float_p)) 167 | ranking = dict() 168 | for i in range(FastBDT_library.ExtractNumberOfVariablesFromVariableRanking(_ranking)): 169 | ranking[i] = FastBDT_library.ExtractImportanceOfVariableFromVariableRanking(_ranking, int(i)) 170 | FastBDT_library.DeleteVariableRanking(_ranking) 171 | return ranking 172 | 173 | def internFeatureImportance(self): 174 | _ranking = FastBDT_library.GetVariableRanking(self.forest) 175 | ranking = dict() 176 | for i in range(FastBDT_library.ExtractNumberOfVariablesFromVariableRanking(_ranking)): 177 | ranking[i] = FastBDT_library.ExtractImportanceOfVariableFromVariableRanking(_ranking, int(i)) 178 | FastBDT_library.DeleteVariableRanking(_ranking) 179 | return ranking 180 | 181 | def externFeatureImportance(self, X, y, weights=None, X_test=None, y_test=None, weights_test=None): 182 | if X_test is None: 183 | X_test = X 184 | if y_test is None: 185 | y_test = y 186 | if weights_test is None: 187 | weights_test = weights 188 | numberOfEvents, numberOfFeatures = X.shape 189 | global_auc = calculate_roc_auc(self.predict(X_test), y_test, weights_test) 190 | forest = self.forest 191 | importances = self._externFeatureImportance(list(range(numberOfFeatures)), global_auc, X, y, weights, X_test, y_test, weights_test) 192 | self.forest = forest 193 | return importances 194 | 195 | def _externFeatureImportance(self, features, global_auc, X, y, weights, X_test, y_test, weights_test): 196 | importances = dict() 197 | for i in features: 198 | remaining_features = [f for f in features if f != i] 199 | X_temp = X[:, remaining_features] 200 | X_test_temp = X_test[:, remaining_features] 201 | self.forest = self.create_forest() 202 | self.fit(X_temp, y, weights) 203 | auc = calculate_roc_auc(self.predict(X_test_temp), y_test, weights_test) 204 | FastBDT_library.Delete(self.forest) 205 | importances[i] = global_auc - auc 206 | 207 | most_important = max(importances.keys(), key=lambda x: importances[x]) 208 | remaining_features = [v for v in features if v != most_important] 209 | if len(remaining_features) == 1: 210 | return importances 211 | 212 | importances = {most_important: importances[most_important]} 213 | rest = self._externFeatureImportance(remaining_features, global_auc - importances[most_important], X, y, weights, X_test, y_test, weights_test) 214 | importances.update(rest) 215 | return importances 216 | 217 | def __del__(self): 218 | FastBDT_library.Delete(self.forest) 219 | -------------------------------------------------------------------------------- /examples/splot.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('../python/') 3 | import FastBDT 4 | 5 | import matplotlib.pyplot as plt 6 | import matplotlib as mpl 7 | import numpy 8 | import numpy.linalg 9 | import pandas 10 | import seaborn 11 | import sklearn.metrics 12 | 13 | 14 | class Prior(object): 15 | def __init__(self, signal, bckgrd): 16 | self.signal_cdf, 
self.signal_pdf, self.signal_bins = calculate_cdf_and_pdf(signal)
17 |         self.bckgrd_cdf, self.bckgrd_pdf, self.bckgrd_bins = calculate_cdf_and_pdf(bckgrd)
18 |         # Avoid numerical instabilities
19 |         self.bckgrd_pdf[0] = self.bckgrd_pdf[-1] = 1
20 |         self.signal_yield = len(signal)
21 |         self.bckgrd_yield = len(bckgrd)
22 | 
23 |     def get_signal_pdf(self, X):
24 |         return self.signal_pdf[numpy.digitize(X, bins=self.signal_bins)]
25 | 
26 |     def get_bckgrd_pdf(self, X):
27 |         return self.bckgrd_pdf[numpy.digitize(X, bins=self.bckgrd_bins)]
28 | 
29 |     def get_signal_cdf(self, X):
30 |         return self.signal_cdf[numpy.digitize(X, bins=self.signal_bins)]
31 | 
32 |     def get_bckgrd_cdf(self, X):
33 |         return self.bckgrd_cdf[numpy.digitize(X, bins=self.bckgrd_bins)]
34 | 
35 |     def get_prior(self, X):
36 |         return self.get_signal_pdf(X) / (self.get_signal_pdf(X) + self.get_bckgrd_pdf(X))
37 | 
38 |     def get_signal_boost_weights(self, X):
39 |         return self.get_signal_cdf(X) / self.get_bckgrd_pdf(X)
40 | 
41 |     def get_bckgrd_boost_weights(self, X):
42 |         # NOT self.get_bckgrd_cdf() here, signal and background are handled asymmetrically!
43 |         return (1.0 - self.get_signal_cdf(X)) / self.get_bckgrd_pdf(X)
44 | 
45 |     def get_boost_weights(self, X):
46 |         return numpy.r_[self.get_signal_boost_weights(X), self.get_bckgrd_boost_weights(X)]
47 | 
48 |     def get_splot_weights(self, X):
49 |         pdfs = [self.get_signal_pdf(X), self.get_bckgrd_pdf(X)]
50 |         yields = [self.signal_yield, self.bckgrd_yield]
51 |         weights = calculate_splot_weights(pdfs, yields)
52 |         return numpy.r_[weights[0], weights[1]]
53 | 
54 |     def get_aplot_weights(self, X, boost_prediction):
55 |         reg_boost_prediction = boost_prediction * 0.99 + 0.005
56 |         weights = (self.get_signal_cdf(X) / reg_boost_prediction + (1.0 - self.get_signal_cdf(X)) / (1.0 - reg_boost_prediction)) / 2
57 |         return self.get_splot_weights(X) * numpy.r_[weights, weights]
58 | 
59 | 
60 | def calculate_cdf_and_pdf(X):
61 |     """
62 |     Calculates cdf and pdf of a given sample and adds under/overflow bins
63 |     @param X 1-d numpy.array
64 |     """
65 |     pdf, bins = numpy.histogram(X, bins=100, density=True)
66 |     cdf = numpy.cumsum(pdf * (bins - numpy.roll(bins, 1))[1:])
67 |     return numpy.hstack([0.0, cdf, 1.0]), numpy.hstack([0.0, pdf, 0.0]), bins
68 | 
69 | 
70 | def calculate_splot_weights(pdfs, yields):
71 |     """
72 |     Calculates sPlot weights using the pdfs
73 |     @param pdfs list of 1-d numpy.array with pdf values of the different components for each event
74 |     @param yields list of the yields of the different components
75 |     """
76 |     N_components = len(pdfs)
77 |     # Consistency checks
78 |     if N_components != len(yields):
79 |         raise RuntimeError("You have to provide the same number of pdfs and yields!")
80 |     if N_components < 2:
81 |         raise RuntimeError("Need at least two components!")
82 | 
83 |     # Calculate covariance matrix
84 |     inverse_covariance = numpy.zeros((N_components, N_components))
85 |     norm = sum((yields[k] * pdfs[k] for k in range(1, N_components)), yields[0] * pdfs[0])**2
86 |     for i in range(N_components):
87 |         for j in range(N_components):
88 |             inverse_covariance[i, j] = numpy.nansum(pdfs[i] * pdfs[j] / norm)
89 |     covariance = numpy.linalg.inv(inverse_covariance)
90 |     print(inverse_covariance)
91 |     print(covariance)
92 | 
93 |     # Return list of sPlot weights for each component
94 |     return [sum(covariance[n, k] * pdfs[k] for k in range(N_components)) /
95 |             sum(yields[k] * pdfs[k] for k in range(N_components)) for n in range(N_components)]
96 | 
97 | 
98 | def calculate_score(label, train_prediction,
test_prediction, train_truth, test_truth): 99 | train_fpr, train_tpr, train_thresholds = sklearn.metrics.roc_curve(train_truth, train_prediction) 100 | train_auc = sklearn.metrics.auc(train_fpr, train_tpr) 101 | #plt.plot(train_fpr, train_tpr, label=label + ' (Train) ROC Integral = {:.3}'.format(train_auc)) 102 | test_fpr, test_tpr, test_thresholds = sklearn.metrics.roc_curve(test_truth, test_prediction) 103 | test_auc = sklearn.metrics.auc(test_fpr, test_tpr) 104 | plt.plot(test_fpr, test_tpr, lw=4, label=label + ' ROC Integral = {:.3}'.format(test_auc)) 105 | #plt.legend() 106 | #plt.show() 107 | return train_auc, test_auc 108 | 109 | 110 | def combine_probabilities(p1, p2): 111 | return p1*p2 / (p1*p2 + (1-p1)*(1-p2)) 112 | 113 | 114 | if __name__ == '__main__': 115 | train_datafile = '../files/D0_2.txt' 116 | data = pandas.DataFrame.from_csv(train_datafile, sep=' ', index_col=None) 117 | df = data[data['distance'] < 0.1] 118 | N = len(df) // 2 119 | train_df = df.iloc[:N] 120 | print("Length training data", len(train_df)) 121 | test_df = df.iloc[N:] 122 | print("Length test data", len(test_df)) 123 | #keys = ['dM', 'chiProb', 'distance', 'gamma1E', 'gamma2E', 'gamma1clusterTiming', 'gamma2clusterTiming', 'gamma1E9E25', 'gamma2E9E25', 'nTracks'] 124 | keys = ['dM', 'Kpi0M', 'KpiM', 'chiProb', 'distance', 'gamma1E', 'gamma2E', 'gamma1clusterTiming', 'gamma2clusterTiming', 'gamma1E9E25', 'gamma2E9E25', 'nTracks'] 125 | #keys = ['dM', 'Kpi0M', 'KpiM', 'chiProb', 'distance', 'gamma1E', 'gamma2E', 'gamma1clusterTiming', 'gamma2clusterTiming', 'gamma1E9E25', 'gamma2E9E25', 'nTracks', 'dMBestCandidate'] 126 | 127 | signal = train_df[train_df.isSignal == 1][keys[0]].values 128 | bckgrd = train_df[train_df.isSignal == 0][keys[0]].values 129 | prior = Prior(signal, bckgrd) 130 | splot_weights = calculate_splot_weights([prior.get_signal_pdf(train_df[keys[0]].values), prior.get_bckgrd_pdf(train_df[keys[0]].values)], [len(signal), len(bckgrd)]) 131 | 132 | full_forest = FastBDT.Classifier().fit(X=train_df[keys].values, 133 | y=train_df['isSignal'].values) 134 | 135 | ordinary_forest = FastBDT.Classifier().fit(X=train_df[keys[1:]].values, 136 | y=train_df['isSignal'].values) 137 | 138 | splot_forest = FastBDT.Classifier().fit(X=numpy.r_[train_df[keys[1:]].values, train_df[keys[1:]].values], 139 | y=numpy.r_[numpy.ones(N), numpy.zeros(N)], 140 | weights=prior.get_splot_weights(train_df[keys[0]].values)) 141 | 142 | boost_forest = FastBDT.Classifier().fit(X=numpy.r_[train_df[keys[1:]].values, train_df[keys[1:]].values], 143 | y=numpy.r_[numpy.ones(N), numpy.zeros(N)], 144 | weights=prior.get_boost_weights(train_df[keys[0]].values)) 145 | aplot_forest = FastBDT.Classifier().fit(X=numpy.r_[train_df[keys[1:]].values, train_df[keys[1:]].values], 146 | y=numpy.r_[numpy.ones(N), numpy.zeros(N)], 147 | weights=prior.get_aplot_weights(train_df[keys[0]].values, boost_forest.predict(train_df[keys[1:]].values))) 148 | 149 | 150 | # Side-Band Subtraction 151 | signal_region = (train_df.dM.abs() < 0.05) 152 | neg_signal_region = (0.24 < train_df.dM.abs()) & (train_df.dM.abs() < 0.2849) 153 | bckgrd_region = (0.258 < train_df.dM.abs()) & (train_df.dM.abs() < 0.2746) 154 | print("SignalRegion:", "Signal", (signal_region & (train_df.isSignal == 1)).sum(), "Bckgrd", (signal_region & (train_df.isSignal == 0)).sum()) 155 | print("BckgrdRegion:", "Signal", (bckgrd_region & (train_df.isSignal == 1)).sum(), "Bckgrd", (bckgrd_region & (train_df.isSignal == 0)).sum()) 156 | print("NegSignalRegion:", "Signal", 
(neg_signal_region & (train_df.isSignal == 1)).sum(), "Bckgrd", (neg_signal_region & (train_df.isSignal == 0)).sum()) 157 | 158 | side_forest = FastBDT.Classifier().fit(X=numpy.r_[train_df[signal_region][keys[1:]].values, train_df[bckgrd_region][keys[1:]].values, train_df[neg_signal_region][keys[1:]].values], 159 | y=numpy.r_[numpy.ones(signal_region.sum()), numpy.zeros(bckgrd_region.sum()), numpy.ones(neg_signal_region.sum())], 160 | weights=numpy.r_[numpy.ones(signal_region.sum()), numpy.ones(bckgrd_region.sum()), -numpy.ones(neg_signal_region.sum())]) 161 | 162 | 163 | seaborn.set(font_scale=4.5) 164 | seaborn.distplot(train_df.dM.values, bins=200, kde=False, hist_kws={'range': (-0.3, 0.3)}, label='Data') 165 | seaborn.distplot(train_df[signal_region].dM.values, bins=200, kde=False, hist_kws={'range': (-0.3, 0.3)}, label='Signal Region') 166 | seaborn.distplot(train_df[neg_signal_region].dM.values, bins=200, kde=False, hist_kws={'range': (-0.3, 0.3)}, label='Negative Signal Region') 167 | seaborn.distplot(train_df[bckgrd_region].dM.values, bins=200, kde=False, hist_kws={'range': (-0.3, 0.3)}, label='Background Region') 168 | plt.xlim((-0.3,0.3)) 169 | plt.xlabel('Reconstructed Mass - Nominal Mass') 170 | plt.legend() 171 | figure = plt.gcf() # get current figure 172 | figure.set_size_inches(24, 16) 173 | plt.savefig('sideband.png') 174 | plt.clf() 175 | 176 | seaborn.distplot(train_df.dM.values, bins=200, kde=False, hist_kws={'range': (-0.3, 0.3)}, label='Signal Fit') 177 | seaborn.distplot(train_df[train_df.isSignal == 0].dM.values, kde=False, bins=200, hist_kws={'range': (-0.3, 0.3)}, label='Background Fit') 178 | plt.xlim((-0.3,0.3)) 179 | plt.xlabel('Reconstructed Mass - Nominal Mass') 180 | plt.legend() 181 | figure = plt.gcf() # get current figure 182 | figure.set_size_inches(24, 16) 183 | plt.savefig('splot.png') 184 | plt.clf() 185 | 186 | full_prediction_train = full_forest.predict(train_df[keys].values) 187 | ordinary_prediction_train = ordinary_forest.predict(train_df[keys[1:]].values) 188 | splot_prediction_train = splot_forest.predict(train_df[keys[1:]].values) 189 | aplot_prediction_train = aplot_forest.predict(train_df[keys[1:]].values) 190 | prior_prediction_train = prior.get_prior(train_df[keys[0]].values) 191 | side_prediction_train = side_forest.predict(train_df[keys[1:]].values) 192 | ordinary_prior_prediction_train = combine_probabilities(ordinary_prediction_train, prior_prediction_train) 193 | splot_prior_prediction_train = combine_probabilities(splot_prediction_train, prior_prediction_train) 194 | aplot_prior_prediction_train = combine_probabilities(aplot_prediction_train, prior_prediction_train) 195 | side_prior_prediction_train = combine_probabilities(side_prediction_train, prior_prediction_train) 196 | truth_train = train_df['isSignal'].values 197 | 198 | full_prediction_test = full_forest.predict(test_df[keys].values) 199 | ordinary_prediction_test = ordinary_forest.predict(test_df[keys[1:]].values) 200 | splot_prediction_test = splot_forest.predict(test_df[keys[1:]].values) 201 | aplot_prediction_test = aplot_forest.predict(test_df[keys[1:]].values) 202 | prior_prediction_test = prior.get_prior(test_df[keys[0]].values) 203 | side_prediction_test = side_forest.predict(test_df[keys[1:]].values) 204 | ordinary_prior_prediction_test = combine_probabilities(ordinary_prediction_test, prior_prediction_test) 205 | splot_prior_prediction_test = combine_probabilities(splot_prediction_test, prior_prediction_test) 206 | aplot_prior_prediction_test = 
combine_probabilities(aplot_prediction_test, prior_prediction_test)
207 |     side_prior_prediction_test = combine_probabilities(side_prediction_test, prior_prediction_test)
208 |     truth_test = test_df['isSignal'].values
209 | 
210 |     seaborn.set_palette("Set1", n_colors=10, desat=.5)
211 |     trivial_prior = train_df.isSignal.mean()
212 |     #calculate_score("Trivial", numpy.ones(len(truth_train))*trivial_prior, numpy.ones(len(truth_test))*trivial_prior, truth_train, truth_test)
213 |     calculate_score("Full", full_prediction_train, full_prediction_test, truth_train, truth_test)
214 |     calculate_score("Ordinary", ordinary_prediction_train, ordinary_prediction_test, truth_train, truth_test)
215 |     calculate_score("SPlot", splot_prediction_train, splot_prediction_test, truth_train, truth_test)
216 |     calculate_score("APlot", aplot_prediction_train, aplot_prediction_test, truth_train, truth_test)
217 |     calculate_score("Sideband", side_prediction_train, side_prediction_test, truth_train, truth_test)
218 |     calculate_score("Prior", prior_prediction_train, prior_prediction_test, truth_train, truth_test)
219 |     calculate_score("OrdinaryPrior", ordinary_prior_prediction_train, ordinary_prior_prediction_test, truth_train, truth_test)
220 |     calculate_score("SPlotPrior", splot_prior_prediction_train, splot_prior_prediction_test, truth_train, truth_test)
221 |     calculate_score("APlotPrior", aplot_prior_prediction_train, aplot_prior_prediction_test, truth_train, truth_test)
222 |     calculate_score("SidePrior", side_prior_prediction_train, side_prior_prediction_test, truth_train, truth_test)
223 |     plt.xlabel('False Positive Rate (Type I Error)')
224 |     plt.ylabel('True Positive Rate (Efficiency)')
225 |     plt.ylim((0.5, 1.0))
226 |     plt.xlim((0.0, 0.5))
227 |     plt.legend(loc='lower right')
228 |     figure = plt.gcf()  # get current figure
229 |     figure.set_size_inches(24, 16)
230 |     plt.savefig('splot_sideband_roc.png')
231 |     plt.clf()
232 | 
--------------------------------------------------------------------------------
/src/test_FastBDT_IO.cxx:
--------------------------------------------------------------------------------
1 | /**
2 |  * Thomas Keck 2014
3 |  */
4 | 
5 | #include "FastBDT.h"
6 | #include "FastBDT_IO.h"
7 | 
8 | #include <gtest/gtest.h>
9 | 
10 | #include <sstream>
11 | #include <limits>
12 | 
13 | using namespace FastBDT;
14 | 
15 | 
16 | class IOTest : public ::testing::Test {
17 |     protected:
18 |         virtual void SetUp() {}
19 |         virtual void TearDown() {}
20 | 
21 | };
22 | 
23 | 
24 | template<typename T>
25 | ::testing::AssertionResult CmpHelperFloatingPointEQNanSafe(const char* expected_expression, const char* actual_expression, T expected, T actual) {
26 |     if(std::isnan(expected) and std::isnan(actual)) {
27 |         return ::testing::AssertionSuccess();
28 |     }
29 |     return ::testing::internal::CmpHelperFloatingPointEQ<T>(expected_expression, actual_expression, expected, actual);
30 | }
31 | 
32 | 
33 | #define EXPECT_FLOAT_EQ_NAN_SAFE(x, y) EXPECT_PRED_FORMAT2(CmpHelperFloatingPointEQNanSafe<float>, x, y)
34 | #define EXPECT_DOUBLE_EQ_NAN_SAFE(x, y) EXPECT_PRED_FORMAT2(CmpHelperFloatingPointEQNanSafe<double>, x, y)
35 | 
36 | 
37 | TEST_F(IOTest, IOVector) {
38 | 
39 |     std::vector<float> before = {0.0, 1.0, 2.5, 3.2, -1.4, 0.0};
40 | 
41 |     std::stringstream stream;
42 |     stream << before;
43 | 
44 |     std::vector<float> after;
45 |     stream >> after;
46 | 
47 |     EXPECT_EQ(before.size(), after.size());
48 |     for(unsigned int i = 0; i < before.size() and i < after.size(); ++i)
49 |         EXPECT_FLOAT_EQ(before[i], after[i]);
50 | 
51 | }
52 | 
53 | TEST_F(IOTest, IOUsingSpecialValuesFloat) {
54 | 
55 |     std::vector<float> before =
{std::numeric_limits<float>::lowest(),
56 |                                  std::numeric_limits<float>::denorm_min(),
57 |                                  std::numeric_limits<float>::min(),
58 |                                  std::numeric_limits<float>::max(),
59 |                                  std::numeric_limits<float>::infinity(),
60 |                                  -std::numeric_limits<float>::infinity(),
61 |                                  std::numeric_limits<float>::quiet_NaN(),
62 |                                  std::numeric_limits<float>::signaling_NaN(),
63 |                                  0.0};
64 | 
65 |     std::stringstream stream;
66 |     stream << before;
67 | 
68 |     std::vector<float> after;
69 |     stream >> after;
70 | 
71 |     EXPECT_EQ(before.size(), after.size());
72 |     for(unsigned int i = 0; i < before.size() and i < after.size(); ++i)
73 |         EXPECT_FLOAT_EQ_NAN_SAFE(before[i], after[i]);
74 | 
75 | }
76 | 
77 | TEST_F(IOTest, IOUsingSpecialValuesDouble) {
78 | 
79 |     std::vector<double> before = {std::numeric_limits<double>::lowest(),
80 |                                   std::numeric_limits<double>::denorm_min(),
81 |                                   std::numeric_limits<double>::min(),
82 |                                   std::numeric_limits<double>::max(),
83 |                                   std::numeric_limits<double>::infinity(),
84 |                                   -std::numeric_limits<double>::infinity(),
85 |                                   std::numeric_limits<double>::quiet_NaN(),
86 |                                   std::numeric_limits<double>::signaling_NaN(),
87 |                                   0.0};
88 | 
89 |     std::stringstream stream;
90 |     stream << before;
91 | 
92 |     std::vector<double> after;
93 |     stream >> after;
94 | 
95 |     EXPECT_EQ(before.size(), after.size());
96 |     for(unsigned int i = 0; i < before.size() and i < after.size(); ++i)
97 |         EXPECT_DOUBLE_EQ_NAN_SAFE(before[i], after[i]);
98 | 
99 | }
100 | 
101 | TEST_F(IOTest, IOFeatureBinning) {
102 | 
103 |     std::vector<float> binning = { 1.0f, 7.0f, 4.0f, 10.0f, 12.0f };
104 |     FeatureBinning<float> before(2, binning);
105 |     const auto &before_binning = before.GetBinning();
106 | 
107 |     std::stringstream stream;
108 |     stream << before;
109 | 
110 |     auto after = readFeatureBinningFromStream<float>(stream);
111 |     const auto &after_binning = after.GetBinning();
112 | 
113 |     EXPECT_EQ(before.GetNLevels(), after.GetNLevels());
114 |     EXPECT_EQ(before_binning.size(), after_binning.size());
115 |     for(unsigned int i = 0; i < before_binning.size() and i < after_binning.size(); ++i)
116 |         EXPECT_FLOAT_EQ_NAN_SAFE(before_binning[i], after_binning[i]);
117 | 
118 | }
119 | 
120 | TEST_F(IOTest, IOFeatureBinningVector) {
121 | 
122 |     std::vector<float> binning1 = { 1.0f, 7.0f, 4.0f, 10.0f, 12.0f };
123 |     std::vector<float> binning2 = { 6.0f, 7.0f, 2.0f, 12.0f, 12.0f };
124 |     std::vector<FeatureBinning<float>> before = {FeatureBinning<float>(2, binning1),
125 |                                                  FeatureBinning<float>(2, binning2)};
126 | 
127 |     std::stringstream stream;
128 |     stream << before;
129 | 
130 |     std::vector<FeatureBinning<float>> after;
131 |     stream >> after;
132 | 
133 |     EXPECT_EQ(before.size(), after.size());
134 |     for(unsigned int j = 0; j < before.size() and j < after.size(); ++j) {
135 | 
136 |         auto &before_featureBinning = before[j];
137 |         auto &after_featureBinning = after[j];
138 |         const auto &after_binning = after_featureBinning.GetBinning();
139 |         const auto &before_binning = before_featureBinning.GetBinning();
140 | 
141 |         EXPECT_EQ(before_featureBinning.GetNLevels(), after_featureBinning.GetNLevels());
142 |         EXPECT_EQ(before_binning.size(), after_binning.size());
143 |         for(unsigned int i = 0; i < before_binning.size() and i < after_binning.size(); ++i)
144 |             EXPECT_FLOAT_EQ_NAN_SAFE(before_binning[i], after_binning[i]);
145 | 
146 |     }
147 | 
148 | }
149 | 
150 | TEST_F(IOTest, IOCut) {
151 | 
152 |     Cut<unsigned int> before;
153 |     before.feature = 1;
154 |     before.gain = 3.4;
155 |     before.index = 5;
156 |     before.valid = true;
157 | 
158 |     std::stringstream stream;
159 |     stream << before;
160 | 
161 |     Cut<unsigned int> after;
162 |     stream >> after;
163 | 
164 |     EXPECT_EQ(before.feature, after.feature);
165 |     EXPECT_EQ(before.gain, after.gain);
166 |     EXPECT_EQ(before.index, after.index);
167 |     EXPECT_EQ(before.valid,
after.valid);
168 | 
169 | }
170 | 
171 | TEST_F(IOTest, IOCutSpecialValuesFloat) {
172 | 
173 |     std::vector<float> values = {std::numeric_limits<float>::lowest(),
174 |                                  std::numeric_limits<float>::denorm_min(),
175 |                                  std::numeric_limits<float>::min(),
176 |                                  std::numeric_limits<float>::max(),
177 |                                  std::numeric_limits<float>::infinity(),
178 |                                  -std::numeric_limits<float>::infinity(),
179 |                                  std::numeric_limits<float>::quiet_NaN(),
180 |                                  std::numeric_limits<float>::signaling_NaN(),
181 |                                  0.0};
182 | 
183 |     for(auto &f : values) {
184 |         Cut<float> before;
185 |         before.feature = 1;
186 |         before.gain = 3.4;
187 |         before.index = f;
188 |         before.valid = true;
189 | 
190 |         std::stringstream stream;
191 |         stream << before;
192 | 
193 |         Cut<float> after;
194 |         stream >> after;
195 | 
196 |         EXPECT_EQ(before.feature, after.feature);
197 |         EXPECT_FLOAT_EQ(before.gain, after.gain);
198 |         EXPECT_FLOAT_EQ_NAN_SAFE(before.index, after.index);
199 |         EXPECT_EQ(before.valid, after.valid);
200 |     }
201 | 
202 | }
203 | 
204 | TEST_F(IOTest, IOCutSpecialValuesDouble) {
205 | 
206 |     std::vector<double> values = {std::numeric_limits<double>::lowest(),
207 |                                   std::numeric_limits<double>::denorm_min(),
208 |                                   std::numeric_limits<double>::min(),
209 |                                   std::numeric_limits<double>::max(),
210 |                                   std::numeric_limits<double>::infinity(),
211 |                                   -std::numeric_limits<double>::infinity(),
212 |                                   std::numeric_limits<double>::quiet_NaN(),
213 |                                   std::numeric_limits<double>::signaling_NaN(),
214 |                                   0.0};
215 | 
216 |     for(auto &f : values) {
217 |         Cut<double> before;
218 |         before.feature = 1;
219 |         before.gain = 3.4;
220 |         before.index = f;
221 |         before.valid = true;
222 | 
223 |         std::stringstream stream;
224 |         stream << before;
225 | 
226 |         Cut<double> after;
227 |         stream >> after;
228 | 
229 |         EXPECT_EQ(before.feature, after.feature);
230 |         EXPECT_FLOAT_EQ(before.gain, after.gain);
231 |         EXPECT_DOUBLE_EQ_NAN_SAFE(before.index, after.index);
232 |         EXPECT_EQ(before.valid, after.valid);
233 |     }
234 | 
235 | }
236 | 
237 | TEST_F(IOTest, IOTree) {
238 | 
239 |     Cut<unsigned int> cut1, cut2, cut3;
240 |     cut1.feature = 0;
241 |     cut1.index = 5;
242 |     cut1.valid = true;
243 |     cut1.gain = -3.0;
244 |     cut2.feature = 1;
245 |     cut2.index = 9;
246 |     cut2.gain = 1.0;
247 |     cut2.valid = true;
248 |     cut3.feature = 0;
249 |     cut3.index = 1;
250 |     cut3.gain = 0.0;
251 |     cut3.valid = false;
252 | 
253 |     std::vector<Cut<unsigned int>> before_cuts = {cut1, cut2, cut3};
254 |     std::vector<Weight> before_nEntries = { 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0 };
255 |     std::vector<Weight> before_purities = { 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7 };
256 |     std::vector<Weight> before_boostWeights = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0};
257 |     Tree<unsigned int> before(before_cuts, before_nEntries, before_purities, before_boostWeights);
258 | 
259 |     std::stringstream stream;
260 |     stream << before;
261 | 
262 |     auto after = readTreeFromStream<unsigned int>(stream);
263 |     const auto &after_cuts = after.GetCuts();
264 |     const auto &after_purities = after.GetPurities();
265 |     const auto &after_boostWeights = after.GetBoostWeights();
266 |     const auto &after_nEntries = after.GetNEntries();
267 | 
268 |     EXPECT_EQ(before_cuts.size(), after_cuts.size());
269 |     for(unsigned int i = 0; i < before_cuts.size() and i < after_cuts.size(); ++i) {
270 |         EXPECT_FLOAT_EQ(before_cuts[i].feature, after_cuts[i].feature);
271 |         EXPECT_FLOAT_EQ(before_cuts[i].valid, after_cuts[i].valid);
272 |         EXPECT_FLOAT_EQ(before_cuts[i].index, after_cuts[i].index);
273 |         EXPECT_FLOAT_EQ(before_cuts[i].gain, after_cuts[i].gain);
274 |     }
275 | 
276 |     EXPECT_EQ(before_purities.size(), after_purities.size());
277 |     for(unsigned int i = 0; i < before_purities.size() and i < after_purities.size(); ++i)
278 |         EXPECT_FLOAT_EQ(before_purities[i], after_purities[i]);
279 | 
280 |     EXPECT_EQ(before_boostWeights.size(), after_boostWeights.size());
281 |     for(unsigned int i = 0; i < before_boostWeights.size() and i < after_boostWeights.size(); ++i)
282 |         EXPECT_FLOAT_EQ(before_boostWeights[i], after_boostWeights[i]);
283 | 
284 |     EXPECT_EQ(before_nEntries.size(), after_nEntries.size());
285 |     for(unsigned int i = 0; i < before_nEntries.size() and i < after_nEntries.size(); ++i)
286 |         EXPECT_FLOAT_EQ(before_nEntries[i], after_nEntries[i]);
287 | 
288 | }
289 | 
290 | TEST_F(IOTest, IOForest) {
291 | 
292 |     Cut<unsigned int> cut1, cut2, cut3, cut4;
293 |     cut1.feature = 0;
294 |     cut1.index = 5;
295 |     cut1.valid = true;
296 |     cut1.gain = -3.0;
297 |     cut2.feature = 1;
298 |     cut2.index = 9;
299 |     cut2.gain = 1.0;
300 |     cut2.valid = true;
301 |     cut3.feature = 0;
302 |     cut3.index = 1;
303 |     cut3.gain = 0.0;
304 |     cut3.valid = false;
305 |     cut4.feature = 2;
306 |     cut4.index = 3;
307 |     cut4.valid = true;
308 |     cut4.gain = 1.61;
309 | 
310 |     std::vector<Weight> nEntries = { 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0 };
311 | 
312 |     Forest<unsigned int> before(0.5, 1.6, true);
313 |     before.AddTree(Tree<unsigned int>({cut1, cut2, cut3}, nEntries, { 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7 }, { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}));
314 |     before.AddTree(Tree<unsigned int>({cut1, cut4, cut3}, nEntries, { 0.6, 0.2, 0.5, 0.4, 0.5, 0.6, 0.7 }, { 2.0, 2.0, 3.0, 5.0, 5.0, 6.0, 1.0}));
315 |     const auto &before_forest = before.GetForest();
316 | 
317 |     std::stringstream stream;
318 |     stream << before;
319 | 
320 |     auto after = readForestFromStream<unsigned int>(stream);
321 |     const auto &after_forest = after.GetForest();
322 | 
323 |     EXPECT_EQ(before.GetTransform2Probability(), after.GetTransform2Probability());
324 |     EXPECT_EQ(before.GetF0(), after.GetF0());
325 |     EXPECT_EQ(before.GetShrinkage(), after.GetShrinkage());
326 | 
327 |     EXPECT_EQ(before_forest.size(), after_forest.size());
328 |     for(unsigned int j = 0; j < before_forest.size() and j < after_forest.size(); ++j) {
329 | 
330 |         auto &before_tree = before_forest[j];
331 |         const auto &before_cuts = before_tree.GetCuts();
332 |         const auto &before_purities = before_tree.GetPurities();
333 |         const auto &before_boostWeights = before_tree.GetBoostWeights();
334 |         const auto &before_nEntries = before_tree.GetNEntries();
335 | 
336 |         auto &after_tree = after_forest[j];
337 |         const auto &after_cuts = after_tree.GetCuts();
338 |         const auto &after_purities = after_tree.GetPurities();
339 |         const auto &after_boostWeights = after_tree.GetBoostWeights();
340 |         const auto &after_nEntries = after_tree.GetNEntries();
341 | 
342 |         EXPECT_EQ(before_cuts.size(), after_cuts.size());
343 |         for(unsigned int i = 0; i < before_cuts.size() and i < after_cuts.size(); ++i) {
344 |             EXPECT_FLOAT_EQ(before_cuts[i].feature, after_cuts[i].feature);
345 |             EXPECT_FLOAT_EQ(before_cuts[i].valid, after_cuts[i].valid);
346 |             EXPECT_FLOAT_EQ(before_cuts[i].index, after_cuts[i].index);
347 |             EXPECT_FLOAT_EQ(before_cuts[i].gain, after_cuts[i].gain);
348 |         }
349 | 
350 |         EXPECT_EQ(before_purities.size(), after_purities.size());
351 |         for(unsigned int i = 0; i < before_purities.size() and i < after_purities.size(); ++i)
352 |             EXPECT_FLOAT_EQ(before_purities[i], after_purities[i]);
353 | 
354 |         EXPECT_EQ(before_boostWeights.size(), after_boostWeights.size());
355 |         for(unsigned int i = 0; i < before_boostWeights.size() and i < after_boostWeights.size(); ++i)
356 |             EXPECT_FLOAT_EQ(before_boostWeights[i], after_boostWeights[i]);
357 | 
358 |         EXPECT_EQ(before_nEntries.size(), after_nEntries.size());
359 |         for(unsigned int i = 0; i < before_nEntries.size() and i <
after_nEntries.size(); ++i)
360 |             EXPECT_FLOAT_EQ(before_nEntries[i], after_nEntries[i]);
361 |     }
362 | }
363 | 
--------------------------------------------------------------------------------
/src/FastBDT.cxx:
--------------------------------------------------------------------------------
1 | /**
2 |  * Thomas Keck 2014
3 |  */
4 | 
5 | #include "FastBDT.h"
6 | #include "FastBDT_IO.h"
7 | 
8 | #include <algorithm>
9 | #include <iostream>
10 | 
11 | namespace FastBDT {
12 | 
13 |   std::vector<Weight> EventWeights::GetSums(unsigned int nSignals) const {
14 | 
15 |     // Vectorizing FTW!
16 |     std::vector<Weight> sums(3, 0);
17 |     for(unsigned int i = 0; i < nSignals; ++i) {
18 |       sums[0] += boost_weights[i] * original_weights[i];
19 |       sums[2] += boost_weights[i]*boost_weights[i] * original_weights[i];
20 |     }
21 | 
22 |     for(unsigned int i = nSignals; i < original_weights.size(); ++i) {
23 |       sums[1] += boost_weights[i] * original_weights[i];
24 |       sums[2] += boost_weights[i]*boost_weights[i] * original_weights[i];
25 |     }
26 |     return sums;
27 | 
28 |   }
29 | 
30 |   EventValues::EventValues(unsigned int nEvents, unsigned int nFeatures, unsigned int nSpectators, const std::vector<unsigned int> &nLevels) : values(nEvents*(nFeatures+nSpectators), 0), nFeatures(nFeatures), nSpectators(nSpectators) {
31 | 
32 |     if(nFeatures + nSpectators != nLevels.size()) {
33 |       throw std::runtime_error("Number of features must be the same as the number of provided binning levels! " + std::to_string(nFeatures) + " + " + std::to_string(nSpectators) + " vs " + std::to_string(nLevels.size()));
34 |     }
35 | 
36 |     nBins.reserve(nLevels.size());
37 |     for(auto& nLevel : nLevels)
38 |       nBins.push_back((1 << nLevel)+1);
39 | 
40 |     nBinSums.reserve(nLevels.size()+1);
41 |     nBinSums.push_back(0);
42 |     for(auto &nBin : nBins)
43 |       nBinSums.push_back(nBinSums.back() + nBin);
44 | 
45 |   }
46 | 
47 |   void EventValues::Set(unsigned int iEvent, const std::vector<unsigned int> &features) {
48 | 
49 |     // Check if the feature vector has the correct size
50 |     if(features.size() != nFeatures + nSpectators) {
51 |       throw std::runtime_error(std::string("Promised number of features are not provided. ") + std::to_string(features.size()) + " vs " + std::to_string(nFeatures) + " + " + std::to_string(nSpectators));
52 |     }
53 | 
54 |     // Check if the feature values are in the correct range
55 |     for(unsigned int iFeature = 0; iFeature < nFeatures+nSpectators; ++iFeature) {
56 |       if( features[iFeature] > nBins[iFeature] )
57 |         throw std::runtime_error(std::string("Promised number of bins is violated. ") + std::to_string(features[iFeature]) + " vs " + std::to_string(nBins[iFeature]));
58 |     }
59 | 
60 |     // Now add the new values to the values vector.
61 |     for(unsigned int iFeature = 0; iFeature < nFeatures+nSpectators; ++iFeature) {
62 |       values[iEvent*(nFeatures+nSpectators) + iFeature] = features[iFeature];
63 |     }
64 | 
65 |   }
66 | 
67 |   void EventSample::AddEvent(const std::vector<unsigned int> &features, Weight weight, bool isSignal) {
68 | 
69 |     // First check if we have enough space for an additional event, as the number of
70 |     // events is fixed in the constructor (to avoid time-consuming reallocations).
71 |     if(nSignals + nBckgrds == nEvents) {
72 |       throw std::runtime_error(std::string("Promised maximum number of events exceeded. ") + std::to_string(nSignals) + " + " + std::to_string(nBckgrds) + " vs " + std::to_string(nEvents) );
73 |     }
74 | 
75 |     if(std::isnan(weight)) {
76 |       throw std::runtime_error("NAN values as weights are not supported!");
77 |     }
78 | 
79 |     // Now add the weight and the features at the right position of the arrays.
80 |     // To do so, we calculate the correct index of this event. If it's a signal
81 |     // event we store it right after the last signal event, starting at the 0 position.
82 |     // If it's a background event, we store it right before the last added background event,
83 |     // starting at the nEvents-1 position. We also update the weight sums and amount counts.
84 |     unsigned int index = 0;
85 |     if( isSignal ) {
86 |       index = nSignals;
87 |       ++nSignals;
88 |     } else {
89 |       index = nEvents - 1 - nBckgrds;
90 |       ++nBckgrds;
91 |     }
92 |     weights.SetOriginalWeight(index, weight);
93 |     values.Set(index, features);
94 | 
95 |   }
96 | 
97 |   Weight LossFunction(const Weight &nSignal, const Weight &nBckgrd) {
98 |     // Gini-Index x total number of events (needed to calculate information gain efficiently)!
99 |     if( nSignal <= 0 or nBckgrd <= 0 )
100 |       return 0;
101 |     return (nSignal*nBckgrd)/(nSignal+nBckgrd);
102 |     //return (nSignal*nBckgrd)/((nSignal+nBckgrd)*(nSignal+nBckgrd));
103 |   }
104 | 
105 |   CumulativeDistributions::CumulativeDistributions(const unsigned int iLayer, const EventSample &sample) {
106 | 
107 |     const auto &values = sample.GetValues();
108 |     nFeatures = values.GetNFeatures();
109 |     nNodes = (1 << iLayer);
110 |     nBins = values.GetNBins();
111 |     nBinSums = values.GetNBinSums();
112 | 
113 |     signalCDFs = CalculateCDFs(sample, 0, sample.GetNSignals());
114 |     bckgrdCDFs = CalculateCDFs(sample, sample.GetNSignals(), sample.GetNEvents());
115 | 
116 |   }
117 | 
118 |   std::vector<Weight> CumulativeDistributions::CalculateCDFs(const EventSample &sample, const unsigned int firstEvent, const unsigned int lastEvent) const {
119 | 
120 |     const auto &values = sample.GetValues();
121 |     const auto &flags = sample.GetFlags();
122 |     const auto &weights = sample.GetWeights();
123 | 
124 |     std::vector<Weight> bins( nNodes*nBinSums[nFeatures] );
125 | 
126 |     // Fill Cut-PDFs for all nodes in this layer and for every feature
127 |     for(unsigned int iEvent = firstEvent; iEvent < lastEvent; ++iEvent) {
128 |       if( flags.Get(iEvent) < static_cast<int>(nNodes) )
129 |         continue;
130 |       const unsigned int index = (flags.Get(iEvent)-nNodes)*nBinSums[nFeatures];
131 |       for(unsigned int iFeature = 0; iFeature < nFeatures; ++iFeature ) {
132 |         const unsigned int subindex = nBinSums[iFeature] + values.Get(iEvent,iFeature);
133 |         bins[index+subindex] += weights.GetOriginalWeight(iEvent) * (weights.GetBoostWeight(iEvent) + weights.GetFlatnessWeight(iEvent));
134 |       }
135 |     }
136 | 
137 |     // Sum up Cut-PDFs to cumulative Cut-PDFs
138 |     for(unsigned int iNode = 0; iNode < nNodes; ++iNode) {
139 |       for(unsigned int iFeature = 0; iFeature < nFeatures; ++iFeature) {
140 |         // Start at 2, this ignores the NaN bin at 0!
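        // Layout reminder: bins[] is one flat array, and the value for
        // (node, feature, bin) lives at iNode*nBinSums[nFeatures] + nBinSums[iFeature] + iBin;
        // the in-place prefix sum below turns each per-feature histogram into a CDF.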
141 |         for(unsigned int iBin = 2; iBin < nBins[iFeature]; ++iBin) {
142 |           unsigned int index = iNode*nBinSums[nFeatures] + nBinSums[iFeature] + iBin;
143 |           bins[index] += bins[index-1];
144 |         }
145 |       }
146 |     }
147 | 
148 |     return bins;
149 |   }
150 | 
151 |   Cut<unsigned int> Node::CalculateBestCut(const CumulativeDistributions &CDFs) const {
152 | 
153 |     Cut<unsigned int> cut;
154 | 
155 |     const unsigned int nFeatures = CDFs.GetNFeatures();
156 |     const auto& nBins = CDFs.GetNBins();
157 | 
158 |     Weight currentLoss = LossFunction(signal, bckgrd);
159 |     if( currentLoss == 0 )
160 |       return cut;
161 | 
162 |     // Loop over all features and cuts and sum up signal and background histograms to cumulative histograms
163 |     for(unsigned int iFeature = 0; iFeature < nFeatures; ++iFeature) {
164 |       // Start at 2, this ignores the NaN bin at 0
165 |       for(unsigned int iCut = 2; iCut < nBins[iFeature]; ++iCut) {
166 |         Weight s = CDFs.GetSignal(iNode, iFeature, iCut-1);
167 |         Weight b = CDFs.GetBckgrd(iNode, iFeature, iCut-1);
168 |         Weight currentGain = currentLoss - LossFunction( signal-s, bckgrd-b ) - LossFunction( s, b );
169 | 
170 |         if( cut.gain <= currentGain ) {
171 |           cut.gain = currentGain;
172 |           cut.feature = iFeature;
173 |           cut.index = iCut;
174 |           cut.valid = true;
175 |         }
176 |       }
177 |     }
178 | 
179 |     return cut;
180 | 
181 |   }
182 | 
183 |   void Node::AddSignalWeight(Weight weight, Weight original_weight) {
184 |     if(original_weight == 0)
185 |       return;
186 |     signal += weight * original_weight;
187 |     square += weight*weight * original_weight;
188 |   }
189 | 
190 | 
191 |   void Node::AddBckgrdWeight(Weight weight, Weight original_weight) {
192 |     if(original_weight == 0)
193 |       return;
194 |     bckgrd += weight * original_weight;
195 |     square += weight*weight * original_weight;
196 |   }
197 | 
198 |   void Node::SetWeights(std::vector<Weight> weights) {
199 |     signal = weights[0];
200 |     bckgrd = weights[1];
201 |     square = weights[2];
202 |   }
203 | 
204 |   Weight Node::GetBoostWeight() const {
205 | 
206 |     Weight denominator = (2*(signal+bckgrd)-square);
207 |     if( denominator == 0 ) {
208 |       if(signal == bckgrd)
209 |         return 0;
210 |       if(signal > bckgrd)
211 |         return 999.0;
212 |       else
213 |         return -999.0;
214 |     }
215 |     Weight value = (signal - bckgrd)/denominator;
216 |     if( value > 999.0 or value < -999.0 ) {
217 |       if(signal > bckgrd)
218 |         return 999.0;
219 |       else
220 |         return -999.0;
221 |     }
222 |     return value;
223 | 
224 |   }
225 | 
226 |   void Node::Print() const {
227 |     std::cout << "Node: " << iNode << std::endl;
228 |     std::cout << "Layer: " << iLayer << std::endl;
229 |     std::cout << "Signal: " << signal << std::endl;
230 |     std::cout << "Bckgrd: " << bckgrd << std::endl;
231 |     std::cout << "Square: " << square << std::endl;
232 |   }
233 | 
234 |   /**
235 |    * In bin-space NaN is marked by bin 0
236 |    */
237 |   template<>
238 |   bool is_nan(const unsigned int &value) {
239 |     return value == 0;
240 |   }
241 | 
242 | 
243 |   TreeBuilder::TreeBuilder(unsigned int nLayers, EventSample &sample) : nLayers(nLayers) {
244 | 
245 |     const unsigned int nNodes = 1 << nLayers;
246 |     cuts.resize(nNodes - 1);
247 | 
248 |     for(unsigned int iLayer = 0; iLayer <= nLayers; ++iLayer) {
249 |       for(unsigned int iNode = 0; iNode < static_cast<unsigned int>(1 << iLayer); ++iNode) {
250 |         nodes.push_back( Node(iLayer, iNode) );
251 |       }
252 |     }
253 | 
254 |     // The flag of every event is used for two things:
255 |     // Firstly, a flag > 0 determines the node which holds this event at the moment;
256 |     // the trees are enumerated from top to bottom, from left to right, starting at 1.
257 |     // Secondly, a flag <= 0 disables this event, so it isn't used.
258 |     //   flag == 0 means disabled by stochastic bagging
259 |     //   flag < 0 means disabled due to a missing value, where -flag is the node the event belongs to
260 |     // Initially all events which are not disabled get the flag 1.
261 |     //
262 |     // All the flags of the enabled events are set to 1 by the DecisionForest
263 |     // prepareEventSample method. So there's no need to do this here again.
264 | 
265 |     // The number of signal and bckgrd events at the root node is given by the total
266 |     // number of signal and background events in the sample.
267 |     const auto sums = sample.GetWeights().GetSums(sample.GetNSignals());
268 |     nodes[0].SetWeights(sums);
269 | 
270 |     // The training of the tree is done level by level. So we iterate over the levels of the tree
271 |     // and create histograms for signal and background events for different cuts, nodes and features.
272 |     for(unsigned int iLayer = 0; iLayer < nLayers; ++iLayer) {
273 | 
274 |       CumulativeDistributions CDFs(iLayer, sample);
275 |       UpdateCuts(CDFs, iLayer);
276 |       UpdateFlags(sample);
277 |       UpdateEvents(sample, iLayer);
278 | 
279 |     }
280 | 
281 |   }
282 | 
283 |   void TreeBuilder::UpdateCuts(const CumulativeDistributions &CDFs, unsigned int iLayer) {
284 | 
285 |     for(auto &node : nodes) {
286 |       if( node.IsInLayer(iLayer) ) {
287 |         cuts[ node.GetPosition() ] = node.CalculateBestCut(CDFs);
288 |       }
289 |     }
290 |   }
291 | 
292 |   void TreeBuilder::UpdateFlags(EventSample &sample) {
293 | 
294 |     auto &flags = sample.GetFlags();
295 |     const auto &values = sample.GetValues();
296 |     // Iterate over all events and update their flags according to the cuts of the next level.
297 |     for(unsigned int iEvent = 0; iEvent < sample.GetNEvents(); ++iEvent) {
298 | 
299 |       const int flag = flags.Get(iEvent);
300 |       if( flag <= 0)
301 |         continue;
302 |       auto &cut = cuts[flag-1];
303 |       if( not cut.valid )
304 |         continue;
305 | 
306 |       const unsigned int index = values.Get(iEvent, cut.feature );
307 |       // If the value is NaN we throw out the event, but remember its current node using a negative flag!
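      // Example: an event sitting at node 5 whose cut feature is NaN gets flag -5;
      // it stops moving down the tree, but std::abs(flag) still recovers the node it stopped at.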
308 |       if( index == 0 ) {
309 |         flags.Set(iEvent, -flag);
310 |       } else if( index < cut.index ) {
311 |         flags.Set(iEvent, flag * 2);
312 |       } else {
313 |         flags.Set(iEvent, flag * 2 + 1);
314 |       }
315 |     }
316 |   }
317 | 
318 |   void TreeBuilder::UpdateEvents(const EventSample &sample, unsigned int iLayer) {
319 | 
320 |     const unsigned int nNodes = (1 << iLayer);
321 |     const auto &weights = sample.GetWeights();
322 |     const auto &flags = sample.GetFlags();
323 | 
324 |     for(unsigned int iEvent = 0; iEvent < sample.GetNSignals(); ++iEvent) {
325 |       const int flag = flags.Get(iEvent);
326 |       if( flag >= static_cast<int>(nNodes) ) {
327 |         nodes[flag-1].AddSignalWeight( weights.GetBoostWeight(iEvent), weights.GetOriginalWeight(iEvent) );
328 |       }
329 |     }
330 |     for(unsigned int iEvent = sample.GetNSignals(); iEvent < sample.GetNEvents(); ++iEvent) {
331 |       const int flag = flags.Get(iEvent);
332 |       if( flag >= static_cast<int>(nNodes) ) {
333 |         nodes[flag-1].AddBckgrdWeight( weights.GetBoostWeight(iEvent), weights.GetOriginalWeight(iEvent) );
334 |       }
335 |     }
336 | 
337 |   }
338 | 
339 | 
340 |   void TreeBuilder::Print() const {
341 | 
342 |     std::cout << "Start Printing Tree" << std::endl;
343 | 
344 |     for(auto &node : nodes) {
345 |       node.Print();
346 |       std::cout << std::endl;
347 |     }
348 | 
349 |     for(auto &cut : cuts) {
350 |       std::cout << "Index: " << cut.index << std::endl;
351 |       std::cout << "Feature: " << cut.feature << std::endl;
352 |       std::cout << "Gain: " << cut.gain << std::endl;
353 |       std::cout << "Valid: " << cut.valid << std::endl;
354 |       std::cout << std::endl;
355 |     }
356 | 
357 |     std::cout << "Finished Printing Tree" << std::endl;
358 |   }
359 | 
360 |   ForestBuilder::ForestBuilder(EventSample &sample, unsigned int nTrees, double shrinkage, double randRatio, unsigned int nLayersPerTree, bool sPlot, double flatnessLoss) : shrinkage(shrinkage), flatnessLoss(flatnessLoss) {
361 | 
362 |     auto &weights = sample.GetWeights();
363 |     sums = weights.GetSums(sample.GetNSignals());
364 |     // Calculating the initial F value from the proportion of signal and background events in the sample
365 |     double average = (sums[0] - sums[1])/(sums[0] + sums[1]);
366 |     F0 = 0.5*std::log((1+average)/(1-average));
367 | 
368 |     // Apply F0 to the original_weights because F0 is not a boost_weight; otherwise the prior probability
369 |     // for events with missing values would be wrong.
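    // With S = sums[0] and B = sums[1] this is just F0 = 0.5*log(S/B), since
    // (1+average)/(1-average) = (2S/(S+B)) / (2B/(S+B)) = S/B. The factors below
    // (2B/(S+B) for signal, 2S/(S+B) for background) equalise the effective class weights.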
370 |     if (F0 != 0.0) {
371 |       const unsigned int nEvents = sample.GetNEvents();
372 |       const unsigned int nSignals = sample.GetNSignals();
373 |       for(unsigned int iEvent = 0; iEvent < nSignals; ++iEvent)
374 |         weights.SetOriginalWeight(iEvent, 2.0 * sums[1] / (sums[0] + sums[1]) * weights.GetOriginalWeight(iEvent));
375 |       for(unsigned int iEvent = nSignals; iEvent < nEvents; ++iEvent)
376 |         weights.SetOriginalWeight(iEvent, 2.0 * sums[0] / (sums[0] + sums[1]) * weights.GetOriginalWeight(iEvent));
377 |     }
378 | 
379 |     // Resize the FCache to the number of events, and initialise it with the initial value 0.0
380 |     // (not F0, because F0 is already absorbed into the original_weights)
381 |     FCache.resize(sample.GetNEvents(), 0.0);
382 | 
383 |     // Reserve enough space for the boost_weights and trees, to avoid reallocations
384 |     forest.reserve(nTrees);
385 | 
386 |     // Reserve enough space for binned uniform spectators
387 |     if(flatnessLoss > 0) {
388 |       const auto &values = sample.GetValues();
389 |       auto nFeatures = values.GetNFeatures();
390 |       auto nSpectators = values.GetNSpectators();
391 |       auto &nBins = values.GetNBins();
392 |       const unsigned int nEvents = sample.GetNEvents();
393 |       const unsigned int nSignals = sample.GetNSignals();
394 | 
395 |       signal_event_index_sorted_by_F.resize(nSignals);
396 |       bckgrd_event_index_sorted_by_F.resize(nEvents - nSignals);
397 | 
398 |       uniform_bin_weight_signal.resize(nSpectators);
399 |       uniform_bin_weight_bckgrd.resize(nSpectators);
400 |       weight_below_current_F_per_uniform_bin.resize(nSpectators);
401 |       for(unsigned int iSpectator = 0; iSpectator < nSpectators; ++iSpectator) {
402 |         uniform_bin_weight_signal[iSpectator].resize(nBins[nFeatures + iSpectator], 0.0);
403 |         uniform_bin_weight_bckgrd[iSpectator].resize(nBins[nFeatures + iSpectator], 0.0);
404 |         weight_below_current_F_per_uniform_bin[iSpectator].resize(nBins[nFeatures + iSpectator], 0.0);
405 |       }
406 | 
407 |       for(unsigned int iEvent = 0; iEvent < nEvents; ++iEvent) {
408 |         for(unsigned int iSpectator = 0; iSpectator < nSpectators; ++iSpectator) {
409 |           const uint64_t uniformBin = values.GetSpectator(iEvent, iSpectator);
410 |           if (iEvent < nSignals)
411 |             uniform_bin_weight_signal[iSpectator][uniformBin] += weights.GetOriginalWeight(iEvent);
412 |           else
413 |             uniform_bin_weight_bckgrd[iSpectator][uniformBin] += weights.GetOriginalWeight(iEvent);
414 |         }
415 |       }
416 |       for(unsigned int iSpectator = 0; iSpectator < nSpectators; ++iSpectator) {
417 |         for(uint64_t iUniformBin = 0; iUniformBin < uniform_bin_weight_signal[iSpectator].size(); ++iUniformBin) {
418 |           uniform_bin_weight_signal[iSpectator][iUniformBin] /= sums[0];
419 |         }
420 |         for(uint64_t iUniformBin = 0; iUniformBin < uniform_bin_weight_bckgrd[iSpectator].size(); ++iUniformBin) {
421 |           uniform_bin_weight_bckgrd[iSpectator][iUniformBin] /= sums[1];
422 |         }
423 |       }
424 |     }
425 | 
426 |     // Now train the nTrees trees!
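    // Each iteration: recompute the per-event boost weights from the cached F values,
    // optionally add the flatness penalty, redraw the stochastic subsample, and fit
    // one more depth-limited tree on the reweighted sample.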
427 |     for(unsigned int iTree = 0; iTree < nTrees; ++iTree) {
428 | 
429 |       // Update the event weights according to their F value
430 |       updateEventWeights(sample);
431 | 
432 |       // Add flatness loss terms
433 |       if(flatnessLoss > 0 and iTree > 0)
434 |         updateEventWeightsWithFlatnessPenalty(sample);
435 | 
436 |       // Prepare the flags of the events
437 |       prepareEventSample( sample, randRatio, sPlot );
438 | 
439 |       // Create and train a new tree on the sample
440 |       TreeBuilder builder(nLayersPerTree, sample);
441 |       if(builder.IsValid()) {
442 |         forest.push_back( Tree<unsigned int>( builder.GetCuts(), builder.GetNEntries(), builder.GetPurities(), builder.GetBoostWeights() ) );
443 |       } else {
444 |         std::cerr << "Terminated boosting at tree " << iTree << " out of " << nTrees << std::endl;
445 |         std::cerr << "Because the last tree was not valid, meaning it couldn't find an optimal cut." << std::endl;
446 |         std::cerr << "This can happen if you do a large number of boosting steps." << std::endl;
447 |         break;
448 |       }
449 |     }
450 | 
451 |   }
452 | 
453 |   void ForestBuilder::prepareEventSample(EventSample &sample, double randRatio, bool sPlot) {
454 | 
455 |     // Draw a random sample if stochastic gradient boost is used:
456 |     // draw a random number in [0,1) and compare it to the given ratio. If bigger, disable this event by flagging it with 0.
457 |     // If smaller, set the flag to 1. This is important! If the flags are != 1, the DecisionTree algorithm will fail.
458 |     const unsigned int nEvents = sample.GetNEvents();
459 |     auto &flags = sample.GetFlags();
460 |     if( randRatio < 1.0 and sPlot) {
461 |       // For an sPlot training it is important to always take signal and background pairs together into the training!
462 |       for(unsigned int iEvent = 0; iEvent < nEvents / 2 + 1; ++iEvent) {
463 |         int use = (static_cast<double>(rand())/static_cast<double>(RAND_MAX) > randRatio ) ? 0 : 1;
464 |         flags.Set(iEvent, use);
465 |         unsigned int jEvent = static_cast<unsigned int>(static_cast<int>(nEvents) - static_cast<int>(iEvent) - 1);
466 |         flags.Set(jEvent, use);
467 |       }
468 |     } else if( randRatio < 1.0) {
469 |       for(unsigned int iEvent = 0; iEvent < nEvents; ++iEvent)
470 |         flags.Set(iEvent, ( static_cast<double>(rand())/static_cast<double>(RAND_MAX) > randRatio ) ?
0 : 1 ); 471 | } else { 472 | for(unsigned int iEvent = 0; iEvent < nEvents; ++iEvent) 473 | flags.Set(iEvent, 1); 474 | } 475 | 476 | } 477 | 478 | void ForestBuilder::updateEventWeights(EventSample &eventSample) { 479 | 480 | const unsigned int nEvents = eventSample.GetNEvents(); 481 | const unsigned int nSignals = eventSample.GetNSignals(); 482 | 483 | const auto &flags = eventSample.GetFlags(); 484 | const auto &values = eventSample.GetValues(); 485 | auto &weights = eventSample.GetWeights(); 486 | 487 | // Loop over all events and update FCache 488 | // If the event wasn't disabled, we can use the flag directly to determine the node of this event 489 | // If not we have to calculate the node to which this event belongs 490 | if( forest.size() > 0 ) { 491 | for(unsigned int iEvent = 0; iEvent < nEvents; ++iEvent) { 492 | if( flags.Get(iEvent) != 0) 493 | FCache[iEvent] += shrinkage*forest.back().GetBoostWeight( std::abs(flags.Get(iEvent)) - 1); 494 | else 495 | FCache[iEvent] += shrinkage*forest.back().GetBoostWeight( forest.back().ValueToNode(&values.Get(iEvent)) ); 496 | } 497 | } 498 | 499 | for(unsigned int iEvent = 0; iEvent < nSignals; ++iEvent) 500 | weights.SetBoostWeight(iEvent, 2.0/(1.0+std::exp(2.0*FCache[iEvent]))); 501 | for(unsigned int iEvent = nSignals; iEvent < nEvents; ++iEvent) 502 | weights.SetBoostWeight(iEvent, 2.0/(1.0+std::exp(-2.0*FCache[iEvent]))); 503 | 504 | } 505 | 506 | void ForestBuilder::updateEventWeightsWithFlatnessPenalty(EventSample &eventSample) { 507 | 508 | const unsigned int nEvents = eventSample.GetNEvents(); 509 | const unsigned int nSignals = eventSample.GetNSignals(); 510 | 511 | const auto &values = eventSample.GetValues(); 512 | auto &weights = eventSample.GetWeights(); 513 | 514 | auto nSpectators = values.GetNSpectators(); 515 | 516 | // Sort events in order of increasing F Value 517 | for(unsigned int iEvent = 0; iEvent < nSignals; ++iEvent) { 518 | signal_event_index_sorted_by_F[iEvent] = {FCache[iEvent], iEvent}; 519 | } 520 | for(unsigned int iEvent = 0; iEvent < nEvents-nSignals; ++iEvent) { 521 | bckgrd_event_index_sorted_by_F[iEvent] = {-FCache[iEvent+nSignals], iEvent+nSignals}; 522 | } 523 | 524 | { 525 | auto first = signal_event_index_sorted_by_F.begin(); 526 | auto last = signal_event_index_sorted_by_F.end(); 527 | std::sort(first, last, compareWithIndex); 528 | } 529 | 530 | { 531 | auto first = bckgrd_event_index_sorted_by_F.begin(); 532 | auto last = bckgrd_event_index_sorted_by_F.end(); 533 | std::sort(first, last, compareWithIndex); 534 | } 535 | 536 | double global_weight_below_current_F = 0; 537 | for(unsigned int iIndex = 0; iIndex < signal_event_index_sorted_by_F.size(); ++iIndex) { 538 | unsigned int iEvent = signal_event_index_sorted_by_F[iIndex].index; 539 | 540 | global_weight_below_current_F += weights.GetOriginalWeight(iEvent); 541 | double F = global_weight_below_current_F / sums[0]; 542 | double fw = 0.0; 543 | 544 | for(unsigned int iSpectator = 0; iSpectator < nSpectators; ++iSpectator) { 545 | const uint64_t uniformBin = values.GetSpectator(iEvent, iSpectator); 546 | weight_below_current_F_per_uniform_bin[iSpectator][uniformBin] += weights.GetOriginalWeight(iEvent); 547 | double F_bin = weight_below_current_F_per_uniform_bin[iSpectator][uniformBin] / (uniform_bin_weight_signal[iSpectator][uniformBin] * sums[0]); 548 | 549 | fw += (F_bin - F); 550 | } 551 | fw *= flatnessLoss; 552 | weights.SetFlatnessWeight(iEvent, fw); 553 | 554 | } 555 | 556 | for(unsigned int iSpectator = 0; iSpectator < nSpectators; 
++iSpectator) {
557 |       for(uint64_t iUniformBin = 0; iUniformBin < weight_below_current_F_per_uniform_bin[iSpectator].size(); ++iUniformBin) {
558 |         weight_below_current_F_per_uniform_bin[iSpectator][iUniformBin] = 0.0;
559 |       }
560 |     }
561 | 
562 |     global_weight_below_current_F = 0;
563 | 
564 |     for(unsigned int iIndex = 0; iIndex < bckgrd_event_index_sorted_by_F.size(); ++iIndex) {
565 |       unsigned int iEvent = bckgrd_event_index_sorted_by_F[iIndex].index;
566 | 
567 |       global_weight_below_current_F += weights.GetOriginalWeight(iEvent);
568 |       double F = global_weight_below_current_F / sums[1];
569 |       double fw = 0.0;
570 | 
571 |       for(unsigned int iSpectator = 0; iSpectator < nSpectators; ++iSpectator) {
572 |         const uint64_t uniformBin = values.GetSpectator(iEvent, iSpectator);
573 |         weight_below_current_F_per_uniform_bin[iSpectator][uniformBin] += weights.GetOriginalWeight(iEvent);
574 |         double F_bin = weight_below_current_F_per_uniform_bin[iSpectator][uniformBin] / (uniform_bin_weight_bckgrd[iSpectator][uniformBin] * sums[1]);
575 | 
576 |         fw += (F_bin - F);
577 |       }
578 |       fw *= flatnessLoss;
579 |       weights.SetFlatnessWeight(iEvent, fw);
580 | 
581 |     }
582 | 
583 |     for(unsigned int iSpectator = 0; iSpectator < nSpectators; ++iSpectator) {
584 |       for(uint64_t iUniformBin = 0; iUniformBin < weight_below_current_F_per_uniform_bin[iSpectator].size(); ++iUniformBin) {
585 |         weight_below_current_F_per_uniform_bin[iSpectator][iUniformBin] = 0.0;
586 |       }
587 |     }
588 | 
589 |   }
590 | 
591 | }
592 | 
--------------------------------------------------------------------------------
/examples/comparison.cxx:
--------------------------------------------------------------------------------
1 | /**
2 |  * Thomas Keck 2017
3 |  *
4 |  * Performance comparison code.
5 |  *
6 |  * This file measures the runtime of the fitting and application phase for different hyper-parameter settings for
7 |  *  - FastBDT
8 |  *  - XGBoost
9 |  *  - TMVA
10 |  *  - SKLearn
11 |  * All methods are accessed via C++ to ensure optimal performance.
12 |  * In the case of XGBoost and SKLearn this is rather unusual, but it is faster than using Python.
13 |  * I wouldn't recommend using XGBoost and SKLearn in the way shown below in your daily work,
14 |  * it is rather error-prone, and I only do this to ensure fair conditions between the contestants.
15 |  *
16 |  * Compiling this code is complicated because it involves all the different frameworks.
17 |  * You have to install FastBDT and XGBoost from github, ROOT, and sklearn using pip3; as well as the python3.5 headers and libraries for your distribution.
18 |  *
19 |  * I compile in the following way:
20 |  * g++ comparison.cxx -o comparison -O3
21 |  *     -L ../FastBDT/ -I ../FastBDT/include/ -lFastBDT_shared
22 |  *     -I ../xgboost/rabit/include/ -I ../xgboost/dmlc-core/include/ -L ../xgboost/rabit/lib/ -L ../xgboost/dmlc-core/ -I ../xgboost/include/ -L ../xgboost/lib/ -l xgboost
23 |  *     `root-config --cflags --libs` -lTMVA -lMLP -lXMLIO
24 |  *     -lpython3.5
25 |  *
26 |  * And execute it like this:
27 |  * LD_LIBRARY_PATH=$PATH_TO_XGBOOST/lib/:$PATH_TO_FASTBDT/:$LD_LIBRARY_PATH ./comparison 10
28 |  *
29 |  * The executable takes a command line argument, which I use to call the executable multiple times
30 |  * with different hyper-parameter configurations.
31 |  *
32 |  * The code reads its data from data/train.csv and data/test.csv.
33 |  * These files must contain the appropriate amount of features (separated by whitespaces) and events (separated by linebreaks).
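 * For example, with three features a row could look like "0.12 3.4 -1.0 1"
 * (feature values first, integer truth label last).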
34 |  * The last column must be an integer with the truth information (1 for signal and 0 for background).
35 |  *
36 |  * The results of the measurements are written to files result_$id_cpp.txt, which contain the runtime of the preprocessing (preparation of the data),
37 |  * fitting (fitting the classifier) and application (inference on independent test data using the classifier), as well as the output probabilities
38 |  * for the test dataset for each event (where the last column contains the truth variable).
39 |  */
40 | 
41 | #include "FastBDT.h"
42 | 
43 | #include "xgboost/c_api.h"
44 | #include "xgboost/data.h"
45 | 
46 | #include
47 | #include
48 | #include
49 | #include
50 | 
51 | #include
52 | #include
53 | #include
54 | #include
55 | 
56 | #include
57 | #include
58 | #include
59 | 
60 | #include
61 | #include
62 | #include
63 | #include
64 | #include
65 | #include
66 | #include
67 | 
68 | class Data {
69 |   public:
70 |     Data(std::string datafile, unsigned int _numberOfFeatures, unsigned int _numberOfEvents) : numberOfFeatures(_numberOfFeatures), numberOfEvents(_numberOfEvents) {
71 | 
72 |       X.reserve(numberOfEvents);
73 |       y.reserve(numberOfEvents);
74 | 
75 |       std::fstream fs (datafile, std::fstream::in);
76 |       std::string line;
77 | 
78 |       // Skip Header
79 |       std::getline(fs, line);
80 | 
81 |       unsigned int iEvent = 0;
82 |       while(std::getline(fs, line)) {
83 | 
84 |         std::istringstream sin(line);
85 |         std::vector<float> row;
86 |         float value = 0;
87 |         unsigned int iFeature = 0;
88 |         while(sin >> value) {
89 |           if(iFeature < numberOfFeatures)
90 |             row.push_back(value);
91 |           ++iFeature;
92 |         }
93 |         X.push_back(row);
94 |         y.push_back(static_cast<unsigned int>(value));
95 | 
96 |         ++iEvent;
97 |         if(iEvent >= numberOfEvents) {
98 |           break;
99 |         }
100 |       }
101 | 
102 |       std::cout << "Loaded " << iEvent << " Events" << std::endl;
103 |     }
104 | 
105 |     unsigned int numberOfFeatures = 0;
106 |     unsigned int numberOfEvents = 0;
107 |     std::vector<std::vector<float>> X;
108 |     std::vector<unsigned int> y;
109 | };
110 | 
111 | 
112 | struct Config {
113 |   unsigned int nTrees;
114 |   unsigned int depth;
115 |   double shrinkage;
116 |   double subSampling;
117 |   // Only TMVA and FastBDT
118 |   unsigned int nCutLevels;
119 |   unsigned int numberOfFeatures;
120 |   unsigned int numberOfEvents;
121 | };
122 | 
123 | struct Result {
124 | 
125 |   std::string label;
126 |   std::vector<float> probabilities;
127 |   std::chrono::duration<double> preprocessingTime;
128 |   std::chrono::duration<double> trainingTime;
129 |   std::chrono::duration<double> testTime;
130 | };
131 | 
132 | 
133 | void writeResults(std::string filename, const std::vector<Result> &results, const Data& test, const Config& config) {
134 | 
135 |   std::fstream str(filename, std::fstream::out);
136 |   str << config.nTrees << " " << config.depth << " " << config.shrinkage << " " << config.subSampling << " " << config.nCutLevels << " " << config.numberOfFeatures << " " << config.numberOfEvents << std::endl;
137 | 
138 |   str << "Labels: ";
139 |   for(auto &r : results) {
140 |     str << r.label << " ";
141 |   }
142 |   str << std::endl;
143 | 
144 |   str << "PreprocessingTime: ";
145 |   for(auto &r : results) {
146 |     str << r.preprocessingTime.count() << " ";
147 |   }
148 |   str << std::endl;
149 | 
150 |   str << "TrainingTime: ";
151 |   for(auto &r : results) {
152 |     str << r.trainingTime.count() << " ";
153 |   }
154 |   str << std::endl;
155 | 
156 |   str << "TestTime: ";
157 |   for(auto &r : results) {
158 |     str << r.testTime.count() << " ";
159 |   }
160 |   str << std::endl;
161 | 
162 |   for(unsigned int iEvent = 0; iEvent < config.numberOfEvents; ++iEvent) {
163 |     for(auto &r : results) {
164 |       str << r.probabilities[iEvent] << " ";
165 |     }
166 |     str << test.y[iEvent] << std::endl;
167 |   }
168 | 
169 | }
170 | 
171 | Result measureSKLearn(const Data& train, const Data& test, const Config& config) {
172 | 
173 |   Result result;
174 |   result.label = "SKLearn";
175 | 
176 |   std::chrono::high_resolution_clock::time_point preprocessingTime1 = std::chrono::high_resolution_clock::now();
177 |   PyObject* cls = PyUnicode_FromString((char*)"GradientBoostingClassifier");
178 |   PyObject* fit = PyUnicode_FromString((char*)"fit");
179 |   PyObject* predict = PyUnicode_FromString((char*)"predict_proba");
180 |   PyObject* pModule = PyImport_ImportModule("sklearn.ensemble");
181 | 
182 |   PyObject* loss = PyUnicode_FromString((char*)"deviance");
183 |   PyObject* learning_rate = PyFloat_FromDouble(static_cast<double>(config.shrinkage));
184 |   PyObject* n_estimators = PyLong_FromLong(static_cast<long>(config.nTrees));
185 |   PyObject* subsample = PyFloat_FromDouble(static_cast<double>(config.subSampling));
186 |   PyObject* criterion = PyUnicode_FromString((char*)"friedman_mse");
187 |   PyObject* min_samples_split = PyLong_FromLong(static_cast<long>(2));
188 |   PyObject* min_samples_leaf = PyLong_FromLong(static_cast<long>(1));
189 |   PyObject* min_weight_fraction_leaf = PyFloat_FromDouble(static_cast<double>(0.0));
190 |   PyObject* max_depth = PyLong_FromLong(static_cast<long>(config.depth));
191 |   PyObject* forest = PyObject_CallMethodObjArgs(pModule, cls, loss, learning_rate, n_estimators, subsample, criterion, min_samples_split, min_samples_leaf, min_weight_fraction_leaf, max_depth, NULL);
192 |   Py_DECREF(loss);
193 |   Py_DECREF(learning_rate);
194 |   Py_DECREF(n_estimators);
195 |   Py_DECREF(subsample);
  Py_DECREF(criterion);
196 |   Py_DECREF(min_samples_split);
197 |   Py_DECREF(min_samples_leaf);
198 |   Py_DECREF(min_weight_fraction_leaf);
199 |   Py_DECREF(max_depth);
200 | 
201 |   float *X = new float[train.numberOfEvents*train.numberOfFeatures];
202 |   float *y = new float[train.numberOfEvents];
203 |   for(unsigned int iEvent = 0; iEvent < train.numberOfEvents; ++iEvent) {
204 |     for(unsigned int iFeature = 0; iFeature < train.numberOfFeatures; ++iFeature)
205 |       X[iEvent*train.numberOfFeatures + iFeature] = train.X[iEvent][iFeature];
206 |     y[iEvent] = static_cast<float>(train.y[iEvent]);
207 |   }
208 |   long dimensions_X[2] = {train.numberOfEvents, train.numberOfFeatures};
209 |   long dimensions_y[1] = {train.numberOfEvents};
210 |   PyObject* ndarray_X = PyArray_SimpleNewFromData(2, dimensions_X, NPY_FLOAT32, X);
211 |   PyObject* ndarray_y = PyArray_SimpleNewFromData(1, dimensions_y, NPY_FLOAT32, y);
212 | 
213 |   std::chrono::high_resolution_clock::time_point preprocessingTime2 = std::chrono::high_resolution_clock::now();
214 |   result.preprocessingTime = preprocessingTime2 - preprocessingTime1;
215 |   std::cout << "PreprocessingTime " << result.preprocessingTime.count() << std::endl;
216 | 
217 |   std::chrono::high_resolution_clock::time_point trainingTime1 = std::chrono::high_resolution_clock::now();
218 |   PyObject *x = PyObject_CallMethodObjArgs(forest, fit, ndarray_X, ndarray_y, NULL);
219 |   std::chrono::high_resolution_clock::time_point trainingTime2 = std::chrono::high_resolution_clock::now();
220 |   result.trainingTime = trainingTime2 - trainingTime1;
221 |   std::cout << "TrainingTime " << result.trainingTime.count() << std::endl;
222 | 
223 |   result.probabilities.resize(test.numberOfEvents);
224 | 
225 |   std::chrono::high_resolution_clock::time_point testTime1 = std::chrono::high_resolution_clock::now();
226 |   float *X_test = new float[test.numberOfEvents*test.numberOfFeatures];
227 |   for(unsigned int
Result measureFastBDT(const Data& train, const Data& test, const Config& config) {

    Result result;
    result.label = "FastBDT";

    std::chrono::high_resolution_clock::time_point preprocessingTime1 = std::chrono::high_resolution_clock::now();
    // Equal statistics binning
    std::vector<FastBDT::FeatureBinning<float>> featureBinnings(train.numberOfFeatures);
    std::vector<float> feature(train.numberOfEvents);
    for(unsigned int iFeature = 0; iFeature < train.numberOfFeatures; ++iFeature) {
        for(unsigned int iEvent = 0; iEvent < train.numberOfEvents; ++iEvent)
            feature[iEvent] = train.X[iEvent][iFeature];
        featureBinnings[iFeature] = FastBDT::FeatureBinning<float>(config.nCutLevels, feature);
    }

    // Fill event sample
    FastBDT::EventSample eventSample(train.numberOfEvents, train.numberOfFeatures, 0, std::vector<unsigned int>(train.numberOfFeatures, config.nCutLevels));
    std::vector<unsigned int> bins(train.numberOfFeatures);
    for(unsigned int iEvent = 0; iEvent < train.numberOfEvents; ++iEvent) {
        for(unsigned int iFeature = 0; iFeature < train.numberOfFeatures; ++iFeature)
            bins[iFeature] = featureBinnings[iFeature].ValueToBin( train.X[iEvent][iFeature] );
        eventSample.AddEvent(bins, 1.0, train.y[iEvent] == 1);
    }
    std::chrono::high_resolution_clock::time_point preprocessingTime2 = std::chrono::high_resolution_clock::now();
    result.preprocessingTime = preprocessingTime2 - preprocessingTime1;
    std::cout << "PreprocessingTime " << result.preprocessingTime.count() << std::endl;

    std::chrono::high_resolution_clock::time_point trainingTime1 = std::chrono::high_resolution_clock::now();
    // Train classifier using training data
    FastBDT::ForestBuilder dt(eventSample, config.nTrees, config.shrinkage, config.subSampling, config.depth);
    FastBDT::Forest<float> forest( dt.GetShrinkage(), dt.GetF0(), false);
    for( auto t : dt.GetForest() )
        forest.AddTree(FastBDT::removeFeatureBinningTransformationFromTree(t, featureBinnings));
    std::chrono::high_resolution_clock::time_point trainingTime2 = std::chrono::high_resolution_clock::now();
    result.trainingTime = trainingTime2 - trainingTime1;
    std::cout << "TrainingTime " << result.trainingTime.count() << std::endl;

    result.probabilities.resize(test.numberOfEvents);

    // Apply classifier on test data
    std::chrono::high_resolution_clock::time_point testTime1 = std::chrono::high_resolution_clock::now();
    for(unsigned int iEvent = 0; iEvent < test.numberOfEvents; ++iEvent) {
        result.probabilities[iEvent] = forest.Analyse(test.X[iEvent]);
    }
    std::chrono::high_resolution_clock::time_point testTime2 = std::chrono::high_resolution_clock::now();
    result.testTime = testTime2 - testTime1;
    std::cout << "TestTime " << result.testTime.count() << std::endl;
    return result;

}
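// Benchmark of TMVA's gradient-boosted BDT (BDTG). The training events are
// copied into signal and background TTrees, the method is booked with the
// shared hyper-parameters, and a TMVA::Reader evaluates the trained weights
// on the test sample.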
Result measureTMVA(const Data& train, const Data& test, const Config& config) {

    Result result;
    result.label = "TMVA";

    std::chrono::high_resolution_clock::time_point preprocessingTime1 = std::chrono::high_resolution_clock::now();
    TMVA::Tools::Instance();
    TFile classFile("TMVA.root", "RECREATE");
    classFile.cd();
    TMVA::DataLoader data_loader("TMVAClassification");
    TMVA::Factory factory("TMVAClassification", &classFile, "!V:Silent:Color:DrawProgressBar:AnalysisType=Classification");

    std::vector<std::string> variables = {"M", "p", "pt", "pz", "phi", "daughter__bo0__cm__spp__bc", "daughter__bo0__cm__sppz__bc", "daughter__bo0__cm__sppt__bc", "daughter__bo0__cm__spphi__bc", "daughter__bo1__cm__spp__bc", "daughter__bo1__cm__sppz__bc", "daughter__bo1__cm__sppt__bc", "daughter__bo1__cm__spphi__bc", "daughter__bo2__cm__spp__bc", "daughter__bo2__cm__sppz__bc", "daughter__bo2__cm__sppt__bc", "daughter__bo2__cm__spphi__bc", "chiProb", "dr", "dz", "dphi", "daughter__bo0__cm__spdr__bc", "daughter__bo1__cm__spdr__bc", "daughter__bo0__cm__spdz__bc", "daughter__bo1__cm__spdz__bc", "daughter__bo0__cm__spdphi__bc", "daughter__bo1__cm__spdphi__bc", "daughter__bo0__cm__spchiProb__bc", "daughter__bo1__cm__spchiProb__bc", "daughter__bo2__cm__spchiProb__bc", "daughter__bo0__cm__spKid__bc", "daughter__bo0__cm__sppiid__bc", "daughter__bo1__cm__spKid__bc", "daughter__bo1__cm__sppiid__bc", "daughterAngle__bo0__cm__sp1__bc", "daughterAngle__bo0__cm__sp2__bc", "daughterAngle__bo1__cm__sp2__bc", "daughter__bo2__cm__spdaughter__bo0__cm__spE__bc__bc", "daughter__bo2__cm__spdaughter__bo1__cm__spE__bc__bc", "daughter__bo2__cm__spdaughter__bo0__cm__spclusterTiming__bc__bc", "daughter__bo2__cm__spdaughter__bo1__cm__spclusterTiming__bc__bc", "daughter__bo2__cm__spdaughter__bo0__cm__spclusterE9E25__bc__bc", "daughter__bo2__cm__spdaughter__bo1__cm__spclusterE9E25__bc__bc", "daughter__bo2__cm__spdaughter__bo0__cm__spminC2HDist__bc__bc", "daughter__bo2__cm__spdaughter__bo1__cm__spminC2HDist__bc__bc", "daughterInvariantMass__bo0__cm__sp1__bc", "daughterInvariantMass__bo0__cm__sp2__bc", "daughterInvariantMass__bo1__cm__sp2__bc"};

    std::vector<float> vec(train.numberOfFeatures);
    for(unsigned int iFeature = 0; iFeature < train.numberOfFeatures; ++iFeature) {
        data_loader.AddVariable(variables[iFeature].c_str());
    }

    TTree *signal_tree = new TTree("signal_tree", "signal_tree");
    TTree *background_tree = new TTree("background_tree", "background_tree");

    for(unsigned int iFeature = 0; iFeature < train.numberOfFeatures; ++iFeature) {
        signal_tree->Branch(variables[iFeature].c_str(), &vec[iFeature]);
        background_tree->Branch(variables[iFeature].c_str(), &vec[iFeature]);
    }

    unsigned int nsig = 0;
    unsigned int nbkg = 0;
    for(unsigned int iEvent = 0; iEvent < train.numberOfEvents; ++iEvent) {
        for(unsigned int iFeature = 0; iFeature < train.numberOfFeatures; ++iFeature) {
            vec[iFeature] = train.X[iEvent][iFeature];
        }
        if(train.y[iEvent] == 1) {
            ++nsig;
            signal_tree->Fill();
        } else {
            ++nbkg;
            background_tree->Fill();
        }
    }

    data_loader.AddSignalTree(signal_tree);
    data_loader.AddBackgroundTree(background_tree);

    data_loader.PrepareTrainingAndTestTree("", std::string("nTrain_Signal=") + std::to_string(nsig) + std::string(":nTrain_Background=") + std::to_string(nbkg) + std::string(":SplitMode=Block:!V"));
    factory.BookMethod(&data_loader, TMVA::Types::kBDT, "BDTG", std::string("!H:!V:NTrees=") + std::to_string(config.nTrees) + std::string(":BoostType=Grad:Shrinkage=") + std::to_string(config.shrinkage) + std::string(":UseBaggedBoost:BaggedSampleFraction=") + std::to_string(config.subSampling) + std::string(":nCuts=") + std::to_string(1 << config.nCutLevels) + std::string(":MaxDepth=") + std::to_string(config.depth) + std::string(":IgnoreNegWeightsInTraining"));

    TMVA::Reader *reader = new TMVA::Reader("!Color:!Silent");
    for(unsigned int iFeature = 0; iFeature < train.numberOfFeatures; ++iFeature) {
        reader->AddVariable(variables[iFeature].c_str(), &vec[iFeature]);
    }

    std::chrono::high_resolution_clock::time_point preprocessingTime2 = std::chrono::high_resolution_clock::now();
    result.preprocessingTime = preprocessingTime2 - preprocessingTime1;
    std::cout << "PreprocessingTime " << result.preprocessingTime.count() << std::endl;

    std::chrono::high_resolution_clock::time_point trainingTime1 = std::chrono::high_resolution_clock::now();
    factory.TrainAllMethods();
    std::chrono::high_resolution_clock::time_point trainingTime2 = std::chrono::high_resolution_clock::now();
    result.trainingTime = trainingTime2 - trainingTime1;
    std::cout << "TrainingTime " << result.trainingTime.count() << std::endl;

    //factory.TestAllMethods();
    //factory.EvaluateAllMethods();

    reader->BookMVA("BDTG", "TMVAClassification/weights/TMVAClassification_BDTG.weights.xml");
    result.probabilities.resize(test.numberOfEvents);

    // Apply classifier on test data
    std::chrono::high_resolution_clock::time_point testTime1 = std::chrono::high_resolution_clock::now();
    for(unsigned int iEvent = 0; iEvent < test.numberOfEvents; ++iEvent) {
        for(unsigned int iFeature = 0; iFeature < test.numberOfFeatures; ++iFeature) {
            vec[iFeature] = test.X[iEvent][iFeature];
        }
        // BDTG returns a score in [-1, 1]; map it to [0, 1] for comparability
        result.probabilities[iEvent] = (reader->EvaluateMVA("BDTG") + 1)*0.5;
    }
    std::chrono::high_resolution_clock::time_point testTime2 = std::chrono::high_resolution_clock::now();
    result.testTime = testTime2 - testTime1;
    std::cout << "TestTime " << result.testTime.count() << std::endl;

    delete reader;
    delete signal_tree;
    delete background_tree;

    return result;

}
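// Benchmark of XGBoost via its C API. The training data is packed into a
// DMatrix, one boosting iteration is run per tree, and XGBoosterPredict
// yields the signal probabilities directly (binary:logistic objective).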
Result measureXGBoost(const Data& train, const Data& test, const Config& config) {

    Result result;
    result.label = "XGBoost";

    std::chrono::high_resolution_clock::time_point preprocessingTime1 = std::chrono::high_resolution_clock::now();
    // Create XGDMatrix
    float *matrix = new float[train.numberOfEvents*train.numberOfFeatures];
    for(unsigned int iEvent = 0; iEvent < train.numberOfEvents; ++iEvent)
        for(unsigned int iFeature = 0; iFeature < train.numberOfFeatures; ++iFeature)
            matrix[iEvent*train.numberOfFeatures + iFeature] = train.X[iEvent][iFeature];

    DMatrixHandle dmatrix;
    XGDMatrixCreateFromMat(matrix, train.numberOfEvents, train.numberOfFeatures, NAN, &dmatrix);
    delete[] matrix;

    XGDMatrixSetUIntInfo(dmatrix, "label", train.y.data(), train.numberOfEvents);

    std::chrono::high_resolution_clock::time_point preprocessingTime2 = std::chrono::high_resolution_clock::now();
    result.preprocessingTime = preprocessingTime2 - preprocessingTime1;
    std::cout << "PreprocessingTime " << result.preprocessingTime.count() << std::endl;

    std::chrono::high_resolution_clock::time_point trainingTime1 = std::chrono::high_resolution_clock::now();
    BoosterHandle booster;
    XGBoosterCreate(&dmatrix, 1, &booster);
    XGBoosterSetParam(booster, "max_depth", std::to_string(config.depth).c_str());
    XGBoosterSetParam(booster, "eta", std::to_string(config.shrinkage).c_str());
    XGBoosterSetParam(booster, "silent", std::to_string(1).c_str());
    XGBoosterSetParam(booster, "subsample", std::to_string(config.subSampling).c_str());
    XGBoosterSetParam(booster, "nthread", std::to_string(1).c_str());
    XGBoosterSetParam(booster, "objective", "binary:logistic");
    XGBoosterSetParam(booster, "tree_method", "hist");

    // Train classifier using training data
    for(unsigned int iBoost = 0; iBoost < config.nTrees; ++iBoost) {
        XGBoosterUpdateOneIter(booster, iBoost, dmatrix);
    }

    std::chrono::high_resolution_clock::time_point trainingTime2 = std::chrono::high_resolution_clock::now();
    result.trainingTime = trainingTime2 - trainingTime1;
    std::cout << "TrainingTime " << result.trainingTime.count() << std::endl;

    result.probabilities.resize(test.numberOfEvents);

    // Apply classifier on test data
    std::chrono::high_resolution_clock::time_point testTime1 = std::chrono::high_resolution_clock::now();
    float *test_matrix = new float[test.numberOfEvents*test.numberOfFeatures];
    for(unsigned int iEvent = 0; iEvent < test.numberOfEvents; ++iEvent)
        for(unsigned int iFeature = 0; iFeature < test.numberOfFeatures; ++iFeature)
            test_matrix[iEvent*test.numberOfFeatures + iFeature] = test.X[iEvent][iFeature];
    DMatrixHandle test_dmatrix;
    XGDMatrixCreateFromMat(test_matrix, test.numberOfEvents, test.numberOfFeatures, NAN, &test_dmatrix);
    delete[] test_matrix;
    bst_ulong out_len;
    const float *out_result;
    XGBoosterPredict(booster, test_dmatrix, 0, 0, &out_len, &out_result);
    for(unsigned int iEvent = 0; iEvent < test.numberOfEvents; ++iEvent) {
        result.probabilities[iEvent] = out_result[iEvent];
    }
    std::chrono::high_resolution_clock::time_point testTime2 = std::chrono::high_resolution_clock::now();
    result.testTime = testTime2 - testTime1;
    std::cout << "TestTime " << result.testTime.count() << std::endl;

    XGBoosterFree(booster);
    XGDMatrixFree(dmatrix);
    XGDMatrixFree(test_dmatrix);

    return result;

}
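// Loads the training and test CSV files once, then repeats the full set of
// measurements five times and writes one result file per iteration, so the
// timings can be compared across repetitions.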
void measure(Config &config, unsigned int id) {

    std::chrono::high_resolution_clock::time_point loadTime1 = std::chrono::high_resolution_clock::now();
    Data train("data/train.csv", config.numberOfFeatures, config.numberOfEvents);
    Data test("data/test.csv", config.numberOfFeatures, config.numberOfEvents);
    std::chrono::high_resolution_clock::time_point loadTime2 = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> loadTime = loadTime2 - loadTime1;
    std::cout << "LoadTime " << loadTime.count() << std::endl;

    // Repeat each measurement 5 times
    for(unsigned int i = 0; i < 5; ++i) {

        std::chrono::high_resolution_clock::time_point measureSKLearnTime1 = std::chrono::high_resolution_clock::now();
        Result resultSKLearn = measureSKLearn(train, test, config);
        std::chrono::high_resolution_clock::time_point measureSKLearnTime2 = std::chrono::high_resolution_clock::now();
        std::chrono::duration<double> measureSKLearnTime = measureSKLearnTime2 - measureSKLearnTime1;
        std::cout << "MeasureSKLearnTime " << measureSKLearnTime.count() << std::endl;

        std::chrono::high_resolution_clock::time_point measureTMVATime1 = std::chrono::high_resolution_clock::now();
        Result resultTMVA = measureTMVA(train, test, config);
        std::chrono::high_resolution_clock::time_point measureTMVATime2 = std::chrono::high_resolution_clock::now();
        std::chrono::duration<double> measureTMVATime = measureTMVATime2 - measureTMVATime1;
        std::cout << "MeasureTMVATime " << measureTMVATime.count() << std::endl;

        std::chrono::high_resolution_clock::time_point measureXGBoostTime1 = std::chrono::high_resolution_clock::now();
        Result resultXGBoost = measureXGBoost(train, test, config);
        std::chrono::high_resolution_clock::time_point measureXGBoostTime2 = std::chrono::high_resolution_clock::now();
        std::chrono::duration<double> measureXGBoostTime = measureXGBoostTime2 - measureXGBoostTime1;
        std::cout << "MeasureXGBoostTime " << measureXGBoostTime.count() << std::endl;

        std::chrono::high_resolution_clock::time_point measureFastBDTTime1 = std::chrono::high_resolution_clock::now();
        Result resultFastBDT = measureFastBDT(train, test, config);
        std::chrono::high_resolution_clock::time_point measureFastBDTTime2 = std::chrono::high_resolution_clock::now();
        std::chrono::duration<double> measureFastBDTTime = measureFastBDTTime2 - measureFastBDTTime1;
        std::cout << "MeasureFastBDTTime " << measureFastBDTTime.count() << std::endl;

        writeResults(std::string("result_") + std::to_string(id+i) + std::string("_cpp.txt"), {resultFastBDT, resultXGBoost, resultSKLearn, resultTMVA}, test, config);
    }
}
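// Usage: <comparison-binary> <id>
// The id determines the names of the result files (and, if one of the
// commented-out lines below is enabled, the hyper-parameter scan point).
// Expects data/train.csv and data/test.csv relative to the working directory.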
int main(int argc, char *argv[]) {

    Py_Initialize();
    import_array();

    Config config;
    config.nTrees = 100;
    config.depth = 3;
    config.shrinkage = 0.1;
    config.subSampling = 0.5;
    config.nCutLevels = 8;
    config.numberOfEvents = 800000;
    config.numberOfFeatures = 40;

    if(argc < 2) {
        std::cerr << "Missing required argument: id" << std::endl;
        return 1;
    }
    unsigned int id = atoi(argv[1]);

    // Here you can choose different hyper-parameters depending on the passed id
    //config.nTrees = id*10;
    //config.numberOfEvents = 500000 >> (id - 1);
    //config.numberOfFeatures = id;
    //config.depth = id;
    //config.subSampling = 0.1*id;

    measure(config, id*10);

    Py_Finalize();

}
--------------------------------------------------------------------------------